Remove C11 memcpy checks from perf-critical code
[vpp.git] / src / vlib / buffer_node.h
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /*
16  * buffer_node.h: VLIB buffer handling node helper macros/inlines
17  *
18  * Copyright (c) 2008 Eliot Dresselhaus
19  *
20  * Permission is hereby granted, free of charge, to any person obtaining
21  * a copy of this software and associated documentation files (the
22  * "Software"), to deal in the Software without restriction, including
23  * without limitation the rights to use, copy, modify, merge, publish,
24  * distribute, sublicense, and/or sell copies of the Software, and to
25  * permit persons to whom the Software is furnished to do so, subject to
26  * the following conditions:
27  *
28  * The above copyright notice and this permission notice shall be
29  * included in all copies or substantial portions of the Software.
30  *
31  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
32  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
33  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
34  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
35  *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
36  *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
37  *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
38  */
39
40 #ifndef included_vlib_buffer_node_h
41 #define included_vlib_buffer_node_h
42
43 /** \file
44     vlib buffer/node functions
45 */
46
47 /** \brief Finish enqueueing two buffers forward in the graph.
48  Standard dual loop boilerplate element. This is a MACRO,
49  with MULTIPLE SIDE EFFECTS. In the ideal case,
50  <code>next_index == next0 == next1</code>,
51  which means that the speculative enqueue at the top of the dual loop
52  has correctly dealt with both packets. In that case, the macro does
53  nothing at all.
54
55  @param vm vlib_main_t pointer, varies by thread
56  @param node current node vlib_node_runtime_t pointer
57  @param next_index speculated next index used for both packets
58  @param to_next speculated vector pointer used for both packets
59  @param n_left_to_next number of slots left in speculated vector
60  @param bi0 first buffer index
61  @param bi1 second buffer index
62  @param next0 actual next index to be used for the first packet
63  @param next1 actual next index to be used for the second packet
64
65  @return @c next_index -- speculative next index to be used for future packets
66  @return @c to_next -- speculative frame to be used for future packets
67  @return @c n_left_to_next -- number of slots left in speculative frame
68 */
69
/*
 * Implementation note: enqueue_code encodes which of the two speculatively
 * enqueued buffers missed the speculated next_index:
 *   bit 0 (value 1) => next0 != next_index ("A B A")
 *   bit 1 (value 2) => next1 != next_index ("A A B")
 *   both  (value 3) => neither matched ("A B B" or "A B C")
 * Cases 1/2 rewind one slot of the speculative frame and re-route the
 * mismatched buffer via vlib_set_next_frame_buffer().  Case 3 rewinds both
 * slots; if both packets agree on the same new next node, it additionally
 * re-speculates (put current frame, switch next_index, get a fresh frame)
 * so subsequent packets hit the fast path again.
 * NOTE: macro with multiple side effects on to_next / n_left_to_next /
 * next_index -- arguments must be simple lvalues, not expressions.
 */
#define vlib_validate_buffer_enqueue_x2(vm,node,next_index,to_next,n_left_to_next,bi0,bi1,next0,next1) \
do {                                                                    \
  int enqueue_code = (next0 != next_index) + 2*(next1 != next_index);   \
                                                                        \
  if (PREDICT_FALSE (enqueue_code != 0))                                \
    {                                                                   \
      switch (enqueue_code)                                             \
        {                                                               \
        case 1:                                                         \
          /* A B A */                                                   \
          to_next[-2] = bi1;                                            \
          to_next -= 1;                                                 \
          n_left_to_next += 1;                                          \
          vlib_set_next_frame_buffer (vm, node, next0, bi0);            \
          break;                                                        \
                                                                        \
        case 2:                                                         \
          /* A A B */                                                   \
          to_next -= 1;                                                 \
          n_left_to_next += 1;                                          \
          vlib_set_next_frame_buffer (vm, node, next1, bi1);            \
          break;                                                        \
                                                                        \
        case 3:                                                         \
          /* A B B or A B C */                                          \
          to_next -= 2;                                                 \
          n_left_to_next += 2;                                          \
          vlib_set_next_frame_buffer (vm, node, next0, bi0);            \
          vlib_set_next_frame_buffer (vm, node, next1, bi1);            \
          if (next0 == next1)                                           \
            {                                                           \
              vlib_put_next_frame (vm, node, next_index,                \
                                   n_left_to_next);                     \
              next_index = next1;                                       \
              vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); \
            }                                                           \
        }                                                               \
    }                                                                   \
} while (0)
109
110
111 /** \brief Finish enqueueing four buffers forward in the graph.
112  Standard quad loop boilerplate element. This is a MACRO,
113  with MULTIPLE SIDE EFFECTS. In the ideal case,
114  <code>next_index == next0 == next1 == next2 == next3</code>,
115  which means that the speculative enqueue at the top of the quad loop
116  has correctly dealt with all four packets. In that case, the macro does
117  nothing at all.
118
119  @param vm vlib_main_t pointer, varies by thread
120  @param node current node vlib_node_runtime_t pointer
121  @param next_index speculated next index used for both packets
122  @param to_next speculated vector pointer used for both packets
123  @param n_left_to_next number of slots left in speculated vector
124  @param bi0 first buffer index
125  @param bi1 second buffer index
126  @param bi2 third buffer index
127  @param bi3 fourth buffer index
128  @param next0 actual next index to be used for the first packet
129  @param next1 actual next index to be used for the second packet
130  @param next2 actual next index to be used for the third packet
131  @param next3 actual next index to be used for the fourth packet
132
133  @return @c next_index -- speculative next index to be used for future packets
134  @return @c to_next -- speculative frame to be used for future packets
135  @return @c n_left_to_next -- number of slots left in speculative frame
136 */
137
/*
 * Implementation note: fix_speculation is non-zero iff any of the four
 * actual next indices differs from the speculated next_index.  On a miss,
 * all four speculative slots are rewound and each buffer is either
 * re-appended to the speculative frame (if it matches next_index) or
 * routed individually via vlib_set_next_frame_buffer().  If the last two
 * packets agree on a new next node, speculation is switched to that node.
 * NOTE: macro with multiple side effects on to_next / n_left_to_next /
 * next_index -- arguments must be simple lvalues, not expressions.
 *
 * Fix: removed the stray semicolon after "while (0)".  With it, every
 * call site's own ";" expanded to an extra empty statement, which breaks
 * usage such as "if (cond) vlib_validate_buffer_enqueue_x4 (...); else"
 * and defeats the purpose of the do/while(0) idiom.
 */
#define vlib_validate_buffer_enqueue_x4(vm,node,next_index,to_next,n_left_to_next,bi0,bi1,bi2,bi3,next0,next1,next2,next3) \
do {                                                                    \
  /* After the fact: check the [speculative] enqueue to "next" */       \
  u32 fix_speculation = (next_index ^ next0) | (next_index ^ next1)     \
    | (next_index ^ next2) | (next_index ^ next3);                      \
  if (PREDICT_FALSE(fix_speculation))                                   \
    {                                                                   \
      /* rewind... */                                                   \
      to_next -= 4;                                                     \
      n_left_to_next += 4;                                              \
                                                                        \
      /* If bi0 belongs to "next", send it there */                     \
      if (next_index == next0)                                          \
        {                                                               \
          to_next[0] = bi0;                                             \
          to_next++;                                                    \
          n_left_to_next --;                                            \
        }                                                               \
      else              /* send it where it needs to go */              \
        vlib_set_next_frame_buffer (vm, node, next0, bi0);              \
                                                                        \
      if (next_index == next1)                                          \
        {                                                               \
          to_next[0] = bi1;                                             \
          to_next++;                                                    \
          n_left_to_next --;                                            \
        }                                                               \
      else                                                              \
        vlib_set_next_frame_buffer (vm, node, next1, bi1);              \
                                                                        \
      if (next_index == next2)                                          \
        {                                                               \
          to_next[0] = bi2;                                             \
          to_next++;                                                    \
          n_left_to_next --;                                            \
        }                                                               \
      else                                                              \
        vlib_set_next_frame_buffer (vm, node, next2, bi2);              \
                                                                        \
      if (next_index == next3)                                          \
        {                                                               \
          to_next[0] = bi3;                                             \
          to_next++;                                                    \
          n_left_to_next --;                                            \
        }                                                               \
      else                                                              \
        {                                                               \
          vlib_set_next_frame_buffer (vm, node, next3, bi3);            \
                                                                        \
          /* Change speculation: last 2 packets went to the same node */ \
          if (next2 == next3)                                           \
            {                                                           \
              vlib_put_next_frame (vm, node, next_index, n_left_to_next); \
              next_index = next3;                                       \
              vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); \
            }                                                           \
        }                                                               \
    }                                                                   \
} while (0)
197
198 /** \brief Finish enqueueing one buffer forward in the graph.
199  Standard single loop boilerplate element. This is a MACRO,
200  with MULTIPLE SIDE EFFECTS. In the ideal case,
201  <code>next_index == next0</code>,
202  which means that the speculative enqueue at the top of the single loop
203  has correctly dealt with the packet in hand. In that case, the macro does
204  nothing at all.
205
206  @param vm vlib_main_t pointer, varies by thread
207  @param node current node vlib_node_runtime_t pointer
208  @param next_index speculated next index used for both packets
209  @param to_next speculated vector pointer used for both packets
210  @param n_left_to_next number of slots left in speculated vector
211  @param bi0 first buffer index
212  @param next0 actual next index to be used for the first packet
213
214  @return @c next_index -- speculative next index to be used for future packets
215  @return @c to_next -- speculative frame to be used for future packets
216  @return @c n_left_to_next -- number of slots left in speculative frame
217 */
/*
 * Implementation note: on a speculation miss the frame is returned with
 * "n_left_to_next + 1" because the caller already decremented
 * n_left_to_next for the (now abandoned) speculative slot holding bi0.
 * Speculation then switches to next0, a fresh frame is fetched, and bi0
 * is enqueued there.
 * NOTE: macro with multiple side effects on to_next / n_left_to_next /
 * next_index -- arguments must be simple lvalues, not expressions.
 */
#define vlib_validate_buffer_enqueue_x1(vm,node,next_index,to_next,n_left_to_next,bi0,next0) \
do {                                                                    \
  if (PREDICT_FALSE (next0 != next_index))                              \
    {                                                                   \
      vlib_put_next_frame (vm, node, next_index, n_left_to_next + 1);   \
      next_index = next0;                                               \
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); \
                                                                        \
      to_next[0] = bi0;                                                 \
      to_next += 1;                                                     \
      n_left_to_next -= 1;                                              \
    }                                                                   \
} while (0)
231
/**
 * @brief Generic dual/single-loop buffer dispatch skeleton.
 *
 * Walks the buffers in @a frame and enqueues each to a per-buffer next
 * index chosen by the supplied callbacks, using the standard speculative
 * enqueue pattern (vlib_get_next_frame / vlib_validate_buffer_enqueue_xN /
 * vlib_put_next_frame).  If the node has tracing enabled, all buffers are
 * traced up-front with @a sizeof_trace bytes of per-buffer trace data.
 *
 * @param vm            vlib_main_t pointer, varies by thread
 * @param node          current node's vlib_node_runtime_t
 * @param frame         frame of buffer indices to process
 * @param sizeof_trace  size of one trace record, in bytes
 * @param opaque1       passed through to the callbacks unchanged
 * @param opaque2       passed through to the callbacks unchanged
 * @param two_buffers   callback deciding next0/next1 for a pair of buffers
 * @param one_buffer    callback deciding next0 for a single buffer
 * @return              number of vectors processed (frame->n_vectors)
 */
always_inline uword
generic_buffer_node_inline (vlib_main_t * vm,
                            vlib_node_runtime_t * node,
                            vlib_frame_t * frame,
                            uword sizeof_trace,
                            void *opaque1,
                            uword opaque2,
                            void (*two_buffers) (vlib_main_t * vm,
                                                 void *opaque1,
                                                 uword opaque2,
                                                 vlib_buffer_t * b0,
                                                 vlib_buffer_t * b1,
                                                 u32 * next0, u32 * next1),
                            void (*one_buffer) (vlib_main_t * vm,
                                                void *opaque1, uword opaque2,
                                                vlib_buffer_t * b0,
                                                u32 * next0))
{
  u32 n_left_from, *from, *to_next;
  u32 next_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  if (node->flags & VLIB_NODE_FLAG_TRACE)
    vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
                                   /* stride */ 1, sizeof_trace);

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      /* Dual loop: process 2 buffers per iteration while at least 4
         remain, so the prefetch below always has 2 buffers to warm. */
      while (n_left_from >= 4 && n_left_to_next >= 2)
        {
          vlib_buffer_t *p0, *p1;
          u32 pi0, next0;
          u32 pi1, next1;

          /* Prefetch next iteration. */
          {
            vlib_buffer_t *p2, *p3;

            p2 = vlib_get_buffer (vm, from[2]);
            p3 = vlib_get_buffer (vm, from[3]);

            vlib_prefetch_buffer_header (p2, LOAD);
            vlib_prefetch_buffer_header (p3, LOAD);

            CLIB_PREFETCH (p2->data, 64, LOAD);
            CLIB_PREFETCH (p3->data, 64, LOAD);
          }

          /* Speculatively enqueue both buffers to next_index. */
          pi0 = to_next[0] = from[0];
          pi1 = to_next[1] = from[1];
          from += 2;
          to_next += 2;
          n_left_from -= 2;
          n_left_to_next -= 2;

          p0 = vlib_get_buffer (vm, pi0);
          p1 = vlib_get_buffer (vm, pi1);

          two_buffers (vm, opaque1, opaque2, p0, p1, &next0, &next1);

          /* Repair the speculative enqueue if next0/next1 disagree. */
          vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
                                           to_next, n_left_to_next,
                                           pi0, pi1, next0, next1);
        }

      /* Single loop: remaining buffers, one at a time. */
      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *p0;
          u32 pi0, next0;

          pi0 = from[0];
          to_next[0] = pi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          p0 = vlib_get_buffer (vm, pi0);

          one_buffer (vm, opaque1, opaque2, p0, &next0);

          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                           to_next, n_left_to_next,
                                           pi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}
330
331 static_always_inline void
332 vlib_buffer_enqueue_to_next (vlib_main_t * vm, vlib_node_runtime_t * node,
333                              u32 * buffers, u16 * nexts, uword count)
334 {
335   u32 *to_next, n_left_to_next, max;
336   u16 next_index;
337
338   next_index = nexts[0];
339   vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
340   max = clib_min (n_left_to_next, count);
341
342   while (count)
343     {
344       u32 n_enqueued;
345       if ((nexts[0] != next_index) || n_left_to_next == 0)
346         {
347           vlib_put_next_frame (vm, node, next_index, n_left_to_next);
348           next_index = nexts[0];
349           vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
350           max = clib_min (n_left_to_next, count);
351         }
352 #if defined(CLIB_HAVE_VEC512)
353       u16x32 next32 = u16x32_load_unaligned (nexts);
354       next32 = (next32 == u16x32_splat (next32[0]));
355       u64 bitmap = u16x32_msb_mask (next32);
356       n_enqueued = count_trailing_zeros (~bitmap);
357 #elif defined(CLIB_HAVE_VEC256)
358       u16x16 next16 = u16x16_load_unaligned (nexts);
359       next16 = (next16 == u16x16_splat (next16[0]));
360       u64 bitmap = u8x32_msb_mask ((u8x32) next16);
361       n_enqueued = count_trailing_zeros (~bitmap) / 2;
362 #elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK)
363       u16x8 next8 = u16x8_load_unaligned (nexts);
364       next8 = (next8 == u16x8_splat (next8[0]));
365       u64 bitmap = u8x16_msb_mask ((u8x16) next8);
366       n_enqueued = count_trailing_zeros (~bitmap) / 2;
367 #else
368       u16 x = 0;
369       if (count + 3 < max)
370         {
371           x |= next_index ^ nexts[1];
372           x |= next_index ^ nexts[2];
373           x |= next_index ^ nexts[3];
374           n_enqueued = (x == 0) ? 4 : 1;
375         }
376       else
377         n_enqueued = 1;
378 #endif
379
380       if (PREDICT_FALSE (n_enqueued > max))
381         n_enqueued = max;
382
383 #ifdef CLIB_HAVE_VEC512
384       if (n_enqueued >= 32)
385         {
386           clib_memcpy_fast (to_next, buffers, 32 * sizeof (u32));
387           nexts += 32;
388           to_next += 32;
389           buffers += 32;
390           n_left_to_next -= 32;
391           count -= 32;
392           max -= 32;
393           continue;
394         }
395 #endif
396
397 #ifdef CLIB_HAVE_VEC256
398       if (n_enqueued >= 16)
399         {
400           clib_memcpy_fast (to_next, buffers, 16 * sizeof (u32));
401           nexts += 16;
402           to_next += 16;
403           buffers += 16;
404           n_left_to_next -= 16;
405           count -= 16;
406           max -= 16;
407           continue;
408         }
409 #endif
410
411 #ifdef CLIB_HAVE_VEC128
412       if (n_enqueued >= 8)
413         {
414           clib_memcpy_fast (to_next, buffers, 8 * sizeof (u32));
415           nexts += 8;
416           to_next += 8;
417           buffers += 8;
418           n_left_to_next -= 8;
419           count -= 8;
420           max -= 8;
421           continue;
422         }
423 #endif
424
425       if (n_enqueued >= 4)
426         {
427           clib_memcpy_fast (to_next, buffers, 4 * sizeof (u32));
428           nexts += 4;
429           to_next += 4;
430           buffers += 4;
431           n_left_to_next -= 4;
432           count -= 4;
433           max -= 4;
434           continue;
435         }
436
437       /* copy */
438       to_next[0] = buffers[0];
439
440       /* next */
441       nexts += 1;
442       to_next += 1;
443       buffers += 1;
444       n_left_to_next -= 1;
445       count -= 1;
446       max -= 1;
447     }
448   vlib_put_next_frame (vm, node, next_index, n_left_to_next);
449 }
450
/**
 * @brief Hand off a vector of buffers to other threads' frame queues.
 *
 * For each i in [0, n_packets), buffer_indices[i] is appended to the
 * handoff-queue element for thread_indices[i] in the frame queue selected
 * by @a frame_queue_index.  Partially filled elements are kept open across
 * consecutive packets for the same thread and shipped at the end.
 *
 * @param vm                  vlib_main_t pointer, varies by thread
 * @param frame_queue_index   index into tm->frame_queue_mains
 * @param buffer_indices      vector of buffer indices to hand off
 * @param thread_indices      destination thread per buffer, parallel vector
 * @param n_packets           number of entries in both vectors
 * @param drop_on_congestion  if non-zero, buffers destined to a congested
 *                            thread are freed instead of enqueued
 * @return                    number of packets actually handed off
 *                            (n_packets minus any congestion drops)
 */
static_always_inline u32
vlib_buffer_enqueue_to_thread (vlib_main_t * vm, u32 frame_queue_index,
                               u32 * buffer_indices, u16 * thread_indices,
                               u32 n_packets, int drop_on_congestion)
{
  vlib_thread_main_t *tm = vlib_get_thread_main ();
  vlib_frame_queue_main_t *fqm;
  vlib_frame_queue_per_thread_data_t *ptd;
  u32 n_left = n_packets;
  u32 drop_list[VLIB_FRAME_SIZE], *dbi = drop_list, n_drop = 0;
  vlib_frame_queue_elt_t *hf = 0;
  u32 n_left_to_next_thread = 0, *to_next_thread = 0;
  u32 next_thread_index, current_thread_index = ~0;
  int i;

  fqm = vec_elt_at_index (tm->frame_queue_mains, frame_queue_index);
  ptd = vec_elt_at_index (fqm->per_thread_data, vm->thread_index);

  while (n_left)
    {
      next_thread_index = thread_indices[0];

      /* Destination thread changed: close out the current element (if
         any) and open one for the new thread. */
      if (next_thread_index != current_thread_index)
        {

          if (drop_on_congestion &&
              is_vlib_frame_queue_congested
              (frame_queue_index, next_thread_index, fqm->queue_hi_thresh,
               ptd->congested_handoff_queue_by_thread_index))
            {
              /* Collect for bulk free at the end. */
              dbi[0] = buffer_indices[0];
              dbi++;
              n_drop++;
              goto next;
            }

          if (hf)
            hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_thread;

          hf = vlib_get_worker_handoff_queue_elt (frame_queue_index,
                                                  next_thread_index,
                                                  ptd->handoff_queue_elt_by_thread_index);

          n_left_to_next_thread = VLIB_FRAME_SIZE - hf->n_vectors;
          to_next_thread = &hf->buffer_index[hf->n_vectors];
          current_thread_index = next_thread_index;
        }

      to_next_thread[0] = buffer_indices[0];
      to_next_thread++;
      n_left_to_next_thread--;

      /* Element full: ship it now and force a re-open on the next packet
         for this thread. */
      if (n_left_to_next_thread == 0)
        {
          hf->n_vectors = VLIB_FRAME_SIZE;
          vlib_put_frame_queue_elt (hf);
          current_thread_index = ~0;
          ptd->handoff_queue_elt_by_thread_index[next_thread_index] = 0;
          hf = 0;
        }

      /* next */
    next:
      thread_indices += 1;
      buffer_indices += 1;
      n_left -= 1;
    }

  /* Record the final (partial) element's vector count. */
  if (hf)
    hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_thread;

  /* Ship frames to the thread nodes */
  for (i = 0; i < vec_len (ptd->handoff_queue_elt_by_thread_index); i++)
    {
      if (ptd->handoff_queue_elt_by_thread_index[i])
        {
          hf = ptd->handoff_queue_elt_by_thread_index[i];
          /*
           * It works better to let the handoff node
           * rate-adapt, always ship the handoff queue element.
           */
          if (1 || hf->n_vectors == hf->last_n_vectors)
            {
              vlib_put_frame_queue_elt (hf);
              ptd->handoff_queue_elt_by_thread_index[i] = 0;
            }
          else
            hf->last_n_vectors = hf->n_vectors;
        }
      /* Reset congestion state for the next call. */
      ptd->congested_handoff_queue_by_thread_index[i] =
        (vlib_frame_queue_t *) (~0);
    }

  if (drop_on_congestion && n_drop)
    vlib_buffer_free (vm, drop_list, n_drop);

  return n_packets - n_drop;
}
549
550 #endif /* included_vlib_buffer_node_h */
551
552 /*
553  * fd.io coding-style-patch-verification: ON
554  *
555  * Local Variables:
556  * eval: (c-set-style "gnu")
557  * End:
558  */