/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 #include <vlib/vlib.h>
16 #include <vnet/vnet.h>
17 #include <vnet/pg/pg.h>
18 #include <vppinfra/error.h>
19 #include <sample/sample.h>
/*
 * Format helper: renders a 6-byte MAC address as six colon-separated,
 * two-digit lowercase hex fields.
 *
 * s:    output value handed to format () and returned from it
 * args: va_list carrying a single u8 pointer to the address bytes
 *
 * Returns whatever format () returns.
 *
 * NOTE(review): the return-type line and the braces of this definition
 * are not visible in this copy of the file.
 */
30 format_mac_address (u8 * s, va_list * args)
32 u8 *a = va_arg (*args, u8 *);
/* Only bytes a[0] through a[5] are read. */
33 return format (s, "%02x:%02x:%02x:%02x:%02x:%02x",
34 a[0], a[1], a[2], a[3], a[4], a[5]);
/*
 * Trace formatter for this node; it is installed as .format_trace in the
 * node registration.
 *
 * s:    output value that each format () call extends
 * args: va_list carrying, in order, a vlib_main_t pointer and a
 *       vlib_node_t pointer (both fetched but unused) and the
 *       sample_trace_t record to print
 *
 * The output is the sw_if_index and next index on one line, followed by
 * the new source and destination MAC addresses, each rendered through
 * format_mac_address.
 *
 * NOTE(review): the return-type line, the braces and the final return
 * statement are not visible in this copy of the file.
 */
37 /* packet trace format function */
39 format_sample_trace (u8 * s, va_list * args)
/* The first two arguments must still be consumed to reach the record. */
41 CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
42 CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
43 sample_trace_t *t = va_arg (*args, sample_trace_t *);
45 s = format (s, "SAMPLE: sw_if_index %d, next index %d\n",
46 t->sw_if_index, t->next_index);
/* %U takes a formatter function followed by that formatter's argument. */
47 s = format (s, " new src %U -> new dst %U",
48 format_mac_address, t->new_src_mac,
49 format_mac_address, t->new_dst_mac);
/*
 * Registration object for this graph node.  It is declared ahead of the
 * node functions because they read sample_node.index when incrementing
 * the SWAPPED counter.
 */
54 vlib_node_registration_t sample_node;
/*
 * X-macro listing the counters of this node as _(symbol, description)
 * entries.  The definition of _ that follows turns each entry into a
 * SAMPLE_ERROR_<symbol> enumerator.
 *
 * NOTE(review): the enum declaration that should wrap that expansion is
 * not visible in this copy of the file.
 */
56 #define foreach_sample_error \
57 _(SWAPPED, "Mac swap packets processed")
61 #define _(sym,str) SAMPLE_ERROR_##sym,
/*
 * Counter description strings.  The definition of _ below keeps only the
 * string half of each foreach_sample_error entry.  The table is handed to
 * the node registration as .error_strings, and its length as .n_errors.
 *
 * NOTE(review): the expansion line and the closing brace of the
 * initializer are not visible in this copy of the file.
 */
67 static char *sample_error_strings[] = {
68 #define _(sym,string) string,
/*
 * Next-node index assigned to every packet by the node functions; the
 * node registration maps it to the interface-output node.
 *
 * NOTE(review): this is a single enumerator; the surrounding enum
 * declaration is not visible in this copy of the file.
 */
75 SAMPLE_NEXT_INTERFACE_OUTPUT,
/*
 * Simple dual/single loop version. This is the default version, which
 * will compile everywhere.
 *
 * Node costs 30 clocks/pkt at a vector size of 51
 */
/*
 * Version 1 of the node dispatch function: byte-by-byte MAC swap.
 *
 * foreach_mac_address_offset is used as an X-macro: each use below first
 * defines _(a) as a one-byte copy statement and then expands the list, so
 * the MAC addresses are copied one byte offset at a time.
 * NOTE(review): the offset list that should follow the continuation
 * backslash is not visible in this copy of the file.
 *
 * sample_node_fn walks the buffer indices of the frame.  For each packet
 * it swaps the Ethernet source and destination addresses, sets the TX
 * sw_if_index to the RX sw_if_index, and enqueues the packet with next
 * index SAMPLE_NEXT_INTERFACE_OUTPUT.  Packets are handled two at a time
 * while at least four remain (the third and fourth are prefetched), and
 * then one at a time.
 *
 * vm:    vlib main structure, passed through to the vlib helpers
 * node:  runtime of this node; supplies cached_next_index and the
 *        trace flag
 * frame: input frame; n_vectors buffer indices are read from it
 *
 * Returns frame->n_vectors.
 *
 * NOTE(review): several short lines are not visible in this copy of the
 * file (braces, the return-type line, declarations of bi0, bi1, tmp0,
 * tmp1, n_left_to_next and the trace pointer t, and the statements that
 * advance from, to_next, n_left_from, n_left_to_next and pkts_swapped).
 */
88 #define foreach_mac_address_offset \
97 sample_node_fn (vlib_main_t * vm,
98 vlib_node_runtime_t * node, vlib_frame_t * frame)
100 u32 n_left_from, *from, *to_next;
101 sample_next_t next_index;
102 u32 pkts_swapped = 0;
/* from walks the buffer indices stored in the frame. */
104 from = vlib_frame_vector_args (frame);
105 n_left_from = frame->n_vectors;
/* Start with the next index cached in the node runtime. */
106 next_index = node->cached_next_index;
108 while (n_left_from > 0)
/* Obtain to_next and n_left_to_next for the current next_index. */
112 vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
/*
 * Dual loop.  Four packets must remain because from[2] and from[3] are
 * prefetched while from[0] and from[1] are processed.
 */
114 while (n_left_from >= 4 && n_left_to_next >= 2)
116 u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
117 u32 next1 = SAMPLE_NEXT_INTERFACE_OUTPUT;
118 u32 sw_if_index0, sw_if_index1;
120 ethernet_header_t *en0, *en1;
122 vlib_buffer_t *b0, *b1;
124 /* Prefetch next iteration. */
126 vlib_buffer_t *p2, *p3;
128 p2 = vlib_get_buffer (vm, from[2]);
129 p3 = vlib_get_buffer (vm, from[3]);
/* Buffer headers are prefetched for reading, packet data for writing. */
131 vlib_prefetch_buffer_header (p2, LOAD);
132 vlib_prefetch_buffer_header (p3, LOAD);
134 CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
135 CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
138 /* speculatively enqueue b0 and b1 to the current next frame */
139 to_next[0] = bi0 = from[0];
140 to_next[1] = bi1 = from[1];
146 b0 = vlib_get_buffer (vm, bi0);
147 b1 = vlib_get_buffer (vm, bi1);
/* The node expects packet data to begin at offset 0 of the buffer. */
149 ASSERT (b0->current_data == 0);
150 ASSERT (b1->current_data == 0);
152 en0 = vlib_buffer_get_current (b0);
153 en1 = vlib_buffer_get_current (b1);
/*
 * Swap in three passes over the byte offsets: save src in tmp0, copy dst
 * over src, then write the saved bytes into dst.  The same is repeated
 * for the second packet with tmp1.
 */
155 /* This is not the fastest way to swap src + dst mac addresses */
156 #define _(a) tmp0[a] = en0->src_address[a];
157 foreach_mac_address_offset;
159 #define _(a) en0->src_address[a] = en0->dst_address[a];
160 foreach_mac_address_offset;
162 #define _(a) en0->dst_address[a] = tmp0[a];
163 foreach_mac_address_offset;
166 #define _(a) tmp1[a] = en1->src_address[a];
167 foreach_mac_address_offset;
169 #define _(a) en1->src_address[a] = en1->dst_address[a];
170 foreach_mac_address_offset;
172 #define _(a) en1->dst_address[a] = tmp1[a];
173 foreach_mac_address_offset;
176 sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
177 sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];
179 /* Send pkt back out the RX interface */
180 vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
181 vnet_buffer (b1)->sw_if_index[VLIB_TX] = sw_if_index1;
/*
 * Trace records are written only when tracing is enabled on the node and
 * the individual buffer is flagged as traced.  The recorded MAC
 * addresses are the ones already swapped above.
 */
185 if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
187 if (b0->flags & VLIB_BUFFER_IS_TRACED)
190 vlib_add_trace (vm, node, b0, sizeof (*t));
191 t->sw_if_index = sw_if_index0;
192 t->next_index = next0;
193 clib_memcpy (t->new_src_mac, en0->src_address,
194 sizeof (t->new_src_mac));
195 clib_memcpy (t->new_dst_mac, en0->dst_address,
196 sizeof (t->new_dst_mac));
199 if (b1->flags & VLIB_BUFFER_IS_TRACED)
202 vlib_add_trace (vm, node, b1, sizeof (*t));
203 t->sw_if_index = sw_if_index1;
204 t->next_index = next1;
205 clib_memcpy (t->new_src_mac, en1->src_address,
206 sizeof (t->new_src_mac));
207 clib_memcpy (t->new_dst_mac, en1->dst_address,
208 sizeof (t->new_dst_mac));
212 /* verify speculative enqueues, maybe switch current next frame */
213 vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
214 to_next, n_left_to_next,
215 bi0, bi1, next0, next1);
/* Single loop: handles whatever the dual loop left over. */
218 while (n_left_from > 0 && n_left_to_next > 0)
222 u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
225 ethernet_header_t *en0;
227 /* speculatively enqueue b0 to the current next frame */
235 b0 = vlib_get_buffer (vm, bi0);
237 * Direct from the driver, we should be at offset 0
238 * aka at &b0->data[0]
240 ASSERT (b0->current_data == 0);
242 en0 = vlib_buffer_get_current (b0);
244 /* This is not the fastest way to swap src + dst mac addresses */
245 #define _(a) tmp0[a] = en0->src_address[a];
246 foreach_mac_address_offset;
248 #define _(a) en0->src_address[a] = en0->dst_address[a];
249 foreach_mac_address_offset;
251 #define _(a) en0->dst_address[a] = tmp0[a];
252 foreach_mac_address_offset;
255 sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
257 /* Send pkt back out the RX interface */
258 vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
260 if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
261 && (b0->flags & VLIB_BUFFER_IS_TRACED)))
263 sample_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t));
264 t->sw_if_index = sw_if_index0;
265 t->next_index = next0;
266 clib_memcpy (t->new_src_mac, en0->src_address,
267 sizeof (t->new_src_mac));
268 clib_memcpy (t->new_dst_mac, en0->dst_address,
269 sizeof (t->new_dst_mac));
274 /* verify speculative enqueue, maybe switch current next frame */
275 vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
276 to_next, n_left_to_next,
/* Hand the next frame back together with its count of unused slots. */
280 vlib_put_next_frame (vm, node, next_index, n_left_to_next);
/* Add pkts_swapped to the SWAPPED counter of this node. */
283 vlib_node_increment_counter (vm, sample_node.index,
284 SAMPLE_ERROR_SWAPPED, pkts_swapped);
285 return frame->n_vectors;
/*
 * This version swaps MAC addresses using an MMX vector shuffle.
 * Node costs about 17 clocks/pkt at a vector size of 26
 */
/*
 * Version 2 of the node dispatch function: vector-shuffle MAC swap.
 *
 * The loop structure matches the scalar version (dual loop with
 * prefetch, then single loop, speculative enqueue validated per packet).
 * The difference is the swap itself: the first 16 bytes of packet data
 * are loaded as one u8x16, shuffled with swapmac, and stored back.
 *
 * vm:    vlib main structure, passed through to the vlib helpers
 * node:  runtime of this node; supplies cached_next_index and the
 *        trace flag
 * frame: input frame; n_vectors buffer indices are read from it
 *
 * Returns frame->n_vectors.
 *
 * NOTE(review): the packet data is accessed through a u8x16 pointer cast
 * of the Ethernet header pointer; confirm the alignment and minimum
 * length assumptions this relies on.
 * NOTE(review): several short lines are not visible in this copy of the
 * file (braces, the return-type line, declarations of bi0, bi1,
 * n_left_to_next, the trace pointer t and the single-loop src_dst0, and
 * the statements that advance the loop counters and pkts_swapped).
 */
295 sample_node_fn (vlib_main_t * vm,
296 vlib_node_runtime_t * node, vlib_frame_t * frame)
298 u32 n_left_from, *from, *to_next;
299 sample_next_t next_index;
300 u32 pkts_swapped = 0;
/*
 * The mask lists source byte positions 6..11 first, then 0..5, and
 * leaves positions 12..15 in place.
 */
301 /* Vector shuffle mask to swap src, dst */
302 u8x16 swapmac = { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 };
/* from walks the buffer indices stored in the frame. */
304 from = vlib_frame_vector_args (frame);
305 n_left_from = frame->n_vectors;
/* Start with the next index cached in the node runtime. */
306 next_index = node->cached_next_index;
308 while (n_left_from > 0)
/* Obtain to_next and n_left_to_next for the current next_index. */
312 vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
/*
 * Dual loop.  Four packets must remain because from[2] and from[3] are
 * prefetched while from[0] and from[1] are processed.
 */
313 while (n_left_from >= 4 && n_left_to_next >= 2)
315 u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
316 u32 next1 = SAMPLE_NEXT_INTERFACE_OUTPUT;
317 u32 sw_if_index0, sw_if_index1;
318 u8x16 src_dst0, src_dst1;
319 ethernet_header_t *en0, *en1;
321 vlib_buffer_t *b0, *b1;
323 /* Prefetch next iteration. */
325 vlib_buffer_t *p2, *p3;
327 p2 = vlib_get_buffer (vm, from[2]);
328 p3 = vlib_get_buffer (vm, from[3]);
/* Buffer headers are prefetched for reading, packet data for writing. */
330 vlib_prefetch_buffer_header (p2, LOAD);
331 vlib_prefetch_buffer_header (p3, LOAD);
333 CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
334 CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
337 /* speculatively enqueue b0 and b1 to the current next frame */
338 to_next[0] = bi0 = from[0];
339 to_next[1] = bi1 = from[1];
345 b0 = vlib_get_buffer (vm, bi0);
346 b1 = vlib_get_buffer (vm, bi1);
/* The node expects packet data to begin at offset 0 of the buffer. */
348 ASSERT (b0->current_data == 0);
349 ASSERT (b1->current_data == 0);
351 en0 = vlib_buffer_get_current (b0);
352 en1 = vlib_buffer_get_current (b1);
/* Load 16 bytes, shuffle them with swapmac, and store them back. */
354 src_dst0 = ((u8x16 *) en0)[0];
355 src_dst1 = ((u8x16 *) en1)[0];
356 src_dst0 = u8x16_shuffle (src_dst0, swapmac);
357 src_dst1 = u8x16_shuffle (src_dst1, swapmac);
358 ((u8x16 *) en0)[0] = src_dst0;
359 ((u8x16 *) en1)[0] = src_dst1;
361 sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
362 sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];
364 /* Send pkt back out the RX interface */
365 vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
366 vnet_buffer (b1)->sw_if_index[VLIB_TX] = sw_if_index1;
/*
 * Trace records are written only when tracing is enabled on the node and
 * the individual buffer is flagged as traced.  The recorded MAC
 * addresses are read back from the already shuffled header.
 */
370 if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
372 if (b0->flags & VLIB_BUFFER_IS_TRACED)
375 vlib_add_trace (vm, node, b0, sizeof (*t));
376 t->sw_if_index = sw_if_index0;
377 t->next_index = next0;
378 clib_memcpy (t->new_src_mac, en0->src_address,
379 sizeof (t->new_src_mac));
380 clib_memcpy (t->new_dst_mac, en0->dst_address,
381 sizeof (t->new_dst_mac));
384 if (b1->flags & VLIB_BUFFER_IS_TRACED)
387 vlib_add_trace (vm, node, b1, sizeof (*t));
388 t->sw_if_index = sw_if_index1;
389 t->next_index = next1;
390 clib_memcpy (t->new_src_mac, en1->src_address,
391 sizeof (t->new_src_mac));
392 clib_memcpy (t->new_dst_mac, en1->dst_address,
393 sizeof (t->new_dst_mac));
397 /* verify speculative enqueues, maybe switch current next frame */
398 vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
399 to_next, n_left_to_next,
400 bi0, bi1, next0, next1);
/* Single loop: handles whatever the dual loop left over. */
403 while (n_left_from > 0 && n_left_to_next > 0)
407 u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
410 ethernet_header_t *en0;
412 /* speculatively enqueue b0 to the current next frame */
420 b0 = vlib_get_buffer (vm, bi0);
422 * Direct from the driver, we should be at offset 0
423 * aka at &b0->data[0]
425 ASSERT (b0->current_data == 0);
427 en0 = vlib_buffer_get_current (b0);
428 src_dst0 = ((u8x16 *) en0)[0];
429 src_dst0 = u8x16_shuffle (src_dst0, swapmac);
430 ((u8x16 *) en0)[0] = src_dst0;
432 sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
434 /* Send pkt back out the RX interface */
435 vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
437 if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
438 && (b0->flags & VLIB_BUFFER_IS_TRACED)))
440 sample_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t));
441 t->sw_if_index = sw_if_index0;
442 t->next_index = next0;
443 clib_memcpy (t->new_src_mac, en0->src_address,
444 sizeof (t->new_src_mac));
445 clib_memcpy (t->new_dst_mac, en0->dst_address,
446 sizeof (t->new_dst_mac));
451 /* verify speculative enqueue, maybe switch current next frame */
452 vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
453 to_next, n_left_to_next,
/* Hand the next frame back together with its count of unused slots. */
457 vlib_put_next_frame (vm, node, next_index, n_left_to_next);
/* Add pkts_swapped to the SWAPPED counter of this node. */
460 vlib_node_increment_counter (vm, sample_node.index,
461 SAMPLE_ERROR_SWAPPED, pkts_swapped);
462 return frame->n_vectors;
/*
 * This version computes all of the buffer pointers in
 * one motion, uses a quad/single loop model, and
 * traces the entire frame in one motion.
 *
 * Node costs about 16 clocks/pkt at a vector size of 26
 *
 * Some compilation drama with u8x16_shuffle, so turned off by
 * default.
 */
/*
 * Support definitions for the quad-loop version of the node function.
 *
 * u8x16_shuffle is mapped directly onto the compiler builtin
 * __builtin_shuffle.
 *
 * nexts holds one next index per frame slot and is zero-initialised, so
 * every packet gets next index 0.  The node function passes it to
 * vlib_buffer_enqueue_to_next for the whole frame.
 */
480 #define u8x16_shuffle __builtin_shuffle
481 /* This would normally be a stack local, but since it's a constant... */
482 static const u16 nexts[VLIB_FRAME_SIZE] = { 0 };
/*
 * Version 3 of the node dispatch function: quad/single loop with a
 * whole-frame enqueue.
 *
 * All buffer pointers are resolved up front with vlib_get_buffers.  The
 * quad loop swaps the MAC addresses of four packets per iteration with
 * the vector shuffle and prefetches the following four when at least
 * eight remain.  The single loop handles the rest.  After both loops the
 * whole frame is enqueued in one call using the constant nexts array, the
 * SWAPPED counter is incremented, and trace records are added in a
 * separate pass.
 *
 * vm:    vlib main structure, passed through to the vlib helpers
 * node:  runtime of this node; supplies the trace flag
 * frame: input frame; n_vectors buffer indices are read from it
 *
 * Returns frame->n_vectors.
 *
 * NOTE(review): the enqueue call casts away the const qualifier of
 * nexts; presumably the callee only reads the array, confirm.
 * NOTE(review): several short lines are not visible in this copy of the
 * file (braces, the return-type line, the initial assignment of b, the
 * single-loop declaration of src_dst0, the declarations of i and t in
 * the trace pass, the last argument of the enqueue call, and every
 * statement that advances b, n_left_from and pkts_swapped).
 */
485 sample_node_fn (vlib_main_t * vm,
486 vlib_node_runtime_t * node, vlib_frame_t * frame)
488 u32 n_left_from, *from;
489 u32 pkts_swapped = 0;
/*
 * The mask lists source byte positions 6..11 first, then 0..5, and
 * leaves positions 12..15 in place.
 */
490 /* Vector shuffle mask to swap src, dst */
491 u8x16 swapmac = { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 };
492 vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
/* next is only referenced by the commented-out assignments below. */
493 /* See comment below about sending all pkts to the same place... */
494 u16 *next __attribute__ ((unused));
496 from = vlib_frame_vector_args (frame);
497 n_left_from = frame->n_vectors;
/* Resolve every buffer index of the frame into bufs in one call. */
499 vlib_get_buffers (vm, from, bufs, n_left_from);
504 * We send all pkts to SAMPLE_NEXT_INTERFACE_OUTPUT, aka
505 * graph arc 0. So the usual setting of next[0...3] is commented
/* Quad loop: four packets per iteration. */
509 while (n_left_from >= 4)
511 u8x16 src_dst0, src_dst1, src_dst2, src_dst3;
512 /* Prefetch next iteration. */
/* b[4] through b[7] are only touched when they exist in the frame. */
513 if (PREDICT_TRUE (n_left_from >= 8))
515 vlib_prefetch_buffer_header (b[4], STORE);
516 vlib_prefetch_buffer_header (b[5], STORE);
517 vlib_prefetch_buffer_header (b[6], STORE);
518 vlib_prefetch_buffer_header (b[7], STORE);
519 CLIB_PREFETCH (&b[4]->data, CLIB_CACHE_LINE_BYTES, STORE);
520 CLIB_PREFETCH (&b[5]->data, CLIB_CACHE_LINE_BYTES, STORE);
521 CLIB_PREFETCH (&b[6]->data, CLIB_CACHE_LINE_BYTES, STORE);
522 CLIB_PREFETCH (&b[7]->data, CLIB_CACHE_LINE_BYTES, STORE);
/* Load the first 16 bytes of each packet, shuffle, and store them back. */
525 src_dst0 = ((u8x16 *) vlib_buffer_get_current (b[0]))[0];
526 src_dst1 = ((u8x16 *) vlib_buffer_get_current (b[1]))[0];
527 src_dst2 = ((u8x16 *) vlib_buffer_get_current (b[2]))[0];
528 src_dst3 = ((u8x16 *) vlib_buffer_get_current (b[3]))[0];
530 src_dst0 = u8x16_shuffle (src_dst0, swapmac);
531 src_dst1 = u8x16_shuffle (src_dst1, swapmac);
532 src_dst2 = u8x16_shuffle (src_dst2, swapmac);
533 src_dst3 = u8x16_shuffle (src_dst3, swapmac);
535 ((u8x16 *) vlib_buffer_get_current (b[0]))[0] = src_dst0;
536 ((u8x16 *) vlib_buffer_get_current (b[1]))[0] = src_dst1;
537 ((u8x16 *) vlib_buffer_get_current (b[2]))[0] = src_dst2;
538 ((u8x16 *) vlib_buffer_get_current (b[3]))[0] = src_dst3;
/* Send each packet back out of the interface it arrived on. */
540 vnet_buffer (b[0])->sw_if_index[VLIB_TX] =
541 vnet_buffer (b[0])->sw_if_index[VLIB_RX];
542 vnet_buffer (b[1])->sw_if_index[VLIB_TX] =
543 vnet_buffer (b[1])->sw_if_index[VLIB_RX];
544 vnet_buffer (b[2])->sw_if_index[VLIB_TX] =
545 vnet_buffer (b[2])->sw_if_index[VLIB_RX];
546 vnet_buffer (b[3])->sw_if_index[VLIB_TX] =
547 vnet_buffer (b[3])->sw_if_index[VLIB_RX];
549 // next[0] = SAMPLE_NEXT_INTERFACE_OUTPUT;
550 // next[1] = SAMPLE_NEXT_INTERFACE_OUTPUT;
551 // next[2] = SAMPLE_NEXT_INTERFACE_OUTPUT;
552 // next[3] = SAMPLE_NEXT_INTERFACE_OUTPUT;
/* Single loop: handles whatever the quad loop left over. */
560 while (n_left_from > 0)
563 src_dst0 = ((u8x16 *) vlib_buffer_get_current (b[0]))[0];
564 src_dst0 = u8x16_shuffle (src_dst0, swapmac);
565 ((u8x16 *) vlib_buffer_get_current (b[0]))[0] = src_dst0;
566 vnet_buffer (b[0])->sw_if_index[VLIB_TX] =
567 vnet_buffer (b[0])->sw_if_index[VLIB_RX];
568 // next[0] = SAMPLE_NEXT_INTERFACE_OUTPUT;
/* Enqueue the frame in one call; nexts supplies the per-packet indices. */
576 vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
/* Add pkts_swapped to the SWAPPED counter of this node. */
579 vlib_node_increment_counter (vm, sample_node.index,
580 SAMPLE_ERROR_SWAPPED, pkts_swapped);
/*
 * Trace pass, run only when tracing is enabled on the node.  The
 * sw_if_index recorded here is read from the TX slot, which was set from
 * the RX slot above.
 */
582 if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
587 for (i = 0; i < frame->n_vectors; i++)
589 if (b[0]->flags & VLIB_BUFFER_IS_TRACED)
591 ethernet_header_t *en;
593 vlib_add_trace (vm, node, b[0], sizeof (*t));
594 t->sw_if_index = vnet_buffer (b[0])->sw_if_index[VLIB_TX];
595 t->next_index = SAMPLE_NEXT_INTERFACE_OUTPUT;
596 en = vlib_buffer_get_current (b[0]);
597 clib_memcpy (t->new_src_mac, en->src_address,
598 sizeof (t->new_src_mac));
599 clib_memcpy (t->new_dst_mac, en->dst_address,
600 sizeof (t->new_dst_mac));
607 return frame->n_vectors;
/*
 * Registration of sample_node with vlib.
 *
 * Fields set here: sample_node_fn as the dispatch function, u32-sized
 * vector elements, format_sample_trace as the trace formatter, the
 * internal node type, the counter strings with their count, and
 * SAMPLE_N_NEXT next nodes, with SAMPLE_NEXT_INTERFACE_OUTPUT mapped to
 * the interface-output node.
 *
 * NOTE(review): the braces, the .name field and the .next_nodes opener
 * are not visible in this copy of the file.
 */
612 VLIB_REGISTER_NODE (sample_node) =
614 .function = sample_node_fn,
616 .vector_size = sizeof (u32),
617 .format_trace = format_sample_trace,
618 .type = VLIB_NODE_TYPE_INTERNAL,
/* The count is derived from the table so the two cannot drift apart. */
620 .n_errors = ARRAY_LEN(sample_error_strings),
621 .error_strings = sample_error_strings,
623 .n_next_nodes = SAMPLE_N_NEXT,
625 /* edit / add dispositions here */
627 [SAMPLE_NEXT_INTERFACE_OUTPUT] = "interface-output",
/*
 * Applies VLIB_NODE_FUNCTION_MULTIARCH to the node and its dispatch
 * function.
 * NOTE(review): presumably this generates per-architecture variants of
 * sample_node_fn and selects one at runtime; confirm against the vlib
 * headers.
 */
632 VLIB_NODE_FUNCTION_MULTIARCH (sample_node, sample_node_fn);
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */