/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 #include <vlib/vlib.h>
16 #include <vnet/vnet.h>
17 #include <vnet/pg/pg.h>
18 #include <vnet/ethernet/ethernet.h>
19 #include <vppinfra/error.h>
20 #include <sample/sample.h>
31 /* packet trace format function */
33 format_sample_trace (u8 * s, va_list * args)
35 CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
36 CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
37 sample_trace_t *t = va_arg (*args, sample_trace_t *);
39 s = format (s, "SAMPLE: sw_if_index %d, next index %d\n",
40 t->sw_if_index, t->next_index);
41 s = format (s, " new src %U -> new dst %U",
42 format_mac_address, t->new_src_mac,
43 format_mac_address, t->new_dst_mac);
48 extern vlib_node_registration_t sample_node;
50 #define foreach_sample_error \
51 _(SWAPPED, "Mac swap packets processed")
55 #define _(sym,str) SAMPLE_ERROR_##sym,
61 static char *sample_error_strings[] = {
62 #define _(sym,string) string,
69 SAMPLE_NEXT_INTERFACE_OUTPUT,
74 * Simple dual/single loop version, default version which will compile
77 * Node costs 30 clocks/pkt at a vector size of 51
82 #define foreach_mac_address_offset \
90 VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
93 u32 n_left_from, *from, *to_next;
94 sample_next_t next_index;
97 from = vlib_frame_vector_args (frame);
98 n_left_from = frame->n_vectors;
99 next_index = node->cached_next_index;
101 while (n_left_from > 0)
105 vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
107 while (n_left_from >= 4 && n_left_to_next >= 2)
109 u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
110 u32 next1 = SAMPLE_NEXT_INTERFACE_OUTPUT;
111 u32 sw_if_index0, sw_if_index1;
113 ethernet_header_t *en0, *en1;
115 vlib_buffer_t *b0, *b1;
117 /* Prefetch next iteration. */
119 vlib_buffer_t *p2, *p3;
121 p2 = vlib_get_buffer (vm, from[2]);
122 p3 = vlib_get_buffer (vm, from[3]);
124 vlib_prefetch_buffer_header (p2, LOAD);
125 vlib_prefetch_buffer_header (p3, LOAD);
127 CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
128 CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
131 /* speculatively enqueue b0 and b1 to the current next frame */
132 to_next[0] = bi0 = from[0];
133 to_next[1] = bi1 = from[1];
139 b0 = vlib_get_buffer (vm, bi0);
140 b1 = vlib_get_buffer (vm, bi1);
142 ASSERT (b0->current_data == 0);
143 ASSERT (b1->current_data == 0);
145 en0 = vlib_buffer_get_current (b0);
146 en1 = vlib_buffer_get_current (b1);
148 /* This is not the fastest way to swap src + dst mac addresses */
149 #define _(a) tmp0[a] = en0->src_address[a];
150 foreach_mac_address_offset;
152 #define _(a) en0->src_address[a] = en0->dst_address[a];
153 foreach_mac_address_offset;
155 #define _(a) en0->dst_address[a] = tmp0[a];
156 foreach_mac_address_offset;
159 #define _(a) tmp1[a] = en1->src_address[a];
160 foreach_mac_address_offset;
162 #define _(a) en1->src_address[a] = en1->dst_address[a];
163 foreach_mac_address_offset;
165 #define _(a) en1->dst_address[a] = tmp1[a];
166 foreach_mac_address_offset;
169 sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
170 sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];
172 /* Send pkt back out the RX interface */
173 vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
174 vnet_buffer (b1)->sw_if_index[VLIB_TX] = sw_if_index1;
178 if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
180 if (b0->flags & VLIB_BUFFER_IS_TRACED)
183 vlib_add_trace (vm, node, b0, sizeof (*t));
184 t->sw_if_index = sw_if_index0;
185 t->next_index = next0;
186 clib_memcpy_fast (t->new_src_mac, en0->src_address,
187 sizeof (t->new_src_mac));
188 clib_memcpy_fast (t->new_dst_mac, en0->dst_address,
189 sizeof (t->new_dst_mac));
192 if (b1->flags & VLIB_BUFFER_IS_TRACED)
195 vlib_add_trace (vm, node, b1, sizeof (*t));
196 t->sw_if_index = sw_if_index1;
197 t->next_index = next1;
198 clib_memcpy_fast (t->new_src_mac, en1->src_address,
199 sizeof (t->new_src_mac));
200 clib_memcpy_fast (t->new_dst_mac, en1->dst_address,
201 sizeof (t->new_dst_mac));
205 /* verify speculative enqueues, maybe switch current next frame */
206 vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
207 to_next, n_left_to_next,
208 bi0, bi1, next0, next1);
211 while (n_left_from > 0 && n_left_to_next > 0)
215 u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
218 ethernet_header_t *en0;
220 /* speculatively enqueue b0 to the current next frame */
228 b0 = vlib_get_buffer (vm, bi0);
230 * Direct from the driver, we should be at offset 0
231 * aka at &b0->data[0]
233 ASSERT (b0->current_data == 0);
235 en0 = vlib_buffer_get_current (b0);
237 /* This is not the fastest way to swap src + dst mac addresses */
238 #define _(a) tmp0[a] = en0->src_address[a];
239 foreach_mac_address_offset;
241 #define _(a) en0->src_address[a] = en0->dst_address[a];
242 foreach_mac_address_offset;
244 #define _(a) en0->dst_address[a] = tmp0[a];
245 foreach_mac_address_offset;
248 sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
250 /* Send pkt back out the RX interface */
251 vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
253 if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
254 && (b0->flags & VLIB_BUFFER_IS_TRACED)))
256 sample_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t));
257 t->sw_if_index = sw_if_index0;
258 t->next_index = next0;
259 clib_memcpy_fast (t->new_src_mac, en0->src_address,
260 sizeof (t->new_src_mac));
261 clib_memcpy_fast (t->new_dst_mac, en0->dst_address,
262 sizeof (t->new_dst_mac));
267 /* verify speculative enqueue, maybe switch current next frame */
268 vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
269 to_next, n_left_to_next,
273 vlib_put_next_frame (vm, node, next_index, n_left_to_next);
276 vlib_node_increment_counter (vm, sample_node.index,
277 SAMPLE_ERROR_SWAPPED, pkts_swapped);
278 return frame->n_vectors;
/*
 * sample_node dispatch, version 2 (guarded out by default so it does
 * not collide with the portable version above; define VERSION_2 and
 * undefine the others to use it).
 *
 * This version swaps mac addresses using an MMX vector shuffle
 * Node costs about 17 clocks/pkt at a vector size of 26
 */
#ifdef VERSION_2
VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
			    vlib_frame_t * frame)
{
  u32 n_left_from, *from, *to_next;
  sample_next_t next_index;
  u32 pkts_swapped = 0;
  /* Vector shuffle mask to swap src, dst */
  u8x16 swapmac = { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 };

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from >= 4 && n_left_to_next >= 2)
	{
	  u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
	  u32 next1 = SAMPLE_NEXT_INTERFACE_OUTPUT;
	  u32 sw_if_index0, sw_if_index1;
	  u8x16 src_dst0, src_dst1;
	  ethernet_header_t *en0, *en1;
	  u32 bi0, bi1;
	  vlib_buffer_t *b0, *b1;

	  /* Prefetch next iteration. */
	  {
	    vlib_buffer_t *p2, *p3;

	    p2 = vlib_get_buffer (vm, from[2]);
	    p3 = vlib_get_buffer (vm, from[3]);

	    vlib_prefetch_buffer_header (p2, LOAD);
	    vlib_prefetch_buffer_header (p3, LOAD);

	    CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
	    CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
	  }

	  /* speculatively enqueue b0 and b1 to the current next frame */
	  to_next[0] = bi0 = from[0];
	  to_next[1] = bi1 = from[1];
	  from += 2;
	  to_next += 2;
	  n_left_from -= 2;
	  n_left_to_next -= 2;

	  b0 = vlib_get_buffer (vm, bi0);
	  b1 = vlib_get_buffer (vm, bi1);

	  /* Direct from the driver: ethernet header expected at data[0] */
	  ASSERT (b0->current_data == 0);
	  ASSERT (b1->current_data == 0);

	  en0 = vlib_buffer_get_current (b0);
	  en1 = vlib_buffer_get_current (b1);

	  /* Swap dst(0-5)/src(6-11) in one shuffle; bytes 12-15 untouched */
	  src_dst0 = ((u8x16 *) en0)[0];
	  src_dst1 = ((u8x16 *) en1)[0];
	  src_dst0 = u8x16_shuffle (src_dst0, swapmac);
	  src_dst1 = u8x16_shuffle (src_dst1, swapmac);
	  ((u8x16 *) en0)[0] = src_dst0;
	  ((u8x16 *) en1)[0] = src_dst1;

	  sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
	  sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];

	  /* Send pkt back out the RX interface */
	  vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
	  vnet_buffer (b1)->sw_if_index[VLIB_TX] = sw_if_index1;

	  pkts_swapped += 2;

	  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
	    {
	      if (b0->flags & VLIB_BUFFER_IS_TRACED)
		{
		  sample_trace_t *t =
		    vlib_add_trace (vm, node, b0, sizeof (*t));
		  t->sw_if_index = sw_if_index0;
		  t->next_index = next0;
		  clib_memcpy_fast (t->new_src_mac, en0->src_address,
				    sizeof (t->new_src_mac));
		  clib_memcpy_fast (t->new_dst_mac, en0->dst_address,
				    sizeof (t->new_dst_mac));
		}
	      if (b1->flags & VLIB_BUFFER_IS_TRACED)
		{
		  sample_trace_t *t =
		    vlib_add_trace (vm, node, b1, sizeof (*t));
		  t->sw_if_index = sw_if_index1;
		  t->next_index = next1;
		  clib_memcpy_fast (t->new_src_mac, en1->src_address,
				    sizeof (t->new_src_mac));
		  clib_memcpy_fast (t->new_dst_mac, en1->dst_address,
				    sizeof (t->new_dst_mac));
		}
	    }

	  /* verify speculative enqueues, maybe switch current next frame */
	  vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
					   to_next, n_left_to_next,
					   bi0, bi1, next0, next1);
	}

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  u32 bi0;
	  vlib_buffer_t *b0;
	  u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
	  u32 sw_if_index0;
	  u8x16 src_dst0;
	  ethernet_header_t *en0;

	  /* speculatively enqueue b0 to the current next frame */
	  bi0 = from[0];
	  to_next[0] = bi0;
	  from += 1;
	  to_next += 1;
	  n_left_from -= 1;
	  n_left_to_next -= 1;

	  b0 = vlib_get_buffer (vm, bi0);
	  /*
	   * Direct from the driver, we should be at offset 0
	   * aka at &b0->data[0]
	   */
	  ASSERT (b0->current_data == 0);

	  en0 = vlib_buffer_get_current (b0);
	  src_dst0 = ((u8x16 *) en0)[0];
	  src_dst0 = u8x16_shuffle (src_dst0, swapmac);
	  ((u8x16 *) en0)[0] = src_dst0;

	  sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];

	  /* Send pkt back out the RX interface */
	  vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;

	  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
			     && (b0->flags & VLIB_BUFFER_IS_TRACED)))
	    {
	      sample_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t));
	      t->sw_if_index = sw_if_index0;
	      t->next_index = next0;
	      clib_memcpy_fast (t->new_src_mac, en0->src_address,
				sizeof (t->new_src_mac));
	      clib_memcpy_fast (t->new_dst_mac, en0->dst_address,
				sizeof (t->new_dst_mac));
	    }

	  pkts_swapped += 1;

	  /* verify speculative enqueue, maybe switch current next frame */
	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					   to_next, n_left_to_next,
					   bi0, next0);
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  vlib_node_increment_counter (vm, sample_node.index,
			       SAMPLE_ERROR_SWAPPED, pkts_swapped);
  return frame->n_vectors;
}
#endif /* VERSION_2 */
/*
 * sample_node dispatch, version 3 (guarded out by default; define
 * VERSION_3 and undefine the others to use it).
 *
 * This version computes all of the buffer pointers in
 * one motion, uses a quad/single loop model, and
 * traces the entire frame in one motion.
 *
 * Node costs about 16 clocks/pkt at a vector size of 26
 *
 * Some compilation drama with u8x16_shuffle, so turned off by
 * default.
 */
#ifdef VERSION_3
#define u8x16_shuffle __builtin_shuffle
/* This would normally be a stack local, but since it's a constant... */
static const u16 nexts[VLIB_FRAME_SIZE] = { 0 };

VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
			    vlib_frame_t * frame)
{
  u32 n_left_from, *from;
  u32 pkts_swapped = 0;
  /* Vector shuffle mask to swap src, dst */
  u8x16 swapmac = { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 };
  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  /* See comment below about sending all pkts to the same place... */
  u16 *next __attribute__ ((unused));

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  /* Translate the whole frame of buffer indices in one motion */
  vlib_get_buffers (vm, from, bufs, n_left_from);
  b = bufs;
  // next = nexts;

  /*
   * We send all pkts to SAMPLE_NEXT_INTERFACE_OUTPUT, aka
   * graph arc 0. So the usual setting of next[0...3] is commented
   * out below
   */

  while (n_left_from >= 4)
    {
      u8x16 src_dst0, src_dst1, src_dst2, src_dst3;
      /* Prefetch next iteration. */
      if (PREDICT_TRUE (n_left_from >= 8))
	{
	  vlib_prefetch_buffer_header (b[4], STORE);
	  vlib_prefetch_buffer_header (b[5], STORE);
	  vlib_prefetch_buffer_header (b[6], STORE);
	  vlib_prefetch_buffer_header (b[7], STORE);
	  CLIB_PREFETCH (&b[4]->data, CLIB_CACHE_LINE_BYTES, STORE);
	  CLIB_PREFETCH (&b[5]->data, CLIB_CACHE_LINE_BYTES, STORE);
	  CLIB_PREFETCH (&b[6]->data, CLIB_CACHE_LINE_BYTES, STORE);
	  CLIB_PREFETCH (&b[7]->data, CLIB_CACHE_LINE_BYTES, STORE);
	}

      src_dst0 = ((u8x16 *) vlib_buffer_get_current (b[0]))[0];
      src_dst1 = ((u8x16 *) vlib_buffer_get_current (b[1]))[0];
      src_dst2 = ((u8x16 *) vlib_buffer_get_current (b[2]))[0];
      src_dst3 = ((u8x16 *) vlib_buffer_get_current (b[3]))[0];

      src_dst0 = u8x16_shuffle (src_dst0, swapmac);
      src_dst1 = u8x16_shuffle (src_dst1, swapmac);
      src_dst2 = u8x16_shuffle (src_dst2, swapmac);
      src_dst3 = u8x16_shuffle (src_dst3, swapmac);

      ((u8x16 *) vlib_buffer_get_current (b[0]))[0] = src_dst0;
      ((u8x16 *) vlib_buffer_get_current (b[1]))[0] = src_dst1;
      ((u8x16 *) vlib_buffer_get_current (b[2]))[0] = src_dst2;
      ((u8x16 *) vlib_buffer_get_current (b[3]))[0] = src_dst3;

      /* Send pkts back out the RX interface */
      vnet_buffer (b[0])->sw_if_index[VLIB_TX] =
	vnet_buffer (b[0])->sw_if_index[VLIB_RX];
      vnet_buffer (b[1])->sw_if_index[VLIB_TX] =
	vnet_buffer (b[1])->sw_if_index[VLIB_RX];
      vnet_buffer (b[2])->sw_if_index[VLIB_TX] =
	vnet_buffer (b[2])->sw_if_index[VLIB_RX];
      vnet_buffer (b[3])->sw_if_index[VLIB_TX] =
	vnet_buffer (b[3])->sw_if_index[VLIB_RX];

      // next[0] = SAMPLE_NEXT_INTERFACE_OUTPUT;
      // next[1] = SAMPLE_NEXT_INTERFACE_OUTPUT;
      // next[2] = SAMPLE_NEXT_INTERFACE_OUTPUT;
      // next[3] = SAMPLE_NEXT_INTERFACE_OUTPUT;

      b += 4;
      // next += 4;
      n_left_from -= 4;
      pkts_swapped += 4;
    }

  while (n_left_from > 0)
    {
      u8x16 src_dst0;
      src_dst0 = ((u8x16 *) vlib_buffer_get_current (b[0]))[0];
      src_dst0 = u8x16_shuffle (src_dst0, swapmac);
      ((u8x16 *) vlib_buffer_get_current (b[0]))[0] = src_dst0;
      vnet_buffer (b[0])->sw_if_index[VLIB_TX] =
	vnet_buffer (b[0])->sw_if_index[VLIB_RX];
      // next[0] = SAMPLE_NEXT_INTERFACE_OUTPUT;

      b += 1;
      // next += 1;
      n_left_from -= 1;
      pkts_swapped += 1;
    }

  /* Enqueue the whole frame down arc 0 in one motion */
  vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
			       frame->n_vectors);

  vlib_node_increment_counter (vm, sample_node.index,
			       SAMPLE_ERROR_SWAPPED, pkts_swapped);

  /* Trace the entire frame in one motion, after the swap */
  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
    {
      int i;
      b = bufs;

      for (i = 0; i < frame->n_vectors; i++)
	{
	  if (b[0]->flags & VLIB_BUFFER_IS_TRACED)
	    {
	      ethernet_header_t *en;
	      sample_trace_t *t =
		vlib_add_trace (vm, node, b[0], sizeof (*t));
	      t->sw_if_index = vnet_buffer (b[0])->sw_if_index[VLIB_TX];
	      t->next_index = SAMPLE_NEXT_INTERFACE_OUTPUT;
	      en = vlib_buffer_get_current (b[0]);
	      clib_memcpy_fast (t->new_src_mac, en->src_address,
				sizeof (t->new_src_mac));
	      clib_memcpy_fast (t->new_dst_mac, en->dst_address,
				sizeof (t->new_dst_mac));
	    }
	  b++;
	}
    }

  return frame->n_vectors;
}
#endif /* VERSION_3 */
603 VLIB_REGISTER_NODE (sample_node) =
606 .vector_size = sizeof (u32),
607 .format_trace = format_sample_trace,
608 .type = VLIB_NODE_TYPE_INTERNAL,
610 .n_errors = ARRAY_LEN(sample_error_strings),
611 .error_strings = sample_error_strings,
613 .n_next_nodes = SAMPLE_N_NEXT,
615 /* edit / add dispositions here */
617 [SAMPLE_NEXT_INTERFACE_OUTPUT] = "interface-output",
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */