From 9594b56e6f795751cfef562920ded7874376e91c Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Wed, 25 Jul 2018 16:56:38 -0400 Subject: [PATCH] Improve the sample plugin node dispatch function Three separate implementations, which vary by nearly a factor of two in performance. Most of the performance difference is due to swapping the src/dst mac addresses with an avx2 vector shuffle instruction. Change-Id: Ieb36546d6074e4ac720d452a99d013c698135c57 Signed-off-by: Dave Barach --- src/examples/sample-plugin/sample/node.c | 601 ++++++++++++++++++++++++------- 1 file changed, 473 insertions(+), 128 deletions(-) diff --git a/src/examples/sample-plugin/sample/node.c b/src/examples/sample-plugin/sample/node.c index 94c1706b20a..1b512090469 100644 --- a/src/examples/sample-plugin/sample/node.c +++ b/src/examples/sample-plugin/sample/node.c @@ -18,7 +18,8 @@ #include #include -typedef struct { +typedef struct +{ u32 next_index; u32 sw_if_index; u8 new_src_mac[6]; @@ -34,17 +35,18 @@ format_mac_address (u8 * s, va_list * args) } /* packet trace format function */ -static u8 * format_sample_trace (u8 * s, va_list * args) +static u8 * +format_sample_trace (u8 * s, va_list * args) { CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); - sample_trace_t * t = va_arg (*args, sample_trace_t *); - + sample_trace_t *t = va_arg (*args, sample_trace_t *); + s = format (s, "SAMPLE: sw_if_index %d, next index %d\n", - t->sw_if_index, t->next_index); + t->sw_if_index, t->next_index); s = format (s, " new src %U -> new dst %U", - format_mac_address, t->new_src_mac, - format_mac_address, t->new_dst_mac); + format_mac_address, t->new_src_mac, + format_mac_address, t->new_dst_mac); return s; } @@ -54,24 +56,35 @@ vlib_node_registration_t sample_node; #define foreach_sample_error \ _(SWAPPED, "Mac swap packets processed") -typedef enum { +typedef enum +{ #define _(sym,str) SAMPLE_ERROR_##sym, foreach_sample_error #undef _ - SAMPLE_N_ERROR, + SAMPLE_N_ERROR, } sample_error_t; -static char * sample_error_strings[] = { +static char *sample_error_strings[] = { #define _(sym,string) string, foreach_sample_error #undef _ }; -typedef enum { +typedef enum +{ SAMPLE_NEXT_INTERFACE_OUTPUT, SAMPLE_N_NEXT, } sample_next_t; +/* + * Simple dual/single loop version, default version which will compile + * everywhere. + * + * Node costs 30 clocks/pkt at a vector size of 51 + */ +#define VERSION_1 1 + +#ifdef VERSION_1 #define foreach_mac_address_offset \ _(0) \ _(1) \ @@ -82,10 +95,9 @@ _(5) static uword sample_node_fn (vlib_main_t * vm, - vlib_node_runtime_t * node, - vlib_frame_t * frame) + vlib_node_runtime_t * node, vlib_frame_t * frame) { - u32 n_left_from, * from, * to_next; + u32 n_left_from, *from, *to_next; sample_next_t next_index; u32 pkts_swapped = 0; @@ -97,26 +109,25 @@ sample_node_fn (vlib_main_t * vm, { u32 n_left_to_next; - vlib_get_next_frame (vm, node, next_index, - to_next, n_left_to_next); + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); while (n_left_from >= 4 && n_left_to_next >= 2) { - u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT; - u32 next1 = SAMPLE_NEXT_INTERFACE_OUTPUT; - u32 sw_if_index0, sw_if_index1; - u8 tmp0[6], tmp1[6]; - ethernet_header_t *en0, *en1; - u32 bi0, bi1; - vlib_buffer_t * b0, * b1; - + u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT; + u32 next1 = SAMPLE_NEXT_INTERFACE_OUTPUT; + u32 sw_if_index0, sw_if_index1; + u8 tmp0[6], tmp1[6]; + ethernet_header_t *en0, *en1; + u32 bi0, bi1; + vlib_buffer_t *b0, *b1; + /* Prefetch next iteration. */ { - vlib_buffer_t * p2, * p3; - + vlib_buffer_t *p2, *p3; + p2 = vlib_get_buffer (vm, from[2]); p3 = vlib_get_buffer (vm, from[3]); - + vlib_prefetch_buffer_header (p2, LOAD); vlib_prefetch_buffer_header (p3, LOAD); @@ -124,7 +135,7 @@ sample_node_fn (vlib_main_t * vm, CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); } - /* speculatively enqueue b0 and b1 to the current next frame */ + /* speculatively enqueue b0 and b1 to the current next frame */ to_next[0] = bi0 = from[0]; to_next[1] = bi1 = from[1]; from += 2; @@ -135,87 +146,85 @@ sample_node_fn (vlib_main_t * vm, b0 = vlib_get_buffer (vm, bi0); b1 = vlib_get_buffer (vm, bi1); - ASSERT (b0->current_data == 0); - ASSERT (b1->current_data == 0); - - en0 = vlib_buffer_get_current (b0); - en1 = vlib_buffer_get_current (b1); + ASSERT (b0->current_data == 0); + ASSERT (b1->current_data == 0); + + en0 = vlib_buffer_get_current (b0); + en1 = vlib_buffer_get_current (b1); - /* This is not the fastest way to swap src + dst mac addresses */ + /* This is not the fastest way to swap src + dst mac addresses */ #define _(a) tmp0[a] = en0->src_address[a]; - foreach_mac_address_offset; + foreach_mac_address_offset; #undef _ #define _(a) en0->src_address[a] = en0->dst_address[a]; - foreach_mac_address_offset; + foreach_mac_address_offset; #undef _ #define _(a) en0->dst_address[a] = tmp0[a]; - foreach_mac_address_offset; + foreach_mac_address_offset; #undef _ #define _(a) tmp1[a] = en1->src_address[a]; - foreach_mac_address_offset; + foreach_mac_address_offset; #undef _ #define _(a) en1->src_address[a] = en1->dst_address[a]; - foreach_mac_address_offset; + foreach_mac_address_offset; #undef _ #define _(a) en1->dst_address[a] = tmp1[a]; - foreach_mac_address_offset; + foreach_mac_address_offset; #undef _ - - - sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; - sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX]; - - /* Send pkt back out the RX interface */ - vnet_buffer(b0)->sw_if_index[VLIB_TX] = sw_if_index0; - vnet_buffer(b1)->sw_if_index[VLIB_TX] = sw_if_index1; - - pkts_swapped += 2; - - if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE))) - { - if (b0->flags & VLIB_BUFFER_IS_TRACED) - { - sample_trace_t *t = - vlib_add_trace (vm, node, b0, sizeof (*t)); - t->sw_if_index = sw_if_index0; - t->next_index = next0; - clib_memcpy (t->new_src_mac, en0->src_address, - sizeof (t->new_src_mac)); - clib_memcpy (t->new_dst_mac, en0->dst_address, - sizeof (t->new_dst_mac)); - - } - if (b1->flags & VLIB_BUFFER_IS_TRACED) - { - sample_trace_t *t = - vlib_add_trace (vm, node, b1, sizeof (*t)); - t->sw_if_index = sw_if_index1; - t->next_index = next1; - clib_memcpy (t->new_src_mac, en1->src_address, - sizeof (t->new_src_mac)); - clib_memcpy (t->new_dst_mac, en1->dst_address, - sizeof (t->new_dst_mac)); - } - } - - /* verify speculative enqueues, maybe switch current next frame */ - vlib_validate_buffer_enqueue_x2 (vm, node, next_index, - to_next, n_left_to_next, - bi0, bi1, next0, next1); - } + sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX]; + + /* Send pkt back out the RX interface */ + vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0; + vnet_buffer (b1)->sw_if_index[VLIB_TX] = sw_if_index1; + + pkts_swapped += 2; + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE))) + { + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + sample_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = next0; + clib_memcpy (t->new_src_mac, en0->src_address, + sizeof (t->new_src_mac)); + clib_memcpy (t->new_dst_mac, en0->dst_address, + sizeof (t->new_dst_mac)); + + } + if (b1->flags & VLIB_BUFFER_IS_TRACED) + { + sample_trace_t *t = + vlib_add_trace (vm, node, b1, sizeof (*t)); + t->sw_if_index = sw_if_index1; + t->next_index = next1; + clib_memcpy (t->new_src_mac, en1->src_address, + sizeof (t->new_src_mac)); + clib_memcpy (t->new_dst_mac, en1->dst_address, + sizeof (t->new_dst_mac)); + } + } + + /* verify speculative enqueues, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } while (n_left_from > 0 && n_left_to_next > 0) { - u32 bi0; - vlib_buffer_t * b0; - u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT; - u32 sw_if_index0; - u8 tmp0[6]; - ethernet_header_t *en0; - - /* speculatively enqueue b0 to the current next frame */ + u32 bi0; + vlib_buffer_t *b0; + u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT; + u32 sw_if_index0; + u8 tmp0[6]; + ethernet_header_t *en0; + + /* speculatively enqueue b0 to the current next frame */ bi0 = from[0]; to_next[0] = bi0; from += 1; @@ -224,45 +233,222 @@ sample_node_fn (vlib_main_t * vm, n_left_to_next -= 1; b0 = vlib_get_buffer (vm, bi0); - /* - * Direct from the driver, we should be at offset 0 - * aka at &b0->data[0] - */ - ASSERT (b0->current_data == 0); - - en0 = vlib_buffer_get_current (b0); - - /* This is not the fastest way to swap src + dst mac addresses */ + /* + * Direct from the driver, we should be at offset 0 + * aka at &b0->data[0] + */ + ASSERT (b0->current_data == 0); + + en0 = vlib_buffer_get_current (b0); + + /* This is not the fastest way to swap src + dst mac addresses */ #define _(a) tmp0[a] = en0->src_address[a]; - foreach_mac_address_offset; + foreach_mac_address_offset; #undef _ #define _(a) en0->src_address[a] = en0->dst_address[a]; - foreach_mac_address_offset; + foreach_mac_address_offset; #undef _ #define _(a) en0->dst_address[a] = tmp0[a]; - foreach_mac_address_offset; + foreach_mac_address_offset; #undef _ - sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; - - /* Send pkt back out the RX interface */ - vnet_buffer(b0)->sw_if_index[VLIB_TX] = sw_if_index0; - - if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) - && (b0->flags & VLIB_BUFFER_IS_TRACED))) { - sample_trace_t *t = - vlib_add_trace (vm, node, b0, sizeof (*t)); - t->sw_if_index = sw_if_index0; - t->next_index = next0; - clib_memcpy (t->new_src_mac, en0->src_address, - sizeof (t->new_src_mac)); - clib_memcpy (t->new_dst_mac, en0->dst_address, - sizeof (t->new_dst_mac)); - } - - pkts_swapped += 1; - - /* verify speculative enqueue, maybe switch current next frame */ + sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + + /* Send pkt back out the RX interface */ + vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0; + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + sample_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = next0; + clib_memcpy (t->new_src_mac, en0->src_address, + sizeof (t->new_src_mac)); + clib_memcpy (t->new_dst_mac, en0->dst_address, + sizeof (t->new_dst_mac)); + } + + pkts_swapped += 1; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, sample_node.index, + SAMPLE_ERROR_SWAPPED, pkts_swapped); + return frame->n_vectors; +} +#endif + +/* + * This version swaps mac addresses using an MMX vector shuffle + * Node costs about 17 clocks/pkt at a vector size of 26 + */ +#ifdef VERSION_2 +static uword +sample_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + u32 n_left_from, *from, *to_next; + sample_next_t next_index; + u32 pkts_swapped = 0; + /* Vector shuffle mask to swap src, dst */ + u8x16 swapmac = { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 }; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT; + u32 next1 = SAMPLE_NEXT_INTERFACE_OUTPUT; + u32 sw_if_index0, sw_if_index1; + u8x16 src_dst0, src_dst1; + ethernet_header_t *en0, *en1; + u32 bi0, bi1; + vlib_buffer_t *b0, *b1; + + /* Prefetch next iteration. */ + { + vlib_buffer_t *p2, *p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + /* speculatively enqueue b0 and b1 to the current next frame */ + to_next[0] = bi0 = from[0]; + to_next[1] = bi1 = from[1]; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + ASSERT (b0->current_data == 0); + ASSERT (b1->current_data == 0); + + en0 = vlib_buffer_get_current (b0); + en1 = vlib_buffer_get_current (b1); + + src_dst0 = ((u8x16 *) en0)[0]; + src_dst1 = ((u8x16 *) en1)[0]; + src_dst0 = u8x16_shuffle (src_dst0, swapmac); + src_dst1 = u8x16_shuffle (src_dst1, swapmac); + ((u8x16 *) en0)[0] = src_dst0; + ((u8x16 *) en1)[0] = src_dst1; + + sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX]; + + /* Send pkt back out the RX interface */ + vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0; + vnet_buffer (b1)->sw_if_index[VLIB_TX] = sw_if_index1; + + pkts_swapped += 2; + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE))) + { + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + sample_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = next0; + clib_memcpy (t->new_src_mac, en0->src_address, + sizeof (t->new_src_mac)); + clib_memcpy (t->new_dst_mac, en0->dst_address, + sizeof (t->new_dst_mac)); + + } + if (b1->flags & VLIB_BUFFER_IS_TRACED) + { + sample_trace_t *t = + vlib_add_trace (vm, node, b1, sizeof (*t)); + t->sw_if_index = sw_if_index1; + t->next_index = next1; + clib_memcpy (t->new_src_mac, en1->src_address, + sizeof (t->new_src_mac)); + clib_memcpy (t->new_dst_mac, en1->dst_address, + sizeof (t->new_dst_mac)); + } + } + + /* verify speculative enqueues, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT; + u32 sw_if_index0; + u8x16 src_dst0; + ethernet_header_t *en0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + /* + * Direct from the driver, we should be at offset 0 + * aka at &b0->data[0] + */ + ASSERT (b0->current_data == 0); + + en0 = vlib_buffer_get_current (b0); + src_dst0 = ((u8x16 *) en0)[0]; + src_dst0 = u8x16_shuffle (src_dst0, swapmac); + ((u8x16 *) en0)[0] = src_dst0; + + sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + + /* Send pkt back out the RX interface */ + vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0; + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + sample_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = next0; + clib_memcpy (t->new_src_mac, en0->src_address, + sizeof (t->new_src_mac)); + clib_memcpy (t->new_dst_mac, en0->dst_address, + sizeof (t->new_dst_mac)); + } + + pkts_swapped += 1; + + /* verify speculative enqueue, maybe switch current next frame */ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, bi0, next0); @@ -271,18 +457,166 @@ sample_node_fn (vlib_main_t * vm, vlib_put_next_frame (vm, node, next_index, n_left_to_next); } - vlib_node_increment_counter (vm, sample_node.index, - SAMPLE_ERROR_SWAPPED, pkts_swapped); + vlib_node_increment_counter (vm, sample_node.index, + SAMPLE_ERROR_SWAPPED, pkts_swapped); return frame->n_vectors; } +#endif -VLIB_REGISTER_NODE (sample_node) = { + +/* + * This version computes all of the buffer pointers in + * one motion, uses a quad/single loop model, and + * traces the entire frame in one motion. + * + * Node costs about 16 clocks/pkt at a vector size of 26 + * + * Some compilation drama with u8x16_shuffle, so turned off by + * default. + */ + +#ifdef VERSION_3 + +#define u8x16_shuffle __builtin_shuffle +/* This would normally be a stack local, but since it's a constant... */ +static const u16 nexts[VLIB_FRAME_SIZE] = { 0 }; + +static uword +sample_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + u32 n_left_from, *from; + u32 pkts_swapped = 0; + /* Vector shuffle mask to swap src, dst */ + u8x16 swapmac = { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 }; + vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b; + /* See comment below about sending all pkts to the same place... */ + u16 *next __attribute__ ((unused)); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + + vlib_get_buffers (vm, from, bufs, n_left_from); + b = bufs; + // next = nexts; + + /* + * We send all pkts to SAMPLE_NEXT_INTERFACE_OUTPUT, aka + * graph arc 0. So the usual setting of next[0...3] is commented + * out below + */ + + while (n_left_from >= 4) + { + u8x16 src_dst0, src_dst1, src_dst2, src_dst3; + /* Prefetch next iteration. */ + if (PREDICT_TRUE (n_left_from >= 8)) + { + vlib_prefetch_buffer_header (b[4], STORE); + vlib_prefetch_buffer_header (b[5], STORE); + vlib_prefetch_buffer_header (b[6], STORE); + vlib_prefetch_buffer_header (b[7], STORE); + CLIB_PREFETCH (&b[4]->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (&b[5]->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (&b[6]->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (&b[7]->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + src_dst0 = ((u8x16 *) vlib_buffer_get_current (b[0]))[0]; + src_dst1 = ((u8x16 *) vlib_buffer_get_current (b[1]))[0]; + src_dst2 = ((u8x16 *) vlib_buffer_get_current (b[2]))[0]; + src_dst3 = ((u8x16 *) vlib_buffer_get_current (b[3]))[0]; + + src_dst0 = u8x16_shuffle (src_dst0, swapmac); + src_dst1 = u8x16_shuffle (src_dst1, swapmac); + src_dst2 = u8x16_shuffle (src_dst2, swapmac); + src_dst3 = u8x16_shuffle (src_dst3, swapmac); + + ((u8x16 *) vlib_buffer_get_current (b[0]))[0] = src_dst0; + ((u8x16 *) vlib_buffer_get_current (b[1]))[0] = src_dst1; + ((u8x16 *) vlib_buffer_get_current (b[2]))[0] = src_dst2; + ((u8x16 *) vlib_buffer_get_current (b[3]))[0] = src_dst3; + + vnet_buffer (b[0])->sw_if_index[VLIB_TX] = + vnet_buffer (b[0])->sw_if_index[VLIB_RX]; + vnet_buffer (b[1])->sw_if_index[VLIB_TX] = + vnet_buffer (b[1])->sw_if_index[VLIB_RX]; + vnet_buffer (b[2])->sw_if_index[VLIB_TX] = + vnet_buffer (b[2])->sw_if_index[VLIB_RX]; + vnet_buffer (b[3])->sw_if_index[VLIB_TX] = + vnet_buffer (b[3])->sw_if_index[VLIB_RX]; + + // next[0] = SAMPLE_NEXT_INTERFACE_OUTPUT; + // next[1] = SAMPLE_NEXT_INTERFACE_OUTPUT; + // next[2] = SAMPLE_NEXT_INTERFACE_OUTPUT; + // next[3] = SAMPLE_NEXT_INTERFACE_OUTPUT; + + b += 4; + // next += 4; + n_left_from -= 4; + pkts_swapped += 4; + } + + while (n_left_from > 0) + { + u8x16 src_dst0; + src_dst0 = ((u8x16 *) vlib_buffer_get_current (b[0]))[0]; + src_dst0 = u8x16_shuffle (src_dst0, swapmac); + ((u8x16 *) vlib_buffer_get_current (b[0]))[0] = src_dst0; + vnet_buffer (b[0])->sw_if_index[VLIB_TX] = + vnet_buffer (b[0])->sw_if_index[VLIB_RX]; + // next[0] = SAMPLE_NEXT_INTERFACE_OUTPUT; + + b += 1; + // next += 1; + n_left_from -= 1; + pkts_swapped += 1; + + } + vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts, + frame->n_vectors); + + vlib_node_increment_counter (vm, sample_node.index, + SAMPLE_ERROR_SWAPPED, pkts_swapped); + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE))) + { + int i; + b = bufs; + + for (i = 0; i < frame->n_vectors; i++) + { + if (b[0]->flags & VLIB_BUFFER_IS_TRACED) + { + ethernet_header_t *en; + sample_trace_t *t = + vlib_add_trace (vm, node, b[0], sizeof (*t)); + t->sw_if_index = vnet_buffer (b[0])->sw_if_index[VLIB_TX]; + t->next_index = SAMPLE_NEXT_INTERFACE_OUTPUT; + en = vlib_buffer_get_current (b[0]); + clib_memcpy (t->new_src_mac, en->src_address, + sizeof (t->new_src_mac)); + clib_memcpy (t->new_dst_mac, en->dst_address, + sizeof (t->new_dst_mac)); + b++; + } + else + break; + } + } + return frame->n_vectors; +} +#endif + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (sample_node) = +{ .function = sample_node_fn, .name = "sample", .vector_size = sizeof (u32), .format_trace = format_sample_trace, .type = VLIB_NODE_TYPE_INTERNAL, - + .n_errors = ARRAY_LEN(sample_error_strings), .error_strings = sample_error_strings, @@ -290,6 +624,17 @@ VLIB_REGISTER_NODE (sample_node) = { /* edit / add dispositions here */ .next_nodes = { - [SAMPLE_NEXT_INTERFACE_OUTPUT] = "interface-output", + [SAMPLE_NEXT_INTERFACE_OUTPUT] = "interface-output", }, }; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (sample_node, sample_node_fn); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ -- 2.16.6