+ sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+
+ /* Send pkt back out the RX interface */
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
+
+ if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ sample_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ clib_memcpy (t->new_src_mac, en0->src_address,
+ sizeof (t->new_src_mac));
+ clib_memcpy (t->new_dst_mac, en0->dst_address,
+ sizeof (t->new_dst_mac));
+ }
+
+ pkts_swapped += 1;
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, sample_node.index,
+ SAMPLE_ERROR_SWAPPED, pkts_swapped);
+ return frame->n_vectors;
+}
+#endif
+
+/*
+ * This version swaps mac addresses using an MMX vector shuffle
+ * Node costs about 17 clocks/pkt at a vector size of 26
+ */
+#ifdef VERSION_2
+static uword
+sample_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * frame)
+{
+ u32 n_left_from, *from, *to_next;
+ sample_next_t next_index;
+ u32 pkts_swapped = 0;
+ /* Vector shuffle mask to swap src, dst */
+ u8x16 swapmac = { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 };
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
+ u32 next1 = SAMPLE_NEXT_INTERFACE_OUTPUT;
+ u32 sw_if_index0, sw_if_index1;
+ u8x16 src_dst0, src_dst1;
+ ethernet_header_t *en0, *en1;
+ u32 bi0, bi1;
+ vlib_buffer_t *b0, *b1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t *p2, *p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ ASSERT (b0->current_data == 0);
+ ASSERT (b1->current_data == 0);
+
+ en0 = vlib_buffer_get_current (b0);
+ en1 = vlib_buffer_get_current (b1);
+
+ src_dst0 = ((u8x16 *) en0)[0];
+ src_dst1 = ((u8x16 *) en1)[0];
+ src_dst0 = u8x16_shuffle (src_dst0, swapmac);
+ src_dst1 = u8x16_shuffle (src_dst1, swapmac);
+ ((u8x16 *) en0)[0] = src_dst0;
+ ((u8x16 *) en1)[0] = src_dst1;
+
+ sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];
+
+ /* Send pkt back out the RX interface */
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
+ vnet_buffer (b1)->sw_if_index[VLIB_TX] = sw_if_index1;
+
+ pkts_swapped += 2;
+
+ if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ sample_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ clib_memcpy (t->new_src_mac, en0->src_address,
+ sizeof (t->new_src_mac));
+ clib_memcpy (t->new_dst_mac, en0->dst_address,
+ sizeof (t->new_dst_mac));
+
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ sample_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ t->next_index = next1;
+ clib_memcpy (t->new_src_mac, en1->src_address,
+ sizeof (t->new_src_mac));
+ clib_memcpy (t->new_dst_mac, en1->dst_address,
+ sizeof (t->new_dst_mac));
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t *b0;
+ u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
+ u32 sw_if_index0;
+ u8x16 src_dst0;
+ ethernet_header_t *en0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ /*
+ * Direct from the driver, we should be at offset 0
+ * aka at &b0->data[0]
+ */
+ ASSERT (b0->current_data == 0);
+
+ en0 = vlib_buffer_get_current (b0);
+ src_dst0 = ((u8x16 *) en0)[0];
+ src_dst0 = u8x16_shuffle (src_dst0, swapmac);
+ ((u8x16 *) en0)[0] = src_dst0;
+
+ sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+
+ /* Send pkt back out the RX interface */
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
+
+ if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ sample_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ clib_memcpy (t->new_src_mac, en0->src_address,
+ sizeof (t->new_src_mac));
+ clib_memcpy (t->new_dst_mac, en0->dst_address,
+ sizeof (t->new_dst_mac));
+ }
+
+ pkts_swapped += 1;
+
+ /* verify speculative enqueue, maybe switch current next frame */