vxlan:offload RX flow
[vpp.git] / src / plugins / dpdk / device / node.c
index b8fe834..602891d 100644 (file)
@@ -28,7 +28,7 @@
 
 #include <dpdk/device/dpdk_priv.h>
 
-#ifndef CLIB_MULTIARCH_VARIANT
+#ifndef CLIB_MARCH_VARIANT
 static char *dpdk_error_strings[] = {
 #define _(n,s) s,
   foreach_dpdk_error
@@ -213,40 +213,6 @@ poll_rate_limit (dpdk_main_t * dm)
       <code>xd->per_interface_next_index</code>
 */
 
-static_always_inline void
-dpdk_mbuf_to_buffer_index_x4 (vlib_main_t * vm, struct rte_mbuf **mb,
-                             u32 * buffers)
-{
-#ifdef CLIB_HAVE_VEC256
-  vlib_buffer_main_t *bm = &buffer_main;
-  u64x4 v = *(u64x4 *) mb;
-  u32x8 v2, mask = { 0, 2, 4, 6, 1, 3, 5, 7 };
-
-  /* load 4 pointers into 256-bit register */
-  v = u64x4_load_unaligned (mb);
-
-  /* vlib_buffer_t is straight after rte_mbuf so advance all 4
-     pointers for size of rte_mbuf */
-  v += u64x4_splat (sizeof (struct rte_mbuf));
-
-  /* calculate 4 buffer indices in paralled */
-  v = (v - u64x4_splat (bm->buffer_mem_start)) >> CLIB_LOG2_CACHE_LINE_BYTES;
-
-  /* permute 256-bit register so lower u32s of each buffer index are
-   * placed into lower 128-bits */
-  v2 = u32x8_permute ((u32x8) v, mask);
-
-  /* extract lower 128-bits and save them to the array of buffer indices */
-  u32x4_store_unaligned (u32x8_extract_lo (v2), buffers);
-#else
-  /* equivalent non-nector implementation */
-  buffers[0] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[0]));
-  buffers[1] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[1]));
-  buffers[2] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[2]));
-  buffers[3] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[3]));
-#endif
-}
-
 static_always_inline u8
 dpdk_ol_flags_extract (struct rte_mbuf **mb, u8 * flags, int count)
 {
@@ -308,19 +274,19 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd,
       vnet_buffer (b[0])->l2_hdr_offset = off;
       b[0]->current_data = off;
 
-      off = mb[0]->data_off;
+      off = mb[1]->data_off;
       next[1] = off + STRUCT_OFFSET_OF (ethernet_header_t, type);
       off -= RTE_PKTMBUF_HEADROOM;
       vnet_buffer (b[1])->l2_hdr_offset = off;
       b[1]->current_data = off;
 
-      off = mb[0]->data_off;
+      off = mb[2]->data_off;
       next[2] = off + STRUCT_OFFSET_OF (ethernet_header_t, type);
       off -= RTE_PKTMBUF_HEADROOM;
       vnet_buffer (b[2])->l2_hdr_offset = off;
       b[2]->current_data = off;
 
-      off = mb[0]->data_off;
+      off = mb[3]->data_off;
       next[3] = off + STRUCT_OFFSET_OF (ethernet_header_t, type);
       off -= RTE_PKTMBUF_HEADROOM;
       vnet_buffer (b[3])->l2_hdr_offset = off;
@@ -449,6 +415,40 @@ dpdk_set_next_from_etype (vlib_main_t * vm, vlib_node_runtime_t * node,
     }
 }
 
+static_always_inline void
+dpdk_process_flow_offload (dpdk_device_t * xd, dpdk_per_thread_data_t * ptd,
+                          uword n_rx_packets)
+{
+  uword n;
+  dpdk_flow_lookup_entry_t *fle;
+  vlib_buffer_t *b0;
+
+  /* TODO prefetch and quad-loop */
+  for (n = 0; n < n_rx_packets; n++)
+    {
+      if ((ptd->flags[n] & (1 << DPDK_RX_F_FDIR)) == 0)
+       continue;
+
+      fle = vec_elt_at_index (xd->flow_lookup_entries,
+                             ptd->mbufs[n]->hash.fdir.hi);
+
+      if (fle->next_index != (u16) ~ 0)
+       ptd->next[n] = fle->next_index;
+
+      if (fle->flow_id != ~0)
+       {
+         b0 = vlib_buffer_from_rte_mbuf (ptd->mbufs[n]);
+         b0->flow_id = fle->flow_id;
+       }
+
+      if (fle->buffer_advance != ~0)
+       {
+         b0 = vlib_buffer_from_rte_mbuf (ptd->mbufs[n]);
+         vlib_buffer_advance (b0, fle->buffer_advance);
+       }
+    }
+}
+
 static_always_inline u32
 dpdk_device_input (vlib_main_t * vm, dpdk_main_t * dm, dpdk_device_t * xd,
                   vlib_node_runtime_t * node, u32 thread_index, u16 queue_id)
@@ -474,7 +474,7 @@ dpdk_device_input (vlib_main_t * vm, dpdk_main_t * dm, dpdk_device_t * xd,
   /* get up to DPDK_RX_BURST_SZ buffers from PMD */
   while (n_rx_packets < DPDK_RX_BURST_SZ)
     {
-      n = rte_eth_rx_burst (xd->device_index, queue_id,
+      n = rte_eth_rx_burst (xd->port_id, queue_id,
                            ptd->mbufs + n_rx_packets,
                            DPDK_RX_BURST_SZ - n_rx_packets);
       n_rx_packets += n;
@@ -500,7 +500,7 @@ dpdk_device_input (vlib_main_t * vm, dpdk_main_t * dm, dpdk_device_t * xd,
       next_index = xd->per_interface_next_index;
     }
 
-  /* as all packets belong to thr same interface feature arc lookup
+  /* as all packets belong to the same interface feature arc lookup
      can be don once and result stored in the buffer template */
   if (PREDICT_FALSE (vnet_device_input_have_features (xd->sw_if_index)))
     {
@@ -524,6 +524,12 @@ dpdk_device_input (vlib_main_t * vm, dpdk_main_t * dm, dpdk_device_t * xd,
   else
     dpdk_set_next_from_etype (vm, node, ptd, n_rx_packets);
 
+  /* flow offload - process if rx flow offload enabled and at least one packet
+     is marked */
+  if (PREDICT_FALSE ((xd->flags & DPDK_DEVICE_FLAG_RX_FLOW_OFFLOAD) &&
+                    (or_flags & (1 << DPDK_RX_F_FDIR))))
+    dpdk_process_flow_offload (xd, ptd, n_rx_packets);
+
   /* is at least one packet marked as ip4 checksum bad? */
   if (PREDICT_FALSE (or_flags & (1 << DPDK_RX_F_CKSUM_BAD)))
     for (n = 0; n < n_rx_packets; n++)
@@ -539,91 +545,15 @@ dpdk_device_input (vlib_main_t * vm, dpdk_main_t * dm, dpdk_device_t * xd,
       }
 
   /* enqueue buffers to the next node */
-  n_left = n_rx_packets;
-  next = ptd->next;
-  buffers = ptd->buffers;
-  mb = ptd->mbufs;
-  while (n_left)
-    {
-      u32 n_left_to_next;
-      u32 *to_next;
-      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
-#ifdef CLIB_HAVE_VEC256
-      while (n_left >= 16 && n_left_to_next >= 16)
-       {
-         u16x16 next16 = u16x16_load_unaligned (next);
-         if (u16x16_is_all_equal (next16, next_index))
-           {
-             dpdk_mbuf_to_buffer_index_x4 (vm, mb, buffers);
-             dpdk_mbuf_to_buffer_index_x4 (vm, mb + 4, buffers + 4);
-             dpdk_mbuf_to_buffer_index_x4 (vm, mb + 8, buffers + 8);
-             dpdk_mbuf_to_buffer_index_x4 (vm, mb + 12, buffers + 12);
-             clib_memcpy (to_next, buffers, 16 * sizeof (u32));
-             to_next += 16;
-             n_left_to_next -= 16;
-             buffers += 16;
-             n_left -= 16;
-             next += 16;
-             mb += 16;
-           }
-         else
-           {
-             dpdk_mbuf_to_buffer_index_x4 (vm, mb, buffers);
-             clib_memcpy (to_next, buffers, 4 * sizeof (u32));
-             to_next += 4;
-             n_left_to_next -= 4;
-
-             vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next,
-                                              n_left_to_next, buffers[0],
-                                              buffers[1], buffers[2],
-                                              buffers[3], next[0], next[1],
-                                              next[2], next[3]);
-             /* next */
-             buffers += 4;
-             n_left -= 4;
-             next += 4;
-             mb += 4;
-           }
-       }
-#endif
-      while (n_left >= 4 && n_left_to_next >= 4)
-       {
-         dpdk_mbuf_to_buffer_index_x4 (vm, mb, buffers);
-         clib_memcpy (to_next, buffers, 4 * sizeof (u32));
-         to_next += 4;
-         n_left_to_next -= 4;
-
-         vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next,
-                                          n_left_to_next, buffers[0],
-                                          buffers[1], buffers[2], buffers[3],
-                                          next[0], next[1], next[2],
-                                          next[3]);
-         /* next */
-         buffers += 4;
-         n_left -= 4;
-         next += 4;
-         mb += 4;
-       }
-      while (n_left && n_left_to_next)
-       {
-         to_next[0] = buffers[0] =
-           vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[0]));
-         to_next += 1;
-         n_left_to_next -= 1;
-         vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
-                                          n_left_to_next, buffers[0],
-                                          next[0]);
-         /* next */
-         buffers += 1;
-         n_left -= 1;
-         next += 1;
-         mb += 1;
-       }
-      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
-    }
+  vlib_get_buffer_indices_with_offset (vm, (void **) ptd->mbufs, ptd->buffers,
+                                      n_rx_packets,
+                                      sizeof (struct rte_mbuf));
+
+  vlib_buffer_enqueue_to_next (vm, node, ptd->buffers, ptd->next,
+                              n_rx_packets);
 
   /* packet trace if enabled */
-  if ((n_trace = vlib_get_trace_count (vm, node)))
+  if (PREDICT_FALSE ((n_trace = vlib_get_trace_count (vm, node))))
     {
       n_left = n_rx_packets;
       buffers = ptd->buffers;
@@ -654,6 +584,48 @@ dpdk_device_input (vlib_main_t * vm, dpdk_main_t * dm, dpdk_device_t * xd,
       vlib_set_trace_count (vm, node, n_trace);
     }
 
+  /* rx pcap capture if enabled */
+  if (PREDICT_FALSE (dm->pcap[VLIB_RX].pcap_enable))
+    {
+      u32 bi0;
+      n_left = n_rx_packets;
+      buffers = ptd->buffers;
+      while (n_left)
+       {
+         bi0 = buffers[0];
+         b0 = vlib_get_buffer (vm, bi0);
+         buffers++;
+
+         if (dm->pcap[VLIB_RX].pcap_sw_if_index == 0 ||
+             dm->pcap[VLIB_RX].pcap_sw_if_index
+             == vnet_buffer (b0)->sw_if_index[VLIB_RX])
+           {
+             struct rte_mbuf *mb;
+             i16 data_start;
+             i32 temp_advance;
+
+             /*
+              * Note: current_data will have advanced
+              * when we skip ethernet input.
+              * Temporarily back up to the original DMA
+              * target, so we capture a valid ethernet frame
+              */
+             mb = rte_mbuf_from_vlib_buffer (b0);
+
+             /* Figure out the original data_start */
+             data_start = (mb->buf_addr + mb->data_off) - (void *) b0->data;
+             /* Back up that far */
+             temp_advance = b0->current_data - data_start;
+             vlib_buffer_advance (b0, -temp_advance);
+             /* Trace the packet */
+             pcap_add_buffer (&dm->pcap[VLIB_RX].pcap_main, vm, bi0, 512);
+             /* and advance again */
+             vlib_buffer_advance (b0, temp_advance);
+           }
+         n_left--;
+       }
+    }
+
   vlib_increment_combined_counter
     (vnet_get_main ()->interface_main.combined_sw_if_counters
      + VNET_INTERFACE_COUNTER_RX, thread_index, xd->sw_if_index,
@@ -664,8 +636,7 @@ dpdk_device_input (vlib_main_t * vm, dpdk_main_t * dm, dpdk_device_t * xd,
   return n_rx_packets;
 }
 
-uword CLIB_CPU_OPTIMIZED
-CLIB_MULTIARCH_FN (dpdk_input) (vlib_main_t * vm, vlib_node_runtime_t * node,
+VLIB_NODE_FN (dpdk_input_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
                                vlib_frame_t * f)
 {
   dpdk_main_t *dm = &dpdk_main;
@@ -694,10 +665,9 @@ CLIB_MULTIARCH_FN (dpdk_input) (vlib_main_t * vm, vlib_node_runtime_t * node,
   return n_rx_packets;
 }
 
-#ifndef CLIB_MULTIARCH_VARIANT
+#ifndef CLIB_MARCH_VARIANT
 /* *INDENT-OFF* */
 VLIB_REGISTER_NODE (dpdk_input_node) = {
-  .function = dpdk_input,
   .type = VLIB_NODE_TYPE_INPUT,
   .name = "dpdk-input",
   .sibling_of = "device-input",
@@ -712,20 +682,6 @@ VLIB_REGISTER_NODE (dpdk_input_node) = {
   .error_strings = dpdk_error_strings,
 };
 /* *INDENT-ON* */
-
-vlib_node_function_t __clib_weak dpdk_input_avx512;
-vlib_node_function_t __clib_weak dpdk_input_avx2;
-
-#if __x86_64__
-static void __clib_constructor
-dpdk_input_multiarch_select (void)
-{
-  if (dpdk_input_avx512 && clib_cpu_supports_avx512f ())
-    dpdk_input_node.function = dpdk_input_avx512;
-  else if (dpdk_input_avx2 && clib_cpu_supports_avx2 ())
-    dpdk_input_node.function = dpdk_input_avx2;
-}
-#endif
 #endif
 
 /*