Add vlib_buffer_enqueue_to_next inline function
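This commit replaces the open-coded enqueue path in the dpdk input node with
two generic helpers: vlib_get_buffer_indices_with_offset, which translates a
vector of rte_mbuf pointers into vlib buffer indices in one call, and
vlib_buffer_enqueue_to_next, which coalesces a vector of buffers into
next-node frames. A minimal sketch of the resulting hand-off, using the same
per-thread data fields (mbufs, buffers, next) that appear in the diff below:

    /* translate mbuf pointers to buffer indices; the vlib_buffer_t sits
       immediately after the rte_mbuf, hence the constant offset */
    vlib_get_buffer_indices_with_offset (vm, (void **) ptd->mbufs,
                                         ptd->buffers, n_rx_packets,
                                         sizeof (struct rte_mbuf));

    /* hand the whole vector over; per-packet next indices in ptd->next */
    vlib_buffer_enqueue_to_next (vm, node, ptd->buffers, ptd->next,
                                 n_rx_packets);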
diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c
index c2d66ae..a1acc1f 100644
--- a/src/plugins/dpdk/device/node.c
+++ b/src/plugins/dpdk/device/node.c
@@ -213,40 +213,6 @@ poll_rate_limit (dpdk_main_t * dm)
       <code>xd->per_interface_next_index</code>
 */
 
-static_always_inline void
-dpdk_mbuf_to_buffer_index_x4 (vlib_main_t * vm, struct rte_mbuf **mb,
-                             u32 * buffers)
-{
-#ifdef CLIB_HAVE_VEC256
-  vlib_buffer_main_t *bm = &buffer_main;
-  u64x4 v = *(u64x4 *) mb;
-  u32x8 v2, mask = { 0, 2, 4, 6, 1, 3, 5, 7 };
-
-  /* load 4 pointers into 256-bit register */
-  v = u64x4_load_unaligned (mb);
-
-  /* vlib_buffer_t is straight after rte_mbuf so advance all 4
-     pointers for size of rte_mbuf */
-  v += u64x4_splat (sizeof (struct rte_mbuf));
-
-  /* calculate 4 buffer indices in parallel */
-  v = (v - u64x4_splat (bm->buffer_mem_start)) >> CLIB_LOG2_CACHE_LINE_BYTES;
-
-  /* permute 256-bit register so lower u32s of each buffer index are
-   * placed into lower 128-bits */
-  v2 = u32x8_permute ((u32x8) v, mask);
-
-  /* extract lower 128-bits and save them to the array of buffer indices */
-  u32x4_store_unaligned (u32x8_extract_lo (v2), buffers);
-#else
-  /* equivalent non-vector implementation */
-  buffers[0] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[0]));
-  buffers[1] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[1]));
-  buffers[2] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[2]));
-  buffers[3] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[3]));
-#endif
-}
-
 static_always_inline u8
 dpdk_ol_flags_extract (struct rte_mbuf **mb, u8 * flags, int count)
 {
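Note: the deleted helper above is superseded by
vlib_get_buffer_indices_with_offset. For reference, the index math it
vectorized reduces to the scalar form below; a sketch built from the same
fields the deleted code uses (the _x1 name is invented here):

    /* buffer index = cache-line offset of the vlib_buffer_t (placed
       right after the rte_mbuf) from the buffer memory base */
    static_always_inline u32
    dpdk_mbuf_to_buffer_index_x1 (struct rte_mbuf *mb)
    {
      vlib_buffer_main_t *bm = &buffer_main;
      uword b = (uword) mb + sizeof (struct rte_mbuf);
      return (b - bm->buffer_mem_start) >> CLIB_LOG2_CACHE_LINE_BYTES;
    }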
@@ -308,19 +274,19 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd,
       vnet_buffer (b[0])->l2_hdr_offset = off;
       b[0]->current_data = off;
 
-      off = mb[0]->data_off;
+      off = mb[1]->data_off;
       next[1] = off + STRUCT_OFFSET_OF (ethernet_header_t, type);
       off -= RTE_PKTMBUF_HEADROOM;
       vnet_buffer (b[1])->l2_hdr_offset = off;
       b[1]->current_data = off;
 
-      off = mb[0]->data_off;
+      off = mb[2]->data_off;
       next[2] = off + STRUCT_OFFSET_OF (ethernet_header_t, type);
       off -= RTE_PKTMBUF_HEADROOM;
       vnet_buffer (b[2])->l2_hdr_offset = off;
       b[2]->current_data = off;
 
-      off = mb[0]->data_off;
+      off = mb[3]->data_off;
       next[3] = off + STRUCT_OFFSET_OF (ethernet_header_t, type);
       off -= RTE_PKTMBUF_HEADROOM;
       vnet_buffer (b[3])->l2_hdr_offset = off;
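Note: the hunk above fixes a copy/paste bug in the quad-unrolled rx burst
loop: lanes 1-3 read mb[0]->data_off instead of their own mbuf's data_off,
so any packet whose headroom differed from packet 0's got a wrong
l2_hdr_offset and current_data. The equivalent loop form below makes the
per-lane dependency explicit (a sketch; the tree unrolls by hand):

    int i;
    for (i = 0; i < 4; i++)
      {
        i16 off = mb[i]->data_off;	/* each lane uses its own mbuf */
        next[i] = off + STRUCT_OFFSET_OF (ethernet_header_t, type);
        off -= RTE_PKTMBUF_HEADROOM;
        vnet_buffer (b[i])->l2_hdr_offset = off;
        b[i]->current_data = off;
      }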
@@ -449,6 +415,40 @@ dpdk_set_next_from_etype (vlib_main_t * vm, vlib_node_runtime_t * node,
     }
 }
 
+static_always_inline void
+dpdk_process_flow_offload (dpdk_device_t * xd, dpdk_per_thread_data_t * ptd,
+                          uword n_rx_packets)
+{
+  uword n;
+  dpdk_flow_lookup_entry_t *fle;
+  vlib_buffer_t *b0;
+
+  /* TODO prefetch and quad-loop */
+  for (n = 0; n < n_rx_packets; n++)
+    {
+      if ((ptd->flags[n] & (1 << DPDK_RX_F_FDIR)) == 0)
+       continue;
+
+      fle = vec_elt_at_index (xd->flow_lookup_entries,
+                             ptd->mbufs[n]->hash.fdir.hi);
+
+      if (fle->next_index != (u16) ~ 0)
+       ptd->next[n] = fle->next_index;
+
+      if (fle->flow_id != ~0)
+       {
+         b0 = vlib_buffer_from_rte_mbuf (ptd->mbufs[n]);
+         b0->flow_id = fle->flow_id;
+       }
+
+      if (fle->buffer_advance != ~0)
+       {
+         b0 = vlib_buffer_from_rte_mbuf (ptd->mbufs[n]);
+         vlib_buffer_advance (b0, fle->buffer_advance);
+       }
+    }
+}
+
 static_always_inline u32
 dpdk_device_input (vlib_main_t * vm, dpdk_main_t * dm, dpdk_device_t * xd,
                   vlib_node_runtime_t * node, u32 thread_index, u16 queue_id)
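Note: dpdk_process_flow_offload consumes the flow-director mark left by the
NIC (mbufs[n]->hash.fdir.hi indexes xd->flow_lookup_entries) and applies the
per-flow actions. The entry's shape below is inferred from this usage; a
sketch, not the declaration from the plugin headers:

    typedef struct
    {
      u32 flow_id;		/* ~0 => leave b->flow_id untouched */
      u16 next_index;		/* (u16) ~0 => keep the precomputed next */
      i16 buffer_advance;	/* ~0 => do not call vlib_buffer_advance */
    } dpdk_flow_lookup_entry_t;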
@@ -518,12 +518,18 @@ dpdk_device_input (vlib_main_t * vm, dpdk_main_t * dm, dpdk_device_t * xd,
       for (n = 0; n < n_rx_packets; n++)
        ptd->next[n] = next_index;
 
-      bt->feature_arc_index = 0;
+      vnet_buffer (bt)->feature_arc_index = 0;
       bt->current_config_index = 0;
     }
   else
     dpdk_set_next_from_etype (vm, node, ptd, n_rx_packets);
 
+  /* flow offload - process if rx flow offload enabled and at least one packet
+     is marked */
+  if (PREDICT_FALSE ((xd->flags & DPDK_DEVICE_FLAG_RX_FLOW_OFFLOAD) &&
+                    (or_flags & (1 << DPDK_RX_F_FDIR))))
+    dpdk_process_flow_offload (xd, ptd, n_rx_packets);
+
   /* is at least one packet marked as ip4 checksum bad? */
   if (PREDICT_FALSE (or_flags & (1 << DPDK_RX_F_CKSUM_BAD)))
     for (n = 0; n < n_rx_packets; n++)
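Note: the bt fix above is needed because feature_arc_index lives in the vnet
opaque area of the buffer template rather than in vlib_buffer_t itself, so
it must be reached through the vnet_buffer accessor, which is roughly:

    /* approximate definition from vnet/buffer.h */
    #define vnet_buffer(b) ((vnet_buffer_opaque_t *) (b)->opaque)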
@@ -539,88 +545,12 @@ dpdk_device_input (vlib_main_t * vm, dpdk_main_t * dm, dpdk_device_t * xd,
       }
 
   /* enqueue buffers to the next node */
-  n_left = n_rx_packets;
-  next = ptd->next;
-  buffers = ptd->buffers;
-  mb = ptd->mbufs;
-  while (n_left)
-    {
-      u32 n_left_to_next;
-      u32 *to_next;
-      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
-#ifdef CLIB_HAVE_VEC256
-      while (n_left >= 16 && n_left_to_next >= 16)
-       {
-         u16x16 next16 = u16x16_load_unaligned (next);
-         if (u16x16_is_all_equal (next16, next_index))
-           {
-             dpdk_mbuf_to_buffer_index_x4 (vm, mb, buffers);
-             dpdk_mbuf_to_buffer_index_x4 (vm, mb + 4, buffers + 4);
-             dpdk_mbuf_to_buffer_index_x4 (vm, mb + 8, buffers + 8);
-             dpdk_mbuf_to_buffer_index_x4 (vm, mb + 12, buffers + 12);
-             clib_memcpy (to_next, buffers, 16 * sizeof (u32));
-             to_next += 16;
-             n_left_to_next -= 16;
-             buffers += 16;
-             n_left -= 16;
-             next += 16;
-             mb += 16;
-           }
-         else
-           {
-             dpdk_mbuf_to_buffer_index_x4 (vm, mb, buffers);
-             clib_memcpy (to_next, buffers, 4 * sizeof (u32));
-             to_next += 4;
-             n_left_to_next -= 4;
-
-             vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next,
-                                              n_left_to_next, buffers[0],
-                                              buffers[1], buffers[2],
-                                              buffers[3], next[0], next[1],
-                                              next[2], next[3]);
-             /* next */
-             buffers += 4;
-             n_left -= 4;
-             next += 4;
-             mb += 4;
-           }
-       }
-#endif
-      while (n_left >= 4 && n_left_to_next >= 4)
-       {
-         dpdk_mbuf_to_buffer_index_x4 (vm, mb, buffers);
-         clib_memcpy (to_next, buffers, 4 * sizeof (u32));
-         to_next += 4;
-         n_left_to_next -= 4;
-
-         vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next,
-                                          n_left_to_next, buffers[0],
-                                          buffers[1], buffers[2], buffers[3],
-                                          next[0], next[1], next[2],
-                                          next[3]);
-         /* next */
-         buffers += 4;
-         n_left -= 4;
-         next += 4;
-         mb += 4;
-       }
-      while (n_left && n_left_to_next)
-       {
-         to_next[0] = buffers[0] =
-           vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[0]));
-         to_next += 1;
-         n_left_to_next -= 1;
-         vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
-                                          n_left_to_next, buffers[0],
-                                          next[0]);
-         /* next */
-         buffers += 1;
-         n_left -= 1;
-         next += 1;
-         mb += 1;
-       }
-      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
-    }
+  vlib_get_buffer_indices_with_offset (vm, (void **) ptd->mbufs, ptd->buffers,
+                                      n_rx_packets,
+                                      sizeof (struct rte_mbuf));
+
+  vlib_buffer_enqueue_to_next (vm, node, ptd->buffers, ptd->next,
+                              n_rx_packets);
 
   /* packet trace if enabled */
   if ((n_trace = vlib_get_trace_count (vm, node)))
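Note: for reference, the scalar skeleton of what vlib_buffer_enqueue_to_next
has to do, distilled from the loop deleted above; the real inline (in vlib)
also keeps a vectorized fast path for the common "all nexts equal" case that
the dpdk node used to open-code. The function name here is invented:

    static_always_inline void
    enqueue_to_next_sketch (vlib_main_t * vm, vlib_node_runtime_t * node,
                            u32 * buffers, u16 * nexts, uword count)
    {
      while (count)
        {
          u32 *to_next, n_left_to_next;
          u16 next_index = nexts[0];

          vlib_get_next_frame (vm, node, next_index, to_next,
                               n_left_to_next);
          while (count && n_left_to_next)
            {
              to_next[0] = buffers[0];
              to_next += 1;
              n_left_to_next -= 1;
              /* moves the buffer to the proper frame when
                 nexts[0] != next_index */
              vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                               n_left_to_next, buffers[0],
                                               nexts[0]);
              buffers += 1;
              nexts += 1;
              count -= 1;
            }
          vlib_put_next_frame (vm, node, next_index, n_left_to_next);
        }
    }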