Add vlib_buffer_enqueue_to_next inline function 35/12635/3
author    Damjan Marion <damarion@cisco.com>
          Thu, 17 May 2018 19:12:13 +0000 (21:12 +0200)
committer Dave Barach <openvpp@barachs.net>
          Fri, 18 May 2018 12:15:52 +0000 (12:15 +0000)
Change-Id: I1042c0fe179b57a00ce99c8d62cb1bdbe24d9184
Signed-off-by: Damjan Marion <damarion@cisco.com>
src/plugins/dpdk/device/node.c
src/vlib/buffer_node.h
src/vppinfra/vector_avx2.h
src/vppinfra/vector_avx512.h
src/vppinfra/vector_sse42.h

diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c
index 3311ac4..a1acc1f 100644
@@ -548,81 +548,9 @@ dpdk_device_input (vlib_main_t * vm, dpdk_main_t * dm, dpdk_device_t * xd,
   vlib_get_buffer_indices_with_offset (vm, (void **) ptd->mbufs, ptd->buffers,
                                       n_rx_packets,
                                       sizeof (struct rte_mbuf));
-  n_left = n_rx_packets;
-  next = ptd->next;
-  buffers = ptd->buffers;
-  mb = ptd->mbufs;
-  while (n_left)
-    {
-      u32 n_left_to_next;
-      u32 *to_next;
-      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
-#ifdef CLIB_HAVE_VEC256
-      while (n_left >= 16 && n_left_to_next >= 16)
-       {
-         u16x16 next16 = u16x16_load_unaligned (next);
-         if (u16x16_is_all_equal (next16, next_index))
-           {
-             clib_memcpy (to_next, buffers, 16 * sizeof (u32));
-             to_next += 16;
-             n_left_to_next -= 16;
-             buffers += 16;
-             n_left -= 16;
-             next += 16;
-             mb += 16;
-           }
-         else
-           {
-             clib_memcpy (to_next, buffers, 4 * sizeof (u32));
-             to_next += 4;
-             n_left_to_next -= 4;
-
-             vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next,
-                                              n_left_to_next, buffers[0],
-                                              buffers[1], buffers[2],
-                                              buffers[3], next[0], next[1],
-                                              next[2], next[3]);
-             /* next */
-             buffers += 4;
-             n_left -= 4;
-             next += 4;
-             mb += 4;
-           }
-       }
-#endif
-      while (n_left >= 4 && n_left_to_next >= 4)
-       {
-         clib_memcpy (to_next, buffers, 4 * sizeof (u32));
-         to_next += 4;
-         n_left_to_next -= 4;
-
-         vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next,
-                                          n_left_to_next, buffers[0],
-                                          buffers[1], buffers[2], buffers[3],
-                                          next[0], next[1], next[2],
-                                          next[3]);
-         /* next */
-         buffers += 4;
-         n_left -= 4;
-         next += 4;
-         mb += 4;
-       }
-      while (n_left && n_left_to_next)
-       {
-         clib_memcpy (to_next, buffers, 1 * sizeof (u32));
-         to_next += 1;
-         n_left_to_next -= 1;
-         vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
-                                          n_left_to_next, buffers[0],
-                                          next[0]);
-         /* next */
-         buffers += 1;
-         n_left -= 1;
-         next += 1;
-         mb += 1;
-       }
-      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
-    }
+
+  vlib_buffer_enqueue_to_next (vm, node, ptd->buffers, ptd->next,
+                              n_rx_packets);
 
   /* packet trace if enabled */
   if ((n_trace = vlib_get_trace_count (vm, node)))
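
Note: the removed block above open-coded frame management, an AVX2 fast path and the
vlib_validate_buffer_enqueue_x4/_x1 macros directly in the driver; all of that now sits
behind a single call. A minimal caller sketch (hypothetical function and array names,
assuming only the helper added below in buffer_node.h):

    /* Hypothetical caller -- my_input_fn, bufs and nexts are illustrative,
     * not part of this patch. */
    static uword
    my_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
                 u32 * bufs, u16 * nexts, u32 n_rx)
    {
      /* bufs[i] is a buffer index, nexts[i] the next node for that buffer;
       * the helper splits them into per-next-index frames internally. */
      vlib_buffer_enqueue_to_next (vm, node, bufs, nexts, n_rx);
      return n_rx;
    }
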
diff --git a/src/vlib/buffer_node.h b/src/vlib/buffer_node.h
index f9e8b3f..1c4f4e7 100644
@@ -328,6 +328,121 @@ generic_buffer_node_inline (vlib_main_t * vm,
   return frame->n_vectors;
 }
 
+static_always_inline void
+vlib_buffer_enqueue_to_next (vlib_main_t * vm, vlib_node_runtime_t * node,
+                            u32 * buffers, u16 * nexts, uword count)
+{
+  u32 *to_next, n_left_to_next, max;
+  u16 next_index;
+
+  next_index = nexts[0];
+  vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+  max = clib_min (n_left_to_next, count);
+
+  while (count)
+    {
+      u32 n_enqueued;
+      if ((nexts[0] != next_index) || n_left_to_next == 0)
+       {
+         vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+         next_index = nexts[0];
+         vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+         max = clib_min (n_left_to_next, count);
+       }
+#if defined(CLIB_HAVE_VEC512)
+      u16x32 next32 = u16x32_load_unaligned (nexts);
+      next32 = (next32 == u16x32_splat (next32[0]));
+      u64 bitmap = u16x32_msb_mask (next32);
+      n_enqueued = count_trailing_zeros (~bitmap);
+#elif defined(CLIB_HAVE_VEC256)
+      u16x16 next16 = u16x16_load_unaligned (nexts);
+      next16 = (next16 == u16x16_splat (next16[0]));
+      u64 bitmap = u8x32_msb_mask ((u8x32) next16);
+      n_enqueued = count_trailing_zeros (~bitmap) / 2;
+#elif defined(CLIB_HAVE_VEC128)
+      u16x8 next8 = u16x8_load_unaligned (nexts);
+      next8 = (next8 == u16x8_splat (next8[0]));
+      u64 bitmap = u8x16_msb_mask ((u8x16) next8);
+      n_enqueued = count_trailing_zeros (~bitmap) / 2;
+#else
+      u16 x = 0;
+      x |= next_index ^ nexts[1];
+      x |= next_index ^ nexts[2];
+      x |= next_index ^ nexts[3];
+      n_enqueued = (x == 0) ? 4 : 1;
+#endif
+
+      if (PREDICT_FALSE (n_enqueued > max))
+       n_enqueued = max;
+
+#ifdef CLIB_HAVE_VEC512
+      if (n_enqueued >= 32)
+       {
+         clib_memcpy (to_next, buffers, 32 * sizeof (u32));
+         nexts += 32;
+         to_next += 32;
+         buffers += 32;
+         n_left_to_next -= 32;
+         count -= 32;
+         max -= 32;
+         continue;
+       }
+#endif
+
+#ifdef CLIB_HAVE_VEC256
+      if (n_enqueued >= 16)
+       {
+         clib_memcpy (to_next, buffers, 16 * sizeof (u32));
+         nexts += 16;
+         to_next += 16;
+         buffers += 16;
+         n_left_to_next -= 16;
+         count -= 16;
+         max -= 16;
+         continue;
+       }
+#endif
+
+#ifdef CLIB_HAVE_VEC128
+      if (n_enqueued >= 8)
+       {
+         clib_memcpy (to_next, buffers, 8 * sizeof (u32));
+         nexts += 8;
+         to_next += 8;
+         buffers += 8;
+         n_left_to_next -= 8;
+         count -= 8;
+         max -= 8;
+         continue;
+       }
+#endif
+
+      if (n_enqueued >= 4)
+       {
+         clib_memcpy (to_next, buffers, 4 * sizeof (u32));
+         nexts += 4;
+         to_next += 4;
+         buffers += 4;
+         n_left_to_next -= 4;
+         count -= 4;
+         max -= 4;
+         continue;
+       }
+
+      /* copy */
+      to_next[0] = buffers[0];
+
+      /* next */
+      nexts += 1;
+      to_next += 1;
+      buffers += 1;
+      n_left_to_next -= 1;
+      count -= 1;
+      max -= 1;
+    }
+  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+}
+
 #endif /* included_vlib_buffer_node_h */
 
 /*
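
How the vector paths above work: the compare against the first element yields all-ones in
every lane whose next index matches nexts[0], the movemask wrapper packs each lane's MSB
into a bitmap, and count_trailing_zeros of the inverted bitmap gives the length of the
leading run of equal next indices (the 128- and 256-bit paths produce a byte-granular mask,
hence the divide by 2 for 16-bit lanes). A scalar sketch of the same computation, for
illustration only and not part of the patch:

    /* Illustrative scalar equivalent of the vector run-length detection
     * in vlib_buffer_enqueue_to_next (not part of the patch). */
    static inline uword
    leading_run_of_equal_nexts (u16 * nexts, uword count)
    {
      uword i = 1;
      while (i < count && nexts[i] == nexts[0])
        i++;
      return i;   /* packets that can be enqueued to one frame in bulk */
    }
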
diff --git a/src/vppinfra/vector_avx2.h b/src/vppinfra/vector_avx2.h
index f651392..3f0b397 100644
@@ -81,6 +81,12 @@ u32x8_insert_hi (u32x8 v1, u32x4 v2)
   return (u32x8) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 1);
 }
 
+static_always_inline u32
+u8x32_msb_mask (u8x32 v)
+{
+  return _mm256_movemask_epi8 ((__m256i) v);
+}
+
 /* _extend_to_ */
 /* *INDENT-OFF* */
 #define _(f,t,i) \
diff --git a/src/vppinfra/vector_avx512.h b/src/vppinfra/vector_avx512.h
index ac4c09b..c1b7c42 100644
@@ -27,6 +27,7 @@
   _(f,32,8,ps) _(f,64,4,pd)
 
 /* splat, load_unaligned, store_unaligned */
+/* *INDENT-OFF* */
 #define _(t, s, c, i) \
 static_always_inline t##s##x##c                                                \
 t##s##x##c##_splat (t##s x)                                            \
@@ -43,7 +44,15 @@ t##s##x##c##_store_unaligned (t##s##x##c v, void *p)                 \
 
 foreach_avx512_vec512i foreach_avx512_vec512u
 #undef _
-#endif                         /* included_vector_avx512_h */
+/* *INDENT-ON* */
+
+static_always_inline u32
+u16x32_msb_mask (u16x32 v)
+{
+  return (u32) _mm512_movepi16_mask ((__m512i) v);
+}
+
+#endif /* included_vector_avx512_h */
 /*
  * fd.io coding-style-patch-verification: ON
  *
diff --git a/src/vppinfra/vector_sse42.h b/src/vppinfra/vector_sse42.h
index cf7f158..50aa662 100644
@@ -574,6 +574,12 @@ i16x8_min_scalar (i16x8 x)
   return _mm_extract_epi16 ((__m128i) x, 0);
 }
 
+static_always_inline u16
+u8x16_msb_mask (u8x16 v)
+{
+  return _mm_movemask_epi8 ((__m128i) v);
+}
+
 #undef _signed_binop
 
 #endif /* included_vector_sse2_h */
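
For context, the three new *_msb_mask helpers (SSE, AVX2 and AVX-512 variants) simply
expose the corresponding move-mask instructions so the enqueue helper can turn a lane-wise
compare into a bitmap. A small illustrative use of the SSE variant, assuming the u16x8
load/splat helpers already present in these headers (not part of the patch):

    /* Illustrative only: each u16 lane that matches nexts[0] contributes
     * two set bits (its two byte MSBs) to the returned mask. */
    static_always_inline int
    all_eight_nexts_equal (u16 * nexts)
    {
      u16x8 v = u16x8_load_unaligned (nexts);
      v = (v == u16x8_splat (nexts[0]));
      return u8x16_msb_mask ((u8x16) v) == 0xffff;
    }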