- /* extract lower 128-bits and save them to the array of buffer indices */
- u32x4_store_unaligned (u32x8_extract_lo (v2), buffers);
+ while (n_left >= 8)
+ {
+#ifdef CLIB_HAVE_VEC256
+ /* load 4 pointers into 256-bit register */
+ u64x4 v0 = u64x4_load_unaligned (mb);
+ u64x4 v1 = u64x4_load_unaligned (mb + 4);
+ u32x8 v2, v3;
+
+ /* calculate 4 buffer indices in parallel per register;
+ vlib_buffer_t is straight after rte_mbuf, so advance all 4
+ pointers by the size of rte_mbuf */
+ v0 -= off4;
+ v1 -= off4;
+
+ v0 >>= CLIB_LOG2_CACHE_LINE_BYTES;
+ v1 >>= CLIB_LOG2_CACHE_LINE_BYTES;
+
+ /* permute 256-bit register so lower u32s of each buffer index are
+ * placed into lower 128-bits */
+ v2 = u32x8_permute ((u32x8) v0, mask);
+ v3 = u32x8_permute ((u32x8) v1, mask);
+
+ /* extract lower 128-bits and save them to the array of buffer indices */
+ u32x4_store_unaligned (u32x8_extract_lo (v2), bi);
+ u32x4_store_unaligned (u32x8_extract_lo (v3), bi + 4);