-#ifdef CLIB_HAVE_VEC256
- u64x4 off = u64x4_splat (buffer_mem_start + offset);
- /* if count is not const, the compiler will not unroll the while
- loop, so we maintain a two-in-parallel variant */
- while (count >= 8)
- {
- u64x4 b0 = u64x4_from_u32x4 (u32x4_load_unaligned (bi));
- u64x4 b1 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 4));
- /* shift and add to get vlib_buffer_t pointer */
- u64x4_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b);
- u64x4_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 4);
- b += 8;
- bi += 8;
- count -= 8;
- }
-#endif
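
For reference, each 256-bit lane above performs the same address arithmetic as the scalar helper used in the fallback further down; a rough sketch of that conversion (this mirrors the vlib helper as I read it, so treat it as illustrative rather than authoritative):

    /* Buffer indices address cache-line-sized slots relative to
       buffer_mem_start, so the pointer is recovered by a shift and add:
       base + (index << CLIB_LOG2_CACHE_LINE_BYTES). */
    static_always_inline vlib_buffer_t *
    vlib_buffer_ptr_from_index (uword buffer_mem_start, u32 buffer_index,
                                uword offset)
    {
      offset += ((uword) buffer_index) << CLIB_LOG2_CACHE_LINE_BYTES;
      return uword_to_pointer (buffer_mem_start + offset, vlib_buffer_t *);
    }
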
- while (count >= 4)
- {
-#ifdef CLIB_HAVE_VEC256
- u64x4 b0 = u64x4_from_u32x4 (u32x4_load_unaligned (bi));
- /* shift and add to get vlib_buffer_t pointer */
- u64x4_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b);
-#elif defined (CLIB_HAVE_VEC128)
- u64x2 off = u64x2_splat (buffer_mem_start + offset);
- u32x4 bi4 = u32x4_load_unaligned (bi);
- u64x2 b0 = u64x2_from_u32x4 ((u32x4) bi4);
-#if defined (__aarch64__)
- u64x2 b1 = u64x2_from_u32x4_high ((u32x4) bi4);
-#else
- bi4 = u32x4_shuffle (bi4, 2, 3, 0, 1);
- u64x2 b1 = u64x2_from_u32x4 ((u32x4) bi4);
-#endif
- u64x2_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b);
- u64x2_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 2);
-#else
- b[0] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[0], offset);
- b[1] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[1], offset);
- b[2] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[2], offset);
- b[3] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[3], offset);
-#endif
- b += 4;
- bi += 4;
- count -= 4;
- }
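
The 128-bit path widens four u32 indices into two u64x2 vectors: aarch64 provides a direct high-half widen, while the SSE fallback swaps lane pairs with u32x4_shuffle (2, 3, 0, 1) and reuses the low-half widen. In scalar terms the two helpers amount to this (a sketch of the lane semantics only, not the vppinfra intrinsics):

    #include <stdint.h>

    /* Widen four 32-bit indices into two pairs of 64-bit values. */
    static void
    widen_u32x4_to_u64x2_pair (const uint32_t bi4[4], uint64_t lo[2],
                               uint64_t hi[2])
    {
      lo[0] = bi4[0]; /* u64x2_from_u32x4: lanes 0 and 1 */
      lo[1] = bi4[1];
      hi[0] = bi4[2]; /* u64x2_from_u32x4_high: lanes 2 and 3 */
      hi[1] = bi4[3];
    }
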
- while (count)
+ void *base = (void *) (buffer_mem_start + offset);
+ int objsize = __builtin_object_size (b, 0);
+ const int sh = CLIB_LOG2_CACHE_LINE_BYTES;
+
+ if (COMPILE_TIME_CONST (count) == 0 && objsize >= 64 * sizeof (b[0]) &&
+ (objsize & ((8 * sizeof (b[0])) - 1)) == 0)
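
The replacement drops the hand-written unrolling in favor of two compile-time facts: COMPILE_TIME_CONST (count), which as I understand it wraps __builtin_constant_p, and __builtin_object_size (b, 0), which yields the destination array's size whenever the compiler can see its allocation. When count is not a constant but b points at a fixed array of at least 64 pointers whose size is a multiple of 8 pointers, count can safely be rounded up and processed in fixed-width chunks with no scalar tail. A minimal standalone illustration of the builtin (hypothetical example, not part of the patch):

    #include <stdio.h>

    /* Through an opaque pointer the provenance is lost, so mode 0
       reports "unknown" as (size_t) -1. */
    __attribute__ ((noinline)) static size_t
    seen_size (void **p)
    {
      return __builtin_object_size (p, 0);
    }

    int
    main (void)
    {
      void *fixed[64];
      /* Known at compile time: 64 * sizeof (void *). */
      printf ("%zu\n", __builtin_object_size (fixed, 0));
      printf ("%zu\n", seen_size (fixed)); /* (size_t) -1 */
      return 0;
    }
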