-#if defined (CLIB_HAVE_VEC512)
- u8x64 __attribute__ ((aligned (1))) r0 = *(((u8x64 *) s) + 0);
-
- *(((u8x64 *) d0) + 0) = r0;
- *(((u8x64 *) d1) + 0) = r0;
- *(((u8x64 *) d2) + 0) = r0;
- *(((u8x64 *) d3) + 0) = r0;
-#elif defined (CLIB_HAVE_VEC256)
- u8x32 __attribute__ ((aligned (1))) r0 = *(((u8x32 *) s) + 0);
- u8x32 __attribute__ ((aligned (1))) r1 = *(((u8x32 *) s) + 1);
-
- *(((u8x32 *) d0) + 0) = r0;
- *(((u8x32 *) d0) + 1) = r1;
-
- *(((u8x32 *) d1) + 0) = r0;
- *(((u8x32 *) d1) + 1) = r1;
-
- *(((u8x32 *) d2) + 0) = r0;
- *(((u8x32 *) d2) + 1) = r1;
-
- *(((u8x32 *) d3) + 0) = r0;
- *(((u8x32 *) d3) + 1) = r1;
-#elif defined (CLIB_HAVE_VEC128)
- u8x16 __attribute__ ((aligned (1))) r0 = *(((u8x16 *) s) + 0);
- u8x16 __attribute__ ((aligned (1))) r1 = *(((u8x16 *) s) + 1);
- u8x16 __attribute__ ((aligned (1))) r2 = *(((u8x16 *) s) + 2);
- u8x16 __attribute__ ((aligned (1))) r3 = *(((u8x16 *) s) + 3);
-
- *(((u8x16 *) d0) + 0) = r0;
- *(((u8x16 *) d0) + 1) = r1;
- *(((u8x16 *) d0) + 2) = r2;
- *(((u8x16 *) d0) + 3) = r3;
-
- *(((u8x16 *) d1) + 0) = r0;
- *(((u8x16 *) d1) + 1) = r1;
- *(((u8x16 *) d1) + 2) = r2;
- *(((u8x16 *) d1) + 3) = r3;
-
- *(((u8x16 *) d2) + 0) = r0;
- *(((u8x16 *) d2) + 1) = r1;
- *(((u8x16 *) d2) + 2) = r2;
- *(((u8x16 *) d2) + 3) = r3;
-
- *(((u8x16 *) d3) + 0) = r0;
- *(((u8x16 *) d3) + 1) = r1;
- *(((u8x16 *) d3) + 2) = r2;
- *(((u8x16 *) d3) + 3) = r3;
+#if defined (__AVX512F__)
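+ /* 512-bit path: one unaligned 64-byte load from s, replayed to all four destinations. */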
+ __m512i r0 = _mm512_loadu_si512 (s);
+
+ _mm512_storeu_si512 (d0, r0);
+ _mm512_storeu_si512 (d1, r0);
+ _mm512_storeu_si512 (d2, r0);
+ _mm512_storeu_si512 (d3, r0);
+
+#elif defined (__AVX2__)
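+ /* 256-bit path: two unaligned 32-byte loads from s, stored to each destination. */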
+ __m256i r0 = _mm256_loadu_si256 ((__m256i *) (s + 0 * 32));
+ __m256i r1 = _mm256_loadu_si256 ((__m256i *) (s + 1 * 32));
+
+ _mm256_storeu_si256 ((__m256i *) (d0 + 0 * 32), r0);
+ _mm256_storeu_si256 ((__m256i *) (d0 + 1 * 32), r1);
+
+ _mm256_storeu_si256 ((__m256i *) (d1 + 0 * 32), r0);
+ _mm256_storeu_si256 ((__m256i *) (d1 + 1 * 32), r1);
+
+ _mm256_storeu_si256 ((__m256i *) (d2 + 0 * 32), r0);
+ _mm256_storeu_si256 ((__m256i *) (d2 + 1 * 32), r1);
+
+ _mm256_storeu_si256 ((__m256i *) (d3 + 0 * 32), r0);
+ _mm256_storeu_si256 ((__m256i *) (d3 + 1 * 32), r1);
+
+#elif defined (__SSSE3__)
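+ /* 128-bit path: four unaligned 16-byte loads from s, stored to each destination. */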
+ __m128i r0 = _mm_loadu_si128 ((__m128i *) (s + 0 * 16));
+ __m128i r1 = _mm_loadu_si128 ((__m128i *) (s + 1 * 16));
+ __m128i r2 = _mm_loadu_si128 ((__m128i *) (s + 2 * 16));
+ __m128i r3 = _mm_loadu_si128 ((__m128i *) (s + 3 * 16));
+
+ _mm_storeu_si128 ((__m128i *) (d0 + 0 * 16), r0);
+ _mm_storeu_si128 ((__m128i *) (d0 + 1 * 16), r1);
+ _mm_storeu_si128 ((__m128i *) (d0 + 2 * 16), r2);
+ _mm_storeu_si128 ((__m128i *) (d0 + 3 * 16), r3);
+
+ _mm_storeu_si128 ((__m128i *) (d1 + 0 * 16), r0);
+ _mm_storeu_si128 ((__m128i *) (d1 + 1 * 16), r1);
+ _mm_storeu_si128 ((__m128i *) (d1 + 2 * 16), r2);
+ _mm_storeu_si128 ((__m128i *) (d1 + 3 * 16), r3);
+
+ _mm_storeu_si128 ((__m128i *) (d2 + 0 * 16), r0);
+ _mm_storeu_si128 ((__m128i *) (d2 + 1 * 16), r1);
+ _mm_storeu_si128 ((__m128i *) (d2 + 2 * 16), r2);
+ _mm_storeu_si128 ((__m128i *) (d2 + 3 * 16), r3);
+
+ _mm_storeu_si128 ((__m128i *) (d3 + 0 * 16), r0);
+ _mm_storeu_si128 ((__m128i *) (d3 + 1 * 16), r1);
+ _mm_storeu_si128 ((__m128i *) (d3 + 2 * 16), r2);
+ _mm_storeu_si128 ((__m128i *) (d3 + 3 * 16), r3);
+
+#else
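+ /* No suitable SIMD extension available; fall back to clib_memcpy_fast. */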
+ clib_memcpy_fast (d0, s, 64);
+ clib_memcpy_fast (d1, s, 64);
+ clib_memcpy_fast (d2, s, 64);
+ clib_memcpy_fast (d3, s, 64);
+#endif
+}
+
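+/* Fill 'count' 64-bit words at 'p' with 'val'.  The widest available vector
+   store is used first; any remaining tail is written with scalar stores. */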
+static_always_inline void
+clib_memset_u64 (void *p, u64 val, uword count)
+{
+ u64 *ptr = p;
+#if defined(CLIB_HAVE_VEC512)
+ u64x8 v512 = u64x8_splat (val);
+ while (count >= 8)
+ {
+ u64x8_store_unaligned (v512, ptr);
+ ptr += 8;
+ count -= 8;
+ }
+ if (count == 0)
+ return;
+#endif
+#if defined(CLIB_HAVE_VEC256)
+ u64x4 v256 = u64x4_splat (val);
+ while (count >= 4)
+ {
+ u64x4_store_unaligned (v256, ptr);
+ ptr += 4;
+ count -= 4;
+ }
+ if (count == 0)
+ return;
+#else
+ while (count >= 4)
+ {
+ ptr[0] = ptr[1] = ptr[2] = ptr[3] = val;
+ ptr += 4;
+ count -= 4;
+ }
+#endif
+ while (count--)
+ ptr++[0] = val;
+}
+
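+/* Same strategy as clib_memset_u64, for 32-bit elements. */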
+static_always_inline void
+clib_memset_u32 (void *p, u32 val, uword count)
+{
+ u32 *ptr = p;
+#if defined(CLIB_HAVE_VEC512)
+ u32x16 v512 = u32x16_splat (val);
+ while (count >= 16)
+ {
+ u32x16_store_unaligned (v512, ptr);
+ ptr += 16;
+ count -= 16;
+ }
+ if (count == 0)
+ return;
+#endif
+#if defined(CLIB_HAVE_VEC256)
+ u32x8 v256 = u32x8_splat (val);
+ while (count >= 8)
+ {
+ u32x8_store_unaligned (v256, ptr);
+ ptr += 8;
+ count -= 8;
+ }
+ if (count == 0)
+ return;
+#endif
+#if defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE)
+ u32x4 v128 = u32x4_splat (val);
+ while (count >= 4)
+ {
+ u32x4_store_unaligned (v128, ptr);
+ ptr += 4;
+ count -= 4;
+ }
+#else
+ while (count >= 4)
+ {
+ ptr[0] = ptr[1] = ptr[2] = ptr[3] = val;
+ ptr += 4;
+ count -= 4;
+ }
+#endif
+ while (count--)
+ ptr++[0] = val;
+}
+
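+/* Same strategy as clib_memset_u64, for 16-bit elements. */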
+static_always_inline void
+clib_memset_u16 (void *p, u16 val, uword count)
+{
+ u16 *ptr = p;
+#if defined(CLIB_HAVE_VEC512)
+ u16x32 v512 = u16x32_splat (val);
+ while (count >= 32)
+ {
+ u16x32_store_unaligned (v512, ptr);
+ ptr += 32;
+ count -= 32;
+ }
+ if (count == 0)
+ return;
+#endif
+#if defined(CLIB_HAVE_VEC256)
+ u16x16 v256 = u16x16_splat (val);
+ while (count >= 16)
+ {
+ u16x16_store_unaligned (v256, ptr);
+ ptr += 16;
+ count -= 16;
+ }
+ if (count == 0)
+ return;
+#endif
+#if defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE)
+ u16x8 v128 = u16x8_splat (val);
+ while (count >= 8)
+ {
+ u16x8_store_unaligned (v128, ptr);
+ ptr += 8;
+ count -= 8;
+ }
+#else
+ while (count >= 4)
+ {
+ ptr[0] = ptr[1] = ptr[2] = ptr[3] = val;
+ ptr += 4;
+ count -= 4;
+ }
+#endif
+ while (count--)
+ ptr++[0] = val;
+}
+
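+/* Same strategy as clib_memset_u64, for 8-bit elements. */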
+static_always_inline void
+clib_memset_u8 (void *p, u8 val, uword count)
+{
+ u8 *ptr = p;
+#if defined(CLIB_HAVE_VEC512)
+ u8x64 v512 = u8x64_splat (val);
+ while (count >= 64)
+ {
+ u8x64_store_unaligned (v512, ptr);
+ ptr += 64;
+ count -= 64;
+ }
+ if (count == 0)
+ return;
+#endif
+#if defined(CLIB_HAVE_VEC256)
+ u8x32 v256 = u8x32_splat (val);
+ while (count >= 32)
+ {
+ u8x32_store_unaligned (v256, ptr);
+ ptr += 32;
+ count -= 32;
+ }
+ if (count == 0)
+ return;
+#endif
+#if defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE)
+ u8x16 v128 = u8x16_splat (val);
+ while (count >= 16)
+ {
+ u8x16_store_unaligned (v128, ptr);
+ ptr += 16;
+ count -= 16;
+ }