static_always_inline void
clib_memcpy_le (u8 * dst, u8 * src, u8 len, u8 max_len)
{
u8x32 mask = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
};
u8x32 lv = u8x32_splat (len);
u8x32 add = u8x32_splat (32);
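  /* Each lane of `mask` holds its own byte index and `lv` broadcasts `len`,
     so u8x32_is_greater (lv, mask) is all-ones exactly in the lanes whose
     index is below len; the blend below takes those bytes from src and keeps
     the existing dst bytes everywhere else.  `add` presumably steps the index
     mask up by 32 for the second 32-byte block in context not shown here.  */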
- s = u8x32_load_unaligned (src);
- d = u8x32_load_unaligned (dst);
- d = u8x32_blend (d, s, u8x32_is_greater (lv, mask));
- u8x32_store_unaligned (d, dst);
+ s0 = u8x32_load_unaligned (src);
+ s1 = u8x32_load_unaligned (src + 32);
+ d0 = u8x32_load_unaligned (dst);
+ d1 = u8x32_load_unaligned (dst + 32);
+
+ d0 = u8x32_blend (d0, s0, u8x32_is_greater (lv, mask));
+ u8x32_store_unaligned (d0, dst);
- s = u8x32_load_unaligned (src + 32);
- d = u8x32_load_unaligned (dst + 32);
- d = u8x32_blend (d, s, u8x32_is_greater (lv, mask));
- u8x32_store_unaligned (d, dst + 32);
+ d1 = u8x32_blend (d1, s1, u8x32_is_greater (lv, mask));
+ u8x32_store_unaligned (d1, dst + 32);
-#elif defined (CLIB_HAVE_VEC128) && !defined (__aarch64__)
- u8x16 s, d;
+#elif defined (CLIB_HAVE_VEC128)
+ u8x16 s0, s1, s2, s3, d0, d1, d2, d3;
u8x16 mask = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
u8x16 lv = u8x16_splat (len);
u8x16 add = u8x16_splat (16);
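  /* Same index-mask blend as the 256-bit path above, using 16-byte vectors
     unrolled four times to cover the full 64 bytes.  */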
- s = u8x16_load_unaligned (src);
- d = u8x16_load_unaligned (dst);
- d = u8x16_blend (d, s, u8x16_is_greater (lv, mask));
- u8x16_store_unaligned (d, dst);
+ s0 = u8x16_load_unaligned (src);
+ s1 = u8x16_load_unaligned (src + 16);
+ s2 = u8x16_load_unaligned (src + 32);
+ s3 = u8x16_load_unaligned (src + 48);
+ d0 = u8x16_load_unaligned (dst);
+ d1 = u8x16_load_unaligned (dst + 16);
+ d2 = u8x16_load_unaligned (dst + 32);
+ d3 = u8x16_load_unaligned (dst + 48);
+
+ d0 = u8x16_blend (d0, s0, u8x16_is_greater (lv, mask));
+ u8x16_store_unaligned (d0, dst);
- s = u8x16_load_unaligned (src + 16);
- d = u8x16_load_unaligned (dst + 16);
- d = u8x16_blend (d, s, u8x16_is_greater (lv, mask));
- u8x16_store_unaligned (d, dst + 16);
+ d1 = u8x16_blend (d1, s1, u8x16_is_greater (lv, mask));
+ u8x16_store_unaligned (d1, dst + 16);
- s = u8x16_load_unaligned (src + 32);
- d = u8x16_load_unaligned (dst + 32);
- d = u8x16_blend (d, s, u8x16_is_greater (lv, mask));
- u8x16_store_unaligned (d, dst + 32);
+ d2 = u8x16_blend (d2, s2, u8x16_is_greater (lv, mask));
+ u8x16_store_unaligned (d2, dst + 32);
- s = u8x16_load_unaligned (src + 48);
- d = u8x16_load_unaligned (dst + 48);
- d = u8x16_blend (d, s, u8x16_is_greater (lv, mask));
- u8x16_store_unaligned (d, dst + 48);
+ d3 = u8x16_blend (d3, s3, u8x16_is_greater (lv, mask));
+ u8x16_store_unaligned (d3, dst + 48);
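  /* Net effect of either path, as a minimal scalar sketch (illustration only,
     not part of the patch): bytes below len are copied, while the rest of each
     re-stored block keeps its original dst contents.

       for (u8 i = 0; i < len; i++)
         dst[i] = src[i];
   */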
{
u64 bmp;
bmp = u8x32_msb_mask ((u8x32) (u64x4_load_unaligned (data) == splat));
if (bmp != 0xffffffff)
{
count += count_trailing_zeros (~bmp) / 8;
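  /* Pattern shared by this and the following blocks: compare a vector of
     elements against the splatted first element, take the per-byte MSB mask,
     invert it, and count trailing zeros to get the byte offset of the first
     mismatch; dividing by the element size (8 for u64, 4 for u32, 2 for u16,
     nothing for u8) converts that offset into an element count.  */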
{
u64 bmp;
bmp = u8x32_msb_mask ((u8x32) (u32x8_load_unaligned (data) == splat));
if (bmp != 0xffffffff)
{
count += count_trailing_zeros (~bmp) / 4;
}
#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK)
u32x4 splat = u32x4_splat (first);
{
u64 bmp;
bmp = u8x16_msb_mask ((u8x16) (u32x4_load_unaligned (data) == splat));
if (bmp != 0xffff)
{
count += count_trailing_zeros (~bmp) / 4;
{
u64 bmp;
bmp = u8x32_msb_mask ((u8x32) (u16x16_load_unaligned (data) == splat));
if (bmp != 0xffffffff)
{
count += count_trailing_zeros (~bmp) / 2;
}
#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK)
u16x8 splat = u16x8_splat (first);
{
u64 bmp;
bmp = u8x16_msb_mask ((u8x16) (u16x8_load_unaligned (data) == splat));
if (bmp != 0xffff)
{
count += count_trailing_zeros (~bmp) / 2;
{
u64 bmp;
bmp = u8x32_msb_mask ((u8x32) (u8x32_load_unaligned (data) == splat));
if (bmp != 0xffffffff)
{
count += count_trailing_zeros (~bmp);
}
#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK)
u8x16 splat = u8x16_splat (first);
{
u64 bmp;
bmp = u8x16_msb_mask ((u8x16) (u8x16_load_unaligned (data) == splat));
if (bmp != 0xffff)
{
count += count_trailing_zeros (~bmp);
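  /* Minimal scalar sketch of the scan these blocks vectorize (illustration
     only; `max_count` is an assumed bound, not shown in this excerpt):

       uword n = 0;
       while (n < max_count && data[n] == first)
         n++;
       return n;
   */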