diff --git a/src/vppinfra/string.h b/src/vppinfra/string.h
index a25d461868b..a3db9264cac 100644
--- a/src/vppinfra/string.h
+++ b/src/vppinfra/string.h
@@ -103,7 +103,7 @@ void clib_memswap (void *_a, void *_b, uword bytes);
  * In order to provide smooth mapping from unsafe string API to the clib string
  * macro, we often have to improvise s1max and s2max due to the additional
  * arguments are required for implementing the safe API. This macro is used
- * to provide the s1max/s2max. It is not perfect becuase the actual
+ * to provide the s1max/s2max. It is not perfect because the actual
  * s1max/s2max may be greater than 4k and the mapping from the unsafe API to
  * the macro would cause a regression. However, it is not terribly likely.
  * So I bet against the odds.
@@ -213,74 +213,84 @@ memset_s_inline (void *s, rsize_t smax, int c, rsize_t n)
  */
 #define clib_memset(s,c,n) memset_s_inline(s,n,c,n)
 
-/*
- * Copy 64 bytes of data to 4 destinations
- * this function is typically used in quad-loop case when whole cacheline
- * needs to be copied to 4 different places. First it reads whole cacheline
- * to 1/2/4 SIMD registers and then it writes data to 4 destinations.
- */
-
 static_always_inline void
-clib_memcpy64_x4 (void *d0, void *d1, void *d2, void *d3, void *s)
+clib_memcpy_le (u8 * dst, u8 * src, u8 len, u8 max_len)
 {
-#if defined (__AVX512F__)
-  __m512i r0 = _mm512_loadu_si512 (s);
-
-  _mm512_storeu_si512 (d0, r0);
-  _mm512_storeu_si512 (d1, r0);
-  _mm512_storeu_si512 (d2, r0);
-  _mm512_storeu_si512 (d3, r0);
-
-#elif defined (__AVX2__)
-  __m256i r0 = _mm256_loadu_si256 ((__m256i *) (s + 0 * 32));
-  __m256i r1 = _mm256_loadu_si256 ((__m256i *) (s + 1 * 32));
-
-  _mm256_storeu_si256 ((__m256i *) (d0 + 0 * 32), r0);
-  _mm256_storeu_si256 ((__m256i *) (d0 + 1 * 32), r1);
-
-  _mm256_storeu_si256 ((__m256i *) (d1 + 0 * 32), r0);
-  _mm256_storeu_si256 ((__m256i *) (d1 + 1 * 32), r1);
-
-  _mm256_storeu_si256 ((__m256i *) (d2 + 0 * 32), r0);
-  _mm256_storeu_si256 ((__m256i *) (d2 + 1 * 32), r1);
-
-  _mm256_storeu_si256 ((__m256i *) (d3 + 0 * 32), r0);
-  _mm256_storeu_si256 ((__m256i *) (d3 + 1 * 32), r1);
-
-#elif defined (__SSSE3__)
-  __m128i r0 = _mm_loadu_si128 ((__m128i *) (s + 0 * 16));
-  __m128i r1 = _mm_loadu_si128 ((__m128i *) (s + 1 * 16));
-  __m128i r2 = _mm_loadu_si128 ((__m128i *) (s + 2 * 16));
-  __m128i r3 = _mm_loadu_si128 ((__m128i *) (s + 3 * 16));
+#if defined (CLIB_HAVE_VEC256)
+  u8x32 s0, s1, d0, d1;
+  u8x32 mask = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+    18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+  };
+  u8x32 lv = u8x32_splat (len);
+  u8x32 add = u8x32_splat (32);
+
+  s0 = u8x32_load_unaligned (src);
+  s1 = u8x32_load_unaligned (src + 32);
+  d0 = u8x32_load_unaligned (dst);
+  d1 = u8x32_load_unaligned (dst + 32);
+
+  d0 = u8x32_blend (d0, s0, u8x32_is_greater (lv, mask));
+  u8x32_store_unaligned (d0, dst);
+
+  if (max_len <= 32)
+    return;
 
-  _mm_storeu_si128 ((__m128i *) (d0 + 0 * 16), r0);
-  _mm_storeu_si128 ((__m128i *) (d0 + 1 * 16), r1);
-  _mm_storeu_si128 ((__m128i *) (d0 + 2 * 16), r2);
-  _mm_storeu_si128 ((__m128i *) (d0 + 3 * 16), r3);
+  mask += add;
+  d1 = u8x32_blend (d1, s1, u8x32_is_greater (lv, mask));
+  u8x32_store_unaligned (d1, dst + 32);
+
+#elif defined (CLIB_HAVE_VEC128)
+  u8x16 s0, s1, s2, s3, d0, d1, d2, d3;
+  u8x16 mask = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+  u8x16 lv = u8x16_splat (len);
+  u8x16 add = u8x16_splat (16);
+
+  s0 = u8x16_load_unaligned (src);
+  s1 = u8x16_load_unaligned (src + 16);
+  s2 = u8x16_load_unaligned (src + 32);
+  s3 = u8x16_load_unaligned (src + 48);
+  d0 = u8x16_load_unaligned (dst);
+  d1 = u8x16_load_unaligned (dst + 16);
+  d2 = u8x16_load_unaligned (dst + 32);
+  d3 = u8x16_load_unaligned (dst + 48);
+
+  d0 = u8x16_blend (d0, s0, u8x16_is_greater (lv, mask));
+  u8x16_store_unaligned (d0, dst);
+
+  if (max_len <= 16)
+    return;
 
-  _mm_storeu_si128 ((__m128i *) (d1 + 0 * 16), r0);
-  _mm_storeu_si128 ((__m128i *) (d1 + 1 * 16), r1);
-  _mm_storeu_si128 ((__m128i *) (d1 + 2 * 16), r2);
-  _mm_storeu_si128 ((__m128i *) (d1 + 3 * 16), r3);
+  mask += add;
+  d1 = u8x16_blend (d1, s1, u8x16_is_greater (lv, mask));
+  u8x16_store_unaligned (d1, dst + 16);
 
-  _mm_storeu_si128 ((__m128i *) (d2 + 0 * 16), r0);
-  _mm_storeu_si128 ((__m128i *) (d2 + 1 * 16), r1);
-  _mm_storeu_si128 ((__m128i *) (d2 + 2 * 16), r2);
-  _mm_storeu_si128 ((__m128i *) (d2 + 3 * 16), r3);
+  if (max_len <= 32)
+    return;
 
-  _mm_storeu_si128 ((__m128i *) (d3 + 0 * 16), r0);
-  _mm_storeu_si128 ((__m128i *) (d3 + 1 * 16), r1);
-  _mm_storeu_si128 ((__m128i *) (d3 + 2 * 16), r2);
-  _mm_storeu_si128 ((__m128i *) (d3 + 3 * 16), r3);
+  mask += add;
+  d2 = u8x16_blend (d2, s2, u8x16_is_greater (lv, mask));
+  u8x16_store_unaligned (d2, dst + 32);
+
+  mask += add;
+  d3 = u8x16_blend (d3, s3, u8x16_is_greater (lv, mask));
+  u8x16_store_unaligned (d3, dst + 48);
 #else
-  clib_memcpy_fast (d0, s, 64);
-  clib_memcpy_fast (d1, s, 64);
-  clib_memcpy_fast (d2, s, 64);
-  clib_memcpy_fast (d3, s, 64);
+  memmove (dst, src, len);
 #endif
 }
 
+static_always_inline void
+clib_memcpy_le64 (u8 * dst, u8 * src, u8 len)
+{
+  clib_memcpy_le (dst, src, len, 64);
+}
+
+static_always_inline void
+clib_memcpy_le32 (u8 * dst, u8 * src, u8 len)
+{
+  clib_memcpy_le (dst, src, len, 32);
+}
+
 static_always_inline void
 clib_memset_u64 (void *p, u64 val, uword count)
 {
@@ -1025,16 +1035,27 @@ strncpy_s_inline (char *__restrict__ dest, rsize_t dmax,
     }
   else
-    m = n;
+    /* cap the copy to strlen(src) in case n > strlen(src) */
+    m = clib_strnlen (src, n);
 
   /* Check for src/dst overlap, which is not allowed */
   low = (uword) (src < dest ? src : dest);
   hi = (uword) (src < dest ? dest : src);
 
+  /*
+   * This check may fail innocently if src + dmax >= dst, but
+   * src + strlen(src) < dst. If it fails, check more carefully before
+   * blowing the whistle.
+   */
   if (PREDICT_FALSE (low + (m - 1) >= hi))
     {
-      clib_c11_violation ("src/dest overlap");
-      return EINVAL;
+      m = clib_strnlen (src, m);
+
+      if (low + (m - 1) >= hi)
+	{
+	  clib_c11_violation ("src/dest overlap");
+	  return EINVAL;
+	}
     }
 
   clib_memcpy_fast (dest, src, m);
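
A note on the new clib_memcpy_le above: the vector paths build a constant
lane-index vector (0..31 or 0..15), splat len into every lane, and use the
(len > lane_index) comparison as a blend mask, so lanes below len take the
src byte while the remaining lanes keep the byte already in dst. The scalar
sketch below models those semantics; the helper name scalar_memcpy_le is
hypothetical and not part of string.h (u8 is vppinfra's byte type). One
caveat visible in the diff: both SIMD paths load the full 64 bytes of src
and dst up front, before the max_len checks, so callers must guarantee both
buffers are readable for 64 bytes even when len is small; stores are
bounded by max_len.

/*
 * Hypothetical scalar model of clib_memcpy_le, for illustration only.
 * Byte i is taken from src when len > i, otherwise dst[i] is left as
 * is -- e.g. len = 5 selects bytes 0..4 from src and preserves bytes
 * 5..max_len-1 of dst.
 */
static inline void
scalar_memcpy_le (u8 * dst, u8 * src, u8 len, u8 max_len)
{
  u8 i;

  for (i = 0; i < max_len; i++)
    if (i < len)
      dst[i] = src[i];
}

A plausible use of the wrappers (names hypothetical): rewrite the first
hdr_len (<= 64) bytes of a packet in place while leaving the rest of the
cacheline untouched, e.g. clib_memcpy_le64 (pkt, new_hdr, hdr_len);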
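
The strncpy_s_inline change is easiest to see with a worked example. The
values below are hypothetical; strncpy_s is assumed to be this header's
macro wrapper around strncpy_s_inline, with errno_t/EOK as defined earlier
in the file and ASSERT from vppinfra.

/* Hypothetical demonstration, not part of string.h. */
static void
overlap_check_demo (void)
{
  char buf[32] = "hi";
  errno_t rv;

  /*
   * src = buf, dest = buf + 8, so low = buf and hi = buf + 8.
   * Previously m = n = 12 and low + (m - 1) = buf + 11 >= hi, so the
   * call was rejected with EINVAL even though the bytes actually
   * copied (src[0..1]) end well before buf + 8.  With
   * m = clib_strnlen (src, n) = 2 the check sees buf + 1 < buf + 8
   * and the copy proceeds.  The inner re-check serves the same
   * purpose for the truncation path outside this hunk, where m may
   * still be derived from dmax rather than strlen (src).
   */
  rv = strncpy_s (buf + 8, 16, buf, 12);
  ASSERT (rv == EOK);		/* buf + 8 now holds "hi" */
}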