diff --git a/src/vppinfra/string.h b/src/vppinfra/string.h
index 4d2ff7875a2..b00c0cfbcc2 100644
--- a/src/vppinfra/string.h
+++ b/src/vppinfra/string.h
@@ -35,6 +35,12 @@
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 
+/** \file
+
+    Optimized string handling code, including c11-compliant
+    "safe C library" variants.
+*/
+
 #ifndef included_clib_string_h
 #define included_clib_string_h
 
@@ -65,17 +71,131 @@ void clib_memswap (void *_a, void *_b, uword bytes);
  * so don't let it anywhere near them.
  */
 #ifndef __COVERITY__
-#if __AVX__
-#include <vppinfra/memcpy_avx.h>
+#if __AVX512F__
+#include <vppinfra/memcpy_avx512.h>
+#elif __AVX2__
+#include <vppinfra/memcpy_avx2.h>
 #elif __SSSE3__
 #include <vppinfra/memcpy_sse3.h>
 #else
-#define clib_memcpy(a,b,c) memcpy(a,b,c)
+#define clib_memcpy_fast(a,b,c) memcpy(a,b,c)
 #endif
 #else /* __COVERITY__ */
-#define clib_memcpy(a,b,c) memcpy(a,b,c)
+#define clib_memcpy_fast(a,b,c) memcpy(a,b,c)
+#endif
+
+/* c-11 string manipulation variants */
+
+#ifndef EOK
+#define EOK 0
+#endif
+#ifndef EINVAL
+#define EINVAL 22
 #endif
 
+typedef int errno_t;
+typedef uword rsize_t;
+
+void clib_c11_violation (const char *s);
+errno_t memcpy_s (void *__restrict__ dest, rsize_t dmax,
+                  const void *__restrict__ src, rsize_t n);
+
+always_inline errno_t
+memcpy_s_inline (void *__restrict__ dest, rsize_t dmax,
+                 const void *__restrict__ src, rsize_t n)
+{
+  uword low, hi;
+  u8 bad;
+
+  /*
+   * Optimize constant-number-of-bytes calls without asking
+   * "too many questions for someone from New Jersey"
+   */
+  if (__builtin_constant_p (n))
+    {
+      clib_memcpy_fast (dest, src, n);
+      return EOK;
+    }
+
+  /*
+   * call bogus if: src or dst NULL, trying to copy
+   * more data than we have space in dst, or src == dst.
+   * n == 0 isn't really "bad", so check first in the
+   * "wall-of-shame" department...
+   */
+  bad = (dest == 0) + (src == 0) + (n > dmax) + (dest == src) + (n == 0);
+  if (PREDICT_FALSE (bad != 0))
+    {
+      /* Not actually trying to copy anything is OK */
+      if (n == 0)
+        return EOK;
+      if (dest == NULL)
+        clib_c11_violation ("dest NULL");
+      if (src == NULL)
+        clib_c11_violation ("src NULL");
+      if (n > dmax)
+        clib_c11_violation ("n > dmax");
+      if (dest == src)
+        clib_c11_violation ("dest == src");
+      return EINVAL;
+    }
+
+  /* Check for src/dst overlap, which is not allowed */
+  low = (uword) (src < dest ? src : dest);
+  hi = (uword) (src < dest ? dest : src);
+
+  if (PREDICT_FALSE (low + (n - 1) >= hi))
+    {
+      clib_c11_violation ("src/dest overlap");
+      return EINVAL;
+    }
+
+  clib_memcpy_fast (dest, src, n);
+  return EOK;
+}
+
+/*
+ * Note: $$$ This macro is a crutch. Folks need to manually
+ * inspect every extant clib_memcpy(...) call and
+ * attempt to provide a real destination buffer size
+ * argument...
+ */
+#define clib_memcpy(d,s,n) memcpy_s_inline(d,n,s,n)
+
+errno_t memset_s (void *s, rsize_t smax, int c, rsize_t n);
+
+always_inline errno_t
+memset_s_inline (void *s, rsize_t smax, int c, rsize_t n)
+{
+  u8 bad;
+
+  bad = (s == 0) + (n > smax);
+
+  if (PREDICT_FALSE (bad != 0))
+    {
+      if (s == 0)
+        clib_c11_violation ("s NULL");
+      if (n > smax)
+        clib_c11_violation ("n > smax");
+      return (EINVAL);
+    }
+  memset (s, c, n);
+  return (EOK);
+}
+
+/*
+ * This macro is not [so much of] a crutch.
+ * It's super-typical to write: + * + * ep = pool_get (); + * clib_memset(ep, 0, sizeof (*ep)); + * + * The compiler should delete the not-so useful + * (n > smax) test. TBH the NULL pointer check isn't + * so useful in this case, but so be it. + */ +#define clib_memset(s,c,n) memset_s_inline(s,n,c,n) + /* * Copy 64 bytes of data to 4 destinations * this function is typically used in quad-loop case when whole cacheline @@ -95,53 +215,482 @@ clib_memcpy64_x4 (void *d0, void *d1, void *d2, void *d3, void *s) _mm512_storeu_si512 (d3, r0); #elif defined (__AVX2__) - __m256i r0 = _mm256_loadu_si256 ((__m256i *) s + 0 * 32); - __m256i r1 = _mm256_loadu_si256 ((__m256i *) s + 1 * 32); + __m256i r0 = _mm256_loadu_si256 ((__m256i *) (s + 0 * 32)); + __m256i r1 = _mm256_loadu_si256 ((__m256i *) (s + 1 * 32)); - _mm256_storeu_si256 ((__m256i *) d0 + 0 * 32, r0); - _mm256_storeu_si256 ((__m256i *) d0 + 1 * 32, r1); + _mm256_storeu_si256 ((__m256i *) (d0 + 0 * 32), r0); + _mm256_storeu_si256 ((__m256i *) (d0 + 1 * 32), r1); - _mm256_storeu_si256 ((__m256i *) d1 + 0 * 32, r0); - _mm256_storeu_si256 ((__m256i *) d1 + 1 * 32, r1); + _mm256_storeu_si256 ((__m256i *) (d1 + 0 * 32), r0); + _mm256_storeu_si256 ((__m256i *) (d1 + 1 * 32), r1); - _mm256_storeu_si256 ((__m256i *) d2 + 0 * 32, r0); - _mm256_storeu_si256 ((__m256i *) d2 + 1 * 32, r1); + _mm256_storeu_si256 ((__m256i *) (d2 + 0 * 32), r0); + _mm256_storeu_si256 ((__m256i *) (d2 + 1 * 32), r1); - _mm256_storeu_si256 ((__m256i *) d3 + 0 * 32, r0); - _mm256_storeu_si256 ((__m256i *) d3 + 1 * 32, r1); + _mm256_storeu_si256 ((__m256i *) (d3 + 0 * 32), r0); + _mm256_storeu_si256 ((__m256i *) (d3 + 1 * 32), r1); #elif defined (__SSSE3__) - __m128i r0 = _mm_loadu_si128 ((__m128i *) s + 0 * 16); - __m128i r1 = _mm_loadu_si128 ((__m128i *) s + 1 * 16); - __m128i r2 = _mm_loadu_si128 ((__m128i *) s + 2 * 16); - __m128i r3 = _mm_loadu_si128 ((__m128i *) s + 3 * 16); - - _mm_storeu_si128 ((__m128i *) d0 + 0 * 16, r0); - _mm_storeu_si128 ((__m128i *) d0 + 1 * 16, r1); - _mm_storeu_si128 ((__m128i *) d0 + 2 * 16, r2); - _mm_storeu_si128 ((__m128i *) d0 + 3 * 16, r3); - - _mm_storeu_si128 ((__m128i *) d1 + 0 * 16, r0); - _mm_storeu_si128 ((__m128i *) d1 + 1 * 16, r1); - _mm_storeu_si128 ((__m128i *) d1 + 2 * 16, r2); - _mm_storeu_si128 ((__m128i *) d1 + 3 * 16, r3); - - _mm_storeu_si128 ((__m128i *) d2 + 0 * 16, r0); - _mm_storeu_si128 ((__m128i *) d2 + 1 * 16, r1); - _mm_storeu_si128 ((__m128i *) d2 + 2 * 16, r2); - _mm_storeu_si128 ((__m128i *) d2 + 3 * 16, r3); - - _mm_storeu_si128 ((__m128i *) d3 + 0 * 16, r0); - _mm_storeu_si128 ((__m128i *) d3 + 1 * 16, r1); - _mm_storeu_si128 ((__m128i *) d3 + 2 * 16, r2); - _mm_storeu_si128 ((__m128i *) d3 + 3 * 16, r3); + __m128i r0 = _mm_loadu_si128 ((__m128i *) (s + 0 * 16)); + __m128i r1 = _mm_loadu_si128 ((__m128i *) (s + 1 * 16)); + __m128i r2 = _mm_loadu_si128 ((__m128i *) (s + 2 * 16)); + __m128i r3 = _mm_loadu_si128 ((__m128i *) (s + 3 * 16)); + + _mm_storeu_si128 ((__m128i *) (d0 + 0 * 16), r0); + _mm_storeu_si128 ((__m128i *) (d0 + 1 * 16), r1); + _mm_storeu_si128 ((__m128i *) (d0 + 2 * 16), r2); + _mm_storeu_si128 ((__m128i *) (d0 + 3 * 16), r3); + + _mm_storeu_si128 ((__m128i *) (d1 + 0 * 16), r0); + _mm_storeu_si128 ((__m128i *) (d1 + 1 * 16), r1); + _mm_storeu_si128 ((__m128i *) (d1 + 2 * 16), r2); + _mm_storeu_si128 ((__m128i *) (d1 + 3 * 16), r3); + + _mm_storeu_si128 ((__m128i *) (d2 + 0 * 16), r0); + _mm_storeu_si128 ((__m128i *) (d2 + 1 * 16), r1); + _mm_storeu_si128 ((__m128i *) (d2 + 2 * 16), r2); + 
_mm_storeu_si128 ((__m128i *) (d2 + 3 * 16), r3); + + _mm_storeu_si128 ((__m128i *) (d3 + 0 * 16), r0); + _mm_storeu_si128 ((__m128i *) (d3 + 1 * 16), r1); + _mm_storeu_si128 ((__m128i *) (d3 + 2 * 16), r2); + _mm_storeu_si128 ((__m128i *) (d3 + 3 * 16), r3); + +#else + clib_memcpy_fast (d0, s, 64); + clib_memcpy_fast (d1, s, 64); + clib_memcpy_fast (d2, s, 64); + clib_memcpy_fast (d3, s, 64); +#endif +} + +static_always_inline void +clib_memset_u64 (void *p, u64 val, uword count) +{ + u64 *ptr = p; +#if defined(CLIB_HAVE_VEC512) + u64x8 v512 = u64x8_splat (val); + while (count >= 8) + { + u64x8_store_unaligned (v512, ptr); + ptr += 8; + count -= 8; + } + if (count == 0) + return; +#endif +#if defined(CLIB_HAVE_VEC256) + u64x4 v256 = u64x4_splat (val); + while (count >= 4) + { + u64x4_store_unaligned (v256, ptr); + ptr += 4; + count -= 4; + } + if (count == 0) + return; +#else + while (count >= 4) + { + ptr[0] = ptr[1] = ptr[2] = ptr[3] = val; + ptr += 4; + count -= 4; + } +#endif + while (count--) + ptr++[0] = val; +} + +static_always_inline void +clib_memset_u32 (void *p, u32 val, uword count) +{ + u32 *ptr = p; +#if defined(CLIB_HAVE_VEC512) + u32x16 v512 = u32x16_splat (val); + while (count >= 16) + { + u32x16_store_unaligned (v512, ptr); + ptr += 16; + count -= 16; + } + if (count == 0) + return; +#endif +#if defined(CLIB_HAVE_VEC256) + u32x8 v256 = u32x8_splat (val); + while (count >= 8) + { + u32x8_store_unaligned (v256, ptr); + ptr += 8; + count -= 8; + } + if (count == 0) + return; +#endif +#if defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE) + u32x4 v128 = u32x4_splat (val); + while (count >= 4) + { + u32x4_store_unaligned (v128, ptr); + ptr += 4; + count -= 4; + } +#else + while (count >= 4) + { + ptr[0] = ptr[1] = ptr[2] = ptr[3] = val; + ptr += 4; + count -= 4; + } +#endif + while (count--) + ptr++[0] = val; +} +static_always_inline void +clib_memset_u16 (void *p, u16 val, uword count) +{ + u16 *ptr = p; +#if defined(CLIB_HAVE_VEC512) + u16x32 v512 = u16x32_splat (val); + while (count >= 32) + { + u16x32_store_unaligned (v512, ptr); + ptr += 32; + count -= 32; + } + if (count == 0) + return; +#endif +#if defined(CLIB_HAVE_VEC256) + u16x16 v256 = u16x16_splat (val); + while (count >= 16) + { + u16x16_store_unaligned (v256, ptr); + ptr += 16; + count -= 16; + } + if (count == 0) + return; +#endif +#if defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE) + u16x8 v128 = u16x8_splat (val); + while (count >= 8) + { + u16x8_store_unaligned (v128, ptr); + ptr += 8; + count -= 8; + } +#else + while (count >= 4) + { + ptr[0] = ptr[1] = ptr[2] = ptr[3] = val; + ptr += 4; + count -= 4; + } +#endif + while (count--) + ptr++[0] = val; +} + +static_always_inline void +clib_memset_u8 (void *p, u8 val, uword count) +{ + u8 *ptr = p; +#if defined(CLIB_HAVE_VEC512) + u8x64 v512 = u8x64_splat (val); + while (count >= 64) + { + u8x64_store_unaligned (v512, ptr); + ptr += 64; + count -= 64; + } + if (count == 0) + return; +#endif +#if defined(CLIB_HAVE_VEC256) + u8x32 v256 = u8x32_splat (val); + while (count >= 32) + { + u8x32_store_unaligned (v256, ptr); + ptr += 32; + count -= 32; + } + if (count == 0) + return; +#endif +#if defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE) + u8x16 v128 = u8x16_splat (val); + while (count >= 16) + { + u8x16_store_unaligned (v128, ptr); + ptr += 16; + count -= 16; + } #else - clib_memcpy (d0, s, 64); - clib_memcpy (d1, s, 64); - clib_memcpy (d2, s, 64); - clib_memcpy (d3, s, 64); + while 
(count >= 4) + { + ptr[0] = ptr[1] = ptr[2] = ptr[3] = val; + ptr += 4; + count -= 4; + } +#endif + while (count--) + ptr++[0] = val; +} + +static_always_inline uword +clib_count_equal_u64 (u64 * data, uword max_count) +{ + uword count; + u64 first; + + if (max_count == 1) + return 1; + if (data[0] != data[1]) + return 1; + + count = 0; + first = data[0]; + +#if defined(CLIB_HAVE_VEC256) + u64x4 splat = u64x4_splat (first); + while (1) + { + u64 bmp; + bmp = u8x32_msb_mask ((u8x32) (u64x4_load_unaligned (data) == splat)); + if (bmp != 0xffffffff) + { + count += count_trailing_zeros (~bmp) / 8; + return clib_min (count, max_count); + } + + data += 4; + count += 4; + + if (count >= max_count) + return max_count; + } +#endif + count += 2; + data += 2; + while (count + 3 < max_count && + ((data[0] ^ first) | (data[1] ^ first) | + (data[2] ^ first) | (data[3] ^ first)) == 0) + { + data += 4; + count += 4; + } + while (count < max_count && (data[0] == first)) + { + data += 1; + count += 1; + } + return count; +} + +static_always_inline uword +clib_count_equal_u32 (u32 * data, uword max_count) +{ + uword count; + u32 first; + + if (max_count == 1) + return 1; + if (data[0] != data[1]) + return 1; + + count = 0; + first = data[0]; + +#if defined(CLIB_HAVE_VEC256) + u32x8 splat = u32x8_splat (first); + while (1) + { + u64 bmp; + bmp = u8x32_msb_mask ((u8x32) (u32x8_load_unaligned (data) == splat)); + if (bmp != 0xffffffff) + { + count += count_trailing_zeros (~bmp) / 4; + return clib_min (count, max_count); + } + + data += 8; + count += 8; + + if (count >= max_count) + return max_count; + } +#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK) + u32x4 splat = u32x4_splat (first); + while (1) + { + u64 bmp; + bmp = u8x16_msb_mask ((u8x16) (u32x4_load_unaligned (data) == splat)); + if (bmp != 0xffff) + { + count += count_trailing_zeros (~bmp) / 4; + return clib_min (count, max_count); + } + + data += 4; + count += 4; + + if (count >= max_count) + return max_count; + } +#endif + count += 2; + data += 2; + while (count + 3 < max_count && + ((data[0] ^ first) | (data[1] ^ first) | + (data[2] ^ first) | (data[3] ^ first)) == 0) + { + data += 4; + count += 4; + } + while (count < max_count && (data[0] == first)) + { + data += 1; + count += 1; + } + return count; +} + +static_always_inline uword +clib_count_equal_u16 (u16 * data, uword max_count) +{ + uword count; + u16 first; + + if (max_count == 1) + return 1; + if (data[0] != data[1]) + return 1; + + count = 0; + first = data[0]; + +#if defined(CLIB_HAVE_VEC256) + u16x16 splat = u16x16_splat (first); + while (1) + { + u64 bmp; + bmp = u8x32_msb_mask ((u8x32) (u16x16_load_unaligned (data) == splat)); + if (bmp != 0xffffffff) + { + count += count_trailing_zeros (~bmp) / 2; + return clib_min (count, max_count); + } + + data += 16; + count += 16; + + if (count >= max_count) + return max_count; + } +#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK) + u16x8 splat = u16x8_splat (first); + while (1) + { + u64 bmp; + bmp = u8x16_msb_mask ((u8x16) (u16x8_load_unaligned (data) == splat)); + if (bmp != 0xffff) + { + count += count_trailing_zeros (~bmp) / 2; + return clib_min (count, max_count); + } + + data += 8; + count += 8; + + if (count >= max_count) + return max_count; + } +#endif + count += 2; + data += 2; + while (count + 3 < max_count && + ((data[0] ^ first) | (data[1] ^ first) | + (data[2] ^ first) | (data[3] ^ first)) == 0) + { + data += 4; + count += 4; + } + while (count < max_count && (data[0] == first)) + { + data 
+= 1; + count += 1; + } + return count; +} + +static_always_inline uword +clib_count_equal_u8 (u8 * data, uword max_count) +{ + uword count; + u8 first; + + if (max_count == 1) + return 1; + if (data[0] != data[1]) + return 1; + + count = 0; + first = data[0]; + +#if defined(CLIB_HAVE_VEC256) + u8x32 splat = u8x32_splat (first); + while (1) + { + u64 bmp; + bmp = u8x32_msb_mask ((u8x32) (u8x32_load_unaligned (data) == splat)); + if (bmp != 0xffffffff) + { + count += count_trailing_zeros (~bmp); + return clib_min (count, max_count); + } + + data += 32; + count += 32; + + if (count >= max_count) + return max_count; + } +#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK) + u8x16 splat = u8x16_splat (first); + while (1) + { + u64 bmp; + bmp = u8x16_msb_mask ((u8x16) (u8x16_load_unaligned (data) == splat)); + if (bmp != 0xffff) + { + count += count_trailing_zeros (~bmp); + return clib_min (count, max_count); + } + + data += 16; + count += 16; + + if (count >= max_count) + return max_count; + } #endif + count += 2; + data += 2; + while (count + 3 < max_count && + ((data[0] ^ first) | (data[1] ^ first) | + (data[2] ^ first) | (data[3] ^ first)) == 0) + { + data += 4; + count += 4; + } + while (count < max_count && (data[0] == first)) + { + data += 1; + count += 1; + } + return count; } #endif /* included_clib_string_h */
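
A minimal usage sketch of the interfaces this patch introduces (memcpy_s_inline, the clib_memcpy / clib_memset wrapper macros, clib_memset_u8 / clib_memset_u32, and clib_count_equal_u32). It is not part of the change itself; it assumes a VPP source tree where the <vppinfra/...> headers are on the include path and the program is linked against vppinfra, which supplies clib_c11_violation(), clib_min() and the u8/u32/uword typedefs. The main() harness, buffer sizes, and fill values below are illustrative only.

/*
 * Usage sketch (assumption: built inside a VPP tree and linked
 * against vppinfra, which provides clib_c11_violation() and the
 * basic typedefs used here).
 */
#include <stdio.h>
#include <vppinfra/clib.h>
#include <vppinfra/string.h>

int
main (int argc, char *argv[])
{
  u8 dst[64], src[64];
  u32 words[256];
  rsize_t n;
  uword run;

  /* Fill the source buffer with a repeated byte value */
  clib_memset_u8 (src, 0xaa, sizeof (src));

  /*
   * A runtime-sized copy: n is not a compile-time constant, so
   * memcpy_s_inline performs its NULL / overlap / n > dmax checks.
   * A violation would be reported via clib_c11_violation() and the
   * call would return EINVAL instead of copying.
   */
  n = clib_min ((rsize_t) argc * 8, sizeof (dst));
  if (memcpy_s_inline (dst, sizeof (dst), src, n) != EOK)
    return 1;

  /*
   * clib_memcpy(d,s,n) and clib_memset(s,c,n) expand to the _s_inline
   * variants with dmax/smax set to n, so legacy call sites keep working.
   */
  clib_memset (dst, 0, sizeof (dst));

  /* Fill a u32 array, then measure the leading run of equal elements */
  clib_memset_u32 (words, 7, 256);
  words[100] = 8;
  run = clib_count_equal_u32 (words, 256);	/* 100 */

  printf ("copied %lu bytes, leading run = %lu\n",
          (unsigned long) n, (unsigned long) run);
  return 0;
}

Because memcpy_s_inline short-circuits on __builtin_constant_p (n), fixed-size copies in the data path still compile down to clib_memcpy_fast() with no runtime checks; only runtime-sized calls pay for the NULL, overlap, and n > dmax tests.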