vppinfra: add COMPILE_TIME_CONST() macro

[vpp.git] / src / vppinfra / string.h
diff --git a/src/vppinfra/string.h b/src/vppinfra/string.h

index d568670..76d345e 100644 (file)
--- a/src/vppinfra/string.h
+++ b/src/vppinfra/string.h
@@ -46,6 +46,7 @@
  
  #include <vppinfra/clib.h>     /* for CLIB_LINUX_KERNEL */
  #include <vppinfra/vector.h>
+#include <vppinfra/error_bootstrap.h>
  
  #ifdef CLIB_LINUX_KERNEL
  #include <linux/string.h>
@@ -71,18 +72,32 @@ void clib_memswap (void *_a, void *_b, uword bytes);
   * so don't let it anywhere near them.
   */
  #ifndef __COVERITY__
-#if __AVX512F__
+#if __AVX512BITALG__
  #include <vppinfra/memcpy_avx512.h>
+#define clib_memcpy_fast_arch(a, b, c) clib_memcpy_fast_avx512 (a, b, c)
  #elif __AVX2__
  #include <vppinfra/memcpy_avx2.h>
+#define clib_memcpy_fast_arch(a, b, c) clib_memcpy_fast_avx2 (a, b, c)
  #elif __SSSE3__
  #include <vppinfra/memcpy_sse3.h>
-#else
-#define clib_memcpy_fast(a,b,c) memcpy(a,b,c)
-#endif
-#else /* __COVERITY__ */
-#define clib_memcpy_fast(a,b,c) memcpy(a,b,c)
-#endif
+#define clib_memcpy_fast_arch(a, b, c) clib_memcpy_fast_sse3 (a, b, c)
+#endif /* __AVX512BITALG__ */
+#endif /* __COVERITY__ */
+
+#ifndef clib_memcpy_fast_arch
+#define clib_memcpy_fast_arch(a, b, c) memcpy (a, b, c)
+#endif /* clib_memcpy_fast_arch */
+
+static_always_inline void *
+clib_memcpy_fast (void *restrict dst, const void *restrict src, size_t n)
+{
+  ASSERT (dst && src &&
+         "memcpy(src, dst, n) with src == NULL or dst == NULL is undefined "
+         "behaviour");
+  return clib_memcpy_fast_arch (dst, src, n);
+}
+
+#undef clib_memcpy_fast_arch
  
  /* c-11 string manipulation variants */
  
@@ -103,7 +118,7 @@ void clib_memswap (void *_a, void *_b, uword bytes);
   * In order to provide smooth mapping from unsafe string API to the clib string
   * macro, we often have to improvise s1max and s2max due to the additional
   * arguments are required for implementing the safe API. This macro is used
- * to provide the s1max/s2max. It is not perfect becuase the actual
+ * to provide the s1max/s2max. It is not perfect because the actual
   * s1max/s2max may be greater than 4k and the mapping from the unsafe API to
   * the macro would cause a regression. However, it is not terribly likely.
   * So I bet against the odds.
@@ -128,7 +143,7 @@ memcpy_s_inline (void *__restrict__ dest, rsize_t dmax,
     * Optimize constant-number-of-bytes calls without asking
     * "too many questions for someone from New Jersey"
     */
-  if (__builtin_constant_p (n))
+  if (COMPILE_TIME_CONST (n))
      {
        clib_memcpy_fast (dest, src, n);
        return EOK;
@@ -213,74 +228,84 @@ memset_s_inline (void *s, rsize_t smax, int c, rsize_t n)
   */
  #define clib_memset(s,c,n) memset_s_inline(s,n,c,n)
  
-/*
- * Copy 64 bytes of data to 4 destinations
- * this function is typically used in quad-loop case when whole cacheline
- * needs to be copied to 4 different places. First it reads whole cacheline
- * to 1/2/4 SIMD registers and then it writes data to 4 destinations.
- */
-
  static_always_inline void
-clib_memcpy64_x4 (void *d0, void *d1, void *d2, void *d3, void *s)
+clib_memcpy_le (u8 * dst, u8 * src, u8 len, u8 max_len)
  {
-#if defined (__AVX512F__)
-  __m512i r0 = _mm512_loadu_si512 (s);
-
-  _mm512_storeu_si512 (d0, r0);
-  _mm512_storeu_si512 (d1, r0);
-  _mm512_storeu_si512 (d2, r0);
-  _mm512_storeu_si512 (d3, r0);
-
-#elif defined (__AVX2__)
-  __m256i r0 = _mm256_loadu_si256 ((__m256i *) (s + 0 * 32));
-  __m256i r1 = _mm256_loadu_si256 ((__m256i *) (s + 1 * 32));
-
-  _mm256_storeu_si256 ((__m256i *) (d0 + 0 * 32), r0);
-  _mm256_storeu_si256 ((__m256i *) (d0 + 1 * 32), r1);
-
-  _mm256_storeu_si256 ((__m256i *) (d1 + 0 * 32), r0);
-  _mm256_storeu_si256 ((__m256i *) (d1 + 1 * 32), r1);
-
-  _mm256_storeu_si256 ((__m256i *) (d2 + 0 * 32), r0);
-  _mm256_storeu_si256 ((__m256i *) (d2 + 1 * 32), r1);
-
-  _mm256_storeu_si256 ((__m256i *) (d3 + 0 * 32), r0);
-  _mm256_storeu_si256 ((__m256i *) (d3 + 1 * 32), r1);
-
-#elif defined (__SSSE3__)
-  __m128i r0 = _mm_loadu_si128 ((__m128i *) (s + 0 * 16));
-  __m128i r1 = _mm_loadu_si128 ((__m128i *) (s + 1 * 16));
-  __m128i r2 = _mm_loadu_si128 ((__m128i *) (s + 2 * 16));
-  __m128i r3 = _mm_loadu_si128 ((__m128i *) (s + 3 * 16));
+#if defined (CLIB_HAVE_VEC256)
+  u8x32 s0, s1, d0, d1;
+  u8x32 mask = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+    18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+  };
+  u8x32 lv = u8x32_splat (len);
+  u8x32 add = u8x32_splat (32);
+
+  s0 = u8x32_load_unaligned (src);
+  s1 = u8x32_load_unaligned (src + 32);
+  d0 = u8x32_load_unaligned (dst);
+  d1 = u8x32_load_unaligned (dst + 32);
+
+  d0 = u8x32_blend (d0, s0, u8x32_is_greater (lv, mask));
+  u8x32_store_unaligned (d0, dst);
+
+  if (max_len <= 32)
+    return;
  
-  _mm_storeu_si128 ((__m128i *) (d0 + 0 * 16), r0);
-  _mm_storeu_si128 ((__m128i *) (d0 + 1 * 16), r1);
-  _mm_storeu_si128 ((__m128i *) (d0 + 2 * 16), r2);
-  _mm_storeu_si128 ((__m128i *) (d0 + 3 * 16), r3);
+  mask += add;
+  d1 = u8x32_blend (d1, s1, u8x32_is_greater (lv, mask));
+  u8x32_store_unaligned (d1, dst + 32);
+
+#elif defined (CLIB_HAVE_VEC128)
+  u8x16 s0, s1, s2, s3, d0, d1, d2, d3;
+  u8x16 mask = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+  u8x16 lv = u8x16_splat (len);
+  u8x16 add = u8x16_splat (16);
+
+  s0 = u8x16_load_unaligned (src);
+  s1 = u8x16_load_unaligned (src + 16);
+  s2 = u8x16_load_unaligned (src + 32);
+  s3 = u8x16_load_unaligned (src + 48);
+  d0 = u8x16_load_unaligned (dst);
+  d1 = u8x16_load_unaligned (dst + 16);
+  d2 = u8x16_load_unaligned (dst + 32);
+  d3 = u8x16_load_unaligned (dst + 48);
+
+  d0 = u8x16_blend (d0, s0, u8x16_is_greater (lv, mask));
+  u8x16_store_unaligned (d0, dst);
+
+  if (max_len <= 16)
+    return;
  
-  _mm_storeu_si128 ((__m128i *) (d1 + 0 * 16), r0);
-  _mm_storeu_si128 ((__m128i *) (d1 + 1 * 16), r1);
-  _mm_storeu_si128 ((__m128i *) (d1 + 2 * 16), r2);
-  _mm_storeu_si128 ((__m128i *) (d1 + 3 * 16), r3);
+  mask += add;
+  d1 = u8x16_blend (d1, s1, u8x16_is_greater (lv, mask));
+  u8x16_store_unaligned (d1, dst + 16);
  
-  _mm_storeu_si128 ((__m128i *) (d2 + 0 * 16), r0);
-  _mm_storeu_si128 ((__m128i *) (d2 + 1 * 16), r1);
-  _mm_storeu_si128 ((__m128i *) (d2 + 2 * 16), r2);
-  _mm_storeu_si128 ((__m128i *) (d2 + 3 * 16), r3);
+  if (max_len <= 32)
+    return;
  
-  _mm_storeu_si128 ((__m128i *) (d3 + 0 * 16), r0);
-  _mm_storeu_si128 ((__m128i *) (d3 + 1 * 16), r1);
-  _mm_storeu_si128 ((__m128i *) (d3 + 2 * 16), r2);
-  _mm_storeu_si128 ((__m128i *) (d3 + 3 * 16), r3);
+  mask += add;
+  d2 = u8x16_blend (d2, s2, u8x16_is_greater (lv, mask));
+  u8x16_store_unaligned (d2, dst + 32);
  
+  mask += add;
+  d3 = u8x16_blend (d3, s3, u8x16_is_greater (lv, mask));
+  u8x16_store_unaligned (d3, dst + 48);
  #else
-  clib_memcpy_fast (d0, s, 64);
-  clib_memcpy_fast (d1, s, 64);
-  clib_memcpy_fast (d2, s, 64);
-  clib_memcpy_fast (d3, s, 64);
+  memmove (dst, src, len);
  #endif
  }
  
+static_always_inline void
+clib_memcpy_le64 (u8 * dst, u8 * src, u8 len)
+{
+  clib_memcpy_le (dst, src, len, 64);
+}
+
+static_always_inline void
+clib_memcpy_le32 (u8 * dst, u8 * src, u8 len)
+{
+  clib_memcpy_le (dst, src, len, 32);
+}
+
  static_always_inline void
  clib_memset_u64 (void *p, u64 val, uword count)
  {
@@ -462,8 +487,8 @@ clib_count_equal_u64 (u64 * data, uword max_count)
    uword count;
    u64 first;
  
-  if (max_count == 1)
-    return 1;
+  if (max_count <= 1)
+    return max_count;
    if (data[0] != data[1])
      return 1;
  
@@ -472,23 +497,20 @@ clib_count_equal_u64 (u64 * data, uword max_count)
  
  #if defined(CLIB_HAVE_VEC256)
    u64x4 splat = u64x4_splat (first);
-  while (1)
+  while (count + 3 < max_count)
      {
        u64 bmp;
        bmp = u8x32_msb_mask ((u8x32) (u64x4_load_unaligned (data) == splat));
        if (bmp != 0xffffffff)
         {
           count += count_trailing_zeros (~bmp) / 8;
-         return clib_min (count, max_count);
+         return count;
         }
  
        data += 4;
        count += 4;
-
-      if (count >= max_count)
-       return max_count;
      }
-#endif
+#else
    count += 2;
    data += 2;
    while (count + 3 < max_count &&
@@ -498,6 +520,7 @@ clib_count_equal_u64 (u64 * data, uword max_count)
        data += 4;
        count += 4;
      }
+#endif
    while (count < max_count && (data[0] == first))
      {
        data += 1;
@@ -512,8 +535,8 @@ clib_count_equal_u32 (u32 * data, uword max_count)
    uword count;
    u32 first;
  
-  if (max_count == 1)
-    return 1;
+  if (max_count <= 1)
+    return max_count;
    if (data[0] != data[1])
      return 1;
  
@@ -522,41 +545,35 @@ clib_count_equal_u32 (u32 * data, uword max_count)
  
  #if defined(CLIB_HAVE_VEC256)
    u32x8 splat = u32x8_splat (first);
-  while (1)
+  while (count + 7 < max_count)
      {
        u64 bmp;
        bmp = u8x32_msb_mask ((u8x32) (u32x8_load_unaligned (data) == splat));
        if (bmp != 0xffffffff)
         {
           count += count_trailing_zeros (~bmp) / 4;
-         return clib_min (count, max_count);
+         return count;
         }
  
        data += 8;
        count += 8;
-
-      if (count >= max_count)
-       return max_count;
      }
  #elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK)
    u32x4 splat = u32x4_splat (first);
-  while (1)
+  while (count + 3 < max_count)
      {
        u64 bmp;
        bmp = u8x16_msb_mask ((u8x16) (u32x4_load_unaligned (data) == splat));
        if (bmp != 0xffff)
         {
           count += count_trailing_zeros (~bmp) / 4;
-         return clib_min (count, max_count);
+         return count;
         }
  
        data += 4;
        count += 4;
-
-      if (count >= max_count)
-       return max_count;
      }
-#endif
+#else
    count += 2;
    data += 2;
    while (count + 3 < max_count &&
@@ -566,6 +583,7 @@ clib_count_equal_u32 (u32 * data, uword max_count)
        data += 4;
        count += 4;
      }
+#endif
    while (count < max_count && (data[0] == first))
      {
        data += 1;
@@ -580,8 +598,8 @@ clib_count_equal_u16 (u16 * data, uword max_count)
    uword count;
    u16 first;
  
-  if (max_count == 1)
-    return 1;
+  if (max_count <= 1)
+    return max_count;
    if (data[0] != data[1])
      return 1;
  
@@ -590,41 +608,35 @@ clib_count_equal_u16 (u16 * data, uword max_count)
  
  #if defined(CLIB_HAVE_VEC256)
    u16x16 splat = u16x16_splat (first);
-  while (1)
+  while (count + 15 < max_count)
      {
        u64 bmp;
        bmp = u8x32_msb_mask ((u8x32) (u16x16_load_unaligned (data) == splat));
        if (bmp != 0xffffffff)
         {
           count += count_trailing_zeros (~bmp) / 2;
-         return clib_min (count, max_count);
+         return count;
         }
  
        data += 16;
        count += 16;
-
-      if (count >= max_count)
-       return max_count;
      }
  #elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK)
    u16x8 splat = u16x8_splat (first);
-  while (1)
+  while (count + 7 < max_count)
      {
        u64 bmp;
        bmp = u8x16_msb_mask ((u8x16) (u16x8_load_unaligned (data) == splat));
        if (bmp != 0xffff)
         {
           count += count_trailing_zeros (~bmp) / 2;
-         return clib_min (count, max_count);
+         return count;
         }
  
        data += 8;
        count += 8;
-
-      if (count >= max_count)
-       return max_count;
      }
-#endif
+#else
    count += 2;
    data += 2;
    while (count + 3 < max_count &&
@@ -634,6 +646,7 @@ clib_count_equal_u16 (u16 * data, uword max_count)
        data += 4;
        count += 4;
      }
+#endif
    while (count < max_count && (data[0] == first))
      {
        data += 1;
@@ -648,8 +661,8 @@ clib_count_equal_u8 (u8 * data, uword max_count)
    uword count;
    u8 first;
  
-  if (max_count == 1)
-    return 1;
+  if (max_count <= 1)
+    return max_count;
    if (data[0] != data[1])
      return 1;
  
@@ -658,41 +671,35 @@ clib_count_equal_u8 (u8 * data, uword max_count)
  
  #if defined(CLIB_HAVE_VEC256)
    u8x32 splat = u8x32_splat (first);
-  while (1)
+  while (count + 31 < max_count)
      {
        u64 bmp;
        bmp = u8x32_msb_mask ((u8x32) (u8x32_load_unaligned (data) == splat));
        if (bmp != 0xffffffff)
         {
           count += count_trailing_zeros (~bmp);
-         return clib_min (count, max_count);
+         return max_count;
         }
  
        data += 32;
        count += 32;
-
-      if (count >= max_count)
-       return max_count;
      }
  #elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK)
    u8x16 splat = u8x16_splat (first);
-  while (1)
+  while (count + 15 < max_count)
      {
        u64 bmp;
        bmp = u8x16_msb_mask ((u8x16) (u8x16_load_unaligned (data) == splat));
        if (bmp != 0xffff)
         {
           count += count_trailing_zeros (~bmp);
-         return clib_min (count, max_count);
+         return count;
         }
  
        data += 16;
        count += 16;
-
-      if (count >= max_count)
-       return max_count;
      }
-#endif
+#else
    count += 2;
    data += 2;
    while (count + 3 < max_count &&
@@ -702,6 +709,7 @@ clib_count_equal_u8 (u8 * data, uword max_count)
        data += 4;
        count += 4;
      }
+#endif
    while (count < max_count && (data[0] == first))
      {
        data += 1;
@@ -1025,7 +1033,8 @@ strncpy_s_inline (char *__restrict__ dest, rsize_t dmax,
         }
      }
    else
-    m = n;
+    /* cap the copy to strlen(src) in case n > strlen(src) */
+    m = clib_strnlen (src, n);
  
    /* Check for src/dst overlap, which is not allowed */
    low = (uword) (src < dest ? src : dest);