diff --git a/src/vppinfra/string.h b/src/vppinfra/string.h
index 4d2ff7875a2..b00c0cfbcc2 100644
--- a/src/vppinfra/string.h
+++ b/src/vppinfra/string.h
@@ -35,6 +35,12 @@
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 
+/** \file
+
+    Optimized string handling code, including c11-compliant
+    "safe C library" variants.
+*/
+
 #ifndef included_clib_string_h
 #define included_clib_string_h
 
@@ -65,17 +71,131 @@ void clib_memswap (void *_a, void *_b, uword bytes);
  * so don't let it anywhere near them.
  */
 #ifndef __COVERITY__
-#if __AVX__
-#include <vppinfra/memcpy_avx.h>
+#if __AVX512F__
+#include <vppinfra/memcpy_avx512.h>
+#elif __AVX2__
+#include <vppinfra/memcpy_avx2.h>
 #elif __SSSE3__
 #include <vppinfra/memcpy_sse3.h>
 #else
-#define clib_memcpy(a,b,c) memcpy(a,b,c)
+#define clib_memcpy_fast(a,b,c) memcpy(a,b,c)
 #endif
 #else /* __COVERITY__ */
-#define clib_memcpy(a,b,c) memcpy(a,b,c)
+#define clib_memcpy_fast(a,b,c) memcpy(a,b,c)
+#endif
+
+/* c-11 string manipulation variants */
+
+#ifndef EOK
+#define EOK 0
+#endif
+#ifndef EINVAL
+#define EINVAL 22
 #endif
 
+typedef int errno_t;
+typedef uword rsize_t;
+
+void clib_c11_violation (const char *s);
+errno_t memcpy_s (void *__restrict__ dest, rsize_t dmax,
+                  const void *__restrict__ src, rsize_t n);
+
+always_inline errno_t
+memcpy_s_inline (void *__restrict__ dest, rsize_t dmax,
+                 const void *__restrict__ src, rsize_t n)
+{
+  uword low, hi;
+  u8 bad;
+
+  /*
+   * Optimize constant-number-of-bytes calls without asking
+   * "too many questions for someone from New Jersey"
+   */
+  if (__builtin_constant_p (n))
+    {
+      clib_memcpy_fast (dest, src, n);
+      return EOK;
+    }
+
+  /*
+   * call bogus if: src or dst NULL, trying to copy
+   * more data than we have space in dst, or src == dst.
+   * n == 0 isn't really "bad", so check first in the
+   * "wall-of-shame" department...
+   */
+  bad = (dest == 0) + (src == 0) + (n > dmax) + (dest == src) + (n == 0);
+  if (PREDICT_FALSE (bad != 0))
+    {
+      /* Not actually trying to copy anything is OK */
+      if (n == 0)
+        return EOK;
+      if (dest == NULL)
+        clib_c11_violation ("dest NULL");
+      if (src == NULL)
+        clib_c11_violation ("src NULL");
+      if (n > dmax)
+        clib_c11_violation ("n > dmax");
+      if (dest == src)
+        clib_c11_violation ("dest == src");
+      return EINVAL;
+    }
+
+  /* Check for src/dst overlap, which is not allowed */
+  low = (uword) (src < dest ? src : dest);
+  hi = (uword) (src < dest ? dest : src);
+
+  if (PREDICT_FALSE (low + (n - 1) >= hi))
+    {
+      clib_c11_violation ("src/dest overlap");
+      return EINVAL;
+    }
+
+  clib_memcpy_fast (dest, src, n);
+  return EOK;
+}
+
+/*
+ * Note: $$$ This macro is a crutch. Folks need to manually
+ * inspect every extant clib_memcpy(...) call and
+ * attempt to provide a real destination buffer size
+ * argument...
+ */
+#define clib_memcpy(d,s,n) memcpy_s_inline(d,n,s,n)
+
+errno_t memset_s (void *s, rsize_t smax, int c, rsize_t n);
+
+always_inline errno_t
+memset_s_inline (void *s, rsize_t smax, int c, rsize_t n)
+{
+  u8 bad;
+
+  bad = (s == 0) + (n > smax);
+
+  if (PREDICT_FALSE (bad != 0))
+    {
+      if (s == 0)
+        clib_c11_violation ("s NULL");
+      if (n > smax)
+        clib_c11_violation ("n > smax");
+      return (EINVAL);
+    }
+  memset (s, c, n);
+  return (EOK);
+}
+
+/*
+ * This macro is not [so much of] a crutch.
+ * It's super-typical to write: + * + * ep = pool_get (); + * clib_memset(ep, 0, sizeof (*ep)); + * + * The compiler should delete the not-so useful + * (n > smax) test. TBH the NULL pointer check isn't + * so useful in this case, but so be it. + */ +#define clib_memset(s,c,n) memset_s_inline(s,n,c,n) + /* * Copy 64 bytes of data to 4 destinations * this function is typically used in quad-loop case when whole cacheline @@ -95,53 +215,482 @@ clib_memcpy64_x4 (void *d0, void *d1, void *d2, void *d3, void *s) _mm512_storeu_si512 (d3, r0); #elif defined (__AVX2__) - __m256i r0 = _mm256_loadu_si256 ((__m256i *) s + 0 * 32); - __m256i r1 = _mm256_loadu_si256 ((__m256i *) s + 1 * 32); + __m256i r0 = _mm256_loadu_si256 ((__m256i *) (s + 0 * 32)); + __m256i r1 = _mm256_loadu_si256 ((__m256i *) (s + 1 * 32)); - _mm256_storeu_si256 ((__m256i *) d0 + 0 * 32, r0); - _mm256_storeu_si256 ((__m256i *) d0 + 1 * 32, r1); + _mm256_storeu_si256 ((__m256i *) (d0 + 0 * 32), r0); + _mm256_storeu_si256 ((__m256i *) (d0 + 1 * 32), r1); - _mm256_storeu_si256 ((__m256i *) d1 + 0 * 32, r0); - _mm256_storeu_si256 ((__m256i *) d1 + 1 * 32, r1); + _mm256_storeu_si256 ((__m256i *) (d1 + 0 * 32), r0); + _mm256_storeu_si256 ((__m256i *) (d1 + 1 * 32), r1); - _mm256_storeu_si256 ((__m256i *) d2 + 0 * 32, r0); - _mm256_storeu_si256 ((__m256i *) d2 + 1 * 32, r1); + _mm256_storeu_si256 ((__m256i *) (d2 + 0 * 32), r0); + _mm256_storeu_si256 ((__m256i *) (d2 + 1 * 32), r1); - _mm256_storeu_si256 ((__m256i *) d3 + 0 * 32, r0); - _mm256_storeu_si256 ((__m256i *) d3 + 1 * 32, r1); + _mm256_storeu_si256 ((__m256i *) (d3 + 0 * 32), r0); + _mm256_storeu_si256 ((__m256i *) (d3 + 1 * 32), r1); #elif defined (__SSSE3__) - __m128i r0 = _mm_loadu_si128 ((__m128i *) s + 0 * 16); - __m128i r1 = _mm_loadu_si128 ((__m128i *) s + 1 * 16); - __m128i r2 = _mm_loadu_si128 ((__m128i *) s + 2 * 16); - __m128i r3 = _mm_loadu_si128 ((__m128i *) s + 3 * 16); - - _mm_storeu_si128 ((__m128i *) d0 + 0 * 16, r0); - _mm_storeu_si128 ((__m128i *) d0 + 1 * 16, r1); - _mm_storeu_si128 ((__m128i *) d0 + 2 * 16, r2); - _mm_storeu_si128 ((__m128i *) d0 + 3 * 16, r3); - - _mm_storeu_si128 ((__m128i *) d1 + 0 * 16, r0); - _mm_storeu_si128 ((__m128i *) d1 + 1 * 16, r1); - _mm_storeu_si128 ((__m128i *) d1 + 2 * 16, r2); - _mm_storeu_si128 ((__m128i *) d1 + 3 * 16, r3); - - _mm_storeu_si128 ((__m128i *) d2 + 0 * 16, r0); - _mm_storeu_si128 ((__m128i *) d2 + 1 * 16, r1); - _mm_storeu_si128 ((__m128i *) d2 + 2 * 16, r2); - _mm_storeu_si128 ((__m128i *) d2 + 3 * 16, r3); - - _mm_storeu_si128 ((__m128i *) d3 + 0 * 16, r0); - _mm_storeu_si128 ((__m128i *) d3 + 1 * 16, r1); - _mm_storeu_si128 ((__m128i *) d3 + 2 * 16, r2); - _mm_storeu_si128 ((__m128i *) d3 + 3 * 16, r3); + __m128i r0 = _mm_loadu_si128 ((__m128i *) (s + 0 * 16)); + __m128i r1 = _mm_loadu_si128 ((__m128i *) (s + 1 * 16)); + __m128i r2 = _mm_loadu_si128 ((__m128i *) (s + 2 * 16)); + __m128i r3 = _mm_loadu_si128 ((__m128i *) (s + 3 * 16)); + + _mm_storeu_si128 ((__m128i *) (d0 + 0 * 16), r0); + _mm_storeu_si128 ((__m128i *) (d0 + 1 * 16), r1); + _mm_storeu_si128 ((__m128i *) (d0 + 2 * 16), r2); + _mm_storeu_si128 ((__m128i *) (d0 + 3 * 16), r3); + + _mm_storeu_si128 ((__m128i *) (d1 + 0 * 16), r0); + _mm_storeu_si128 ((__m128i *) (d1 + 1 * 16), r1); + _mm_storeu_si128 ((__m128i *) (d1 + 2 * 16), r2); + _mm_storeu_si128 ((__m128i *) (d1 + 3 * 16), r3); + + _mm_storeu_si128 ((__m128i *) (d2 + 0 * 16), r0); + _mm_storeu_si128 ((__m128i *) (d2 + 1 * 16), r1); + _mm_storeu_si128 ((__m128i *) (d2 + 2 * 16), r2); + 
_mm_storeu_si128 ((__m128i *) (d2 + 3 * 16), r3); + + _mm_storeu_si128 ((__m128i *) (d3 + 0 * 16), r0); + _mm_storeu_si128 ((__m128i *) (d3 + 1 * 16), r1); + _mm_storeu_si128 ((__m128i *) (d3 + 2 * 16), r2); + _mm_storeu_si128 ((__m128i *) (d3 + 3 * 16), r3); + +#else + clib_memcpy_fast (d0, s, 64); + clib_memcpy_fast (d1, s, 64); + clib_memcpy_fast (d2, s, 64); + clib_memcpy_fast (d3, s, 64); +#endif +} + +static_always_inline void +clib_memset_u64 (void *p, u64 val, uword count) +{ + u64 *ptr = p; +#if defined(CLIB_HAVE_VEC512) + u64x8 v512 = u64x8_splat (val); + while (count >= 8) + { + u64x8_store_unaligned (v512, ptr); + ptr += 8; + count -= 8; + } + if (count == 0) + return; +#endif +#if defined(CLIB_HAVE_VEC256) + u64x4 v256 = u64x4_splat (val); + while (count >= 4) + { + u64x4_store_unaligned (v256, ptr); + ptr += 4; + count -= 4; + } + if (count == 0) + return; +#else + while (count >= 4) + { + ptr[0] = ptr[1] = ptr[2] = ptr[3] = val; + ptr += 4; + count -= 4; + } +#endif + while (count--) + ptr++[0] = val; +} + +static_always_inline void +clib_memset_u32 (void *p, u32 val, uword count) +{ + u32 *ptr = p; +#if defined(CLIB_HAVE_VEC512) + u32x16 v512 = u32x16_splat (val); + while (count >= 16) + { + u32x16_store_unaligned (v512, ptr); + ptr += 16; + count -= 16; + } + if (count == 0) + return; +#endif +#if defined(CLIB_HAVE_VEC256) + u32x8 v256 = u32x8_splat (val); + while (count >= 8) + { + u32x8_store_unaligned (v256, ptr); + ptr += 8; + count -= 8; + } + if (count == 0) + return; +#endif +#if defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE) + u32x4 v128 = u32x4_splat (val); + while (count >= 4) + { + u32x4_store_unaligned (v128, ptr); + ptr += 4; + count -= 4; + } +#else + while (count >= 4) + { + ptr[0] = ptr[1] = ptr[2] = ptr[3] = val; + ptr += 4; + count -= 4; + } +#endif + while (count--) + ptr++[0] = val; +} +static_always_inline void +clib_memset_u16 (void *p, u16 val, uword count) +{ + u16 *ptr = p; +#if defined(CLIB_HAVE_VEC512) + u16x32 v512 = u16x32_splat (val); + while (count >= 32) + { + u16x32_store_unaligned (v512, ptr); + ptr += 32; + count -= 32; + } + if (count == 0) + return; +#endif +#if defined(CLIB_HAVE_VEC256) + u16x16 v256 = u16x16_splat (val); + while (count >= 16) + { + u16x16_store_unaligned (v256, ptr); + ptr += 16; + count -= 16; + } + if (count == 0) + return; +#endif +#if defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE) + u16x8 v128 = u16x8_splat (val); + while (count >= 8) + { + u16x8_store_unaligned (v128, ptr); + ptr += 8; + count -= 8; + } +#else + while (count >= 4) + { + ptr[0] = ptr[1] = ptr[2] = ptr[3] = val; + ptr += 4; + count -= 4; + } +#endif + while (count--) + ptr++[0] = val; +} + +static_always_inline void +clib_memset_u8 (void *p, u8 val, uword count) +{ + u8 *ptr = p; +#if defined(CLIB_HAVE_VEC512) + u8x64 v512 = u8x64_splat (val); + while (count >= 64) + { + u8x64_store_unaligned (v512, ptr); + ptr += 64; + count -= 64; + } + if (count == 0) + return; +#endif +#if defined(CLIB_HAVE_VEC256) + u8x32 v256 = u8x32_splat (val); + while (count >= 32) + { + u8x32_store_unaligned (v256, ptr); + ptr += 32; + count -= 32; + } + if (count == 0) + return; +#endif +#if defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE) + u8x16 v128 = u8x16_splat (val); + while (count >= 16) + { + u8x16_store_unaligned (v128, ptr); + ptr += 16; + count -= 16; + } #else - clib_memcpy (d0, s, 64); - clib_memcpy (d1, s, 64); - clib_memcpy (d2, s, 64); - clib_memcpy (d3, s, 64); + while 
(count >= 4) + { + ptr[0] = ptr[1] = ptr[2] = ptr[3] = val; + ptr += 4; + count -= 4; + } +#endif + while (count--) + ptr++[0] = val; +} + +static_always_inline uword +clib_count_equal_u64 (u64 * data, uword max_count) +{ + uword count; + u64 first; + + if (max_count == 1) + return 1; + if (data[0] != data[1]) + return 1; + + count = 0; + first = data[0]; + +#if defined(CLIB_HAVE_VEC256) + u64x4 splat = u64x4_splat (first); + while (1) + { + u64 bmp; + bmp = u8x32_msb_mask ((u8x32) (u64x4_load_unaligned (data) == splat)); + if (bmp != 0xffffffff) + { + count += count_trailing_zeros (~bmp) / 8; + return clib_min (count, max_count); + } + + data += 4; + count += 4; + + if (count >= max_count) + return max_count; + } +#endif + count += 2; + data += 2; + while (count + 3 < max_count && + ((data[0] ^ first) | (data[1] ^ first) | + (data[2] ^ first) | (data[3] ^ first)) == 0) + { + data += 4; + count += 4; + } + while (count < max_count && (data[0] == first)) + { + data += 1; + count += 1; + } + return count; +} + +static_always_inline uword +clib_count_equal_u32 (u32 * data, uword max_count) +{ + uword count; + u32 first; + + if (max_count == 1) + return 1; + if (data[0] != data[1]) + return 1; + + count = 0; + first = data[0]; + +#if defined(CLIB_HAVE_VEC256) + u32x8 splat = u32x8_splat (first); + while (1) + { + u64 bmp; + bmp = u8x32_msb_mask ((u8x32) (u32x8_load_unaligned (data) == splat)); + if (bmp != 0xffffffff) + { + count += count_trailing_zeros (~bmp) / 4; + return clib_min (count, max_count); + } + + data += 8; + count += 8; + + if (count >= max_count) + return max_count; + } +#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK) + u32x4 splat = u32x4_splat (first); + while (1) + { + u64 bmp; + bmp = u8x16_msb_mask ((u8x16) (u32x4_load_unaligned (data) == splat)); + if (bmp != 0xffff) + { + count += count_trailing_zeros (~bmp) / 4; + return clib_min (count, max_count); + } + + data += 4; + count += 4; + + if (count >= max_count) + return max_count; + } +#endif + count += 2; + data += 2; + while (count + 3 < max_count && + ((data[0] ^ first) | (data[1] ^ first) | + (data[2] ^ first) | (data[3] ^ first)) == 0) + { + data += 4; + count += 4; + } + while (count < max_count && (data[0] == first)) + { + data += 1; + count += 1; + } + return count; +} + +static_always_inline uword +clib_count_equal_u16 (u16 * data, uword max_count) +{ + uword count; + u16 first; + + if (max_count == 1) + return 1; + if (data[0] != data[1]) + return 1; + + count = 0; + first = data[0]; + +#if defined(CLIB_HAVE_VEC256) + u16x16 splat = u16x16_splat (first); + while (1) + { + u64 bmp; + bmp = u8x32_msb_mask ((u8x32) (u16x16_load_unaligned (data) == splat)); + if (bmp != 0xffffffff) + { + count += count_trailing_zeros (~bmp) / 2; + return clib_min (count, max_count); + } + + data += 16; + count += 16; + + if (count >= max_count) + return max_count; + } +#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK) + u16x8 splat = u16x8_splat (first); + while (1) + { + u64 bmp; + bmp = u8x16_msb_mask ((u8x16) (u16x8_load_unaligned (data) == splat)); + if (bmp != 0xffff) + { + count += count_trailing_zeros (~bmp) / 2; + return clib_min (count, max_count); + } + + data += 8; + count += 8; + + if (count >= max_count) + return max_count; + } +#endif + count += 2; + data += 2; + while (count + 3 < max_count && + ((data[0] ^ first) | (data[1] ^ first) | + (data[2] ^ first) | (data[3] ^ first)) == 0) + { + data += 4; + count += 4; + } + while (count < max_count && (data[0] == first)) + { + data 
+= 1; + count += 1; + } + return count; +} + +static_always_inline uword +clib_count_equal_u8 (u8 * data, uword max_count) +{ + uword count; + u8 first; + + if (max_count == 1) + return 1; + if (data[0] != data[1]) + return 1; + + count = 0; + first = data[0]; + +#if defined(CLIB_HAVE_VEC256) + u8x32 splat = u8x32_splat (first); + while (1) + { + u64 bmp; + bmp = u8x32_msb_mask ((u8x32) (u8x32_load_unaligned (data) == splat)); + if (bmp != 0xffffffff) + { + count += count_trailing_zeros (~bmp); + return clib_min (count, max_count); + } + + data += 32; + count += 32; + + if (count >= max_count) + return max_count; + } +#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK) + u8x16 splat = u8x16_splat (first); + while (1) + { + u64 bmp; + bmp = u8x16_msb_mask ((u8x16) (u8x16_load_unaligned (data) == splat)); + if (bmp != 0xffff) + { + count += count_trailing_zeros (~bmp); + return clib_min (count, max_count); + } + + data += 16; + count += 16; + + if (count >= max_count) + return max_count; + } #endif + count += 2; + data += 2; + while (count + 3 < max_count && + ((data[0] ^ first) | (data[1] ^ first) | + (data[2] ^ first) | (data[3] ^ first)) == 0) + { + data += 4; + count += 4; + } + while (count < max_count && (data[0] == first)) + { + data += 1; + count += 1; + } + return count; } #endif /* included_clib_string_h */
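
A minimal usage sketch of the interfaces this patch introduces (memcpy_s_inline, the clib_memcpy / clib_memset wrapper macros, clib_memset_u8 / clib_memset_u32, and clib_count_equal_u32). It is not part of the change itself; it assumes a VPP source tree where the <vppinfra/...> headers are on the include path and the program is linked against vppinfra, which supplies clib_c11_violation(), clib_min() and the u8/u32/uword typedefs. The main() harness, buffer sizes, and fill values below are illustrative only.

/*
 * Usage sketch (assumption: built inside a VPP tree and linked
 * against vppinfra, which provides clib_c11_violation() and the
 * basic typedefs used here).
 */
#include <stdio.h>
#include <vppinfra/clib.h>
#include <vppinfra/string.h>

int
main (int argc, char *argv[])
{
  u8 dst[64], src[64];
  u32 words[256];
  rsize_t n;
  uword run;

  /* Fill the source buffer with a repeated byte value */
  clib_memset_u8 (src, 0xaa, sizeof (src));

  /*
   * A runtime-sized copy: n is not a compile-time constant, so
   * memcpy_s_inline performs its NULL / overlap / n > dmax checks.
   * A violation would be reported via clib_c11_violation() and the
   * call would return EINVAL instead of copying.
   */
  n = clib_min ((rsize_t) argc * 8, sizeof (dst));
  if (memcpy_s_inline (dst, sizeof (dst), src, n) != EOK)
    return 1;

  /*
   * clib_memcpy(d,s,n) and clib_memset(s,c,n) expand to the _s_inline
   * variants with dmax/smax set to n, so legacy call sites keep working.
   */
  clib_memset (dst, 0, sizeof (dst));

  /* Fill a u32 array, then measure the leading run of equal elements */
  clib_memset_u32 (words, 7, 256);
  words[100] = 8;
  run = clib_count_equal_u32 (words, 256);	/* 100 */

  printf ("copied %lu bytes, leading run = %lu\n",
          (unsigned long) n, (unsigned long) run);
  return 0;
}

Because memcpy_s_inline short-circuits on __builtin_constant_p (n), fixed-size copies in the data path still compile down to clib_memcpy_fast() with no runtime checks; only runtime-sized calls pay for the NULL, overlap, and n > dmax tests.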