From: Damjan Marion Date: Fri, 18 May 2018 22:04:23 +0000 (+0200) Subject: vector functions cleanup and improvements X-Git-Tag: v18.07-rc1~309 X-Git-Url: https://gerrit.fd.io/r/gitweb?a=commitdiff_plain;h=a52e1668c9976bd5cdd20d02b668df41ea41f16f;p=vpp.git vector functions cleanup and improvements Remove functions which have native C equivalent (i.e. _is_equal can be replaced with ==, _add with +) Add SSE4.2, AVX-512 implementations of splat, load_unaligned, store_unaligned, is_all_zero, is_equal, is_all_equal Change-Id: Ie80b0e482e7a76248ad79399c2576468532354cd Signed-off-by: Damjan Marion --- diff --git a/src/vnet/ip/ip4_source_and_port_range_check.c b/src/vnet/ip/ip4_source_and_port_range_check.c index 06e6e7c6b63..2889a899678 100644 --- a/src/vnet/ip/ip4_source_and_port_range_check.c +++ b/src/vnet/ip/ip4_source_and_port_range_check.c @@ -133,9 +133,8 @@ check_adj_port_range_x1 (const protocol_port_range_dpo_t * ppr_dpo, u16x8_sub_saturate (ppr_dpo->blocks[i].low.as_u16x8, key.as_u16x8); diff2.as_u16x8 = u16x8_sub_saturate (ppr_dpo->blocks[i].hi.as_u16x8, key.as_u16x8); - sum.as_u16x8 = u16x8_add (diff1.as_u16x8, diff2.as_u16x8); - sum_equal_diff2.as_u16x8 = - u16x8_is_equal (sum.as_u16x8, diff2.as_u16x8); + sum.as_u16x8 = diff1.as_u16x8 + diff2.as_u16x8; + sum_equal_diff2.as_u16x8 = (sum.as_u16x8 == diff2.as_u16x8); sum_nonzero = ~u16x8_zero_byte_mask (sum.as_u16x8); sum_equal = ~u16x8_zero_byte_mask (sum_equal_diff2.as_u16x8); winner_mask = sum_nonzero & sum_equal; diff --git a/src/vppinfra/mheap.c b/src/vppinfra/mheap.c index c703545954d..4d27d419e64 100644 --- a/src/vppinfra/mheap.c +++ b/src/vppinfra/mheap.c @@ -311,7 +311,7 @@ mheap_small_object_cache_mask (mheap_small_object_cache_t * c, uword bin) ASSERT (bin < 256); -#define _(i) ((uword) u8x16_compare_byte_mask (u8x16_is_equal (b, c->bins.as_u8x16[i])) << (uword) ((i)*16)) +#define _(i) ((uword) u8x16_compare_byte_mask ((b == c->bins.as_u8x16[i])) << (uword) ((i)*16)) mask = _(0) | _(1); if (BITS (uword) > 32) mask |= _(2) | _(3); diff --git a/src/vppinfra/pfhash.h b/src/vppinfra/pfhash.h index e054c668f3b..2884fa81cf9 100644 --- a/src/vppinfra/pfhash.h +++ b/src/vppinfra/pfhash.h @@ -249,8 +249,8 @@ pfhash_search_kv_4 (pfhash_t * p, u32 bucket_contents, u32 * key) vector_key = u32x4_splat (key[0]); - is_equal[0] = u32x4_is_equal (kv->kb.k_u32x4[0], vector_key); - is_equal[1] = u32x4_is_equal (kv->kb.k_u32x4[1], vector_key); + is_equal[0] = (kv->kb.k_u32x4[0] == vector_key); + is_equal[1] = (kv->kb.k_u32x4[1] == vector_key); zbm[0] = ~u32x4_zero_byte_mask (is_equal[0]) & 0xFFFF; zbm[1] = ~u32x4_zero_byte_mask (is_equal[1]) & 0xFFFF; diff --git a/src/vppinfra/vector.h b/src/vppinfra/vector.h index fcff5e79d95..2157ab7d1d0 100644 --- a/src/vppinfra/vector.h +++ b/src/vppinfra/vector.h @@ -157,47 +157,6 @@ typedef u64 u64x _vector_size (8); #define VECTOR_WORD_TYPE(t) t##x #define VECTOR_WORD_TYPE_LEN(t) (sizeof (VECTOR_WORD_TYPE(t)) / sizeof (t)) -/* this series of macros generate _is_equal, _is_greater, _is_zero, _add - and _sub inline funcitons for each vector type */ -#define _(t, s, c) \ - static_always_inline t##s##x##c \ -t##s##x##c##_is_equal (t##s##x##c v1, t##s##x##c v2) \ -{ return (v1 == v2); } \ - \ -static_always_inline t##s##x##c \ -t##s##x##c##_is_greater (t##s##x##c v1, t##s##x##c v2) \ -{ return (v1 > v2); } \ - \ -static_always_inline t##s##x##c \ -t##s##x##c##_is_zero (t##s##x##c v1) \ -{ t##s##x##c z = {0}; return (v1 == z); } \ - \ -static_always_inline t##s##x##c \ -t##s##x##c##_add (t##s##x##c v1, t##s##x##c v2) \ -{ return (v1 + v2); } \ - \ -static_always_inline t##s##x##c \ -t##s##x##c##_sub (t##s##x##c v1, t##s##x##c v2) \ -{ return (v1 - v2); } - foreach_vec -#undef _ - -/* this macro generate _splat inline functions for each scalar vector type */ -#define _(t, s, c) \ - static_always_inline t##s##x##c \ -t##s##x##c##_splat (t##s x) \ -{ \ - t##s##x##c r; \ - int i; \ - \ - for (i = 0; i < c; i++) \ - r[i] = x; \ - \ - return r; \ -} - foreach_vec128i foreach_vec128u -#undef _ - #if defined (__SSE4_2__) && __GNUC__ >= 4 #include #endif @@ -222,6 +181,24 @@ t##s##x##c##_splat (t##s x) \ #include #endif +/* this macro generate _splat inline functions for each scalar vector type */ +#ifndef CLIB_VEC128_SPLAT_DEFINED +#define _(t, s, c) \ + static_always_inline t##s##x##c \ +t##s##x##c##_splat (t##s x) \ +{ \ + t##s##x##c r; \ + int i; \ + \ + for (i = 0; i < c; i++) \ + r[i] = x; \ + \ + return r; \ +} + foreach_vec128i foreach_vec128u +#undef _ +#endif + /* *INDENT-ON* */ #endif /* included_clib_vector_h */ diff --git a/src/vppinfra/vector_avx2.h b/src/vppinfra/vector_avx2.h index 3f0b397b828..1fb41dfd7df 100644 --- a/src/vppinfra/vector_avx2.h +++ b/src/vppinfra/vector_avx2.h @@ -19,6 +19,7 @@ #include #include +/* *INDENT-OFF* */ #define foreach_avx2_vec256i \ _(i,8,32,epi8) _(i,16,16,epi16) _(i,32,8,epi32) _(i,64,4,epi64x) #define foreach_avx2_vec256u \ @@ -26,7 +27,8 @@ #define foreach_avx2_vec256f \ _(f,32,8,ps) _(f,64,4,pd) -/* splat, load_unaligned, store_unaligned, is_all_zero, is_all_equal */ +/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal, + is_all_equal */ #define _(t, s, c, i) \ static_always_inline t##s##x##c \ t##s##x##c##_splat (t##s x) \ @@ -45,13 +47,18 @@ t##s##x##c##_is_all_zero (t##s##x##c x) \ { return _mm256_testz_si256 ((__m256i) x, (__m256i) x); } \ \ static_always_inline int \ -t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \ -{ return t##s##x##c##_is_all_zero (v != t##s##x##c##_splat (x)); }; \ +t##s##x##c##_is_equal (t##s##x##c x, t##s##x##c y) \ +{ return _mm256_testc_si256 ((__m256i) x, (__m256i) y); } \ \ +static_always_inline int \ +t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \ +{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); }; \ foreach_avx2_vec256i foreach_avx2_vec256u #undef _ - always_inline u32x8 +/* *INDENT-ON* */ + +always_inline u32x8 u32x8_permute (u32x8 v, u32x8 idx) { return (u32x8) _mm256_permutevar8x32_epi32 ((__m256i) v, (__m256i) idx); diff --git a/src/vppinfra/vector_avx512.h b/src/vppinfra/vector_avx512.h index c1b7c42a260..c2903e2aa1a 100644 --- a/src/vppinfra/vector_avx512.h +++ b/src/vppinfra/vector_avx512.h @@ -19,6 +19,7 @@ #include #include +/* *INDENT-OFF* */ #define foreach_avx512_vec512i \ _(i,8,64,epi8) _(i,16,32,epi16) _(i,32,16,epi32) _(i,64,8,epi64) #define foreach_avx512_vec512u \ @@ -26,8 +27,8 @@ #define foreach_avx512_vec512f \ _(f,32,8,ps) _(f,64,4,pd) -/* splat, load_unaligned, store_unaligned */ -/* *INDENT-OFF* */ +/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal, + is_all_equal */ #define _(t, s, c, i) \ static_always_inline t##s##x##c \ t##s##x##c##_splat (t##s x) \ @@ -41,6 +42,17 @@ static_always_inline void \ t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \ { _mm512_storeu_si512 ((__m512i *) p, (__m512i) v); } \ \ +static_always_inline int \ +t##s##x##c##_is_all_zero (t##s##x##c v) \ +{ return (_mm512_test_epi64_mask ((__m512i) v, (__m512i) v) == 0); } \ +\ +static_always_inline int \ +t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \ +{ return t##s##x##c##_is_all_zero (a ^b); } \ +\ +static_always_inline int \ +t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \ +{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \ foreach_avx512_vec512i foreach_avx512_vec512u #undef _ diff --git a/src/vppinfra/vector_sse42.h b/src/vppinfra/vector_sse42.h index 0e334c828b6..053826d92bc 100644 --- a/src/vppinfra/vector_sse42.h +++ b/src/vppinfra/vector_sse42.h @@ -41,6 +41,48 @@ #include /* for ASSERT */ #include +/* *INDENT-OFF* */ +#define foreach_sse42_vec128i \ + _(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64x) +#define foreach_sse42_vec128u \ + _(u,8,16,epi8) _(u,16,8,epi16) _(u,32,4,epi32) _(u,64,2,epi64x) +#define foreach_sse42_vec128f \ + _(f,32,4,ps) _(f,64,2,pd) + +/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal, + is_all_equal */ +#define _(t, s, c, i) \ +static_always_inline t##s##x##c \ +t##s##x##c##_splat (t##s x) \ +{ return (t##s##x##c) _mm_set1_##i (x); } \ +\ +static_always_inline t##s##x##c \ +t##s##x##c##_load_unaligned (void *p) \ +{ return (t##s##x##c) _mm_loadu_si128 (p); } \ +\ +static_always_inline void \ +t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \ +{ _mm_storeu_si128 ((__m128i *) p, (__m128i) v); } \ +\ +static_always_inline int \ +t##s##x##c##_is_all_zero (t##s##x##c x) \ +{ return _mm_testz_si128 ((__m128i) x, (__m128i) x); } \ +\ +static_always_inline int \ +t##s##x##c##_is_equal (t##s##x##c x, t##s##x##c y) \ +{ return _mm_testc_si128 ((__m128i) x, (__m128i) y); } \ +\ +static_always_inline int \ +t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \ +{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); }; \ + +foreach_sse42_vec128i foreach_sse42_vec128u +#undef _ +/* *INDENT-ON* */ + +#define CLIB_VEC128_SPLAT_DEFINED +#define CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE + /* 128 bit interleaves. */ always_inline u8x16 u8x16_interleave_hi (u8x16 a, u8x16 b) @@ -197,16 +239,6 @@ u64x2_write_hi (u64x2 x, u64 * a) } #endif -/* Unaligned loads/stores. */ - -#define _(t) \ - always_inline void t##_store_unaligned (t x, void * a) \ - { _mm_storeu_si128 ((__m128i *) a, (__m128i) x); } \ - always_inline t t##_load_unaligned (void * a) \ - { return (t) _mm_loadu_si128 ((__m128i *) a); } - -_(u8x16) _(u16x8) _(u32x4) _(u64x2) _(i8x16) _(i16x8) _(i32x4) _(i64x2) -#undef _ #define _signed_binop(n,m,f,g) \ /* Unsigned */ \ always_inline u##n##x##m \ @@ -218,7 +250,7 @@ _(u8x16) _(u16x8) _(u32x4) _(u64x2) _(i8x16) _(i16x8) _(i32x4) _(i64x2) i##n##x##m##_##f (i##n##x##m x, i##n##x##m y) \ { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); } /* Addition/subtraction with saturation. */ - _signed_binop (8, 16, add_saturate, adds_epu) +_signed_binop (8, 16, add_saturate, adds_epu) _signed_binop (16, 8, add_saturate, adds_epu) _signed_binop (8, 16, sub_saturate, subs_epu) _signed_binop (16, 8, sub_saturate, subs_epu) @@ -403,30 +435,6 @@ _(u64, 2, right, left); #undef _ #endif -always_inline int -u8x16_is_all_zero (u8x16 x) -{ - return _mm_testz_si128 ((__m128i) x, (__m128i) x); -} - -always_inline int -u16x8_is_all_zero (u16x8 x) -{ - return _mm_testz_si128 ((__m128i) x, (__m128i) x); -} - -always_inline int -u32x4_is_all_zero (u32x4 x) -{ - return _mm_testz_si128 ((__m128i) x, (__m128i) x); -} - -always_inline int -u64x2_is_all_zero (u64x2 x) -{ - return _mm_testz_si128 ((__m128i) x, (__m128i) x); -} - #define u32x4_select(A,MASK) \ ({ \ u32x4 _x, _y; \ @@ -495,21 +503,21 @@ always_inline u32 u8x16_zero_byte_mask (u8x16 x) { u8x16 zero = { 0 }; - return u8x16_compare_byte_mask (u8x16_is_equal (x, zero)); + return u8x16_compare_byte_mask (x == zero); } always_inline u32 u16x8_zero_byte_mask (u16x8 x) { u16x8 zero = { 0 }; - return u8x16_compare_byte_mask ((u8x16) u16x8_is_equal (x, zero)); + return u8x16_compare_byte_mask ((u8x16) (x == zero)); } always_inline u32 u32x4_zero_byte_mask (u32x4 x) { u32x4 zero = { 0 }; - return u8x16_compare_byte_mask ((u8x16) u32x4_is_equal (x, zero)); + return u8x16_compare_byte_mask ((u8x16) (x == zero)); } always_inline u8x16 diff --git a/src/vppinfra/vhash.h b/src/vppinfra/vhash.h index 5ab42292001..85dfb788308 100644 --- a/src/vppinfra/vhash.h +++ b/src/vppinfra/vhash.h @@ -412,7 +412,7 @@ vhash_bucket_compare (vhash_t * h, { u32 k = vhash_get_key_word (h, key_word_index, vi); u32x4 x = { k, k, k, k }; - return u32x4_is_equal (bucket[key_word_index].as_u32x4, x); + return (bucket[key_word_index].as_u32x4 == x); } #define vhash_bucket_compare_4(h,wi,vi,b0,b1,b2,b3,cmp0,cmp1,cmp2,cmp3) \ @@ -423,10 +423,10 @@ do { \ u32x4 _k2 = u32x4_splat_word (_k4, 2); \ u32x4 _k3 = u32x4_splat_word (_k4, 3); \ \ - cmp0 = u32x4_is_equal (b0->key[wi].as_u32x4, _k0); \ - cmp1 = u32x4_is_equal (b1->key[wi].as_u32x4, _k1); \ - cmp2 = u32x4_is_equal (b2->key[wi].as_u32x4, _k2); \ - cmp3 = u32x4_is_equal (b3->key[wi].as_u32x4, _k3); \ + cmp0 = (b0->key[wi].as_u32x4 == _k0); \ + cmp1 = (b1->key[wi].as_u32x4 == _k1); \ + cmp2 = (b2->key[wi].as_u32x4 == _k2); \ + cmp3 = (b3->key[wi].as_u32x4 == _k3); \ } while (0) u32 vhash_get_overflow (vhash_t * h, u32 key_hash, u32 vi, u32 n_key_u32s);