From 4c53ff459595c9ddc05002ee1847313127175b5f Mon Sep 17 00:00:00 2001
From: Damjan Marion
Date: Thu, 28 Oct 2021 23:03:04 +0200
Subject: [PATCH] vppinfra: vectorized index to pointer function

Type: improvement

Change-Id: I05e1a8fa31761b113355123429d72da18881d4b0
Signed-off-by: Damjan Marion
---
 src/vlib/buffer_funcs.h                 | 121 ++++-----------
 src/vppinfra/CMakeLists.txt             |   2 +
 src/vppinfra/vector/index_to_ptr.h      | 254 ++++++++++++++++++++++++++++++++
 src/vppinfra/vector/test/index_to_ptr.c |  58 ++++++++
 4 files changed, 343 insertions(+), 92 deletions(-)
 create mode 100644 src/vppinfra/vector/index_to_ptr.h
 create mode 100644 src/vppinfra/vector/test/index_to_ptr.c

diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h
index 77964fde821..30fe23443ab 100644
--- a/src/vlib/buffer_funcs.h
+++ b/src/vlib/buffer_funcs.h
@@ -42,6 +42,7 @@
 
 #include <vppinfra/hash.h>
 #include <vppinfra/fifo.h>
+#include <vppinfra/vector/index_to_ptr.h>
 #include <vlib/buffer.h>
 #include <vlib/physmem_funcs.h>
 #include <vlib/main.h>
@@ -201,102 +202,38 @@ vlib_buffer_pool_get_default_for_numa (vlib_main_t * vm, u32 numa_node)
     @param offset - (i32) offset applied to each pointer
 */
 static_always_inline void
-vlib_get_buffers_with_offset (vlib_main_t * vm, u32 * bi, void **b, int count,
+vlib_get_buffers_with_offset (vlib_main_t *vm, u32 *bi, void **b, u32 count,
 			      i32 offset)
 {
   uword buffer_mem_start = vm->buffer_main->buffer_mem_start;
-#ifdef CLIB_HAVE_VEC512
-  u64x8 of8 = u64x8_splat (buffer_mem_start + offset);
-  u64x4 off = u64x8_extract_lo (of8);
-  /* if count is not const, compiler will not unroll while loop
-     se we maintain two-in-parallel variant */
-  while (count >= 32)
-    {
-      u64x8 b0 = u64x8_from_u32x8 (u32x8_load_unaligned (bi));
-      u64x8 b1 = u64x8_from_u32x8 (u32x8_load_unaligned (bi + 8));
-      u64x8 b2 = u64x8_from_u32x8 (u32x8_load_unaligned (bi + 16));
-      u64x8 b3 = u64x8_from_u32x8 (u32x8_load_unaligned (bi + 24));
-      /* shift and add to get vlib_buffer_t pointer */
-      u64x8_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b);
-      u64x8_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b + 8);
-      u64x8_store_unaligned ((b2 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b + 16);
-      u64x8_store_unaligned ((b3 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b + 24);
-      b += 32;
-      bi += 32;
-      count -= 32;
-    }
-  while (count >= 8)
-    {
-      u64x8 b0 = u64x8_from_u32x8 (u32x8_load_unaligned (bi));
-      /* shift and add to get vlib_buffer_t pointer */
-      u64x8_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b);
-      b += 8;
-      bi += 8;
-      count -= 8;
-    }
-#elif defined CLIB_HAVE_VEC256
-  u64x4 off = u64x4_splat (buffer_mem_start + offset);
-  /* if count is not const, compiler will not unroll while loop
-     se we maintain two-in-parallel variant */
-  while (count >= 32)
-    {
-      u64x4 b0 = u64x4_from_u32x4 (u32x4_load_unaligned (bi));
-      u64x4 b1 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 4));
-      u64x4 b2 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 8));
-      u64x4 b3 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 12));
-      u64x4 b4 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 16));
-      u64x4 b5 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 20));
-      u64x4 b6 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 24));
-      u64x4 b7 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 28));
-      /* shift and add to get vlib_buffer_t pointer */
-      u64x4_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b);
-      u64x4_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 4);
-      u64x4_store_unaligned ((b2 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 8);
-      u64x4_store_unaligned ((b3 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 12);
-      u64x4_store_unaligned ((b4 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 16);
-      u64x4_store_unaligned ((b5 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 20);
-      u64x4_store_unaligned ((b6 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 24);
-      u64x4_store_unaligned ((b7 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 28);
-      b += 32;
-      bi += 32;
-      count -= 32;
-    }
-#endif
-  while (count >= 4)
-    {
-#ifdef CLIB_HAVE_VEC256
-      u64x4 b0 = u64x4_from_u32x4 (u32x4_load_unaligned (bi));
-      /* shift and add to get vlib_buffer_t pointer */
-      u64x4_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b);
-#elif defined (CLIB_HAVE_VEC128)
-      u64x2 off = u64x2_splat (buffer_mem_start + offset);
-      u32x4 bi4 = u32x4_load_unaligned (bi);
-      u64x2 b0 = u64x2_from_u32x4 ((u32x4) bi4);
-#if defined (__aarch64__)
-      u64x2 b1 = u64x2_from_u32x4_high ((u32x4) bi4);
-#else
-      bi4 = u32x4_shuffle (bi4, 2, 3, 0, 1);
-      u64x2 b1 = u64x2_from_u32x4 ((u32x4) bi4);
-#endif
-      u64x2_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b);
-      u64x2_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 2);
-#else
-      b[0] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[0], offset);
-      b[1] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[1], offset);
-      b[2] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[2], offset);
-      b[3] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[3], offset);
-#endif
-      b += 4;
-      bi += 4;
-      count -= 4;
-    }
-  while (count)
+  void *base = (void *) (buffer_mem_start + offset);
+  int objsize = __builtin_object_size (b, 0);
+  const int sh = CLIB_LOG2_CACHE_LINE_BYTES;
+
+  if (COMPILE_TIME_CONST (count) == 0 && objsize >= 64 * sizeof (b[0]) &&
+      (objsize & ((8 * sizeof (b[0])) - 1)) == 0)
     {
-      b[0] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[0], offset);
-      b += 1;
-      bi += 1;
-      count -= 1;
+      u32 n = round_pow2 (count, 8);
+      ASSERT (objsize >= count);
+      CLIB_ASSUME (objsize >= count);
+      while (n >= 64)
+        {
+          clib_index_to_ptr_u32 (bi, base, sh, b, 64);
+          b += 64;
+          bi += 64;
+          n -= 64;
+        }
+
+      while (n)
+        {
+          clib_index_to_ptr_u32 (bi, base, sh, b, 8);
+          b += 8;
+          bi += 8;
+          n -= 8;
+        }
     }
+  else
+    clib_index_to_ptr_u32 (bi, base, sh, b, count);
 }
 
 /** \brief Translate array of buffer indices into buffer pointers
@@ -308,7 +245,7 @@ vlib_get_buffers_with_offset (vlib_main_t * vm, u32 * bi, void **b, int count,
 */
 
 static_always_inline void
-vlib_get_buffers (vlib_main_t * vm, u32 * bi, vlib_buffer_t ** b, int count)
+vlib_get_buffers (vlib_main_t *vm, u32 *bi, vlib_buffer_t **b, u32 count)
 {
   vlib_get_buffers_with_offset (vm, bi, (void **) b, count, 0);
 }
diff --git a/src/vppinfra/CMakeLists.txt b/src/vppinfra/CMakeLists.txt
index 6900995e644..11d4a5d539b 100644
--- a/src/vppinfra/CMakeLists.txt
+++ b/src/vppinfra/CMakeLists.txt
@@ -195,6 +195,7 @@ set(VPPINFRA_HEADERS
   vector/array_mask.h
   vector/compress.h
   vector/count_equal.h
+  vector/index_to_ptr.h
   vector/mask_compare.h
   vector.h
   vector_neon.h
@@ -275,6 +276,7 @@ set(test_files
   vector/test/array_mask.c
   vector/test/compress.c
   vector/test/count_equal.c
+  vector/test/index_to_ptr.c
   vector/test/mask_compare.c
 )
 
diff --git a/src/vppinfra/vector/index_to_ptr.h b/src/vppinfra/vector/index_to_ptr.h
new file mode 100644
index 00000000000..91de3546439
--- /dev/null
+++ b/src/vppinfra/vector/index_to_ptr.h
@@ -0,0 +1,254 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#ifndef included_vector_index_to_ptr_h
+#define included_vector_index_to_ptr_h
+#include <vppinfra/clib.h>
+
+#ifdef CLIB_HAVE_VEC128
+static_always_inline void
+clib_index_to_ptr_u32x4 (u32 *indices, void **ptrs, i32 i, u64x2 ov, u8 shift)
+{
+  u32x4 iv4 = u32x4_load_unaligned (indices + i);
+  u64x2 pv2;
+  pv2 = u64x2_from_u32x4 (iv4);
+  u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i);
+#ifdef __aarch64__
+  pv2 = u64x2_from_u32x4_high (iv4);
+#else
+  pv2 = u64x2_from_u32x4 ((u32x4) u8x16_word_shift_right (iv4, 8));
+#endif
+  u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i + 2);
+}
+#endif
+
+/** \brief Convert array of indices to pointers with base and shift
+
+    @param indices source array of u32 indices
+    @param base base pointer
+    @param shift number of bits to shift
+    @param ptrs destination array of pointers
+    @param n_elts number of elements in the source array
+*/
+
+static_always_inline void
+clib_index_to_ptr_u32 (u32 *indices, void *base, u8 shift, void **ptrs,
+                       u32 n_elts)
+{
+#if defined CLIB_HAVE_VEC512
+  if (n_elts >= 8)
+    {
+      u64x8 off = u64x8_splat ((u64) base);
+      u64x8 b0, b1, b2, b3, b4, b5, b6, b7;
+
+      while (n_elts >= 64)
+        {
+          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
+          b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
+          b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
+          b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
+          b4 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 32));
+          b5 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 40));
+          b6 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 48));
+          b7 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 56));
+          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
+          u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
+          u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
+          u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
+          u64x8_store_unaligned ((b4 << shift) + off, ptrs + 32);
+          u64x8_store_unaligned ((b5 << shift) + off, ptrs + 40);
+          u64x8_store_unaligned ((b6 << shift) + off, ptrs + 48);
+          u64x8_store_unaligned ((b7 << shift) + off, ptrs + 56);
+          ptrs += 64;
+          indices += 64;
+          n_elts -= 64;
+        }
+
+      if (n_elts == 0)
+        return;
+
+      if (n_elts >= 32)
+        {
+          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
+          b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
+          b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
+          b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
+          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
+          u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
+          u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
+          u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
+          ptrs += 32;
+          indices += 32;
+          n_elts -= 32;
+        }
+      if (n_elts >= 16)
+        {
+          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
+          b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
+          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
+          u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
+          ptrs += 16;
+          indices += 16;
+          n_elts -= 16;
+        }
+      if (n_elts > 8)
+        {
+          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
+          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
+          ptrs += 8;
+          indices += 8;
+          n_elts -= 8;
+        }
+
+      b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + n_elts - 8));
+      u64x8_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 8);
+    }
+  else
+    {
+      u32 mask = pow2_mask (n_elts);
+      u64x8 r = u64x8_from_u32x8 (u32x8_mask_load_zero (indices, mask));
+      u64x8_mask_store ((r << shift) + u64x8_splat ((u64) base), ptrs, mask);
+      return;
+    }
+#elif defined CLIB_HAVE_VEC256
+  if (n_elts >= 4)
+    {
+      u64x4 off = u64x4_splat ((u64) base);
+      u64x4 b0, b1, b2, b3, b4, b5, b6, b7;
+
+      while (n_elts >= 32)
+        {
+          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
+          b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
+          b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
+          b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
+          b4 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 16));
+          b5 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 20));
+          b6 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 24));
+          b7 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 28));
+          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
+          u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
+          u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
+          u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
+          u64x4_store_unaligned ((b4 << shift) + off, ptrs + 16);
+          u64x4_store_unaligned ((b5 << shift) + off, ptrs + 20);
+          u64x4_store_unaligned ((b6 << shift) + off, ptrs + 24);
+          u64x4_store_unaligned ((b7 << shift) + off, ptrs + 28);
+          ptrs += 32;
+          indices += 32;
+          n_elts -= 32;
+        }
+
+      if (n_elts == 0)
+        return;
+
+      if (n_elts >= 16)
+        {
+          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
+          b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
+          b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
+          b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
+          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
+          u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
+          u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
+          u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
+          ptrs += 16;
+          indices += 16;
+          n_elts -= 16;
+        }
+      if (n_elts >= 8)
+        {
+          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
+          b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
+          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
+          u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
+          ptrs += 8;
+          indices += 8;
+          n_elts -= 8;
+        }
+      if (n_elts > 4)
+        {
+          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
+          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
+          ptrs += 4;
+          indices += 4;
+          n_elts -= 4;
+        }
+
+      b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + n_elts - 4));
+      u64x4_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 4);
+      return;
+    }
+#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
+  else
+    {
+      u32 mask = pow2_mask (n_elts);
+      u64x4 r = u64x4_from_u32x4 (u32x4_mask_load_zero (indices, mask));
+      u64x4_mask_store ((r << shift) + u64x4_splat ((u64) base), ptrs, mask);
+      return;
+    }
+#endif
+#elif defined(CLIB_HAVE_VEC128)
+  if (n_elts >= 4)
+    {
+      u64x2 ov = u64x2_splat ((u64) base);
+      u32 *i = (u32 *) indices;
+      void **p = (void **) ptrs;
+      u32 n = n_elts;
+
+      while (n >= 32)
+        {
+          clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 16, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 20, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 24, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 28, ov, shift);
+          indices += 32;
+          ptrs += 32;
+          n -= 32;
+        }
+
+      if (n == 0)
+        return;
+
+      if (n >= 16)
+        {
+          clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
+          indices += 16;
+          ptrs += 16;
+          n -= 16;
+        }
+
+      if (n >= 8)
+        {
+          clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
+          indices += 8;
+          ptrs += 8;
+          n -= 8;
+        }
+
+      if (n > 4)
+        clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
+
+      clib_index_to_ptr_u32x4 (i, p, n_elts - 4, ov, shift);
+      return;
+    }
+#endif
+  while (n_elts)
+    {
+      ptrs[0] = base + ((u64) indices[0] << shift);
+      ptrs += 1;
+      indices += 1;
+      n_elts -= 1;
+    }
+}
+
+#endif
diff --git a/src/vppinfra/vector/test/index_to_ptr.c b/src/vppinfra/vector/test/index_to_ptr.c
new file mode 100644
index 00000000000..ae33020328a
--- /dev/null
+++ b/src/vppinfra/vector/test/index_to_ptr.c
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#include <vppinfra/format.h>
+#include <vppinfra/vector/test/test.h>
+#include <vppinfra/vector/index_to_ptr.h>
+
+typedef void (wrapper_fn) (u32 *indices, void *base, u8 shift, void **ptrs,
+                           u32 n_elts);
+
+__clib_test_fn void
+clib_index_to_ptr_u32_wrapper (u32 *indices, void *base, u8 shift, void **ptrs,
+                               u32 n_elts)
+{
+  clib_index_to_ptr_u32 (indices, base, shift, ptrs, n_elts);
+}
+
+static wrapper_fn *wfn = &clib_index_to_ptr_u32_wrapper;
+
+static clib_error_t *
+test_clib_index_to_ptr_u32 (clib_error_t *err)
+{
+  void *_ptrs[512 + 128], **ptrs = _ptrs + 64;
+  u32 _indices[512 + 128], *indices = _indices + 64;
+  u16 lengths[] = { 1,  3,  5,  7,  9,  15, 16, 17,  31, 32,
+                    33, 40, 41, 42, 63, 64, 65, 511, 512 };
+
+  for (int i = 0; i < ARRAY_LEN (_indices); i++)
+    _indices[i] = i;
+
+  for (int i = 0; i < ARRAY_LEN (lengths); i++)
+    {
+      u16 len = lengths[i];
+      u8 shift = 6;
+      void *base = (void *) 0x100000000 + i;
+
+      for (int j = -64; j < len + 64; j++)
+        ptrs[j] = (void *) 0xfefefefefefefefe;
+
+      wfn (indices, base, shift, ptrs, len);
+      for (int j = 0; j < len; j++)
+        {
+          void *expected = base + ((u64) indices[j] << shift);
+          if (ptrs[j] != expected)
+            return clib_error_return (err,
+                                      "testcase failed for length %u "
+                                      "(offset %u, expected %p, found %p)",
+                                      len, j, expected, ptrs[j]);
+        }
+    }
+  return err;
+}
+
+REGISTER_TEST (clib_index_to_ptr_u32) = {
+  .name = "clib_index_to_ptr_u32",
+  .fn = test_clib_index_to_ptr_u32,
+};
-- 
2.16.6
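
Usage sketch (illustration only, not part of the patch): the new helper computes
ptrs[i] = base + ((u64) indices[i] << shift) for a whole array, selecting an
AVX-512, AVX2, SSE/NEON or scalar path at compile time. The snippet below shows
how a caller outside of vlib might use it directly; the element size (64 bytes,
hence shift 6), the fixed array bound and all names here are assumptions made
for the example, not APIs introduced by the patch.

  #include <vppinfra/vector/index_to_ptr.h>

  /* Hypothetical caller: expand up to 256 u32 slot indices into pointers to
     64-byte slots. 'pool_base' and 'indices' are assumed to come from the
     caller's own data structures. */
  static void
  example_expand_indices (void *pool_base, u32 *indices, u32 n_elts)
  {
    void *ptrs[256];

    ASSERT (n_elts <= 256);
    /* ptrs[i] = pool_base + ((u64) indices[i] << 6) */
    clib_index_to_ptr_u32 (indices, pool_base, 6, ptrs, n_elts);
    /* ptrs[0 .. n_elts - 1] now hold the translated pointers */
  }

In the patch itself, vlib_get_buffers_with_offset() is essentially this call
with base = buffer_mem_start + offset and shift = CLIB_LOG2_CACHE_LINE_BYTES,
plus a fixed-block path (64- and 8-element chunks) that is taken when
__builtin_object_size() reveals the size of the destination array, which lets
the compiler fully unroll the inner conversions.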