src/vppinfra/vector/index_to_ptr.h

   1 /* SPDX-License-Identifier: Apache-2.0
   2  * Copyright(c) 2021 Cisco Systems, Inc.
   3  */
   4
   5 #ifndef included_vector_index_to_ptr_h
   6 #define included_vector_index_to_ptr_h
   7 #include <vppinfra/clib.h>
   8
   9 #ifdef CLIB_HAVE_VEC128
  10 static_always_inline void
  11 clib_index_to_ptr_u32x4 (u32 *indices, void **ptrs, i32 i, u64x2 ov, u8 shift)
  12 {
  13   u32x4 iv4 = u32x4_load_unaligned (indices + i);
  14   u64x2 pv2;
  15   pv2 = u64x2_from_u32x4 (iv4);
  16   u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i);
  17 #ifdef __aarch64__
  18   pv2 = u64x2_from_u32x4_high (iv4);
  19 #else
  20   pv2 = u64x2_from_u32x4 ((u32x4) u8x16_word_shift_right (iv4, 8));
  21 #endif
  22   u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i + 2);
  23 }
  24 #endif
  25
  26 /** \brief Convert array of indices to pointers with base and shift
  27
  28     @param indices source array of u32 indices
  29     @param base base pointer
  30     @param shift numbers of bits to be shifted
  31     @param ptrs destinatin array of pointers
  32     @param n_elts number of elements in the source array
  33 */
  34
  35 static_always_inline void
  36 clib_index_to_ptr_u32 (u32 *indices, void *base, u8 shift, void **ptrs,
  37                        u32 n_elts)
  38 {
  39 #if defined CLIB_HAVE_VEC512
  40   if (n_elts >= 8)
  41     {
  42       u64x8 off = u64x8_splat ((u64) base);
  43       u64x8 b0, b1, b2, b3, b4, b5, b6, b7;
  44
  45       while (n_elts >= 64)
  46         {
  47           b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
  48           b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
  49           b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
  50           b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
  51           b4 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 32));
  52           b5 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 40));
  53           b6 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 48));
  54           b7 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 56));
  55           u64x8_store_unaligned ((b0 << shift) + off, ptrs);
  56           u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
  57           u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
  58           u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
  59           u64x8_store_unaligned ((b4 << shift) + off, ptrs + 32);
  60           u64x8_store_unaligned ((b5 << shift) + off, ptrs + 40);
  61           u64x8_store_unaligned ((b6 << shift) + off, ptrs + 48);
  62           u64x8_store_unaligned ((b7 << shift) + off, ptrs + 56);
  63           ptrs += 64;
  64           indices += 64;
  65           n_elts -= 64;
  66         }
  67
  68       if (n_elts == 0)
  69         return;
  70
  71       if (n_elts >= 32)
  72         {
  73           b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
  74           b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
  75           b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
  76           b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
  77           u64x8_store_unaligned ((b0 << shift) + off, ptrs);
  78           u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
  79           u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
  80           u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
  81           ptrs += 32;
  82           indices += 32;
  83           n_elts -= 32;
  84         }
  85       if (n_elts >= 16)
  86         {
  87           b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
  88           b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
  89           u64x8_store_unaligned ((b0 << shift) + off, ptrs);
  90           u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
  91           ptrs += 16;
  92           indices += 16;
  93           n_elts -= 16;
  94         }
  95       if (n_elts >= 8)
  96         {
  97           b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
  98           u64x8_store_unaligned ((b0 << shift) + off, ptrs);
  99           ptrs += 8;
 100           indices += 8;
 101           n_elts -= 8;
 102         }
 103
 104       if (n_elts == 0)
 105         return;
 106
 107       b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + n_elts - 8));
 108       u64x8_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 8);
 109     }
 110   else
 111     {
 112       u32 mask = pow2_mask (n_elts);
 113       u64x8 r = u64x8_from_u32x8 (u32x8_mask_load_zero (indices, mask));
 114       u64x8_mask_store ((r << shift) + u64x8_splat ((u64) base), ptrs, mask);
 115       return;
 116     }
 117 #elif defined CLIB_HAVE_VEC256
 118   if (n_elts >= 4)
 119     {
 120       u64x4 off = u64x4_splat ((u64) base);
 121       u64x4 b0, b1, b2, b3, b4, b5, b6, b7;
 122
 123       while (n_elts >= 32)
 124         {
 125           b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
 126           b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
 127           b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
 128           b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
 129           b4 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 16));
 130           b5 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 20));
 131           b6 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 24));
 132           b7 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 28));
 133           u64x4_store_unaligned ((b0 << shift) + off, ptrs);
 134           u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
 135           u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
 136           u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
 137           u64x4_store_unaligned ((b4 << shift) + off, ptrs + 16);
 138           u64x4_store_unaligned ((b5 << shift) + off, ptrs + 20);
 139           u64x4_store_unaligned ((b6 << shift) + off, ptrs + 24);
 140           u64x4_store_unaligned ((b7 << shift) + off, ptrs + 28);
 141           ptrs += 32;
 142           indices += 32;
 143           n_elts -= 32;
 144         }
 145
 146       if (n_elts == 0)
 147         return;
 148
 149       if (n_elts >= 16)
 150         {
 151           b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
 152           b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
 153           b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
 154           b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
 155           u64x4_store_unaligned ((b0 << shift) + off, ptrs);
 156           u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
 157           u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
 158           u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
 159           ptrs += 16;
 160           indices += 16;
 161           n_elts -= 16;
 162         }
 163       if (n_elts >= 8)
 164         {
 165           b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
 166           b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
 167           u64x4_store_unaligned ((b0 << shift) + off, ptrs);
 168           u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
 169           ptrs += 8;
 170           indices += 8;
 171           n_elts -= 8;
 172         }
 173       if (n_elts > 4)
 174         {
 175           b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
 176           u64x4_store_unaligned ((b0 << shift) + off, ptrs);
 177           ptrs += 4;
 178           indices += 4;
 179           n_elts -= 4;
 180         }
 181
 182       b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + n_elts - 4));
 183       u64x4_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 4);
 184       return;
 185     }
 186 #ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
 187   else
 188     {
 189       u32 mask = pow2_mask (n_elts);
 190       u64x4 r = u64x4_from_u32x4 (u32x4_mask_load_zero (indices, mask));
 191       u64x4_mask_store ((r << shift) + u64x4_splat ((u64) base), ptrs, mask);
 192       return;
 193     }
 194 #endif
 195 #elif defined(CLIB_HAVE_VEC128)
 196   if (n_elts >= 4)
 197     {
 198       u64x2 ov = u64x2_splat ((u64) base);
 199       u32 *i = (u32 *) indices;
 200       void **p = (void **) ptrs;
 201       u32 n = n_elts;
 202
 203       while (n >= 32)
 204         {
 205           clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
 206           clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
 207           clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
 208           clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
 209           clib_index_to_ptr_u32x4 (indices, ptrs, 16, ov, shift);
 210           clib_index_to_ptr_u32x4 (indices, ptrs, 20, ov, shift);
 211           clib_index_to_ptr_u32x4 (indices, ptrs, 24, ov, shift);
 212           clib_index_to_ptr_u32x4 (indices, ptrs, 28, ov, shift);
 213           indices += 32;
 214           ptrs += 32;
 215           n -= 32;
 216         }
 217
 218       if (n == 0)
 219         return;
 220
 221       if (n >= 16)
 222         {
 223           clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
 224           clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
 225           clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
 226           clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
 227           indices += 16;
 228           ptrs += 16;
 229           n -= 16;
 230         }
 231
 232       if (n >= 8)
 233         {
 234           clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
 235           clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
 236           indices += 8;
 237           ptrs += 8;
 238           n -= 8;
 239         }
 240
 241       if (n > 4)
 242         clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
 243
 244       clib_index_to_ptr_u32x4 (i, p, n_elts - 4, ov, shift);
 245       return;
 246     }
 247 #endif
 248   while (n_elts)
 249     {
 250       ptrs[0] = base + ((u64) indices[0] << shift);
 251       ptrs += 1;
 252       indices += 1;
 253       n_elts -= 1;
 254     }
 255 }
 256
 257 #endif