/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef included_vector_avx512_h
#define included_vector_avx512_h

#include <vppinfra/clib.h>
#include <x86intrin.h>
#define foreach_avx512_vec512i \
  _(i,8,64,epi8) _(i,16,32,epi16) _(i,32,16,epi32) _(i,64,8,epi64)
#define foreach_avx512_vec512u \
  _(u,8,64,epi8) _(u,16,32,epi16) _(u,32,16,epi32) _(u,64,8,epi64)
#define foreach_avx512_vec512f \
  _(f,32,8,ps) _(f,64,4,pd)
/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal, is_zero_mask */
#define _(t, s, c, i) \
  static_always_inline t##s##x##c t##s##x##c##_splat (t##s x) \
  { \
    return (t##s##x##c) _mm512_set1_##i (x); \
  } \
  static_always_inline t##s##x##c t##s##x##c##_load_aligned (void *p) \
  { \
    return (t##s##x##c) _mm512_load_si512 (p); \
  } \
  static_always_inline void t##s##x##c##_store_aligned (t##s##x##c v, \
							void *p) \
  { \
    _mm512_store_si512 ((__m512i *) p, (__m512i) v); \
  } \
  static_always_inline t##s##x##c t##s##x##c##_load_unaligned (void *p) \
  { \
    return (t##s##x##c) _mm512_loadu_si512 (p); \
  } \
  static_always_inline void t##s##x##c##_store_unaligned (t##s##x##c v, \
							  void *p) \
  { \
    _mm512_storeu_si512 ((__m512i *) p, (__m512i) v); \
  } \
  static_always_inline int t##s##x##c##_is_all_zero (t##s##x##c v) \
  { \
    return (_mm512_test_epi64_mask ((__m512i) v, (__m512i) v) == 0); \
  } \
  static_always_inline int t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
  { \
    return (_mm512_cmpneq_epi64_mask ((__m512i) a, (__m512i) b) == 0); \
  } \
  static_always_inline int t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
  { \
    return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); \
  } \
  static_always_inline u##c t##s##x##c##_is_zero_mask (t##s##x##c v) \
  { \
    return _mm512_test_##i##_mask ((__m512i) v, (__m512i) v); \
  } \
  static_always_inline t##s##x##c t##s##x##c##_interleave_lo (t##s##x##c a, \
							      t##s##x##c b) \
  { \
    return (t##s##x##c) _mm512_unpacklo_##i ((__m512i) a, (__m512i) b); \
  } \
  static_always_inline t##s##x##c t##s##x##c##_interleave_hi (t##s##x##c a, \
							      t##s##x##c b) \
  { \
    return (t##s##x##c) _mm512_unpackhi_##i ((__m512i) a, (__m512i) b); \
  }
foreach_avx512_vec512i foreach_avx512_vec512u
#undef _
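/* Illustrative sketch, not part of the header: the macro above expands to
   one set of inline functions per type listed in the foreach macros, e.g.
   for u32x16:

     u32x16 v = u32x16_splat (7);
     if (u32x16_is_all_equal (v, 7))
       ;					// all 16 lanes hold 7
     u16 nz = u32x16_is_zero_mask (v);		// bit n set if lane n non-zero
 */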
static_always_inline u32
u16x32_msb_mask (u16x32 v)
{
  return (u32) _mm512_movepi16_mask ((__m512i) v);
}
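/* Sketch: packs the most significant bit of each 16-bit lane into a u32,
   e.g. to turn a lane-wise compare (lanes become all-ones on match) into a
   bitmask; the cast is needed because vector compares yield signed lanes:

     u32 hits = u16x32_msb_mask ((u16x32) (a == b));
 */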
#define u64x8_i64gather(index, base, scale) \
  (u64x8) _mm512_i64gather_epi64 ((__m512i) index, base, scale)
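/* Sketch ('base' and 'idx' are hypothetical): with scale 8, lane n of the
   result is loaded from (u64 *) base + idx[n]:

     u64x8 idx = { 0, 2, 4, 6, 8, 10, 12, 14 };
     u64x8 vals = u64x8_i64gather (idx, base, 8);
 */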
#define _(f, t, fn) \
  always_inline t t##_pack (f lo, f hi) \
  { \
    return (t) fn ((__m512i) lo, (__m512i) hi); \
  }

_ (i16x32, i8x64, _mm512_packs_epi16)
_ (i16x32, u8x64, _mm512_packus_epi16)
_ (i32x16, i16x32, _mm512_packs_epi32)
_ (i32x16, u16x32, _mm512_packus_epi32)
#undef _
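/* Note: like their SSE/AVX2 counterparts, these pack instructions saturate
   and operate per 128-bit lane, so the result is not a plain lo/hi
   concatenation.  Sketch:

     i8x64 narrow = i8x64_pack (lo, hi);	// two i16x32 -> one i8x64
 */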
static_always_inline u64x8
u64x8_byte_swap (u64x8 v)
{
  u8x64 swap = {
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
  };
  return (u64x8) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

static_always_inline u32x16
u32x16_byte_swap (u32x16 v)
{
  u8x64 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x16) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

static_always_inline u16x32
u16x32_byte_swap (u16x32 v)
{
  u8x64 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x32) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}
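/* Sketch: the byte swaps convert lane contents between big- and
   little-endian, e.g. sixteen network-order u32 fields at a hypothetical
   pointer p:

     u32x16 v = u32x16_byte_swap (u32x16_load_unaligned (p));
 */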
#define _(f, t) \
  static_always_inline t f##_extract_lo (f v) \
  { \
    return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 0); \
  } \
  static_always_inline t f##_extract_hi (f v) \
  { \
    return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 1); \
  }

_ (u64x8, u64x4)
_ (u32x16, u32x8)
_ (u16x32, u16x16)
_ (u8x64, u8x32)
#undef _
static_always_inline u32
u32x16_min_scalar (u32x16 v)
{
  return u32x8_min_scalar (u32x8_min (u32x16_extract_lo (v),
				      u32x16_extract_hi (v)));
}
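/* Sketch ('lengths' is hypothetical): reduce sixteen lanes to their
   minimum, e.g. the smallest buffer length in a batch of sixteen:

     u32 min_len = u32x16_min_scalar (lengths);
 */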
static_always_inline u32x16
u32x16_insert_lo (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 0);
}

static_always_inline u32x16
u32x16_insert_hi (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 1);
}
static_always_inline u64x8
u64x8_permute (u64x8 a, u64x8 b, u64x8 mask)
{
  return (u64x8) _mm512_permutex2var_epi64 ((__m512i) a, (__m512i) mask,
					    (__m512i) b);
}
#define u32x16_ternary_logic(a, b, c, d) \
  (u32x16) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b, (__m512i) c, d)
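/* The last argument of vpternlog is an 8-bit truth table: bit
   (a << 2 | b << 1 | c) of d is the result for input bits a, b, c.  For
   example 0x96 is a ^ b ^ c and 0xca is a ? b : c.  Sketch:

     u32x16 x = u32x16_ternary_logic (a, b, c, 0x96);	// 3-way xor
 */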
#define u8x64_insert_u8x16(a, b, n) \
  (u8x64) _mm512_inserti64x2 ((__m512i) (a), (__m128i) (b), n)

#define u8x64_extract_u8x16(a, n) \
  (u8x16) _mm512_extracti64x2_epi64 ((__m512i) (a), n)
#define u8x64_word_shift_left(a,n)  (u8x64) _mm512_bslli_epi128((__m512i) a, n)
#define u8x64_word_shift_right(a,n) (u8x64) _mm512_bsrli_epi128((__m512i) a, n)
static_always_inline u8x64
u8x64_xor3 (u8x64 a, u8x64 b, u8x64 c)
{
  return (u8x64) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
					    (__m512i) c, 0x96);
}

static_always_inline u64x8
u64x8_xor3 (u64x8 a, u64x8 b, u64x8 c)
{
  return (u64x8) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
					    (__m512i) c, 0x96);
}
static_always_inline u8x64
u8x64_reflect_u8x16 (u8x64 x)
{
  static const u8x64 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  };
  return (u8x64) _mm512_shuffle_epi8 ((__m512i) x, (__m512i) mask);
}
#define u8x64_align_right(a, b, imm) \
  (u8x64) _mm512_alignr_epi8 ((__m512i) a, (__m512i) b, imm)

#define u64x8_align_right(a, b, imm) \
  (u64x8) _mm512_alignr_epi64 ((__m512i) a, (__m512i) b, imm)
static_always_inline u32
u32x16_sum_elts (u32x16 sum16)
{
  u32x8 sum8;
  sum16 += (u32x16) u8x64_align_right (sum16, sum16, 8);
  sum16 += (u32x16) u8x64_align_right (sum16, sum16, 4);
  sum8 = u32x16_extract_hi (sum16) + u32x16_extract_lo (sum16);
  return sum8[0] + sum8[4];
}
#define _(t, m, p, i, e) \
  static_always_inline t t##_mask_load (t a, void *p, m mask) \
  { \
    return (t) p##_mask_loadu_##e ((i) a, mask, p); \
  } \
  static_always_inline t t##_mask_load_zero (void *p, m mask) \
  { \
    return (t) p##_maskz_loadu_##e (mask, p); \
  } \
  static_always_inline void t##_mask_store (t a, void *p, m mask) \
  { \
    p##_mask_storeu_##e (p, mask, (i) a); \
  }

_ (u8x64, u64, _mm512, __m512i, epi8)
_ (u8x32, u32, _mm256, __m256i, epi8)
_ (u8x16, u16, _mm, __m128i, epi8)
_ (u16x32, u32, _mm512, __m512i, epi16)
_ (u16x16, u16, _mm256, __m256i, epi16)
_ (u16x8, u8, _mm, __m128i, epi16)
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u32x4, u8, _mm, __m128i, epi32)
_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u64x2, u8, _mm, __m128i, epi64)
#undef _
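/* Sketch: masked loads/stores never touch lanes whose mask bit is clear
   (the _zero variant zeroes them on load), so partial vectors at buffer
   tails are safe, e.g. for n < 16 u32 values:

     u32x16 v = u32x16_mask_load_zero (src, pow2_mask (n));
     u32x16_mask_store (v, dst, pow2_mask (n));
 */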
#define _(t, m, p, i, e) \
  static_always_inline t t##_mask_and (t a, t b, m mask) \
  { \
    return (t) p##_mask_and_##e ((i) a, mask, (i) a, (i) b); \
  } \
  static_always_inline t t##_mask_andnot (t a, t b, m mask) \
  { \
    return (t) p##_mask_andnot_##e ((i) a, mask, (i) a, (i) b); \
  } \
  static_always_inline t t##_mask_xor (t a, t b, m mask) \
  { \
    return (t) p##_mask_xor_##e ((i) a, mask, (i) a, (i) b); \
  } \
  static_always_inline t t##_mask_or (t a, t b, m mask) \
  { \
    return (t) p##_mask_or_##e ((i) a, mask, (i) a, (i) b); \
  }

_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u32x4, u8, _mm, __m128i, epi32)
_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u64x2, u8, _mm, __m128i, epi64)
#undef _
#ifdef CLIB_HAVE_VEC512
#define CLIB_HAVE_VEC512_MASK_LOAD_STORE
#define CLIB_HAVE_VEC512_MASK_BITWISE_OPS
#endif
#ifdef CLIB_HAVE_VEC256
#define CLIB_HAVE_VEC256_MASK_LOAD_STORE
#define CLIB_HAVE_VEC256_MASK_BITWISE_OPS
#endif
#ifdef CLIB_HAVE_VEC128
#define CLIB_HAVE_VEC128_MASK_LOAD_STORE
#define CLIB_HAVE_VEC128_MASK_BITWISE_OPS
#endif
static_always_inline u8x64
u8x64_splat_u8x16 (u8x16 a)
{
  return (u8x64) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u32x16
u32x16_splat_u32x4 (u32x4 a)
{
  return (u32x16) _mm512_broadcast_i64x2 ((__m128i) a);
}
static_always_inline u32x16
u32x16_mask_blend (u32x16 a, u32x16 b, u16 mask)
{
  return (u32x16) _mm512_mask_blend_epi32 (mask, (__m512i) a, (__m512i) b);
}

static_always_inline u8x64
u8x64_mask_blend (u8x64 a, u8x64 b, u64 mask)
{
  return (u8x64) _mm512_mask_blend_epi8 (mask, (__m512i) a, (__m512i) b);
}
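/* Sketch: blend picks lane n from b where mask bit n is set, else from a:

     u32x16 r = u32x16_mask_blend (a, b, 0x00ff);	// lanes 0-7 from b
 */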
static_always_inline u8x64
u8x64_permute (u8x64 idx, u8x64 a)
{
  return (u8x64) _mm512_permutexvar_epi8 ((__m512i) idx, (__m512i) a);
}

static_always_inline u8x64
u8x64_permute2 (u8x64 idx, u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_permutex2var_epi8 ((__m512i) a, (__m512i) idx,
					   (__m512i) b);
}
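/* Sketch: byte permutes give 64-entry table lookups, lane n of the result
   being a[idx[n] & 63]; the two-operand form also selects from b when bit 6
   of the index byte is set, for a 128-byte table:

     u8x64 out = u8x64_permute (idx, table);	// 'table' is hypothetical
 */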
#define _(t, m, e, p, it) \
  static_always_inline m t##_is_equal_mask (t a, t b) \
  { \
    return p##_cmpeq_##e##_mask ((it) a, (it) b); \
  }
_ (u8x16, u16, epu8, _mm, __m128i)
_ (u16x8, u8, epu16, _mm, __m128i)
_ (u32x4, u8, epu32, _mm, __m128i)
_ (u64x2, u8, epu64, _mm, __m128i)

_ (u8x32, u32, epu8, _mm256, __m256i)
_ (u16x16, u16, epu16, _mm256, __m256i)
_ (u32x8, u8, epu32, _mm256, __m256i)
_ (u64x4, u8, epu64, _mm256, __m256i)

_ (u8x64, u64, epu8, _mm512, __m512i)
_ (u16x32, u32, epu16, _mm512, __m512i)
_ (u32x16, u16, epu32, _mm512, __m512i)
_ (u64x8, u8, epu64, _mm512, __m512i)
#undef _
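/* Sketch: compare-to-mask makes value scans cheap, e.g. finding the first
   occurrence of byte 'ch' in a 64-byte chunk with vppinfra's
   count_trailing_zeros:

     u64 eq = u8x64_is_equal_mask (v, u8x64_splat (ch));
     if (eq)
       first = count_trailing_zeros (eq);
 */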
#define _(t, m, e, p, it) \
  static_always_inline m t##_is_not_equal_mask (t a, t b) \
  { \
    return p##_cmpneq_##e##_mask ((it) a, (it) b); \
  }
_ (u8x16, u16, epu8, _mm, __m128i)
_ (u16x8, u8, epu16, _mm, __m128i)
_ (u32x4, u8, epu32, _mm, __m128i)
_ (u64x2, u8, epu64, _mm, __m128i)

_ (u8x32, u32, epu8, _mm256, __m256i)
_ (u16x16, u16, epu16, _mm256, __m256i)
_ (u32x8, u8, epu32, _mm256, __m256i)
_ (u64x4, u8, epu64, _mm256, __m256i)

_ (u8x64, u64, epu8, _mm512, __m512i)
_ (u16x32, u32, epu16, _mm512, __m512i)
_ (u32x16, u16, epu32, _mm512, __m512i)
_ (u64x8, u8, epu64, _mm512, __m512i)
#undef _
#define _(f, t, fn, it) \
  static_always_inline t t##_from_##f (f x) { return (t) fn ((it) x); }
_ (u16x16, u32x16, _mm512_cvtepi16_epi32, __m256i)
_ (u32x16, u16x16, _mm512_cvtusepi32_epi16, __m512i)
_ (u32x8, u16x8, _mm256_cvtusepi32_epi16, __m256i)
_ (u32x8, u64x8, _mm512_cvtepu32_epi64, __m256i)
#undef _
#define _(vt, mt, p, it, epi) \
  static_always_inline vt vt##_compress (vt a, mt mask) \
  { \
    return (vt) p##_maskz_compress_##epi (mask, (it) a); \
  } \
  static_always_inline vt vt##_expand (vt a, mt mask) \
  { \
    return (vt) p##_maskz_expand_##epi (mask, (it) a); \
  } \
  static_always_inline void vt##_compress_store (vt v, mt mask, void *p) \
  { \
    p##_mask_compressstoreu_##epi (p, mask, (it) v); \
  }

_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u64x2, u8, _mm, __m128i, epi64)
_ (u32x4, u8, _mm, __m128i, epi32)
#ifdef __AVX512VBMI2__
_ (u16x32, u32, _mm512, __m512i, epi16)
_ (u8x64, u64, _mm512, __m512i, epi8)
_ (u16x16, u16, _mm256, __m256i, epi16)
_ (u8x32, u32, _mm256, __m256i, epi8)
_ (u16x8, u8, _mm, __m128i, epi16)
_ (u8x16, u16, _mm, __m128i, epi8)
#endif
#undef _
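/* Sketch: compress packs the lanes selected by mask into the low lanes of
   the result (expand is the inverse), e.g. keeping only the buffer indices
   whose packets matched ('bi', 'match_mask' and 'dst' are hypothetical):

     u32x16 hits = u32x16_compress (bi, match_mask);
     u32x16_compress_store (bi, match_mask, dst);	// or store directly
 */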
#ifdef CLIB_HAVE_VEC256
#define CLIB_HAVE_VEC256_COMPRESS
#ifdef __AVX512VBMI2__
#define CLIB_HAVE_VEC256_COMPRESS_U8_U16
#endif
#endif
#ifdef CLIB_HAVE_VEC512
#define CLIB_HAVE_VEC512_COMPRESS
#ifdef __AVX512VBMI2__
#define CLIB_HAVE_VEC512_COMPRESS_U8_U16
#endif
#endif
#ifndef __AVX512VBMI2__
static_always_inline u16x16
u16x16_compress (u16x16 v, u16 mask)
{
  return u16x16_from_u32x16 (u32x16_compress (u32x16_from_u16x16 (v), mask));
}

static_always_inline u16x8
u16x8_compress (u16x8 v, u8 mask)
{
  return u16x8_from_u32x8 (u32x8_compress (u32x8_from_u16x8 (v), mask));
}
#endif
static_always_inline u64
u64x8_hxor (u64x8 v)
{
  v ^= u64x8_align_right (v, v, 4);
  v ^= u64x8_align_right (v, v, 2);
  return v[0] ^ v[1];
}
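/* Sketch: the two align_right steps fold 8 lanes to 4, then 2, so the
   returned scalar is the xor of all eight u64 lanes, e.g. for hash or
   checksum folding:

     u64 folded = u64x8_hxor (acc);	// 'acc' is hypothetical
 */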
static_always_inline void
u32x16_transpose (u32x16 m[16])
{
  __m512i r[16], a, b, c, d, x, y;

  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13 };
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15 };
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11 };
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15 };

  r[0] = _mm512_unpacklo_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpacklo_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[5] = _mm512_unpacklo_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[6] = _mm512_unpacklo_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[7] = _mm512_unpacklo_epi32 ((__m512i) m[14], (__m512i) m[15]);

  r[8] = _mm512_unpackhi_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[9] = _mm512_unpackhi_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[10] = _mm512_unpackhi_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[11] = _mm512_unpackhi_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[12] = _mm512_unpackhi_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[13] = _mm512_unpackhi_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[14] = _mm512_unpackhi_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[15] = _mm512_unpackhi_epi32 ((__m512i) m[14], (__m512i) m[15]);

  a = _mm512_unpacklo_epi64 (r[0], r[1]);
  b = _mm512_unpacklo_epi64 (r[2], r[3]);
  c = _mm512_unpacklo_epi64 (r[4], r[5]);
  d = _mm512_unpacklo_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[0] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[8] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[4] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[12] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpacklo_epi64 (r[8], r[9]);
  b = _mm512_unpacklo_epi64 (r[10], r[11]);
  c = _mm512_unpacklo_epi64 (r[12], r[13]);
  d = _mm512_unpacklo_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[2] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[10] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[6] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[14] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[0], r[1]);
  b = _mm512_unpackhi_epi64 (r[2], r[3]);
  c = _mm512_unpackhi_epi64 (r[4], r[5]);
  d = _mm512_unpackhi_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[1] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[9] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[5] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[13] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[8], r[9]);
  b = _mm512_unpackhi_epi64 (r[10], r[11]);
  c = _mm512_unpackhi_epi64 (r[12], r[13]);
  d = _mm512_unpackhi_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[3] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[11] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[7] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[15] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
}
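/* Sketch: transposes the 16x16 u32 matrix in place, so afterwards m[i][j]
   holds the old m[j][i], e.g. turning 16 gathered per-packet field vectors
   into 16 per-field vectors:

     u32x16_transpose (m);
 */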
static_always_inline void
u64x8_transpose (u64x8 m[8])
{
  __m512i r[8], x, y;

  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13 };
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15 };
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11 };
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15 };

  r[0] = _mm512_unpacklo_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi64 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpackhi_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[5] = _mm512_unpackhi_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[6] = _mm512_unpackhi_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[7] = _mm512_unpackhi_epi64 ((__m512i) m[6], (__m512i) m[7]);

  x = _mm512_permutex2var_epi64 (r[0], pm1, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm1, r[3]);
  m[0] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[4] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[0], pm2, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm2, r[3]);
  m[2] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[6] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);

  x = _mm512_permutex2var_epi64 (r[4], pm1, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm1, r[7]);
  m[1] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[5] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[4], pm2, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm2, r[7]);
  m[3] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
}
static_always_inline u8x64
u8x64_load_partial (u8 *data, uword n)
{
  return u8x64_mask_load_zero (data, pow2_mask (n));
}

static_always_inline void
u8x64_store_partial (u8x64 r, u8 *data, uword n)
{
  u8x64_mask_store (r, data, pow2_mask (n));
}
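/* Sketch: partial load/store handle the last n bytes of a buffer (n < 64)
   without reading or writing past the end, e.g. for a tail copy where
   len - off < 64 ('data', 'dst', 'off' and 'len' are hypothetical):

     u8x64 tail = u8x64_load_partial (data + off, len - off);
     u8x64_store_partial (tail, dst + off, len - off);
 */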
#endif /* included_vector_avx512_h */
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */