/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_vector_avx512_h
#define included_vector_avx512_h

#include <vppinfra/clib.h>
#include <x86intrin.h>

#define foreach_avx512_vec512i \
  _(i,8,64,epi8) _(i,16,32,epi16) _(i,32,16,epi32) _(i,64,8,epi64)
#define foreach_avx512_vec512u \
  _(u,8,64,epi8) _(u,16,32,epi16) _(u,32,16,epi32) _(u,64,8,epi64)
#define foreach_avx512_vec512f \
  _(f,32,8,ps) _(f,64,4,pd)

/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal, is_zero_mask */
#define _(t, s, c, i) \
  static_always_inline t##s##x##c t##s##x##c##_splat (t##s x) \
  { return (t##s##x##c) _mm512_set1_##i (x); } \
\
  static_always_inline t##s##x##c t##s##x##c##_load_aligned (void *p) \
  { return (t##s##x##c) _mm512_load_si512 (p); } \
\
  static_always_inline void t##s##x##c##_store_aligned (t##s##x##c v, void *p) \
  { _mm512_store_si512 ((__m512i *) p, (__m512i) v); } \
\
  static_always_inline t##s##x##c t##s##x##c##_load_unaligned (void *p) \
  { return (t##s##x##c) _mm512_loadu_si512 (p); } \
\
  static_always_inline void t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
  { _mm512_storeu_si512 ((__m512i *) p, (__m512i) v); } \
\
  static_always_inline int t##s##x##c##_is_all_zero (t##s##x##c v) \
  { return (_mm512_test_epi64_mask ((__m512i) v, (__m512i) v) == 0); } \
\
  static_always_inline int t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
  { return (_mm512_cmpneq_epi64_mask ((__m512i) a, (__m512i) b) == 0); } \
\
  static_always_inline int t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
  { return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
\
  static_always_inline u##c t##s##x##c##_is_zero_mask (t##s##x##c v) \
  { return _mm512_test_##i##_mask ((__m512i) v, (__m512i) v); } \
\
  static_always_inline t##s##x##c t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
  { return (t##s##x##c) _mm512_unpacklo_##i ((__m512i) a, (__m512i) b); } \
\
  static_always_inline t##s##x##c t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
  { return (t##s##x##c) _mm512_unpackhi_##i ((__m512i) a, (__m512i) b); }

foreach_avx512_vec512i foreach_avx512_vec512u
#undef _
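
/* The expansion above generates, for each 512-bit integer vector type
   (e.g. u32x16): _splat, _load_aligned, _store_aligned, _load_unaligned,
   _store_unaligned, _is_all_zero, _is_equal, _is_all_equal, _is_zero_mask,
   _interleave_lo and _interleave_hi.  Illustrative use only:

     u32x16 v = u32x16_splat (7);
     if (u32x16_is_all_equal (v, 7))
       u32x16_store_unaligned (v, dst);
*/
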
static_always_inline u32
u16x32_msb_mask (u16x32 v)
{
  return (u32) _mm512_movepi16_mask ((__m512i) v);
}
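
/* Gather 8 u64 elements: lane i is loaded from (u8 *) base + index[i] * scale
   (scale must be 1, 2, 4 or 8). */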
#define u64x8_i64gather(index, base, scale) \
  (u64x8) _mm512_i64gather_epi64 ((__m512i) index, base, scale)
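
/* Saturating narrowing packs.  As with the underlying instructions, lo/hi
   are interleaved per 128-bit lane, not across the whole 512-bit register. */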
#define _(f, t, fn) \
  always_inline t t##_pack (f lo, f hi) \
  { return (t) fn ((__m512i) lo, (__m512i) hi); }

_ (i16x32, i8x64, _mm512_packs_epi16)
_ (i16x32, u8x64, _mm512_packus_epi16)
_ (i32x16, i16x32, _mm512_packs_epi32)
_ (i32x16, u16x32, _mm512_packus_epi32)
#undef _
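
/* Per-element byte (endian) swaps, implemented with a byte shuffle. */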
static_always_inline u64x8
u64x8_byte_swap (u64x8 v)
{
  u8x64 swap = {
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
  };
  return (u64x8) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

static_always_inline u32x16
u32x16_byte_swap (u32x16 v)
{
  u8x64 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x16) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

static_always_inline u16x32
u16x32_byte_swap (u16x32 v)
{
  u8x64 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x32) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

#define _(f, t) \
  static_always_inline t f##_extract_lo (f v) \
  { return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 0); } \
\
  static_always_inline t f##_extract_hi (f v) \
  { return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 1); }

_ (u64x8, u64x4)
_ (u32x16, u32x8)
_ (u16x32, u16x16)
_ (u8x64, u8x32)
#undef _

static_always_inline u32
u32x16_min_scalar (u32x16 v)
{
  return u32x8_min_scalar (u32x8_min (u32x16_extract_lo (v),
				      u32x16_extract_hi (v)));
}

static_always_inline u32x16
u32x16_insert_lo (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 0);
}

static_always_inline u32x16
u32x16_insert_hi (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 1);
}

static_always_inline u64x8
u64x8_permute (u64x8 a, u64x8 b, u64x8 mask)
{
  return (u64x8) _mm512_permutex2var_epi64 ((__m512i) a, (__m512i) mask,
					    (__m512i) b);
}

#define u32x16_ternary_logic(a, b, c, d) \
  (u32x16) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b, (__m512i) c, d)

#define u8x64_insert_u8x16(a, b, n) \
  (u8x64) _mm512_inserti64x2 ((__m512i) (a), (__m128i) (b), n)

#define u8x64_extract_u8x16(a, n) \
  (u8x16) _mm512_extracti64x2_epi64 ((__m512i) (a), n)

#define u8x64_word_shift_left(a,n)  (u8x64) _mm512_bslli_epi128((__m512i) a, n)
#define u8x64_word_shift_right(a,n) (u8x64) _mm512_bsrli_epi128((__m512i) a, n)
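
/* xor3 (a, b, c) computes a ^ b ^ c in a single vpternlog instruction;
   truth-table immediate 0x96 selects three-way XOR. */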
static_always_inline u8x64
u8x64_xor3 (u8x64 a, u8x64 b, u8x64 c)
{
  return (u8x64) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
					    (__m512i) c, 0x96);
}

static_always_inline u64x8
u64x8_xor3 (u64x8 a, u64x8 b, u64x8 c)
{
  return (u64x8) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
					    (__m512i) c, 0x96);
}
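
/* Reverse the byte order within each 128-bit lane; the shuffle mask
   repeats 15..0 for all four lanes. */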
static_always_inline u8x64
u8x64_reflect_u8x16 (u8x64 x)
{
  static const u8x64 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  };
  return (u8x64) _mm512_shuffle_epi8 ((__m512i) x, (__m512i) mask);
}

#define u8x64_align_right(a, b, imm) \
  (u8x64) _mm512_alignr_epi8 ((__m512i) a, (__m512i) b, imm)

#define u64x8_align_right(a, b, imm) \
  (u64x8) _mm512_alignr_epi64 ((__m512i) a, (__m512i) b, imm)

static_always_inline u32
u32x16_sum_elts (u32x16 sum16)
{
  u32x8 sum8;
  sum16 += (u32x16) u8x64_align_right (sum16, sum16, 8);
  sum16 += (u32x16) u8x64_align_right (sum16, sum16, 4);
  sum8 = u32x16_extract_hi (sum16) + u32x16_extract_lo (sum16);
  return sum8[0] + sum8[4];
}
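
/* Masked load/store helpers generated below: only the elements selected by
   the bit mask are read or written, so it is safe to touch less than a full
   vector.  E.g. (illustrative only):

     u32x16 v = u32x16_mask_load_zero (ptr, pow2_mask (n_elts));

   loads the first n_elts u32s and zeroes the remaining lanes. */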
#define _(t, m, p, i, e) \
  static_always_inline t t##_mask_load (t a, void *p, m mask) \
  { return (t) p##_mask_loadu_##e ((i) a, mask, p); } \
\
  static_always_inline t t##_mask_load_zero (void *p, m mask) \
  { return (t) p##_maskz_loadu_##e (mask, p); } \
\
  static_always_inline void t##_mask_store (t a, void *p, m mask) \
  { p##_mask_storeu_##e (p, mask, (i) a); }

_ (u8x64, u64, _mm512, __m512i, epi8)
_ (u8x32, u32, _mm256, __m256i, epi8)
_ (u8x16, u16, _mm, __m128i, epi8)
_ (u16x32, u32, _mm512, __m512i, epi16)
_ (u16x16, u16, _mm256, __m256i, epi16)
_ (u16x8, u8, _mm, __m128i, epi16)
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u32x4, u8, _mm, __m128i, epi32)
_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u64x2, u8, _mm, __m128i, epi64)
#undef _

#define _(t, m, p, i, e) \
  static_always_inline t t##_mask_and (t a, t b, m mask) \
  { return (t) p##_mask_and_##e ((i) a, mask, (i) a, (i) b); } \
\
  static_always_inline t t##_mask_andnot (t a, t b, m mask) \
  { return (t) p##_mask_andnot_##e ((i) a, mask, (i) a, (i) b); } \
\
  static_always_inline t t##_mask_xor (t a, t b, m mask) \
  { return (t) p##_mask_xor_##e ((i) a, mask, (i) a, (i) b); } \
\
  static_always_inline t t##_mask_or (t a, t b, m mask) \
  { return (t) p##_mask_or_##e ((i) a, mask, (i) a, (i) b); }

_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u32x4, u8, _mm, __m128i, epi32)
_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u64x2, u8, _mm, __m128i, epi64)
#undef _

#ifdef CLIB_HAVE_VEC512
#define CLIB_HAVE_VEC512_MASK_LOAD_STORE
#define CLIB_HAVE_VEC512_MASK_BITWISE_OPS
#endif
#ifdef CLIB_HAVE_VEC256
#define CLIB_HAVE_VEC256_MASK_LOAD_STORE
#define CLIB_HAVE_VEC256_MASK_BITWISE_OPS
#endif
#ifdef CLIB_HAVE_VEC128
#define CLIB_HAVE_VEC128_MASK_LOAD_STORE
#define CLIB_HAVE_VEC128_MASK_BITWISE_OPS
#endif

static_always_inline u8x64
u8x64_splat_u8x16 (u8x16 a)
{
  return (u8x64) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u32x16
u32x16_splat_u32x4 (u32x4 a)
{
  return (u32x16) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u64x8
u64x8_splat_u64x2 (u64x2 a)
{
  return (u64x8) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u32x16
u32x16_mask_blend (u32x16 a, u32x16 b, u16 mask)
{
  return (u32x16) _mm512_mask_blend_epi32 (mask, (__m512i) a, (__m512i) b);
}

static_always_inline u8x64
u8x64_mask_blend (u8x64 a, u8x64 b, u64 mask)
{
  return (u8x64) _mm512_mask_blend_epi8 (mask, (__m512i) a, (__m512i) b);
}

static_always_inline u8x64
u8x64_permute (u8x64 idx, u8x64 a)
{
  return (u8x64) _mm512_permutexvar_epi8 ((__m512i) idx, (__m512i) a);
}

static_always_inline u8x64
u8x64_permute2 (u8x64 idx, u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_permutex2var_epi8 ((__m512i) a, (__m512i) idx,
					   (__m512i) b);
}

#define _(t, m, e, p, it) \
  static_always_inline m t##_is_equal_mask (t a, t b) \
  { return p##_cmpeq_##e##_mask ((it) a, (it) b); }

_ (u8x16, u16, epu8, _mm, __m128i)
_ (u16x8, u8, epu16, _mm, __m128i)
_ (u32x4, u8, epu32, _mm, __m128i)
_ (u64x2, u8, epu64, _mm, __m128i)

_ (u8x32, u32, epu8, _mm256, __m256i)
_ (u16x16, u16, epu16, _mm256, __m256i)
_ (u32x8, u8, epu32, _mm256, __m256i)
_ (u64x4, u8, epu64, _mm256, __m256i)

_ (u8x64, u64, epu8, _mm512, __m512i)
_ (u16x32, u32, epu16, _mm512, __m512i)
_ (u32x16, u16, epu32, _mm512, __m512i)
_ (u64x8, u8, epu64, _mm512, __m512i)
#undef _

#define _(t, m, e, p, it) \
  static_always_inline m t##_is_not_equal_mask (t a, t b) \
  { return p##_cmpneq_##e##_mask ((it) a, (it) b); }

_ (u8x16, u16, epu8, _mm, __m128i)
_ (u16x8, u8, epu16, _mm, __m128i)
_ (u32x4, u8, epu32, _mm, __m128i)
_ (u64x2, u8, epu64, _mm, __m128i)

_ (u8x32, u32, epu8, _mm256, __m256i)
_ (u16x16, u16, epu16, _mm256, __m256i)
_ (u32x8, u8, epu32, _mm256, __m256i)
_ (u64x4, u8, epu64, _mm256, __m256i)

_ (u8x64, u64, epu8, _mm512, __m512i)
_ (u16x32, u32, epu16, _mm512, __m512i)
_ (u32x16, u16, epu32, _mm512, __m512i)
_ (u64x8, u8, epu64, _mm512, __m512i)
#undef _

#define _(f, t, fn, it) \
  static_always_inline t t##_from_##f (f x) { return (t) fn ((it) x); }

_ (u16x16, u32x16, _mm512_cvtepi16_epi32, __m256i)
_ (u32x16, u16x16, _mm512_cvtusepi32_epi16, __m512i)
_ (u32x8, u16x8, _mm256_cvtusepi32_epi16, __m256i)
_ (u32x8, u64x8, _mm512_cvtepu32_epi64, __m256i)
#undef _
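
/* compress moves the elements selected by the mask into the lowest lanes
   (zeroing the rest), expand does the inverse, and compress_store writes
   only the selected elements contiguously to memory. */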
#define _(vt, mt, p, it, epi) \
  static_always_inline vt vt##_compress (vt a, mt mask) \
  { return (vt) p##_maskz_compress_##epi (mask, (it) a); } \
\
  static_always_inline vt vt##_expand (vt a, mt mask) \
  { return (vt) p##_maskz_expand_##epi (mask, (it) a); } \
\
  static_always_inline void vt##_compress_store (vt v, mt mask, void *p) \
  { p##_mask_compressstoreu_##epi (p, mask, (it) v); }

_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u64x2, u8, _mm, __m128i, epi64)
_ (u32x4, u8, _mm, __m128i, epi32)
#ifdef __AVX512VBMI2__
_ (u16x32, u32, _mm512, __m512i, epi16)
_ (u8x64, u64, _mm512, __m512i, epi8)
_ (u16x16, u16, _mm256, __m256i, epi16)
_ (u8x32, u32, _mm256, __m256i, epi8)
_ (u16x8, u8, _mm, __m128i, epi16)
_ (u8x16, u16, _mm, __m128i, epi8)
#endif
#undef _

#ifdef CLIB_HAVE_VEC256
#define CLIB_HAVE_VEC256_COMPRESS
#ifdef __AVX512VBMI2__
#define CLIB_HAVE_VEC256_COMPRESS_U8_U16
#endif
#endif

#ifdef CLIB_HAVE_VEC512
#define CLIB_HAVE_VEC512_COMPRESS
#ifdef __AVX512VBMI2__
#define CLIB_HAVE_VEC512_COMPRESS_U8_U16
#endif
#endif

#ifndef __AVX512VBMI2__
static_always_inline u16x16
u16x16_compress (u16x16 v, u16 mask)
{
  return u16x16_from_u32x16 (u32x16_compress (u32x16_from_u16x16 (v), mask));
}

static_always_inline u16x8
u16x8_compress (u16x8 v, u8 mask)
{
  return u16x8_from_u32x8 (u32x8_compress (u32x8_from_u16x8 (v), mask));
}
#endif

static_always_inline u64
u64x8_hxor (u64x8 v)
{
  v ^= u64x8_align_right (v, v, 4);
  v ^= u64x8_align_right (v, v, 2);
  return v[0] ^ v[1];
}
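
/* In-place 16x16 transpose of 32-bit elements: 32-bit unpacks build 2x2
   blocks, then 64-bit two-source permutes gather the final rows. */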
static_always_inline void
u32x16_transpose (u32x16 m[16])
{
  __m512i r[16], a, b, c, d, x, y;

  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13 };
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15 };
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11 };
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15 };

  r[0] = _mm512_unpacklo_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpacklo_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[5] = _mm512_unpacklo_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[6] = _mm512_unpacklo_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[7] = _mm512_unpacklo_epi32 ((__m512i) m[14], (__m512i) m[15]);

  r[8] = _mm512_unpackhi_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[9] = _mm512_unpackhi_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[10] = _mm512_unpackhi_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[11] = _mm512_unpackhi_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[12] = _mm512_unpackhi_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[13] = _mm512_unpackhi_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[14] = _mm512_unpackhi_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[15] = _mm512_unpackhi_epi32 ((__m512i) m[14], (__m512i) m[15]);

  a = _mm512_unpacklo_epi64 (r[0], r[1]);
  b = _mm512_unpacklo_epi64 (r[2], r[3]);
  c = _mm512_unpacklo_epi64 (r[4], r[5]);
  d = _mm512_unpacklo_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[0] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[8] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[4] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[12] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpacklo_epi64 (r[8], r[9]);
  b = _mm512_unpacklo_epi64 (r[10], r[11]);
  c = _mm512_unpacklo_epi64 (r[12], r[13]);
  d = _mm512_unpacklo_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[2] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[10] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[6] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[14] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[0], r[1]);
  b = _mm512_unpackhi_epi64 (r[2], r[3]);
  c = _mm512_unpackhi_epi64 (r[4], r[5]);
  d = _mm512_unpackhi_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[1] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[9] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[5] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[13] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[8], r[9]);
  b = _mm512_unpackhi_epi64 (r[10], r[11]);
  c = _mm512_unpackhi_epi64 (r[12], r[13]);
  d = _mm512_unpackhi_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[3] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[11] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[7] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[15] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
}

static_always_inline void
u64x8_transpose (u64x8 m[8])
{
  __m512i r[8], x, y;

  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13 };
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15 };
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11 };
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15 };

  r[0] = _mm512_unpacklo_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi64 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpackhi_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[5] = _mm512_unpackhi_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[6] = _mm512_unpackhi_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[7] = _mm512_unpackhi_epi64 ((__m512i) m[6], (__m512i) m[7]);

  x = _mm512_permutex2var_epi64 (r[0], pm1, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm1, r[3]);
  m[0] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[4] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[0], pm2, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm2, r[3]);
  m[2] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[6] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);

  x = _mm512_permutex2var_epi64 (r[4], pm1, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm1, r[7]);
  m[1] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[5] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[4], pm2, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm2, r[7]);
  m[3] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
}
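
/* Partial load/store of up to 64 bytes: pow2_mask (n) selects the first n
   byte lanes, so at most n bytes are read or written. */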
static_always_inline u8x64
u8x64_load_partial (u8 *data, uword n)
{
  return u8x64_mask_load_zero (data, pow2_mask (n));
}

static_always_inline void
u8x64_store_partial (u8x64 r, u8 *data, uword n)
{
  u8x64_mask_store (r, data, pow2_mask (n));
}

#endif /* included_vector_avx512_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */