/*
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef included_vector_avx2_h
#define included_vector_avx2_h

#include <vppinfra/clib.h>
#include <x86intrin.h>
#define foreach_avx2_vec256i \
  _(i,8,32,epi8) _(i,16,16,epi16) _(i,32,8,epi32) _(i,64,4,epi64)
#define foreach_avx2_vec256u \
  _(u,8,32,epi8) _(u,16,16,epi16) _(u,32,8,epi32) _(u,64,4,epi64)
#define foreach_avx2_vec256f \
  _(f,32,8,ps) _(f,64,4,pd)

#define _mm256_set1_epi64 _mm256_set1_epi64x
/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal, interleave */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm256_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm256_loadu_si256 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm256_storeu_si256 ((__m256i *) p, (__m256i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c x) \
{ return _mm256_testz_si256 ((__m256i) x, (__m256i) x); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm256_unpacklo_##i ((__m256i) a, (__m256i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm256_unpackhi_##i ((__m256i) a, (__m256i) b); } \

foreach_avx2_vec256i foreach_avx2_vec256u
#undef _
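
/* Example (illustrative sketch, not part of the API): the generated
 * helpers follow the <sign><width>x<count>_<op> naming scheme, e.g.
 * for u32x8:
 *
 *   u32x8 v = u32x8_splat (7);
 *   u32x8_store_unaligned (v, p);
 *   if (u32x8_is_all_equal (u32x8_load_unaligned (p), 7))
 *     ;
 */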

static_always_inline u32x8
u32x8_permute (u32x8 v, u32x8 idx)
{
  return (u32x8) _mm256_permutevar8x32_epi32 ((__m256i) v, (__m256i) idx);
}

#define u64x4_permute(v, m0, m1, m2, m3) \
  (u64x4) _mm256_permute4x64_epi64 ( \
    (__m256i) v, ((m0) | (m1) << 2 | (m2) << 4 | (m3) << 6))
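
/* Example (illustrative): reverse the four 64-bit lanes of v:
 *
 *   u64x4 reversed = u64x4_permute (v, 3, 2, 1, 0);
 */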

/* _extract_lo, _extract_hi, _insert_lo, _insert_hi */
#define _(t1,t2) \
static_always_inline t1 \
t2##_extract_lo (t2 v) \
{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 0); } \
\
static_always_inline t1 \
t2##_extract_hi (t2 v) \
{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 1); } \
\
static_always_inline t2 \
t2##_insert_lo (t2 v1, t1 v2) \
{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 0); }\
\
static_always_inline t2 \
t2##_insert_hi (t2 v1, t1 v2) \
{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 1); }\

_(u8x16, u8x32)
_(u16x8, u16x16)
_(u32x4, u32x8)
_(u64x2, u64x4)
#undef _
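
/* Example (illustrative): duplicate the low 128-bit half of v into
 * its high half:
 *
 *   u32x8 dup = u32x8_insert_hi (v, u32x8_extract_lo (v));
 */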

/* 256 bit packs with saturation */
#define _(f, t, fn)                                                           \
  always_inline t t##_pack (f lo, f hi)                                       \
  {                                                                           \
    return (t) fn ((__m256i) lo, (__m256i) hi);                               \
  }

_ (i16x16, i8x32, _mm256_packs_epi16)
_ (i16x16, u8x32, _mm256_packus_epi16)
_ (i32x8, i16x16, _mm256_packs_epi32)
_ (i32x8, u16x16, _mm256_packus_epi32)

#undef _
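
/* Example (illustrative): u16x16_pack saturates 16 x i32 into 16 x u16.
 * AVX2 pack instructions operate per 128-bit lane, so the result is in
 * lane-interleaved order (a0..a3 b0..b3 a4..a7 b4..b7), not element order:
 *
 *   u16x16 p = u16x16_pack (a, b);
 */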

static_always_inline u32
u8x32_msb_mask (u8x32 v)
{
  return _mm256_movemask_epi8 ((__m256i) v);
}

static_always_inline u32
i8x32_msb_mask (i8x32 v)
{
  return _mm256_movemask_epi8 ((__m256i) v);
}
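
/* Example (illustrative): build a bitmap of zero bytes in v; the cast
 * is needed because a vector comparison yields a signed vector type:
 *
 *   u32 zeros = u8x32_msb_mask ((u8x32) (v == u8x32_splat (0)));
 */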

/* _extend_to_ */
#define _(f,t,i) \
static_always_inline t \
f##_extend_to_##t (f x) \
{ return (t) _mm256_cvt##i ((__m128i) x); }

_ (u16x8, u32x8, epu16_epi32)
_ (u16x8, u64x4, epu16_epi64)
_ (u32x4, u64x4, epu32_epi64)
_ (u8x16, u16x16, epu8_epi16)
_ (u8x16, u32x8, epu8_epi32)
_ (u8x16, u64x4, epu8_epi64)
_ (i16x8, i32x8, epi16_epi32)
_ (i16x8, i64x4, epi16_epi64)
_ (i32x4, i64x4, epi32_epi64)
_ (i8x16, i16x16, epi8_epi16)
_ (i8x16, i32x8, epi8_epi32)
_ (i8x16, i64x4, epi8_epi64)
#undef _
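
/* Example (illustrative; the name follows the f##_extend_to_##t
 * pattern generated above): zero-extend 8 x u16 into 8 x u32:
 *
 *   u32x8 widened = u16x8_extend_to_u32x8 (h);
 */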

static_always_inline u64x4
u64x4_byte_swap (u64x4 v)
{
  u8x32 swap = {
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
  };
  return (u64x4) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}

static_always_inline u32x8
u32x8_byte_swap (u32x8 v)
{
  u8x32 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x8) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}

static_always_inline u16x16
u16x16_byte_swap (u16x16 v)
{
  u8x32 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x16) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}
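
/* Example (illustrative): convert four big-endian (network order) u64
 * values to host order in a single shuffle:
 *
 *   u64x4 host = u64x4_byte_swap (u64x4_load_unaligned (p));
 */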

#define u8x32_align_right(a, b, imm) \
  (u8x32) _mm256_alignr_epi8 ((__m256i) a, (__m256i) b, imm)

#define u64x4_align_right(a, b, imm) \
  (u64x4) _mm256_alignr_epi64 ((__m256i) a, (__m256i) b, imm)

static_always_inline u32
u32x8_sum_elts (u32x8 sum8)
{
  /* fold each 128-bit lane: two pairwise adds leave the lane sum in
     the first element of each lane */
  sum8 += (u32x8) u8x32_align_right (sum8, sum8, 8);
  sum8 += (u32x8) u8x32_align_right (sum8, sum8, 4);
  /* combine the two lane sums */
  return sum8[0] + sum8[4];
}

static_always_inline u32x8
u32x8_hadd (u32x8 v1, u32x8 v2)
{
  return (u32x8) _mm256_hadd_epi32 ((__m256i) v1, (__m256i) v2);
}

static_always_inline u32
u32x8_hxor (u32x8 v)
{
  u32x4 v4;
  v4 = u32x8_extract_lo (v) ^ u32x8_extract_hi (v);
  v4 ^= (u32x4) u8x16_align_right (v4, v4, 8);
  v4 ^= (u32x4) u8x16_align_right (v4, v4, 4);
  return v4[0];
}

static_always_inline u8x32
u8x32_xor3 (u8x32 a, u8x32 b, u8x32 c)
{
#if __AVX512F__
  /* 0x96 is the truth table for a ^ b ^ c */
  return (u8x32) _mm256_ternarylogic_epi32 ((__m256i) a, (__m256i) b,
					    (__m256i) c, 0x96);
#endif
  return a ^ b ^ c;
}

static_always_inline u8x32
u8x32_reflect_u8x16 (u8x32 x)
{
  static const u8x32 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  };
  return (u8x32) _mm256_shuffle_epi8 ((__m256i) x, (__m256i) mask);
}

static_always_inline u16x16
u16x16_mask_last (u16x16 v, u8 n_last)
{
  const u16x16 masks[17] = {
    {0},
    {-1},
    {-1, -1},
    {-1, -1, -1},
    {-1, -1, -1, -1},
    {-1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
  };

  ASSERT (n_last < 17);

  return v & masks[16 - n_last];
}
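
/* Example (illustrative): zero the last 3 of the 16 elements, keeping
 * elements 0..12 intact:
 *
 *   v = u16x16_mask_last (v, 3);
 */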

static_always_inline f32x8
f32x8_from_u32x8 (u32x8 v)
{
  return (f32x8) _mm256_cvtepi32_ps ((__m256i) v);
}

static_always_inline u32x8
u32x8_from_f32x8 (f32x8 v)
{
  return (u32x8) _mm256_cvttps_epi32 ((__m256) v);
}

#define u32x8_blend(a,b,m) \
  (u32x8) _mm256_blend_epi32 ((__m256i) a, (__m256i) b, m)

#define u16x16_blend(v1, v2, mask) \
  (u16x16) _mm256_blend_epi16 ((__m256i) (v1), (__m256i) (v2), mask)

static_always_inline u64x4
u64x4_gather (void *p0, void *p1, void *p2, void *p3)
{
  u64x4 r = {
    *(u64 *) p0, *(u64 *) p1, *(u64 *) p2, *(u64 *) p3
  };
  return r;
}

static_always_inline u32x8
u32x8_gather (void *p0, void *p1, void *p2, void *p3, void *p4, void *p5,
	      void *p6, void *p7)
{
  u32x8 r = {
    *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3,
    *(u32 *) p4, *(u32 *) p5, *(u32 *) p6, *(u32 *) p7,
  };
  return r;
}

static_always_inline void
u64x4_scatter (u64x4 r, void *p0, void *p1, void *p2, void *p3)
{
  *(u64 *) p0 = r[0];
  *(u64 *) p1 = r[1];
  *(u64 *) p2 = r[2];
  *(u64 *) p3 = r[3];
}

static_always_inline void
u32x8_scatter (u32x8 r, void *p0, void *p1, void *p2, void *p3, void *p4,
	       void *p5, void *p6, void *p7)
{
  *(u32 *) p0 = r[0];
  *(u32 *) p1 = r[1];
  *(u32 *) p2 = r[2];
  *(u32 *) p3 = r[3];
  *(u32 *) p4 = r[4];
  *(u32 *) p5 = r[5];
  *(u32 *) p6 = r[6];
  *(u32 *) p7 = r[7];
}

static_always_inline void
u64x4_scatter_one (u64x4 r, int index, void *p)
{
  *(u64 *) p = r[index];
}

static_always_inline void
u32x8_scatter_one (u32x8 r, int index, void *p)
{
  *(u32 *) p = r[index];
}

#define u32x8_gather_u32(base, indices, scale) \
  (u32x8) _mm256_i32gather_epi32 ((const int *) base, (__m256i) indices, scale)

#ifdef __AVX512F__
#define u32x8_scatter_u32(base, indices, v, scale)                            \
  _mm256_i32scatter_epi32 (base, (__m256i) indices, (__m256i) v, scale)
#else
#define u32x8_scatter_u32(base, indices, v, scale)                            \
  for (u32 i = 0; i < 8; i++)                                                 \
    *((u32u *) ((u8 *) base + (scale) * (indices)[i])) = (v)[i];
#endif
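
/* Example (illustrative; `arr` and `idx` are hypothetical): gather
 * arr[idx[i]] for 8 element indices (scale 4 = sizeof (u32)), then
 * write the values back to the same slots:
 *
 *   u32x8 idx = { 0, 2, 4, 6, 8, 10, 12, 14 };
 *   u32x8 v = u32x8_gather_u32 (arr, idx, 4);
 *   u32x8_scatter_u32 (arr, idx, v, 4);
 */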

static_always_inline u8x32
u8x32_blend (u8x32 v1, u8x32 v2, u8x32 mask)
{
  return (u8x32) _mm256_blendv_epi8 ((__m256i) v1, (__m256i) v2,
				     (__m256i) mask);
}

#define u8x32_word_shift_left(a, n) \
  (u8x32) _mm256_bslli_epi128 ((__m256i) a, n)
#define u8x32_word_shift_right(a, n) \
  (u8x32) _mm256_bsrli_epi128 ((__m256i) a, n)

#define u32x8_permute_lanes(a, b, m) \
  (u32x8) _mm256_permute2x128_si256 ((__m256i) a, (__m256i) b, m)
#define u64x4_permute_lanes(a, b, m) \
  (u64x4) _mm256_permute2x128_si256 ((__m256i) a, (__m256i) b, m)

static_always_inline u32x8
u32x8_min (u32x8 a, u32x8 b)
{
  return (u32x8) _mm256_min_epu32 ((__m256i) a, (__m256i) b);
}

static_always_inline u32
u32x8_min_scalar (u32x8 v)
{
  return u32x4_min_scalar (u32x4_min (u32x8_extract_lo (v),
				      u32x8_extract_hi (v)));
}

static_always_inline void
u32x8_transpose (u32x8 a[8])
{
  u64x4 r[8], x, y;

  r[0] = (u64x4) u32x8_interleave_lo (a[0], a[1]);
  r[1] = (u64x4) u32x8_interleave_hi (a[0], a[1]);
  r[2] = (u64x4) u32x8_interleave_lo (a[2], a[3]);
  r[3] = (u64x4) u32x8_interleave_hi (a[2], a[3]);
  r[4] = (u64x4) u32x8_interleave_lo (a[4], a[5]);
  r[5] = (u64x4) u32x8_interleave_hi (a[4], a[5]);
  r[6] = (u64x4) u32x8_interleave_lo (a[6], a[7]);
  r[7] = (u64x4) u32x8_interleave_hi (a[6], a[7]);

  x = u64x4_interleave_lo (r[0], r[2]);
  y = u64x4_interleave_lo (r[4], r[6]);
  a[0] = u32x8_permute_lanes (x, y, 0x20);
  a[4] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_hi (r[0], r[2]);
  y = u64x4_interleave_hi (r[4], r[6]);
  a[1] = u32x8_permute_lanes (x, y, 0x20);
  a[5] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_lo (r[1], r[3]);
  y = u64x4_interleave_lo (r[5], r[7]);
  a[2] = u32x8_permute_lanes (x, y, 0x20);
  a[6] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_hi (r[1], r[3]);
  y = u64x4_interleave_hi (r[5], r[7]);
  a[3] = u32x8_permute_lanes (x, y, 0x20);
  a[7] = u32x8_permute_lanes (x, y, 0x31);
}
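
/* Example (illustrative sketch): treating `rows` as an 8x8 matrix of
 * u32, transpose it in place so rows[i][j] becomes rows[j][i]:
 *
 *   u32x8 rows[8];
 *   u32x8_transpose (rows);
 */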

static_always_inline void
u64x4_transpose (u64x4 a[8])
{
  u64x4 r[4];

  r[0] = u64x4_interleave_lo (a[0], a[1]);
  r[1] = u64x4_interleave_hi (a[0], a[1]);
  r[2] = u64x4_interleave_lo (a[2], a[3]);
  r[3] = u64x4_interleave_hi (a[2], a[3]);

  a[0] = u64x4_permute_lanes (r[0], r[2], 0x20);
  a[1] = u64x4_permute_lanes (r[1], r[3], 0x20);
  a[2] = u64x4_permute_lanes (r[0], r[2], 0x31);
  a[3] = u64x4_permute_lanes (r[1], r[3], 0x31);
}

static_always_inline u8x32
u8x32_splat_u8x16 (u8x16 a)
{
  return (u8x32) _mm256_broadcastsi128_si256 ((__m128i) a);
}

static_always_inline u32x8
u32x8_splat_u32x4 (u32x4 a)
{
  return (u32x8) _mm256_broadcastsi128_si256 ((__m128i) a);
}

static_always_inline u64x4
u64x4_splat_u64x2 (u64x2 a)
{
  return (u64x4) _mm256_broadcastsi128_si256 ((__m128i) a);
}

static_always_inline u8x32
u8x32_load_partial (u8 *data, uword n)
{
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
  return u8x32_mask_load_zero (data, pow2_mask (n));
#else
  u8x32 r = {};
  if (n > 16)
    {
      r = u8x32_insert_lo (r, *(u8x16u *) data);
      r = u8x32_insert_hi (r, u8x16_load_partial (data + 16, n - 16));
    }
  else
    r = u8x32_insert_lo (r, u8x16_load_partial (data, n));
  return r;
#endif
}

static_always_inline void
u8x32_store_partial (u8x32 r, u8 *data, uword n)
{
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
  u8x32_mask_store (r, data, pow2_mask (n));
#else
  if (n > 16)
    {
      *(u8x16u *) data = u8x32_extract_lo (r);
      u8x16_store_partial (u8x32_extract_hi (r), data + 16, n - 16);
    }
  else
    u8x16_store_partial (u8x32_extract_lo (r), data, n);
#endif
}
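
/* Example (illustrative; `src` and `dst` are hypothetical): copy an
 * n-byte region (n <= 32) without reading or writing bytes past the
 * valid range:
 *
 *   u8x32_store_partial (u8x32_load_partial (src, n), dst, n);
 */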

#endif /* included_vector_avx2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */