/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_vector_avx512_h
#define included_vector_avx512_h

#include <vppinfra/clib.h>
#include <x86intrin.h>

#define foreach_avx512_vec512i \
  _(i,8,64,epi8) _(i,16,32,epi16) _(i,32,16,epi32) _(i,64,8,epi64)
#define foreach_avx512_vec512u \
  _(u,8,64,epi8) _(u,16,32,epi16) _(u,32,16,epi32) _(u,64,8,epi64)
#define foreach_avx512_vec512f \
  _(f,32,8,ps) _(f,64,4,pd)

/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal, is_zero_mask */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm512_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm512_loadu_si512 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm512_storeu_si512 ((__m512i *) p, (__m512i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c v) \
{ return (_mm512_test_epi64_mask ((__m512i) v, (__m512i) v) == 0); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
\
static_always_inline u##c \
t##s##x##c##_is_zero_mask (t##s##x##c v) \
{ return _mm512_test_##i##_mask ((__m512i) v, (__m512i) v); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm512_unpacklo_##i ((__m512i) a, (__m512i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm512_unpackhi_##i ((__m512i) a, (__m512i) b); }

foreach_avx512_vec512i foreach_avx512_vec512u
#undef _

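/* The `_' template above is expanded for every signed and unsigned lane
   type, producing e.g. u8x64_splat, u16x32_is_equal, u32x16_is_zero_mask
   and u64x8_interleave_lo.  A minimal, hypothetical use:

     u32x16 v = u32x16_splat (7);
     if (u32x16_is_all_equal (v, 7))
       ;	// always true: every lane holds 7
*/
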
static_always_inline u32
u16x32_msb_mask (u16x32 v)
{
  return (u32) _mm512_movepi16_mask ((__m512i) v);
}

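/* The byte-swap helpers below rely on VPSHUFB, which shuffles within each
   128-bit lane, hence the 16-byte control pattern is repeated four times. */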
static_always_inline u32x16
u32x16_byte_swap (u32x16 v)
{
  u8x64 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x16) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

static_always_inline u16x32
u16x32_byte_swap (u16x32 v)
{
  u8x64 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x32) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

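/* Extract the low or high 256-bit half of a 512-bit vector. */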
static_always_inline u32x8
u32x16_extract_lo (u32x16 v)
{
  return (u32x8) _mm512_extracti64x4_epi64 ((__m512i) v, 0);
}

static_always_inline u32x8
u32x16_extract_hi (u32x16 v)
{
  return (u32x8) _mm512_extracti64x4_epi64 ((__m512i) v, 1);
}

static_always_inline u8x32
u8x64_extract_lo (u8x64 v)
{
  return (u8x32) _mm512_extracti64x4_epi64 ((__m512i) v, 0);
}

static_always_inline u8x32
u8x64_extract_hi (u8x64 v)
{
  return (u8x32) _mm512_extracti64x4_epi64 ((__m512i) v, 1);
}

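/* Horizontal u32 minimum: fold the two 256-bit halves together, then let
   the 256-bit helpers (u32x8_min, u32x8_min_scalar) finish the reduction. */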
static_always_inline u32
u32x16_min_scalar (u32x16 v)
{
  return u32x8_min_scalar (u32x8_min (u32x16_extract_lo (v),
				      u32x16_extract_hi (v)));
}

static_always_inline u32x16
u32x16_insert_lo (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 0);
}

static_always_inline u32x16
u32x16_insert_hi (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 1);
}

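/* Two-source permute (VPERMI2Q): each 64-bit lane of 'mask' indexes the
   concatenation of the two data operands; index values 0-7 select a lane
   of 'a', 8-15 a lane of 'b'. */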
static_always_inline u64x8
u64x8_permute (u64x8 a, u64x8 b, u64x8 mask)
{
  return (u64x8) _mm512_permutex2var_epi64 ((__m512i) a, (__m512i) mask,
					    (__m512i) b);
}

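/* VPTERNLOG: 'd' is an 8-bit truth table applied bitwise to (a, b, c);
   e.g. 0x96 computes a ^ b ^ c, 0xca computes (a & b) | (~a & c). */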
#define u32x16_ternary_logic(a, b, c, d) \
  (u32x16) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b, (__m512i) c, d)

#define u8x64_insert_u8x16(a, b, n) \
  (u8x64) _mm512_inserti64x2 ((__m512i) (a), (__m128i) (b), n)

#define u8x64_extract_u8x16(a, n) \
  (u8x16) _mm512_extracti64x2_epi64 ((__m512i) (a), n)

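/* Byte shifts (VPSLLDQ / VPSRLDQ) operate within each 128-bit lane;
   bytes do not cross lane boundaries. */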
#define u8x64_word_shift_left(a,n)  (u8x64) _mm512_bslli_epi128 ((__m512i) a, n)
#define u8x64_word_shift_right(a,n) (u8x64) _mm512_bsrli_epi128 ((__m512i) a, n)

static_always_inline u8x64
u8x64_xor3 (u8x64 a, u8x64 b, u8x64 c)
{
  /* truth table 0x96 evaluates a ^ b ^ c in a single VPTERNLOG */
  return (u8x64) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
					    (__m512i) c, 0x96);
}

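/* Reverse the byte order within each of the four 16-byte lanes. */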
static_always_inline u8x64
u8x64_reflect_u8x16 (u8x64 x)
{
  static const u8x64 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  };
  return (u8x64) _mm512_shuffle_epi8 ((__m512i) x, (__m512i) mask);
}

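/* Masked byte load/store: mask bit n governs byte lane n; lanes whose bit
   is clear are taken from 'a' on load and left untouched in memory on
   store.  A hypothetical partial copy of n < 64 bytes (pow2_mask from
   vppinfra yields a mask with the n low bits set):

     u64 m = pow2_mask (n);
     u8x64_mask_store (u8x64_mask_load (u8x64_splat (0), src, m), dst, m);
*/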
static_always_inline u8x64
u8x64_mask_load (u8x64 a, void *p, u64 mask)
{
  return (u8x64) _mm512_mask_loadu_epi8 ((__m512i) a, mask, p);
}

static_always_inline void
u8x64_mask_store (u8x64 a, void *p, u64 mask)
{
  _mm512_mask_storeu_epi8 (p, mask, (__m512i) a);
}

static_always_inline u8x64
u8x64_splat_u8x16 (u8x16 a)
{
  return (u8x64) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u32x16
u32x16_splat_u32x4 (u32x4 a)
{
  return (u32x16) _mm512_broadcast_i64x2 ((__m128i) a);
}

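/* Per-element select: result lane n is taken from 'b' when mask bit n is
   set, from 'a' otherwise. */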
static_always_inline u32x16
u32x16_mask_blend (u32x16 a, u32x16 b, u16 mask)
{
  return (u32x16) _mm512_mask_blend_epi32 (mask, (__m512i) a, (__m512i) b);
}

static_always_inline u8x64
u8x64_mask_blend (u8x64 a, u8x64 b, u64 mask)
{
  return (u8x64) _mm512_mask_blend_epi8 (mask, (__m512i) a, (__m512i) b);
}

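/* In-place transpose of a 16x16 matrix of u32: 32-bit and 64-bit unpack
   stages interleave rows, then two rounds of cross-lane 64-bit permutes
   (pm1-pm4) gather the final rows.  A minimal, hypothetical use, with
   u32 *data pointing at 256 contiguous values:

     u32x16 rows[16];
     for (int i = 0; i < 16; i++)
       rows[i] = u32x16_load_unaligned (data + 16 * i);
     u32x16_transpose (rows);	// rows[i] now holds original column i
*/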
static_always_inline void
u32x16_transpose (u32x16 m[16])
{
  __m512i r[16], a, b, c, d, x, y;

  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13 };
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15 };
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11 };
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15 };

  r[0] = _mm512_unpacklo_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpacklo_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[5] = _mm512_unpacklo_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[6] = _mm512_unpacklo_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[7] = _mm512_unpacklo_epi32 ((__m512i) m[14], (__m512i) m[15]);

  r[8] = _mm512_unpackhi_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[9] = _mm512_unpackhi_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[10] = _mm512_unpackhi_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[11] = _mm512_unpackhi_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[12] = _mm512_unpackhi_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[13] = _mm512_unpackhi_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[14] = _mm512_unpackhi_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[15] = _mm512_unpackhi_epi32 ((__m512i) m[14], (__m512i) m[15]);

  a = _mm512_unpacklo_epi64 (r[0], r[1]);
  b = _mm512_unpacklo_epi64 (r[2], r[3]);
  c = _mm512_unpacklo_epi64 (r[4], r[5]);
  d = _mm512_unpacklo_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[0] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[8] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[4] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[12] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpacklo_epi64 (r[8], r[9]);
  b = _mm512_unpacklo_epi64 (r[10], r[11]);
  c = _mm512_unpacklo_epi64 (r[12], r[13]);
  d = _mm512_unpacklo_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[2] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[10] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[6] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[14] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[0], r[1]);
  b = _mm512_unpackhi_epi64 (r[2], r[3]);
  c = _mm512_unpackhi_epi64 (r[4], r[5]);
  d = _mm512_unpackhi_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[1] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[9] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[5] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[13] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[8], r[9]);
  b = _mm512_unpackhi_epi64 (r[10], r[11]);
  c = _mm512_unpackhi_epi64 (r[12], r[13]);
  d = _mm512_unpackhi_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[3] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[11] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[7] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[15] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
}

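/* Same scheme for an 8x8 matrix of u64: one 64-bit unpack stage followed
   by the same two rounds of cross-lane permutes. */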
static_always_inline void
u64x8_transpose (u64x8 m[8])
{
  __m512i r[8], x, y;

  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13 };
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15 };
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11 };
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15 };

  r[0] = _mm512_unpacklo_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi64 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpackhi_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[5] = _mm512_unpackhi_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[6] = _mm512_unpackhi_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[7] = _mm512_unpackhi_epi64 ((__m512i) m[6], (__m512i) m[7]);

  x = _mm512_permutex2var_epi64 (r[0], pm1, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm1, r[3]);
  m[0] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[4] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[0], pm2, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm2, r[3]);
  m[2] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[6] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);

  x = _mm512_permutex2var_epi64 (r[4], pm1, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm1, r[7]);
  m[1] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[5] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[4], pm2, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm2, r[7]);
  m[3] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
}

#endif /* included_vector_avx512_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */