/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef included_vector_avx512_h
#define included_vector_avx512_h

#include <vppinfra/clib.h>
#include <x86intrin.h>
/* Iterate _(sign, bits, lane_count, intrinsic_suffix) over every 512-bit
   vector type: signed, unsigned and floating point.  */
#define foreach_avx512_vec512i \
  _(i,8,64,epi8) _(i,16,32,epi16) _(i,32,16,epi32) _(i,64,8,epi64)
#define foreach_avx512_vec512u \
  _(u,8,64,epi8) _(u,16,32,epi16) _(u,32,16,epi32) _(u,64,8,epi64)
#define foreach_avx512_vec512f \
  _(f,32,8,ps) _(f,64,4,pd)
30 /* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
31 is_all_equal, is_zero_mask */
32 #define _(t, s, c, i) \
33 static_always_inline t##s##x##c \
34 t##s##x##c##_splat (t##s x) \
35 { return (t##s##x##c) _mm512_set1_##i (x); } \
37 static_always_inline t##s##x##c \
38 t##s##x##c##_load_unaligned (void *p) \
39 { return (t##s##x##c) _mm512_loadu_si512 (p); } \
41 static_always_inline void \
42 t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
43 { _mm512_storeu_si512 ((__m512i *) p, (__m512i) v); } \
45 static_always_inline int \
46 t##s##x##c##_is_all_zero (t##s##x##c v) \
47 { return (_mm512_test_epi64_mask ((__m512i) v, (__m512i) v) == 0); } \
49 static_always_inline int \
50 t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
51 { return t##s##x##c##_is_all_zero (a ^ b); } \
53 static_always_inline int \
54 t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
55 { return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
57 static_always_inline u##c \
58 t##s##x##c##_is_zero_mask (t##s##x##c v) \
59 { return _mm512_test_##i##_mask ((__m512i) v, (__m512i) v); } \
61 static_always_inline t##s##x##c \
62 t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
63 { return (t##s##x##c) _mm512_unpacklo_##i ((__m512i) a, (__m512i) b); } \
65 static_always_inline t##s##x##c \
66 t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
67 { return (t##s##x##c) _mm512_unpackhi_##i ((__m512i) a, (__m512i) b); } \
70 foreach_avx512_vec512i foreach_avx512_vec512u
74 static_always_inline u32
75 u16x32_msb_mask (u16x32 v)
77 return (u32) _mm512_movepi16_mask ((__m512i) v);
80 static_always_inline u32x16
81 u32x16_byte_swap (u32x16 v)
84 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
85 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
86 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
87 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
89 return (u32x16) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
92 static_always_inline u16x32
93 u16x32_byte_swap (u16x32 v)
96 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
97 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
98 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
99 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
101 return (u16x32) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
104 static_always_inline u32x8
105 u32x16_extract_lo (u32x16 v)
107 return (u32x8) _mm512_extracti64x4_epi64 ((__m512i) v, 0);
110 static_always_inline u32x8
111 u32x16_extract_hi (u32x16 v)
113 return (u32x8) _mm512_extracti64x4_epi64 ((__m512i) v, 1);
116 static_always_inline u32
117 u32x16_min_scalar (u32x16 v)
119 return u32x8_min_scalar (u32x8_min (u32x16_extract_lo (v),
120 u32x16_extract_hi (v)));
123 static_always_inline u32x16
124 u32x16_insert_lo (u32x16 r, u32x8 v)
126 return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 0);
129 static_always_inline u32x16
130 u32x16_insert_hi (u32x16 r, u32x8 v)
132 return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 1);
135 static_always_inline u64x8
136 u64x8_permute (u64x8 a, u64x8 b, u64x8 mask)
138 return (u64x8) _mm512_permutex2var_epi64 ((__m512i) a, (__m512i) mask,
/* Per-bit ternary logic on 3 vectors: result bit = bit d of the truth table
   indexed by the corresponding bits of a, b, c.  d must be a compile-time
   constant (imm8).  Arguments are parenthesized to avoid expansion
   precedence surprises.  */
#define u32x16_ternary_logic(a, b, c, d) \
  (u32x16) _mm512_ternarylogic_epi32 ((__m512i) (a), (__m512i) (b), \
				      (__m512i) (c), (d))
146 static_always_inline void
147 u32x16_transpose (u32x16 m[16])
149 __m512i r[16], a, b, c, d, x, y;
152 __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
153 __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
154 __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
155 __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
158 r[0] = _mm512_unpacklo_epi32 ((__m512i) m[0], (__m512i) m[1]);
159 r[1] = _mm512_unpacklo_epi32 ((__m512i) m[2], (__m512i) m[3]);
160 r[2] = _mm512_unpacklo_epi32 ((__m512i) m[4], (__m512i) m[5]);
161 r[3] = _mm512_unpacklo_epi32 ((__m512i) m[6], (__m512i) m[7]);
162 r[4] = _mm512_unpacklo_epi32 ((__m512i) m[8], (__m512i) m[9]);
163 r[5] = _mm512_unpacklo_epi32 ((__m512i) m[10], (__m512i) m[11]);
164 r[6] = _mm512_unpacklo_epi32 ((__m512i) m[12], (__m512i) m[13]);
165 r[7] = _mm512_unpacklo_epi32 ((__m512i) m[14], (__m512i) m[15]);
167 r[8] = _mm512_unpackhi_epi32 ((__m512i) m[0], (__m512i) m[1]);
168 r[9] = _mm512_unpackhi_epi32 ((__m512i) m[2], (__m512i) m[3]);
169 r[10] = _mm512_unpackhi_epi32 ((__m512i) m[4], (__m512i) m[5]);
170 r[11] = _mm512_unpackhi_epi32 ((__m512i) m[6], (__m512i) m[7]);
171 r[12] = _mm512_unpackhi_epi32 ((__m512i) m[8], (__m512i) m[9]);
172 r[13] = _mm512_unpackhi_epi32 ((__m512i) m[10], (__m512i) m[11]);
173 r[14] = _mm512_unpackhi_epi32 ((__m512i) m[12], (__m512i) m[13]);
174 r[15] = _mm512_unpackhi_epi32 ((__m512i) m[14], (__m512i) m[15]);
176 a = _mm512_unpacklo_epi64 (r[0], r[1]);
177 b = _mm512_unpacklo_epi64 (r[2], r[3]);
178 c = _mm512_unpacklo_epi64 (r[4], r[5]);
179 d = _mm512_unpacklo_epi64 (r[6], r[7]);
180 x = _mm512_permutex2var_epi64 (a, pm1, b);
181 y = _mm512_permutex2var_epi64 (c, pm1, d);
182 m[0] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
183 m[8] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
184 x = _mm512_permutex2var_epi64 (a, pm2, b);
185 y = _mm512_permutex2var_epi64 (c, pm2, d);
186 m[4] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
187 m[12] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
189 a = _mm512_unpacklo_epi64 (r[8], r[9]);
190 b = _mm512_unpacklo_epi64 (r[10], r[11]);
191 c = _mm512_unpacklo_epi64 (r[12], r[13]);
192 d = _mm512_unpacklo_epi64 (r[14], r[15]);
193 x = _mm512_permutex2var_epi64 (a, pm1, b);
194 y = _mm512_permutex2var_epi64 (c, pm1, d);
195 m[2] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
196 m[10] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
197 x = _mm512_permutex2var_epi64 (a, pm2, b);
198 y = _mm512_permutex2var_epi64 (c, pm2, d);
199 m[6] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
200 m[14] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
202 a = _mm512_unpackhi_epi64 (r[0], r[1]);
203 b = _mm512_unpackhi_epi64 (r[2], r[3]);
204 c = _mm512_unpackhi_epi64 (r[4], r[5]);
205 d = _mm512_unpackhi_epi64 (r[6], r[7]);
206 x = _mm512_permutex2var_epi64 (a, pm1, b);
207 y = _mm512_permutex2var_epi64 (c, pm1, d);
208 m[1] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
209 m[9] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
210 x = _mm512_permutex2var_epi64 (a, pm2, b);
211 y = _mm512_permutex2var_epi64 (c, pm2, d);
212 m[5] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
213 m[13] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
215 a = _mm512_unpackhi_epi64 (r[8], r[9]);
216 b = _mm512_unpackhi_epi64 (r[10], r[11]);
217 c = _mm512_unpackhi_epi64 (r[12], r[13]);
218 d = _mm512_unpackhi_epi64 (r[14], r[15]);
219 x = _mm512_permutex2var_epi64 (a, pm1, b);
220 y = _mm512_permutex2var_epi64 (c, pm1, d);
221 m[3] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
222 m[11] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
223 x = _mm512_permutex2var_epi64 (a, pm2, b);
224 y = _mm512_permutex2var_epi64 (c, pm2, d);
225 m[7] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
226 m[15] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
231 static_always_inline void
232 u64x8_transpose (u64x8 m[8])
237 __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
238 __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
239 __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
240 __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
243 r[0] = _mm512_unpacklo_epi64 ((__m512i) m[0], (__m512i) m[1]);
244 r[1] = _mm512_unpacklo_epi64 ((__m512i) m[2], (__m512i) m[3]);
245 r[2] = _mm512_unpacklo_epi64 ((__m512i) m[4], (__m512i) m[5]);
246 r[3] = _mm512_unpacklo_epi64 ((__m512i) m[6], (__m512i) m[7]);
247 r[4] = _mm512_unpackhi_epi64 ((__m512i) m[0], (__m512i) m[1]);
248 r[5] = _mm512_unpackhi_epi64 ((__m512i) m[2], (__m512i) m[3]);
249 r[6] = _mm512_unpackhi_epi64 ((__m512i) m[4], (__m512i) m[5]);
250 r[7] = _mm512_unpackhi_epi64 ((__m512i) m[6], (__m512i) m[7]);
252 x = _mm512_permutex2var_epi64 (r[0], pm1, r[1]);
253 y = _mm512_permutex2var_epi64 (r[2], pm1, r[3]);
254 m[0] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
255 m[4] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
256 x = _mm512_permutex2var_epi64 (r[0], pm2, r[1]);
257 y = _mm512_permutex2var_epi64 (r[2], pm2, r[3]);
258 m[2] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
259 m[6] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
261 x = _mm512_permutex2var_epi64 (r[4], pm1, r[5]);
262 y = _mm512_permutex2var_epi64 (r[6], pm1, r[7]);
263 m[1] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
264 m[5] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
265 x = _mm512_permutex2var_epi64 (r[4], pm2, r[5]);
266 y = _mm512_permutex2var_epi64 (r[6], pm2, r[7]);
267 m[3] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
268 m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
#endif /* included_vector_avx512_h */
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */