/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 #ifndef included_vector_avx512_h
17 #define included_vector_avx512_h
19 #include <vppinfra/clib.h>
20 #include <x86intrin.h>
/* X-macro drivers: each expands a user-supplied macro
   _(sign, element-bits, lane-count, intrinsic-suffix)
   once per 512-bit vector flavor.  */

/* Signed integer element types: i8x64, i16x32, i32x16, i64x8.  */
#define foreach_avx512_vec512i \
  _(i,8,64,epi8) _(i,16,32,epi16) _(i,32,16,epi32) _(i,64,8,epi64)
/* Unsigned integer element types: u8x64, u16x32, u32x16, u64x8.  */
#define foreach_avx512_vec512u \
  _(u,8,64,epi8) _(u,16,32,epi16) _(u,32,16,epi32) _(u,64,8,epi64)
/* Floating-point element types: f32x16 (ps), f64x8 (pd).  */
#define foreach_avx512_vec512f \
  _(f,32,8,ps) _(f,64,4,pd)
30 /* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
31 is_all_equal, is_zero_mask */
32 #define _(t, s, c, i) \
33 static_always_inline t##s##x##c \
34 t##s##x##c##_splat (t##s x) \
35 { return (t##s##x##c) _mm512_set1_##i (x); } \
37 static_always_inline t##s##x##c \
38 t##s##x##c##_load_unaligned (void *p) \
39 { return (t##s##x##c) _mm512_loadu_si512 (p); } \
41 static_always_inline void \
42 t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
43 { _mm512_storeu_si512 ((__m512i *) p, (__m512i) v); } \
45 static_always_inline int \
46 t##s##x##c##_is_all_zero (t##s##x##c v) \
47 { return (_mm512_test_epi64_mask ((__m512i) v, (__m512i) v) == 0); } \
49 static_always_inline int \
50 t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
51 { return t##s##x##c##_is_all_zero (a ^ b); } \
53 static_always_inline int \
54 t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
55 { return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
57 static_always_inline u##c \
58 t##s##x##c##_is_zero_mask (t##s##x##c v) \
59 { return _mm512_test_##i##_mask ((__m512i) v, (__m512i) v); } \
62 foreach_avx512_vec512i foreach_avx512_vec512u
66 static_always_inline u32
67 u16x32_msb_mask (u16x32 v)
69 return (u32) _mm512_movepi16_mask ((__m512i) v);
72 static_always_inline u32x16
73 u32x16_byte_swap (u32x16 v)
76 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
77 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
78 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
79 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
81 return (u32x16) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
84 static_always_inline u16x32
85 u16x32_byte_swap (u16x32 v)
88 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
89 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
90 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
91 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
93 return (u16x32) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
96 static_always_inline u32x8
97 u32x16_extract_lo (u32x16 v)
99 return (u32x8) _mm512_extracti64x4_epi64 ((__m512i) v, 0);
102 static_always_inline u32x8
103 u32x16_extract_hi (u32x16 v)
105 return (u32x8) _mm512_extracti64x4_epi64 ((__m512i) v, 1);
108 static_always_inline u32
109 u32x16_min_scalar (u32x16 v)
111 return u32x8_min_scalar (u32x8_min (u32x16_extract_lo (v),
112 u32x16_extract_hi (v)));
/* Three-input bitwise logic (vpternlogd): d is an 8-bit truth table
   applied bit-by-bit as f(a, b, c).  Arguments are parenthesized so that
   expression arguments (e.g. x | y) are cast as a whole — previously the
   cast bound only to the first operand of such an expression.  */
#define u32x16_ternary_logic(a, b, c, d) \
  (u32x16) _mm512_ternarylogic_epi32 ((__m512i) (a), (__m512i) (b), \
				      (__m512i) (c), (d))
119 static_always_inline void
120 u32x16_transpose (u32x16 m[16])
122 __m512i r[16], a, b, c, d, x, y;
125 __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
126 __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
127 __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
128 __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
131 r[0] = _mm512_unpacklo_epi32 ((__m512i) m[0], (__m512i) m[1]);
132 r[1] = _mm512_unpacklo_epi32 ((__m512i) m[2], (__m512i) m[3]);
133 r[2] = _mm512_unpacklo_epi32 ((__m512i) m[4], (__m512i) m[5]);
134 r[3] = _mm512_unpacklo_epi32 ((__m512i) m[6], (__m512i) m[7]);
135 r[4] = _mm512_unpacklo_epi32 ((__m512i) m[8], (__m512i) m[9]);
136 r[5] = _mm512_unpacklo_epi32 ((__m512i) m[10], (__m512i) m[11]);
137 r[6] = _mm512_unpacklo_epi32 ((__m512i) m[12], (__m512i) m[13]);
138 r[7] = _mm512_unpacklo_epi32 ((__m512i) m[14], (__m512i) m[15]);
140 r[8] = _mm512_unpackhi_epi32 ((__m512i) m[0], (__m512i) m[1]);
141 r[9] = _mm512_unpackhi_epi32 ((__m512i) m[2], (__m512i) m[3]);
142 r[10] = _mm512_unpackhi_epi32 ((__m512i) m[4], (__m512i) m[5]);
143 r[11] = _mm512_unpackhi_epi32 ((__m512i) m[6], (__m512i) m[7]);
144 r[12] = _mm512_unpackhi_epi32 ((__m512i) m[8], (__m512i) m[9]);
145 r[13] = _mm512_unpackhi_epi32 ((__m512i) m[10], (__m512i) m[11]);
146 r[14] = _mm512_unpackhi_epi32 ((__m512i) m[12], (__m512i) m[13]);
147 r[15] = _mm512_unpackhi_epi32 ((__m512i) m[14], (__m512i) m[15]);
149 a = _mm512_unpacklo_epi64 (r[0], r[1]);
150 b = _mm512_unpacklo_epi64 (r[2], r[3]);
151 c = _mm512_unpacklo_epi64 (r[4], r[5]);
152 d = _mm512_unpacklo_epi64 (r[6], r[7]);
153 x = _mm512_permutex2var_epi64 (a, pm1, b);
154 y = _mm512_permutex2var_epi64 (c, pm1, d);
155 m[0] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
156 m[8] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
157 x = _mm512_permutex2var_epi64 (a, pm2, b);
158 y = _mm512_permutex2var_epi64 (c, pm2, d);
159 m[4] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
160 m[12] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
162 a = _mm512_unpacklo_epi64 (r[8], r[9]);
163 b = _mm512_unpacklo_epi64 (r[10], r[11]);
164 c = _mm512_unpacklo_epi64 (r[12], r[13]);
165 d = _mm512_unpacklo_epi64 (r[14], r[15]);
166 x = _mm512_permutex2var_epi64 (a, pm1, b);
167 y = _mm512_permutex2var_epi64 (c, pm1, d);
168 m[2] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
169 m[10] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
170 x = _mm512_permutex2var_epi64 (a, pm2, b);
171 y = _mm512_permutex2var_epi64 (c, pm2, d);
172 m[6] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
173 m[14] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
175 a = _mm512_unpackhi_epi64 (r[0], r[1]);
176 b = _mm512_unpackhi_epi64 (r[2], r[3]);
177 c = _mm512_unpackhi_epi64 (r[4], r[5]);
178 d = _mm512_unpackhi_epi64 (r[6], r[7]);
179 x = _mm512_permutex2var_epi64 (a, pm1, b);
180 y = _mm512_permutex2var_epi64 (c, pm1, d);
181 m[1] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
182 m[9] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
183 x = _mm512_permutex2var_epi64 (a, pm2, b);
184 y = _mm512_permutex2var_epi64 (c, pm2, d);
185 m[5] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
186 m[13] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
188 a = _mm512_unpackhi_epi64 (r[8], r[9]);
189 b = _mm512_unpackhi_epi64 (r[10], r[11]);
190 c = _mm512_unpackhi_epi64 (r[12], r[13]);
191 d = _mm512_unpackhi_epi64 (r[14], r[15]);
192 x = _mm512_permutex2var_epi64 (a, pm1, b);
193 y = _mm512_permutex2var_epi64 (c, pm1, d);
194 m[3] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
195 m[11] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
196 x = _mm512_permutex2var_epi64 (a, pm2, b);
197 y = _mm512_permutex2var_epi64 (c, pm2, d);
198 m[7] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
199 m[15] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
204 static_always_inline void
205 u64x8_transpose (u64x8 m[8])
210 __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
211 __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
212 __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
213 __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
216 r[0] = _mm512_unpacklo_epi64 ((__m512i) m[0], (__m512i) m[1]);
217 r[1] = _mm512_unpacklo_epi64 ((__m512i) m[2], (__m512i) m[3]);
218 r[2] = _mm512_unpacklo_epi64 ((__m512i) m[4], (__m512i) m[5]);
219 r[3] = _mm512_unpacklo_epi64 ((__m512i) m[6], (__m512i) m[7]);
220 r[4] = _mm512_unpackhi_epi64 ((__m512i) m[0], (__m512i) m[1]);
221 r[5] = _mm512_unpackhi_epi64 ((__m512i) m[2], (__m512i) m[3]);
222 r[6] = _mm512_unpackhi_epi64 ((__m512i) m[4], (__m512i) m[5]);
223 r[7] = _mm512_unpackhi_epi64 ((__m512i) m[6], (__m512i) m[7]);
225 x = _mm512_permutex2var_epi64 (r[0], pm1, r[1]);
226 y = _mm512_permutex2var_epi64 (r[2], pm1, r[3]);
227 m[0] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
228 m[4] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
229 x = _mm512_permutex2var_epi64 (r[0], pm2, r[1]);
230 y = _mm512_permutex2var_epi64 (r[2], pm2, r[3]);
231 m[2] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
232 m[6] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
234 x = _mm512_permutex2var_epi64 (r[4], pm1, r[5]);
235 y = _mm512_permutex2var_epi64 (r[6], pm1, r[7]);
236 m[1] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
237 m[5] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
238 x = _mm512_permutex2var_epi64 (r[4], pm2, r[5]);
239 y = _mm512_permutex2var_epi64 (r[6], pm2, r[7]);
240 m[3] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
241 m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
244 #endif /* included_vector_avx512_h */
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */