/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h>	/* for ASSERT */
#include <x86intrin.h>

#define foreach_sse42_vec128i \
  _(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64x)
#define foreach_sse42_vec128u \
  _(u,8,16,epi8) _(u,16,8,epi16) _(u,32,4,epi32) _(u,64,2,epi64x)
#define foreach_sse42_vec128f \
  _(f,32,4,ps) _(f,64,2,pd)

/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal */
#define _(t, s, c, i) \
static_always_inline t##s##x##c					\
t##s##x##c##_splat (t##s x)					\
{ return (t##s##x##c) _mm_set1_##i (x); }			\
								\
static_always_inline t##s##x##c					\
t##s##x##c##_load_unaligned (void *p)				\
{ return (t##s##x##c) _mm_loadu_si128 (p); }			\
								\
static_always_inline void					\
t##s##x##c##_store_unaligned (t##s##x##c v, void *p)		\
{ _mm_storeu_si128 ((__m128i *) p, (__m128i) v); }		\
								\
static_always_inline int					\
t##s##x##c##_is_all_zero (t##s##x##c x)				\
{ return _mm_testz_si128 ((__m128i) x, (__m128i) x); }		\
								\
static_always_inline int					\
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b)		\
{ return t##s##x##c##_is_all_zero (a ^ b); }			\
								\
static_always_inline int					\
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x)		\
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); }

foreach_sse42_vec128i foreach_sse42_vec128u
#undef _

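/* Usage sketch (illustrative only; the clib_example_* names in this
   file are hypothetical helpers, not part of this API): the generated
   accessors compose naturally, e.g. testing whether 16 bytes at an
   arbitrary address are all zero. */
static_always_inline int
clib_example_16_bytes_are_zero (void *p)
{
  return u8x16_is_all_zero (u8x16_load_unaligned (p));
}
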
/* min, max (the epi64/epu64 min/max intrinsics need AVX-512VL, so the
   64-bit types are not covered by this SSE4.2 header). */
#define _(t, s, c, i) \
static_always_inline t##s##x##c					\
t##s##x##c##_min (t##s##x##c a, t##s##x##c b)			\
{ return (t##s##x##c) _mm_min_##i ((__m128i) a, (__m128i) b); }	\
								\
static_always_inline t##s##x##c					\
t##s##x##c##_max (t##s##x##c a, t##s##x##c b)			\
{ return (t##s##x##c) _mm_max_##i ((__m128i) a, (__m128i) b); }

_(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32)
_(u,8,16,epu8) _(u,16,8,epu16) _(u,32,4,epu32)
#undef _

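/* Usage sketch (illustrative): clamp each unsigned byte lane into
   [lo, hi] with one min and one max. */
static_always_inline u8x16
clib_example_u8x16_clamp (u8x16 v, u8 lo, u8 hi)
{
  return u8x16_max (u8x16_splat (lo), u8x16_min (v, u8x16_splat (hi)));
}
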
#define CLIB_VEC128_SPLAT_DEFINED
#define CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE

/* 128 bit interleaves. */
always_inline u8x16
u8x16_interleave_hi (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u8x16
u8x16_interleave_lo (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_hi (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_lo (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_hi (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_lo (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_hi (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_lo (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
}

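/* Usage sketch (illustrative): interleave_lo/hi together "zip" two
   byte vectors into a0 b0 a1 b1 ... across two result vectors. */
static_always_inline void
clib_example_u8x16_zip (u8x16 a, u8x16 b, u8x16 *lo, u8x16 *hi)
{
  *lo = u8x16_interleave_lo (a, b);	/* a0 b0 ... a7 b7 */
  *hi = u8x16_interleave_hi (a, b);	/* a8 b8 ... a15 b15 */
}
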
/* 128 bit packs (narrowing with saturation). */
#define _(f, t, fn)				\
always_inline t t##_pack (f lo, f hi)		\
{						\
  return (t) fn ((__m128i) lo, (__m128i) hi);	\
}

_ (i16x8, i8x16, _mm_packs_epi16)
_ (i16x8, u8x16, _mm_packus_epi16)
_ (i32x4, i16x8, _mm_packs_epi32)
_ (i32x4, u16x8, _mm_packus_epi32)
#undef _

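/* Usage sketch (illustrative): narrow eight 32-bit values to u16 with
   unsigned saturation; lanes above 65535 become 65535. */
static_always_inline u16x8
clib_example_narrow_to_u16 (i32x4 lo, i32x4 hi)
{
  return u16x8_pack (lo, hi);
}
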
#define _signed_binop(n,m,f,g,h)					\
  /* Unsigned */							\
  always_inline u##n##x##m						\
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y)				\
  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }	\
									\
  /* Signed */								\
  always_inline i##n##x##m						\
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y)				\
  { return (i##n##x##m) _mm_##h##n ((__m128i) x, (__m128i) y); }

/* Addition/subtraction with saturation: epu intrinsics for the
   unsigned types, epi intrinsics for the signed ones. */
_signed_binop (8, 16, add_saturate, adds_epu, adds_epi)
_signed_binop (16, 8, add_saturate, adds_epu, adds_epi)
_signed_binop (8, 16, sub_saturate, subs_epu, subs_epi)
_signed_binop (16, 8, sub_saturate, subs_epu, subs_epi)
#undef _signed_binop

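/* Usage sketch (illustrative): decrement 16 u8 counters at once
   without wrapping below zero. */
static_always_inline u8x16
clib_example_u8x16_decrement (u8x16 counters)
{
  return u8x16_sub_saturate (counters, u8x16_splat (1));
}
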
/* Multiplication. */
always_inline i16x8
i16x8_mul_lo (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_lo (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_mul_hi (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mulhi_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_hi (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
}

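/* Usage sketch (illustrative): a full 16x16 -> 32 bit multiply built
   from the low/high product halves plus the interleaves above. */
static_always_inline void
clib_example_u16x8_mul_wide (u16x8 x, u16x8 y, u32x4 *lo, u32x4 *hi)
{
  u16x8 l = u16x8_mul_lo (x, y);
  u16x8 h = u16x8_mul_hi (x, y);
  *lo = (u32x4) u16x8_interleave_lo (l, h);	/* products 0..3 */
  *hi = (u32x4) u16x8_interleave_hi (l, h);	/* products 4..7 */
}
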
/* 128 bit shifts. */

#define _(p,a,b,c,f)							\
  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); }		\
									\
  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_(u, 16, 8, left, sll)
_(u, 32, 4, left, sll)
_(u, 64, 2, left, sll)
_(u, 16, 8, right, srl)
_(u, 32, 4, right, srl)
_(u, 64, 2, right, srl)
_(i, 16, 8, left, sll)
_(i, 32, 4, left, sll)
_(i, 64, 2, left, sll)
_(i, 16, 8, right, sra)
_(i, 32, 4, right, sra)
/* No i64x2 arithmetic right shift: SSE has no 64-bit sra
   (vpsraq needs AVX-512). */
#undef _

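/* Usage sketch (illustrative): a logical right shift as a lane-wise
   division by a power of two. */
static_always_inline u32x4
clib_example_u32x4_div_16 (u32x4 v)
{
  return u32x4_ishift_right (v, 4);
}
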
#define u8x16_word_shift_left(a,n)  (u8x16) _mm_slli_si128((__m128i) a, n)
#define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))

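/* Usage sketch (illustrative): discard the first u32 lane, moving the
   rest toward index 0 and zero-filling the last lane; the scalar
   reductions below use the same idiom. */
static_always_inline u32x4
clib_example_u32x4_drop_first (u32x4 v)
{
  return u32x4_word_shift_right (v, 1);
}
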
/* SSE2 has no rotate instructions: use shifts to simulate them. */
#define _(t,n,lr1,lr2)						\
  always_inline t##x##n						\
  t##x##n##_irotate_##lr1 (t##x##n w, int i)			\
  {								\
    ASSERT (i >= 0 && i <= BITS (t));				\
    return (t##x##n##_ishift_##lr1 (w, i)			\
	    | t##x##n##_ishift_##lr2 (w, BITS (t) - i));	\
  }								\
								\
  always_inline t##x##n						\
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i)			\
  {								\
    t##x##n j = t##x##n##_splat (BITS (t));			\
    return (t##x##n##_shift_##lr1 (w, i)			\
	    | t##x##n##_shift_##lr2 (w, j - i));		\
  }

_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);
#undef _

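/* Usage sketch (illustrative): rotates are typical hash-mixing steps.
   The multiplier below is Knuth's 2654435761, chosen only for the
   example. */
static_always_inline u32x4
clib_example_u32x4_hash_mix (u32x4 v)
{
  v *= u32x4_splat (2654435761u);
  return u32x4_irotate_left (v, 13);
}
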
always_inline u8
u8x16_max_scalar (u8x16 x)
{
  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline u8
u8x16_min_scalar (u8x16 x)
{
  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline i16
i16x8_max_scalar (i16x8 x)
{
  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

always_inline i16
i16x8_min_scalar (i16x8 x)
{
  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

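/* Usage sketch (illustrative): reduce an unaligned 16-byte block to
   its largest byte. */
static_always_inline u8
clib_example_max_byte (void *p)
{
  return u8x16_max_scalar (u8x16_load_unaligned (p));
}
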
#define u8x16_align_right(a, b, imm) \
  (u8x16) _mm_alignr_epi8 ((__m128i) a, (__m128i) b, imm)

static_always_inline u32
u32x4_min_scalar (u32x4 v)
{
  v = u32x4_min (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = u32x4_min (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline u32
u32x4_max_scalar (u32x4 v)
{
  v = u32x4_max (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = u32x4_max (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline i32
i32x4_min_scalar (i32x4 v)
{
  v = i32x4_min (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = i32x4_min (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline i32
i32x4_max_scalar (i32x4 v)
{
  v = i32x4_max (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = i32x4_max (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline u16
u8x16_msb_mask (u8x16 v)
{
  return _mm_movemask_epi8 ((__m128i) v);
}

static_always_inline u16
i8x16_msb_mask (i8x16 v)
{
  return _mm_movemask_epi8 ((__m128i) v);
}

#define CLIB_HAVE_VEC128_MSB_MASK

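/* Usage sketch (illustrative): the classic SIMD byte search; returns
   the index of the first lane equal to needle, or 16 if none match. */
static_always_inline int
clib_example_find_byte (u8x16 v, u8 needle)
{
  u16 mask = u8x16_msb_mask ((u8x16) (v == u8x16_splat (needle)));
  return mask ? __builtin_ctz (mask) : 16;
}
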
static_always_inline u32x4
u32x4_byte_swap (u32x4 v)
{
  u8x16 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x4) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
}

static_always_inline u16x8
u16x8_byte_swap (u16x8 v)
{
  u8x16 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x8) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
}

static_always_inline u8x16
u8x16_reflect (u8x16 v)
{
  u8x16 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  };
  return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) mask);
}

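/* Usage sketch (illustrative): on a little-endian host, byte-swapping
   converts four network-order u32 values to host order in one step. */
static_always_inline u32x4
clib_example_net_to_host (u32x4 net_order)
{
  return u32x4_byte_swap (net_order);
}
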
static_always_inline u32x4
u32x4_hadd (u32x4 v1, u32x4 v2)
{
  return (u32x4) _mm_hadd_epi32 ((__m128i) v1, (__m128i) v2);
}

static_always_inline u32 __clib_unused
u32x4_sum_elts (u32x4 sum4)
{
  sum4 += (u32x4) u8x16_align_right (sum4, sum4, 8);
  sum4 += (u32x4) u8x16_align_right (sum4, sum4, 4);
  return sum4[0];
}

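/* Usage sketch (illustrative): fold four unaligned u32 values into a
   single scalar sum. */
static_always_inline u32
clib_example_sum4 (u32 *p)
{
  return u32x4_sum_elts (u32x4_load_unaligned (p));
}
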
/* Widening conversions: each t##_from_##f zero- (epu) or sign- (epi)
   extends the low elements of f to t. */
#define _(f,t,i) \
static_always_inline t					\
t##_from_##f (f x)					\
{ return (t) _mm_cvt##i ((__m128i) x); }

_(u8x16, u16x8, epu8_epi16)
_(u8x16, u32x4, epu8_epi32)
_(u8x16, u64x2, epu8_epi64)
_(u16x8, u32x4, epu16_epi32)
_(u16x8, u64x2, epu16_epi64)
_(u32x4, u64x2, epu32_epi64)

_(i8x16, i16x8, epi8_epi16)
_(i8x16, i32x4, epi8_epi32)
_(i8x16, i64x2, epi8_epi64)
_(i16x8, i32x4, epi16_epi32)
_(i16x8, i64x2, epi16_epi64)
_(i32x4, i64x2, epi32_epi64)
#undef _

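/* Usage sketch (illustrative; assumes the t##_from_##f naming
   generated above): zero-extend the low eight bytes to sixteen bits,
   e.g. before a widening arithmetic step. */
static_always_inline u16x8
clib_example_widen_low_bytes (u8x16 v)
{
  return u16x8_from_u8x16 (v);
}
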
static_always_inline u64x2
u64x2_gather (void *p0, void *p1)
{
  u64x2 r = { *(u64 *) p0, *(u64 *) p1 };
  return r;
}

static_always_inline u32x4
u32x4_gather (void *p0, void *p1, void *p2, void *p3)
{
  u32x4 r = { *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3 };
  return r;
}

static_always_inline void
u64x2_scatter (u64x2 r, void *p0, void *p1)
{
  *(u64 *) p0 = r[0];
  *(u64 *) p1 = r[1];
}

static_always_inline void
u32x4_scatter (u32x4 r, void *p0, void *p1, void *p2, void *p3)
{
  *(u32 *) p0 = r[0];
  *(u32 *) p1 = r[1];
  *(u32 *) p2 = r[2];
  *(u32 *) p3 = r[3];
}

static_always_inline void
u64x2_scatter_one (u64x2 r, int index, void *p)
{
  *(u64 *) p = r[index];
}

static_always_inline void
u32x4_scatter_one (u32x4 r, int index, void *p)
{
  *(u32 *) p = r[index];
}

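/* Usage sketch (illustrative; the struct and field are hypothetical):
   gather the same u64 counter from two records into one vector. */
typedef struct { u64 packets, bytes; } clib_example_counter_t;

static_always_inline u64x2
clib_example_gather_bytes (clib_example_counter_t *c0,
			   clib_example_counter_t *c1)
{
  return u64x2_gather (&c0->bytes, &c1->bytes);
}
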
static_always_inline u8x16
u8x16_blend (u8x16 v1, u8x16 v2, u8x16 mask)
{
  return (u8x16) _mm_blendv_epi8 ((__m128i) v1, (__m128i) v2, (__m128i) mask);
}

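/* Usage sketch (illustrative): _mm_blendv_epi8 selects on each mask
   byte's MSB, so a compare result can drive a per-byte select; here,
   zero bytes are replaced with a fill byte. */
static_always_inline u8x16
clib_example_replace_zero_bytes (u8x16 v, u8 fill)
{
  u8x16 is_zero = (u8x16) (v == u8x16_splat (0));
  return u8x16_blend (v, u8x16_splat (fill), is_zero);
}
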
static_always_inline u8x16
u8x16_xor3 (u8x16 a, u8x16 b, u8x16 c)
{
#if __AVX512F__
  /* 0x96 is the truth table for a ^ b ^ c. */
  return (u8x16) _mm_ternarylogic_epi32 ((__m128i) a, (__m128i) b,
					 (__m128i) c, 0x96);
#endif
  return a ^ b ^ c;
}

static_always_inline u8x16
u8x16_load_partial (u8 *data, uword n)
{
#if defined(CLIB_HAVE_VEC128_MASK_LOAD_STORE)
  return u8x16_mask_load_zero (data, pow2_mask (n));
#else
  u8x16 r = {};
  /* Load the first and last naturally-sized words of the region; the
     overlapping bytes in the high lane are shifted out so bytes past
     n read as zero. */
  if (n > 8)
    {
      u64x2 r = {};
      r[1] = *(u64u *) (data + n - 8);
      r[1] >>= (16 - n) * 8;
      r[0] = *(u64u *) data;
      return (u8x16) r;
    }
  else if (n > 4)
    {
      u32x4 r = {};
      r[1] = *(u32u *) (data + n - 4);
      r[1] >>= (8 - n) * 8;
      r[0] = *(u32u *) data;
      return (u8x16) r;
    }
  else if (n > 1)
    {
      u16x8 r = {};
      r[1] = *(u16u *) (data + n - 2);
      r[1] >>= (4 - n) * 8;
      r[0] = *(u16u *) data;
      return (u8x16) r;
    }
  else if (n == 1)
    r[0] = data[0];
  return r;
#endif
}

static_always_inline void
u8x16_store_partial (u8x16 r, u8 *data, uword n)
{
#if defined(CLIB_HAVE_VEC128_MASK_LOAD_STORE)
  u8x16_mask_store (r, data, pow2_mask (n));
#else
  /* Store the high, overlapping word first (shifted left so its low
     bytes are zero), then overwrite the overlap with the low word. */
  if (n > 8)
    {
      *(u64u *) (data + n - 8) = ((u64x2) r)[1] << ((16 - n) * 8);
      *(u64u *) data = ((u64x2) r)[0];
    }
  else if (n > 4)
    {
      *(u32u *) (data + n - 4) = ((u32x4) r)[1] << ((8 - n) * 8);
      *(u32u *) data = ((u32x4) r)[0];
    }
  else if (n > 1)
    {
      *(u16u *) (data + n - 2) = ((u16x8) r)[1] << ((4 - n) * 8);
      *(u16u *) data = ((u16x8) r)[0];
    }
  else if (n == 1)
    data[0] = r[0];
#endif
}

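/* Usage sketch (illustrative): a branch-light copy of 1..16 bytes
   built from the partial load/store pair above. */
static_always_inline void
clib_example_copy_le_16 (u8 *dst, u8 *src, uword n)
{
  u8x16_store_partial (u8x16_load_partial (src, n), dst, n);
}
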
#endif /* included_vector_sse2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */