/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h>	/* for ASSERT */
#include <x86intrin.h>

#define foreach_sse42_vec128i \
  _(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64x)
#define foreach_sse42_vec128u \
  _(u,8,16,epi8) _(u,16,8,epi16) _(u,32,4,epi32) _(u,64,2,epi64x)
#define foreach_sse42_vec128f \
  _(f,32,4,ps) _(f,64,2,pd)
/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm_loadu_si128 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm_storeu_si128 ((__m128i *) p, (__m128i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c x) \
{ return _mm_testz_si128 ((__m128i) x, (__m128i) x); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); }

foreach_sse42_vec128i foreach_sse42_vec128u
#undef _
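/* As an illustration (not itself part of the API), for (t,s,c,i) =
   (u,32,4,epi32) the macro above expands to functions such as

     static_always_inline u32x4
     u32x4_splat (u32 x)
     { return (u32x4) _mm_set1_epi32 (x); }

   so e.g. u32x4_is_all_equal (v, 0x5a5a5a5a) compares every lane of v
   against the splatted scalar. */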
/* min, max */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_min (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm_min_##i ((__m128i) a, (__m128i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_max (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm_max_##i ((__m128i) a, (__m128i) b); }

_(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64)
_(u,8,16,epu8) _(u,16,8,epu16) _(u,32,4,epu32) _(u,64,2,epu64)
#undef _
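/* Note: the 8/16/32 bit min/max intrinsics are plain SSE2/SSE4.1, but
   _mm_min_epi64/_mm_max_epi64 and their unsigned counterparts only exist
   with AVX-512VL, so the 64 bit wrappers generated above are usable only
   on targets that provide those intrinsics. */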
#define CLIB_VEC128_SPLAT_DEFINED
#define CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE
/* 128 bit interleaves. */
always_inline u8x16
u8x16_interleave_hi (u8x16 a, u8x16 b)
{ return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b); }

always_inline u8x16
u8x16_interleave_lo (u8x16 a, u8x16 b)
{ return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b); }

always_inline u16x8
u16x8_interleave_hi (u16x8 a, u16x8 b)
{ return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b); }

always_inline u16x8
u16x8_interleave_lo (u16x8 a, u16x8 b)
{ return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b); }

always_inline u32x4
u32x4_interleave_hi (u32x4 a, u32x4 b)
{ return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b); }

always_inline u32x4
u32x4_interleave_lo (u32x4 a, u32x4 b)
{ return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b); }

always_inline u64x2
u64x2_interleave_hi (u64x2 a, u64x2 b)
{ return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b); }

always_inline u64x2
u64x2_interleave_lo (u64x2 a, u64x2 b)
{ return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b); }
/* 64 bit interleaves. */
always_inline u8x8
u8x8_interleave_hi (u8x8 a, u8x8 b)
{ return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b); }

always_inline u8x8
u8x8_interleave_lo (u8x8 a, u8x8 b)
{ return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b); }

always_inline u16x4
u16x4_interleave_hi (u16x4 a, u16x4 b)
{ return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b); }

always_inline u16x4
u16x4_interleave_lo (u16x4 a, u16x4 b)
{ return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b); }

always_inline u32x2
u32x2_interleave_hi (u32x2 a, u32x2 b)
{ return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b); }

always_inline u32x2
u32x2_interleave_lo (u32x2 a, u32x2 b)
{ return (u32x2) _m_punpckldq ((__m64) a, (__m64) b); }
/* 128 bit packs. */
#define _(f, t, fn) \
always_inline t t##_pack (f lo, f hi) \
{ return (t) fn ((__m128i) lo, (__m128i) hi); }

_ (i16x8, i8x16, _mm_packs_epi16)
_ (i16x8, u8x16, _mm_packus_epi16)
_ (i32x4, i16x8, _mm_packs_epi32)
_ (i32x4, u16x8, _mm_packus_epi32)
#undef _
#define _signed_binop(n,m,f,g) \
  /* Unsigned */ \
  always_inline u##n##x##m \
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y) \
  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); } \
\
  /* Signed */ \
  always_inline i##n##x##m \
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y) \
  { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }

/* Addition/subtraction with saturation.  Note that both the unsigned and
   the signed wrappers below expand to the unsigned saturating intrinsics
   (_mm_adds_epu8/16, _mm_subs_epu8/16). */
_signed_binop (8, 16, add_saturate, adds_epu)
_signed_binop (16, 8, add_saturate, adds_epu)
_signed_binop (8, 16, sub_saturate, subs_epu)
_signed_binop (16, 8, sub_saturate, subs_epu)
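/* Illustrative use (not part of this header): unsigned saturating
   subtraction gives a branch-free clamp to zero, e.g.

     u16x8 backlog = u16x8_sub_saturate (queue_depth, drained);

   where lanes that would underflow become 0 instead of wrapping;
   queue_depth and drained are hypothetical variables. */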
/* Multiplication. */
always_inline i16x8
i16x8_mul_lo (i16x8 x, i16x8 y)
{ return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y); }

always_inline u16x8
u16x8_mul_lo (u16x8 x, u16x8 y)
{ return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y); }

/* High 16 bits of the 16x16->32 product; both variants use the unsigned
   high-multiply intrinsic. */
always_inline i16x8
i16x8_mul_hi (i16x8 x, i16x8 y)
{ return (i16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y); }

always_inline u16x8
u16x8_mul_hi (u16x8 x, u16x8 y)
{ return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y); }
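/* Sketch (illustrative only): full 16x16->32 bit products of all eight
   lanes can be assembled from the pieces above, assuming unsigned inputs
   x and y:

     u16x8 lo = u16x8_mul_lo (x, y);
     u16x8 hi = u16x8_mul_hi (x, y);
     u32x4 prod_lo = (u32x4) u16x8_interleave_lo (lo, hi);
     u32x4 prod_hi = (u32x4) u16x8_interleave_hi (lo, hi);
*/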
/* 128 bit shifts. */
#define _(p,a,b,c,f) \
always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
{ return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); } \
\
always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
{ return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_(u, 16, 8, left, sll)
_(u, 32, 4, left, sll)
_(u, 64, 2, left, sll)
_(u, 16, 8, right, srl)
_(u, 32, 4, right, srl)
_(u, 64, 2, right, srl)
_(i, 16, 8, left, sll)
_(i, 32, 4, left, sll)
_(i, 64, 2, left, sll)
_(i, 16, 8, right, sra)
_(i, 32, 4, right, sra)
#undef _
/* 64 bit shifts. */
always_inline u16x4
u16x4_shift_left (u16x4 x, u16x4 i)
{ return (u16x4) _m_psllw ((__m64) x, (__m64) i); }

always_inline u32x2
u32x2_shift_left (u32x2 x, u32x2 i)
{ return (u32x2) _m_pslld ((__m64) x, (__m64) i); }

always_inline u16x4
u16x4_shift_right (u16x4 x, u16x4 i)
{ return (u16x4) _m_psrlw ((__m64) x, (__m64) i); }

always_inline u32x2
u32x2_shift_right (u32x2 x, u32x2 i)
{ return (u32x2) _m_psrld ((__m64) x, (__m64) i); }

always_inline i16x4
i16x4_shift_left (i16x4 x, i16x4 i)
{ return (i16x4) _m_psllw ((__m64) x, (__m64) i); }

always_inline i32x2
i32x2_shift_left (i32x2 x, i32x2 i)
{ return (i32x2) _m_pslld ((__m64) x, (__m64) i); }

always_inline i16x4
i16x4_shift_right (i16x4 x, i16x4 i)
{ return (i16x4) _m_psraw ((__m64) x, (__m64) i); }

always_inline i32x2
i32x2_shift_right (i32x2 x, i32x2 i)
{ return (i32x2) _m_psrad ((__m64) x, (__m64) i); }
#define u8x16_word_shift_left(a,n)  (u8x16) _mm_slli_si128((__m128i) a, n)
#define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
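/* These "word" shifts move whole elements (n elements, i.e. n times the
   element size in bytes) across the 128 bit vector, filling vacated lanes
   with zero.  Illustration: with u32x4 v = { 1, 2, 3, 4 },
   u32x4_word_shift_left (v, 1) yields { 0, 1, 2, 3 }. */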
/* SSE2 has no rotate instructions: use shifts to simulate them. */
#define _(t,n,lr1,lr2) \
  always_inline t##x##n \
  t##x##n##_irotate_##lr1 (t##x##n w, int i) \
  { \
    ASSERT (i >= 0 && i <= BITS (t)); \
    return (t##x##n##_ishift_##lr1 (w, i) \
	    | t##x##n##_ishift_##lr2 (w, BITS (t) - i)); \
  } \
\
  always_inline t##x##n \
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i) \
  { \
    t##x##n j = t##x##n##_splat (BITS (t)); \
    return (t##x##n##_shift_##lr1 (w, i) \
	    | t##x##n##_shift_##lr2 (w, j - i)); \
  }

_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);
#undef _
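/* Illustrative only: rotating each 32 bit lane left by 8 bits,

     u32x4 r = u32x4_irotate_left (v, 8);

   is equivalent to (v << 8) | (v >> 24) in each lane. */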
#define _(t,n,lr1,lr2) \
  always_inline t##x##n \
  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i) \
  { \
    int m = sizeof (t##x##n) / sizeof (t); \
    ASSERT (i >= 0 && i < m); \
    return (t##x##n##_word_shift_##lr1 (w0, i) \
	    | t##x##n##_word_shift_##lr2 (w1, m - i)); \
  } \
\
  always_inline t##x##n \
  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i) \
  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }

_(u8, 16, left, right);
_(u8, 16, right, left);
_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);
#undef _
#define u32x4_select(A,MASK) \
({ \
  u32x4 _x, _y; \
  _x = (A); \
  asm volatile ("pshufd %[mask], %[x], %[y]" \
		: /* outputs */ [y] "=x" (_y) \
		: /* inputs */ [x] "x" (_x), [mask] "i" (MASK)); \
  _y; \
})

#define u32x4_splat_word(x,i) \
  u32x4_select ((x), (((i) << (2*0)) \
		      | ((i) << (2*1)) \
		      | ((i) << (2*2)) \
		      | ((i) << (2*3))))

/* Extract low order 32 bit word. */
always_inline u32
u32x4_get0 (u32x4 x)
{
  u32 result;
  asm volatile ("movd %[x], %[result]": /* outputs */ [result] "=r" (result)
		: /* inputs */ [x] "x" (x));
  return result;
}

/* Insert low order 32 bit word. */
always_inline u32x4
u32x4_set0 (u32 x)
{
  u32x4 result;
  asm volatile ("movd %[x], %[result]": /* outputs */ [result] "=x" (result)
		: /* inputs */ [x] "r" (x));
  return result;
}

always_inline i32x4
i32x4_set0 (i32 x)
{ return (i32x4) u32x4_set0 ((u32) x); }

always_inline i32
i32x4_get0 (i32x4 x)
{ return (i32) u32x4_get0 ((u32x4) x); }
/* Converts all ones/zeros compare mask to bitmap. */
always_inline u32
u8x16_compare_byte_mask (u8x16 x)
{ return _mm_movemask_epi8 ((__m128i) x); }

extern u8 u32x4_compare_word_mask_table[256];

always_inline u32
u32x4_compare_word_mask (u32x4 x)
{
  u32 m = u8x16_compare_byte_mask ((u8x16) x);
  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
	  | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
}

always_inline u32
u8x16_zero_byte_mask (u8x16 x)
{
  u8x16 zero = { 0 };
  return u8x16_compare_byte_mask (x == zero);
}

always_inline u32
u16x8_zero_byte_mask (u16x8 x)
{
  u16x8 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) (x == zero));
}

always_inline u32
u32x4_zero_byte_mask (u32x4 x)
{
  u32x4 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) (x == zero));
}
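/* Illustrative only: u8x16_zero_byte_mask sets bit i of the result when
   byte lane i is zero, so a strlen-style scan over 16 byte chunks could
   look like this, with p and offset as hypothetical variables:

     u8x16 bytes = u8x16_load_unaligned (p);
     u32 mask = u8x16_zero_byte_mask (bytes);
     if (mask)
       offset = __builtin_ctz (mask);
*/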
/* Horizontal min/max: repeatedly fold the vector onto itself. */
always_inline u8
u8x16_max_scalar (u8x16 x)
{
  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline u8
u8x16_min_scalar (u8x16 x)
{
  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline i16
i16x8_max_scalar (i16x8 x)
{
  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

always_inline i16
i16x8_min_scalar (i16x8 x)
{
  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}
#define u8x16_align_right(a, b, imm) \
  (u8x16) _mm_alignr_epi8 ((__m128i) a, (__m128i) b, imm)

static_always_inline u32
u32x4_min_scalar (u32x4 v)
{
  v = u32x4_min (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = u32x4_min (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline u32
u32x4_max_scalar (u32x4 v)
{
  v = u32x4_max (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = u32x4_max (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline i32
i32x4_min_scalar (i32x4 v)
{
  v = i32x4_min (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = i32x4_min (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline i32
i32x4_max_scalar (i32x4 v)
{
  v = i32x4_max (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = i32x4_max (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}
static_always_inline u16
u8x16_msb_mask (u8x16 v)
{ return _mm_movemask_epi8 ((__m128i) v); }

static_always_inline u16
i8x16_msb_mask (i8x16 v)
{ return _mm_movemask_epi8 ((__m128i) v); }

#define CLIB_HAVE_VEC128_MSB_MASK
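/* Illustrative only: the returned mask has one bit per byte lane, so the
   index of the first lane whose most significant bit is set (for example
   the first hit from a compare) can be recovered with __builtin_ctz;
   match and first_lane are hypothetical:

     u16 mask = u8x16_msb_mask (match);
     if (mask)
       first_lane = __builtin_ctz (mask);
*/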
static_always_inline u32x4
u32x4_byte_swap (u32x4 v)
{
  u8x16 swap = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
  return (u32x4) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
}

static_always_inline u16x8
u16x8_byte_swap (u16x8 v)
{
  u8x16 swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
  return (u16x8) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
}

static_always_inline u8x16
u8x16_reflect (u8x16 v)
{
  u8x16 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
  return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) mask);
}
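/* Illustrative only: u32x4_byte_swap reverses the bytes inside each 32 bit
   lane, so four big-endian (network order) words loaded from a packet can
   be converted to host order in one step, hdr being a hypothetical pointer:

     u32x4 net = u32x4_load_unaligned (hdr);
     u32x4 host = u32x4_byte_swap (net);
*/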
static_always_inline u32x4
u32x4_hadd (u32x4 v1, u32x4 v2)
{ return (u32x4) _mm_hadd_epi32 ((__m128i) v1, (__m128i) v2); }

static_always_inline u32 __clib_unused
u32x4_sum_elts (u32x4 sum4)
{
  sum4 += (u32x4) u8x16_align_right (sum4, sum4, 8);
  sum4 += (u32x4) u8x16_align_right (sum4, sum4, 4);
  return sum4[0];
}
static_always_inline u8x16
u8x16_shuffle (u8x16 v, u8x16 m)
{ return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) m); }
static_always_inline u32x4
u32x4_shuffle (u32x4 v, const int a, const int b, const int c, const int d)
{
#if defined(__clang__) || !__OPTIMIZE__
  u32x4 r = { v[a], v[b], v[c], v[d] };
  return r;
#else
  return (u32x4) _mm_shuffle_epi32 ((__m128i) v,
				    a | b << 2 | c << 4 | d << 6);
#endif
}
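/* Illustrative only: the four index arguments select which source lane
   feeds each result lane, so u32x4_shuffle (v, 0, 0, 0, 0) broadcasts
   lane 0 and u32x4_shuffle (v, 3, 2, 1, 0) reverses the lane order. */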
/* Widening conversions. */
#define _(f, t, i) \
static_always_inline t \
t##_from_##f (f x) \
{ return (t) _mm_cvt##i ((__m128i) x); }

_(u8x16, u16x8, epu8_epi16)
_(u8x16, u32x4, epu8_epi32)
_(u8x16, u64x2, epu8_epi64)
_(u16x8, u32x4, epu16_epi32)
_(u16x8, u64x2, epu16_epi64)
_(u32x4, u64x2, epu32_epi64)

_(i8x16, i16x8, epi8_epi16)
_(i8x16, i32x4, epi8_epi32)
_(i8x16, i64x2, epi8_epi64)
_(i16x8, i32x4, epi16_epi32)
_(i16x8, i64x2, epi16_epi64)
_(i32x4, i64x2, epi32_epi64)
#undef _
static_always_inline u64x2
u64x2_gather (void *p0, void *p1)
{
  u64x2 r = { *(u64 *) p0, *(u64 *) p1 };
  return r;
}

static_always_inline u32x4
u32x4_gather (void *p0, void *p1, void *p2, void *p3)
{
  u32x4 r = { *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3 };
  return r;
}

static_always_inline void
u64x2_scatter (u64x2 r, void *p0, void *p1)
{
  *(u64 *) p0 = r[0];
  *(u64 *) p1 = r[1];
}

static_always_inline void
u32x4_scatter (u32x4 r, void *p0, void *p1, void *p2, void *p3)
{
  *(u32 *) p0 = r[0];
  *(u32 *) p1 = r[1];
  *(u32 *) p2 = r[2];
  *(u32 *) p3 = r[3];
}

static_always_inline void
u64x2_scatter_one (u64x2 r, int index, void *p)
{ *(u64 *) p = r[index]; }

static_always_inline void
u32x4_scatter_one (u32x4 r, int index, void *p)
{ *(u32 *) p = r[index]; }
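/* Illustrative only: the gather/scatter helpers take independent pointers,
   e.g. collecting one 32 bit counter from each of four hypothetical
   session structures s0..s3 and writing them back after an update:

     u32x4 c = u32x4_gather (&s0->count, &s1->count, &s2->count, &s3->count);
     c += u32x4_splat (1);
     u32x4_scatter (c, &s0->count, &s1->count, &s2->count, &s3->count);
*/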
static_always_inline u8x16
u8x16_is_greater (u8x16 v1, u8x16 v2)
{ return (u8x16) _mm_cmpgt_epi8 ((__m128i) v1, (__m128i) v2); }

static_always_inline u8x16
u8x16_blend (u8x16 v1, u8x16 v2, u8x16 mask)
{ return (u8x16) _mm_blendv_epi8 ((__m128i) v1, (__m128i) v2, (__m128i) mask); }

static_always_inline u8x16
u8x16_xor3 (u8x16 a, u8x16 b, u8x16 c)
{
#if __AVX512F__
  return (u8x16) _mm_ternarylogic_epi32 ((__m128i) a, (__m128i) b,
					 (__m128i) c, 0x96);
#endif
  return a ^ b ^ c;
}
#endif /* included_vector_sse2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */