/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h>	/* for ASSERT */
#include <x86intrin.h>
#define foreach_sse42_vec128i \
  _(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64x)
#define foreach_sse42_vec128u \
  _(u,8,16,epi8) _(u,16,8,epi16) _(u,32,4,epi32) _(u,64,2,epi64x)
#define foreach_sse42_vec128f \
  _(f,32,4,ps) _(f,64,2,pd)
/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm_loadu_si128 ((__m128i *) p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm_storeu_si128 ((__m128i *) p, (__m128i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c x) \
{ return _mm_testz_si128 ((__m128i) x, (__m128i) x); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); }

foreach_sse42_vec128i foreach_sse42_vec128u
#undef _
#define CLIB_VEC128_SPLAT_DEFINED
#define CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE
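
/* Usage sketch (illustrative only; `buf` is a hypothetical caller-side
   byte buffer with no alignment guarantee):

     u8 buf[16];
     u32x4 v = u32x4_splat (7);	        // all four lanes = 7
     u32x4_store_unaligned (v, buf);
     v = u32x4_load_unaligned (buf);
     ASSERT (u32x4_is_all_equal (v, 7));
*/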
/* 128 bit interleaves. */
always_inline u8x16
u8x16_interleave_hi (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u8x16
u8x16_interleave_lo (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_hi (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_lo (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_hi (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_lo (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_hi (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_lo (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
}
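
/* Interleave sketch (illustrative values, PUNPCKL/PUNPCKH semantics):

     u32x4 a = { 0, 1, 2, 3 }, b = { 4, 5, 6, 7 };
     u32x4 lo = u32x4_interleave_lo (a, b);   // { 0, 4, 1, 5 }
     u32x4 hi = u32x4_interleave_hi (a, b);   // { 2, 6, 3, 7 }
*/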
/* 64 bit interleaves. */
always_inline u8x8
u8x8_interleave_hi (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b);
}

always_inline u8x8
u8x8_interleave_lo (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_hi (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_lo (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_hi (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_lo (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckldq ((__m64) a, (__m64) b);
}
/* 128 bit packs. */
always_inline u8x16
u16x8_pack (u16x8 lo, u16x8 hi)
{
  return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline i8x16
i16x8_pack (i16x8 lo, i16x8 hi)
{
  return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline u16x8
u32x4_pack (u32x4 lo, u32x4 hi)
{
  return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi);
}

/* 64 bit packs. */
always_inline u8x8
u16x4_pack (u16x4 lo, u16x4 hi)
{
  return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi);
}

always_inline i8x8
i16x4_pack (i16x4 lo, i16x4 hi)
{
  return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi);
}

always_inline u16x4
u32x2_pack (u32x2 lo, u32x2 hi)
{
  return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}

always_inline i16x4
i32x2_pack (i32x2 lo, i32x2 hi)
{
  return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}
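
/* Narrowing sketch (illustrative): the packs saturate each source lane
   to the destination range before concatenating lo and hi:

     u16x8 lo = u16x8_splat (0x1ff), hi = u16x8_splat (7);
     u8x16 p = u16x8_pack (lo, hi);   // lanes 0-7 = 0xff (saturated), lanes 8-15 = 7
*/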
/* 64 bit reads and writes into the low/high halves of a 128 bit vector. */
always_inline u64x2
u64x2_read_lo (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a);
}

always_inline u64x2
u64x2_read_hi (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a);
}

always_inline void
u64x2_write_lo (u64x2 x, u64 * a)
{
  _mm_storel_pi ((__m64 *) a, (__m128) x);
}

always_inline void
u64x2_write_hi (u64x2 x, u64 * a)
{
  _mm_storeh_pi ((__m64 *) a, (__m128) x);
}
/* Addition/subtraction with saturation: unsigned variants use the _epu
   intrinsics, signed variants the _epi ones. */
#define _signed_binop(n,m,f,g) \
  /* Unsigned */ \
  always_inline u##n##x##m \
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y) \
  { return (u##n##x##m) _mm_##g##_epu##n ((__m128i) x, (__m128i) y); } \
  \
  /* Signed */ \
  always_inline i##n##x##m \
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y) \
  { return (i##n##x##m) _mm_##g##_epi##n ((__m128i) x, (__m128i) y); }

_signed_binop (8, 16, add_saturate, adds)
_signed_binop (16, 8, add_saturate, adds)
_signed_binop (8, 16, sub_saturate, subs)
_signed_binop (16, 8, sub_saturate, subs)
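
/* Saturation sketch (illustrative): lanes clamp instead of wrapping:

     u8x16 a = u8x16_splat (250), b = u8x16_splat (10);
     a = u8x16_add_saturate (a, b);                // every lane = 255, not 4
     a = u8x16_sub_saturate (u8x16_splat (3), b);  // every lane = 0, not 249
*/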
/* Multiplication. */
always_inline i16x8
i16x8_mul_lo (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_lo (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_mul_hi (i16x8 x, i16x8 y)
{
  /* Signed high half of the product needs the signed intrinsic. */
  return (i16x8) _mm_mulhi_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_hi (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
}
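
/* Widening-multiply sketch (illustrative; `k` names an arbitrary lane):
   a full 16x16 -> 32 bit product can be assembled from the two halves:

     u32 p_k = (u32) u16x8_mul_lo (x, y)[k]
	     | ((u32) u16x8_mul_hi (x, y)[k] << 16);
*/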
/* 128 bit shifts. */

#define _(p,a,b,c,f) \
  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); } \
  \
  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_(u, 16, 8, left, sll)
_(u, 32, 4, left, sll)
_(u, 64, 2, left, sll)
_(u, 16, 8, right, srl)
_(u, 32, 4, right, srl)
_(u, 64, 2, right, srl)
_(i, 16, 8, left, sll)
_(i, 32, 4, left, sll)
_(i, 64, 2, left, sll)
_(i, 16, 8, right, sra)
_(i, 32, 4, right, sra)

#undef _
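
/* Shift sketch (illustrative): the _ishift_ forms take an immediate bit
   count; the _shift_ forms take the count from a vector operand (the
   low 64 bits of y, per the underlying PSLL/PSRL semantics):

     u32x4 v = u32x4_splat (4);
     v = u32x4_ishift_left (v, 2);    // every lane = 16
     v = u32x4_ishift_right (v, 3);   // every lane = 2
*/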
always_inline u16x4
u16x4_shift_left (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_left (u32x2 x, u32x2 i)
{
  return (u32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline u16x4
u16x4_shift_right (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_right (u32x2 x, u32x2 i)
{
  return (u32x2) _m_psrld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_left (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_left (i32x2 x, i32x2 i)
{
  return (i32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_right (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psraw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_right (i32x2 x, i32x2 i)
{
  return (i32x2) _m_psrad ((__m64) x, (__m64) i);
}
#define u8x16_word_shift_left(a,n) (u8x16) _mm_slli_si128((__m128i) a, n)
#define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
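
/* Word-shift sketch (illustrative): n counts lanes, not bits, and the
   shift moves data across the whole register (PSLLDQ/PSRLDQ), lane 0
   being least significant:

     u32x4 v = { 1, 2, 3, 4 };
     u32x4 l = u32x4_word_shift_left (v, 1);    // { 0, 1, 2, 3 }
     u32x4 r = u32x4_word_shift_right (v, 1);   // { 2, 3, 4, 0 }
*/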
/* SSE2 has no rotate instructions: use shifts to simulate them. */
#define _(t,n,lr1,lr2) \
  always_inline t##x##n \
  t##x##n##_irotate_##lr1 (t##x##n w, int i) \
  { \
    ASSERT (i >= 0 && i <= BITS (t)); \
    return (t##x##n##_ishift_##lr1 (w, i) \
	    | t##x##n##_ishift_##lr2 (w, BITS (t) - i)); \
  } \
  \
  always_inline t##x##n \
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i) \
  { \
    t##x##n j = t##x##n##_splat (BITS (t)); \
    return (t##x##n##_shift_##lr1 (w, i) \
	    | t##x##n##_shift_##lr2 (w, j - i)); \
  }

_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
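
/* Rotate sketch (illustrative): a left rotate by i is a left shift by
   i OR'd with a right shift by BITS(t) - i:

     u32x4 v = u32x4_splat (0x80000001);
     v = u32x4_irotate_left (v, 1);   // every lane = 0x00000003
*/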
/* Rotates of whole lanes across the register, built from word shifts. */
#define _(t,n,lr1,lr2) \
  always_inline t##x##n \
  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i) \
  { \
    int m = sizeof (t##x##n) / sizeof (t); \
    ASSERT (i >= 0 && i < m); \
    return (t##x##n##_word_shift_##lr1 (w0, i) \
	    | t##x##n##_word_shift_##lr2 (w1, m - i)); \
  } \
  \
  always_inline t##x##n \
  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i) \
  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }

_(u8, 16, left, right);
_(u8, 16, right, left);
_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
#define u32x4_select(A,MASK) \
({ \
  u32x4 _x, _y; \
  _x = (A); \
  asm volatile ("pshufd %[mask], %[x], %[y]" \
		: /* outputs */ [y] "=x" (_y) \
		: /* inputs */ [x] "x" (_x), [mask] "i" (MASK)); \
  _y; \
})

#define u32x4_splat_word(x,i) \
  u32x4_select ((x), (((i) << (2*0)) \
		      | ((i) << (2*1)) \
		      | ((i) << (2*2)) \
		      | ((i) << (2*3))))

/* Extract low order 32 bit word. */
always_inline u32
u32x4_get0 (u32x4 x)
{
  u32 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=r" (result)
		: /* inputs */ [x] "x" (x));
  return result;
}

always_inline u32x4
u32x4_set0 (u32 x)
{
  u32x4 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=x" (result)
		: /* inputs */ [x] "r" (x));
  return result;
}

always_inline i32x4
i32x4_set0 (i32 x)
{
  return (i32x4) u32x4_set0 ((u32) x);
}

always_inline i32
i32x4_get0 (i32x4 x)
{
  return (i32) u32x4_get0 ((u32x4) x);
}
/* Converts all ones/zeros compare mask to bitmap. */
always_inline u32
u8x16_compare_byte_mask (u8x16 x)
{
  return _mm_movemask_epi8 ((__m128i) x);
}

extern u8 u32x4_compare_word_mask_table[256];

always_inline u32
u32x4_compare_word_mask (u32x4 x)
{
  u32 m = u8x16_compare_byte_mask ((u8x16) x);
  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
	  | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
}

always_inline u32
u8x16_zero_byte_mask (u8x16 x)
{
  u8x16 zero = { 0 };
  return u8x16_compare_byte_mask (x == zero);
}

always_inline u32
u16x8_zero_byte_mask (u16x8 x)
{
  u16x8 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) (x == zero));
}

always_inline u32
u32x4_zero_byte_mask (u32x4 x)
{
  u32x4 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) (x == zero));
}
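
/* Mask sketch (illustrative; `s` is a hypothetical byte pointer): bit k
   of the result is set iff byte k of the vector is zero, e.g. to find
   a NUL terminator:

     u8x16 v = u8x16_load_unaligned (s);
     u32 m = u8x16_zero_byte_mask (v);
     if (m)
       first_zero = __builtin_ctz (m);
*/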
always_inline u8x16
u8x16_max (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_max_epu8 ((__m128i) x, (__m128i) y);
}

always_inline u32
u8x16_max_scalar (u8x16 x)
{
  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline u8x16
u8x16_min (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_min_epu8 ((__m128i) x, (__m128i) y);
}

always_inline u8
u8x16_min_scalar (u8x16 x)
{
  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}
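
/* Reduction sketch: each step folds the upper half onto the lower half
   (shifting by 8, 4, 2, then 1 byte lanes), so after four steps lane 0
   holds the max/min of all 16 bytes.  Illustrative values (unlisted
   lanes are zero):

     u8x16 v = { 3, 250, 7 };
     u32 m = u8x16_max_scalar (v);   // 250
*/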
always_inline i16x8
i16x8_max (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_max_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16
i16x8_max_scalar (i16x8 x)
{
  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

always_inline i16x8
i16x8_min (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_min_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16
i16x8_min_scalar (i16x8 x)
{
  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}
static_always_inline u16
u8x16_msb_mask (u8x16 v)
{
  return _mm_movemask_epi8 ((__m128i) v);
}

#define CLIB_HAVE_VEC128_MSB_MASK
static_always_inline u16x8
u16x8_byte_swap (u16x8 v)
{
  u8x16 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
  };
  return (u16x8) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
}
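
/* Byte-swap sketch (illustrative): each 16-bit lane has its two bytes
   exchanged, i.e. a per-lane endian swap:

     u16x8 v = u16x8_splat (0x1234);
     v = u16x8_byte_swap (v);   // every lane = 0x3412
*/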
static_always_inline u32x4
u32x4_hadd (u32x4 v1, u32x4 v2)
{
  return (u32x4) _mm_hadd_epi32 ((__m128i) v1, (__m128i) v2);
}

static_always_inline u8x16
u8x16_shuffle (u8x16 v, u8x16 m)
{
  return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) m);
}
static_always_inline u32x4
u32x4_shuffle (u32x4 v, const int a, const int b, const int c, const int d)
{
#if defined(__clang__) || !__OPTIMIZE__
  u32x4 r = { v[a], v[b], v[c], v[d] };
  return r;
#else
  return (u32x4) _mm_shuffle_epi32 ((__m128i) v,
				    a | b << 2 | c << 4 | d << 6);
#endif
}
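
/* Shuffle sketch (illustrative): a..d pick the source lane for result
   lanes 0..3, so (3, 2, 1, 0) reverses the vector:

     u32x4 v = { 10, 11, 12, 13 };
     u32x4 r = u32x4_shuffle (v, 3, 2, 1, 0);   // { 13, 12, 11, 10 }
*/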
/* Widening (zero/sign extending) conversions. */
#define _(f,t,i) \
static_always_inline t \
f##_extend_to_##t (f x) \
{ return (t) _mm_cvt##i ((__m128i) x); }

_(u8x16, u16x8, epu8_epi16)
_(u8x16, u32x4, epu8_epi32)
_(u8x16, u64x2, epu8_epi64)
_(u16x8, u32x4, epu16_epi32)
_(u16x8, u64x2, epu16_epi64)
_(u32x4, u64x2, epu32_epi64)

_(i8x16, i16x8, epi8_epi16)
_(i8x16, i32x4, epi8_epi32)
_(i8x16, i64x2, epi8_epi64)
_(i16x8, i32x4, epi16_epi32)
_(i16x8, i64x2, epi16_epi64)
_(i32x4, i64x2, epi32_epi64)
#undef _
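
/* Extension sketch (illustrative): the low source lanes are
   zero-extended (u variants) or sign-extended (i variants):

     u8x16 v = u8x16_splat (200);
     u16x8 z = u8x16_extend_to_u16x8 (v);          // every lane = 200
     i16x8 s = i8x16_extend_to_i16x8 ((i8x16) v);  // every lane = -56
*/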
static_always_inline u64x2
u64x2_gather (void *p0, void *p1)
{
  u64x2 r = { *(u64 *) p0, *(u64 *) p1 };
  return r;
}

static_always_inline u32x4
u32x4_gather (void *p0, void *p1, void *p2, void *p3)
{
  u32x4 r = { *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3 };
  return r;
}

static_always_inline void
u64x2_scatter (u64x2 r, void *p0, void *p1)
{
  *(u64 *) p0 = r[0];
  *(u64 *) p1 = r[1];
}

static_always_inline void
u32x4_scatter (u32x4 r, void *p0, void *p1, void *p2, void *p3)
{
  *(u32 *) p0 = r[0];
  *(u32 *) p1 = r[1];
  *(u32 *) p2 = r[2];
  *(u32 *) p3 = r[3];
}

static_always_inline void
u64x2_scatter_one (u64x2 r, int index, void *p)
{
  *(u64 *) p = r[index];
}

static_always_inline void
u32x4_scatter_one (u32x4 r, int index, void *p)
{
  *(u32 *) p = r[index];
}
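
/* Gather/scatter sketch (illustrative; a and b are hypothetical u64
   variables at arbitrary addresses):

     u64 a = 1, b = 2;
     u64x2 v = u64x2_gather (&a, &b);              // v = { 1, 2 }
     u64x2_scatter (v + u64x2_splat (1), &a, &b);  // a = 2, b = 3
*/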
#endif /* included_vector_sse2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */