/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h>	/* for ASSERT */
#include <x86intrin.h>
/* 128 bit interleaves. */
always_inline u8x16
u8x16_interleave_hi (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u8x16
u8x16_interleave_lo (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_hi (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_lo (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_hi (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_lo (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_hi (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_lo (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
}
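
/* Illustrative sketch (not part of the original API): interleave_lo
   pairs corresponding lanes from the low halves of its two operands. */
always_inline u8x16
u8x16_interleave_example (void)
{
  u8x16 a = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  u8x16 b = { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
  /* Yields { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 }. */
  return u8x16_interleave_lo (a, b);
}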
/* 64 bit interleaves. */
always_inline u8x8
u8x8_interleave_hi (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b);
}

always_inline u8x8
u8x8_interleave_lo (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_hi (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_lo (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_hi (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_lo (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckldq ((__m64) a, (__m64) b);
}
/* 128 bit packs. */
always_inline u8x16
u16x8_pack (u16x8 lo, u16x8 hi)
{
  return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline i8x16
i16x8_pack (i16x8 lo, i16x8 hi)
{
  return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline u16x8
u32x4_pack (u32x4 lo, u32x4 hi)
{
  return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi);
}

/* 64 bit packs. */
always_inline u8x8
u16x4_pack (u16x4 lo, u16x4 hi)
{
  return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi);
}

always_inline i8x8
i16x4_pack (i16x4 lo, i16x4 hi)
{
  return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi);
}

always_inline u16x4
u32x2_pack (u32x2 lo, u32x2 hi)
{
  return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}

always_inline i16x4
i32x2_pack (i32x2 lo, i32x2 hi)
{
  return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}
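
/* Illustrative sketch (hypothetical helper): packing narrows two vectors
   into one of half the lane width with saturation.  Note that
   _mm_packus_epi16 treats its inputs as *signed* 16 bit values, so
   0xffff (-1) clamps to 0 while 256 clamps to 255. */
always_inline u8x16
u16x8_pack_example (void)
{
  u16x8 lo = { 0, 1, 255, 256, 0x1ff, 0xffff, 7, 8 };
  u16x8 hi = { 0, 0, 0, 0, 0, 0, 0, 0 };
  /* Lanes 0-7 from lo: { 0, 1, 255, 255, 255, 0, 7, 8 }; lanes 8-15 from hi. */
  return u16x8_pack (lo, hi);
}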
/* Splats: replicate scalar value into vector. */
always_inline u64x2
u64x2_splat (u64 a)
{
  u64x2 x = { a };
  x = u64x2_interleave_lo (x, x);
  return x;
}

always_inline u32x4
u32x4_splat (u32 a)
{
  u32x4 x = { a };
  x = u32x4_interleave_lo (x, x);
  x = (u32x4) u64x2_interleave_lo ((u64x2) x, (u64x2) x);
  return x;
}

always_inline u16x8
u16x8_splat (u16 a)
{
  u32 t = (u32) a | ((u32) a << 16);
  return (u16x8) u32x4_splat (t);
}

always_inline u8x16
u8x16_splat (u8 a)
{
  u32 t = (u32) a | ((u32) a << 8);
  return (u8x16) u16x8_splat (t);
}

always_inline u32x2
u32x2_splat (u32 a)
{
  u32x2 x = { a };
  x = u32x2_interleave_lo (x, x);
  return x;
}

always_inline u16x4
u16x4_splat (u16 a)
{
  u32 t = (u32) a | ((u32) a << 16);
  return (u16x4) u32x2_splat (t);
}

always_inline u8x8
u8x8_splat (u8 a)
{
  u32 t = (u32) a | ((u32) a << 8);
  t |= t << 16;
  return (u8x8) u32x2_splat (t);
}

#define i64x2_splat u64x2_splat
#define i32x4_splat u32x4_splat
#define i16x8_splat u16x8_splat
#define i8x16_splat u8x16_splat
#define i32x2_splat u32x2_splat
#define i16x4_splat u16x4_splat
#define i8x8_splat u8x8_splat
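
/* Illustrative sketch (hypothetical helper): splat replicates a scalar
   into every lane, e.g. to build a constant mask. */
always_inline u8x16
u8x16_splat_example (void)
{
  return u8x16_splat (0x80);	/* { 0x80, 0x80, ..., 0x80 } */
}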
always_inline u64x2
u64x2_read_lo (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a);
}

always_inline u64x2
u64x2_read_hi (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a);
}

always_inline void
u64x2_write_lo (u64x2 x, u64 * a)
{
  _mm_storel_pi ((__m64 *) a, (__m128) x);
}

always_inline void
u64x2_write_hi (u64x2 x, u64 * a)
{
  _mm_storeh_pi ((__m64 *) a, (__m128) x);
}
/* Unaligned loads/stores. */

#define _(t)						\
  always_inline void t##_store_unaligned (t x, t * a)	\
  { _mm_storeu_si128 ((__m128i *) a, (__m128i) x); }	\
							\
  always_inline t t##_load_unaligned (t * a)		\
  { return (t) _mm_loadu_si128 ((__m128i *) a); }

_(u8x16) _(u16x8) _(u32x4) _(u64x2) _(i8x16) _(i16x8) _(i32x4) _(i64x2)
#undef _
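
/* Illustrative sketch (hypothetical helper): round-trip 16 bytes between
   arbitrarily aligned buffers via the unaligned load/store pair. */
always_inline void
u8x16_copy_unaligned_example (u8x16 * dst, u8x16 * src)
{
  u8x16_store_unaligned (u8x16_load_unaligned (src), dst);
}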
#define _signed_binop(n,m,f,g)						\
  /* Unsigned */							\
  always_inline u##n##x##m						\
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y)				\
  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }	\
									\
  /* Signed */								\
  always_inline i##n##x##m						\
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y)				\
  { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }

/* Addition/subtraction. */
_signed_binop (8, 16, add, add_epi)
_signed_binop (16, 8, add, add_epi)
_signed_binop (32, 4, add, add_epi)
_signed_binop (64, 2, add, add_epi)
_signed_binop (8, 16, sub, sub_epi)
_signed_binop (16, 8, sub, sub_epi)
_signed_binop (32, 4, sub, sub_epi)
_signed_binop (64, 2, sub, sub_epi)

/* Addition/subtraction with saturation.  Note: both the signed and the
   unsigned variants expand to the unsigned saturating intrinsics. */
_signed_binop (8, 16, add_saturate, adds_epu)
_signed_binop (16, 8, add_saturate, adds_epu)
_signed_binop (8, 16, sub_saturate, subs_epu)
_signed_binop (16, 8, sub_saturate, subs_epu)
/* Multiplication. */
always_inline i16x8
i16x8_mul_lo (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_lo (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_mul_hi (i16x8 x, i16x8 y)
{
  /* Signed high multiply needs the signed intrinsic. */
  return (i16x8) _mm_mulhi_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_hi (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
}
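
/* Illustrative sketch (hypothetical helper): mul_lo/mul_hi return the low
   and high halves of each 16x16 -> 32 bit product; interleaving them
   reconstructs the full products of the low four lanes (little-endian). */
always_inline u32x4
u16x8_mul_widen_lo_example (u16x8 x, u16x8 y)
{
  u16x8 lo = u16x8_mul_lo (x, y);
  u16x8 hi = u16x8_mul_hi (x, y);
  return (u32x4) u16x8_interleave_lo (lo, hi);
}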
/* 128 bit shifts. */

#define _(p,a,b,c,f)							\
  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); }		\
									\
  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_(u, 16, 8, left, sll)
_(u, 32, 4, left, sll)
_(u, 64, 2, left, sll)
_(u, 16, 8, right, srl)
_(u, 32, 4, right, srl)
_(u, 64, 2, right, srl)
_(i, 16, 8, left, sll)
_(i, 32, 4, left, sll)
_(i, 64, 2, left, sll)
_(i, 16, 8, right, sra)
_(i, 32, 4, right, sra)
/* No i64x2 arithmetic right shift: SSE2 has no 64 bit psra. */
#undef _
/* 64 bit shifts. */
always_inline u16x4
u16x4_shift_left (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_left (u32x2 x, u32x2 i)
{
  return (u32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline u16x4
u16x4_shift_right (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_right (u32x2 x, u32x2 i)
{
  return (u32x2) _m_psrld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_left (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_left (i32x2 x, i32x2 i)
{
  return (i32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_right (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psraw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_right (i32x2 x, i32x2 i)
{
  return (i32x2) _m_psrad ((__m64) x, (__m64) i);
}
#define u8x16_word_shift_left(a,n)  (u8x16) _mm_slli_si128((__m128i) a, n)
#define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
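
/* Illustrative sketch (hypothetical helper): word shifts move whole lanes
   across the vector rather than bits within lanes. */
always_inline u32x4
u32x4_word_shift_example (void)
{
  u32x4 x = { 1, 2, 3, 4 };
  /* Each lane moves up one position and a zero shifts in: { 0, 1, 2, 3 }. */
  return u32x4_word_shift_left (x, 1);
}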
/* SSE2 has no rotate instructions: use shifts to simulate them. */

#define _(t,n,lr1,lr2)						\
  always_inline t##x##n						\
  t##x##n##_irotate_##lr1 (t##x##n w, int i)			\
  {								\
    ASSERT (i >= 0 && i <= BITS (t));				\
    return (t##x##n##_ishift_##lr1 (w, i)			\
	    | t##x##n##_ishift_##lr2 (w, BITS (t) - i));	\
  }								\
								\
  always_inline t##x##n						\
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i)			\
  {								\
    t##x##n j = t##x##n##_splat (BITS (t));			\
    return (t##x##n##_shift_##lr1 (w, i)			\
	    | t##x##n##_shift_##lr2 (w, j - i));		\
  }

_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
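
/* Illustrative sketch (hypothetical helper): rotating each 32 bit lane
   left by 8 moves its top byte to the bottom, e.g. 0x11223344 becomes
   0x22334411 in every lane. */
always_inline u32x4
u32x4_irotate_example (u32x4 x)
{
  return u32x4_irotate_left (x, 8);
}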
#define _(t,n,lr1,lr2)						\
  always_inline t##x##n						\
  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i)	\
  {								\
    int m = sizeof (t##x##n) / sizeof (t);			\
    ASSERT (i >= 0 && i < m);					\
    return (t##x##n##_word_shift_##lr1 (w0, i)			\
	    | t##x##n##_word_shift_##lr2 (w1, m - i));		\
  }								\
								\
  always_inline t##x##n						\
  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i)		\
  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }

_(u8, 16, left, right);
_(u8, 16, right, left);
_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
/* Compare operations. */
always_inline u8x16
u8x16_is_equal (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) y);
}

always_inline i8x16
i8x16_is_equal (i8x16 x, i8x16 y)
{
  return (i8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_is_equal (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_cmpeq_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_is_equal (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_cmpeq_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u32x4
u32x4_is_equal (u32x4 x, u32x4 y)
{
  return (u32x4) _mm_cmpeq_epi32 ((__m128i) x, (__m128i) y);
}

always_inline i32x4
i32x4_is_equal (i32x4 x, i32x4 y)
{
  return (i32x4) _mm_cmpeq_epi32 ((__m128i) x, (__m128i) y);
}

always_inline u8x16
i8x16_is_greater (i8x16 x, i8x16 y)
{
  return (u8x16) _mm_cmpgt_epi8 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
i16x8_is_greater (i16x8 x, i16x8 y)
{
  return (u16x8) _mm_cmpgt_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u32x4
i32x4_is_greater (i32x4 x, i32x4 y)
{
  return (u32x4) _mm_cmpgt_epi32 ((__m128i) x, (__m128i) y);
}
always_inline u8x16
u8x16_is_zero (u8x16 x)
{
  u8x16 zero = { 0 };
  return u8x16_is_equal (x, zero);
}

always_inline u16x8
u16x8_is_zero (u16x8 x)
{
  u16x8 zero = { 0 };
  return u16x8_is_equal (x, zero);
}

always_inline u32x4
u32x4_is_zero (u32x4 x)
{
  u32x4 zero = { 0 };
  return u32x4_is_equal (x, zero);
}
#define u32x4_select(A,MASK)						\
({									\
  u32x4 _x, _y;								\
  _x = (A);								\
  asm volatile ("pshufd %[mask], %[x], %[y]"				\
		: /* outputs */ [y] "=x" (_y)				\
		: /* inputs */ [x] "x" (_x), [mask] "i" (MASK));	\
  _y;									\
})

#define u32x4_splat_word(x,i)			\
  u32x4_select ((x), (((i) << (2*0))		\
		      | ((i) << (2*1))		\
		      | ((i) << (2*2))		\
		      | ((i) << (2*3))))
/* Extract low order 32 bit word. */
always_inline u32
u32x4_get0 (u32x4 x)
{
  u32 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=r" (result)
		: /* inputs */ [x] "x" (x));
  return result;
}

always_inline u32x4
u32x4_set0 (u32 x)
{
  u32x4 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=x" (result)
		: /* inputs */ [x] "r" (x));
  return result;
}

always_inline i32x4
i32x4_set0 (i32 x)
{
  return (i32x4) u32x4_set0 ((u32) x);
}

always_inline i32
i32x4_get0 (i32x4 x)
{
  return (i32) u32x4_get0 ((u32x4) x);
}
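
/* Illustrative sketch (hypothetical helper): set0/get0 move a scalar into
   and out of lane 0 through registers, without touching memory. */
always_inline u32
u32x4_set_get_example (u32 a)
{
  return u32x4_get0 (u32x4_set0 (a));	/* returns a */
}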
/* Converts all ones/zeros compare mask to bitmap. */
always_inline u32
u8x16_compare_byte_mask (u8x16 x)
{
  return _mm_movemask_epi8 ((__m128i) x);
}

extern u8 u32x4_compare_word_mask_table[256];

always_inline u32
u32x4_compare_word_mask (u32x4 x)
{
  u32 m = u8x16_compare_byte_mask ((u8x16) x);
  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
	  | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
}
always_inline u32
u8x16_zero_byte_mask (u8x16 x)
{
  u8x16 zero = { 0 };
  return u8x16_compare_byte_mask (u8x16_is_equal (x, zero));
}

always_inline u32
u16x8_zero_byte_mask (u16x8 x)
{
  u16x8 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) u16x8_is_equal (x, zero));
}

always_inline u32
u32x4_zero_byte_mask (u32x4 x)
{
  u32x4 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) u32x4_is_equal (x, zero));
}
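
/* Illustrative sketch (hypothetical helper): a common string-scan idiom,
   returning the index of the first zero byte in a 16 byte block, or 16
   if every byte is non-zero. */
always_inline u32
u8x16_first_zero_byte_example (u8x16 x)
{
  u32 m = u8x16_zero_byte_mask (x);
  return m ? (u32) __builtin_ctz (m) : 16;
}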
always_inline u8x16
u8x16_max (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_max_epu8 ((__m128i) x, (__m128i) y);
}

always_inline u8
u8x16_max_scalar (u8x16 x)
{
  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline u8x16
u8x16_min (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_min_epu8 ((__m128i) x, (__m128i) y);
}

always_inline u8
u8x16_min_scalar (u8x16 x)
{
  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}
always_inline i16x8
i16x8_max (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_max_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16
i16x8_max_scalar (i16x8 x)
{
  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

always_inline i16x8
i16x8_min (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_min_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16
i16x8_min_scalar (i16x8 x)
{
  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}
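
/* Illustrative sketch (hypothetical helper): the scalar reductions above
   fold the vector in half repeatedly, so e.g. the dynamic range of 16
   bytes costs only a handful of instructions. */
always_inline u8
u8x16_range_example (u8x16 x)
{
  return u8x16_max_scalar (x) - u8x16_min_scalar (x);
}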
#endif /* included_vector_sse2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */