/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h>	/* for ASSERT */
#include <x86intrin.h>
/* 128 bit interleaves. */
always_inline u8x16
u8x16_interleave_hi (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u8x16
u8x16_interleave_lo (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_hi (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_lo (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_hi (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_lo (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_hi (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_lo (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
}
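
/* Illustrative sketch (the example_* helpers here and below are
   hypothetical, not part of the original API): interleaving with a zero
   vector zero-extends u8 lanes to u16 on a little-endian machine, since
   u8x16_interleave_lo (a, b) produces a[0],b[0],a[1],b[1],... from the
   low 8 bytes of each operand. */
always_inline u16x8
example_u8x16_zero_extend_lo (u8x16 a)
{
  u8x16 zero = { 0 };
  /* Bytes of `a` land in even byte positions, zeros in odd positions. */
  return (u16x8) u8x16_interleave_lo (a, zero);
}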
/* 64 bit interleaves. */
always_inline u8x8
u8x8_interleave_hi (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b);
}

always_inline u8x8
u8x8_interleave_lo (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_hi (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_lo (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_hi (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_lo (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckldq ((__m64) a, (__m64) b);
}
/* 128 bit packs. */
always_inline u8x16
u16x8_pack (u16x8 lo, u16x8 hi)
{
  return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline i8x16
i16x8_pack (i16x8 lo, i16x8 hi)
{
  return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline u16x8
u32x4_pack (u32x4 lo, u32x4 hi)
{
  return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi);
}

/* 64 bit packs. */
always_inline u8x8
u16x4_pack (u16x4 lo, u16x4 hi)
{
  return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi);
}

always_inline i8x8
i16x4_pack (i16x4 lo, i16x4 hi)
{
  return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi);
}

always_inline u16x4
u32x2_pack (u32x2 lo, u32x2 hi)
{
  return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}

always_inline i16x4
i32x2_pack (i32x2 lo, i32x2 hi)
{
  return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}
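
/* Illustrative sketch: u16x8_pack narrows two u16x8 vectors into one u8x16
   with saturation.  Note the underlying packus instruction treats its input
   lanes as *signed* 16-bit values, so 0x01ff clamps to 0xff while 0x8000
   (negative when read as i16) clamps to 0. */
always_inline u8x16
example_narrow_u16_to_u8 (u16x8 lo, u16x8 hi)
{
  return u16x8_pack (lo, hi);
}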
/* Splats: replicate scalar value into vector.  The u64x2 and u32x2 bodies
   follow the same pattern as the others (implied by the i64x2/i32x2 splat
   aliases below). */
always_inline u64x2
u64x2_splat (u64 a)
{
  u64x2 x = { a, a };
  return x;
}

always_inline u32x4
u32x4_splat (u32 a)
{
  u32x4 x = { a, a, a, a };
  return x;
}

always_inline u16x8
u16x8_splat (u16 a)
{
  u16x8 x = { a, a, a, a, a, a, a, a };
  return x;
}

always_inline u8x16
u8x16_splat (u8 a)
{
  u8x16 x = { a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a };
  return x;
}

always_inline u32x2
u32x2_splat (u32 a)
{
  u32x2 x = { a, a };
  return x;
}

always_inline u16x4
u16x4_splat (u16 a)
{
  u16x4 x = { a, a, a, a };
  return x;
}

always_inline u8x8
u8x8_splat (u8 a)
{
  u8x8 x = { a, a, a, a, a, a, a, a };
  return x;
}

#define i64x2_splat u64x2_splat
#define i32x4_splat u32x4_splat
#define i16x8_splat u16x8_splat
#define i8x16_splat u8x16_splat
#define i32x2_splat u32x2_splat
#define i16x4_splat u16x4_splat
#define i8x8_splat u8x8_splat
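
/* Illustrative usage sketch: splats pair naturally with the GCC vector
   extension operators these types support, e.g. broadcasting a constant
   into every lane of an arithmetic expression. */
always_inline u16x8
example_add_constant_to_all_lanes (u16x8 x)
{
  return x + u16x8_splat (0x100);	/* adds 0x100 to all 8 lanes */
}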
always_inline u64x2
u64x2_read_lo (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a);
}

always_inline u64x2
u64x2_read_hi (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a);
}

always_inline void
u64x2_write_lo (u64x2 x, u64 * a)
{
  _mm_storel_pi ((__m64 *) a, (__m128) x);
}

always_inline void
u64x2_write_hi (u64x2 x, u64 * a)
{
  _mm_storeh_pi ((__m64 *) a, (__m128) x);
}
/* Unaligned loads/stores. */

#define _(t)						\
  always_inline void t##_store_unaligned (t x, t * a)	\
  { _mm_storeu_si128 ((__m128i *) a, (__m128i) x); }	\
  always_inline t t##_load_unaligned (t * a)		\
  { return (t) _mm_loadu_si128 ((__m128i *) a); }

_(u8x16) _(u16x8) _(u32x4) _(u64x2) _(i8x16) _(i16x8) _(i32x4) _(i64x2)
#undef _
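
/* Illustrative sketch: the generated load/store pairs permit 16-byte moves
   through pointers with no alignment guarantee. */
always_inline void
example_copy_16_bytes_unaligned (u8x16 * dst, u8x16 * src)
{
  u8x16_store_unaligned (u8x16_load_unaligned (src), dst);
}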
#define _signed_binop(n,m,f,g)						\
  /* Unsigned */							\
  always_inline u##n##x##m						\
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y)				\
  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }	\
									\
  /* Signed */								\
  always_inline i##n##x##m						\
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y)				\
  { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }
/* Addition/subtraction. */
_signed_binop (8, 16, add, add_epi)
_signed_binop (16, 8, add, add_epi)
_signed_binop (32, 4, add, add_epi)
_signed_binop (64, 2, add, add_epi)
_signed_binop (8, 16, sub, sub_epi)
_signed_binop (16, 8, sub, sub_epi)
_signed_binop (32, 4, sub, sub_epi)
_signed_binop (64, 2, sub, sub_epi)
/* Addition/subtraction with saturation. */
_signed_binop (8, 16, add_saturate, adds_epu)
_signed_binop (16, 8, add_saturate, adds_epu)
_signed_binop (8, 16, sub_saturate, subs_epu)
_signed_binop (16, 8, sub_saturate, subs_epu)
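
/* Note: as written above (and in the original header), the signed
   i8x16/i16x8 saturating forms also expand to the unsigned-saturating
   adds_epu/subs_epu intrinsics.  Illustrative sketch of the unsigned
   semantics: */
always_inline u8x16
example_saturating_increment (u8x16 x)
{
  /* Lanes already at 0xff stay at 0xff instead of wrapping to 0. */
  return u8x16_add_saturate (x, u8x16_splat (1));
}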
/* Multiplication. */
always_inline i16x8
i16x8_mul_lo (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_lo (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_mul_hi (i16x8 x, i16x8 y)
{
  /* Signed high multiply must use the signed intrinsic; the unsigned
     _mm_mulhi_epu16 gives wrong results for negative operands. */
  return (i16x8) _mm_mulhi_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_hi (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
}
/* 128 bit shifts. */

#define _(p,a,b,c,f)							\
  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); }		\
									\
  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_(u, 16, 8, left, sll)
_(u, 32, 4, left, sll)
_(u, 64, 2, left, sll)
_(u, 16, 8, right, srl)
_(u, 32, 4, right, srl)
_(u, 64, 2, right, srl)
_(i, 16, 8, left, sll)
_(i, 32, 4, left, sll)
_(i, 64, 2, left, sll)
_(i, 16, 8, right, sra)
_(i, 32, 4, right, sra)

#undef _
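
/* Illustrative sketch: the _ishift_* forms take an immediate count while
   the _shift_* forms take the count in a vector register (the instruction
   uses only its low 64 bits). */
always_inline u32x4
example_halve_each_lane (u32x4 x)
{
  return u32x4_ishift_right (x, 1);	/* logical >> 1 in every u32 lane */
}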
/* 64 bit shifts. */
always_inline u16x4
u16x4_shift_left (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_left (u32x2 x, u32x2 i)
{
  return (u32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline u16x4
u16x4_shift_right (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_right (u32x2 x, u32x2 i)
{
  return (u32x2) _m_psrld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_left (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_left (i32x2 x, i32x2 i)
{
  return (i32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_right (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psraw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_right (i32x2 x, i32x2 i)
{
  return (i32x2) _m_psrad ((__m64) x, (__m64) i);
}
#define u8x16_word_shift_left(a,n)  (u8x16) _mm_slli_si128((__m128i) a, n)
#define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
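
/* Illustrative sketch: the word_shift macros move the whole 128-bit
   register by N elements, shifting in zeros; e.g. shifting {1,2,3,4}
   right by one u32 element yields {2,3,4,0} on a little-endian machine. */
always_inline u32x4
example_drop_lowest_lane (u32x4 x)
{
  return u32x4_word_shift_right (x, 1);
}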
/* SSE2 has no rotate instructions: use shifts to simulate them. */
#define _(t,n,lr1,lr2)						\
  always_inline t##x##n						\
  t##x##n##_irotate_##lr1 (t##x##n w, int i)			\
  {								\
    ASSERT (i >= 0 && i <= BITS (t));				\
    return (t##x##n##_ishift_##lr1 (w, i)			\
	    | t##x##n##_ishift_##lr2 (w, BITS (t) - i));	\
  }								\
								\
  always_inline t##x##n						\
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i)			\
  {								\
    t##x##n j = t##x##n##_splat (BITS (t));			\
    return (t##x##n##_shift_##lr1 (w, i)			\
	    | t##x##n##_shift_##lr2 (w, j - i));		\
  }

_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
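
/* Illustrative sketch: each rotate above is two opposing shifts OR-ed
   together; rotating every u32 lane left by 8 bits carries the top byte of
   each lane around to the bottom. */
always_inline u32x4
example_rotate_lanes_left_8 (u32x4 x)
{
  return u32x4_irotate_left (x, 8);
}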
#define _(t,n,lr1,lr2)						\
  always_inline t##x##n						\
  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i)	\
  {								\
    int m = sizeof (t##x##n) / sizeof (t);			\
    ASSERT (i >= 0 && i < m);					\
    return (t##x##n##_word_shift_##lr1 (w0, i)			\
	    | t##x##n##_word_shift_##lr2 (w1, m - i));		\
  }								\
								\
  always_inline t##x##n						\
  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i)		\
  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }

_(u8, 16, left, right);
_(u8, 16, right, left);
_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
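
/* Illustrative sketch: word_rotate2 combines shifted pieces of two
   registers, which can be used to extract an element-aligned window that
   straddles a register boundary. */
always_inline u8x16
example_window_across_registers (u8x16 w0, u8x16 w1, int i)
{
  /* Result is bytes w0[i..15] followed by w1[0..i-1]. */
  return u8x16_word_rotate2_right (w0, w1, i);
}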
/* Compare operations. */
always_inline u8x16
u8x16_is_equal (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) y);
}

always_inline i8x16
i8x16_is_equal (i8x16 x, i8x16 y)
{
  return (i8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_is_equal (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_cmpeq_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_is_equal (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_cmpeq_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u32x4
u32x4_is_equal (u32x4 x, u32x4 y)
{
  return (u32x4) _mm_cmpeq_epi32 ((__m128i) x, (__m128i) y);
}

always_inline i32x4
i32x4_is_equal (i32x4 x, i32x4 y)
{
  return (i32x4) _mm_cmpeq_epi32 ((__m128i) x, (__m128i) y);
}

always_inline u8x16
i8x16_is_greater (i8x16 x, i8x16 y)
{
  return (u8x16) _mm_cmpgt_epi8 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
i16x8_is_greater (i16x8 x, i16x8 y)
{
  return (u16x8) _mm_cmpgt_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u32x4
i32x4_is_greater (i32x4 x, i32x4 y)
{
  return (u32x4) _mm_cmpgt_epi32 ((__m128i) x, (__m128i) y);
}
always_inline u8x16
u8x16_is_zero (u8x16 x)
{
  u8x16 zero = { 0 };
  return u8x16_is_equal (x, zero);
}

always_inline u16x8
u16x8_is_zero (u16x8 x)
{
  u16x8 zero = { 0 };
  return u16x8_is_equal (x, zero);
}

always_inline u32x4
u32x4_is_zero (u32x4 x)
{
  u32x4 zero = { 0 };
  return u32x4_is_equal (x, zero);
}
#define u32x4_select(A,MASK)						\
({									\
  u32x4 _x, _y;								\
  _x = (A);								\
  asm volatile ("pshufd %[mask], %[x], %[y]"				\
		: /* outputs */ [y] "=x" (_y)				\
		: /* inputs */ [x] "x" (_x), [mask] "i" (MASK));	\
  _y;									\
})

#define u32x4_splat_word(x,i)			\
  u32x4_select ((x), (((i) << (2*0))		\
		      | ((i) << (2*1))		\
		      | ((i) << (2*2))		\
		      | ((i) << (2*3))))
/* Extract low order 32 bit word. */
always_inline u32
u32x4_get0 (u32x4 x)
{
  u32 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=r" (result)
		: /* inputs */ [x] "x" (x));
  return result;
}

always_inline u32x4
u32x4_set0 (u32 x)
{
  u32x4 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=x" (result)
		: /* inputs */ [x] "r" (x));
  return result;
}

always_inline i32x4
i32x4_set0 (i32 x)
{
  return (i32x4) u32x4_set0 ((u32) x);
}

always_inline i32
i32x4_get0 (i32x4 x)
{
  return (i32) u32x4_get0 ((u32x4) x);
}
/* Converts all ones/zeros compare mask to bitmap. */
always_inline u32
u8x16_compare_byte_mask (u8x16 x)
{
  return _mm_movemask_epi8 ((__m128i) x);
}

extern u8 u32x4_compare_word_mask_table[256];

always_inline u32
u32x4_compare_word_mask (u32x4 x)
{
  u32 m = u8x16_compare_byte_mask ((u8x16) x);
  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
	  | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
}

always_inline u32
u8x16_zero_byte_mask (u8x16 x)
{
  u8x16 zero = { 0 };
  return u8x16_compare_byte_mask (u8x16_is_equal (x, zero));
}

always_inline u32
u16x8_zero_byte_mask (u16x8 x)
{
  u16x8 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) u16x8_is_equal (x, zero));
}

always_inline u32
u32x4_zero_byte_mask (u32x4 x)
{
  u32x4 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) u32x4_is_equal (x, zero));
}
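
/* Illustrative sketch: the scalar bitmap from a compare can drive a
   find-first-set scan, e.g. locating the first zero byte in a register. */
always_inline u32
example_first_zero_byte_index (u8x16 x)
{
  u32 m = u8x16_zero_byte_mask (x);	/* bit i set iff byte i of x is 0 */
  return m ? (u32) __builtin_ctz (m) : 16;
}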
always_inline u8x16
u8x16_max (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_max_epu8 ((__m128i) x, (__m128i) y);
}

always_inline u8
u8x16_max_scalar (u8x16 x)
{
  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline u8x16
u8x16_min (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_min_epu8 ((__m128i) x, (__m128i) y);
}

always_inline u8
u8x16_min_scalar (u8x16 x)
{
  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline i16x8
i16x8_max (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_max_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16
i16x8_max_scalar (i16x8 x)
{
  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

always_inline i16x8
i16x8_min (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_min_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16
i16x8_min_scalar (i16x8 x)
{
  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}
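
/* Illustrative note: the *_scalar reductions above fold the register onto
   itself with halving word shifts (8, 4, 2, 1 elements for bytes), so after
   log2(16) = 4 steps lane 0 holds the max/min of all lanes. */
always_inline u8
example_max_byte (u8x16 x)
{
  return u8x16_max_scalar (x);
}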
#endif /* included_vector_sse2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */