/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
38 #ifndef included_vector_sse2_h
39 #define included_vector_sse2_h
41 #include <vppinfra/error_bootstrap.h> /* for ASSERT */
42 #include <x86intrin.h>
44 /* 128 bit interleaves. */
46 u8x16_interleave_hi (u8x16 a, u8x16 b)
48 return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
52 u8x16_interleave_lo (u8x16 a, u8x16 b)
54 return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
58 u16x8_interleave_hi (u16x8 a, u16x8 b)
60 return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
64 u16x8_interleave_lo (u16x8 a, u16x8 b)
66 return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
70 u32x4_interleave_hi (u32x4 a, u32x4 b)
72 return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
76 u32x4_interleave_lo (u32x4 a, u32x4 b)
78 return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
82 u64x2_interleave_hi (u64x2 a, u64x2 b)
84 return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
88 u64x2_interleave_lo (u64x2 a, u64x2 b)
90 return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
93 /* 64 bit interleaves. */
95 u8x8_interleave_hi (u8x8 a, u8x8 b)
97 return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b);
101 u8x8_interleave_lo (u8x8 a, u8x8 b)
103 return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b);
107 u16x4_interleave_hi (u16x4 a, u16x4 b)
109 return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b);
113 u16x4_interleave_lo (u16x4 a, u16x4 b)
115 return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b);
119 u32x2_interleave_hi (u32x2 a, u32x2 b)
121 return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b);
125 u32x2_interleave_lo (u32x2 a, u32x2 b)
127 return (u32x2) _m_punpckldq ((__m64) a, (__m64) b);
132 u16x8_pack (u16x8 lo, u16x8 hi)
134 return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi);
138 i16x8_pack (i16x8 lo, i16x8 hi)
140 return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi);
144 u32x4_pack (u32x4 lo, u32x4 hi)
146 return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi);
151 u16x4_pack (u16x4 lo, u16x4 hi)
153 return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi);
157 i16x4_pack (i16x4 lo, i16x4 hi)
159 return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi);
163 u32x2_pack (u32x2 lo, u32x2 hi)
165 return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi);
169 i32x2_pack (i32x2 lo, i32x2 hi)
171 return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi);
176 u64x2_read_lo (u64x2 x, u64 * a)
178 return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a);
182 u64x2_read_hi (u64x2 x, u64 * a)
184 return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a);
188 u64x2_write_lo (u64x2 x, u64 * a)
190 _mm_storel_pi ((__m64 *) a, (__m128) x);
194 u64x2_write_hi (u64x2 x, u64 * a)
196 _mm_storeh_pi ((__m64 *) a, (__m128) x);
200 /* Unaligned loads/stores. */
203 always_inline void t##_store_unaligned (t x, t * a) \
204 { _mm_storeu_si128 ((__m128i *) a, (__m128i) x); } \
205 always_inline t t##_load_unaligned (t * a) \
206 { return (t) _mm_loadu_si128 ((__m128i *) a); }
208 _(u8x16) _(u16x8) _(u32x4) _(u64x2) _(i8x16) _(i16x8) _(i32x4) _(i64x2)
210 #define _signed_binop(n,m,f,g) \
212 always_inline u##n##x##m \
213 u##n##x##m##_##f (u##n##x##m x, u##n##x##m y) \
214 { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); } \
217 always_inline i##n##x##m \
218 i##n##x##m##_##f (i##n##x##m x, i##n##x##m y) \
219 { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }
220 /* Addition/subtraction with saturation. */
221 _signed_binop (8, 16, add_saturate, adds_epu)
222 _signed_binop (16, 8, add_saturate, adds_epu)
223 _signed_binop (8, 16, sub_saturate, subs_epu)
224 _signed_binop (16, 8, sub_saturate, subs_epu)
225 /* Multiplication. */
226 always_inline i16x8 i16x8_mul_lo (i16x8 x, i16x8 y)
228 return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
232 u16x8_mul_lo (u16x8 x, u16x8 y)
234 return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
238 i16x8_mul_hi (i16x8 x, i16x8 y)
240 return (i16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
244 u16x8_mul_hi (u16x8 x, u16x8 y)
246 return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
249 /* 128 bit shifts. */
251 #define _(p,a,b,c,f) \
252 always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
253 { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); } \
255 always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
256 { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }
258 _(u, 16, 8, left, sll)
259 _(u, 32, 4, left, sll)
260 _(u, 64, 2, left, sll)
261 _(u, 16, 8, right, srl)
262 _(u, 32, 4, right, srl)
263 _(u, 64, 2, right, srl)
264 _(i, 16, 8, left, sll)
265 _(i, 32, 4, left, sll)
266 _(i, 64, 2, left, sll) _(i, 16, 8, right, sra) _(i, 32, 4, right, sra)
270 u16x4_shift_left (u16x4 x, u16x4 i)
272 return (u16x4) _m_psllw ((__m64) x, (__m64) i);
276 u32x2_shift_left (u32x2 x, u32x2 i)
278 return (u32x2) _m_pslld ((__m64) x, (__m64) i);
282 u16x4_shift_right (u16x4 x, u16x4 i)
284 return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
288 u32x2_shift_right (u32x2 x, u32x2 i)
290 return (u32x2) _m_psrld ((__m64) x, (__m64) i);
294 i16x4_shift_left (i16x4 x, i16x4 i)
296 return (i16x4) _m_psllw ((__m64) x, (__m64) i);
300 i32x2_shift_left (i32x2 x, i32x2 i)
302 return (i32x2) _m_pslld ((__m64) x, (__m64) i);
306 i16x4_shift_right (i16x4 x, i16x4 i)
308 return (i16x4) _m_psraw ((__m64) x, (__m64) i);
312 i32x2_shift_right (i32x2 x, i32x2 i)
314 return (i32x2) _m_psrad ((__m64) x, (__m64) i);
317 #define u8x16_word_shift_left(a,n) (u8x16) _mm_slli_si128((__m128i) a, n)
318 #define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)
320 #define i8x16_word_shift_left(a,n) \
321 ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
322 #define i8x16_word_shift_right(a,n) \
323 ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))
325 #define u16x8_word_shift_left(a,n) \
326 ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
327 #define i16x8_word_shift_left(a,n) \
328 ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
329 #define u16x8_word_shift_right(a,n) \
330 ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
331 #define i16x8_word_shift_right(a,n) \
332 ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
334 #define u32x4_word_shift_left(a,n) \
335 ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
336 #define i32x4_word_shift_left(a,n) \
337 ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
338 #define u32x4_word_shift_right(a,n) \
339 ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
340 #define i32x4_word_shift_right(a,n) \
341 ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
343 #define u64x2_word_shift_left(a,n) \
344 ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
345 #define i64x2_word_shift_left(a,n) \
346 ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
347 #define u64x2_word_shift_right(a,n) \
348 ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
349 #define i64x2_word_shift_right(a,n) \
350 ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
352 /* SSE2 has no rotate instructions: use shifts to simulate them. */
353 #define _(t,n,lr1,lr2) \
354 always_inline t##x##n \
355 t##x##n##_irotate_##lr1 (t##x##n w, int i) \
357 ASSERT (i >= 0 && i <= BITS (t)); \
358 return (t##x##n##_ishift_##lr1 (w, i) \
359 | t##x##n##_ishift_##lr2 (w, BITS (t) - i)); \
362 always_inline t##x##n \
363 t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i) \
365 t##x##n j = t##x##n##_splat (BITS (t)); \
366 return (t##x##n##_shift_##lr1 (w, i) \
367 | t##x##n##_shift_##lr2 (w, j - i)); \
370 _(u16, 8, left, right);
371 _(u16, 8, right, left);
372 _(u32, 4, left, right);
373 _(u32, 4, right, left);
374 _(u64, 2, left, right);
375 _(u64, 2, right, left);
380 #define _(t,n,lr1,lr2) \
381 always_inline t##x##n \
382 t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i) \
384 int m = sizeof (t##x##n) / sizeof (t); \
385 ASSERT (i >= 0 && i < m); \
386 return (t##x##n##_word_shift_##lr1 (w0, i) \
387 | t##x##n##_word_shift_##lr2 (w1, m - i)); \
390 always_inline t##x##n \
391 t##x##n##_word_rotate_##lr1 (t##x##n w0, int i) \
392 { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }
394 _(u8, 16, left, right);
395 _(u8, 16, right, left);
396 _(u16, 8, left, right);
397 _(u16, 8, right, left);
398 _(u32, 4, left, right);
399 _(u32, 4, right, left);
400 _(u64, 2, left, right);
401 _(u64, 2, right, left);
407 u8x16_is_all_zero (u8x16 x)
409 return _mm_testz_si128 ((__m128i) x, (__m128i) x);
413 u16x8_is_all_zero (u16x8 x)
415 return _mm_testz_si128 ((__m128i) x, (__m128i) x);
419 u32x4_is_all_zero (u32x4 x)
421 return _mm_testz_si128 ((__m128i) x, (__m128i) x);
425 u64x2_is_all_zero (u64x2 x)
427 return _mm_testz_si128 ((__m128i) x, (__m128i) x);
430 #define u32x4_select(A,MASK) \
434 asm volatile ("pshufd %[mask], %[x], %[y]" \
435 : /* outputs */ [y] "=x" (_y) \
436 : /* inputs */ [x] "x" (_x), [mask] "i" (MASK)); \
440 #define u32x4_splat_word(x,i) \
441 u32x4_select ((x), (((i) << (2*0)) \
446 /* Extract low order 32 bit word. */
451 asm volatile ("movd %[x], %[result]": /* outputs */ [result] "=r" (result)
452 : /* inputs */ [x] "x" (x));
460 asm volatile ("movd %[x], %[result]": /* outputs */ [result] "=x" (result)
461 : /* inputs */ [x] "r" (x));
468 return (i32x4) u32x4_set0 ((u32) x);
474 return (i32) u32x4_get0 ((u32x4) x);
477 /* Converts all ones/zeros compare mask to bitmap. */
479 u8x16_compare_byte_mask (u8x16 x)
481 return _mm_movemask_epi8 ((__m128i) x);
484 extern u8 u32x4_compare_word_mask_table[256];
487 u32x4_compare_word_mask (u32x4 x)
489 u32 m = u8x16_compare_byte_mask ((u8x16) x);
490 return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
491 | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
495 u8x16_zero_byte_mask (u8x16 x)
498 return u8x16_compare_byte_mask (u8x16_is_equal (x, zero));
502 u16x8_zero_byte_mask (u16x8 x)
505 return u8x16_compare_byte_mask ((u8x16) u16x8_is_equal (x, zero));
509 u32x4_zero_byte_mask (u32x4 x)
512 return u8x16_compare_byte_mask ((u8x16) u32x4_is_equal (x, zero));
516 u8x16_max (u8x16 x, u8x16 y)
518 return (u8x16) _mm_max_epu8 ((__m128i) x, (__m128i) y);
522 u8x16_max_scalar (u8x16 x)
524 x = u8x16_max (x, u8x16_word_shift_right (x, 8));
525 x = u8x16_max (x, u8x16_word_shift_right (x, 4));
526 x = u8x16_max (x, u8x16_word_shift_right (x, 2));
527 x = u8x16_max (x, u8x16_word_shift_right (x, 1));
528 return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
532 u8x16_min (u8x16 x, u8x16 y)
534 return (u8x16) _mm_min_epu8 ((__m128i) x, (__m128i) y);
538 u8x16_min_scalar (u8x16 x)
540 x = u8x16_min (x, u8x16_word_shift_right (x, 8));
541 x = u8x16_min (x, u8x16_word_shift_right (x, 4));
542 x = u8x16_min (x, u8x16_word_shift_right (x, 2));
543 x = u8x16_min (x, u8x16_word_shift_right (x, 1));
544 return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
548 i16x8_max (i16x8 x, i16x8 y)
550 return (i16x8) _mm_max_epi16 ((__m128i) x, (__m128i) y);
554 i16x8_max_scalar (i16x8 x)
556 x = i16x8_max (x, i16x8_word_shift_right (x, 4));
557 x = i16x8_max (x, i16x8_word_shift_right (x, 2));
558 x = i16x8_max (x, i16x8_word_shift_right (x, 1));
559 return _mm_extract_epi16 ((__m128i) x, 0);
563 i16x8_min (i16x8 x, i16x8 y)
565 return (i16x8) _mm_min_epi16 ((__m128i) x, (__m128i) y);
569 i16x8_min_scalar (i16x8 x)
571 x = i16x8_min (x, i16x8_word_shift_right (x, 4));
572 x = i16x8_min (x, i16x8_word_shift_right (x, 2));
573 x = i16x8_min (x, i16x8_word_shift_right (x, 1));
574 return _mm_extract_epi16 ((__m128i) x, 0);
579 #endif /* included_vector_sse2_h */
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */