/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h>	/* for ASSERT */
#include <x86intrin.h>

/* 128 bit interleaves. */
always_inline u8x16 u8x16_interleave_hi (u8x16 a, u8x16 b)
{ return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b); }

always_inline u8x16 u8x16_interleave_lo (u8x16 a, u8x16 b)
{ return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b); }

always_inline u16x8 u16x8_interleave_hi (u16x8 a, u16x8 b)
{ return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b); }

always_inline u16x8 u16x8_interleave_lo (u16x8 a, u16x8 b)
{ return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b); }

always_inline u32x4 u32x4_interleave_hi (u32x4 a, u32x4 b)
{ return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b); }

always_inline u32x4 u32x4_interleave_lo (u32x4 a, u32x4 b)
{ return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b); }

always_inline u64x2 u64x2_interleave_hi (u64x2 a, u64x2 b)
{ return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b); }

always_inline u64x2 u64x2_interleave_lo (u64x2 a, u64x2 b)
{ return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b); }

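/* Example (illustrative sketch, not part of the original API): mixing a
   vector with zeros widens lanes, e.g. zero-extending bytes to shorts:

     u8x16 x = ...;
     u8x16 zero = {0};
     u16x8 lo = (u16x8) u8x16_interleave_lo (x, zero);   // bytes 0..7 zero-extended
     u16x8 hi = (u16x8) u8x16_interleave_hi (x, zero);   // bytes 8..15 zero-extended
*/
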
/* 64 bit interleaves. */
always_inline u8x8 u8x8_interleave_hi (u8x8 a, u8x8 b)
{ return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b); }

always_inline u8x8 u8x8_interleave_lo (u8x8 a, u8x8 b)
{ return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b); }

always_inline u16x4 u16x4_interleave_hi (u16x4 a, u16x4 b)
{ return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b); }

always_inline u16x4 u16x4_interleave_lo (u16x4 a, u16x4 b)
{ return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b); }

always_inline u32x2 u32x2_interleave_hi (u32x2 a, u32x2 b)
{ return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b); }

always_inline u32x2 u32x2_interleave_lo (u32x2 a, u32x2 b)
{ return (u32x2) _m_punpckldq ((__m64) a, (__m64) b); }

/* 128 bit packs. */
always_inline u8x16 u16x8_pack (u16x8 lo, u16x8 hi)
{ return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi); }

always_inline i8x16 i16x8_pack (i16x8 lo, i16x8 hi)
{ return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi); }

always_inline u16x8 u32x4_pack (u32x4 lo, u32x4 hi)
{ return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi); }

/* 64 bit packs. */
always_inline u8x8 u16x4_pack (u16x4 lo, u16x4 hi)
{ return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi); }

always_inline i8x8 i16x4_pack (i16x4 lo, i16x4 hi)
{ return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi); }

always_inline u16x4 u32x2_pack (u32x2 lo, u32x2 hi)
{ return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi); }

always_inline i16x4 i32x2_pack (i32x2 lo, i32x2 hi)
{ return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi); }

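/* Example (illustrative sketch, not part of the original API):
   u16x8_pack narrows two u16x8 vectors into one u8x16 with unsigned
   saturation, undoing the widening interleave above:

     u16x8 lo = ..., hi = ...;
     u8x16 b = u16x8_pack (lo, hi);   // lanes greater than 255 clamp to 255
*/
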
/* Splats: replicate scalar value into vector. */
always_inline u64x2 u64x2_splat (u64 a)
{
  u64x2 x = {a};
  x = u64x2_interleave_lo (x, x);
  return x;
}

always_inline u32x4 u32x4_splat (u32 a)
{
  u32x4 x = {a};
  x = u32x4_interleave_lo (x, x);
  x = (u32x4) u64x2_interleave_lo ((u64x2) x, (u64x2) x);
  return x;
}

always_inline u16x8 u16x8_splat (u16 a)
{
  u32 t = (u32) a | ((u32) a << 16);
  return (u16x8) u32x4_splat (t);
}

always_inline u8x16 u8x16_splat (u8 a)
{
  u32 t = (u32) a | ((u32) a << 8);
  t |= t << 16;
  return (u8x16) u16x8_splat (t);
}

always_inline u32x2 u32x2_splat (u32 a)
{
  u32x2 x = {a};
  x = u32x2_interleave_lo (x, x);
  return x;
}

always_inline u16x4 u16x4_splat (u16 a)
{
  u32 t = (u32) a | ((u32) a << 16);
  return (u16x4) u32x2_splat (t);
}

always_inline u8x8 u8x8_splat (u8 a)
{
  u32 t = (u32) a | ((u32) a << 8);
  t |= t << 16;
  return (u8x8) u32x2_splat (t);
}

#define i64x2_splat u64x2_splat
#define i32x4_splat u32x4_splat
#define i16x8_splat u16x8_splat
#define i8x16_splat u8x16_splat
#define i32x2_splat u32x2_splat
#define i16x4_splat u16x4_splat
#define i8x8_splat u8x8_splat

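/* Example (illustrative sketch, not part of the original API): splats
   build the constant operand for the lane-wise operations below:

     u16x8 k = u16x8_splat (0x1234);   // all 8 lanes hold 0x1234
     u8x16 b = u8x16_splat (0xff);     // all 16 lanes hold 0xff
*/
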
always_inline u64x2 u64x2_read_lo (u64x2 x, u64 * a)
{ return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a); }

always_inline u64x2 u64x2_read_hi (u64x2 x, u64 * a)
{ return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a); }

always_inline void u64x2_write_lo (u64x2 x, u64 * a)
{ _mm_storel_pi ((__m64 *) a, (__m128) x); }

always_inline void u64x2_write_hi (u64x2 x, u64 * a)
{ _mm_storeh_pi ((__m64 *) a, (__m128) x); }

/* Unaligned loads/stores. */

#define _(t)						\
  always_inline void t##_store_unaligned (t x, t * a)	\
  { _mm_storeu_si128 ((__m128i *) a, (__m128i) x); }	\
  always_inline t t##_load_unaligned (t * a)		\
  { return (t) _mm_loadu_si128 ((__m128i *) a); }

_ (u8x16)
_ (u16x8)
_ (u32x4)
_ (u64x2)
_ (i8x16)
_ (i16x8)
_ (i32x4)
_ (i64x2)

#undef _

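/* Example (illustrative sketch, not part of the original API): use the
   unaligned forms for data whose 16-byte alignment is not guaranteed,
   e.g. fields inside a packet buffer:

     u8 *p = ...;                                    // arbitrary alignment
     u8x16 v = u8x16_load_unaligned ((u8x16 *) p);
     ...
     u8x16_store_unaligned (v, (u8x16 *) p);
*/
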
#define _signed_binop(n,m,f,g)						\
  /* Unsigned */							\
  always_inline u##n##x##m						\
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y)				\
  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }	\
									\
  /* Signed */								\
  always_inline i##n##x##m						\
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y)				\
  { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }

/* Addition/subtraction. */
_signed_binop (8, 16, add, add_epi)
_signed_binop (16, 8, add, add_epi)
_signed_binop (32, 4, add, add_epi)
_signed_binop (64, 2, add, add_epi)
_signed_binop (8, 16, sub, sub_epi)
_signed_binop (16, 8, sub, sub_epi)
_signed_binop (32, 4, sub, sub_epi)
_signed_binop (64, 2, sub, sub_epi)

/* Addition/subtraction with saturation.
   Note: both the unsigned and the signed wrappers generated here expand
   to the unsigned saturating intrinsics (adds_epu/subs_epu). */
_signed_binop (8, 16, add_saturate, adds_epu)
_signed_binop (16, 8, add_saturate, adds_epu)
_signed_binop (8, 16, sub_saturate, subs_epu)
_signed_binop (16, 8, sub_saturate, subs_epu)

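/* Example (illustrative sketch, not part of the original API): unsigned
   saturating subtract clamps at zero instead of wrapping, giving a
   branch-free "subtract, but not below zero":

     u16x8 a = u16x8_splat (10), b = u16x8_splat (25);
     u16x8 d = u16x8_sub_saturate (a, b);   // every lane is 0, not 65521
*/
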
/* Multiplication. */
always_inline i16x8 i16x8_mul_lo (i16x8 x, i16x8 y)
{ return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y); }

always_inline u16x8 u16x8_mul_lo (u16x8 x, u16x8 y)
{ return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y); }

always_inline i16x8 i16x8_mul_hi (i16x8 x, i16x8 y)
{ return (i16x8) _mm_mulhi_epi16 ((__m128i) x, (__m128i) y); }

always_inline u16x8 u16x8_mul_hi (u16x8 x, u16x8 y)
{ return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y); }

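/* Example (illustrative sketch, not part of the original API): a full
   16 x 16 -> 32 bit product can be assembled from the low and high
   halves plus an interleave (little-endian lane order):

     u16x8 lo = u16x8_mul_lo (x, y), hi = u16x8_mul_hi (x, y);
     u32x4 p03 = (u32x4) u16x8_interleave_lo (lo, hi);   // products of lanes 0..3
     u32x4 p47 = (u32x4) u16x8_interleave_hi (lo, hi);   // products of lanes 4..7
*/
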
/* 128 bit shifts. */

#define _(p,a,b,c,f)							\
  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); }		\
									\
  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_ (u, 16, 8, left, sll)
_ (u, 32, 4, left, sll)
_ (u, 64, 2, left, sll)
_ (u, 16, 8, right, srl)
_ (u, 32, 4, right, srl)
_ (u, 64, 2, right, srl)
_ (i, 16, 8, left, sll)
_ (i, 32, 4, left, sll)
_ (i, 64, 2, left, sll)
_ (i, 16, 8, right, sra)
_ (i, 32, 4, right, sra)

#undef _

/* 64 bit shifts. */

always_inline u16x4 u16x4_shift_left (u16x4 x, u16x4 i)
{ return (u16x4) _m_psllw ((__m64) x, (__m64) i); }

always_inline u32x2 u32x2_shift_left (u32x2 x, u32x2 i)
{ return (u32x2) _m_pslld ((__m64) x, (__m64) i); }

always_inline u16x4 u16x4_shift_right (u16x4 x, u16x4 i)
{ return (u16x4) _m_psrlw ((__m64) x, (__m64) i); }

always_inline u32x2 u32x2_shift_right (u32x2 x, u32x2 i)
{ return (u32x2) _m_psrld ((__m64) x, (__m64) i); }

always_inline i16x4 i16x4_shift_left (i16x4 x, i16x4 i)
{ return (i16x4) _m_psllw ((__m64) x, (__m64) i); }

always_inline i32x2 i32x2_shift_left (i32x2 x, i32x2 i)
{ return (i32x2) _m_pslld ((__m64) x, (__m64) i); }

always_inline i16x4 i16x4_shift_right (i16x4 x, i16x4 i)
{ return (i16x4) _m_psraw ((__m64) x, (__m64) i); }

always_inline i32x2 i32x2_shift_right (i32x2 x, i32x2 i)
{ return (i32x2) _m_psrad ((__m64) x, (__m64) i); }

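/* Example (illustrative sketch, not part of the original API): the
   _ishift_ forms take an immediate count; the _shift_ forms take the
   count from the low 64 bits of a vector operand:

     u32x4 v = u32x4_splat (0x80);
     u32x4 a = u32x4_ishift_left (v, 4);   // every lane becomes 0x800
     u32x4 n = {3};                        // shift count in the low qword
     u32x4 b = u32x4_shift_right (v, n);   // every lane becomes 0x10
*/
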
#define u8x16_word_shift_left(a,n)  (u8x16) _mm_slli_si128((__m128i) a, n)
#define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))

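/* Example (illustrative sketch, not part of the original API): the
   _word_shift_ macros shift by whole lanes (the count is scaled to
   bytes internally) and require a compile-time constant count:

     u32x4 v = ...;
     u32x4 w = u32x4_word_shift_right (v, 1);   // lane i takes lane i+1; top lane becomes 0
*/
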
/* SSE2 has no rotate instructions: use shifts to simulate them. */
#define _(t,n,lr1,lr2)						\
  always_inline t##x##n						\
  t##x##n##_irotate_##lr1 (t##x##n w, int i)			\
  {								\
    ASSERT (i >= 0 && i <= BITS (t));				\
    return (t##x##n##_ishift_##lr1 (w, i)			\
	    | t##x##n##_ishift_##lr2 (w, BITS (t) - i));	\
  }								\
								\
  always_inline t##x##n						\
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i)			\
  {								\
    t##x##n j = t##x##n##_splat (BITS (t));			\
    return (t##x##n##_shift_##lr1 (w, i)			\
	    | t##x##n##_shift_##lr2 (w, j - i));		\
  }

_ (u16, 8, left, right);
_ (u16, 8, right, left);
_ (u32, 4, left, right);
_ (u32, 4, right, left);
_ (u64, 2, left, right);
_ (u64, 2, right, left);

#undef _

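/* Example (illustrative sketch, not part of the original API): each
   lane is rotated independently; the count must not exceed the lane
   width (see the ASSERT above):

     u32x4 v = u32x4_splat (0x80000001);
     u32x4 r = u32x4_irotate_left (v, 1);   // every lane becomes 0x00000003
*/
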
/* Word (lane) rotates built from the word shifts above. */
#define _(t,n,lr1,lr2)						\
  always_inline t##x##n						\
  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i)	\
  {								\
    int m = sizeof (t##x##n) / sizeof (t);			\
    ASSERT (i >= 0 && i < m);					\
    return (t##x##n##_word_shift_##lr1 (w0, i)			\
	    | t##x##n##_word_shift_##lr2 (w1, m - i));		\
  }								\
								\
  always_inline t##x##n						\
  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i)		\
  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }

_ (u8, 16, left, right);
_ (u8, 16, right, left);
_ (u16, 8, left, right);
_ (u16, 8, right, left);
_ (u32, 4, left, right);
_ (u32, 4, right, left);
_ (u64, 2, left, right);
_ (u64, 2, right, left);

#undef _

/* Compare operations. */
always_inline u8x16 u8x16_is_equal (u8x16 x, u8x16 y)
{ return (u8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) y); }

always_inline i8x16 i8x16_is_equal (i8x16 x, i8x16 y)
{ return (i8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) y); }

always_inline u16x8 u16x8_is_equal (u16x8 x, u16x8 y)
{ return (u16x8) _mm_cmpeq_epi16 ((__m128i) x, (__m128i) y); }

always_inline i16x8 i16x8_is_equal (i16x8 x, i16x8 y)
{ return (i16x8) _mm_cmpeq_epi16 ((__m128i) x, (__m128i) y); }

always_inline u32x4 u32x4_is_equal (u32x4 x, u32x4 y)
{ return (u32x4) _mm_cmpeq_epi32 ((__m128i) x, (__m128i) y); }

always_inline i32x4 i32x4_is_equal (i32x4 x, i32x4 y)
{ return (i32x4) _mm_cmpeq_epi32 ((__m128i) x, (__m128i) y); }

always_inline u8x16
i8x16_is_greater (i8x16 x, i8x16 y)
{ return (u8x16) _mm_cmpgt_epi8 ((__m128i) x, (__m128i) y); }

always_inline u16x8
i16x8_is_greater (i16x8 x, i16x8 y)
{ return (u16x8) _mm_cmpgt_epi16 ((__m128i) x, (__m128i) y); }

always_inline u32x4
i32x4_is_greater (i32x4 x, i32x4 y)
{ return (u32x4) _mm_cmpgt_epi32 ((__m128i) x, (__m128i) y); }

always_inline u8x16 u8x16_is_zero (u8x16 x)
{
  u8x16 zero = {0};
  return u8x16_is_equal (x, zero);
}

always_inline u16x8 u16x8_is_zero (u16x8 x)
{
  u16x8 zero = {0};
  return u16x8_is_equal (x, zero);
}

always_inline u32x4 u32x4_is_zero (u32x4 x)
{
  u32x4 zero = {0};
  return u32x4_is_equal (x, zero);
}

/* Select (shuffle) 32 bit words with an immediate mask. */
#define u32x4_select(A,MASK)						\
({									\
  u32x4 _x, _y;								\
  _x = (A);								\
  asm volatile ("pshufd %[mask], %[x], %[y]"				\
		: /* outputs */ [y] "=x" (_y)				\
		: /* inputs */  [x] "x" (_x), [mask] "i" (MASK));	\
  _y;									\
})

#define u32x4_splat_word(x,i)			\
  u32x4_select ((x), (((i) << (2*0))		\
		      | ((i) << (2*1))		\
		      | ((i) << (2*2))		\
		      | ((i) << (2*3))))

/* Extract low order 32 bit word. */
always_inline u32
u32x4_get0 (u32x4 x)
{
  u32 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=r" (result)
		: /* inputs */ [x] "x" (x));
  return result;
}

/* Insert scalar into low order 32 bit word (upper words zeroed). */
always_inline u32x4
u32x4_set0 (u32 x)
{
  u32x4 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=x" (result)
		: /* inputs */ [x] "r" (x));
  return result;
}

always_inline i32x4
i32x4_set0 (i32 x)
{ return (i32x4) u32x4_set0 ((u32) x); }

always_inline i32
i32x4_get0 (i32x4 x)
{ return (i32) u32x4_get0 ((u32x4) x); }

/* Converts all ones/zeros compare mask to bitmap. */
always_inline u32 u8x16_compare_byte_mask (u8x16 x)
{ return _mm_movemask_epi8 ((__m128i) x); }

u8 u32x4_compare_word_mask_table[256];

always_inline u32 u32x4_compare_word_mask (u32x4 x)
{
  u32 m = u8x16_compare_byte_mask ((u8x16) x);
  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
	  | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
}

always_inline u32 u8x16_zero_byte_mask (u8x16 x)
{
  u8x16 zero = {0};
  return u8x16_compare_byte_mask (u8x16_is_equal (x, zero));
}

always_inline u32 u16x8_zero_byte_mask (u16x8 x)
{
  u16x8 zero = {0};
  return u8x16_compare_byte_mask ((u8x16) u16x8_is_equal (x, zero));
}

always_inline u32 u32x4_zero_byte_mask (u32x4 x)
{
  u32x4 zero = {0};
  return u8x16_compare_byte_mask ((u8x16) u32x4_is_equal (x, zero));
}

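/* Example (illustrative sketch, not part of the original API): the
   zero-byte masks reduce a lane-wise comparison to a scalar bitmap that
   can be scanned with ordinary bit tricks, e.g. locating the first zero
   byte of a string fragment:

     u8x16 v = u8x16_load_unaligned ((u8x16 *) s);
     u32 m = u8x16_zero_byte_mask (v);   // bit i set if byte i is zero
     if (m)
       i = find_first_set_bit (m);       // hypothetical helper, e.g. __builtin_ctz
*/
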
always_inline u8x16 u8x16_max (u8x16 x, u8x16 y)
{ return (u8x16) _mm_max_epu8 ((__m128i) x, (__m128i) y); }

always_inline u32 u8x16_max_scalar (u8x16 x)
{
  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline u8x16 u8x16_min (u8x16 x, u8x16 y)
{ return (u8x16) _mm_min_epu8 ((__m128i) x, (__m128i) y); }

always_inline u8 u8x16_min_scalar (u8x16 x)
{
  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

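/* Example (illustrative sketch, not part of the original API): the
   *_scalar reductions repeatedly fold the vector in half, so the
   maximum of 16 byte lanes costs four max operations:

     u8x16 lens = ...;
     u32 worst = u8x16_max_scalar (lens);   // largest of the 16 byte lanes
*/
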
always_inline i16x8 i16x8_max (i16x8 x, i16x8 y)
{ return (i16x8) _mm_max_epi16 ((__m128i) x, (__m128i) y); }

always_inline i16 i16x8_max_scalar (i16x8 x)
{
  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

always_inline i16x8 i16x8_min (i16x8 x, i16x8 y)
{ return (i16x8) _mm_min_epi16 ((__m128i) x, (__m128i) y); }

always_inline i16 i16x8_min_scalar (i16x8 x)
{
  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

#endif /* included_vector_sse2_h */