return (u16x16) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}
-static_always_inline u8x32
-u8x32_shuffle (u8x32 v, u8x32 m)
-{
- return (u8x32) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) m);
-}
-
#define u8x32_align_right(a, b, imm) \
(u8x32) _mm256_alignr_epi8 ((__m256i) a, (__m256i) b, imm)
+#define u64x4_align_right(a, b, imm) \
+ (u64x4) _mm256_alignr_epi64 ((__m256i) a, (__m256i) b, imm) /* wraps _mm256_alignr_epi64 and casts the result to u64x4; NOTE(review): Intel lists this intrinsic under AVX512F+AVX512VL, not AVX2 - confirm the build guards */
+
static_always_inline u32
u32x8_sum_elts (u32x8 sum8)
{
return (u32x8) _mm256_hadd_epi32 ((__m256i) v1, (__m256i) v2);
}
+static_always_inline u32
+u32x8_hxor (u32x8 v) /* returns lane 0 of v after three XOR folding steps; by its name, a horizontal XOR of the u32 lanes */
+{
+ u32x4 v4;
+ v4 = u32x8_extract_lo (v) ^ u32x8_extract_hi (v); /* step 1: XOR the two extracted u32x4 parts (presumably low and high 128-bit halves - confirm) */
+ v4 ^= (u32x4) u8x16_align_right (v4, v4, 8); /* step 2: XOR with v4 passed through align_right by 8 (presumably an 8-byte rotation, i.e. lanes 2,3 onto 0,1 - confirm) */
+ v4 ^= (u32x4) u8x16_align_right (v4, v4, 4); /* step 3: same with 4 (presumably a 4-byte rotation, lane 1 onto lane 0 - confirm) */
+ return v4[0]; /* only element 0 is returned */
+}
+
static_always_inline u16x16
u16x16_mask_last (u16x16 v, u8 n_last)
{
*(u32 *) p = r[index];
}
-static_always_inline u8x32
-u8x32_is_greater (u8x32 v1, u8x32 v2)
-{
- return (u8x32) _mm256_cmpgt_epi8 ((__m256i) v1, (__m256i) v2);
-}
-
static_always_inline u8x32
u8x32_blend (u8x32 v1, u8x32 v2, u8x32 mask)
{