+#ifdef __VAES__
+/*
+ * AES-CBC decryption with 512-bit VAES instructions: every vector register
+ * holds four consecutive 16-byte cipher blocks.
+ *
+ *   k       round keys as 512-bit vectors; k[0] .. k[rounds] are read
+ *           (presumably each round key replicated in all four 128-bit
+ *           lanes - confirm against the key expansion code)
+ *   src     ciphertext input
+ *   dst     plaintext output
+ *   iv      initialization vector, 16 bytes are read
+ *   count   input length in bytes; only count >> 4 whole blocks are handled
+ *   rounds  index of the last round key (rounds - 1 aesdec steps followed by
+ *           one aesdeclast step)
+ *
+ * All ciphertext of an iteration is loaded before any plaintext is stored.
+ */
+static_always_inline void
+vaes_cbc_dec (__m512i * k, u8 * src, u8 * dst, u8 * iv, int count,
+	      aesni_key_size_t rounds)
+{
+  /*
+   * Two-source permute indices: 6, 7 pick the top 128-bit lane of the first
+   * operand, 8 .. 13 pick the three lower 128-bit lanes of the second one.
+   * The result is the second operand shifted up by one block with the last
+   * block of the first operand shifted in, i.e. the "previous ciphertext"
+   * vector needed by CBC.
+   */
+  __m512i permute = { 6, 7, 8, 9, 10, 11, 12, 13 };
+  __m512i r0, r1, r2, r3, c0, c1, c2, c3, f = { };
+  __mmask8 m;
+  int i, n_blocks = count >> 4;
+
+  /*
+   * Place the IV into the top 128-bit lane of f. Mask 0xc0 selects only the
+   * 64-bit elements 6 and 7, which live at byte offset 48 from the base
+   * address, hence the (iv - 48) base.
+   */
+  f = _mm512_mask_loadu_epi64 (f, 0xc0, (__m512i *) (iv - 48));
+
+  /* main loop: 16 blocks (4 vectors) per iteration */
+  while (n_blocks >= 16)
+    {
+      c0 = _mm512_loadu_si512 ((__m512i *) src);
+      c1 = _mm512_loadu_si512 ((__m512i *) (src + 64));
+      c2 = _mm512_loadu_si512 ((__m512i *) (src + 128));
+      c3 = _mm512_loadu_si512 ((__m512i *) (src + 192));
+
+      /* initial round key addition */
+      r0 = c0 ^ k[0];
+      r1 = c1 ^ k[0];
+      r2 = c2 ^ k[0];
+      r3 = c3 ^ k[0];
+
+      for (i = 1; i < rounds; i++)
+	{
+	  r0 = _mm512_aesdec_epi128 (r0, k[i]);
+	  r1 = _mm512_aesdec_epi128 (r1, k[i]);
+	  r2 = _mm512_aesdec_epi128 (r2, k[i]);
+	  r3 = _mm512_aesdec_epi128 (r3, k[i]);
+	}
+
+      /* i == rounds here: final round */
+      r0 = _mm512_aesdeclast_epi128 (r0, k[i]);
+      r1 = _mm512_aesdeclast_epi128 (r1, k[i]);
+      r2 = _mm512_aesdeclast_epi128 (r2, k[i]);
+      r3 = _mm512_aesdeclast_epi128 (r3, k[i]);
+
+      /* CBC chaining: xor each block with the preceding ciphertext block */
+      r0 ^= _mm512_permutex2var_epi64 (f, permute, c0);
+      _mm512_storeu_si512 ((__m512i *) dst, r0);
+
+      r1 ^= _mm512_permutex2var_epi64 (c0, permute, c1);
+      _mm512_storeu_si512 ((__m512i *) (dst + 64), r1);
+
+      r2 ^= _mm512_permutex2var_epi64 (c1, permute, c2);
+      _mm512_storeu_si512 ((__m512i *) (dst + 128), r2);
+
+      r3 ^= _mm512_permutex2var_epi64 (c2, permute, c3);
+      _mm512_storeu_si512 ((__m512i *) (dst + 192), r3);
+
+      /* last ciphertext block of this batch chains into the next one */
+      f = c3;
+
+      n_blocks -= 16;
+      src += 256;
+      dst += 256;
+    }
+
+  /* tail: up to 4 blocks per iteration, the last one possibly partial */
+  while (n_blocks > 0)
+    {
+      /*
+       * Two 64-bit mask bits per remaining block. n_blocks is below 16 here
+       * so the shift count stays at or below 30; for 4 or more blocks the
+       * conversion to __mmask8 leaves all eight bits set.
+       */
+      m = (1 << (n_blocks * 2)) - 1;
+
+      /*
+       * Zero-masking load: c0 may not have been written yet when the main
+       * loop did not run, so it must not be used as a merge source. Lanes
+       * outside the mask are never stored below.
+       */
+      c0 = _mm512_maskz_loadu_epi64 (m, (__m512i *) src);
+      f = _mm512_permutex2var_epi64 (f, permute, c0);
+      r0 = c0 ^ k[0];
+      for (i = 1; i < rounds; i++)
+	r0 = _mm512_aesdec_epi128 (r0, k[i]);
+      r0 = _mm512_aesdeclast_epi128 (r0, k[i]);
+      _mm512_mask_storeu_epi64 ((__m512i *) dst, m, r0 ^ f);
+      f = c0;
+      n_blocks -= 4;
+      src += 64;
+      dst += 64;
+    }
+}
+#endif
+
+/*
+ * Width-generic aliases: N and the u32xN names resolve to the u32x16 variants
+ * when the compiler targets VAES and to the u32x4 variants otherwise.
+ */
+#if defined (__VAES__)
+#define N                 16
+#define u32xN             u32x16
+#define u32xN_min_scalar  u32x16_min_scalar
+#define u32xN_is_all_zero u32x16_is_all_zero
+#else /* !__VAES__ */
+#define N                 4
+#define u32xN             u32x4
+#define u32xN_min_scalar  u32x4_min_scalar
+#define u32xN_is_all_zero u32x4_is_all_zero
+#endif /* __VAES__ */
+