+ }
+}
+#elif defined(__VAES__)
+
+/* Gather one AES block from each of two source buffers into a single
+ * 256-bit vector: the block read at src[0] + i goes into the low half and
+ * the block read at src[1] + i goes into the high half.
+ *
+ * src - array of (at least) two buffer pointers
+ * i   - byte offset applied to both buffers
+ */
+static_always_inline u8x32
+aes_block_load_x2 (u8 *src[], int i)
+{
+  u8 *lo_src = src[0] + i;
+  u8 *hi_src = src[1] + i;
+  u8x32 pair = {};
+
+  pair = u8x32_insert_lo (pair, aes_block_load (lo_src));
+  return u8x32_insert_hi (pair, aes_block_load (hi_src));
+}
+
+/* Scatter a 256-bit vector to two destination buffers: the low half of r
+ * is stored at dst[0] + i and the high half at dst[1] + i.
+ *
+ * dst - array of (at least) two buffer pointers
+ * i   - byte offset applied to both buffers
+ * r   - vector holding the two blocks to write
+ */
+static_always_inline void
+aes_block_store_x2 (u8 *dst[], int i, u8x32 r)
+{
+  u8 *out;
+
+  /* low half first, then high half - same order as the stores below */
+  out = dst[0] + i;
+  aes_block_store (out, u8x32_extract_lo (r));
+
+  out = dst[1] + i;
+  aes_block_store (out, u8x32_extract_hi (r));
+}
+
+/* Build a 256-bit vector from 64-bit lanes 2, 3, 4, 5 of the pair (a, b),
+ * both viewed as u64x4.
+ * NOTE(review): assuming u64x4_shuffle2 numbers the lanes of a as 0..3 and
+ * those of b as 4..7, the result is the high half of a followed by the low
+ * half of b - confirm against the u64x4_shuffle2 definition.
+ */
+static_always_inline u8x32
+aes2_cbc_dec_permute (u8x32 a, u8x32 b)
+{
+  u64x4 a_q = (u64x4) a;
+  u64x4 b_q = (u64x4) b;
+
+  return (u8x32) u64x4_shuffle2 (a_q, b_q, 2, 3, 4, 5);
+}
+
+static_always_inline void
+aes2_cbc_dec (u8x32 *k, u8x32u *src, u8x32u *dst, u8x16u *iv, int count,
+ aes_key_size_t rounds)
+{
+ u8x32 f = {}, r[4], c[4] = {};
+ int i, n_blocks = count >> 4;
+
+ f = u8x32_insert_hi (f, *iv);
+
+ while (n_blocks >= 8)
+ {
+ c[0] = src[0];
+ c[1] = src[1];
+ c[2] = src[2];
+ c[3] = src[3];
+
+ r[0] = c[0] ^ k[0];
+ r[1] = c[1] ^ k[0];
+ r[2] = c[2] ^ k[0];
+ r[3] = c[3] ^ k[0];
+
+ for (i = 1; i < rounds; i++)
+ {
+ r[0] = aes_dec_round_x2 (r[0], k[i]);
+ r[1] = aes_dec_round_x2 (r[1], k[i]);
+ r[2] = aes_dec_round_x2 (r[2], k[i]);
+ r[3] = aes_dec_round_x2 (r[3], k[i]);
+ }
+
+ r[0] = aes_dec_last_round_x2 (r[0], k[i]);
+ r[1] = aes_dec_last_round_x2 (r[1], k[i]);
+ r[2] = aes_dec_last_round_x2 (r[2], k[i]);
+ r[3] = aes_dec_last_round_x2 (r[3], k[i]);
+
+ dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]);
+ dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]);
+ dst[2] = r[2] ^= aes2_cbc_dec_permute (c[1], c[2]);
+ dst[3] = r[3] ^= aes2_cbc_dec_permute (c[2], c[3]);
+ f = c[3];
+
+ n_blocks -= 8;
+ src += 4;
+ dst += 4;
+ }
+
+ if (n_blocks >= 6)
+ {
+ c[0] = src[0];
+ c[1] = src[1];
+ c[2] = src[2];
+
+ r[0] = c[0] ^ k[0];
+ r[1] = c[1] ^ k[0];
+ r[2] = c[2] ^ k[0];
+
+ for (i = 1; i < rounds; i++)
+ {
+ r[0] = aes_dec_round_x2 (r[0], k[i]);
+ r[1] = aes_dec_round_x2 (r[1], k[i]);
+ r[2] = aes_dec_round_x2 (r[2], k[i]);
+ }
+
+ r[0] = aes_dec_last_round_x2 (r[0], k[i]);
+ r[1] = aes_dec_last_round_x2 (r[1], k[i]);
+ r[2] = aes_dec_last_round_x2 (r[2], k[i]);
+
+ dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]);
+ dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]);
+ dst[2] = r[2] ^= aes2_cbc_dec_permute (c[1], c[2]);
+ f = c[2];
+
+ n_blocks -= 6;
+ src += 3;
+ dst += 3;
+ }
+ else if (n_blocks >= 4)
+ {
+ c[0] = src[0];
+ c[1] = src[1];
+
+ r[0] = c[0] ^ k[0];
+ r[1] = c[1] ^ k[0];
+
+ for (i = 1; i < rounds; i++)
+ {
+ r[0] = aes_dec_round_x2 (r[0], k[i]);
+ r[1] = aes_dec_round_x2 (r[1], k[i]);
+ }
+
+ r[0] = aes_dec_last_round_x2 (r[0], k[i]);
+ r[1] = aes_dec_last_round_x2 (r[1], k[i]);
+
+ dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]);
+ dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]);
+ f = c[1];
+