/*
 *------------------------------------------------------------------
 * Copyright (c) 2019 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */
#include <vlib/vlib.h>
#include <vnet/plugin/plugin.h>
#include <vnet/crypto/crypto.h>
#include <crypto_native/crypto_native.h>
#include <crypto_native/aes.h>
#if __GNUC__ > 4 && !__clang__ && CLIB_DEBUG == 0
#pragma GCC optimize ("O3")
#endif
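
/* Select the widest vector types the target ISA offers: 16 lanes with
   VAES + AVX512, 8 with VAES + AVX2, otherwise 4.  N is the number of
   independent CBC streams the encrypt path keeps in flight. */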
#if defined(__VAES__) && defined(__AVX512F__)
#define N 16
#define u8xN u8x64
#define u32xN u32x16
#define u32xN_min_scalar u32x16_min_scalar
#define u32xN_is_all_zero u32x16_is_all_zero
#define u32xN_splat u32x16_splat
#elif defined(__VAES__)
#define N 8
#define u8xN u8x32
#define u32xN u32x8
#define u32xN_min_scalar u32x8_min_scalar
#define u32xN_is_all_zero u32x8_is_all_zero
#define u32xN_splat u32x8_splat
#else
#define N 4
#define u8xN u8x16
#define u32xN u32x4
#define u32xN_min_scalar u32x4_min_scalar
#define u32xN_is_all_zero u32x4_is_all_zero
#define u32xN_splat u32x4_splat
#endif

typedef struct
{
  u8x16 encrypt_key[15];
#if defined(__VAES__) && defined(__AVX512F__)
  u8x64 decrypt_key[15];
#elif defined(__VAES__)
  u8x32 decrypt_key[15];
#else
  u8x16 decrypt_key[15];
#endif
} aes_cbc_key_data_t;
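
/* CBC decryption has no block-to-block dependency (each plaintext is
   decrypt(c[i]) ^ c[i-1]), so four blocks are kept in flight to hide
   the latency of the AES round instructions. */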
static_always_inline void __clib_unused
aes_cbc_dec (u8x16 * k, u8x16u * src, u8x16u * dst, u8x16u * iv, int count,
	     int rounds)
{
  u8x16 r[4], c[4], f;

  f = iv[0];
  while (count >= 64)
    {
      clib_prefetch_load (src + 8);
      clib_prefetch_load (dst + 8);

      c[0] = r[0] = src[0];
      c[1] = r[1] = src[1];
      c[2] = r[2] = src[2];
      c[3] = r[3] = src[3];

#if __x86_64__
      r[0] ^= k[0];
      r[1] ^= k[0];
      r[2] ^= k[0];
      r[3] ^= k[0];

      for (int i = 1; i < rounds; i++)
	{
	  r[0] = aes_dec_round (r[0], k[i]);
	  r[1] = aes_dec_round (r[1], k[i]);
	  r[2] = aes_dec_round (r[2], k[i]);
	  r[3] = aes_dec_round (r[3], k[i]);
	}

      r[0] = aes_dec_last_round (r[0], k[rounds]);
      r[1] = aes_dec_last_round (r[1], k[rounds]);
      r[2] = aes_dec_last_round (r[2], k[rounds]);
      r[3] = aes_dec_last_round (r[3], k[rounds]);
#else
      for (int i = 0; i < rounds - 1; i++)
	{
	  r[0] = vaesimcq_u8 (vaesdq_u8 (r[0], k[i]));
	  r[1] = vaesimcq_u8 (vaesdq_u8 (r[1], k[i]));
	  r[2] = vaesimcq_u8 (vaesdq_u8 (r[2], k[i]));
	  r[3] = vaesimcq_u8 (vaesdq_u8 (r[3], k[i]));
	}
      r[0] = vaesdq_u8 (r[0], k[rounds - 1]) ^ k[rounds];
      r[1] = vaesdq_u8 (r[1], k[rounds - 1]) ^ k[rounds];
      r[2] = vaesdq_u8 (r[2], k[rounds - 1]) ^ k[rounds];
      r[3] = vaesdq_u8 (r[3], k[rounds - 1]) ^ k[rounds];
#endif
      dst[0] = r[0] ^ f;
      dst[1] = r[1] ^ c[0];
      dst[2] = r[2] ^ c[1];
      dst[3] = r[3] ^ c[2];
      f = c[3];

      count -= 64;
      src += 4;
      dst += 4;
    }

  while (count > 0)
    {
      c[0] = r[0] = src[0];
#if __x86_64__
      r[0] ^= k[0];
      for (int i = 1; i < rounds; i++)
	r[0] = aes_dec_round (r[0], k[i]);
      r[0] = aes_dec_last_round (r[0], k[rounds]);
#else
      for (int i = 0; i < rounds - 1; i++)
	r[0] = vaesimcq_u8 (vaesdq_u8 (r[0], k[i]));
      r[0] = vaesdq_u8 (r[0], k[rounds - 1]) ^ k[rounds];
#endif
      dst[0] = r[0] ^ f;
      f = c[0];

      count -= 16;
      src += 1;
      dst += 1;
    }
}
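
/* Wide-register variants of the CBC decrypt inner loop: 4 blocks per
   512-bit register with VAES + AVX512, 2 blocks per 256-bit register
   with VAES + AVX2. */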
#if defined(__VAES__) && defined(__AVX512F__)
static_always_inline u8x64
aes_block_load_x4 (u8 * src[], int i)
{
  u8x64 r = {};
  r = u8x64_insert_u8x16 (r, aes_block_load (src[0] + i), 0);
  r = u8x64_insert_u8x16 (r, aes_block_load (src[1] + i), 1);
  r = u8x64_insert_u8x16 (r, aes_block_load (src[2] + i), 2);
  r = u8x64_insert_u8x16 (r, aes_block_load (src[3] + i), 3);
  return r;
}

static_always_inline void
aes_block_store_x4 (u8 * dst[], int i, u8x64 r)
{
  aes_block_store (dst[0] + i, u8x64_extract_u8x16 (r, 0));
  aes_block_store (dst[1] + i, u8x64_extract_u8x16 (r, 1));
  aes_block_store (dst[2] + i, u8x64_extract_u8x16 (r, 2));
  aes_block_store (dst[3] + i, u8x64_extract_u8x16 (r, 3));
}
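
/* Concatenate the last block of a with the first three blocks of b,
   i.e. shift the ciphertext stream back by one 16-byte block to form
   the CBC XOR feedback vector. */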
static_always_inline u8x64
aes4_cbc_dec_permute (u8x64 a, u8x64 b)
{
  return (u8x64) u64x8_shuffle2 (a, b, 6, 7, 8, 9, 10, 11, 12, 13);
}

static_always_inline void
aes4_cbc_dec (u8x64 *k, u8x64u *src, u8x64u *dst, u8x16u *iv, int count,
	      aes_key_size_t rounds)
{
  u8x64 f, r[4], c[4] = { };
  __mmask8 m;
  int i, n_blocks = count >> 4;

  f = (u8x64) _mm512_mask_loadu_epi64 (_mm512_setzero_si512 (), 0xc0,
				       (__m512i *) (iv - 3));
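
  /* f carries the previous ciphertext block in its top 128 bits; the
     masked load above parks the IV there.  The main loop decrypts 16
     blocks (4 x 512 bits) per iteration; the branches below handle the
     15 or fewer leftover blocks. */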
  while (n_blocks >= 16)
    {
      c[0] = src[0];
      c[1] = src[1];
      c[2] = src[2];
      c[3] = src[3];

      r[0] = c[0] ^ k[0];
      r[1] = c[1] ^ k[0];
      r[2] = c[2] ^ k[0];
      r[3] = c[3] ^ k[0];

      for (i = 1; i < rounds; i++)
	{
	  r[0] = aes_dec_round_x4 (r[0], k[i]);
	  r[1] = aes_dec_round_x4 (r[1], k[i]);
	  r[2] = aes_dec_round_x4 (r[2], k[i]);
	  r[3] = aes_dec_round_x4 (r[3], k[i]);
	}

      r[0] = aes_dec_last_round_x4 (r[0], k[i]);
      r[1] = aes_dec_last_round_x4 (r[1], k[i]);
      r[2] = aes_dec_last_round_x4 (r[2], k[i]);
      r[3] = aes_dec_last_round_x4 (r[3], k[i]);

      dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]);
      dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]);
      dst[2] = r[2] ^= aes4_cbc_dec_permute (c[1], c[2]);
      dst[3] = r[3] ^= aes4_cbc_dec_permute (c[2], c[3]);

      f = c[3];
      n_blocks -= 16;
      src += 4;
      dst += 4;
    }
  if (n_blocks >= 12)
    {
      c[0] = src[0];
      c[1] = src[1];
      c[2] = src[2];

      r[0] = c[0] ^ k[0];
      r[1] = c[1] ^ k[0];
      r[2] = c[2] ^ k[0];

      for (i = 1; i < rounds; i++)
	{
	  r[0] = aes_dec_round_x4 (r[0], k[i]);
	  r[1] = aes_dec_round_x4 (r[1], k[i]);
	  r[2] = aes_dec_round_x4 (r[2], k[i]);
	}

      r[0] = aes_dec_last_round_x4 (r[0], k[i]);
      r[1] = aes_dec_last_round_x4 (r[1], k[i]);
      r[2] = aes_dec_last_round_x4 (r[2], k[i]);

      dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]);
      dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]);
      dst[2] = r[2] ^= aes4_cbc_dec_permute (c[1], c[2]);

      f = c[2];
      n_blocks -= 12;
      src += 3;
      dst += 3;
    }
  else if (n_blocks >= 8)
    {
      c[0] = src[0];
      c[1] = src[1];

      r[0] = c[0] ^ k[0];
      r[1] = c[1] ^ k[0];

      for (i = 1; i < rounds; i++)
	{
	  r[0] = aes_dec_round_x4 (r[0], k[i]);
	  r[1] = aes_dec_round_x4 (r[1], k[i]);
	}

      r[0] = aes_dec_last_round_x4 (r[0], k[i]);
      r[1] = aes_dec_last_round_x4 (r[1], k[i]);

      dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]);
      dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]);

      f = c[1];
      n_blocks -= 8;
      src += 2;
      dst += 2;
    }
  else if (n_blocks >= 4)
    {
      c[0] = src[0];
      r[0] = c[0] ^ k[0];

      for (i = 1; i < rounds; i++)
	r[0] = aes_dec_round_x4 (r[0], k[i]);

      r[0] = aes_dec_last_round_x4 (r[0], k[i]);

      dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]);

      f = c[0];
      n_blocks -= 4;
      src += 1;
      dst += 1;
    }
  if (n_blocks > 0)
    {
      m = (1 << (n_blocks * 2)) - 1;
      c[0] = (u8x64) _mm512_mask_loadu_epi64 ((__m512i) c[0], m,
					      (__m512i *) src);
      f = aes4_cbc_dec_permute (f, c[0]);
      r[0] = c[0] ^ k[0];
      for (i = 1; i < rounds; i++)
	r[0] = aes_dec_round_x4 (r[0], k[i]);
      r[0] = aes_dec_last_round_x4 (r[0], k[i]);
      _mm512_mask_storeu_epi64 ((__m512i *) dst, m, (__m512i) (r[0] ^ f));
    }
}
#elif defined(__VAES__)
static_always_inline u8x32
aes_block_load_x2 (u8 *src[], int i)
{
  u8x32 r = {};
  r = u8x32_insert_lo (r, aes_block_load (src[0] + i));
  r = u8x32_insert_hi (r, aes_block_load (src[1] + i));
  return r;
}

static_always_inline void
aes_block_store_x2 (u8 *dst[], int i, u8x32 r)
{
  aes_block_store (dst[0] + i, u8x32_extract_lo (r));
  aes_block_store (dst[1] + i, u8x32_extract_hi (r));
}
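
/* Concatenate the high block of a with the low block of b - the
   256-bit counterpart of aes4_cbc_dec_permute above. */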
static_always_inline u8x32
aes2_cbc_dec_permute (u8x32 a, u8x32 b)
{
  return (u8x32) u64x4_shuffle2 ((u64x4) a, (u64x4) b, 2, 3, 4, 5);
}

static_always_inline void
aes2_cbc_dec (u8x32 *k, u8x32u *src, u8x32u *dst, u8x16u *iv, int count,
	      aes_key_size_t rounds)
{
  u8x32 f = {}, r[4], c[4] = {};
  int i, n_blocks = count >> 4;

  f = u8x32_insert_hi (f, *iv);
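
  /* f carries the previous ciphertext block (initially the IV) in its
     high 128 bits.  Main loop: 8 blocks (4 x 256 bits) per iteration,
     smaller tails handled below. */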
  while (n_blocks >= 8)
    {
      c[0] = src[0];
      c[1] = src[1];
      c[2] = src[2];
      c[3] = src[3];

      r[0] = c[0] ^ k[0];
      r[1] = c[1] ^ k[0];
      r[2] = c[2] ^ k[0];
      r[3] = c[3] ^ k[0];

      for (i = 1; i < rounds; i++)
	{
	  r[0] = aes_dec_round_x2 (r[0], k[i]);
	  r[1] = aes_dec_round_x2 (r[1], k[i]);
	  r[2] = aes_dec_round_x2 (r[2], k[i]);
	  r[3] = aes_dec_round_x2 (r[3], k[i]);
	}

      r[0] = aes_dec_last_round_x2 (r[0], k[i]);
      r[1] = aes_dec_last_round_x2 (r[1], k[i]);
      r[2] = aes_dec_last_round_x2 (r[2], k[i]);
      r[3] = aes_dec_last_round_x2 (r[3], k[i]);

      dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]);
      dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]);
      dst[2] = r[2] ^= aes2_cbc_dec_permute (c[1], c[2]);
      dst[3] = r[3] ^= aes2_cbc_dec_permute (c[2], c[3]);

      f = c[3];
      n_blocks -= 8;
      src += 4;
      dst += 4;
    }
  if (n_blocks >= 6)
    {
      c[0] = src[0];
      c[1] = src[1];
      c[2] = src[2];

      r[0] = c[0] ^ k[0];
      r[1] = c[1] ^ k[0];
      r[2] = c[2] ^ k[0];

      for (i = 1; i < rounds; i++)
	{
	  r[0] = aes_dec_round_x2 (r[0], k[i]);
	  r[1] = aes_dec_round_x2 (r[1], k[i]);
	  r[2] = aes_dec_round_x2 (r[2], k[i]);
	}

      r[0] = aes_dec_last_round_x2 (r[0], k[i]);
      r[1] = aes_dec_last_round_x2 (r[1], k[i]);
      r[2] = aes_dec_last_round_x2 (r[2], k[i]);

      dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]);
      dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]);
      dst[2] = r[2] ^= aes2_cbc_dec_permute (c[1], c[2]);

      f = c[2];
      n_blocks -= 6;
      src += 3;
      dst += 3;
    }
  else if (n_blocks >= 4)
    {
      c[0] = src[0];
      c[1] = src[1];

      r[0] = c[0] ^ k[0];
      r[1] = c[1] ^ k[0];

      for (i = 1; i < rounds; i++)
	{
	  r[0] = aes_dec_round_x2 (r[0], k[i]);
	  r[1] = aes_dec_round_x2 (r[1], k[i]);
	}

      r[0] = aes_dec_last_round_x2 (r[0], k[i]);
      r[1] = aes_dec_last_round_x2 (r[1], k[i]);

      dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]);
      dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]);

      f = c[1];
      n_blocks -= 4;
      src += 2;
      dst += 2;
    }
  else if (n_blocks >= 2)
    {
      c[0] = src[0];
      r[0] = c[0] ^ k[0];

      for (i = 1; i < rounds; i++)
	r[0] = aes_dec_round_x2 (r[0], k[i]);

      r[0] = aes_dec_last_round_x2 (r[0], k[i]);
      dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]);

      f = c[0];
      n_blocks -= 2;
      src += 1;
      dst += 1;
    }
  if (n_blocks > 0)
    {
      u8x16 rl = *(u8x16u *) src ^ u8x32_extract_lo (k[0]);
      for (i = 1; i < rounds; i++)
	rl = aes_dec_round (rl, u8x32_extract_lo (k[i]));
      rl = aes_dec_last_round (rl, u8x32_extract_lo (k[i]));
      *(u8x16 *) dst = rl ^ u8x32_extract_hi (f);
    }
}
#endif
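
/* CBC encryption is inherently serial within one chain, so throughput
   comes from interleaving N independent operations, one per vector
   lane.  Lanes with no work left are pointed at a placeholder buffer
   and masked out of the completion check. */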
static_always_inline u32
aes_ops_enc_aes_cbc (vlib_main_t * vm, vnet_crypto_op_t * ops[],
		     u32 n_ops, aes_key_size_t ks)
{
  crypto_native_main_t *cm = &crypto_native_main;
  int rounds = AES_KEY_ROUNDS (ks);
  u8 placeholder[8192];
  u32 i, j, count, n_left = n_ops;
  u32xN placeholder_mask = { };
  u32xN len = { };
  vnet_crypto_key_index_t key_index[N];
  u8 *src[N] = { };
  u8 *dst[N] = { };
  u8xN r[4] = { };
  u8xN k[15][4] = { };

  for (i = 0; i < N; i++)
    key_index[i] = ~0;

more:
  for (i = 0; i < N; i++)
    if (len[i] == 0)
      {
	if (n_left == 0)
	  {
	    /* no more work to enqueue, so we enqueue a placeholder buffer */
	    src[i] = dst[i] = placeholder;
	    len[i] = sizeof (placeholder);
	    placeholder_mask[i] = 0;
	  }
	else
	  {
	    u8x16 t = aes_block_load (ops[0]->iv);
	    ((u8x16 *) r)[i] = t;

	    src[i] = ops[0]->src;
	    dst[i] = ops[0]->dst;
	    len[i] = ops[0]->len;
	    placeholder_mask[i] = ~0;
	    if (key_index[i] != ops[0]->key_index)
	      {
		aes_cbc_key_data_t *kd;
		key_index[i] = ops[0]->key_index;
		kd = (aes_cbc_key_data_t *) cm->key_data[key_index[i]];
		for (j = 0; j < rounds + 1; j++)
		  ((u8x16 *) k[j])[i] = kd->encrypt_key[j];
	      }
	    ops[0]->status = VNET_CRYPTO_OP_STATUS_COMPLETED;
	    n_left--;
	    ops++;
	  }
      }

  count = u32xN_min_scalar (len);

  ASSERT (count % 16 == 0);
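
  /* encrypt one 16-byte block from each of the N lanes per iteration,
     up to the shortest remaining length across the lanes */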
  for (i = 0; i < count; i += 16)
    {
#if defined(__VAES__) && defined(__AVX512F__)
      r[0] = u8x64_xor3 (r[0], aes_block_load_x4 (src, i), k[0][0]);
      r[1] = u8x64_xor3 (r[1], aes_block_load_x4 (src + 4, i), k[0][1]);
      r[2] = u8x64_xor3 (r[2], aes_block_load_x4 (src + 8, i), k[0][2]);
      r[3] = u8x64_xor3 (r[3], aes_block_load_x4 (src + 12, i), k[0][3]);

      for (j = 1; j < rounds; j++)
	{
	  r[0] = aes_enc_round_x4 (r[0], k[j][0]);
	  r[1] = aes_enc_round_x4 (r[1], k[j][1]);
	  r[2] = aes_enc_round_x4 (r[2], k[j][2]);
	  r[3] = aes_enc_round_x4 (r[3], k[j][3]);
	}
      r[0] = aes_enc_last_round_x4 (r[0], k[j][0]);
      r[1] = aes_enc_last_round_x4 (r[1], k[j][1]);
      r[2] = aes_enc_last_round_x4 (r[2], k[j][2]);
      r[3] = aes_enc_last_round_x4 (r[3], k[j][3]);

      aes_block_store_x4 (dst, i, r[0]);
      aes_block_store_x4 (dst + 4, i, r[1]);
      aes_block_store_x4 (dst + 8, i, r[2]);
      aes_block_store_x4 (dst + 12, i, r[3]);
#elif defined(__VAES__)
      r[0] = u8x32_xor3 (r[0], aes_block_load_x2 (src, i), k[0][0]);
      r[1] = u8x32_xor3 (r[1], aes_block_load_x2 (src + 2, i), k[0][1]);
      r[2] = u8x32_xor3 (r[2], aes_block_load_x2 (src + 4, i), k[0][2]);
      r[3] = u8x32_xor3 (r[3], aes_block_load_x2 (src + 6, i), k[0][3]);

      for (j = 1; j < rounds; j++)
	{
	  r[0] = aes_enc_round_x2 (r[0], k[j][0]);
	  r[1] = aes_enc_round_x2 (r[1], k[j][1]);
	  r[2] = aes_enc_round_x2 (r[2], k[j][2]);
	  r[3] = aes_enc_round_x2 (r[3], k[j][3]);
	}
      r[0] = aes_enc_last_round_x2 (r[0], k[j][0]);
      r[1] = aes_enc_last_round_x2 (r[1], k[j][1]);
      r[2] = aes_enc_last_round_x2 (r[2], k[j][2]);
      r[3] = aes_enc_last_round_x2 (r[3], k[j][3]);

      aes_block_store_x2 (dst, i, r[0]);
      aes_block_store_x2 (dst + 2, i, r[1]);
      aes_block_store_x2 (dst + 4, i, r[2]);
      aes_block_store_x2 (dst + 6, i, r[3]);
#else
#if __x86_64__
      r[0] = u8x16_xor3 (r[0], aes_block_load (src[0] + i), k[0][0]);
      r[1] = u8x16_xor3 (r[1], aes_block_load (src[1] + i), k[0][1]);
      r[2] = u8x16_xor3 (r[2], aes_block_load (src[2] + i), k[0][2]);
      r[3] = u8x16_xor3 (r[3], aes_block_load (src[3] + i), k[0][3]);

      for (j = 1; j < rounds; j++)
	{
	  r[0] = aes_enc_round (r[0], k[j][0]);
	  r[1] = aes_enc_round (r[1], k[j][1]);
	  r[2] = aes_enc_round (r[2], k[j][2]);
	  r[3] = aes_enc_round (r[3], k[j][3]);
	}

      r[0] = aes_enc_last_round (r[0], k[j][0]);
      r[1] = aes_enc_last_round (r[1], k[j][1]);
      r[2] = aes_enc_last_round (r[2], k[j][2]);
      r[3] = aes_enc_last_round (r[3], k[j][3]);

      aes_block_store (dst[0] + i, r[0]);
      aes_block_store (dst[1] + i, r[1]);
      aes_block_store (dst[2] + i, r[2]);
      aes_block_store (dst[3] + i, r[3]);
#else
      r[0] ^= aes_block_load (src[0] + i);
      r[1] ^= aes_block_load (src[1] + i);
      r[2] ^= aes_block_load (src[2] + i);
      r[3] ^= aes_block_load (src[3] + i);
      for (j = 0; j < rounds - 1; j++)
	{
	  r[0] = vaesmcq_u8 (vaeseq_u8 (r[0], k[j][0]));
	  r[1] = vaesmcq_u8 (vaeseq_u8 (r[1], k[j][1]));
	  r[2] = vaesmcq_u8 (vaeseq_u8 (r[2], k[j][2]));
	  r[3] = vaesmcq_u8 (vaeseq_u8 (r[3], k[j][3]));
	}
      r[0] = vaeseq_u8 (r[0], k[j][0]) ^ k[rounds][0];
      r[1] = vaeseq_u8 (r[1], k[j][1]) ^ k[rounds][1];
      r[2] = vaeseq_u8 (r[2], k[j][2]) ^ k[rounds][2];
      r[3] = vaeseq_u8 (r[3], k[j][3]) ^ k[rounds][3];
      aes_block_store (dst[0] + i, r[0]);
      aes_block_store (dst[1] + i, r[1]);
      aes_block_store (dst[2] + i, r[2]);
      aes_block_store (dst[3] + i, r[3]);
#endif
#endif
    }

  len -= u32xN_splat (count);

  for (i = 0; i < N; i++)
    {
      src[i] += count;
      dst[i] += count;
    }

  if (n_left > 0)
    goto more;

  if (!u32xN_is_all_zero (len & placeholder_mask))
    goto more;

  return n_ops;
}

static_always_inline u32
aes_ops_dec_aes_cbc (vlib_main_t * vm, vnet_crypto_op_t * ops[],
		     u32 n_ops, aes_key_size_t ks)
{
  crypto_native_main_t *cm = &crypto_native_main;
  int rounds = AES_KEY_ROUNDS (ks);
  vnet_crypto_op_t *op = ops[0];
  aes_cbc_key_data_t *kd = (aes_cbc_key_data_t *) cm->key_data[op->key_index];
  u32 n_left = n_ops;

decrypt:
#if defined(__VAES__) && defined(__AVX512F__)
  aes4_cbc_dec (kd->decrypt_key, (u8x64u *) op->src, (u8x64u *) op->dst,
		(u8x16u *) op->iv, op->len, rounds);
#elif defined(__VAES__)
  aes2_cbc_dec (kd->decrypt_key, (u8x32u *) op->src, (u8x32u *) op->dst,
		(u8x16u *) op->iv, op->len, rounds);
#else
  aes_cbc_dec (kd->decrypt_key, (u8x16u *) op->src, (u8x16u *) op->dst,
	       (u8x16u *) op->iv, op->len, rounds);
#endif
  op->status = VNET_CRYPTO_OP_STATUS_COMPLETED;

  if (--n_left)
    {
      op += 1;
      kd = (aes_cbc_key_data_t *) cm->key_data[op->key_index];
      goto decrypt;
    }

  return n_ops;
}
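
/* Expand the AES key once at key-install time: encrypt round keys are
   kept as 128-bit vectors (one lane is filled per op at encrypt time),
   while decrypt round keys are pre-broadcast to the full vector width
   consumed by the wide decrypt paths. */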
static_always_inline void *
aes_cbc_key_exp (vnet_crypto_key_t * key, aes_key_size_t ks)
{
  u8x16 e[15], d[15];
  aes_cbc_key_data_t *kd;
  kd = clib_mem_alloc_aligned (sizeof (*kd), CLIB_CACHE_LINE_BYTES);
  aes_key_expand (e, key->data, ks);
  aes_key_enc_to_dec (e, d, ks);
  for (int i = 0; i < AES_KEY_ROUNDS (ks) + 1; i++)
    {
#if defined(__VAES__) && defined(__AVX512F__)
      kd->decrypt_key[i] = u8x64_splat_u8x16 (d[i]);
#elif defined(__VAES__)
      kd->decrypt_key[i] = u8x32_splat_u8x16 (d[i]);
#else
      kd->decrypt_key[i] = d[i];
#endif
      kd->encrypt_key[i] = e[i];
    }
  return kd;
}

#define foreach_aes_cbc_handler_type _(128) _(192) _(256)

#define _(x) \
static u32 aes_ops_dec_aes_cbc_##x \
(vlib_main_t * vm, vnet_crypto_op_t * ops[], u32 n_ops) \
{ return aes_ops_dec_aes_cbc (vm, ops, n_ops, AES_KEY_##x); } \
static u32 aes_ops_enc_aes_cbc_##x \
(vlib_main_t * vm, vnet_crypto_op_t * ops[], u32 n_ops) \
{ return aes_ops_enc_aes_cbc (vm, ops, n_ops, AES_KEY_##x); } \
static void * aes_cbc_key_exp_##x (vnet_crypto_key_t *key) \
{ return aes_cbc_key_exp (key, AES_KEY_##x); }

foreach_aes_cbc_handler_type;
#undef _
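
/* One init function per ISA variant; the crypto_native plugin is built
   once per march variant and its startup code invokes the variant that
   matches the running CPU, registering the handlers below. */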
clib_error_t *
#if defined(__VAES__) && defined(__AVX512F__)
crypto_native_aes_cbc_init_icl (vlib_main_t *vm)
#elif defined(__VAES__)
crypto_native_aes_cbc_init_adl (vlib_main_t *vm)
#elif __AVX512F__
crypto_native_aes_cbc_init_skx (vlib_main_t * vm)
#elif __aarch64__
crypto_native_aes_cbc_init_neon (vlib_main_t * vm)
#elif __AVX2__
crypto_native_aes_cbc_init_hsw (vlib_main_t * vm)
#else
crypto_native_aes_cbc_init_slm (vlib_main_t * vm)
#endif
{
  crypto_native_main_t *cm = &crypto_native_main;

#define _(x) \
  vnet_crypto_register_ops_handler (vm, cm->crypto_engine_index, \
				    VNET_CRYPTO_OP_AES_##x##_CBC_ENC, \
				    aes_ops_enc_aes_cbc_##x); \
  vnet_crypto_register_ops_handler (vm, cm->crypto_engine_index, \
				    VNET_CRYPTO_OP_AES_##x##_CBC_DEC, \
				    aes_ops_dec_aes_cbc_##x); \
  cm->key_fn[VNET_CRYPTO_ALG_AES_##x##_CBC] = aes_cbc_key_exp_##x;
  foreach_aes_cbc_handler_type;
#undef _

  return 0;
}

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */