/*
 *------------------------------------------------------------------
 * Copyright (c) 2019 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */

/*
 *------------------------------------------------------------------
 * Copyright(c) 2018, Intel Corporation All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *------------------------------------------------------------------
 */

/*
 *------------------------------------------------------------------
 * Based on work by: Shay Gueron, Michael E. Kounavis, Erdinc Ozturk,
 *                   Vinodh Gopal, James Guilford, Tomasz Kantecki
 *
 * References:
 * [1] Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on
 *     Intel Architecture Processors. August, 2010
 * [2] Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on
 *     Intel Architecture Processors. October, 2012.
 * [3] intel-ipsec-mb library, https://github.com/01org/intel-ipsec-mb.git
 *
 * Definitions:
 * GF    Galois Extension Field GF(2^128) - finite field where elements are
 *       represented as polynomials with coefficients in GF(2) with the
 *       highest degree of 127. Polynomials are represented as 128-bit
 *       binary numbers where each bit represents one coefficient.
 *       e.g. polynomial x^5 + x^3 + x + 1 is represented in binary 101011.
 * H     hash key (128 bit)
 * POLY  irreducible polynomial x^128 + x^7 + x^2 + x + 1
 * RPOLY irreducible polynomial x^128 + x^127 + x^126 + x^121 + 1
 * +     addition in GF, which equals the XOR operation
 * *     multiplication in GF
 *
 * GF multiplication consists of 2 steps:
 *   - carry-less multiplication of two 128-bit operands into a 256-bit
 *     result
 *   - reduction of the 256-bit result into 128 bits, modulo POLY
 *
 * GHash is calculated on 128-bit blocks of data according to the following
 * formula:
 *   GH = (GH + data) * hash_key
 *
 * To avoid bit-reflection of data, this code uses GF multiplication
 * with the reversed polynomial:
 *   a * b * x^-127 mod RPOLY
 *
 * To improve computation speed, the table Hi is precomputed with powers of
 * H', where H' is calculated as H<<1 mod RPOLY. This allows us to improve
 * performance by deferring reduction. For example, to calculate the ghash
 * of 4 128-bit blocks of data (b0, b1, b2, b3), we can do:
 *
 *   u8x16 Hi[4];
 *   ghash_precompute (H, Hi, 4);
 *
 *   ghash_data_t _gd, *gd = &_gd;
 *   ghash_mul_first (gd, GH ^ b0, Hi[3]);
 *   ghash_mul_next (gd, b1, Hi[2]);
 *   ghash_mul_next (gd, b2, Hi[1]);
 *   ghash_mul_next (gd, b3, Hi[0]);
 *   ghash_reduce (gd);
 *   ghash_reduce2 (gd);
 *   GH = ghash_final (gd);
 *
 * The reduction step is split into 3 functions so it can be better
 * interleaved with other code (i.e. with AES computation).
 *------------------------------------------------------------------
 */

#ifndef __ghash_h__
#define __ghash_h__

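/* The gmul_X_Y helpers below carry-less multiply one 64-bit half of each
   128-bit operand: X selects the half of a, Y the half of b (lo = bits
   0-63, hi = bits 64-127), so e.g. gmul_hi_lo (a, b) computes a1 * b0.
   For intuition, an equivalent scalar bit-by-bit sketch (illustrative
   only; clmul64_ref is not part of this header) of a 64x64 -> 128 bit
   carry-less multiply would be:

     static_always_inline void
     clmul64_ref (u64 a, u64 b, u64 *hi, u64 *lo)
     {
       u64 h = 0, l = 0;
       for (int i = 0; i < 64; i++)
         if (b & (1ULL << i))
           {
             l ^= a << i;
             h ^= i ? a >> (64 - i) : 0;
           }
       *hi = h;
       *lo = l;
     }

   Each gmul_* helper does this in a single instruction (PCLMULQDQ on x86,
   PMULL/PMULL2 on ARMv8). */
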
static_always_inline u8x16
gmul_lo_lo (u8x16 a, u8x16 b)
{
#if defined (__PCLMUL__)
  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x00);
#elif defined (__ARM_FEATURE_CRYPTO)
  return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a),
                            (poly64_t) vget_low_p64 ((poly64x2_t) b));
#endif
}

static_always_inline u8x16
gmul_hi_lo (u8x16 a, u8x16 b)
{
#if defined (__PCLMUL__)
  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x01);
#elif defined (__ARM_FEATURE_CRYPTO)
  return (u8x16) vmull_p64 ((poly64_t) vget_high_p64 ((poly64x2_t) a),
                            (poly64_t) vget_low_p64 ((poly64x2_t) b));
#endif
}

static_always_inline u8x16
gmul_lo_hi (u8x16 a, u8x16 b)
{
#if defined (__PCLMUL__)
  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x10);
#elif defined (__ARM_FEATURE_CRYPTO)
  return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a),
                            (poly64_t) vget_high_p64 ((poly64x2_t) b));
#endif
}

static_always_inline u8x16
gmul_hi_hi (u8x16 a, u8x16 b)
{
#if defined (__PCLMUL__)
  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x11);
#elif defined (__ARM_FEATURE_CRYPTO)
  return (u8x16) vmull_high_p64 ((poly64x2_t) a, (poly64x2_t) b);
#endif
}

typedef struct
{
  u8x16 mid, hi, lo, tmp_lo, tmp_hi;
  u8x32 hi2, lo2, mid2, tmp_lo2, tmp_hi2;
  u8x64 hi4, lo4, mid4, tmp_lo4, tmp_hi4;
  int pending;
} ghash_data_t;

static const u8x16 ghash_poly = {
  0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
};

static const u8x16 ghash_poly2 = {
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
};

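/* ghash_poly encodes the reduction term of RPOLY and is XOR-ed into H
   conditionally in ghash_precompute(); ghash_poly2 holds the same term
   pre-positioned for the two-stage folding done by ghash_reduce() and
   ghash_reduce2() below. */
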
static_always_inline void
ghash_mul_first (ghash_data_t * gd, u8x16 a, u8x16 b)
{
  /* a1 * b1 */
  gd->hi = gmul_hi_hi (a, b);
  /* a0 * b0 */
  gd->lo = gmul_lo_lo (a, b);
  /* a0 * b1 ^ a1 * b0 */
  gd->mid = gmul_hi_lo (a, b) ^ gmul_lo_hi (a, b);

  /* set gd->pending to 0 so next invocation of ghash_mul_next(...) knows
     that there is no pending data in tmp_lo and tmp_hi */
  gd->pending = 0;
}

static_always_inline void
ghash_mul_next (ghash_data_t * gd, u8x16 a, u8x16 b)
{
  /* a1 * b1 */
  u8x16 hi = gmul_hi_hi (a, b);
  /* a0 * b0 */
  u8x16 lo = gmul_lo_lo (a, b);

  /* this branch will be optimized out by the compiler, and it allows us to
     reduce the number of XOR operations by using ternary logic */
  if (gd->pending)
    {
      /* there is pending data from the previous invocation so we can XOR */
      gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, hi);
      gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, lo);
      gd->pending = 0;
    }
  else
    {
      /* there is no pending data from the previous invocation so we
         postpone the XOR */
      gd->tmp_hi = hi;
      gd->tmp_lo = lo;
      gd->pending = 1;
    }

  /* gd->mid ^= a0 * b1 ^ a1 * b0 */
  gd->mid = u8x16_xor3 (gd->mid, gmul_hi_lo (a, b), gmul_lo_hi (a, b));
}

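/* Note: u8x16_xor3 (a, b, c) computes a ^ b ^ c and, where the target
   supports it, can compile to a single ternary-logic instruction (e.g.
   AVX-512 vpternlogq). Deferring XORs via gd->pending lets pairs of
   invocations share these three-way XORs. */
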
static_always_inline void
ghash_reduce (ghash_data_t * gd)
{
  u8x16 r;

  /* Final combination:
     gd->lo ^= gd->mid << 64
     gd->hi ^= gd->mid >> 64 */
  u8x16 midl = u8x16_word_shift_left (gd->mid, 8);
  u8x16 midr = u8x16_word_shift_right (gd->mid, 8);

  if (gd->pending)
    {
      gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, midl);
      gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, midr);
    }
  else
    {
      gd->lo ^= midl;
      gd->hi ^= midr;
    }

  /* first fold of the reduction: gd->lo's low qword times the reduction
     constant, folded into the high qword */
  r = gmul_hi_lo (ghash_poly2, gd->lo);
  gd->lo ^= u8x16_word_shift_left (r, 8);
}

static_always_inline void
ghash_reduce2 (ghash_data_t * gd)
{
  gd->tmp_lo = gmul_lo_lo (ghash_poly2, gd->lo);
  gd->tmp_hi = gmul_lo_hi (ghash_poly2, gd->lo);
}

static_always_inline u8x16
ghash_final (ghash_data_t * gd)
{
  return u8x16_xor3 (gd->hi, u8x16_word_shift_right (gd->tmp_lo, 4),
                     u8x16_word_shift_left (gd->tmp_hi, 4));
}

static_always_inline u8x16
ghash_mul (u8x16 a, u8x16 b)
{
  ghash_data_t _gd, *gd = &_gd;
  ghash_mul_first (gd, a, b);
  ghash_reduce (gd);
  ghash_reduce2 (gd);
  return ghash_final (gd);
}

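/* Example (sketch): a single-block GHASH update, per the formula in the
   header comment, with H' produced by ghash_precompute() below:

     GH = ghash_mul (GH ^ data, H');

   ghash_mul also shows the canonical ordering of the pipeline stages:
   mul_first -> mul_next* -> reduce -> reduce2 -> final. Callers such as
   the AES-GCM code interleave these stages with other work instead. */
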
#if defined(__VPCLMULQDQ__) && defined(__AVX512F__)

/* ghash_poly2 replicated across the four 128-bit lanes of a 512-bit vector */
static const u8x64 ghash4_poly2 = {
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
};

static_always_inline u8x64
gmul4_lo_lo (u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x00);
}

static_always_inline u8x64
gmul4_hi_lo (u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x01);
}

static_always_inline u8x64
gmul4_lo_hi (u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x10);
}

static_always_inline u8x64
gmul4_hi_hi (u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x11);
}

static_always_inline void
ghash4_mul_first (ghash_data_t *gd, u8x64 a, u8x64 b)
{
  gd->hi4 = gmul4_hi_hi (a, b);
  gd->lo4 = gmul4_lo_lo (a, b);
  gd->mid4 = gmul4_hi_lo (a, b) ^ gmul4_lo_hi (a, b);
  gd->pending = 0;
}

static_always_inline void
ghash4_mul_next (ghash_data_t *gd, u8x64 a, u8x64 b)
{
  u8x64 hi = gmul4_hi_hi (a, b);
  u8x64 lo = gmul4_lo_lo (a, b);

  if (gd->pending)
    {
      /* there is pending data from the previous invocation so we can XOR */
      gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, hi);
      gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, lo);
      gd->pending = 0;
    }
  else
    {
      /* there is no pending data from the previous invocation so we
         postpone the XOR */
      gd->tmp_hi4 = hi;
      gd->tmp_lo4 = lo;
      gd->pending = 1;
    }

  gd->mid4 = u8x64_xor3 (gd->mid4, gmul4_hi_lo (a, b), gmul4_lo_hi (a, b));
}

static_always_inline void
ghash4_reduce (ghash_data_t *gd)
{
  u8x64 r;

  /* Final combination:
     gd->lo4 ^= gd->mid4 << 64
     gd->hi4 ^= gd->mid4 >> 64 */
  u8x64 midl = u8x64_word_shift_left (gd->mid4, 8);
  u8x64 midr = u8x64_word_shift_right (gd->mid4, 8);

  if (gd->pending)
    {
      gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, midl);
      gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, midr);
    }
  else
    {
      gd->lo4 ^= midl;
      gd->hi4 ^= midr;
    }

  r = gmul4_hi_lo (ghash4_poly2, gd->lo4);
  gd->lo4 ^= u8x64_word_shift_left (r, 8);
}

static_always_inline void
ghash4_reduce2 (ghash_data_t *gd)
{
  gd->tmp_lo4 = gmul4_lo_lo (ghash4_poly2, gd->lo4);
  gd->tmp_hi4 = gmul4_lo_hi (ghash4_poly2, gd->lo4);
}

static_always_inline u8x16
ghash4_final (ghash_data_t *gd)
{
  u8x64 r;
  u8x32 t;

  r = u8x64_xor3 (gd->hi4, u8x64_word_shift_right (gd->tmp_lo4, 4),
                  u8x64_word_shift_left (gd->tmp_hi4, 4));

  /* horizontal XOR of 4 128-bit lanes */
  t = u8x64_extract_lo (r) ^ u8x64_extract_hi (r);
  return u8x32_extract_hi (t) ^ u8x32_extract_lo (t);
}
#endif

#if defined(__VPCLMULQDQ__)

/* ghash_poly2 replicated across both 128-bit lanes of a 256-bit vector */
static const u8x32 ghash2_poly2 = {
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
};

static_always_inline u8x32
gmul2_lo_lo (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x00);
}

static_always_inline u8x32
gmul2_hi_lo (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x01);
}

static_always_inline u8x32
gmul2_lo_hi (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x10);
}

static_always_inline u8x32
gmul2_hi_hi (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x11);
}

static_always_inline void
ghash2_mul_first (ghash_data_t *gd, u8x32 a, u8x32 b)
{
  gd->hi2 = gmul2_hi_hi (a, b);
  gd->lo2 = gmul2_lo_lo (a, b);
  gd->mid2 = gmul2_hi_lo (a, b) ^ gmul2_lo_hi (a, b);
  gd->pending = 0;
}

static_always_inline void
ghash2_mul_next (ghash_data_t *gd, u8x32 a, u8x32 b)
{
  u8x32 hi = gmul2_hi_hi (a, b);
  u8x32 lo = gmul2_lo_lo (a, b);

  if (gd->pending)
    {
      /* there is pending data from the previous invocation so we can XOR */
      gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, hi);
      gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, lo);
      gd->pending = 0;
    }
  else
    {
      /* there is no pending data from the previous invocation so we
         postpone the XOR */
      gd->tmp_hi2 = hi;
      gd->tmp_lo2 = lo;
      gd->pending = 1;
    }

  gd->mid2 = u8x32_xor3 (gd->mid2, gmul2_hi_lo (a, b), gmul2_lo_hi (a, b));
}

static_always_inline void
ghash2_reduce (ghash_data_t *gd)
{
  u8x32 r;

  /* Final combination:
     gd->lo2 ^= gd->mid2 << 64
     gd->hi2 ^= gd->mid2 >> 64 */
  u8x32 midl = u8x32_word_shift_left (gd->mid2, 8);
  u8x32 midr = u8x32_word_shift_right (gd->mid2, 8);

  if (gd->pending)
    {
      gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, midl);
      gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, midr);
    }
  else
    {
      gd->lo2 ^= midl;
      gd->hi2 ^= midr;
    }

  r = gmul2_hi_lo (ghash2_poly2, gd->lo2);
  gd->lo2 ^= u8x32_word_shift_left (r, 8);
}

static_always_inline void
ghash2_reduce2 (ghash_data_t *gd)
{
  gd->tmp_lo2 = gmul2_lo_lo (ghash2_poly2, gd->lo2);
  gd->tmp_hi2 = gmul2_lo_hi (ghash2_poly2, gd->lo2);
}

static_always_inline u8x16
ghash2_final (ghash_data_t *gd)
{
  u8x32 r;

  r = u8x32_xor3 (gd->hi2, u8x32_word_shift_right (gd->tmp_lo2, 4),
                  u8x32_word_shift_left (gd->tmp_hi2, 4));

  /* horizontal XOR of 2 128-bit lanes */
  return u8x32_extract_hi (r) ^ u8x32_extract_lo (r);
}
#endif

static_always_inline void
ghash_precompute (u8x16 H, u8x16 * Hi, int n)
{
  u8x16 r8;
  u32x4 r32;
  /* calculate H<<1 mod poly from the hash key */
  r8 = (u8x16) ((u64x2) H >> 63);
  H = (u8x16) ((u64x2) H << 1);
  H |= u8x16_word_shift_left (r8, 8);
  r32 = (u32x4) u8x16_word_shift_right (r8, 8);
  r32 = u32x4_shuffle (r32, 0, 1, 2, 0);
  /* if the most significant bit of H was set, xor in the polynomial */
  r32 = r32 == (u32x4) {1, 0, 0, 1};
  Hi[n - 1] = H = H ^ ((u8x16) r32 & ghash_poly);

  /* calculate H^(i + 1) */
  for (int i = n - 2; i >= 0; i--)
    Hi[i] = ghash_mul (H, Hi[i + 1]);
}

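/* Usage sketch (illustrative, mirroring the example in the header comment):

     u8x16 Hi[4];
     ghash_precompute (H, Hi, 4);

   fills the table of powers of H' that is then consumed by
   ghash_mul_first() / ghash_mul_next() above. */
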
#endif /* __ghash_h__ */