src/plugins/crypto_native/ghash.h

   1 /*
   2  *------------------------------------------------------------------
   3  * Copyright (c) 2019 Cisco and/or its affiliates.
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  *------------------------------------------------------------------
  16  */
  17
  18 /*
  19  *------------------------------------------------------------------
  20  *  Copyright(c) 2018, Intel Corporation All rights reserved.
  21  *
  22  *  Redistribution and use in source and binary forms, with or without
  23  *  modification, are permitted provided that the following conditions
  24  *  are met:
  25  *    * Redistributions of source code must retain the above copyright
  26  *      notice, this list of conditions and the following disclaimer.
  27  *    * Redistributions in binary form must reproduce the above copyright
  28  *      notice, this list of conditions and the following disclaimer in
  29  *      the documentation and/or other materials provided with the
  30  *      distribution.
  31  *    * Neither the name of Intel Corporation nor the names of its
  32  *      contributors may be used to endorse or promote products derived
  33  *      from this software without specific prior written permission.
  34  *
  35  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  36  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  37  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  38  *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  39  *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  40  *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  41  *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES * LOSS OF USE,
  42  *  DATA, OR PROFITS * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  43  *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  44  *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  45  *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  46  *------------------------------------------------------------------
  47  */
  48
  49 /*
  50  * Based on work by: Shay Gueron, Michael E. Kounavis, Erdinc Ozturk,
  51  *                   Vinodh Gopal, James Guilford, Tomasz Kantecki
  52  *
  53  * References:
  54  * [1] Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on
  55  *     Intel Architecture Processors. August, 2010
  56  * [2] Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on
  57  *     Intel Architecture Processors. October, 2012.
  58  * [3] intel-ipsec-mb library, https://github.com/01org/intel-ipsec-mb.git
  59  *
  60  * Definitions:
  61  *  GF    Galois Extension Field GF(2^128) - finite field where elements are
  62  *        represented as polynomials with coefficients in GF(2) with the
  63  *        highest degree of 127. Polynomials are represented as 128-bit binary
  64  *        numbers where each bit represents one coefficient.
  65  *        e.g. polynomial x^5 + x^3 + x + 1 is represented in binary 101011.
  66  *  H     hash key (128 bit)
  67  *  POLY  irreducible polynomial x^127 + x^7 + x^2 + x + 1
  68  *  RPOLY irreducible polynomial x^128 + x^127 + x^126 + x^121 + 1
  69  *  +     addition in GF, which equals to XOR operation
  70  *  *     multiplication in GF
  71  *
  72  * GF multiplication consists of 2 steps:
  73  *  - carry-less multiplication of two 128-bit operands into 256-bit result
  74  *  - reduction of 256-bit result into 128-bit with modulo POLY
  75  *
  76  * GHash is calculated on 128-bit blocks of data according to the following
  77  * formula:
  78  *    GH = (GH + data) * hash_key
  79  *
  80  * To avoid bit-reflection of data, this code uses GF multipication
  81  * with reversed polynomial:
  82  *   a * b * x^-127 mod RPOLY
  83  *
  84  * To improve computation speed table Hi is precomputed with powers of H',
  85  * where H' is calculated as H<<1 mod RPOLY.
  86  * This allows us to improve performance by deferring reduction. For example
  87  * to caclulate ghash of 4 128-bit blocks of data (b0, b1, b2, b3), we can do:
  88  *
  89  * __i128 Hi[4];
  90  * ghash_precompute (H, Hi, 4);
  91  *
  92  * ghash_data_t _gd, *gd = &_gd;
  93  * ghash_mul_first (gd, GH ^ b0, Hi[3]);
  94  * ghash_mul_next (gd, b1, Hi[2]);
  95  * ghash_mul_next (gd, b2, Hi[1]);
  96  * ghash_mul_next (gd, b3, Hi[0]);
  97  * ghash_reduce (gd);
  98  * ghash_reduce2 (gd);
  99  * GH = ghash_final (gd);
 100  *
 101  * Reduction step is split into 3 functions so it can be better interleaved
 102  * with other code, (i.e. with AES computation).
 103  */
 104
 105 #ifndef __ghash_h__
 106 #define __ghash_h__
 107
 108 static_always_inline u8x16
 109 gmul_lo_lo (u8x16 a, u8x16 b)
 110 {
 111 #if defined (__PCLMUL__)
 112   return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x00);
 113 #elif defined (__ARM_FEATURE_CRYPTO)
 114   return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a),
 115                             (poly64_t) vget_low_p64 ((poly64x2_t) b));
 116 #endif
 117 }
 118
 119 static_always_inline u8x16
 120 gmul_hi_lo (u8x16 a, u8x16 b)
 121 {
 122 #if defined (__PCLMUL__)
 123   return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x01);
 124 #elif defined (__ARM_FEATURE_CRYPTO)
 125   return (u8x16) vmull_p64 ((poly64_t) vget_high_p64 ((poly64x2_t) a),
 126                             (poly64_t) vget_low_p64 ((poly64x2_t) b));
 127 #endif
 128 }
 129
 130 static_always_inline u8x16
 131 gmul_lo_hi (u8x16 a, u8x16 b)
 132 {
 133 #if defined (__PCLMUL__)
 134   return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x10);
 135 #elif defined (__ARM_FEATURE_CRYPTO)
 136   return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a),
 137                             (poly64_t) vget_high_p64 ((poly64x2_t) b));
 138 #endif
 139 }
 140
 141 static_always_inline u8x16
 142 gmul_hi_hi (u8x16 a, u8x16 b)
 143 {
 144 #if defined (__PCLMUL__)
 145   return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x11);
 146 #elif defined (__ARM_FEATURE_CRYPTO)
 147   return (u8x16) vmull_high_p64 ((poly64x2_t) a, (poly64x2_t) b);
 148 #endif
 149 }
 150
 151 typedef struct
 152 {
 153   u8x16 mid, hi, lo, tmp_lo, tmp_hi;
 154   int pending;
 155 } ghash_data_t;
 156
 157 static const u8x16 ghash_poly = {
 158   0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 159   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
 160 };
 161
 162 static const u8x16 ghash_poly2 = {
 163   0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
 164   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
 165 };
 166
 167 static_always_inline void
 168 ghash_mul_first (ghash_data_t * gd, u8x16 a, u8x16 b)
 169 {
 170   /* a1 * b1 */
 171   gd->hi = gmul_hi_hi (a, b);
 172   /* a0 * b0 */
 173   gd->lo = gmul_lo_lo (a, b);
 174   /* a0 * b1 ^ a1 * b0 */
 175   gd->mid = (gmul_hi_lo (a, b) ^ gmul_lo_hi (a, b));
 176
 177   /* set gd->pending to 0 so next invocation of ghash_mul_next(...) knows that
 178      there is no pending data in tmp_lo and tmp_hi */
 179   gd->pending = 0;
 180 }
 181
 182 static_always_inline void
 183 ghash_mul_next (ghash_data_t * gd, u8x16 a, u8x16 b)
 184 {
 185   /* a1 * b1 */
 186   u8x16 hi = gmul_hi_hi (a, b);
 187   /* a0 * b0 */
 188   u8x16 lo = gmul_lo_lo (a, b);
 189
 190   /* this branch will be optimized out by the compiler, and it allows us to
 191      reduce number of XOR operations by using ternary logic */
 192   if (gd->pending)
 193     {
 194       /* there is peding data from previous invocation so we can XOR */
 195       gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, hi);
 196       gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, lo);
 197       gd->pending = 0;
 198     }
 199   else
 200     {
 201       /* there is no peding data from previous invocation so we postpone XOR */
 202       gd->tmp_hi = hi;
 203       gd->tmp_lo = lo;
 204       gd->pending = 1;
 205     }
 206
 207   /* gd->mid ^= a0 * b1 ^ a1 * b0  */
 208   gd->mid = u8x16_xor3 (gd->mid, gmul_hi_lo (a, b), gmul_lo_hi (a, b));
 209 }
 210
 211 static_always_inline void
 212 ghash_reduce (ghash_data_t * gd)
 213 {
 214   u8x16 r;
 215
 216   /* Final combination:
 217      gd->lo ^= gd->mid << 64
 218      gd->hi ^= gd->mid >> 64 */
 219   u8x16 midl = u8x16_word_shift_left (gd->mid, 8);
 220   u8x16 midr = u8x16_word_shift_right (gd->mid, 8);
 221
 222   if (gd->pending)
 223     {
 224       gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, midl);
 225       gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, midr);
 226     }
 227   else
 228     {
 229       gd->lo ^= midl;
 230       gd->hi ^= midr;
 231     }
 232   r = gmul_hi_lo (ghash_poly2, gd->lo);
 233   gd->lo ^= u8x16_word_shift_left (r, 8);
 234 }
 235
 236 static_always_inline void
 237 ghash_reduce2 (ghash_data_t * gd)
 238 {
 239   gd->tmp_lo = gmul_lo_lo (ghash_poly2, gd->lo);
 240   gd->tmp_hi = gmul_lo_hi (ghash_poly2, gd->lo);
 241 }
 242
 243 static_always_inline u8x16
 244 ghash_final (ghash_data_t * gd)
 245 {
 246   return u8x16_xor3 (gd->hi, u8x16_word_shift_right (gd->tmp_lo, 4),
 247                      u8x16_word_shift_left (gd->tmp_hi, 4));
 248 }
 249
 250 static_always_inline u8x16
 251 ghash_mul (u8x16 a, u8x16 b)
 252 {
 253   ghash_data_t _gd, *gd = &_gd;
 254   ghash_mul_first (gd, a, b);
 255   ghash_reduce (gd);
 256   ghash_reduce2 (gd);
 257   return ghash_final (gd);
 258 }
 259
 260 #ifdef __VPCLMULQDQ__
 261
 262 static const u8x64 ghash4_poly2 = {
 263   0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
 264   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
 265   0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
 266   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
 267   0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
 268   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
 269   0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
 270   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
 271 };
 272
 273 typedef struct
 274 {
 275   u8x64 hi, lo, mid, tmp_lo, tmp_hi;
 276   int pending;
 277 } ghash4_data_t;
 278
 279 static_always_inline u8x64
 280 gmul4_lo_lo (u8x64 a, u8x64 b)
 281 {
 282   return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x00);
 283 }
 284
 285 static_always_inline u8x64
 286 gmul4_hi_lo (u8x64 a, u8x64 b)
 287 {
 288   return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x01);
 289 }
 290
 291 static_always_inline u8x64
 292 gmul4_lo_hi (u8x64 a, u8x64 b)
 293 {
 294   return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x10);
 295 }
 296
 297 static_always_inline u8x64
 298 gmul4_hi_hi (u8x64 a, u8x64 b)
 299 {
 300   return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x11);
 301 }
 302
 303
 304 static_always_inline void
 305 ghash4_mul_first (ghash4_data_t * gd, u8x64 a, u8x64 b)
 306 {
 307   gd->hi = gmul4_hi_hi (a, b);
 308   gd->lo = gmul4_lo_lo (a, b);
 309   gd->mid = (gmul4_hi_lo (a, b) ^ gmul4_lo_hi (a, b));
 310   gd->pending = 0;
 311 }
 312
 313 static_always_inline void
 314 ghash4_mul_next (ghash4_data_t * gd, u8x64 a, u8x64 b)
 315 {
 316   u8x64 hi = gmul4_hi_hi (a, b);
 317   u8x64 lo = gmul4_lo_lo (a, b);
 318
 319   if (gd->pending)
 320     {
 321       /* there is peding data from previous invocation so we can XOR */
 322       gd->hi = u8x64_xor3 (gd->hi, gd->tmp_hi, hi);
 323       gd->lo = u8x64_xor3 (gd->lo, gd->tmp_lo, lo);
 324       gd->pending = 0;
 325     }
 326   else
 327     {
 328       /* there is no peding data from previous invocation so we postpone XOR */
 329       gd->tmp_hi = hi;
 330       gd->tmp_lo = lo;
 331       gd->pending = 1;
 332     }
 333   gd->mid = u8x64_xor3 (gd->mid, gmul4_hi_lo (a, b), gmul4_lo_hi (a, b));
 334 }
 335
 336 static_always_inline void
 337 ghash4_reduce (ghash4_data_t * gd)
 338 {
 339   u8x64 r;
 340
 341   /* Final combination:
 342      gd->lo ^= gd->mid << 64
 343      gd->hi ^= gd->mid >> 64 */
 344
 345   u8x64 midl = u8x64_word_shift_left (gd->mid, 8);
 346   u8x64 midr = u8x64_word_shift_right (gd->mid, 8);
 347
 348   if (gd->pending)
 349     {
 350       gd->lo = u8x64_xor3 (gd->lo, gd->tmp_lo, midl);
 351       gd->hi = u8x64_xor3 (gd->hi, gd->tmp_hi, midr);
 352     }
 353   else
 354     {
 355       gd->lo ^= midl;
 356       gd->hi ^= midr;
 357     }
 358
 359   r = gmul4_hi_lo (ghash4_poly2, gd->lo);
 360   gd->lo ^= u8x64_word_shift_left (r, 8);
 361
 362 }
 363
 364 static_always_inline void
 365 ghash4_reduce2 (ghash4_data_t * gd)
 366 {
 367   gd->tmp_lo = gmul4_lo_lo (ghash4_poly2, gd->lo);
 368   gd->tmp_hi = gmul4_lo_hi (ghash4_poly2, gd->lo);
 369 }
 370
 371 static_always_inline u8x16
 372 ghash4_final (ghash4_data_t * gd)
 373 {
 374   u8x64 r;
 375   u8x32 t;
 376
 377   r = u8x64_xor3 (gd->hi, u8x64_word_shift_right (gd->tmp_lo, 4),
 378                   u8x64_word_shift_left (gd->tmp_hi, 4));
 379
 380   /* horizontal XOR of 4 128-bit lanes */
 381   t = u8x64_extract_lo (r) ^ u8x64_extract_hi (r);
 382   return u8x32_extract_hi (t) ^ u8x32_extract_lo (t);
 383 }
 384 #endif
 385
 386 static_always_inline void
 387 ghash_precompute (u8x16 H, u8x16 * Hi, int n)
 388 {
 389   u8x16 r8;
 390   u32x4 r32;
 391   /* calcullate H<<1 mod poly from the hash key */
 392   r8 = (u8x16) ((u64x2) H >> 63);
 393   H = (u8x16) ((u64x2) H << 1);
 394   H |= u8x16_word_shift_left (r8, 8);
 395   r32 = (u32x4) u8x16_word_shift_right (r8, 8);
 396 #ifdef __SSE2__
 397   r32 = u32x4_shuffle (r32, 0, 1, 2, 0);
 398 #else
 399   r32[3] = r32[0];
 400 #endif
 401   /* *INDENT-OFF* */
 402   r32 = r32 == (u32x4) {1, 0, 0, 1};
 403   /* *INDENT-ON* */
 404   Hi[n - 1] = H = H ^ ((u8x16) r32 & ghash_poly);
 405
 406   /* calculate H^(i + 1) */
 407   for (int i = n - 2; i >= 0; i--)
 408     Hi[i] = ghash_mul (H, Hi[i + 1]);
 409 }
 410
 411 #endif /* __ghash_h__ */
 412
 413 /*
 414  * fd.io coding-style-patch-verification: ON
 415  *
 416  * Local Variables:
 417  * eval: (c-set-style "gnu")
 418  * End:
 419  */