/*
 *------------------------------------------------------------------
 * Copyright (c) 2019 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */

/*
 *------------------------------------------------------------------
 * Copyright(c) 2018, Intel Corporation All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *------------------------------------------------------------------
 */

/*
 *------------------------------------------------------------------
 * Based on work by: Shay Gueron, Michael E. Kounavis, Erdinc Ozturk,
 *                   Vinodh Gopal, James Guilford, Tomasz Kantecki
 *
 * References:
 * [1] Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on
 *     Intel Architecture Processors. August, 2010
 * [2] Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on
 *     Intel Architecture Processors. October, 2012.
 * [3] intel-ipsec-mb library, https://github.com/01org/intel-ipsec-mb.git
 *
 * Definitions:
 * GF    Galois Extension Field GF(2^128) - finite field where elements are
 *       represented as polynomials with coefficients in GF(2) with the
 *       highest degree of 127. Polynomials are represented as 128-bit
 *       binary numbers where each bit represents one coefficient.
 *       e.g. polynomial x^5 + x^3 + x + 1 is represented in binary 101011.
 * H     hash key (128 bit)
 * POLY  irreducible polynomial x^128 + x^7 + x^2 + x + 1
 * RPOLY irreducible polynomial x^128 + x^127 + x^126 + x^121 + 1
 * +     addition in GF, which equals the XOR operation
 * *     multiplication in GF
 *
 * GF multiplication consists of 2 steps:
 *   - carry-less multiplication of two 128-bit operands into a 256-bit
 *     result
 *   - reduction of the 256-bit result into 128 bits, modulo POLY
 *
 * GHash is calculated on 128-bit blocks of data according to the following
 * formula:
 *   GH = (GH + data) * hash_key
 *
 * To avoid bit-reflection of data, this code uses GF multiplication
 * with the reversed polynomial:
 *   a * b * x^-127 mod RPOLY
 *
 * To improve computation speed, the table Hi is precomputed with powers of
 * H', where H' is calculated as H<<1 mod RPOLY. This allows us to improve
 * performance by deferring reduction. For example, to calculate the ghash
 * of 4 128-bit blocks of data (b0, b1, b2, b3), we can do:
 *
 *   u8x16 Hi[4];
 *   ghash_precompute (H, Hi, 4);
 *
 *   ghash_data_t _gd, *gd = &_gd;
 *   ghash_mul_first (gd, GH ^ b0, Hi[3]);
 *   ghash_mul_next (gd, b1, Hi[2]);
 *   ghash_mul_next (gd, b2, Hi[1]);
 *   ghash_mul_next (gd, b3, Hi[0]);
 *   ghash_reduce (gd);
 *   ghash_reduce2 (gd);
 *   GH = ghash_final (gd);
 *
 * The reduction step is split into 3 functions so it can be better
 * interleaved with other code (i.e. with AES computation).
 *------------------------------------------------------------------
 */

#ifndef __ghash_h__
#define __ghash_h__

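/* The gmul_X_Y helpers below carry-less multiply one 64-bit half of each
   128-bit operand: X selects the half of a, Y the half of b (lo = bits
   0-63, hi = bits 64-127), so e.g. gmul_hi_lo (a, b) computes a1 * b0.
   For intuition, an equivalent scalar bit-by-bit sketch (illustrative
   only; clmul64_ref is not part of this header) of a 64x64 -> 128 bit
   carry-less multiply would be:

     static_always_inline void
     clmul64_ref (u64 a, u64 b, u64 *hi, u64 *lo)
     {
       u64 h = 0, l = 0;
       for (int i = 0; i < 64; i++)
         if (b & (1ULL << i))
           {
             l ^= a << i;
             h ^= i ? a >> (64 - i) : 0;
           }
       *hi = h;
       *lo = l;
     }

   Each gmul_* helper does this in a single instruction (PCLMULQDQ on x86,
   PMULL/PMULL2 on ARMv8). */
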
static_always_inline u8x16
gmul_lo_lo (u8x16 a, u8x16 b)
{
#if defined (__PCLMUL__)
  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x00);
#elif defined (__ARM_FEATURE_CRYPTO)
  return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a),
                            (poly64_t) vget_low_p64 ((poly64x2_t) b));
#endif
}

static_always_inline u8x16
gmul_hi_lo (u8x16 a, u8x16 b)
{
#if defined (__PCLMUL__)
  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x01);
#elif defined (__ARM_FEATURE_CRYPTO)
  return (u8x16) vmull_p64 ((poly64_t) vget_high_p64 ((poly64x2_t) a),
                            (poly64_t) vget_low_p64 ((poly64x2_t) b));
#endif
}

static_always_inline u8x16
gmul_lo_hi (u8x16 a, u8x16 b)
{
#if defined (__PCLMUL__)
  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x10);
#elif defined (__ARM_FEATURE_CRYPTO)
  return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a),
                            (poly64_t) vget_high_p64 ((poly64x2_t) b));
#endif
}

static_always_inline u8x16
gmul_hi_hi (u8x16 a, u8x16 b)
{
#if defined (__PCLMUL__)
  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x11);
#elif defined (__ARM_FEATURE_CRYPTO)
  return (u8x16) vmull_high_p64 ((poly64x2_t) a, (poly64x2_t) b);
#endif
}

typedef struct
{
  u8x16 mid, hi, lo, tmp_lo, tmp_hi;
  u8x32 hi2, lo2, mid2, tmp_lo2, tmp_hi2;
  u8x64 hi4, lo4, mid4, tmp_lo4, tmp_hi4;
  int pending;
} ghash_data_t;

static const u8x16 ghash_poly = {
  0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
};

static const u8x16 ghash_poly2 = {
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
};

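/* ghash_poly encodes the reduction term of RPOLY and is XOR-ed into H
   conditionally in ghash_precompute(); ghash_poly2 holds the same term
   pre-positioned for the two-stage folding done by ghash_reduce() and
   ghash_reduce2() below. */
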
static_always_inline void
ghash_mul_first (ghash_data_t * gd, u8x16 a, u8x16 b)
{
  /* a1 * b1 */
  gd->hi = gmul_hi_hi (a, b);
  /* a0 * b0 */
  gd->lo = gmul_lo_lo (a, b);
  /* a0 * b1 ^ a1 * b0 */
  gd->mid = gmul_hi_lo (a, b) ^ gmul_lo_hi (a, b);

  /* set gd->pending to 0 so next invocation of ghash_mul_next(...) knows
     that there is no pending data in tmp_lo and tmp_hi */
  gd->pending = 0;
}

static_always_inline void
ghash_mul_next (ghash_data_t * gd, u8x16 a, u8x16 b)
{
  /* a1 * b1 */
  u8x16 hi = gmul_hi_hi (a, b);
  /* a0 * b0 */
  u8x16 lo = gmul_lo_lo (a, b);

  /* this branch will be optimized out by the compiler, and it allows us to
     reduce the number of XOR operations by using ternary logic */
  if (gd->pending)
    {
      /* there is pending data from the previous invocation so we can XOR */
      gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, hi);
      gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, lo);
      gd->pending = 0;
    }
  else
    {
      /* there is no pending data from the previous invocation so we
         postpone the XOR */
      gd->tmp_hi = hi;
      gd->tmp_lo = lo;
      gd->pending = 1;
    }

  /* gd->mid ^= a0 * b1 ^ a1 * b0 */
  gd->mid = u8x16_xor3 (gd->mid, gmul_hi_lo (a, b), gmul_lo_hi (a, b));
}

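/* Note: u8x16_xor3 (a, b, c) computes a ^ b ^ c and, where the target
   supports it, can compile to a single ternary-logic instruction (e.g.
   AVX-512 vpternlogq). Deferring XORs via gd->pending lets pairs of
   invocations share these three-way XORs. */
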
static_always_inline void
ghash_reduce (ghash_data_t * gd)
{
  u8x16 r;

  /* Final combination:
     gd->lo ^= gd->mid << 64
     gd->hi ^= gd->mid >> 64 */
  u8x16 midl = u8x16_word_shift_left (gd->mid, 8);
  u8x16 midr = u8x16_word_shift_right (gd->mid, 8);

  if (gd->pending)
    {
      gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, midl);
      gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, midr);
    }
  else
    {
      gd->lo ^= midl;
      gd->hi ^= midr;
    }

  /* first fold of the reduction: gd->lo's low qword times the reduction
     constant, folded into the high qword */
  r = gmul_hi_lo (ghash_poly2, gd->lo);
  gd->lo ^= u8x16_word_shift_left (r, 8);
}

static_always_inline void
ghash_reduce2 (ghash_data_t * gd)
{
  gd->tmp_lo = gmul_lo_lo (ghash_poly2, gd->lo);
  gd->tmp_hi = gmul_lo_hi (ghash_poly2, gd->lo);
}

static_always_inline u8x16
ghash_final (ghash_data_t * gd)
{
  return u8x16_xor3 (gd->hi, u8x16_word_shift_right (gd->tmp_lo, 4),
                     u8x16_word_shift_left (gd->tmp_hi, 4));
}

static_always_inline u8x16
ghash_mul (u8x16 a, u8x16 b)
{
  ghash_data_t _gd, *gd = &_gd;
  ghash_mul_first (gd, a, b);
  ghash_reduce (gd);
  ghash_reduce2 (gd);
  return ghash_final (gd);
}

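/* Example (sketch): a single-block GHASH update, per the formula in the
   header comment, with H' produced by ghash_precompute() below:

     GH = ghash_mul (GH ^ data, H');

   ghash_mul also shows the canonical ordering of the pipeline stages:
   mul_first -> mul_next* -> reduce -> reduce2 -> final. Callers such as
   the AES-GCM code interleave these stages with other work instead. */
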
#if defined(__VPCLMULQDQ__) && defined(__AVX512F__)

/* ghash_poly2 replicated across the four 128-bit lanes of a 512-bit vector */
static const u8x64 ghash4_poly2 = {
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
};

static_always_inline u8x64
gmul4_lo_lo (u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x00);
}

static_always_inline u8x64
gmul4_hi_lo (u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x01);
}

static_always_inline u8x64
gmul4_lo_hi (u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x10);
}

static_always_inline u8x64
gmul4_hi_hi (u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x11);
}

static_always_inline void
ghash4_mul_first (ghash_data_t *gd, u8x64 a, u8x64 b)
{
  gd->hi4 = gmul4_hi_hi (a, b);
  gd->lo4 = gmul4_lo_lo (a, b);
  gd->mid4 = gmul4_hi_lo (a, b) ^ gmul4_lo_hi (a, b);
  gd->pending = 0;
}

static_always_inline void
ghash4_mul_next (ghash_data_t *gd, u8x64 a, u8x64 b)
{
  u8x64 hi = gmul4_hi_hi (a, b);
  u8x64 lo = gmul4_lo_lo (a, b);

  if (gd->pending)
    {
      /* there is pending data from the previous invocation so we can XOR */
      gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, hi);
      gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, lo);
      gd->pending = 0;
    }
  else
    {
      /* there is no pending data from the previous invocation so we
         postpone the XOR */
      gd->tmp_hi4 = hi;
      gd->tmp_lo4 = lo;
      gd->pending = 1;
    }

  gd->mid4 = u8x64_xor3 (gd->mid4, gmul4_hi_lo (a, b), gmul4_lo_hi (a, b));
}

static_always_inline void
ghash4_reduce (ghash_data_t *gd)
{
  u8x64 r;

  /* Final combination:
     gd->lo4 ^= gd->mid4 << 64
     gd->hi4 ^= gd->mid4 >> 64 */
  u8x64 midl = u8x64_word_shift_left (gd->mid4, 8);
  u8x64 midr = u8x64_word_shift_right (gd->mid4, 8);

  if (gd->pending)
    {
      gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, midl);
      gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, midr);
    }
  else
    {
      gd->lo4 ^= midl;
      gd->hi4 ^= midr;
    }

  r = gmul4_hi_lo (ghash4_poly2, gd->lo4);
  gd->lo4 ^= u8x64_word_shift_left (r, 8);
}

static_always_inline void
ghash4_reduce2 (ghash_data_t *gd)
{
  gd->tmp_lo4 = gmul4_lo_lo (ghash4_poly2, gd->lo4);
  gd->tmp_hi4 = gmul4_lo_hi (ghash4_poly2, gd->lo4);
}

static_always_inline u8x16
ghash4_final (ghash_data_t *gd)
{
  u8x64 r;
  u8x32 t;

  r = u8x64_xor3 (gd->hi4, u8x64_word_shift_right (gd->tmp_lo4, 4),
                  u8x64_word_shift_left (gd->tmp_hi4, 4));

  /* horizontal XOR of 4 128-bit lanes */
  t = u8x64_extract_lo (r) ^ u8x64_extract_hi (r);
  return u8x32_extract_hi (t) ^ u8x32_extract_lo (t);
}
#endif

#if defined(__VPCLMULQDQ__)

/* ghash_poly2 replicated across both 128-bit lanes of a 256-bit vector */
static const u8x32 ghash2_poly2 = {
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
};

static_always_inline u8x32
gmul2_lo_lo (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x00);
}

static_always_inline u8x32
gmul2_hi_lo (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x01);
}

static_always_inline u8x32
gmul2_lo_hi (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x10);
}

static_always_inline u8x32
gmul2_hi_hi (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x11);
}

static_always_inline void
ghash2_mul_first (ghash_data_t *gd, u8x32 a, u8x32 b)
{
  gd->hi2 = gmul2_hi_hi (a, b);
  gd->lo2 = gmul2_lo_lo (a, b);
  gd->mid2 = gmul2_hi_lo (a, b) ^ gmul2_lo_hi (a, b);
  gd->pending = 0;
}

static_always_inline void
ghash2_mul_next (ghash_data_t *gd, u8x32 a, u8x32 b)
{
  u8x32 hi = gmul2_hi_hi (a, b);
  u8x32 lo = gmul2_lo_lo (a, b);

  if (gd->pending)
    {
      /* there is pending data from the previous invocation so we can XOR */
      gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, hi);
      gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, lo);
      gd->pending = 0;
    }
  else
    {
      /* there is no pending data from the previous invocation so we
         postpone the XOR */
      gd->tmp_hi2 = hi;
      gd->tmp_lo2 = lo;
      gd->pending = 1;
    }

  gd->mid2 = u8x32_xor3 (gd->mid2, gmul2_hi_lo (a, b), gmul2_lo_hi (a, b));
}

static_always_inline void
ghash2_reduce (ghash_data_t *gd)
{
  u8x32 r;

  /* Final combination:
     gd->lo2 ^= gd->mid2 << 64
     gd->hi2 ^= gd->mid2 >> 64 */
  u8x32 midl = u8x32_word_shift_left (gd->mid2, 8);
  u8x32 midr = u8x32_word_shift_right (gd->mid2, 8);

  if (gd->pending)
    {
      gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, midl);
      gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, midr);
    }
  else
    {
      gd->lo2 ^= midl;
      gd->hi2 ^= midr;
    }

  r = gmul2_hi_lo (ghash2_poly2, gd->lo2);
  gd->lo2 ^= u8x32_word_shift_left (r, 8);
}

static_always_inline void
ghash2_reduce2 (ghash_data_t *gd)
{
  gd->tmp_lo2 = gmul2_lo_lo (ghash2_poly2, gd->lo2);
  gd->tmp_hi2 = gmul2_lo_hi (ghash2_poly2, gd->lo2);
}

static_always_inline u8x16
ghash2_final (ghash_data_t *gd)
{
  u8x32 r;

  r = u8x32_xor3 (gd->hi2, u8x32_word_shift_right (gd->tmp_lo2, 4),
                  u8x32_word_shift_left (gd->tmp_hi2, 4));

  /* horizontal XOR of 2 128-bit lanes */
  return u8x32_extract_hi (r) ^ u8x32_extract_lo (r);
}
#endif

static_always_inline void
ghash_precompute (u8x16 H, u8x16 * Hi, int n)
{
  u8x16 r8;
  u32x4 r32;
  /* calculate H<<1 mod poly from the hash key */
  r8 = (u8x16) ((u64x2) H >> 63);
  H = (u8x16) ((u64x2) H << 1);
  H |= u8x16_word_shift_left (r8, 8);
  r32 = (u32x4) u8x16_word_shift_right (r8, 8);
  r32 = u32x4_shuffle (r32, 0, 1, 2, 0);
  /* if the most significant bit of H was set, xor in the polynomial */
  r32 = r32 == (u32x4) {1, 0, 0, 1};
  Hi[n - 1] = H = H ^ ((u8x16) r32 & ghash_poly);

  /* calculate H^(i + 1) */
  for (int i = n - 2; i >= 0; i--)
    Hi[i] = ghash_mul (H, Hi[i + 1]);
}

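/* Usage sketch (illustrative, mirroring the example in the header comment):

     u8x16 Hi[4];
     ghash_precompute (H, Hi, 4);

   fills the table of powers of H' that is then consumed by
   ghash_mul_first() / ghash_mul_next() above. */
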
#endif /* __ghash_h__ */