/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017 Intel Corporation
 */

#ifndef _RTE_NET_CRC_SSE_H_
#define _RTE_NET_CRC_SSE_H_

#include <string.h>

#include <rte_branch_prediction.h>

#include <x86intrin.h>
#include <cpuid.h>

#ifdef __cplusplus
extern "C" {
#endif

/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
        __m128i rk1_rk2;
        __m128i rk5_rk6;
        __m128i rk7_rk8;
};

static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);
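
/*
 * Both contexts are populated once by rte_net_crc_sse42_init() below with
 * the folding and reduction constants for the CRC-32 (Ethernet) and
 * CRC-16 (CCITT) algorithms.
 */
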
/**
 * @brief Performs one folding round
 *
 * Logically, the function operates as follows:
 *     DATA = READ_NEXT_16BYTES();
 *     F1 = LSB8(FOLD)
 *     F2 = MSB8(FOLD)
 *     T1 = CLMUL(F1, RK1)
 *     T2 = CLMUL(F2, RK2)
 *     FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block
 *   16 byte data block
 * @param precomp
 *   Precomputed rk1 and rk2 constants
 * @param fold
 *   Current 16 byte folded data
 *
 * @return
 *   New 16 byte folded data
 */
static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block,
                __m128i precomp,
                __m128i fold)
{
        __m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
        __m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);

        return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
}

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128
 *   128 bits data to be reduced
 * @param precomp
 *   precomputed constants rk5, rk6
 *
 * @return
 *   64 bits reduced data
 */

static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
{
        __m128i tmp0, tmp1, tmp2;

        /* 64b fold */
        tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
        tmp1 = _mm_srli_si128(data128, 8);
        tmp0 = _mm_xor_si128(tmp0, tmp1);

        /* 32b fold */
        tmp2 = _mm_slli_si128(tmp0, 4);
        tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);

        return _mm_xor_si128(tmp1, tmp0);
}

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64
 *   64 bits data to be reduced
 * @param precomp
 *   rk7 precomputed constant
 *
 * @return
 *   reduced 32 bits data
 */

static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
        static const uint32_t mask1[4] __rte_aligned(16) = {
                0xffffffff, 0xffffffff, 0x00000000, 0x00000000
        };

        static const uint32_t mask2[4] __rte_aligned(16) = {
                0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
        };
        __m128i tmp0, tmp1, tmp2;

        tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));

        tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
        tmp1 = _mm_xor_si128(tmp1, tmp0);
        tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));

        tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
        tmp2 = _mm_xor_si128(tmp2, tmp1);
        tmp2 = _mm_xor_si128(tmp2, tmp0);

        return _mm_extract_epi32(tmp2, 2);
}

static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
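
/*
 * Note: _mm_shuffle_epi8() zeroes an output byte whenever the matching
 * control byte has its most significant bit set.  Loading the 16 control
 * bytes starting at &crc_xmm_shift_tab[16 - num] therefore gives num leading
 * 0xff entries followed by 0x00, 0x01, ..., so xmm_shift_left() below shifts
 * the register left by num bytes and fills the vacated low bytes with zeroes.
 */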

/**
 * Shifts the 128 bit register left by the specified number of bytes
 *
 * @param reg
 *   128 bit value
 * @param num
 *   number of bytes to shift reg left by (0-16)
 *
 * @return
 *   reg << (num * 8)
 */

static __rte_always_inline __m128i
xmm_shift_left(__m128i reg, const unsigned int num)
{
        const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);

        return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
}

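/**
 * Calculates a CRC over the data buffer using PCLMULQDQ folding
 *
 * Illustrative summary of the routine below: the buffer is folded 16 bytes
 * at a time with the rk1/rk2 constants, then reduced 128 -> 64 -> 32 bits
 * with rk5/rk6 and rk7/rk8.
 *
 * @param data
 *   Pointer to the data buffer
 * @param data_len
 *   Length of the buffer in bytes
 * @param crc
 *   Initial CRC value (the handlers below pass the all-ones seed)
 * @param params
 *   Precomputed constants for the selected CRC algorithm
 *
 * @return
 *   CRC register value; the handlers apply the final bit inversion
 */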
static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(
        const uint8_t *data,
        uint32_t data_len,
        uint32_t crc,
        const struct crc_pclmulqdq_ctx *params)
{
        __m128i temp, fold, k;
        uint32_t n;

        /* Get CRC init value */
        temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

        /**
         * Fold all data into a single 16 byte data block.
         * Assumes fold holds the first 16 bytes of data.
         */

        if (unlikely(data_len < 32)) {
                if (unlikely(data_len == 16)) {
                        /* 16 bytes */
                        fold = _mm_loadu_si128((const __m128i *)data);
                        fold = _mm_xor_si128(fold, temp);
                        goto reduction_128_64;
                }

                if (unlikely(data_len < 16)) {
                        /* 0 to 15 bytes */
                        uint8_t buffer[16] __rte_aligned(16);

                        memset(buffer, 0, sizeof(buffer));
                        memcpy(buffer, data, data_len);

                        fold = _mm_load_si128((const __m128i *)buffer);
                        fold = _mm_xor_si128(fold, temp);
                        if (unlikely(data_len < 4)) {
                                fold = xmm_shift_left(fold, 8 - data_len);
                                goto barret_reduction;
                        }
                        fold = xmm_shift_left(fold, 16 - data_len);
                        goto reduction_128_64;
                }
                /* 17 to 31 bytes */
                fold = _mm_loadu_si128((const __m128i *)data);
                fold = _mm_xor_si128(fold, temp);
                n = 16;
                k = params->rk1_rk2;
                goto partial_bytes;
        }

        /** At least 32 bytes in the buffer */
        /** Apply CRC initial value */
        fold = _mm_loadu_si128((const __m128i *)data);
        fold = _mm_xor_si128(fold, temp);

        /** Main folding loop - the last 16 bytes are processed separately */
        k = params->rk1_rk2;
        for (n = 16; (n + 16) <= data_len; n += 16) {
                temp = _mm_loadu_si128((const __m128i *)&data[n]);
                fold = crcr32_folding_round(temp, k, fold);
        }

partial_bytes:
        if (likely(n < data_len)) {

                const uint32_t mask3[4] __rte_aligned(16) = {
                        0x80808080, 0x80808080, 0x80808080, 0x80808080
                };

                const uint8_t shf_table[32] __rte_aligned(16) = {
                        0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
                        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
                };
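
                /*
                 * Note: entries 1-15 of shf_table have their top bit set, so
                 * _mm_shuffle_epi8() zeroes those lanes; XORing the control
                 * bytes with mask3 flips every top bit and selects the
                 * complementary lanes.  Combined with _mm_blendv_epi8(), this
                 * splits fold into 'a' and 'b' and merges 'b' with the last
                 * 16 input bytes so the final partial block is folded in a
                 * single round.
                 */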

                __m128i last16, a, b;

                last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);

                temp = _mm_loadu_si128((const __m128i *)
                        &shf_table[data_len & 15]);
                a = _mm_shuffle_epi8(fold, temp);

                temp = _mm_xor_si128(temp,
                        _mm_load_si128((const __m128i *)mask3));
                b = _mm_shuffle_epi8(fold, temp);
                b = _mm_blendv_epi8(b, last16, temp);

                /* k = rk1 & rk2 */
                temp = _mm_clmulepi64_si128(a, k, 0x01);
                fold = _mm_clmulepi64_si128(a, k, 0x10);

                fold = _mm_xor_si128(fold, temp);
                fold = _mm_xor_si128(fold, b);
        }

        /** Reduction 128 -> 32; assumes fold holds 128 bit folded data */
reduction_128_64:
        k = params->rk5_rk6;
        fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
        k = params->rk7_rk8;
        n = crcr32_reduce_64_to_32(fold, k);

        return n;
}

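/**
 * Initializes the PCLMULQDQ CRC computation contexts
 *
 * Note (illustrative): k1/k2 and k5/k6 are folding constants and q/p are the
 * Barrett reduction constants for the CRC-16 (CCITT) and CRC-32 (Ethernet)
 * algorithms; they are stored in the static contexts used by the handlers
 * below.
 */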
static inline void
rte_net_crc_sse42_init(void)
{
        uint64_t k1, k2, k5, k6;
        uint64_t p = 0, q = 0;

        /** Initialize CRC16 data */
        k1 = 0x189aeLLU;
        k2 = 0x8e10LLU;
        k5 = 0x189aeLLU;
        k6 = 0x114aaLLU;
        q =  0x11c581910LLU;
        p =  0x10811LLU;

        /** Save the params in context structure */
        crc16_ccitt_pclmulqdq.rk1_rk2 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
        crc16_ccitt_pclmulqdq.rk5_rk6 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
        crc16_ccitt_pclmulqdq.rk7_rk8 =
                _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

        /** Initialize CRC32 data */
        k1 = 0xccaa009eLLU;
        k2 = 0x1751997d0LLU;
        k5 = 0xccaa009eLLU;
        k6 = 0x163cd6124LLU;
        q =  0x1f7011640LLU;
        p =  0x1db710641LLU;

        /** Save the params in context structure */
        crc32_eth_pclmulqdq.rk1_rk2 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
        crc32_eth_pclmulqdq.rk5_rk6 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
        crc32_eth_pclmulqdq.rk7_rk8 =
                _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

        /**
         * Reset the MMX register state, as the following calculations may
         * use other data types such as float, double, etc.
         */
        _mm_empty();
}

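/**
 * Computes the CRC-16 (CCITT) of the data buffer using the PCLMULQDQ path
 *
 * @param data
 *   Pointer to the data buffer
 * @param data_len
 *   Length of the buffer in bytes
 *
 * @return
 *   16-bit CRC value, widened to 32 bits
 */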
static inline uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t *data,
        uint32_t data_len)
{
        /** return 16-bit CRC value */
        return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
                data_len,
                0xffff,
                &crc16_ccitt_pclmulqdq);
}

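/**
 * Computes the CRC-32 (Ethernet) of the data buffer using the PCLMULQDQ path
 *
 * @param data
 *   Pointer to the data buffer
 * @param data_len
 *   Length of the buffer in bytes
 *
 * @return
 *   32-bit CRC value
 */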
static inline uint32_t
rte_crc32_eth_sse42_handler(const uint8_t *data,
        uint32_t data_len)
{
        return ~crc32_eth_calc_pclmulqdq(data,
                data_len,
                0xffffffffUL,
                &crc32_eth_pclmulqdq);
}
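
/*
 * Minimal usage sketch (illustrative only): rte_net_crc_sse42_init() must be
 * called once before either handler, e.g.:
 *
 *     uint8_t buf[64] = { 0 };
 *
 *     rte_net_crc_sse42_init();
 *     uint32_t eth_crc = rte_crc32_eth_sse42_handler(buf, sizeof(buf));
 *     uint16_t ccitt_crc =
 *             (uint16_t)rte_crc16_ccitt_sse42_handler(buf, sizeof(buf));
 */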

#ifdef __cplusplus
}
#endif

#endif /* _RTE_NET_CRC_SSE_H_ */