lib/librte_net/net_crc_neon.h

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright(c) 2017 Cavium, Inc
   3  */
   4
   5 #ifndef _NET_CRC_NEON_H_
   6 #define _NET_CRC_NEON_H_
   7
   8 #include <rte_branch_prediction.h>
   9 #include <rte_net_crc.h>
  10 #include <rte_vect.h>
  11 #include <rte_cpuflags.h>
  12
  13 #ifdef __cplusplus
  14 extern "C" {
  15 #endif
  16
  17 /** PMULL CRC computation context structure */
  18 struct crc_pmull_ctx {
  19         uint64x2_t rk1_rk2;
  20         uint64x2_t rk5_rk6;
  21         uint64x2_t rk7_rk8;
  22 };
  23
  24 struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
  25 struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
  26
  27 /**
  28  * @brief Performs one folding round
  29  *
  30  * Logically function operates as follows:
  31  *     DATA = READ_NEXT_16BYTES();
  32  *     F1 = LSB8(FOLD)
  33  *     F2 = MSB8(FOLD)
  34  *     T1 = CLMUL(F1, RK1)
  35  *     T2 = CLMUL(F2, RK2)
  36  *     FOLD = XOR(T1, T2, DATA)
  37  *
  38  * @param data_block 16 byte data block
  39  * @param precomp precomputed rk1 constant
  40  * @param fold running 16 byte folded data
  41  *
  42  * @return New 16 byte folded data
  43  */
  44 static inline uint64x2_t
  45 crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
  46         uint64x2_t fold)
  47 {
  48         uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
  49                         vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
  50                         vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
  51
  52         uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
  53                         vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
  54                         vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
  55
  56         return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
  57 }
  58
  59 /**
  60  * Performs reduction from 128 bits to 64 bits
  61  *
  62  * @param data128 128 bits data to be reduced
  63  * @param precomp rk5 and rk6 precomputed constants
  64  *
  65  * @return data reduced to 64 bits
  66  */
  67 static inline uint64x2_t
  68 crcr32_reduce_128_to_64(uint64x2_t data128,
  69         uint64x2_t precomp)
  70 {
  71         uint64x2_t tmp0, tmp1, tmp2;
  72
  73         /* 64b fold */
  74         tmp0 = vreinterpretq_u64_p128(vmull_p64(
  75                 vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
  76                 vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
  77         tmp1 = vshift_bytes_right(data128, 8);
  78         tmp0 = veorq_u64(tmp0, tmp1);
  79
  80         /* 32b fold */
  81         tmp2 = vshift_bytes_left(tmp0, 4);
  82         tmp1 = vreinterpretq_u64_p128(vmull_p64(
  83                 vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
  84                 vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
  85
  86         return veorq_u64(tmp1, tmp0);
  87 }
  88
  89 /**
  90  * Performs Barret's reduction from 64 bits to 32 bits
  91  *
  92  * @param data64 64 bits data to be reduced
  93  * @param precomp rk7 precomputed constant
  94  *
  95  * @return data reduced to 32 bits
  96  */
  97 static inline uint32_t
  98 crcr32_reduce_64_to_32(uint64x2_t data64,
  99         uint64x2_t precomp)
 100 {
 101         static uint32_t mask1[4] __rte_aligned(16) = {
 102                 0xffffffff, 0xffffffff, 0x00000000, 0x00000000
 103         };
 104         static uint32_t mask2[4] __rte_aligned(16) = {
 105                 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
 106         };
 107         uint64x2_t tmp0, tmp1, tmp2;
 108
 109         tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
 110
 111         tmp1 = vreinterpretq_u64_p128(vmull_p64(
 112                 vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
 113                 vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
 114         tmp1 = veorq_u64(tmp1, tmp0);
 115         tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
 116
 117         tmp2 = vreinterpretq_u64_p128(vmull_p64(
 118                 vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
 119                 vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
 120         tmp2 = veorq_u64(tmp2, tmp1);
 121         tmp2 = veorq_u64(tmp2, tmp0);
 122
 123         return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
 124 }
 125
 126 static inline uint32_t
 127 crc32_eth_calc_pmull(
 128         const uint8_t *data,
 129         uint32_t data_len,
 130         uint32_t crc,
 131         const struct crc_pmull_ctx *params)
 132 {
 133         uint64x2_t temp, fold, k;
 134         uint32_t n;
 135
 136         /* Get CRC init value */
 137         temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
 138
 139         /**
 140          * Folding all data into single 16 byte data block
 141          * Assumes: fold holds first 16 bytes of data
 142          */
 143         if (unlikely(data_len < 32)) {
 144                 if (unlikely(data_len == 16)) {
 145                         /* 16 bytes */
 146                         fold = vld1q_u64((const uint64_t *)data);
 147                         fold = veorq_u64(fold, temp);
 148                         goto reduction_128_64;
 149                 }
 150
 151                 if (unlikely(data_len < 16)) {
 152                         /* 0 to 15 bytes */
 153                         uint8_t buffer[16] __rte_aligned(16);
 154
 155                         memset(buffer, 0, sizeof(buffer));
 156                         memcpy(buffer, data, data_len);
 157
 158                         fold = vld1q_u64((uint64_t *)buffer);
 159                         fold = veorq_u64(fold, temp);
 160                         if (unlikely(data_len < 4)) {
 161                                 fold = vshift_bytes_left(fold, 8 - data_len);
 162                                 goto barret_reduction;
 163                         }
 164                         fold = vshift_bytes_left(fold, 16 - data_len);
 165                         goto reduction_128_64;
 166                 }
 167                 /* 17 to 31 bytes */
 168                 fold = vld1q_u64((const uint64_t *)data);
 169                 fold = veorq_u64(fold, temp);
 170                 n = 16;
 171                 k = params->rk1_rk2;
 172                 goto partial_bytes;
 173         }
 174
 175         /** At least 32 bytes in the buffer */
 176         /** Apply CRC initial value */
 177         fold = vld1q_u64((const uint64_t *)data);
 178         fold = veorq_u64(fold, temp);
 179
 180         /** Main folding loop - the last 16 bytes is processed separately */
 181         k = params->rk1_rk2;
 182         for (n = 16; (n + 16) <= data_len; n += 16) {
 183                 temp = vld1q_u64((const uint64_t *)&data[n]);
 184                 fold = crcr32_folding_round(temp, k, fold);
 185         }
 186
 187 partial_bytes:
 188         if (likely(n < data_len)) {
 189                 uint64x2_t last16, a, b, mask;
 190                 uint32_t rem = data_len & 15;
 191
 192                 last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
 193                 a = vshift_bytes_left(fold, 16 - rem);
 194                 b = vshift_bytes_right(fold, rem);
 195                 mask = vshift_bytes_left(vdupq_n_u64(-1), 16 - rem);
 196                 b = vorrq_u64(b, vandq_u64(mask, last16));
 197
 198                 /* k = rk1 & rk2 */
 199                 temp = vreinterpretq_u64_p128(vmull_p64(
 200                                 vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
 201                                 vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
 202                 fold = vreinterpretq_u64_p128(vmull_p64(
 203                                 vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
 204                                 vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
 205                 fold = veorq_u64(fold, temp);
 206                 fold = veorq_u64(fold, b);
 207         }
 208
 209         /** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
 210 reduction_128_64:
 211         k = params->rk5_rk6;
 212         fold = crcr32_reduce_128_to_64(fold, k);
 213
 214 barret_reduction:
 215         k = params->rk7_rk8;
 216         n = crcr32_reduce_64_to_32(fold, k);
 217
 218         return n;
 219 }
 220
 221 static inline void
 222 rte_net_crc_neon_init(void)
 223 {
 224         /* Initialize CRC16 data */
 225         uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
 226         uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
 227         uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
 228
 229         /* Initialize CRC32 data */
 230         uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
 231         uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
 232         uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
 233
 234         /** Save the params in context structure */
 235         crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
 236         crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
 237         crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
 238
 239         /** Save the params in context structure */
 240         crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
 241         crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
 242         crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
 243 }
 244
 245 static inline uint32_t
 246 rte_crc16_ccitt_neon_handler(const uint8_t *data,
 247         uint32_t data_len)
 248 {
 249         return (uint16_t)~crc32_eth_calc_pmull(data,
 250                 data_len,
 251                 0xffff,
 252                 &crc16_ccitt_pmull);
 253 }
 254
 255 static inline uint32_t
 256 rte_crc32_eth_neon_handler(const uint8_t *data,
 257         uint32_t data_len)
 258 {
 259         return ~crc32_eth_calc_pmull(data,
 260                 data_len,
 261                 0xffffffffUL,
 262                 &crc32_eth_pmull);
 263 }
 264
 265 #ifdef __cplusplus
 266 }
 267 #endif
 268
 269 #endif /* _NET_CRC_NEON_H_ */