/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017 Intel Corporation
 */

#ifndef _RTE_NET_CRC_SSE_H_
#define _RTE_NET_CRC_SSE_H_

#include <string.h>

#include <rte_branch_prediction.h>

#include <x86intrin.h>
#include <cpuid.h>

#ifdef __cplusplus
extern "C" {
#endif

/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
        __m128i rk1_rk2;
        __m128i rk5_rk6;
        __m128i rk7_rk8;
};

static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);
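
/*
 * Both contexts are populated once by rte_net_crc_sse42_init() below with
 * the folding and reduction constants for the CRC-32 (Ethernet) and
 * CRC-16 (CCITT) algorithms.
 */
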
/**
 * @brief Performs one folding round
 *
 * Logically, the function operates as follows:
 *     DATA = READ_NEXT_16BYTES();
 *     F1 = LSB8(FOLD)
 *     F2 = MSB8(FOLD)
 *     T1 = CLMUL(F1, RK1)
 *     T2 = CLMUL(F2, RK2)
 *     FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block
 *   16 byte data block
 * @param precomp
 *   Precomputed rk1 and rk2 constants
 * @param fold
 *   Current 16 byte folded data
 *
 * @return
 *   New 16 byte folded data
 */
static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block,
                __m128i precomp,
                __m128i fold)
{
        __m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
        __m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);

        return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
}

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128
 *   128 bits data to be reduced
 * @param precomp
 *   precomputed constants rk5, rk6
 *
 * @return
 *   64 bits reduced data
 */

static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
{
        __m128i tmp0, tmp1, tmp2;

        /* 64b fold */
        tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
        tmp1 = _mm_srli_si128(data128, 8);
        tmp0 = _mm_xor_si128(tmp0, tmp1);

        /* 32b fold */
        tmp2 = _mm_slli_si128(tmp0, 4);
        tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);

        return _mm_xor_si128(tmp1, tmp0);
}

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64
 *   64 bits data to be reduced
 * @param precomp
 *   rk7 precomputed constant
 *
 * @return
 *   reduced 32 bits data
 */

static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
        static const uint32_t mask1[4] __rte_aligned(16) = {
                0xffffffff, 0xffffffff, 0x00000000, 0x00000000
        };

        static const uint32_t mask2[4] __rte_aligned(16) = {
                0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
        };
        __m128i tmp0, tmp1, tmp2;

        tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));

        tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
        tmp1 = _mm_xor_si128(tmp1, tmp0);
        tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));

        tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
        tmp2 = _mm_xor_si128(tmp2, tmp1);
        tmp2 = _mm_xor_si128(tmp2, tmp0);

        return _mm_extract_epi32(tmp2, 2);
}

static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
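
/*
 * Note: _mm_shuffle_epi8() zeroes an output byte whenever the matching
 * control byte has its most significant bit set.  Loading the 16 control
 * bytes starting at &crc_xmm_shift_tab[16 - num] therefore gives num leading
 * 0xff entries followed by 0x00, 0x01, ..., so xmm_shift_left() below shifts
 * the register left by num bytes and fills the vacated low bytes with zeroes.
 */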

/**
 * Shifts the 128 bit register left by the specified number of bytes
 *
 * @param reg
 *   128 bit value
 * @param num
 *   number of bytes to shift reg left by (0-16)
 *
 * @return
 *   reg << (num * 8)
 */

static __rte_always_inline __m128i
xmm_shift_left(__m128i reg, const unsigned int num)
{
        const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);

        return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
}

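/**
 * Calculates a CRC over the data buffer using PCLMULQDQ folding
 *
 * Illustrative summary of the routine below: the buffer is folded 16 bytes
 * at a time with the rk1/rk2 constants, then reduced 128 -> 64 -> 32 bits
 * with rk5/rk6 and rk7/rk8.
 *
 * @param data
 *   Pointer to the data buffer
 * @param data_len
 *   Length of the buffer in bytes
 * @param crc
 *   Initial CRC value (the handlers below pass the all-ones seed)
 * @param params
 *   Precomputed constants for the selected CRC algorithm
 *
 * @return
 *   CRC register value; the handlers apply the final bit inversion
 */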
static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(
        const uint8_t *data,
        uint32_t data_len,
        uint32_t crc,
        const struct crc_pclmulqdq_ctx *params)
{
        __m128i temp, fold, k;
        uint32_t n;

        /* Get CRC init value */
        temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

        /**
         * Fold all data into a single 16 byte data block.
         * Assumes fold holds the first 16 bytes of data.
         */

        if (unlikely(data_len < 32)) {
                if (unlikely(data_len == 16)) {
                        /* 16 bytes */
                        fold = _mm_loadu_si128((const __m128i *)data);
                        fold = _mm_xor_si128(fold, temp);
                        goto reduction_128_64;
                }

                if (unlikely(data_len < 16)) {
                        /* 0 to 15 bytes */
                        uint8_t buffer[16] __rte_aligned(16);

                        memset(buffer, 0, sizeof(buffer));
                        memcpy(buffer, data, data_len);

                        fold = _mm_load_si128((const __m128i *)buffer);
                        fold = _mm_xor_si128(fold, temp);
                        if (unlikely(data_len < 4)) {
                                fold = xmm_shift_left(fold, 8 - data_len);
                                goto barret_reduction;
                        }
                        fold = xmm_shift_left(fold, 16 - data_len);
                        goto reduction_128_64;
                }
                /* 17 to 31 bytes */
                fold = _mm_loadu_si128((const __m128i *)data);
                fold = _mm_xor_si128(fold, temp);
                n = 16;
                k = params->rk1_rk2;
                goto partial_bytes;
        }

        /** At least 32 bytes in the buffer */
        /** Apply CRC initial value */
        fold = _mm_loadu_si128((const __m128i *)data);
        fold = _mm_xor_si128(fold, temp);

        /** Main folding loop - the last 16 bytes are processed separately */
        k = params->rk1_rk2;
        for (n = 16; (n + 16) <= data_len; n += 16) {
                temp = _mm_loadu_si128((const __m128i *)&data[n]);
                fold = crcr32_folding_round(temp, k, fold);
        }

partial_bytes:
        if (likely(n < data_len)) {

                const uint32_t mask3[4] __rte_aligned(16) = {
                        0x80808080, 0x80808080, 0x80808080, 0x80808080
                };

                const uint8_t shf_table[32] __rte_aligned(16) = {
                        0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
                        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
                };
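
                /*
                 * Note: entries 1-15 of shf_table have their top bit set, so
                 * _mm_shuffle_epi8() zeroes those lanes; XORing the control
                 * bytes with mask3 flips every top bit and selects the
                 * complementary lanes.  Combined with _mm_blendv_epi8(), this
                 * splits fold into 'a' and 'b' and merges 'b' with the last
                 * 16 input bytes so the final partial block is folded in a
                 * single round.
                 */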

                __m128i last16, a, b;

                last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);

                temp = _mm_loadu_si128((const __m128i *)
                        &shf_table[data_len & 15]);
                a = _mm_shuffle_epi8(fold, temp);

                temp = _mm_xor_si128(temp,
                        _mm_load_si128((const __m128i *)mask3));
                b = _mm_shuffle_epi8(fold, temp);
                b = _mm_blendv_epi8(b, last16, temp);

                /* k = rk1 & rk2 */
                temp = _mm_clmulepi64_si128(a, k, 0x01);
                fold = _mm_clmulepi64_si128(a, k, 0x10);

                fold = _mm_xor_si128(fold, temp);
                fold = _mm_xor_si128(fold, b);
        }

        /** Reduction 128 -> 32; assumes fold holds 128 bit folded data */
reduction_128_64:
        k = params->rk5_rk6;
        fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
        k = params->rk7_rk8;
        n = crcr32_reduce_64_to_32(fold, k);

        return n;
}

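/**
 * Initializes the PCLMULQDQ CRC computation contexts
 *
 * Note (illustrative): k1/k2 and k5/k6 are folding constants and q/p are the
 * Barrett reduction constants for the CRC-16 (CCITT) and CRC-32 (Ethernet)
 * algorithms; they are stored in the static contexts used by the handlers
 * below.
 */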
static inline void
rte_net_crc_sse42_init(void)
{
        uint64_t k1, k2, k5, k6;
        uint64_t p = 0, q = 0;

        /** Initialize CRC16 data */
        k1 = 0x189aeLLU;
        k2 = 0x8e10LLU;
        k5 = 0x189aeLLU;
        k6 = 0x114aaLLU;
        q =  0x11c581910LLU;
        p =  0x10811LLU;

        /** Save the params in context structure */
        crc16_ccitt_pclmulqdq.rk1_rk2 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
        crc16_ccitt_pclmulqdq.rk5_rk6 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
        crc16_ccitt_pclmulqdq.rk7_rk8 =
                _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

        /** Initialize CRC32 data */
        k1 = 0xccaa009eLLU;
        k2 = 0x1751997d0LLU;
        k5 = 0xccaa009eLLU;
        k6 = 0x163cd6124LLU;
        q =  0x1f7011640LLU;
        p =  0x1db710641LLU;

        /** Save the params in context structure */
        crc32_eth_pclmulqdq.rk1_rk2 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
        crc32_eth_pclmulqdq.rk5_rk6 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
        crc32_eth_pclmulqdq.rk7_rk8 =
                _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

        /**
         * Reset the MMX register state, as the following calculations may
         * use other data types such as float, double, etc.
         */
        _mm_empty();
}

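/**
 * Computes the CRC-16 (CCITT) of the data buffer using the PCLMULQDQ path
 *
 * @param data
 *   Pointer to the data buffer
 * @param data_len
 *   Length of the buffer in bytes
 *
 * @return
 *   16-bit CRC value, widened to 32 bits
 */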
static inline uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t *data,
        uint32_t data_len)
{
        /** return 16-bit CRC value */
        return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
                data_len,
                0xffff,
                &crc16_ccitt_pclmulqdq);
}

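/**
 * Computes the CRC-32 (Ethernet) of the data buffer using the PCLMULQDQ path
 *
 * @param data
 *   Pointer to the data buffer
 * @param data_len
 *   Length of the buffer in bytes
 *
 * @return
 *   32-bit CRC value
 */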
static inline uint32_t
rte_crc32_eth_sse42_handler(const uint8_t *data,
        uint32_t data_len)
{
        return ~crc32_eth_calc_pclmulqdq(data,
                data_len,
                0xffffffffUL,
                &crc32_eth_pclmulqdq);
}
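
/*
 * Minimal usage sketch (illustrative only): rte_net_crc_sse42_init() must be
 * called once before either handler, e.g.:
 *
 *     uint8_t buf[64] = { 0 };
 *
 *     rte_net_crc_sse42_init();
 *     uint32_t eth_crc = rte_crc32_eth_sse42_handler(buf, sizeof(buf));
 *     uint16_t ccitt_crc =
 *             (uint16_t)rte_crc16_ccitt_sse42_handler(buf, sizeof(buf));
 */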

#ifdef __cplusplus
}
#endif

#endif /* _RTE_NET_CRC_SSE_H_ */