/* SPDX-License-Identifier: Apache-2.0
 * Copyright(c) 2021 Cisco Systems, Inc.
 */

#ifndef included_vector_ip_csum_h
#define included_vector_ip_csum_h
#include <vppinfra/clib.h>
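
/* Running checksum state, carried across chunked calls: 'sum' accumulates
 * 32-bit partial sums into a 64-bit counter so carries are folded only once
 * at the end, and 'odd' records that the previous chunk ended on an odd
 * byte, so the first byte of the next chunk must be added byte-swapped
 * (into the high half of a 16-bit word). */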
typedef struct
{
  u64 sum;
  u8 odd;
} clib_ip_csum_t;

#if defined(CLIB_HAVE_VEC128)
static_always_inline u64x2
clib_ip_csum_cvt_and_add_4 (u32x4 v)
{
  return ((u64x2) u32x4_interleave_lo ((u32x4) v, u32x4_zero ()) +
          (u64x2) u32x4_interleave_hi ((u32x4) v, u32x4_zero ()));
}

static_always_inline u64
clib_ip_csum_hadd_2 (u64x2 v)
{
  return v[0] + v[1];
}
#endif
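
/* Interleaving each 32-bit lane with zeroes zero-extends it to 64 bits,
 * so lanes can be summed many times before any carry can be lost; e.g.
 * {a, b, c, d} becomes {a, 0, b, 0} and {c, 0, d, 0}, which read as u64x2
 * vectors (little-endian) add up to {a + c, b + d} in 64-bit lanes. The
 * wider variants below apply the same trick to 256- and 512-bit vectors. */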

#if defined(CLIB_HAVE_VEC256)
static_always_inline u64x4
clib_ip_csum_cvt_and_add_8 (u32x8 v)
{
  return ((u64x4) u32x8_interleave_lo ((u32x8) v, u32x8_zero ()) +
          (u64x4) u32x8_interleave_hi ((u32x8) v, u32x8_zero ()));
}

static_always_inline u64
clib_ip_csum_hadd_4 (u64x4 v)
{
  return clib_ip_csum_hadd_2 (u64x4_extract_lo (v) + u64x4_extract_hi (v));
}
#endif

#if defined(CLIB_HAVE_VEC512)
static_always_inline u64x8
clib_ip_csum_cvt_and_add_16 (u32x16 v)
{
  return ((u64x8) u32x16_interleave_lo ((u32x16) v, u32x16_zero ()) +
          (u64x8) u32x16_interleave_hi ((u32x16) v, u32x16_zero ()));
}

static_always_inline u64
clib_ip_csum_hadd_8 (u64x8 v)
{
  return clib_ip_csum_hadd_4 (u64x8_extract_lo (v) + u64x8_extract_hi (v));
}
#endif
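
/* One pass over 'count' bytes at 'src', accumulating 32-bit words into
 * c->sum and optionally copying them to 'dst'. Strategy: consume the widest
 * available vectors first (an 8x unrolled main loop, then single vectors,
 * then a masked or scalar tail) and record odd-byte parity in c->odd so
 * callers can feed arbitrarily sized, non-contiguous chunks. */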

static_always_inline void
clib_ip_csum_inline (clib_ip_csum_t *c, u8 *dst, u8 *src, u16 count,
                     int is_copy)
{
  if (c->odd)
    {
      c->odd = 0;
      c->sum += (u16) src[0] << 8;
      count--;
      src++;
      if (is_copy)
        dst++[0] = src[-1];
    }

#if defined(CLIB_HAVE_VEC512)
  u64x8 sum8 = {};

  /* 8x unrolled: 512 bytes per iteration */
  while (count >= 512)
    {
      u32x16u *s = (u32x16u *) src;
      sum8 += clib_ip_csum_cvt_and_add_16 (s[0]);
      sum8 += clib_ip_csum_cvt_and_add_16 (s[1]);
      sum8 += clib_ip_csum_cvt_and_add_16 (s[2]);
      sum8 += clib_ip_csum_cvt_and_add_16 (s[3]);
      sum8 += clib_ip_csum_cvt_and_add_16 (s[4]);
      sum8 += clib_ip_csum_cvt_and_add_16 (s[5]);
      sum8 += clib_ip_csum_cvt_and_add_16 (s[6]);
      sum8 += clib_ip_csum_cvt_and_add_16 (s[7]);
      count -= 512;
      src += 512;
      if (is_copy)
        {
          u32x16u *d = (u32x16u *) dst;
          for (int i = 0; i < 8; i++)
            d[i] = s[i];
          dst += 512;
        }
    }

  while (count >= 64)
    {
      u32x16u *s = (u32x16u *) src;
      sum8 += clib_ip_csum_cvt_and_add_16 (s[0]);
      count -= 64;
      src += 64;
      if (is_copy)
        {
          u32x16u *d = (u32x16u *) dst;
          d[0] = s[0];
          dst += 64;
        }
    }

#ifdef CLIB_HAVE_VEC512_MASK_LOAD_STORE
  if (count)
    {
      u64 mask = pow2_mask (count);
      u32x16 v = (u32x16) u8x64_mask_load_zero (src, mask);
      sum8 += clib_ip_csum_cvt_and_add_16 (v);
      c->odd = count & 1;
      if (is_copy)
        u32x16_mask_store (v, dst, mask);
    }
  c->sum += clib_ip_csum_hadd_8 (sum8);
  return;
#else
  c->sum += clib_ip_csum_hadd_8 (sum8);
#endif

#elif defined(CLIB_HAVE_VEC256)
  u64x4 sum4 = {};

  /* 8x unrolled: 256 bytes per iteration */
  while (count >= 256)
    {
      u32x8u *s = (u32x8u *) src;
      sum4 += clib_ip_csum_cvt_and_add_8 (s[0]);
      sum4 += clib_ip_csum_cvt_and_add_8 (s[1]);
      sum4 += clib_ip_csum_cvt_and_add_8 (s[2]);
      sum4 += clib_ip_csum_cvt_and_add_8 (s[3]);
      sum4 += clib_ip_csum_cvt_and_add_8 (s[4]);
      sum4 += clib_ip_csum_cvt_and_add_8 (s[5]);
      sum4 += clib_ip_csum_cvt_and_add_8 (s[6]);
      sum4 += clib_ip_csum_cvt_and_add_8 (s[7]);
      count -= 256;
      src += 256;
      if (is_copy)
        {
          u32x8u *d = (u32x8u *) dst;
          for (int i = 0; i < 8; i++)
            d[i] = s[i];
          dst += 256;
        }
    }

  while (count >= 32)
    {
      u32x8u *s = (u32x8u *) src;
      sum4 += clib_ip_csum_cvt_and_add_8 (s[0]);
      count -= 32;
      src += 32;
      if (is_copy)
        {
          u32x8u *d = (u32x8u *) dst;
          d[0] = s[0];
          dst += 32;
        }
    }

#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
  if (count)
    {
      u32 mask = pow2_mask (count);
      u32x8 v = (u32x8) u8x32_mask_load_zero (src, mask);
      sum4 += clib_ip_csum_cvt_and_add_8 (v);
      c->odd = count & 1;
      if (is_copy)
        u32x8_mask_store (v, dst, mask);
    }
  c->sum += clib_ip_csum_hadd_4 (sum4);
  return;
#else
  c->sum += clib_ip_csum_hadd_4 (sum4);
#endif

#elif defined(CLIB_HAVE_VEC128)
  u64x2 sum2 = {};

  /* 8x unrolled: 128 bytes per iteration */
  while (count >= 128)
    {
      u32x4u *s = (u32x4u *) src;
      sum2 += clib_ip_csum_cvt_and_add_4 (s[0]);
      sum2 += clib_ip_csum_cvt_and_add_4 (s[1]);
      sum2 += clib_ip_csum_cvt_and_add_4 (s[2]);
      sum2 += clib_ip_csum_cvt_and_add_4 (s[3]);
      sum2 += clib_ip_csum_cvt_and_add_4 (s[4]);
      sum2 += clib_ip_csum_cvt_and_add_4 (s[5]);
      sum2 += clib_ip_csum_cvt_and_add_4 (s[6]);
      sum2 += clib_ip_csum_cvt_and_add_4 (s[7]);
      count -= 128;
      src += 128;
      if (is_copy)
        {
          u32x4u *d = (u32x4u *) dst;
          for (int i = 0; i < 8; i++)
            d[i] = s[i];
          dst += 128;
        }
    }

  while (count >= 16)
    {
      u32x4u *s = (u32x4u *) src;
      sum2 += clib_ip_csum_cvt_and_add_4 (s[0]);
      count -= 16;
      src += 16;
      if (is_copy)
        {
          u32x4u *d = (u32x4u *) dst;
          d[0] = s[0];
          dst += 16;
        }
    }
  c->sum += clib_ip_csum_hadd_2 (sum2);
#endif

  /* scalar tail: 4, 2, then 1 byte at a time */
  while (count >= 4)
    {
      u32 v = *((u32 *) src);
      c->sum += v;
      count -= 4;
      src += 4;
      if (is_copy)
        {
          *(u32 *) dst = v;
          dst += 4;
        }
    }

  while (count >= 2)
    {
      u16 v = *((u16 *) src);
      c->sum += v;
      count -= 2;
      src += 2;
      if (is_copy)
        {
          *(u16 *) dst = v;
          dst += 2;
        }
    }

  if (count)
    {
      c->odd = 1;
      c->sum += (u16) src[0];
      if (is_copy)
        dst[0] = src[0];
    }
}

static_always_inline u16
clib_ip_csum_fold (clib_ip_csum_t *c)
{
  u64 sum = c->sum;
#if defined(__x86_64__) && defined(__BMI2__)
  u64 tmp = sum;
  asm volatile(
    /* using ADC is much faster than the mov, shift, add sequence
     * the compiler produces */
    "shr $32, %[sum]			\n\t"
    "add %k[tmp], %k[sum]		\n\t"
    "mov $16, %k[tmp]			\n\t"
    "shrx %k[tmp], %k[sum], %k[tmp]	\n\t"
    "adc %w[tmp], %w[sum]		\n\t"
    "adc $0, %w[sum]			\n\t"
    : [sum] "+&r" (sum), [tmp] "+&r" (tmp));
#else
  sum = ((u32) sum) + (sum >> 32);
  sum = ((u16) sum) + (sum >> 16);
  sum = ((u16) sum) + (sum >> 16);
#endif
  return (~((u16) sum));
}
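
/* Fold example (a sketch; values chosen for illustration): for
 * sum = 0x00000002_0001ffff the portable path computes
 *   0x0001ffff + 0x2 = 0x00020001   (fold high 32 bits into low 32)
 *   0x0001 + 0x0002  = 0x0003       (fold high 16 bits into low 16)
 *   0x0003 + 0x0     = 0x0003       (second fold absorbs any carry)
 * and returns ~0x0003 = 0xfffc, the ones-complement checksum. */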

static_always_inline void
clib_ip_csum_chunk (clib_ip_csum_t *c, u8 *src, u16 count)
{
  return clib_ip_csum_inline (c, 0, src, count, 0);
}

static_always_inline void
clib_ip_csum_and_copy_chunk (clib_ip_csum_t *c, u8 *src, u8 *dst, u16 count)
{
  return clib_ip_csum_inline (c, dst, src, count, 1);
}
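
/* Usage sketch (buffer names are illustrative, not part of this API):
 * chunks may be odd-sized and non-contiguous, since c.odd carries byte
 * parity from one chunk to the next:
 *
 *   clib_ip_csum_t c = {};
 *   clib_ip_csum_chunk (&c, part0, len0);
 *   clib_ip_csum_chunk (&c, part1, len1);
 *   u16 csum = clib_ip_csum_fold (&c);
 */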

static_always_inline u16
clib_ip_csum (u8 *src, u16 count)
{
  clib_ip_csum_t c = {};
  if (COMPILE_TIME_CONST (count) && count == 12)
    {
      for (int i = 0; i < 3; i++)
        c.sum += ((u32 *) src)[i];
    }
  else if (COMPILE_TIME_CONST (count) && count == 20)
    {
      for (int i = 0; i < 5; i++)
        c.sum += ((u32 *) src)[i];
    }
  else if (COMPILE_TIME_CONST (count) && count == 40)
    {
      for (int i = 0; i < 10; i++)
        c.sum += ((u32 *) src)[i];
    }
  else
    clib_ip_csum_inline (&c, 0, src, count, 0);
  return clib_ip_csum_fold (&c);
}
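
/* The constant-size fast paths above appear to match common header sizes:
 * 12 bytes (IPv4 pseudo-header), 20 bytes (IPv4 header without options)
 * and 40 bytes (IPv6 header), letting the compiler emit a short sequence
 * of adds with no loop or branches. */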

static_always_inline u16
clib_ip_csum_and_copy (u8 *dst, u8 *src, u16 count)
{
  clib_ip_csum_t c = {};
  clib_ip_csum_inline (&c, dst, src, count, 1);
  return clib_ip_csum_fold (&c);
}

#endif