1 /* SPDX-License-Identifier: Apache-2.0
2 * Copyright(c) 2021 Cisco Systems, Inc.
5 #ifndef included_vector_toeplitz_h
6 #define included_vector_toeplitz_h
7 #include <vppinfra/clib.h>
14 } clib_toeplitz_hash_key_t;
/* Build a Toeplitz hash key object from `keylen` raw key bytes at `key`.
 * NOTE(review): the implementation lives in the corresponding .c file and is
 * not visible here — presumably it allocates the object and precomputes the
 * per-ISA lookup data referenced via `gfni_offset` below; confirm allocation
 * and alignment there.  The caller owns the result and must release it with
 * clib_toeplitz_hash_key_free(). */
16 clib_toeplitz_hash_key_t *clib_toeplitz_hash_key_init (u8 *key, u32 keylen);
/* Release a key object previously returned by clib_toeplitz_hash_key_init. */
17 void clib_toeplitz_hash_key_free (clib_toeplitz_hash_key_t *k);
19 #ifdef CLIB_HAVE_VEC256
/* Fold one message byte into 8 Toeplitz hash lanes computed in parallel.
 * `v4` holds the current 64-bit (big-endian-loaded) key window splatted into
 * 4 lanes and pre-shifted left by { 0, 1, 2, 3 } bits (see the callers);
 * `off` (0..3) is the byte's offset within the 4-byte step, so
 * `v4 << (off * 8)` bit-aligns the window for this byte.  The shuffle
 * extracts the top 32 bits of each of the 8 shifted windows — the 8
 * candidate hash contributions selected by the 8 bits of `data`. */
20 static_always_inline u32x8
21 toeplitz_hash_one_x8 (u32x8 hash, u64x4 v4, u8 data, u8 off)
23 u32x8 v8 = u32x8_shuffle2 (v4 << (off * 8), v4 << (off * 8 + 4),
24 /* upper 32 bits of each u64, in reverse order */
25 15, 13, 11, 9, 7, 5, 3, 1);
27 #ifdef CLIB_HAVE_VEC256_MASK_BITWISE_OPS
/* masked xor: bit i of `data` decides whether lane i gets xored with v8 */
28 return u32x8_mask_xor (hash, v8, data);
/* fallback: expand the bits of `data` into full-lane masks by hand */
30 static const u32x8 bits = { 1, 2, 4, 8, 16, 32, 64, 128 };
31 return hash ^ (((u32x8_splat (data) & bits) != u32x8_zero ()) & v8);
36 #if defined(__GFNI__) && defined(__AVX512F__)
/* Byte-permutation control for u8x64_permute/u8x64_permute2 (VPERMB /
 * VPERMI2B) used by the GFNI helpers below: 8 groups of 8 indices, where
 * group i selects bytes [i, i+3] of the first 64-byte operand (indices
 * 0x00..) followed by bytes [i, i+3] of the second operand (indices 0x40..
 * address the second source).  This lays out the sliding 4-byte data
 * windows that each GF(2) affine step consumes. */
37 static const u8x64 __clib_toeplitz_hash_gfni_permute = {
38 /* clang-format off */
39 0x00, 0x01, 0x02, 0x03, 0x40, 0x41, 0x42, 0x43,
40 0x01, 0x02, 0x03, 0x04, 0x41, 0x42, 0x43, 0x44,
41 0x02, 0x03, 0x04, 0x05, 0x42, 0x43, 0x44, 0x45,
42 0x03, 0x04, 0x05, 0x06, 0x43, 0x44, 0x45, 0x46,
43 0x04, 0x05, 0x06, 0x07, 0x44, 0x45, 0x46, 0x47,
44 0x05, 0x06, 0x07, 0x08, 0x45, 0x46, 0x47, 0x48,
45 0x06, 0x07, 0x08, 0x09, 0x46, 0x47, 0x48, 0x49,
46 0x07, 0x08, 0x09, 0x0a, 0x47, 0x48, 0x49, 0x4a
/* One GFNI Toeplitz step over a single 64-byte data vector.
 * `i` is always a compile-time constant at the call sites, so the ternary
 * chain folds to at most one VALIGNQ: it rotates `d0` right by `i` 64-bit
 * words to line the data up with key-matrix set `m[i]`.  The permute then
 * gathers the sliding 4-byte windows (see table above), and GF2P8AFFINEQB
 * applies the 8x8 bit-matrices in `m` to every byte — xor-accumulating the
 * key bits selected by each data bit, which is exactly the Toeplitz fold. */
49 static_always_inline u64x8
50 clib_toeplitz_hash_gfni_one (u8x64 d0, u64x8 m, int i)
53 d0 = i == 1 ? (u8x64) u64x8_align_right (d0, d0, 1) : d0;
54 d0 = i == 2 ? (u8x64) u64x8_align_right (d0, d0, 2) : d0;
55 d0 = i == 3 ? (u8x64) u64x8_align_right (d0, d0, 3) : d0;
56 d0 = i == 4 ? (u8x64) u64x8_align_right (d0, d0, 4) : d0;
57 d0 = i == 5 ? (u8x64) u64x8_align_right (d0, d0, 5) : d0;
58 d0 = i == 6 ? (u8x64) u64x8_align_right (d0, d0, 6) : d0;
60 d0 = u8x64_permute (__clib_toeplitz_hash_gfni_permute, d0);
/* imm8 = 0: plain affine transform, no constant vector added */
62 return (u64x8) _mm512_gf2p8affine_epi64_epi8 ((__m512i) d0, (__m512i) m, 0);
/* Same as clib_toeplitz_hash_gfni_one but folds two independent 64-byte
 * data vectors (`d0`, `d1`) in one GF2P8AFFINEQB: both are rotated right by
 * `i` 64-bit words (the ternary chains fold at compile time), then
 * u8x64_permute2 (VPERMI2B) interleaves the 4-byte windows of the two
 * sources per the permute table, so the two hashes are computed in the
 * lower/upper halves of the same 512-bit result. */
69 d0 = i == 1 ? (u8x64) u64x8_align_right (d0, d0, 1) : d0;
70 d1 = i == 1 ? (u8x64) u64x8_align_right (d1, d1, 1) : d1;
71 d0 = i == 2 ? (u8x64) u64x8_align_right (d0, d0, 2) : d0;
72 d1 = i == 2 ? (u8x64) u64x8_align_right (d1, d1, 2) : d1;
73 d0 = i == 3 ? (u8x64) u64x8_align_right (d0, d0, 3) : d0;
74 d1 = i == 3 ? (u8x64) u64x8_align_right (d1, d1, 3) : d1;
75 d0 = i == 4 ? (u8x64) u64x8_align_right (d0, d0, 4) : d0;
76 d1 = i == 4 ? (u8x64) u64x8_align_right (d1, d1, 4) : d1;
77 d0 = i == 5 ? (u8x64) u64x8_align_right (d0, d0, 5) : d0;
78 d1 = i == 5 ? (u8x64) u64x8_align_right (d1, d1, 5) : d1;
79 d0 = i == 6 ? (u8x64) u64x8_align_right (d0, d0, 6) : d0;
80 d1 = i == 6 ? (u8x64) u64x8_align_right (d1, d1, 6) : d1;
82 d0 = u8x64_permute2 (__clib_toeplitz_hash_gfni_permute, d0, d1);
/* imm8 = 0: plain affine transform, no constant vector added */
84 return (u64x8) _mm512_gf2p8affine_epi64_epi8 ((__m512i) d0, (__m512i) m, 0);
/* Compute the 32-bit Toeplitz (RSS) hash of `data` (`n_bytes` long) using
 * key `k`.  The key must be at least 4 bytes longer than the data so every
 * data bit has a full 32-bit key window below it.  Three code paths are
 * selected at compile time: GFNI+AVX512, AVX2 (CLIB_HAVE_VEC256), and a
 * portable bit-at-a-time scalar fallback. */
88 static_always_inline u32
89 clib_toeplitz_hash (clib_toeplitz_hash_key_t *k, u8 *data, int n_bytes)
92 /* key must be 4 bytes longer than data */
93 ASSERT (k->key_length - n_bytes >= 4);
95 #if defined(__GFNI__) && defined(__AVX512F__)
/* precomputed per-key GF(2) bit-matrices, stored at gfni_offset into `k` */
98 u64x8u *m = (u64x8u *) ((u8 *) k + k->gfni_offset);
100 /* move data ptr backwards by 3 bytes so the mask load "prepends" three zeros */
/* short input: masked load of n_bytes bytes, shifted up by the 3-byte pad */
106 d0 = u8x64_mask_load_zero ((u8 *) data, pow2_mask (n_bytes - 3) << 3);
/* long input: load a full 61 bytes after the 3-byte zero pad */
110 d0 = u8x64_mask_load_zero ((u8 *) data, -1ULL << 3);
/* 7 affine steps (one per 64-bit rotation) xor-folded into h0 */
112 h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_one (d0, m[0], 0),
113 clib_toeplitz_hash_gfni_one (d0, m[1], 1));
114 h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_one (d0, m[2], 2),
115 clib_toeplitz_hash_gfni_one (d0, m[3], 3));
116 h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_one (d0, m[4], 4),
117 clib_toeplitz_hash_gfni_one (d0, m[5], 5));
118 h0 ^= clib_toeplitz_hash_gfni_one (d0, m[6], 6);
/* bulk loop body: full unaligned 64-byte loads while data remains */
125 d0 = *(u8x64u *) data;
/* tail: masked load of the remaining n_bytes */
132 d0 = u8x64_mask_load_zero ((u8 *) data, pow2_mask (n_bytes));
134 h0 ^= clib_toeplitz_hash_gfni_one (d0, m[0], 0);
/* shift data left one 64-bit word so the next matrix set lines up */
140 d0 = (u8x64) u64x8_align_right (u64x8_zero (), d0, 1);
/* horizontal xor of the 8 partial lanes yields the final hash */
145 return u64x8_hxor (h0);
146 #elif defined(CLIB_HAVE_VEC256)
/* per-lane bit pre-shifts: lane j uses the key window shifted left j bits */
147 u64x4 v4, shift = { 0, 1, 2, 3 };
/* main loop: load the next big-endian 64-bit key window ... */
152 v4 = u64x4_splat (clib_net_to_host_u64 (*(u64u *) key)) << shift;
/* ... and fold 4 data bytes, one byte (8 lanes) at a time */
154 h0 = toeplitz_hash_one_x8 (h0, v4, data[0], 0);
155 h0 = toeplitz_hash_one_x8 (h0, v4, data[1], 1);
156 h0 = toeplitz_hash_one_x8 (h0, v4, data[2], 2);
157 h0 = toeplitz_hash_one_x8 (h0, v4, data[3], 3);
/* tail (1-3 bytes left): assemble a partial key window byte by byte */
166 u64 v = (u64) clib_net_to_host_u32 ((u64) (*(u32u *) key)) << 32;
167 v |= (u64) key[4] << 24;
171 v |= (u64) key[5] << 16;
172 v |= (u64) key[6] << 8;
173 v4 = u64x4_splat (v) << shift;
174 h0 = toeplitz_hash_one_x8 (h0, v4, data[0], 0);
175 h0 = toeplitz_hash_one_x8 (h0, v4, data[1], 1);
176 h0 = toeplitz_hash_one_x8 (h0, v4, data[2], 2);
178 else if (n_bytes == 2)
180 v |= (u64) key[5] << 16;
181 v4 = u64x4_splat (v) << shift;
182 h0 = toeplitz_hash_one_x8 (h0, v4, data[0], 0);
183 h0 = toeplitz_hash_one_x8 (h0, v4, data[1], 1);
187 v4 = u64x4_splat (v) << shift;
188 h0 = toeplitz_hash_one_x8 (h0, v4, data[0], 0);
/* xor the 8 lanes together for the final hash */
192 return u32x8_hxor (h0);
/* scalar fallback: slide a 64-bit key window one bit per data bit; each
 * set data bit xors the current window into the accumulator (the final
 * hash is taken from its upper 32 bits — not visible in this excerpt) */
198 v = clib_net_to_host_u64 (*(u64u *) key);
200 for (u8 bit = 1 << 7, byte = data[0]; bit; bit >>= 1, v <<= 1)
201 hash ^= byte & bit ? v : 0;
202 for (u8 bit = 1 << 7, byte = data[1]; bit; bit >>= 1, v <<= 1)
203 hash ^= byte & bit ? v : 0;
204 for (u8 bit = 1 << 7, byte = data[2]; bit; bit >>= 1, v <<= 1)
205 hash ^= byte & bit ? v : 0;
206 for (u8 bit = 1 << 7, byte = data[3]; bit; bit >>= 1, v <<= 1)
207 hash ^= byte & bit ? v : 0;
/* scalar tail: rebuild the window from the remaining key bytes */
216 v = (u64) clib_net_to_host_u32 ((u64) (*(u32u *) key)) << 32;
217 v |= (u64) key[4] << 24;
218 for (u8 bit = 1 << 7, byte = data[0]; bit; bit >>= 1, v <<= 1)
219 hash ^= byte & bit ? v : 0;
/* `v` was shifted left 8 by the loop above, so the next key byte is
 * merged at bit 24 again — intentional, not a copy/paste bug */
222 v |= (u64) key[5] << 24;
223 for (u8 bit = 1 << 7, byte = data[1]; bit; bit >>= 1, v <<= 1)
224 hash ^= byte & bit ? v : 0;
228 v |= (u64) key[6] << 24;
229 for (u8 bit = 1 << 7, byte = data[2]; bit; bit >>= 1, v <<= 1)
230 hash ^= byte & bit ? v : 0;
236 static_always_inline void
237 clib_toeplitz_hash_x4 (clib_toeplitz_hash_key_t *k, u8 *data0, u8 *data1,
238 u8 *data2, u8 *data3, u32 *hash0, u32 *hash1,
239 u32 *hash2, u32 *hash3, int n_bytes)
241 /* key must be 4 bytes longer than data */
242 ASSERT (k->key_length - n_bytes >= 4);
243 #if defined(__GFNI__) && defined(__AVX512F__)
244 u64x8u *m = (u64x8u *) ((u8 *) k + k->gfni_offset);
245 u8x64 d0, d1, d2, d3;
246 u64x8 h0 = {}, h2 = {};
249 /* move data ptr backwards for 3 byte so mask load "prepends" three zeros */
258 mask = pow2_mask (n_bytes - 3) << 3;
259 d0 = u8x64_mask_load_zero ((u8 *) data0, mask);
260 d1 = u8x64_mask_load_zero ((u8 *) data1, mask);
261 d2 = u8x64_mask_load_zero ((u8 *) data2, mask);
262 d3 = u8x64_mask_load_zero ((u8 *) data3, mask);
267 d0 = u8x64_mask_load_zero ((u8 *) data0, mask);
268 d1 = u8x64_mask_load_zero ((u8 *) data1, mask);
269 d2 = u8x64_mask_load_zero ((u8 *) data2, mask);
270 d3 = u8x64_mask_load_zero ((u8 *) data3, mask);
272 h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_two (d0, d1, m[0], 0),
273 clib_toeplitz_hash_gfni_two (d0, d1, m[1], 1));
274 h2 = u64x8_xor3 (h2, clib_toeplitz_hash_gfni_two (d2, d3, m[0], 0),
275 clib_toeplitz_hash_gfni_two (d2, d3, m[1], 1));
277 h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_two (d0, d1, m[2], 2),
278 clib_toeplitz_hash_gfni_two (d0, d1, m[3], 3));
279 h2 = u64x8_xor3 (h2, clib_toeplitz_hash_gfni_two (d2, d3, m[2], 2),
280 clib_toeplitz_hash_gfni_two (d2, d3, m[3], 3));
282 h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_two (d0, d1, m[4], 4),
283 clib_toeplitz_hash_gfni_two (d0, d1, m[5], 5));
284 h2 = u64x8_xor3 (h2, clib_toeplitz_hash_gfni_two (d2, d3, m[4], 4),
285 clib_toeplitz_hash_gfni_two (d2, d3, m[5], 5));
287 h0 ^= clib_toeplitz_hash_gfni_two (d0, d1, m[6], 6);
288 h2 ^= clib_toeplitz_hash_gfni_two (d2, d3, m[6], 6);
299 d0 = *(u8x64u *) data0;
300 d1 = *(u8x64u *) data1;
301 d2 = *(u8x64u *) data2;
302 d3 = *(u8x64u *) data3;
309 mask = pow2_mask (n_bytes);
310 d0 = u8x64_mask_load_zero ((u8 *) data0, mask);
311 d1 = u8x64_mask_load_zero ((u8 *) data1, mask);
312 d2 = u8x64_mask_load_zero ((u8 *) data2, mask);
313 d3 = u8x64_mask_load_zero ((u8 *) data3, mask);
315 h0 ^= clib_toeplitz_hash_gfni_two (d0, d1, m[0], 0);
316 h2 ^= clib_toeplitz_hash_gfni_two (d2, d3, m[0], 0);
323 d0 = (u8x64) u64x8_align_right (zero, d0, 1);
324 d1 = (u8x64) u64x8_align_right (zero, d1, 1);
325 d2 = (u8x64) u64x8_align_right (zero, d2, 1);
326 d3 = (u8x64) u64x8_align_right (zero, d3, 1);
337 #elif defined(CLIB_HAVE_VEC256)
339 u64x4 v4, shift = { 0, 1, 2, 3 };
340 u32x8 h0 = {}, h1 = {}, h2 = {}, h3 = {};
344 v4 = u64x4_splat (clib_net_to_host_u64 (*(u64u *) key)) << shift;
346 h0 = toeplitz_hash_one_x8 (h0, v4, data0[0], 0);
347 h1 = toeplitz_hash_one_x8 (h1, v4, data1[0], 0);
348 h2 = toeplitz_hash_one_x8 (h2, v4, data2[0], 0);
349 h3 = toeplitz_hash_one_x8 (h3, v4, data3[0], 0);
351 h0 = toeplitz_hash_one_x8 (h0, v4, data0[1], 1);
352 h1 = toeplitz_hash_one_x8 (h1, v4, data1[1], 1);
353 h2 = toeplitz_hash_one_x8 (h2, v4, data2[1], 1);
354 h3 = toeplitz_hash_one_x8 (h3, v4, data3[1], 1);
356 h0 = toeplitz_hash_one_x8 (h0, v4, data0[2], 2);
357 h1 = toeplitz_hash_one_x8 (h1, v4, data1[2], 2);
358 h2 = toeplitz_hash_one_x8 (h2, v4, data2[2], 2);
359 h3 = toeplitz_hash_one_x8 (h3, v4, data3[2], 2);
361 h0 = toeplitz_hash_one_x8 (h0, v4, data0[3], 3);
362 h1 = toeplitz_hash_one_x8 (h1, v4, data1[3], 3);
363 h2 = toeplitz_hash_one_x8 (h2, v4, data2[3], 3);
364 h3 = toeplitz_hash_one_x8 (h3, v4, data3[3], 3);
376 u64 v = (u64) clib_net_to_host_u32 ((u64) (*(u32u *) key)) << 32;
377 v |= (u64) key[4] << 24;
381 v |= (u64) key[5] << 16;
382 v |= (u64) key[6] << 8;
383 v4 = u64x4_splat (v) << shift;
384 h0 = toeplitz_hash_one_x8 (h0, v4, data0[0], 0);
385 h1 = toeplitz_hash_one_x8 (h1, v4, data1[0], 0);
386 h2 = toeplitz_hash_one_x8 (h2, v4, data2[0], 0);
387 h3 = toeplitz_hash_one_x8 (h3, v4, data3[0], 0);
389 h0 = toeplitz_hash_one_x8 (h0, v4, data0[1], 1);
390 h1 = toeplitz_hash_one_x8 (h1, v4, data1[1], 1);
391 h2 = toeplitz_hash_one_x8 (h2, v4, data2[1], 1);
392 h3 = toeplitz_hash_one_x8 (h3, v4, data3[1], 1);
394 h0 = toeplitz_hash_one_x8 (h0, v4, data0[2], 2);
395 h1 = toeplitz_hash_one_x8 (h1, v4, data1[2], 2);
396 h2 = toeplitz_hash_one_x8 (h2, v4, data2[2], 2);
397 h3 = toeplitz_hash_one_x8 (h3, v4, data3[2], 2);
399 else if (n_bytes == 2)
401 v |= (u64) key[5] << 16;
402 v4 = u64x4_splat (v) << shift;
403 h0 = toeplitz_hash_one_x8 (h0, v4, data0[0], 0);
404 h1 = toeplitz_hash_one_x8 (h1, v4, data1[0], 0);
405 h2 = toeplitz_hash_one_x8 (h2, v4, data2[0], 0);
406 h3 = toeplitz_hash_one_x8 (h3, v4, data3[0], 0);
408 h0 = toeplitz_hash_one_x8 (h0, v4, data0[1], 1);
409 h1 = toeplitz_hash_one_x8 (h1, v4, data1[1], 1);
410 h2 = toeplitz_hash_one_x8 (h2, v4, data2[1], 1);
411 h3 = toeplitz_hash_one_x8 (h3, v4, data3[1], 1);
415 v4 = u64x4_splat (v) << shift;
416 h0 = toeplitz_hash_one_x8 (h0, v4, data0[0], 0);
417 h1 = toeplitz_hash_one_x8 (h1, v4, data1[0], 0);
418 h2 = toeplitz_hash_one_x8 (h2, v4, data2[0], 0);
419 h3 = toeplitz_hash_one_x8 (h3, v4, data3[0], 0);
423 *hash0 = u32x8_hxor (h0);
424 *hash1 = u32x8_hxor (h1);
425 *hash2 = u32x8_hxor (h2);
426 *hash3 = u32x8_hxor (h3);
429 u64 v, h0 = 0, h1 = 0, h2 = 0, h3 = 0;
433 v = clib_net_to_host_u64 (*(u64u *) key);
435 for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1)
437 h0 ^= data0[0] & bit ? v : 0;
438 h1 ^= data1[0] & bit ? v : 0;
439 h2 ^= data2[0] & bit ? v : 0;
440 h3 ^= data3[0] & bit ? v : 0;
442 for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1)
444 h0 ^= data0[1] & bit ? v : 0;
445 h1 ^= data1[1] & bit ? v : 0;
446 h2 ^= data2[1] & bit ? v : 0;
447 h3 ^= data3[1] & bit ? v : 0;
449 for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1)
451 h0 ^= data0[2] & bit ? v : 0;
452 h1 ^= data1[2] & bit ? v : 0;
453 h2 ^= data2[2] & bit ? v : 0;
454 h3 ^= data3[2] & bit ? v : 0;
456 for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1)
458 h0 ^= data0[3] & bit ? v : 0;
459 h1 ^= data1[3] & bit ? v : 0;
460 h2 ^= data2[3] & bit ? v : 0;
461 h3 ^= data3[3] & bit ? v : 0;
474 v = (u64) clib_net_to_host_u32 ((u64) (*(u32u *) key)) << 32;
475 v |= (u64) key[4] << 24;
476 for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1)
478 h0 ^= data0[0] & bit ? v : 0;
479 h1 ^= data1[0] & bit ? v : 0;
480 h2 ^= data2[0] & bit ? v : 0;
481 h3 ^= data3[0] & bit ? v : 0;
485 v |= (u64) key[5] << 24;
486 for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1)
488 h0 ^= data0[1] & bit ? v : 0;
489 h1 ^= data1[1] & bit ? v : 0;
490 h2 ^= data2[1] & bit ? v : 0;
491 h3 ^= data3[1] & bit ? v : 0;
496 v |= (u64) key[6] << 24;
497 for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1)
499 h0 ^= data0[2] & bit ? v : 0;
500 h1 ^= data1[2] & bit ? v : 0;
501 h2 ^= data2[2] & bit ? v : 0;
502 h3 ^= data3[2] & bit ? v : 0;