/* SPDX-License-Identifier: Apache-2.0
 * Copyright(c) 2021 Damjan Marion
 */

#ifndef included_clib_memcpy_x86_64_h
#define included_clib_memcpy_x86_64_h
#ifdef __x86_64__

#include <vppinfra/clib.h>
#include <vppinfra/warnings.h>

/* clang-format off */
WARN_OFF (stringop-overflow)
/* clang-format on */
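
/* Fixed-size scalar copies. The unaligned types (u16u, u32u, u64u) tell the
 * compiler the pointers may be misaligned, so each helper compiles down to a
 * single unaligned load/store pair. */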
static_always_inline void
clib_memcpy1 (void *d, void *s)
{
  *(u8 *) d = *(u8 *) s;
}

static_always_inline void
clib_memcpy2 (void *d, void *s)
{
  *(u16u *) d = *(u16u *) s;
}

static_always_inline void
clib_memcpy4 (void *d, void *s)
{
  *(u32u *) d = *(u32u *) s;
}

static_always_inline void
clib_memcpy8 (void *d, void *s)
{
  *(u64u *) d = *(u64u *) s;
}
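
/* Fixed-size vector copies, compiled in only when the target supports the
 * corresponding register width (on x86_64: SSE, AVX2, AVX-512). */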
#ifdef CLIB_HAVE_VEC128
static_always_inline void
clib_memcpy16 (void *d, void *s)
{
  *(u8x16u *) d = *(u8x16u *) s;
}
#endif

#ifdef CLIB_HAVE_VEC256
static_always_inline void
clib_memcpy32 (void *d, void *s)
{
  *(u8x32u *) d = *(u8x32u *) s;
}
#endif

#ifdef CLIB_HAVE_VEC512
static_always_inline void
clib_memcpy64 (void *d, void *s)
{
  *(u8x64u *) d = *(u8x64u *) s;
}
#endif
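
/* Copies for compile-time-constant sizes up to 32 bytes. Sizes that are not
 * an exact sum of power-of-two chunks are finished with a second copy that
 * ends exactly at byte n and may overlap the first one: e.g. n == 12 copies
 * bytes 0-7 and 8-11, while n == 11 copies bytes 0-7 and 7-10. The
 * overlapping store is harmless and avoids a byte-by-byte tail loop. */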
static_always_inline void
clib_memcpy_const_le32 (u8 *dst, u8 *src, size_t n)
{
  switch (n)
    {
    case 1:
      clib_memcpy1 (dst, src);
      break;
    case 2:
      clib_memcpy2 (dst, src);
      break;
    case 3:
      clib_memcpy2 (dst, src);
      clib_memcpy1 (dst + 2, src + 2);
      break;
    case 4:
      clib_memcpy4 (dst, src);
      break;
    case 5:
      clib_memcpy4 (dst, src);
      clib_memcpy1 (dst + 4, src + 4);
      break;
    case 6:
      clib_memcpy4 (dst, src);
      clib_memcpy2 (dst + 4, src + 4);
      break;
    case 7:
      clib_memcpy4 (dst, src);
      clib_memcpy4 (dst + 3, src + 3);
      break;
    case 8:
      clib_memcpy8 (dst, src);
      break;
    case 9:
      clib_memcpy8 (dst, src);
      clib_memcpy1 (dst + 8, src + 8);
      break;
    case 10:
      clib_memcpy8 (dst, src);
      clib_memcpy2 (dst + 8, src + 8);
      break;
    case 11:
    case 12:
      clib_memcpy8 (dst, src);
      clib_memcpy4 (dst + n - 4, src + n - 4);
      break;
    case 13:
    case 14:
    case 15:
      clib_memcpy8 (dst, src);
      clib_memcpy8 (dst + n - 8, src + n - 8);
      break;
    case 16:
      clib_memcpy16 (dst, src);
      break;
    case 17:
      clib_memcpy16 (dst, src);
      clib_memcpy1 (dst + 16, src + 16);
      break;
    case 18:
      clib_memcpy16 (dst, src);
      clib_memcpy2 (dst + 16, src + 16);
      break;
    case 20:
      clib_memcpy16 (dst, src);
      clib_memcpy4 (dst + 16, src + 16);
      break;
    case 24:
      clib_memcpy16 (dst, src);
      clib_memcpy8 (dst + 16, src + 16);
      break;
    default:
      clib_memcpy16 (dst, src);
      clib_memcpy16 (dst + n - 16, src + n - 16);
      break;
    }
}
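
/* Constant sizes up to 64 bytes: one 32-byte copy plus a remainder, again
 * using a trailing overlapped copy for sizes with no exact decomposition. */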
static_always_inline void
clib_memcpy_const_le64 (u8 *dst, u8 *src, size_t n)
{
  if (n < 32)
    {
      clib_memcpy_const_le32 (dst, src, n);
      return;
    }

#if defined(CLIB_HAVE_VEC256)
  switch (n)
    {
    case 32:
      clib_memcpy32 (dst, src);
      break;
    case 33:
      clib_memcpy32 (dst, src);
      clib_memcpy1 (dst + 32, src + 32);
      break;
    case 34:
      clib_memcpy32 (dst, src);
      clib_memcpy2 (dst + 32, src + 32);
      break;
    case 36:
      clib_memcpy32 (dst, src);
      clib_memcpy4 (dst + 32, src + 32);
      break;
    case 40:
      clib_memcpy32 (dst, src);
      clib_memcpy8 (dst + 32, src + 32);
      break;
    case 48:
      clib_memcpy32 (dst, src);
      clib_memcpy16 (dst + 32, src + 32);
      break;
    default:
      clib_memcpy32 (dst, src);
      clib_memcpy32 (dst + n - 32, src + n - 32);
      break;
    }
#else
  while (n > 31)
    {
      clib_memcpy16 (dst, src);
      clib_memcpy16 (dst + 16, src + 16);
      dst += 32;
      src += 32;
      n -= 32;
    }
  clib_memcpy_const_le32 (dst, src, n);
#endif
}
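
/* Constant-size entry point: bulk-copies full vectors first, then hands the
 * sub-vector remainder to the helpers above. Everything here unrolls at
 * compile time, so a constant-size clib_memcpy becomes a short, fixed
 * sequence of loads and stores. */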
static_always_inline void
clib_memcpy_x86_64_const (u8 *dst, u8 *src, size_t n)
{
#if defined(CLIB_HAVE_VEC512)
  while (n > 128)
    {
      clib_memcpy64 (dst, src);
      dst += 64;
      src += 64;
      n -= 64;
    }

  if (n < 64)
    {
      clib_memcpy_const_le64 (dst, src, n);
      return;
    }

  switch (n)
    {
    case 64:
      clib_memcpy64 (dst, src);
      break;
    case 65:
      clib_memcpy64 (dst, src);
      clib_memcpy1 (dst + 64, src + 64);
      break;
    case 66:
      clib_memcpy64 (dst, src);
      clib_memcpy2 (dst + 64, src + 64);
      break;
    case 68:
      clib_memcpy64 (dst, src);
      clib_memcpy4 (dst + 64, src + 64);
      break;
    case 72:
      clib_memcpy64 (dst, src);
      clib_memcpy8 (dst + 64, src + 64);
      break;
    case 80:
      clib_memcpy64 (dst, src);
      clib_memcpy16 (dst + 64, src + 64);
      break;
    case 96:
      clib_memcpy64 (dst, src);
      clib_memcpy32 (dst + 64, src + 64);
      break;
    default:
      clib_memcpy64 (dst, src);
      clib_memcpy64 (dst + n - 64, src + n - 64);
      break;
    }
#elif defined(CLIB_HAVE_VEC256)
  while (n > 64)
    {
      clib_memcpy32 (dst, src);
      dst += 32;
      src += 32;
      n -= 32;
    }
  clib_memcpy_const_le64 (dst, src, n);
#else
  while (n > 32)
    {
      clib_memcpy16 (dst, src);
      dst += 16;
      src += 16;
      n -= 16;
    }
  clib_memcpy_const_le32 (dst, src, n);
#endif
}
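
/* Runtime entry point. Compile-time-constant sizes are fully unrolled via
 * the helpers above. Runtime sizes below 32 bytes use a single masked
 * load/store where the ISA provides one, or two possibly-overlapping copies
 * otherwise; larger sizes fall into the unrolled vector loops below. */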
static_always_inline void *
clib_memcpy_x86_64 (void *restrict dst, const void *restrict src, size_t n)
{
  u8 *d = (u8 *) dst, *s = (u8 *) src;

  if (n == 0)
    return dst;

  if (COMPILE_TIME_CONST (n))
    {
      clib_memcpy_x86_64_const (d, s, n);
      return dst;
    }

  if (n < 32)
    {
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
      /* one masked load/store covers any 1..31 byte copy */
      u32 mask = pow2_mask (n);
      u8x32_mask_store (u8x32_mask_load_zero (s, mask), d, mask);
#else
      /* two copies: one from the start, one ending exactly at byte n;
	 they may overlap in the middle */
      if (PREDICT_TRUE (n >= 16))
	{
	  clib_memcpy16 (d, s);
	  clib_memcpy16 (d + n - 16, s + n - 16);
	}
      else if (PREDICT_TRUE (n >= 8))
	{
	  clib_memcpy8 (d, s);
	  clib_memcpy8 (d + n - 8, s + n - 8);
	}
      else if (PREDICT_TRUE (n >= 4))
	{
	  clib_memcpy4 (d, s);
	  clib_memcpy4 (d + n - 4, s + n - 4);
	}
      else if (PREDICT_TRUE (n > 1))
	{
	  clib_memcpy2 (d, s);
	  clib_memcpy2 (d + n - 2, s + n - 2);
	}
      else
	clib_memcpy1 (d, s);
#endif
      return dst;
    }
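
  /* Bulk path, one variant per widest available vector size. Each variant
   * writes the first vector(s) unaligned, aligns the working offset so that
   * subsequent destination stores hit natural boundaries, streams large
   * unrolled rounds, then drains the remainder through goto-threaded partial
   * steps; the final vector store ends exactly at byte n and may overlap the
   * previous one. */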
#ifdef CLIB_HAVE_VEC512
  while (1)
    {
      u8x64 v0, v1, v2, v3;
      u64 final_off, nr, off = 64;

      /* 32..64 bytes: two overlapping 32-byte copies */
      if (n <= 64)
	{
	  n -= 32;
	  u8x32_store_unaligned (u8x32_load_unaligned (s), d);
	  u8x32_store_unaligned (u8x32_load_unaligned (s + n), d + n);
	  return dst;
	}

      u8x64_store_unaligned (u8x64_load_unaligned (s), d);

      if (n <= 128)
	goto done2;

      if (n <= 192)
	{
	  nr = round_pow2 (n - 128, 64);
	  goto one;
	}

      if (n <= 512 + 64)
	{
	  nr = round_pow2 (n - 128, 64);
	  goto last;
	}

      /* align subsequent stores to a 64-byte destination boundary */
      off -= ((u64) d) & 0x3f;
      nr = round_pow2 (n - off - 64, 64);
      final_off = (nr & ~(u64) 0x1ff) + off;

    more:
      v0 = u8x64_load_unaligned (s + off + 0x000);
      v1 = u8x64_load_unaligned (s + off + 0x040);
      v2 = u8x64_load_unaligned (s + off + 0x080);
      v3 = u8x64_load_unaligned (s + off + 0x0c0);
      u8x64_store_unaligned (v0, d + off + 0x000);
      u8x64_store_unaligned (v1, d + off + 0x040);
      u8x64_store_unaligned (v2, d + off + 0x080);
      u8x64_store_unaligned (v3, d + off + 0x0c0);
      v0 = u8x64_load_unaligned (s + off + 0x100);
      v1 = u8x64_load_unaligned (s + off + 0x140);
      v2 = u8x64_load_unaligned (s + off + 0x180);
      v3 = u8x64_load_unaligned (s + off + 0x1c0);
      u8x64_store_unaligned (v0, d + off + 0x100);
      u8x64_store_unaligned (v1, d + off + 0x140);
      u8x64_store_unaligned (v2, d + off + 0x180);
      u8x64_store_unaligned (v3, d + off + 0x1c0);
      off += 512;

      if (off != final_off)
	goto more;

      if ((nr & 0x1ff) == 0)
	goto done2;

    last:
      if (PREDICT_TRUE (nr & 256))
	{
	  v0 = u8x64_load_unaligned (s + off + 0x000);
	  v1 = u8x64_load_unaligned (s + off + 0x040);
	  v2 = u8x64_load_unaligned (s + off + 0x080);
	  v3 = u8x64_load_unaligned (s + off + 0x0c0);
	  u8x64_store_unaligned (v0, d + off + 0x000);
	  u8x64_store_unaligned (v1, d + off + 0x040);
	  u8x64_store_unaligned (v2, d + off + 0x080);
	  u8x64_store_unaligned (v3, d + off + 0x0c0);
	  off += 256;
	}
      if (PREDICT_TRUE (nr & 128))
	{
	  v0 = u8x64_load_unaligned (s + off + 0x000);
	  v1 = u8x64_load_unaligned (s + off + 0x040);
	  u8x64_store_unaligned (v0, d + off + 0x000);
	  u8x64_store_unaligned (v1, d + off + 0x040);
	  off += 128;
	}
    one:
      if (PREDICT_TRUE (nr & 64))
	u8x64_store_unaligned (u8x64_load_unaligned (s + off), d + off);
    done2:
      u8x64_store_unaligned (u8x64_load_unaligned (s + n - 64), d + n - 64);
      return dst;
    }
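
  /* AVX2 variant of the bulk path: identical control flow with 32-byte
   * vectors and 256-byte unrolled rounds. */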
#elif defined(CLIB_HAVE_VEC256)
  while (1)
    {
      u8x32 v0, v1, v2, v3;
      u64 final_off, nr, off = 32;

      u8x32_store_unaligned (u8x32_load_unaligned (s), d);

      if (n <= 64)
	goto done2;

      if (n <= 96)
	{
	  nr = round_pow2 (n - 64, 32);
	  goto one;
	}

      if (n <= 256 + 32)
	{
	  nr = round_pow2 (n - 64, 32);
	  goto last;
	}

      /* align subsequent stores to a 32-byte destination boundary */
      off -= ((u64) d) & 0x1f;
      nr = round_pow2 (n - off - 32, 32);
      final_off = (nr & ~(u64) 0xff) + off;

    more:
      v0 = u8x32_load_unaligned (s + off + 0x00);
      v1 = u8x32_load_unaligned (s + off + 0x20);
      v2 = u8x32_load_unaligned (s + off + 0x40);
      v3 = u8x32_load_unaligned (s + off + 0x60);
      u8x32_store_unaligned (v0, d + off + 0x00);
      u8x32_store_unaligned (v1, d + off + 0x20);
      u8x32_store_unaligned (v2, d + off + 0x40);
      u8x32_store_unaligned (v3, d + off + 0x60);
      v0 = u8x32_load_unaligned (s + off + 0x80);
      v1 = u8x32_load_unaligned (s + off + 0xa0);
      v2 = u8x32_load_unaligned (s + off + 0xc0);
      v3 = u8x32_load_unaligned (s + off + 0xe0);
      u8x32_store_unaligned (v0, d + off + 0x80);
      u8x32_store_unaligned (v1, d + off + 0xa0);
      u8x32_store_unaligned (v2, d + off + 0xc0);
      u8x32_store_unaligned (v3, d + off + 0xe0);
      off += 256;

      if (off != final_off)
	goto more;

      if ((nr & 0xff) == 0)
	goto done2;

    last:
      if (PREDICT_TRUE (nr & 128))
	{
	  v0 = u8x32_load_unaligned (s + off + 0x00);
	  v1 = u8x32_load_unaligned (s + off + 0x20);
	  v2 = u8x32_load_unaligned (s + off + 0x40);
	  v3 = u8x32_load_unaligned (s + off + 0x60);
	  u8x32_store_unaligned (v0, d + off + 0x00);
	  u8x32_store_unaligned (v1, d + off + 0x20);
	  u8x32_store_unaligned (v2, d + off + 0x40);
	  u8x32_store_unaligned (v3, d + off + 0x60);
	  off += 128;
	}
      if (PREDICT_TRUE (nr & 64))
	{
	  v0 = u8x32_load_unaligned (s + off + 0x00);
	  v1 = u8x32_load_unaligned (s + off + 0x20);
	  u8x32_store_unaligned (v0, d + off + 0x00);
	  u8x32_store_unaligned (v1, d + off + 0x20);
	  off += 64;
	}
    one:
      if (PREDICT_TRUE (nr & 32))
	u8x32_store_unaligned (u8x32_load_unaligned (s + off), d + off);
    done2:
      u8x32_store_unaligned (u8x32_load_unaligned (s + n - 32), d + n - 32);
      return dst;
    }
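
  /* SSE variant: 16-byte vectors; the first two stores cover 32 bytes, and
   * each unrolled round moves 256 bytes through sixteen load/store pairs. */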
#elif defined(CLIB_HAVE_VEC128)
  while (1)
    {
      u8x16 v0, v1, v2, v3;
      u64 final_off, nr, off = 32;

      /* disabled escape hatch: punt very large copies to the builtin memcpy */
      if (0 && n > 389)
	{
	  __builtin_memcpy (d, s, n);
	  return dst;
	}

      u8x16_store_unaligned (u8x16_load_unaligned (s), d);
      u8x16_store_unaligned (u8x16_load_unaligned (s + 16), d + 16);

      if (n <= 48)
	goto done2;

      if (n <= 64)
	{
	  nr = round_pow2 (n - 48, 16);
	  goto one;
	}

      if (n <= 256 + 32)
	{
	  nr = round_pow2 (n - 48, 16);
	  goto last;
	}

      /* align subsequent stores to a 16-byte destination boundary */
      off -= ((u64) d) & 0x0f;
      nr = round_pow2 (n - off - 16, 16);
      final_off = (nr & ~(u64) 0xff) + off;

    more:
      v0 = u8x16_load_unaligned (s + off + 0x00);
      v1 = u8x16_load_unaligned (s + off + 0x10);
      v2 = u8x16_load_unaligned (s + off + 0x20);
      v3 = u8x16_load_unaligned (s + off + 0x30);
      u8x16_store_unaligned (v0, d + off + 0x00);
      u8x16_store_unaligned (v1, d + off + 0x10);
      u8x16_store_unaligned (v2, d + off + 0x20);
      u8x16_store_unaligned (v3, d + off + 0x30);
      v0 = u8x16_load_unaligned (s + off + 0x40);
      v1 = u8x16_load_unaligned (s + off + 0x50);
      v2 = u8x16_load_unaligned (s + off + 0x60);
      v3 = u8x16_load_unaligned (s + off + 0x70);
      u8x16_store_unaligned (v0, d + off + 0x40);
      u8x16_store_unaligned (v1, d + off + 0x50);
      u8x16_store_unaligned (v2, d + off + 0x60);
      u8x16_store_unaligned (v3, d + off + 0x70);
      v0 = u8x16_load_unaligned (s + off + 0x80);
      v1 = u8x16_load_unaligned (s + off + 0x90);
      v2 = u8x16_load_unaligned (s + off + 0xa0);
      v3 = u8x16_load_unaligned (s + off + 0xb0);
      u8x16_store_unaligned (v0, d + off + 0x80);
      u8x16_store_unaligned (v1, d + off + 0x90);
      u8x16_store_unaligned (v2, d + off + 0xa0);
      u8x16_store_unaligned (v3, d + off + 0xb0);
      v0 = u8x16_load_unaligned (s + off + 0xc0);
      v1 = u8x16_load_unaligned (s + off + 0xd0);
      v2 = u8x16_load_unaligned (s + off + 0xe0);
      v3 = u8x16_load_unaligned (s + off + 0xf0);
      u8x16_store_unaligned (v0, d + off + 0xc0);
      u8x16_store_unaligned (v1, d + off + 0xd0);
      u8x16_store_unaligned (v2, d + off + 0xe0);
      u8x16_store_unaligned (v3, d + off + 0xf0);
      off += 256;

      if (off != final_off)
	goto more;

      if ((nr & 0xff) == 0)
	goto done2;

    last:
      if (PREDICT_TRUE (nr & 128))
	{
	  v0 = u8x16_load_unaligned (s + off + 0x00);
	  v1 = u8x16_load_unaligned (s + off + 0x10);
	  v2 = u8x16_load_unaligned (s + off + 0x20);
	  v3 = u8x16_load_unaligned (s + off + 0x30);
	  u8x16_store_unaligned (v0, d + off + 0x00);
	  u8x16_store_unaligned (v1, d + off + 0x10);
	  u8x16_store_unaligned (v2, d + off + 0x20);
	  u8x16_store_unaligned (v3, d + off + 0x30);
	  v0 = u8x16_load_unaligned (s + off + 0x40);
	  v1 = u8x16_load_unaligned (s + off + 0x50);
	  v2 = u8x16_load_unaligned (s + off + 0x60);
	  v3 = u8x16_load_unaligned (s + off + 0x70);
	  u8x16_store_unaligned (v0, d + off + 0x40);
	  u8x16_store_unaligned (v1, d + off + 0x50);
	  u8x16_store_unaligned (v2, d + off + 0x60);
	  u8x16_store_unaligned (v3, d + off + 0x70);
	  off += 128;
	}
      if (PREDICT_TRUE (nr & 64))
	{
	  v0 = u8x16_load_unaligned (s + off + 0x00);
	  v1 = u8x16_load_unaligned (s + off + 0x10);
	  v2 = u8x16_load_unaligned (s + off + 0x20);
	  v3 = u8x16_load_unaligned (s + off + 0x30);
	  u8x16_store_unaligned (v0, d + off + 0x00);
	  u8x16_store_unaligned (v1, d + off + 0x10);
	  u8x16_store_unaligned (v2, d + off + 0x20);
	  u8x16_store_unaligned (v3, d + off + 0x30);
	  off += 64;
	}
      if (PREDICT_TRUE (nr & 32))
	{
	  v0 = u8x16_load_unaligned (s + off + 0x00);
	  v1 = u8x16_load_unaligned (s + off + 0x10);
	  u8x16_store_unaligned (v0, d + off + 0x00);
	  u8x16_store_unaligned (v1, d + off + 0x10);
	  off += 32;
	}
    one:
      if (PREDICT_TRUE (nr & 16))
	u8x16_store_unaligned (u8x16_load_unaligned (s + off), d + off);
    done2:
      u8x16_store_unaligned (u8x16_load_unaligned (s + n - 16), d + n - 16);
      return dst;
    }
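
  /* no usable vector unit: fall back to the compiler's builtin memcpy */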
#else
  __builtin_memcpy (dst, src, n);
#endif
  return dst;
}

/* clang-format off */
WARN_ON (stringop-overflow)
/* clang-format on */

#endif /* __x86_64__ */
#endif /* included_clib_memcpy_x86_64_h */