/* SPDX-License-Identifier: Apache-2.0
 * Copyright(c) 2021 Damjan Marion
 */

#ifndef included_clib_memcpy_x86_64_h
#define included_clib_memcpy_x86_64_h

#include <vppinfra/clib.h>
#include <vppinfra/warnings.h>

/* clang-format off */
WARN_OFF (stringop-overflow)
/* clang-format on */
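
/* Fixed-size copy helpers. Each compiles to a single load/store pair of
 * the given width; the u16u/u32u/u64u types are vppinfra's unaligned
 * scalar types, so no alignment is assumed on either pointer. */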
static_always_inline void
clib_memcpy1 (void *d, void *s)
{
  *(u8 *) d = *(u8 *) s;
}

static_always_inline void
clib_memcpy2 (void *d, void *s)
{
  *(u16u *) d = *(u16u *) s;
}

static_always_inline void
clib_memcpy4 (void *d, void *s)
{
  *(u32u *) d = *(u32u *) s;
}

static_always_inline void
clib_memcpy8 (void *d, void *s)
{
  *(u64u *) d = *(u64u *) s;
}

static_always_inline void
clib_memcpy16 (void *d, void *s)
{
#ifdef CLIB_HAVE_VEC128
  *(u8x16u *) d = *(u8x16u *) s;
#else
  clib_memcpy8 (d, s);
  clib_memcpy8 (d + 8, s + 8);
#endif
}

#ifdef CLIB_HAVE_VEC256
static_always_inline void
clib_memcpy32 (void *d, void *s)
{
  *(u8x32u *) d = *(u8x32u *) s;
}
#endif

#ifdef CLIB_HAVE_VEC512
static_always_inline void
clib_memcpy64 (void *d, void *s)
{
  *(u8x64u *) d = *(u8x64u *) s;
}
#endif

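/* Copies where n is known at compile time. For 1-32 bytes the switch
 * below needs at most two loads and two stores; sizes without an exact
 * case (e.g. 11-15, 19, 21-23, 25-32) are finished with a second copy
 * that overlaps the first, which is cheaper than a byte loop. */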
static_always_inline void
clib_memcpy_const_le32 (u8 *dst, u8 *src, size_t n)
{
  switch (n)
    {
    case 1:
      clib_memcpy1 (dst, src);
      break;
    case 2:
      clib_memcpy2 (dst, src);
      break;
    case 3:
      clib_memcpy2 (dst, src);
      clib_memcpy1 (dst + 2, src + 2);
      break;
    case 4:
      clib_memcpy4 (dst, src);
      break;
    case 5:
      clib_memcpy4 (dst, src);
      clib_memcpy1 (dst + 4, src + 4);
      break;
    case 6:
      clib_memcpy4 (dst, src);
      clib_memcpy2 (dst + 4, src + 4);
      break;
    case 7:
      clib_memcpy4 (dst, src);
      clib_memcpy4 (dst + 3, src + 3);
      break;
    case 8:
      clib_memcpy8 (dst, src);
      break;
    case 9:
      clib_memcpy8 (dst, src);
      clib_memcpy1 (dst + 8, src + 8);
      break;
    case 10:
      clib_memcpy8 (dst, src);
      clib_memcpy2 (dst + 8, src + 8);
      break;
    case 11:
    case 12:
      clib_memcpy8 (dst, src);
      clib_memcpy4 (dst + n - 4, src + n - 4);
      break;
    case 13:
    case 14:
    case 15:
      clib_memcpy8 (dst, src);
      clib_memcpy8 (dst + n - 8, src + n - 8);
      break;
    case 16:
      clib_memcpy16 (dst, src);
      break;
    case 17:
      clib_memcpy16 (dst, src);
      clib_memcpy1 (dst + 16, src + 16);
      break;
    case 18:
      clib_memcpy16 (dst, src);
      clib_memcpy2 (dst + 16, src + 16);
      break;
    case 20:
      clib_memcpy16 (dst, src);
      clib_memcpy4 (dst + 16, src + 16);
      break;
    case 24:
      clib_memcpy16 (dst, src);
      clib_memcpy8 (dst + 16, src + 16);
      break;
    default:
      clib_memcpy16 (dst, src);
      clib_memcpy16 (dst + n - 16, src + n - 16);
      break;
    }
}

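/* Constant sizes up to 64 bytes: one 32-byte vector copy plus an
 * overlapping copy for the remainder when AVX2 is available, otherwise
 * 16-byte chunks. */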
static_always_inline void
clib_memcpy_const_le64 (u8 *dst, u8 *src, size_t n)
{
  if (n < 32)
    {
      clib_memcpy_const_le32 (dst, src, n);
      return;
    }

#if defined(CLIB_HAVE_VEC256)
  switch (n)
    {
    case 32:
      clib_memcpy32 (dst, src);
      break;
    case 33:
      clib_memcpy32 (dst, src);
      clib_memcpy1 (dst + 32, src + 32);
      break;
    case 34:
      clib_memcpy32 (dst, src);
      clib_memcpy2 (dst + 32, src + 32);
      break;
    case 36:
      clib_memcpy32 (dst, src);
      clib_memcpy4 (dst + 32, src + 32);
      break;
    case 40:
      clib_memcpy32 (dst, src);
      clib_memcpy8 (dst + 32, src + 32);
      break;
    case 48:
      clib_memcpy32 (dst, src);
      clib_memcpy16 (dst + 32, src + 32);
      break;
    default:
      clib_memcpy32 (dst, src);
      clib_memcpy32 (dst + n - 32, src + n - 32);
      break;
    }
#else
  while (n > 32)
    {
      clib_memcpy16 (dst, src);
      clib_memcpy16 (dst + 16, src + 16);
      dst += 32;
      src += 32;
      n -= 32;
    }
  clib_memcpy_const_le32 (dst, src, n);
#endif
}

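/* Dispatcher for compile-time-constant n of any size: reduce with
 * full-width vector copies, then finish with the fixed-size helpers. */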
static_always_inline void
clib_memcpy_x86_64_const (u8 *dst, u8 *src, size_t n)
{
#if defined(CLIB_HAVE_VEC512)
  while (n > 128)
    {
      clib_memcpy64 (dst, src);
      dst += 64;
      src += 64;
      n -= 64;
    }

  if (n < 64)
    {
      clib_memcpy_const_le64 (dst, src, n);
      return;
    }

  switch (n)
    {
    case 64:
      clib_memcpy64 (dst, src);
      break;
    case 65:
      clib_memcpy64 (dst, src);
      clib_memcpy1 (dst + 64, src + 64);
      break;
    case 66:
      clib_memcpy64 (dst, src);
      clib_memcpy2 (dst + 64, src + 64);
      break;
    case 68:
      clib_memcpy64 (dst, src);
      clib_memcpy4 (dst + 64, src + 64);
      break;
    case 72:
      clib_memcpy64 (dst, src);
      clib_memcpy8 (dst + 64, src + 64);
      break;
    case 80:
      clib_memcpy64 (dst, src);
      clib_memcpy16 (dst + 64, src + 64);
      break;
    case 96:
      clib_memcpy64 (dst, src);
      clib_memcpy32 (dst + 64, src + 64);
      break;
    default:
      clib_memcpy64 (dst, src);
      clib_memcpy64 (dst + n - 64, src + n - 64);
      break;
    }
#elif defined(CLIB_HAVE_VEC256)
  while (n > 64)
    {
      clib_memcpy32 (dst, src);
      dst += 32;
      src += 32;
      n -= 32;
    }
  clib_memcpy_const_le64 (dst, src, n);
#else
  while (n > 32)
    {
      clib_memcpy16 (dst, src);
      dst += 16;
      src += 16;
      n -= 16;
    }
  clib_memcpy_const_le32 (dst, src, n);
#endif
}

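/* Runtime-sized copy, presumably the clib_memcpy_fast () backend on
 * x86_64. Strategy: tiny copies use a masked store or two overlapping
 * copies; everything else copies an unaligned head and tail and runs a
 * destination-aligned unrolled loop over the middle. */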
static_always_inline void *
clib_memcpy_x86_64 (void *restrict dst, const void *restrict src, size_t n)
{
  u8 *d = (u8 *) dst, *s = (u8 *) src;

  if (n == 0)
    return dst;

  if (COMPILE_TIME_CONST (n))
    {
      clib_memcpy_x86_64_const (d, s, n);
      return dst;
    }

  if (n <= 32)
    {
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
      u32 mask = pow2_mask (n);
      u8x32_mask_store (u8x32_mask_load_zero (s, mask), d, mask);
#else
      if (PREDICT_TRUE (n >= 16))
	{
	  clib_memcpy16 (d, s);
	  clib_memcpy16 (d + n - 16, s + n - 16);
	}
      else if (PREDICT_TRUE (n >= 8))
	{
	  clib_memcpy8 (d, s);
	  clib_memcpy8 (d + n - 8, s + n - 8);
	}
      else if (PREDICT_TRUE (n >= 4))
	{
	  clib_memcpy4 (d, s);
	  clib_memcpy4 (d + n - 4, s + n - 4);
	}
      else if (PREDICT_TRUE (n > 1))
	{
	  clib_memcpy2 (d, s);
	  clib_memcpy2 (d + n - 2, s + n - 2);
	}
      else
	clib_memcpy1 (d, s);
#endif
    }
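  /* n > 32: pick the widest available vector path. Each path writes an
   * unaligned head, then full-width blocks, and finally an unaligned
   * store that ends exactly at d + n, so no scalar tail loop is needed. */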
#ifdef CLIB_HAVE_VEC512
  else
    {
      u8x64 v0, v1, v2, v3;
      u64 final_off, nr, off = 64;

      if (n <= 64)
	{
	  /* two possibly-overlapping 32-byte copies cover 33-64 bytes */
	  u8x32_store_unaligned (u8x32_load_unaligned (s), d);
	  u8x32_store_unaligned (u8x32_load_unaligned (s + n - 32),
				 d + n - 32);
	  return dst;
	}
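
      /* copy the first 64 bytes unconditionally; the store at done:
       * always covers the last 64, so only the middle remains */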
      u8x64_store_unaligned (u8x64_load_unaligned (s), d);

      if (n <= 128)
	goto done;

      /* 64-byte rounds left after head and tail; the remainder section
	 below handles up to 256 + 128 + 64 = 448 bytes, so with the
	 64-byte head and tail, 576 is the cutoff for skipping the
	 aligned main loop */
      nr = round_pow2 (n - 128, 64);

      if (n <= 576)
	goto last;

      /* align destination to 64 bytes; the head store above already
	 covered the bytes this skips back over */
      off -= ((u64) d) & 0x3f;
      nr = round_pow2 (n - off - 64, 64);
      final_off = (nr & ~(u64) 0x1ff) + off;

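      /* main loop: 8 x 64 = 512 bytes per iteration, destination aligned */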
    more:
      v0 = u8x64_load_unaligned (s + off + 0x000);
      v1 = u8x64_load_unaligned (s + off + 0x040);
      v2 = u8x64_load_unaligned (s + off + 0x080);
      v3 = u8x64_load_unaligned (s + off + 0x0c0);
      u8x64_store_unaligned (v0, d + off + 0x000);
      u8x64_store_unaligned (v1, d + off + 0x040);
      u8x64_store_unaligned (v2, d + off + 0x080);
      u8x64_store_unaligned (v3, d + off + 0x0c0);
      v0 = u8x64_load_unaligned (s + off + 0x100);
      v1 = u8x64_load_unaligned (s + off + 0x140);
      v2 = u8x64_load_unaligned (s + off + 0x180);
      v3 = u8x64_load_unaligned (s + off + 0x1c0);
      u8x64_store_unaligned (v0, d + off + 0x100);
      u8x64_store_unaligned (v1, d + off + 0x140);
      u8x64_store_unaligned (v2, d + off + 0x180);
      u8x64_store_unaligned (v3, d + off + 0x1c0);
      off += 512;

      if (off != final_off)
	goto more;

      if ((nr & 0x1ff) == 0)
	goto done;

    last:
      /* remaining 64-448 bytes, largest chunks first */
      if (PREDICT_TRUE (nr & 256))
	{
	  v0 = u8x64_load_unaligned (s + off + 0x000);
	  v1 = u8x64_load_unaligned (s + off + 0x040);
	  v2 = u8x64_load_unaligned (s + off + 0x080);
	  v3 = u8x64_load_unaligned (s + off + 0x0c0);
	  u8x64_store_unaligned (v0, d + off + 0x000);
	  u8x64_store_unaligned (v1, d + off + 0x040);
	  u8x64_store_unaligned (v2, d + off + 0x080);
	  u8x64_store_unaligned (v3, d + off + 0x0c0);
	  off += 256;
	}

      if (PREDICT_TRUE (nr & 128))
	{
	  v0 = u8x64_load_unaligned (s + off + 0x000);
	  v1 = u8x64_load_unaligned (s + off + 0x040);
	  u8x64_store_unaligned (v0, d + off + 0x000);
	  u8x64_store_unaligned (v1, d + off + 0x040);
	  off += 128;
	}

      if (PREDICT_TRUE (nr & 64))
	u8x64_store_unaligned (u8x64_load_unaligned (s + off), d + off);

    done:
      u8x64_store_unaligned (u8x64_load_unaligned (s + n - 64), d + n - 64);
    }
#elif defined(CLIB_HAVE_VEC256)
  else
    {
      u8x32 v0, v1, v2, v3;
      u64 final_off, nr, off = 32;

      u8x32_store_unaligned (u8x32_load_unaligned (s), d);

      if (n <= 64)
	goto done;
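
      /* head and tail stores cover 32 bytes each; choose between the
       * short remainder path and the destination-aligned main loop */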
      /* remainder section handles up to 128 + 64 + 32 = 224 bytes, so
	 32 + 224 + 32 = 288 is the main-loop cutoff */
      nr = round_pow2 (n - 64, 32);

      if (n <= 288)
	goto last;

      off -= ((u64) d) & 0x1f;
      nr = round_pow2 (n - off - 32, 32);
      final_off = (nr & ~(u64) 0xff) + off;

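      /* main loop: 8 x 32 = 256 bytes per iteration, destination aligned */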
    more:
      v0 = u8x32_load_unaligned (s + off + 0x00);
      v1 = u8x32_load_unaligned (s + off + 0x20);
      v2 = u8x32_load_unaligned (s + off + 0x40);
      v3 = u8x32_load_unaligned (s + off + 0x60);
      u8x32_store_unaligned (v0, d + off + 0x00);
      u8x32_store_unaligned (v1, d + off + 0x20);
      u8x32_store_unaligned (v2, d + off + 0x40);
      u8x32_store_unaligned (v3, d + off + 0x60);
      v0 = u8x32_load_unaligned (s + off + 0x80);
      v1 = u8x32_load_unaligned (s + off + 0xa0);
      v2 = u8x32_load_unaligned (s + off + 0xc0);
      v3 = u8x32_load_unaligned (s + off + 0xe0);
      u8x32_store_unaligned (v0, d + off + 0x80);
      u8x32_store_unaligned (v1, d + off + 0xa0);
      u8x32_store_unaligned (v2, d + off + 0xc0);
      u8x32_store_unaligned (v3, d + off + 0xe0);
      off += 256;

      if (off != final_off)
	goto more;

      if ((nr & 0xff) == 0)
	goto done;

    last:
      if (PREDICT_TRUE (nr & 128))
	{
	  v0 = u8x32_load_unaligned (s + off + 0x00);
	  v1 = u8x32_load_unaligned (s + off + 0x20);
	  v2 = u8x32_load_unaligned (s + off + 0x40);
	  v3 = u8x32_load_unaligned (s + off + 0x60);
	  u8x32_store_unaligned (v0, d + off + 0x00);
	  u8x32_store_unaligned (v1, d + off + 0x20);
	  u8x32_store_unaligned (v2, d + off + 0x40);
	  u8x32_store_unaligned (v3, d + off + 0x60);
	  off += 128;
	}

      if (PREDICT_TRUE (nr & 64))
	{
	  v0 = u8x32_load_unaligned (s + off + 0x00);
	  v1 = u8x32_load_unaligned (s + off + 0x20);
	  u8x32_store_unaligned (v0, d + off + 0x00);
	  u8x32_store_unaligned (v1, d + off + 0x20);
	  off += 64;
	}

      if (PREDICT_TRUE (nr & 32))
	u8x32_store_unaligned (u8x32_load_unaligned (s + off), d + off);

    done:
      u8x32_store_unaligned (u8x32_load_unaligned (s + n - 32), d + n - 32);
    }
#elif defined(CLIB_HAVE_VEC128)
  else
    {
      u8x16 v0, v1, v2, v3;
      u64 final_off, nr, off = 32;

      /* SSE has no masked load/store; for 33-47 bytes, deferring to the
	 compiler builtin is simpler than a chain of overlapping copies */
      if (n < 48)
	{
	  __builtin_memcpy (d, s, n);
	  return dst;
	}

      /* first 32 bytes */
      u8x16_store_unaligned (u8x16_load_unaligned (s), d);
      u8x16_store_unaligned (u8x16_load_unaligned (s + 16), d + 16);
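
      /* head (32) plus the tail store at done: (16) already cover
       * n == 48; anything longer goes through the remainder section or
       * the aligned main loop */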
      /* remainder section handles up to 128 + 64 + 32 + 16 = 240 bytes,
	 so 32 + 240 + 16 = 288 is the main-loop cutoff */
      nr = round_pow2 (n - 48, 16);

      if (n <= 288)
	goto last;

      off -= ((u64) d) & 0x0f;
      nr = round_pow2 (n - off - 16, 16);
      final_off = (nr & ~(u64) 0xff) + off;

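      /* main loop: 16 x 16 = 256 bytes per iteration, destination aligned */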
    more:
      v0 = u8x16_load_unaligned (s + off + 0x00);
      v1 = u8x16_load_unaligned (s + off + 0x10);
      v2 = u8x16_load_unaligned (s + off + 0x20);
      v3 = u8x16_load_unaligned (s + off + 0x30);
      u8x16_store_unaligned (v0, d + off + 0x00);
      u8x16_store_unaligned (v1, d + off + 0x10);
      u8x16_store_unaligned (v2, d + off + 0x20);
      u8x16_store_unaligned (v3, d + off + 0x30);
      v0 = u8x16_load_unaligned (s + off + 0x40);
      v1 = u8x16_load_unaligned (s + off + 0x50);
      v2 = u8x16_load_unaligned (s + off + 0x60);
      v3 = u8x16_load_unaligned (s + off + 0x70);
      u8x16_store_unaligned (v0, d + off + 0x40);
      u8x16_store_unaligned (v1, d + off + 0x50);
      u8x16_store_unaligned (v2, d + off + 0x60);
      u8x16_store_unaligned (v3, d + off + 0x70);
      v0 = u8x16_load_unaligned (s + off + 0x80);
      v1 = u8x16_load_unaligned (s + off + 0x90);
      v2 = u8x16_load_unaligned (s + off + 0xa0);
      v3 = u8x16_load_unaligned (s + off + 0xb0);
      u8x16_store_unaligned (v0, d + off + 0x80);
      u8x16_store_unaligned (v1, d + off + 0x90);
      u8x16_store_unaligned (v2, d + off + 0xa0);
      u8x16_store_unaligned (v3, d + off + 0xb0);
      v0 = u8x16_load_unaligned (s + off + 0xc0);
      v1 = u8x16_load_unaligned (s + off + 0xd0);
      v2 = u8x16_load_unaligned (s + off + 0xe0);
      v3 = u8x16_load_unaligned (s + off + 0xf0);
      u8x16_store_unaligned (v0, d + off + 0xc0);
      u8x16_store_unaligned (v1, d + off + 0xd0);
      u8x16_store_unaligned (v2, d + off + 0xe0);
      u8x16_store_unaligned (v3, d + off + 0xf0);
      off += 256;

      if (off != final_off)
	goto more;

      if ((nr & 0xff) == 0)
	goto done;

    last:
      if (PREDICT_TRUE (nr & 128))
	{
	  v0 = u8x16_load_unaligned (s + off + 0x00);
	  v1 = u8x16_load_unaligned (s + off + 0x10);
	  v2 = u8x16_load_unaligned (s + off + 0x20);
	  v3 = u8x16_load_unaligned (s + off + 0x30);
	  u8x16_store_unaligned (v0, d + off + 0x00);
	  u8x16_store_unaligned (v1, d + off + 0x10);
	  u8x16_store_unaligned (v2, d + off + 0x20);
	  u8x16_store_unaligned (v3, d + off + 0x30);
	  v0 = u8x16_load_unaligned (s + off + 0x40);
	  v1 = u8x16_load_unaligned (s + off + 0x50);
	  v2 = u8x16_load_unaligned (s + off + 0x60);
	  v3 = u8x16_load_unaligned (s + off + 0x70);
	  u8x16_store_unaligned (v0, d + off + 0x40);
	  u8x16_store_unaligned (v1, d + off + 0x50);
	  u8x16_store_unaligned (v2, d + off + 0x60);
	  u8x16_store_unaligned (v3, d + off + 0x70);
	  off += 128;
	}

      if (PREDICT_TRUE (nr & 64))
	{
	  v0 = u8x16_load_unaligned (s + off + 0x00);
	  v1 = u8x16_load_unaligned (s + off + 0x10);
	  v2 = u8x16_load_unaligned (s + off + 0x20);
	  v3 = u8x16_load_unaligned (s + off + 0x30);
	  u8x16_store_unaligned (v0, d + off + 0x00);
	  u8x16_store_unaligned (v1, d + off + 0x10);
	  u8x16_store_unaligned (v2, d + off + 0x20);
	  u8x16_store_unaligned (v3, d + off + 0x30);
	  off += 64;
	}

      if (PREDICT_TRUE (nr & 32))
	{
	  v0 = u8x16_load_unaligned (s + off + 0x00);
	  v1 = u8x16_load_unaligned (s + off + 0x10);
	  u8x16_store_unaligned (v0, d + off + 0x00);
	  u8x16_store_unaligned (v1, d + off + 0x10);
	  off += 32;
	}

      if (PREDICT_TRUE (nr & 16))
	u8x16_store_unaligned (u8x16_load_unaligned (s + off), d + off);

    done:
      u8x16_store_unaligned (u8x16_load_unaligned (s + n - 16), d + n - 16);
    }
#else
  else
    __builtin_memcpy (dst, src, n);
#endif

  return dst;
}

/* clang-format off */
WARN_ON (stringop-overflow)
/* clang-format on */

#endif /* included_clib_memcpy_x86_64_h */