lib/librte_eal/common/include/arch/x86/rte_memcpy.h

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright(c) 2010-2014 Intel Corporation
   3  */
   4
   5 #ifndef _RTE_MEMCPY_X86_64_H_
   6 #define _RTE_MEMCPY_X86_64_H_
   7
   8 /**
   9  * @file
  10  *
  11  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  12  */
  13
  14 #include <stdio.h>
  15 #include <stdint.h>
  16 #include <string.h>
  17 #include <rte_vect.h>
  18 #include <rte_common.h>
  19 #include <rte_config.h>
  20
  21 #ifdef __cplusplus
  22 extern "C" {
  23 #endif
  24
  25 /**
  26  * Copy bytes from one location to another. The locations must not overlap.
  27  *
   28  * @note This is implemented as a macro, so its address should not be taken
  29  * and care is needed as parameter expressions may be evaluated multiple times.
  30  *
  31  * @param dst
  32  *   Pointer to the destination of the data.
  33  * @param src
  34  *   Pointer to the source data.
  35  * @param n
  36  *   Number of bytes to copy.
  37  * @return
  38  *   Pointer to the destination data.
  39  */
  40 static __rte_always_inline void *
  41 rte_memcpy(void *dst, const void *src, size_t n);
  42
  43 #ifdef RTE_MACHINE_CPUFLAG_AVX512F
  44
  45 #define ALIGNMENT_MASK 0x3F
  46
  47 /**
  48  * AVX512 implementation below
  49  */
  50
  51 /**
  52  * Copy 16 bytes from one location to another,
  53  * locations should not overlap.
  54  */
  55 static inline void
  56 rte_mov16(uint8_t *dst, const uint8_t *src)
  57 {
  58         __m128i xmm0;
  59
  60         xmm0 = _mm_loadu_si128((const __m128i *)src);
  61         _mm_storeu_si128((__m128i *)dst, xmm0);
  62 }
  63
  64 /**
  65  * Copy 32 bytes from one location to another,
  66  * locations should not overlap.
  67  */
  68 static inline void
  69 rte_mov32(uint8_t *dst, const uint8_t *src)
  70 {
  71         __m256i ymm0;
  72
  73         ymm0 = _mm256_loadu_si256((const __m256i *)src);
  74         _mm256_storeu_si256((__m256i *)dst, ymm0);
  75 }
  76
  77 /**
  78  * Copy 64 bytes from one location to another,
  79  * locations should not overlap.
  80  */
  81 static inline void
  82 rte_mov64(uint8_t *dst, const uint8_t *src)
  83 {
  84         __m512i zmm0;
  85
  86         zmm0 = _mm512_loadu_si512((const void *)src);
  87         _mm512_storeu_si512((void *)dst, zmm0);
  88 }
  89
  90 /**
  91  * Copy 128 bytes from one location to another,
  92  * locations should not overlap.
  93  */
  94 static inline void
  95 rte_mov128(uint8_t *dst, const uint8_t *src)
  96 {
  97         rte_mov64(dst + 0 * 64, src + 0 * 64);
  98         rte_mov64(dst + 1 * 64, src + 1 * 64);
  99 }
 100
 101 /**
 102  * Copy 256 bytes from one location to another,
 103  * locations should not overlap.
 104  */
 105 static inline void
 106 rte_mov256(uint8_t *dst, const uint8_t *src)
 107 {
 108         rte_mov64(dst + 0 * 64, src + 0 * 64);
 109         rte_mov64(dst + 1 * 64, src + 1 * 64);
 110         rte_mov64(dst + 2 * 64, src + 2 * 64);
 111         rte_mov64(dst + 3 * 64, src + 3 * 64);
 112 }
 113
 114 /**
 115  * Copy 128-byte blocks from one location to another,
 116  * locations should not overlap.
 117  */
 118 static inline void
 119 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 120 {
 121         __m512i zmm0, zmm1;
 122
 123         while (n >= 128) {
 124                 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 125                 n -= 128;
 126                 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 127                 src = src + 128;
 128                 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 129                 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 130                 dst = dst + 128;
 131         }
 132 }
 133
 134 /**
 135  * Copy 512-byte blocks from one location to another,
 136  * locations should not overlap.
 137  */
 138 static inline void
 139 rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 140 {
 141         __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 142
 143         while (n >= 512) {
 144                 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 145                 n -= 512;
 146                 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 147                 zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
 148                 zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
 149                 zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
 150                 zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
 151                 zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
 152                 zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
 153                 src = src + 512;
 154                 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 155                 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 156                 _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
 157                 _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
 158                 _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
 159                 _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
 160                 _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
 161                 _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
 162                 dst = dst + 512;
 163         }
 164 }
 165
 166 static inline void *
 167 rte_memcpy_generic(void *dst, const void *src, size_t n)
 168 {
 169         uintptr_t dstu = (uintptr_t)dst;
 170         uintptr_t srcu = (uintptr_t)src;
 171         void *ret = dst;
 172         size_t dstofss;
 173         size_t bits;
 174
 175         /**
 176          * Copy less than 16 bytes
 177          */
 178         if (n < 16) {
 179                 if (n & 0x01) {
 180                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 181                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 182                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 183                 }
 184                 if (n & 0x02) {
 185                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 186                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 187                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 188                 }
 189                 if (n & 0x04) {
 190                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 191                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 192                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 193                 }
 194                 if (n & 0x08)
 195                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 196                 return ret;
 197         }
 198
 199         /**
 200          * Fast way when copy size doesn't exceed 512 bytes
 201          */
 202         if (n <= 32) {
 203                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 204                 rte_mov16((uint8_t *)dst - 16 + n,
 205                                   (const uint8_t *)src - 16 + n);
 206                 return ret;
 207         }
 208         if (n <= 64) {
 209                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 210                 rte_mov32((uint8_t *)dst - 32 + n,
 211                                   (const uint8_t *)src - 32 + n);
 212                 return ret;
 213         }
 214         if (n <= 512) {
 215                 if (n >= 256) {
 216                         n -= 256;
 217                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 218                         src = (const uint8_t *)src + 256;
 219                         dst = (uint8_t *)dst + 256;
 220                 }
 221                 if (n >= 128) {
 222                         n -= 128;
 223                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 224                         src = (const uint8_t *)src + 128;
 225                         dst = (uint8_t *)dst + 128;
 226                 }
 227 COPY_BLOCK_128_BACK63:
 228                 if (n > 64) {
 229                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 230                         rte_mov64((uint8_t *)dst - 64 + n,
 231                                           (const uint8_t *)src - 64 + n);
 232                         return ret;
 233                 }
 234                 if (n > 0)
 235                         rte_mov64((uint8_t *)dst - 64 + n,
 236                                           (const uint8_t *)src - 64 + n);
 237                 return ret;
 238         }
 239
 240         /**
 241          * Make store aligned when copy size exceeds 512 bytes
 242          */
 243         dstofss = ((uintptr_t)dst & 0x3F);
 244         if (dstofss > 0) {
 245                 dstofss = 64 - dstofss;
 246                 n -= dstofss;
 247                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 248                 src = (const uint8_t *)src + dstofss;
 249                 dst = (uint8_t *)dst + dstofss;
 250         }
 251
 252         /**
 253          * Copy 512-byte blocks.
 254          * Use copy block function for better instruction order control,
 255          * which is important when load is unaligned.
 256          */
 257         rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
 258         bits = n;
 259         n = n & 511;
 260         bits -= n;
 261         src = (const uint8_t *)src + bits;
 262         dst = (uint8_t *)dst + bits;
 263
 264         /**
 265          * Copy 128-byte blocks.
 266          * Use copy block function for better instruction order control,
 267          * which is important when load is unaligned.
 268          */
 269         if (n >= 128) {
 270                 rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 271                 bits = n;
 272                 n = n & 127;
 273                 bits -= n;
 274                 src = (const uint8_t *)src + bits;
 275                 dst = (uint8_t *)dst + bits;
 276         }
 277
 278         /**
 279          * Copy whatever left
 280          */
 281         goto COPY_BLOCK_128_BACK63;
 282 }
 283
 284 #elif defined RTE_MACHINE_CPUFLAG_AVX2
 285
 286 #define ALIGNMENT_MASK 0x1F
 287
 288 /**
 289  * AVX2 implementation below
 290  */
 291
 292 /**
 293  * Copy 16 bytes from one location to another,
 294  * locations should not overlap.
 295  */
 296 static inline void
 297 rte_mov16(uint8_t *dst, const uint8_t *src)
 298 {
 299         __m128i xmm0;
 300
 301         xmm0 = _mm_loadu_si128((const __m128i *)src);
 302         _mm_storeu_si128((__m128i *)dst, xmm0);
 303 }
 304
 305 /**
 306  * Copy 32 bytes from one location to another,
 307  * locations should not overlap.
 308  */
 309 static inline void
 310 rte_mov32(uint8_t *dst, const uint8_t *src)
 311 {
 312         __m256i ymm0;
 313
 314         ymm0 = _mm256_loadu_si256((const __m256i *)src);
 315         _mm256_storeu_si256((__m256i *)dst, ymm0);
 316 }
 317
 318 /**
 319  * Copy 64 bytes from one location to another,
 320  * locations should not overlap.
 321  */
 322 static inline void
 323 rte_mov64(uint8_t *dst, const uint8_t *src)
 324 {
 325         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 326         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 327 }
 328
 329 /**
 330  * Copy 128 bytes from one location to another,
 331  * locations should not overlap.
 332  */
 333 static inline void
 334 rte_mov128(uint8_t *dst, const uint8_t *src)
 335 {
 336         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 337         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 338         rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
 339         rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
 340 }
 341
 342 /**
 343  * Copy 128-byte blocks from one location to another,
 344  * locations should not overlap.
 345  */
 346 static inline void
 347 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 348 {
 349         __m256i ymm0, ymm1, ymm2, ymm3;
 350
 351         while (n >= 128) {
 352                 ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
 353                 n -= 128;
 354                 ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
 355                 ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
 356                 ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
 357                 src = (const uint8_t *)src + 128;
 358                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
 359                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
 360                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
 361                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
 362                 dst = (uint8_t *)dst + 128;
 363         }
 364 }
 365
 366 static inline void *
 367 rte_memcpy_generic(void *dst, const void *src, size_t n)
 368 {
 369         uintptr_t dstu = (uintptr_t)dst;
 370         uintptr_t srcu = (uintptr_t)src;
 371         void *ret = dst;
 372         size_t dstofss;
 373         size_t bits;
 374
 375         /**
 376          * Copy less than 16 bytes
 377          */
 378         if (n < 16) {
 379                 if (n & 0x01) {
 380                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 381                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 382                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 383                 }
 384                 if (n & 0x02) {
 385                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 386                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 387                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 388                 }
 389                 if (n & 0x04) {
 390                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 391                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 392                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 393                 }
 394                 if (n & 0x08) {
 395                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 396                 }
 397                 return ret;
 398         }
 399
 400         /**
 401          * Fast way when copy size doesn't exceed 256 bytes
 402          */
 403         if (n <= 32) {
 404                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 405                 rte_mov16((uint8_t *)dst - 16 + n,
 406                                 (const uint8_t *)src - 16 + n);
 407                 return ret;
 408         }
 409         if (n <= 48) {
 410                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 411                 rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
 412                 rte_mov16((uint8_t *)dst - 16 + n,
 413                                 (const uint8_t *)src - 16 + n);
 414                 return ret;
 415         }
 416         if (n <= 64) {
 417                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 418                 rte_mov32((uint8_t *)dst - 32 + n,
 419                                 (const uint8_t *)src - 32 + n);
 420                 return ret;
 421         }
 422         if (n <= 256) {
 423                 if (n >= 128) {
 424                         n -= 128;
 425                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 426                         src = (const uint8_t *)src + 128;
 427                         dst = (uint8_t *)dst + 128;
 428                 }
 429 COPY_BLOCK_128_BACK31:
 430                 if (n >= 64) {
 431                         n -= 64;
 432                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 433                         src = (const uint8_t *)src + 64;
 434                         dst = (uint8_t *)dst + 64;
 435                 }
 436                 if (n > 32) {
 437                         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 438                         rte_mov32((uint8_t *)dst - 32 + n,
 439                                         (const uint8_t *)src - 32 + n);
 440                         return ret;
 441                 }
 442                 if (n > 0) {
 443                         rte_mov32((uint8_t *)dst - 32 + n,
 444                                         (const uint8_t *)src - 32 + n);
 445                 }
 446                 return ret;
 447         }
 448
 449         /**
 450          * Make store aligned when copy size exceeds 256 bytes
 451          */
 452         dstofss = (uintptr_t)dst & 0x1F;
 453         if (dstofss > 0) {
 454                 dstofss = 32 - dstofss;
 455                 n -= dstofss;
 456                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 457                 src = (const uint8_t *)src + dstofss;
 458                 dst = (uint8_t *)dst + dstofss;
 459         }
 460
 461         /**
 462          * Copy 128-byte blocks
 463          */
 464         rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 465         bits = n;
 466         n = n & 127;
 467         bits -= n;
 468         src = (const uint8_t *)src + bits;
 469         dst = (uint8_t *)dst + bits;
 470
 471         /**
 472          * Copy whatever left
 473          */
 474         goto COPY_BLOCK_128_BACK31;
 475 }
 476
 477 #else /* RTE_MACHINE_CPUFLAG */
 478
 479 #define ALIGNMENT_MASK 0x0F
 480
 481 /**
 482  * SSE & AVX implementation below
 483  */
 484
 485 /**
 486  * Copy 16 bytes from one location to another,
 487  * locations should not overlap.
 488  */
 489 static inline void
 490 rte_mov16(uint8_t *dst, const uint8_t *src)
 491 {
 492         __m128i xmm0;
 493
 494         xmm0 = _mm_loadu_si128((const __m128i *)(const __m128i *)src);
 495         _mm_storeu_si128((__m128i *)dst, xmm0);
 496 }
 497
 498 /**
 499  * Copy 32 bytes from one location to another,
 500  * locations should not overlap.
 501  */
 502 static inline void
 503 rte_mov32(uint8_t *dst, const uint8_t *src)
 504 {
 505         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 506         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 507 }
 508
 509 /**
 510  * Copy 64 bytes from one location to another,
 511  * locations should not overlap.
 512  */
 513 static inline void
 514 rte_mov64(uint8_t *dst, const uint8_t *src)
 515 {
 516         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 517         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 518         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 519         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 520 }
 521
 522 /**
 523  * Copy 128 bytes from one location to another,
 524  * locations should not overlap.
 525  */
 526 static inline void
 527 rte_mov128(uint8_t *dst, const uint8_t *src)
 528 {
 529         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 530         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 531         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 532         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 533         rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
 534         rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
 535         rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
 536         rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 537 }
 538
 539 /**
 540  * Copy 256 bytes from one location to another,
 541  * locations should not overlap.
 542  */
 543 static inline void
 544 rte_mov256(uint8_t *dst, const uint8_t *src)
 545 {
 546         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 547         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 548         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 549         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 550         rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
 551         rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
 552         rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
 553         rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 554         rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
 555         rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
 556         rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
 557         rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
 558         rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
 559         rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
 560         rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
 561         rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
 562 }
 563
 564 /**
 565  * Macro for copying unaligned block from one location to another with constant load offset,
 566  * 47 bytes leftover maximum,
 567  * locations should not overlap.
 568  * Requirements:
 569  * - Store is aligned
 570  * - Load offset is <offset>, which must be immediate value within [1, 15]
 571  * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
 572  * - <dst>, <src>, <len> must be variables
 573  * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 574  */
 575 #define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset)                                                     \
 576 __extension__ ({                                                                                            \
 577     int tmp;                                                                                                \
 578     while (len >= 128 + 16 - offset) {                                                                      \
 579         xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));                  \
 580         len -= 128;                                                                                         \
 581         xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));                  \
 582         xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));                  \
 583         xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16));                  \
 584         xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16));                  \
 585         xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16));                  \
 586         xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16));                  \
 587         xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16));                  \
 588         xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16));                  \
 589         src = (const uint8_t *)src + 128;                                                                   \
 590         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
 591         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
 592         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
 593         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
 594         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
 595         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
 596         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
 597         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
 598         dst = (uint8_t *)dst + 128;                                                                         \
 599     }                                                                                                       \
 600     tmp = len;                                                                                              \
 601     len = ((len - 16 + offset) & 127) + 16 - offset;                                                        \
 602     tmp -= len;                                                                                             \
 603     src = (const uint8_t *)src + tmp;                                                                       \
 604     dst = (uint8_t *)dst + tmp;                                                                             \
 605     if (len >= 32 + 16 - offset) {                                                                          \
 606         while (len >= 32 + 16 - offset) {                                                                   \
 607             xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));              \
 608             len -= 32;                                                                                      \
 609             xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));              \
 610             xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));              \
 611             src = (const uint8_t *)src + 32;                                                                \
 612             _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
 613             _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
 614             dst = (uint8_t *)dst + 32;                                                                      \
 615         }                                                                                                   \
 616         tmp = len;                                                                                          \
 617         len = ((len - 16 + offset) & 31) + 16 - offset;                                                     \
 618         tmp -= len;                                                                                         \
 619         src = (const uint8_t *)src + tmp;                                                                   \
 620         dst = (uint8_t *)dst + tmp;                                                                         \
 621     }                                                                                                       \
 622 })
 623
 624 /**
 625  * Macro for copying unaligned block from one location to another,
 626  * 47 bytes leftover maximum,
 627  * locations should not overlap.
 628  * Use switch here because the aligning instruction requires immediate value for shift count.
 629  * Requirements:
 630  * - Store is aligned
 631  * - Load offset is <offset>, which must be within [1, 15]
 632  * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
 633  * - <dst>, <src>, <len> must be variables
 634  * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 635  */
 636 #define MOVEUNALIGNED_LEFT47(dst, src, len, offset)                   \
 637 __extension__ ({                                                      \
 638     switch (offset) {                                                 \
 639     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
 640     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
 641     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
 642     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
 643     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
 644     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
 645     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
 646     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
 647     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
 648     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
 649     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
 650     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
 651     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
 652     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
 653     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
 654     default:;                                                         \
 655     }                                                                 \
 656 })
 657
 658 static inline void *
 659 rte_memcpy_generic(void *dst, const void *src, size_t n)
 660 {
 661         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 662         uintptr_t dstu = (uintptr_t)dst;
 663         uintptr_t srcu = (uintptr_t)src;
 664         void *ret = dst;
 665         size_t dstofss;
 666         size_t srcofs;
 667
 668         /**
 669          * Copy less than 16 bytes
 670          */
 671         if (n < 16) {
 672                 if (n & 0x01) {
 673                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 674                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 675                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 676                 }
 677                 if (n & 0x02) {
 678                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 679                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 680                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 681                 }
 682                 if (n & 0x04) {
 683                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 684                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 685                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 686                 }
 687                 if (n & 0x08) {
 688                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 689                 }
 690                 return ret;
 691         }
 692
 693         /**
 694          * Fast way when copy size doesn't exceed 512 bytes
 695          */
 696         if (n <= 32) {
 697                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 698                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 699                 return ret;
 700         }
 701         if (n <= 48) {
 702                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 703                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 704                 return ret;
 705         }
 706         if (n <= 64) {
 707                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 708                 rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 709                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 710                 return ret;
 711         }
 712         if (n <= 128) {
 713                 goto COPY_BLOCK_128_BACK15;
 714         }
 715         if (n <= 512) {
 716                 if (n >= 256) {
 717                         n -= 256;
 718                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 719                         rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
 720                         src = (const uint8_t *)src + 256;
 721                         dst = (uint8_t *)dst + 256;
 722                 }
 723 COPY_BLOCK_255_BACK15:
 724                 if (n >= 128) {
 725                         n -= 128;
 726                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 727                         src = (const uint8_t *)src + 128;
 728                         dst = (uint8_t *)dst + 128;
 729                 }
 730 COPY_BLOCK_128_BACK15:
 731                 if (n >= 64) {
 732                         n -= 64;
 733                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 734                         src = (const uint8_t *)src + 64;
 735                         dst = (uint8_t *)dst + 64;
 736                 }
 737 COPY_BLOCK_64_BACK15:
 738                 if (n >= 32) {
 739                         n -= 32;
 740                         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 741                         src = (const uint8_t *)src + 32;
 742                         dst = (uint8_t *)dst + 32;
 743                 }
 744                 if (n > 16) {
 745                         rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 746                         rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 747                         return ret;
 748                 }
 749                 if (n > 0) {
 750                         rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 751                 }
 752                 return ret;
 753         }
 754
 755         /**
 756          * Make store aligned when copy size exceeds 512 bytes,
 757          * and make sure the first 15 bytes are copied, because
 758          * unaligned copy functions require up to 15 bytes
 759          * backwards access.
 760          */
 761         dstofss = (uintptr_t)dst & 0x0F;
 762         if (dstofss > 0) {
 763                 dstofss = 16 - dstofss + 16;
 764                 n -= dstofss;
 765                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 766                 src = (const uint8_t *)src + dstofss;
 767                 dst = (uint8_t *)dst + dstofss;
 768         }
 769         srcofs = ((uintptr_t)src & 0x0F);
 770
 771         /**
 772          * For aligned copy
 773          */
 774         if (srcofs == 0) {
 775                 /**
 776                  * Copy 256-byte blocks
 777                  */
 778                 for (; n >= 256; n -= 256) {
 779                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 780                         dst = (uint8_t *)dst + 256;
 781                         src = (const uint8_t *)src + 256;
 782                 }
 783
 784                 /**
 785                  * Copy whatever left
 786                  */
 787                 goto COPY_BLOCK_255_BACK15;
 788         }
 789
 790         /**
 791          * For copy with unaligned load
 792          */
 793         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
 794
 795         /**
 796          * Copy whatever left
 797          */
 798         goto COPY_BLOCK_64_BACK15;
 799 }
 800
 801 #endif /* RTE_MACHINE_CPUFLAG */
 802
 803 static inline void *
 804 rte_memcpy_aligned(void *dst, const void *src, size_t n)
 805 {
 806         void *ret = dst;
 807
 808         /* Copy size <= 16 bytes */
 809         if (n < 16) {
 810                 if (n & 0x01) {
 811                         *(uint8_t *)dst = *(const uint8_t *)src;
 812                         src = (const uint8_t *)src + 1;
 813                         dst = (uint8_t *)dst + 1;
 814                 }
 815                 if (n & 0x02) {
 816                         *(uint16_t *)dst = *(const uint16_t *)src;
 817                         src = (const uint16_t *)src + 1;
 818                         dst = (uint16_t *)dst + 1;
 819                 }
 820                 if (n & 0x04) {
 821                         *(uint32_t *)dst = *(const uint32_t *)src;
 822                         src = (const uint32_t *)src + 1;
 823                         dst = (uint32_t *)dst + 1;
 824                 }
 825                 if (n & 0x08)
 826                         *(uint64_t *)dst = *(const uint64_t *)src;
 827
 828                 return ret;
 829         }
 830
 831         /* Copy 16 <= size <= 32 bytes */
 832         if (n <= 32) {
 833                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 834                 rte_mov16((uint8_t *)dst - 16 + n,
 835                                 (const uint8_t *)src - 16 + n);
 836
 837                 return ret;
 838         }
 839
 840         /* Copy 32 < size <= 64 bytes */
 841         if (n <= 64) {
 842                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 843                 rte_mov32((uint8_t *)dst - 32 + n,
 844                                 (const uint8_t *)src - 32 + n);
 845
 846                 return ret;
 847         }
 848
 849         /* Copy 64 bytes blocks */
 850         for (; n >= 64; n -= 64) {
 851                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 852                 dst = (uint8_t *)dst + 64;
 853                 src = (const uint8_t *)src + 64;
 854         }
 855
 856         /* Copy whatever left */
 857         rte_mov64((uint8_t *)dst - 64 + n,
 858                         (const uint8_t *)src - 64 + n);
 859
 860         return ret;
 861 }
 862
 863 static inline void *
 864 rte_memcpy(void *dst, const void *src, size_t n)
 865 {
 866         if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
 867                 return rte_memcpy_aligned(dst, src, n);
 868         else
 869                 return rte_memcpy_generic(dst, src, n);
 870 }
 871
 872 #ifdef __cplusplus
 873 }
 874 #endif
 875
 876 #endif /* _RTE_MEMCPY_X86_64_H_ */