lib/librte_eal/common/include/arch/x86/rte_memcpy.h

   1 /*-
   2  *   BSD LICENSE
   3  *
   4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
   5  *   All rights reserved.
   6  *
   7  *   Redistribution and use in source and binary forms, with or without
   8  *   modification, are permitted provided that the following conditions
   9  *   are met:
  10  *
  11  *     * Redistributions of source code must retain the above copyright
  12  *       notice, this list of conditions and the following disclaimer.
  13  *     * Redistributions in binary form must reproduce the above copyright
  14  *       notice, this list of conditions and the following disclaimer in
  15  *       the documentation and/or other materials provided with the
  16  *       distribution.
  17  *     * Neither the name of Intel Corporation nor the names of its
  18  *       contributors may be used to endorse or promote products derived
  19  *       from this software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  */
  33
  34 #ifndef _RTE_MEMCPY_X86_64_H_
  35 #define _RTE_MEMCPY_X86_64_H_
  36
  37 /**
  38  * @file
  39  *
  40  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  41  */
  42
  43 #include <stdio.h>
  44 #include <stdint.h>
  45 #include <string.h>
  46 #include <rte_vect.h>
  47
  48 #ifdef __cplusplus
  49 extern "C" {
  50 #endif
  51
  52 /**
  53  * Copy bytes from one location to another. The locations must not overlap.
  54  *
  55  * @note This is declared as a static always-inline function, so its address
  56  * should not be taken; arguments are evaluated exactly once, as for any call.
  57  *
  58  * @param dst
  59  *   Pointer to the destination of the data.
  60  * @param src
  61  *   Pointer to the source data.
  62  * @param n
  63  *   Number of bytes to copy.
  64  * @return
  65  *   Pointer to the destination data.
  66  */
  67 static inline void *
  68 rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline));
  69
  70 #ifdef RTE_MACHINE_CPUFLAG_AVX512F
  71
  72 /**
  73  * AVX512 implementation below
  74  */
  75
  76 /**
  77  * Copy 16 bytes from one location to another,
  78  * locations should not overlap.
  79  */
  80 static inline void
  81 rte_mov16(uint8_t *dst, const uint8_t *src)
  82 {
  83         __m128i xmm0;
  84
  85         xmm0 = _mm_loadu_si128((const __m128i *)src);
  86         _mm_storeu_si128((__m128i *)dst, xmm0);
  87 }
  88
  89 /**
  90  * Copy 32 bytes from one location to another,
  91  * locations should not overlap.
  92  */
  93 static inline void
  94 rte_mov32(uint8_t *dst, const uint8_t *src)
  95 {
  96         __m256i ymm0;
  97
  98         ymm0 = _mm256_loadu_si256((const __m256i *)src);
  99         _mm256_storeu_si256((__m256i *)dst, ymm0);
 100 }
 101
 102 /**
 103  * Copy 64 bytes from one location to another,
 104  * locations should not overlap.
 105  */
 106 static inline void
 107 rte_mov64(uint8_t *dst, const uint8_t *src)
 108 {
 109         __m512i zmm0;
 110
 111         zmm0 = _mm512_loadu_si512((const void *)src);
 112         _mm512_storeu_si512((void *)dst, zmm0);
 113 }
 114
 115 /**
 116  * Copy 128 bytes from one location to another,
 117  * locations should not overlap.
 118  */
 119 static inline void
 120 rte_mov128(uint8_t *dst, const uint8_t *src)
 121 {
 122         rte_mov64(dst + 0 * 64, src + 0 * 64);
 123         rte_mov64(dst + 1 * 64, src + 1 * 64);
 124 }
 125
 126 /**
 127  * Copy 256 bytes from one location to another,
 128  * locations should not overlap.
 129  */
 130 static inline void
 131 rte_mov256(uint8_t *dst, const uint8_t *src)
 132 {
 133         rte_mov64(dst + 0 * 64, src + 0 * 64);
 134         rte_mov64(dst + 1 * 64, src + 1 * 64);
 135         rte_mov64(dst + 2 * 64, src + 2 * 64);
 136         rte_mov64(dst + 3 * 64, src + 3 * 64);
 137 }
 138
 139 /**
 140  * Copy 128-byte blocks from one location to another,
 141  * locations should not overlap.
 142  */
 143 static inline void
 144 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 145 {
 146         __m512i zmm0, zmm1;
 147
 148         while (n >= 128) {
 149                 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 150                 n -= 128;
 151                 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 152                 src = src + 128;
 153                 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 154                 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 155                 dst = dst + 128;
 156         }
 157 }
 158
 159 /**
 160  * Copy 512-byte blocks from one location to another,
 161  * locations should not overlap.
 162  */
 163 static inline void
 164 rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 165 {
 166         __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 167
 168         while (n >= 512) {
 169                 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 170                 n -= 512;
 171                 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 172                 zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
 173                 zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
 174                 zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
 175                 zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
 176                 zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
 177                 zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
 178                 src = src + 512;
 179                 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 180                 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 181                 _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
 182                 _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
 183                 _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
 184                 _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
 185                 _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
 186                 _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
 187                 dst = dst + 512;
 188         }
 189 }
 190
 191 static inline void *
 192 rte_memcpy(void *dst, const void *src, size_t n)
 193 {
 194         uintptr_t dstu = (uintptr_t)dst;
 195         uintptr_t srcu = (uintptr_t)src;
 196         void *ret = dst;
 197         size_t dstofss;
 198         size_t bits;
 199
 200         /**
 201          * Copy less than 16 bytes
 202          */
 203         if (n < 16) {
 204                 if (n & 0x01) {
 205                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 206                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 207                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 208                 }
 209                 if (n & 0x02) {
 210                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 211                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 212                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 213                 }
 214                 if (n & 0x04) {
 215                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 216                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 217                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 218                 }
 219                 if (n & 0x08)
 220                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 221                 return ret;
 222         }
 223
 224         /**
 225          * Fast way when copy size doesn't exceed 512 bytes
 226          */
 227         if (n <= 32) {
 228                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 229                 rte_mov16((uint8_t *)dst - 16 + n,
 230                                   (const uint8_t *)src - 16 + n);
 231                 return ret;
 232         }
 233         if (n <= 64) {
 234                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 235                 rte_mov32((uint8_t *)dst - 32 + n,
 236                                   (const uint8_t *)src - 32 + n);
 237                 return ret;
 238         }
 239         if (n <= 512) {
 240                 if (n >= 256) {
 241                         n -= 256;
 242                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 243                         src = (const uint8_t *)src + 256;
 244                         dst = (uint8_t *)dst + 256;
 245                 }
 246                 if (n >= 128) {
 247                         n -= 128;
 248                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 249                         src = (const uint8_t *)src + 128;
 250                         dst = (uint8_t *)dst + 128;
 251                 }
 252 COPY_BLOCK_128_BACK63:
 253                 if (n > 64) {
 254                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 255                         rte_mov64((uint8_t *)dst - 64 + n,
 256                                           (const uint8_t *)src - 64 + n);
 257                         return ret;
 258                 }
 259                 if (n > 0)
 260                         rte_mov64((uint8_t *)dst - 64 + n,
 261                                           (const uint8_t *)src - 64 + n);
 262                 return ret;
 263         }
 264
 265         /**
 266          * Make store aligned when copy size exceeds 512 bytes
 267          */
 268         dstofss = ((uintptr_t)dst & 0x3F);
 269         if (dstofss > 0) {
 270                 dstofss = 64 - dstofss;
 271                 n -= dstofss;
 272                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 273                 src = (const uint8_t *)src + dstofss;
 274                 dst = (uint8_t *)dst + dstofss;
 275         }
 276
 277         /**
 278          * Copy 512-byte blocks.
 279          * Use copy block function for better instruction order control,
 280          * which is important when load is unaligned.
 281          */
 282         rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
 283         bits = n;
 284         n = n & 511;
 285         bits -= n;
 286         src = (const uint8_t *)src + bits;
 287         dst = (uint8_t *)dst + bits;
 288
 289         /**
 290          * Copy 128-byte blocks.
 291          * Use copy block function for better instruction order control,
 292          * which is important when load is unaligned.
 293          */
 294         if (n >= 128) {
 295                 rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 296                 bits = n;
 297                 n = n & 127;
 298                 bits -= n;
 299                 src = (const uint8_t *)src + bits;
 300                 dst = (uint8_t *)dst + bits;
 301         }
 302
 303         /**
 304          * Copy whatever left
 305          */
 306         goto COPY_BLOCK_128_BACK63;
 307 }
 308
 309 #elif defined RTE_MACHINE_CPUFLAG_AVX2
 310
 311 /**
 312  * AVX2 implementation below
 313  */
 314
 315 /**
 316  * Copy 16 bytes from one location to another,
 317  * locations should not overlap.
 318  */
 319 static inline void
 320 rte_mov16(uint8_t *dst, const uint8_t *src)
 321 {
 322         __m128i xmm0;
 323
 324         xmm0 = _mm_loadu_si128((const __m128i *)src);
 325         _mm_storeu_si128((__m128i *)dst, xmm0);
 326 }
 327
 328 /**
 329  * Copy 32 bytes from one location to another,
 330  * locations should not overlap.
 331  */
 332 static inline void
 333 rte_mov32(uint8_t *dst, const uint8_t *src)
 334 {
 335         __m256i ymm0;
 336
 337         ymm0 = _mm256_loadu_si256((const __m256i *)src);
 338         _mm256_storeu_si256((__m256i *)dst, ymm0);
 339 }
 340
 341 /**
 342  * Copy 64 bytes from one location to another,
 343  * locations should not overlap.
 344  */
 345 static inline void
 346 rte_mov64(uint8_t *dst, const uint8_t *src)
 347 {
 348         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 349         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 350 }
 351
 352 /**
 353  * Copy 128 bytes from one location to another,
 354  * locations should not overlap.
 355  */
 356 static inline void
 357 rte_mov128(uint8_t *dst, const uint8_t *src)
 358 {
 359         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 360         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 361         rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
 362         rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
 363 }
 364
 365 /**
 366  * Copy 128-byte blocks from one location to another,
 367  * locations should not overlap.
 368  */
 369 static inline void
 370 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 371 {
 372         __m256i ymm0, ymm1, ymm2, ymm3;
 373
 374         while (n >= 128) {
 375                 ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
 376                 n -= 128;
 377                 ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
 378                 ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
 379                 ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
 380                 src = (const uint8_t *)src + 128;
 381                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
 382                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
 383                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
 384                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
 385                 dst = (uint8_t *)dst + 128;
 386         }
 387 }
 388
 389 static inline void *
 390 rte_memcpy(void *dst, const void *src, size_t n)
 391 {
 392         uintptr_t dstu = (uintptr_t)dst;
 393         uintptr_t srcu = (uintptr_t)src;
 394         void *ret = dst;
 395         size_t dstofss;
 396         size_t bits;
 397
 398         /**
 399          * Copy less than 16 bytes
 400          */
 401         if (n < 16) {
 402                 if (n & 0x01) {
 403                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 404                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 405                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 406                 }
 407                 if (n & 0x02) {
 408                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 409                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 410                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 411                 }
 412                 if (n & 0x04) {
 413                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 414                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 415                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 416                 }
 417                 if (n & 0x08) {
 418                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 419                 }
 420                 return ret;
 421         }
 422
 423         /**
 424          * Fast way when copy size doesn't exceed 256 bytes
 425          */
 426         if (n <= 32) {
 427                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 428                 rte_mov16((uint8_t *)dst - 16 + n,
 429                                 (const uint8_t *)src - 16 + n);
 430                 return ret;
 431         }
 432         if (n <= 48) {
 433                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 434                 rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
 435                 rte_mov16((uint8_t *)dst - 16 + n,
 436                                 (const uint8_t *)src - 16 + n);
 437                 return ret;
 438         }
 439         if (n <= 64) {
 440                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 441                 rte_mov32((uint8_t *)dst - 32 + n,
 442                                 (const uint8_t *)src - 32 + n);
 443                 return ret;
 444         }
 445         if (n <= 256) {
 446                 if (n >= 128) {
 447                         n -= 128;
 448                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 449                         src = (const uint8_t *)src + 128;
 450                         dst = (uint8_t *)dst + 128;
 451                 }
 452 COPY_BLOCK_128_BACK31:
 453                 if (n >= 64) {
 454                         n -= 64;
 455                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 456                         src = (const uint8_t *)src + 64;
 457                         dst = (uint8_t *)dst + 64;
 458                 }
 459                 if (n > 32) {
 460                         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 461                         rte_mov32((uint8_t *)dst - 32 + n,
 462                                         (const uint8_t *)src - 32 + n);
 463                         return ret;
 464                 }
 465                 if (n > 0) {
 466                         rte_mov32((uint8_t *)dst - 32 + n,
 467                                         (const uint8_t *)src - 32 + n);
 468                 }
 469                 return ret;
 470         }
 471
 472         /**
 473          * Make store aligned when copy size exceeds 256 bytes
 474          */
 475         dstofss = (uintptr_t)dst & 0x1F;
 476         if (dstofss > 0) {
 477                 dstofss = 32 - dstofss;
 478                 n -= dstofss;
 479                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 480                 src = (const uint8_t *)src + dstofss;
 481                 dst = (uint8_t *)dst + dstofss;
 482         }
 483
 484         /**
 485          * Copy 128-byte blocks
 486          */
 487         rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 488         bits = n;
 489         n = n & 127;
 490         bits -= n;
 491         src = (const uint8_t *)src + bits;
 492         dst = (uint8_t *)dst + bits;
 493
 494         /**
 495          * Copy whatever left
 496          */
 497         goto COPY_BLOCK_128_BACK31;
 498 }
 499
 500 #else /* RTE_MACHINE_CPUFLAG */
 501
 502 /**
 503  * SSE & AVX implementation below
 504  */
 505
 506 /**
 507  * Copy 16 bytes from one location to another,
 508  * locations should not overlap.
 509  */
 510 static inline void
 511 rte_mov16(uint8_t *dst, const uint8_t *src)
 512 {
 513         __m128i xmm0;
 514
 515         xmm0 = _mm_loadu_si128((const __m128i *)(const __m128i *)src);
 516         _mm_storeu_si128((__m128i *)dst, xmm0);
 517 }
 518
 519 /**
 520  * Copy 32 bytes from one location to another,
 521  * locations should not overlap.
 522  */
 523 static inline void
 524 rte_mov32(uint8_t *dst, const uint8_t *src)
 525 {
 526         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 527         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 528 }
 529
 530 /**
 531  * Copy 64 bytes from one location to another,
 532  * locations should not overlap.
 533  */
 534 static inline void
 535 rte_mov64(uint8_t *dst, const uint8_t *src)
 536 {
 537         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 538         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 539         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 540         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 541 }
 542
 543 /**
 544  * Copy 128 bytes from one location to another,
 545  * locations should not overlap.
 546  */
 547 static inline void
 548 rte_mov128(uint8_t *dst, const uint8_t *src)
 549 {
 550         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 551         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 552         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 553         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 554         rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
 555         rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
 556         rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
 557         rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 558 }
 559
 560 /**
 561  * Copy 256 bytes from one location to another,
 562  * locations should not overlap.
 563  */
 564 static inline void
 565 rte_mov256(uint8_t *dst, const uint8_t *src)
 566 {
 567         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 568         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 569         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 570         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 571         rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
 572         rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
 573         rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
 574         rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 575         rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
 576         rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
 577         rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
 578         rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
 579         rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
 580         rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
 581         rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
 582         rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
 583 }
 584
 585 /**
 586  * Macro for copying unaligned block from one location to another with constant load offset,
 587  * 47 bytes leftover maximum,
 588  * locations should not overlap.
 589  * Requirements:
 590  * - Store is aligned
 591  * - Load offset is <offset>, which must be immediate value within [1, 15]
 592  * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
 593  * - <dst>, <src>, <len> must be variables
 594  * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 595  */
 596 #define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset)                                                     \
 597 __extension__ ({                                                                                            \
 598     int tmp;                                                                                                \
 599     while (len >= 128 + 16 - offset) {                                                                      \
 600         xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));                  \
 601         len -= 128;                                                                                         \
 602         xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));                  \
 603         xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));                  \
 604         xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16));                  \
 605         xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16));                  \
 606         xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16));                  \
 607         xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16));                  \
 608         xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16));                  \
 609         xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16));                  \
 610         src = (const uint8_t *)src + 128;                                                                   \
 611         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
 612         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
 613         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
 614         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
 615         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
 616         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
 617         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
 618         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
 619         dst = (uint8_t *)dst + 128;                                                                         \
 620     }                                                                                                       \
 621     tmp = len;                                                                                              \
 622     len = ((len - 16 + offset) & 127) + 16 - offset;                                                        \
 623     tmp -= len;                                                                                             \
 624     src = (const uint8_t *)src + tmp;                                                                       \
 625     dst = (uint8_t *)dst + tmp;                                                                             \
 626     if (len >= 32 + 16 - offset) {                                                                          \
 627         while (len >= 32 + 16 - offset) {                                                                   \
 628             xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));              \
 629             len -= 32;                                                                                      \
 630             xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));              \
 631             xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));              \
 632             src = (const uint8_t *)src + 32;                                                                \
 633             _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
 634             _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
 635             dst = (uint8_t *)dst + 32;                                                                      \
 636         }                                                                                                   \
 637         tmp = len;                                                                                          \
 638         len = ((len - 16 + offset) & 31) + 16 - offset;                                                     \
 639         tmp -= len;                                                                                         \
 640         src = (const uint8_t *)src + tmp;                                                                   \
 641         dst = (uint8_t *)dst + tmp;                                                                         \
 642     }                                                                                                       \
 643 })
 644
 645 /**
 646  * Macro for copying unaligned block from one location to another,
 647  * 47 bytes leftover maximum,
 648  * locations should not overlap.
 649  * Use switch here because the aligning instruction requires immediate value for shift count.
 650  * Requirements:
 651  * - Store is aligned
 652  * - Load offset is <offset>, which must be within [1, 15]
 653  * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
 654  * - <dst>, <src>, <len> must be variables
 655  * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 656  */
 657 #define MOVEUNALIGNED_LEFT47(dst, src, len, offset)                   \
 658 __extension__ ({                                                      \
 659     switch (offset) {                                                 \
 660     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
 661     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
 662     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
 663     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
 664     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
 665     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
 666     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
 667     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
 668     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
 669     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
 670     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
 671     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
 672     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
 673     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
 674     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
 675     default:;                                                         \
 676     }                                                                 \
 677 })
 678
 679 static inline void *
 680 rte_memcpy(void *dst, const void *src, size_t n)
 681 {
 682         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 683         uintptr_t dstu = (uintptr_t)dst;
 684         uintptr_t srcu = (uintptr_t)src;
 685         void *ret = dst;
 686         size_t dstofss;
 687         size_t srcofs;
 688
 689         /**
 690          * Copy less than 16 bytes
 691          */
 692         if (n < 16) {
 693                 if (n & 0x01) {
 694                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 695                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 696                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 697                 }
 698                 if (n & 0x02) {
 699                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 700                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 701                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 702                 }
 703                 if (n & 0x04) {
 704                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 705                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 706                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 707                 }
 708                 if (n & 0x08) {
 709                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 710                 }
 711                 return ret;
 712         }
 713
 714         /**
 715          * Fast way when copy size doesn't exceed 512 bytes
 716          */
 717         if (n <= 32) {
 718                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 719                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 720                 return ret;
 721         }
 722         if (n <= 48) {
 723                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 724                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 725                 return ret;
 726         }
 727         if (n <= 64) {
 728                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 729                 rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 730                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 731                 return ret;
 732         }
 733         if (n <= 128) {
 734                 goto COPY_BLOCK_128_BACK15;
 735         }
 736         if (n <= 512) {
 737                 if (n >= 256) {
 738                         n -= 256;
 739                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 740                         rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
 741                         src = (const uint8_t *)src + 256;
 742                         dst = (uint8_t *)dst + 256;
 743                 }
 744 COPY_BLOCK_255_BACK15:
 745                 if (n >= 128) {
 746                         n -= 128;
 747                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 748                         src = (const uint8_t *)src + 128;
 749                         dst = (uint8_t *)dst + 128;
 750                 }
 751 COPY_BLOCK_128_BACK15:
 752                 if (n >= 64) {
 753                         n -= 64;
 754                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 755                         src = (const uint8_t *)src + 64;
 756                         dst = (uint8_t *)dst + 64;
 757                 }
 758 COPY_BLOCK_64_BACK15:
 759                 if (n >= 32) {
 760                         n -= 32;
 761                         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 762                         src = (const uint8_t *)src + 32;
 763                         dst = (uint8_t *)dst + 32;
 764                 }
 765                 if (n > 16) {
 766                         rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 767                         rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 768                         return ret;
 769                 }
 770                 if (n > 0) {
 771                         rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 772                 }
 773                 return ret;
 774         }
 775
 776         /**
 777          * Make store aligned when copy size exceeds 512 bytes,
 778          * and make sure the first 15 bytes are copied, because
 779          * unaligned copy functions require up to 15 bytes
 780          * backwards access.
 781          */
 782         dstofss = (uintptr_t)dst & 0x0F;
 783         if (dstofss > 0) {
 784                 dstofss = 16 - dstofss + 16;
 785                 n -= dstofss;
 786                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 787                 src = (const uint8_t *)src + dstofss;
 788                 dst = (uint8_t *)dst + dstofss;
 789         }
 790         srcofs = ((uintptr_t)src & 0x0F);
 791
 792         /**
 793          * For aligned copy
 794          */
 795         if (srcofs == 0) {
 796                 /**
 797                  * Copy 256-byte blocks
 798                  */
 799                 for (; n >= 256; n -= 256) {
 800                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 801                         dst = (uint8_t *)dst + 256;
 802                         src = (const uint8_t *)src + 256;
 803                 }
 804
 805                 /**
 806                  * Copy whatever left
 807                  */
 808                 goto COPY_BLOCK_255_BACK15;
 809         }
 810
 811         /**
 812          * For copy with unaligned load
 813          */
 814         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
 815
 816         /**
 817          * Copy whatever left
 818          */
 819         goto COPY_BLOCK_64_BACK15;
 820 }
 821
 822 #endif /* RTE_MACHINE_CPUFLAG */
 823
 824 #ifdef __cplusplus
 825 }
 826 #endif
 827
 828 #endif /* _RTE_MEMCPY_X86_64_H_ */