lib/librte_eal/common/include/arch/x86/rte_memcpy.h

   1 /*-
   2  *   BSD LICENSE
   3  *
   4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
   5  *   All rights reserved.
   6  *
   7  *   Redistribution and use in source and binary forms, with or without
   8  *   modification, are permitted provided that the following conditions
   9  *   are met:
  10  *
  11  *     * Redistributions of source code must retain the above copyright
  12  *       notice, this list of conditions and the following disclaimer.
  13  *     * Redistributions in binary form must reproduce the above copyright
  14  *       notice, this list of conditions and the following disclaimer in
  15  *       the documentation and/or other materials provided with the
  16  *       distribution.
  17  *     * Neither the name of Intel Corporation nor the names of its
  18  *       contributors may be used to endorse or promote products derived
  19  *       from this software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  */
  33
  34 #ifndef _RTE_MEMCPY_X86_64_H_
  35 #define _RTE_MEMCPY_X86_64_H_
  36
  37 /**
  38  * @file
  39  *
  40  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  41  */
  42
  43 #include <stdio.h>
  44 #include <stdint.h>
  45 #include <string.h>
  46 #include <rte_vect.h>
  47
  48 #ifdef __cplusplus
  49 extern "C" {
  50 #endif
  51
  52 /**
  53  * Copy bytes from one location to another. The locations must not overlap.
  54  *
  55  * @note This is implemented as a macro, so it's address should not be taken
  56  * and care is needed as parameter expressions may be evaluated multiple times.
  57  *
  58  * @param dst
  59  *   Pointer to the destination of the data.
  60  * @param src
  61  *   Pointer to the source data.
  62  * @param n
  63  *   Number of bytes to copy.
  64  * @return
  65  *   Pointer to the destination data.
  66  */
  67 static inline void *
  68 rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline));
  69
  70 #ifdef RTE_MACHINE_CPUFLAG_AVX512F
  71
  72 /**
  73  * AVX512 implementation below
  74  */
  75
  76 /**
  77  * Copy 16 bytes from one location to another,
  78  * locations should not overlap.
  79  */
  80 static inline void
  81 rte_mov16(uint8_t *dst, const uint8_t *src)
  82 {
  83         __m128i xmm0;
  84
  85         xmm0 = _mm_loadu_si128((const __m128i *)src);
  86         _mm_storeu_si128((__m128i *)dst, xmm0);
  87 }
  88
  89 /**
  90  * Copy 32 bytes from one location to another,
  91  * locations should not overlap.
  92  */
  93 static inline void
  94 rte_mov32(uint8_t *dst, const uint8_t *src)
  95 {
  96         __m256i ymm0;
  97
  98         ymm0 = _mm256_loadu_si256((const __m256i *)src);
  99         _mm256_storeu_si256((__m256i *)dst, ymm0);
 100 }
 101
 102 /**
 103  * Copy 64 bytes from one location to another,
 104  * locations should not overlap.
 105  */
 106 static inline void
 107 rte_mov64(uint8_t *dst, const uint8_t *src)
 108 {
 109         __m512i zmm0;
 110
 111         zmm0 = _mm512_loadu_si512((const void *)src);
 112         _mm512_storeu_si512((void *)dst, zmm0);
 113 }
 114
 115 /**
 116  * Copy 128 bytes from one location to another,
 117  * locations should not overlap.
 118  */
 119 static inline void
 120 rte_mov128(uint8_t *dst, const uint8_t *src)
 121 {
 122         rte_mov64(dst + 0 * 64, src + 0 * 64);
 123         rte_mov64(dst + 1 * 64, src + 1 * 64);
 124 }
 125
 126 /**
 127  * Copy 256 bytes from one location to another,
 128  * locations should not overlap.
 129  */
 130 static inline void
 131 rte_mov256(uint8_t *dst, const uint8_t *src)
 132 {
 133         rte_mov64(dst + 0 * 64, src + 0 * 64);
 134         rte_mov64(dst + 1 * 64, src + 1 * 64);
 135         rte_mov64(dst + 2 * 64, src + 2 * 64);
 136         rte_mov64(dst + 3 * 64, src + 3 * 64);
 137 }
 138
 139 /**
 140  * Copy 128-byte blocks from one location to another,
 141  * locations should not overlap.
 142  */
 143 static inline void
 144 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 145 {
 146         __m512i zmm0, zmm1;
 147
 148         while (n >= 128) {
 149                 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 150                 n -= 128;
 151                 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 152                 src = src + 128;
 153                 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 154                 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 155                 dst = dst + 128;
 156         }
 157 }
 158
 159 /**
 160  * Copy 512-byte blocks from one location to another,
 161  * locations should not overlap.
 162  */
 163 static inline void
 164 rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 165 {
 166         __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 167
 168         while (n >= 512) {
 169                 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 170                 n -= 512;
 171                 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 172                 zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
 173                 zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
 174                 zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
 175                 zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
 176                 zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
 177                 zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
 178                 src = src + 512;
 179                 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 180                 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 181                 _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
 182                 _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
 183                 _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
 184                 _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
 185                 _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
 186                 _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
 187                 dst = dst + 512;
 188         }
 189 }
 190
 191 static inline void *
 192 rte_memcpy(void *dst, const void *src, size_t n)
 193 {
 194         uintptr_t dstu = (uintptr_t)dst;
 195         uintptr_t srcu = (uintptr_t)src;
 196         void *ret = dst;
 197         size_t dstofss;
 198         size_t bits;
 199
 200         /**
 201          * Copy less than 16 bytes
 202          */
 203         if (n < 16) {
 204                 if (n & 0x01) {
 205                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 206                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 207                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 208                 }
 209                 if (n & 0x02) {
 210                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 211                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 212                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 213                 }
 214                 if (n & 0x04) {
 215                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 216                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 217                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 218                 }
 219                 if (n & 0x08)
 220                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 221                 return ret;
 222         }
 223
 224         /**
 225          * Fast way when copy size doesn't exceed 512 bytes
 226          */
 227         if (n <= 32) {
 228                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 229                 rte_mov16((uint8_t *)dst - 16 + n,
 230                                   (const uint8_t *)src - 16 + n);
 231                 return ret;
 232         }
 233         if (n <= 64) {
 234                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 235                 rte_mov32((uint8_t *)dst - 32 + n,
 236                                   (const uint8_t *)src - 32 + n);
 237                 return ret;
 238         }
 239         if (n <= 512) {
 240                 if (n >= 256) {
 241                         n -= 256;
 242                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 243                         src = (const uint8_t *)src + 256;
 244                         dst = (uint8_t *)dst + 256;
 245                 }
 246                 if (n >= 128) {
 247                         n -= 128;
 248                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 249                         src = (const uint8_t *)src + 128;
 250                         dst = (uint8_t *)dst + 128;
 251                 }
 252 COPY_BLOCK_128_BACK63:
 253                 if (n > 64) {
 254                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 255                         rte_mov64((uint8_t *)dst - 64 + n,
 256                                           (const uint8_t *)src - 64 + n);
 257                         return ret;
 258                 }
 259                 if (n > 0)
 260                         rte_mov64((uint8_t *)dst - 64 + n,
 261                                           (const uint8_t *)src - 64 + n);
 262                 return ret;
 263         }
 264
 265         /**
 266          * Make store aligned when copy size exceeds 512 bytes
 267          */
 268         dstofss = ((uintptr_t)dst & 0x3F);
 269         if (dstofss > 0) {
 270                 dstofss = 64 - dstofss;
 271                 n -= dstofss;
 272                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 273                 src = (const uint8_t *)src + dstofss;
 274                 dst = (uint8_t *)dst + dstofss;
 275         }
 276
 277         /**
 278          * Copy 512-byte blocks.
 279          * Use copy block function for better instruction order control,
 280          * which is important when load is unaligned.
 281          */
 282         rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
 283         bits = n;
 284         n = n & 511;
 285         bits -= n;
 286         src = (const uint8_t *)src + bits;
 287         dst = (uint8_t *)dst + bits;
 288
 289         /**
 290          * Copy 128-byte blocks.
 291          * Use copy block function for better instruction order control,
 292          * which is important when load is unaligned.
 293          */
 294         if (n >= 128) {
 295                 rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 296                 bits = n;
 297                 n = n & 127;
 298                 bits -= n;
 299                 src = (const uint8_t *)src + bits;
 300                 dst = (uint8_t *)dst + bits;
 301         }
 302
 303         /**
 304          * Copy whatever left
 305          */
 306         goto COPY_BLOCK_128_BACK63;
 307 }
 308
 309 #elif defined RTE_MACHINE_CPUFLAG_AVX2
 310
 311 /**
 312  * AVX2 implementation below
 313  */
 314
 315 /**
 316  * Copy 16 bytes from one location to another,
 317  * locations should not overlap.
 318  */
 319 static inline void
 320 rte_mov16(uint8_t *dst, const uint8_t *src)
 321 {
 322         __m128i xmm0;
 323
 324         xmm0 = _mm_loadu_si128((const __m128i *)src);
 325         _mm_storeu_si128((__m128i *)dst, xmm0);
 326 }
 327
 328 /**
 329  * Copy 32 bytes from one location to another,
 330  * locations should not overlap.
 331  */
 332 static inline void
 333 rte_mov32(uint8_t *dst, const uint8_t *src)
 334 {
 335         __m256i ymm0;
 336
 337         ymm0 = _mm256_loadu_si256((const __m256i *)src);
 338         _mm256_storeu_si256((__m256i *)dst, ymm0);
 339 }
 340
 341 /**
 342  * Copy 64 bytes from one location to another,
 343  * locations should not overlap.
 344  */
 345 static inline void
 346 rte_mov64(uint8_t *dst, const uint8_t *src)
 347 {
 348         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 349         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 350 }
 351
 352 /**
 353  * Copy 128 bytes from one location to another,
 354  * locations should not overlap.
 355  */
 356 static inline void
 357 rte_mov128(uint8_t *dst, const uint8_t *src)
 358 {
 359         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 360         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 361         rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
 362         rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
 363 }
 364
 365 /**
 366  * Copy 256 bytes from one location to another,
 367  * locations should not overlap.
 368  */
 369 static inline void
 370 rte_mov256(uint8_t *dst, const uint8_t *src)
 371 {
 372         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 373         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 374         rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
 375         rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
 376         rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
 377         rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
 378         rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
 379         rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
 380 }
 381
 382 /**
 383  * Copy 64-byte blocks from one location to another,
 384  * locations should not overlap.
 385  */
 386 static inline void
 387 rte_mov64blocks(uint8_t *dst, const uint8_t *src, size_t n)
 388 {
 389         __m256i ymm0, ymm1;
 390
 391         while (n >= 64) {
 392                 ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
 393                 n -= 64;
 394                 ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
 395                 src = (const uint8_t *)src + 64;
 396                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
 397                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
 398                 dst = (uint8_t *)dst + 64;
 399         }
 400 }
 401
 402 /**
 403  * Copy 256-byte blocks from one location to another,
 404  * locations should not overlap.
 405  */
 406 static inline void
 407 rte_mov256blocks(uint8_t *dst, const uint8_t *src, size_t n)
 408 {
 409         __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
 410
 411         while (n >= 256) {
 412                 ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
 413                 n -= 256;
 414                 ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
 415                 ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
 416                 ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
 417                 ymm4 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 4 * 32));
 418                 ymm5 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 5 * 32));
 419                 ymm6 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 6 * 32));
 420                 ymm7 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 7 * 32));
 421                 src = (const uint8_t *)src + 256;
 422                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
 423                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
 424                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
 425                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
 426                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 4 * 32), ymm4);
 427                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 5 * 32), ymm5);
 428                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 6 * 32), ymm6);
 429                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 7 * 32), ymm7);
 430                 dst = (uint8_t *)dst + 256;
 431         }
 432 }
 433
 434 static inline void *
 435 rte_memcpy(void *dst, const void *src, size_t n)
 436 {
 437         uintptr_t dstu = (uintptr_t)dst;
 438         uintptr_t srcu = (uintptr_t)src;
 439         void *ret = dst;
 440         size_t dstofss;
 441         size_t bits;
 442
 443         /**
 444          * Copy less than 16 bytes
 445          */
 446         if (n < 16) {
 447                 if (n & 0x01) {
 448                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 449                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 450                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 451                 }
 452                 if (n & 0x02) {
 453                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 454                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 455                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 456                 }
 457                 if (n & 0x04) {
 458                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 459                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 460                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 461                 }
 462                 if (n & 0x08) {
 463                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 464                 }
 465                 return ret;
 466         }
 467
 468         /**
 469          * Fast way when copy size doesn't exceed 512 bytes
 470          */
 471         if (n <= 32) {
 472                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 473                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 474                 return ret;
 475         }
 476         if (n <= 64) {
 477                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 478                 rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
 479                 return ret;
 480         }
 481         if (n <= 512) {
 482                 if (n >= 256) {
 483                         n -= 256;
 484                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 485                         src = (const uint8_t *)src + 256;
 486                         dst = (uint8_t *)dst + 256;
 487                 }
 488                 if (n >= 128) {
 489                         n -= 128;
 490                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 491                         src = (const uint8_t *)src + 128;
 492                         dst = (uint8_t *)dst + 128;
 493                 }
 494                 if (n >= 64) {
 495                         n -= 64;
 496                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 497                         src = (const uint8_t *)src + 64;
 498                         dst = (uint8_t *)dst + 64;
 499                 }
 500 COPY_BLOCK_64_BACK31:
 501                 if (n > 32) {
 502                         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 503                         rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
 504                         return ret;
 505                 }
 506                 if (n > 0) {
 507                         rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
 508                 }
 509                 return ret;
 510         }
 511
 512         /**
 513          * Make store aligned when copy size exceeds 512 bytes
 514          */
 515         dstofss = (uintptr_t)dst & 0x1F;
 516         if (dstofss > 0) {
 517                 dstofss = 32 - dstofss;
 518                 n -= dstofss;
 519                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 520                 src = (const uint8_t *)src + dstofss;
 521                 dst = (uint8_t *)dst + dstofss;
 522         }
 523
 524         /**
 525          * Copy 256-byte blocks.
 526          * Use copy block function for better instruction order control,
 527          * which is important when load is unaligned.
 528          */
 529         rte_mov256blocks((uint8_t *)dst, (const uint8_t *)src, n);
 530         bits = n;
 531         n = n & 255;
 532         bits -= n;
 533         src = (const uint8_t *)src + bits;
 534         dst = (uint8_t *)dst + bits;
 535
 536         /**
 537          * Copy 64-byte blocks.
 538          * Use copy block function for better instruction order control,
 539          * which is important when load is unaligned.
 540          */
 541         if (n >= 64) {
 542                 rte_mov64blocks((uint8_t *)dst, (const uint8_t *)src, n);
 543                 bits = n;
 544                 n = n & 63;
 545                 bits -= n;
 546                 src = (const uint8_t *)src + bits;
 547                 dst = (uint8_t *)dst + bits;
 548         }
 549
 550         /**
 551          * Copy whatever left
 552          */
 553         goto COPY_BLOCK_64_BACK31;
 554 }
 555
 556 #else /* RTE_MACHINE_CPUFLAG */
 557
 558 /**
 559  * SSE & AVX implementation below
 560  */
 561
 562 /**
 563  * Copy 16 bytes from one location to another,
 564  * locations should not overlap.
 565  */
 566 static inline void
 567 rte_mov16(uint8_t *dst, const uint8_t *src)
 568 {
 569         __m128i xmm0;
 570
 571         xmm0 = _mm_loadu_si128((const __m128i *)(const __m128i *)src);
 572         _mm_storeu_si128((__m128i *)dst, xmm0);
 573 }
 574
 575 /**
 576  * Copy 32 bytes from one location to another,
 577  * locations should not overlap.
 578  */
 579 static inline void
 580 rte_mov32(uint8_t *dst, const uint8_t *src)
 581 {
 582         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 583         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 584 }
 585
 586 /**
 587  * Copy 64 bytes from one location to another,
 588  * locations should not overlap.
 589  */
 590 static inline void
 591 rte_mov64(uint8_t *dst, const uint8_t *src)
 592 {
 593         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 594         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 595         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 596         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 597 }
 598
 599 /**
 600  * Copy 128 bytes from one location to another,
 601  * locations should not overlap.
 602  */
 603 static inline void
 604 rte_mov128(uint8_t *dst, const uint8_t *src)
 605 {
 606         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 607         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 608         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 609         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 610         rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
 611         rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
 612         rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
 613         rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 614 }
 615
 616 /**
 617  * Copy 256 bytes from one location to another,
 618  * locations should not overlap.
 619  */
 620 static inline void
 621 rte_mov256(uint8_t *dst, const uint8_t *src)
 622 {
 623         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 624         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 625         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 626         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 627         rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
 628         rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
 629         rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
 630         rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 631         rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
 632         rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
 633         rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
 634         rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
 635         rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
 636         rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
 637         rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
 638         rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
 639 }
 640
 641 /**
 642  * Macro for copying unaligned block from one location to another with constant load offset,
 643  * 47 bytes leftover maximum,
 644  * locations should not overlap.
 645  * Requirements:
 646  * - Store is aligned
 647  * - Load offset is <offset>, which must be immediate value within [1, 15]
 648  * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
 649  * - <dst>, <src>, <len> must be variables
 650  * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 651  */
 652 #define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset)                                                     \
 653 ({                                                                                                          \
 654     int tmp;                                                                                                \
 655     while (len >= 128 + 16 - offset) {                                                                      \
 656         xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));                  \
 657         len -= 128;                                                                                         \
 658         xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));                  \
 659         xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));                  \
 660         xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16));                  \
 661         xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16));                  \
 662         xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16));                  \
 663         xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16));                  \
 664         xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16));                  \
 665         xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16));                  \
 666         src = (const uint8_t *)src + 128;                                                                   \
 667         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
 668         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
 669         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
 670         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
 671         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
 672         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
 673         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
 674         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
 675         dst = (uint8_t *)dst + 128;                                                                         \
 676     }                                                                                                       \
 677     tmp = len;                                                                                              \
 678     len = ((len - 16 + offset) & 127) + 16 - offset;                                                        \
 679     tmp -= len;                                                                                             \
 680     src = (const uint8_t *)src + tmp;                                                                       \
 681     dst = (uint8_t *)dst + tmp;                                                                             \
 682     if (len >= 32 + 16 - offset) {                                                                          \
 683         while (len >= 32 + 16 - offset) {                                                                   \
 684             xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));              \
 685             len -= 32;                                                                                      \
 686             xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));              \
 687             xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));              \
 688             src = (const uint8_t *)src + 32;                                                                \
 689             _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
 690             _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
 691             dst = (uint8_t *)dst + 32;                                                                      \
 692         }                                                                                                   \
 693         tmp = len;                                                                                          \
 694         len = ((len - 16 + offset) & 31) + 16 - offset;                                                     \
 695         tmp -= len;                                                                                         \
 696         src = (const uint8_t *)src + tmp;                                                                   \
 697         dst = (uint8_t *)dst + tmp;                                                                         \
 698     }                                                                                                       \
 699 })
 700
 701 /**
 702  * Macro for copying unaligned block from one location to another,
 703  * 47 bytes leftover maximum,
 704  * locations should not overlap.
 705  * Use switch here because the aligning instruction requires immediate value for shift count.
 706  * Requirements:
 707  * - Store is aligned
 708  * - Load offset is <offset>, which must be within [1, 15]
 709  * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
 710  * - <dst>, <src>, <len> must be variables
 711  * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 712  */
 713 #define MOVEUNALIGNED_LEFT47(dst, src, len, offset)                   \
 714 ({                                                                    \
 715     switch (offset) {                                                 \
 716     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
 717     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
 718     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
 719     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
 720     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
 721     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
 722     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
 723     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
 724     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
 725     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
 726     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
 727     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
 728     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
 729     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
 730     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
 731     default:;                                                         \
 732     }                                                                 \
 733 })
 734
 735 static inline void *
 736 rte_memcpy(void *dst, const void *src, size_t n)
 737 {
 738         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 739         uintptr_t dstu = (uintptr_t)dst;
 740         uintptr_t srcu = (uintptr_t)src;
 741         void *ret = dst;
 742         size_t dstofss;
 743         size_t srcofs;
 744
 745         /**
 746          * Copy less than 16 bytes
 747          */
 748         if (n < 16) {
 749                 if (n & 0x01) {
 750                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 751                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 752                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 753                 }
 754                 if (n & 0x02) {
 755                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 756                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 757                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 758                 }
 759                 if (n & 0x04) {
 760                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 761                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 762                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 763                 }
 764                 if (n & 0x08) {
 765                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 766                 }
 767                 return ret;
 768         }
 769
 770         /**
 771          * Fast way when copy size doesn't exceed 512 bytes
 772          */
 773         if (n <= 32) {
 774                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 775                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 776                 return ret;
 777         }
 778         if (n <= 48) {
 779                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 780                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 781                 return ret;
 782         }
 783         if (n <= 64) {
 784                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 785                 rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 786                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 787                 return ret;
 788         }
 789         if (n <= 128) {
 790                 goto COPY_BLOCK_128_BACK15;
 791         }
 792         if (n <= 512) {
 793                 if (n >= 256) {
 794                         n -= 256;
 795                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 796                         rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
 797                         src = (const uint8_t *)src + 256;
 798                         dst = (uint8_t *)dst + 256;
 799                 }
 800 COPY_BLOCK_255_BACK15:
 801                 if (n >= 128) {
 802                         n -= 128;
 803                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 804                         src = (const uint8_t *)src + 128;
 805                         dst = (uint8_t *)dst + 128;
 806                 }
 807 COPY_BLOCK_128_BACK15:
 808                 if (n >= 64) {
 809                         n -= 64;
 810                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 811                         src = (const uint8_t *)src + 64;
 812                         dst = (uint8_t *)dst + 64;
 813                 }
 814 COPY_BLOCK_64_BACK15:
 815                 if (n >= 32) {
 816                         n -= 32;
 817                         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 818                         src = (const uint8_t *)src + 32;
 819                         dst = (uint8_t *)dst + 32;
 820                 }
 821                 if (n > 16) {
 822                         rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 823                         rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 824                         return ret;
 825                 }
 826                 if (n > 0) {
 827                         rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 828                 }
 829                 return ret;
 830         }
 831
 832         /**
 833          * Make store aligned when copy size exceeds 512 bytes,
 834          * and make sure the first 15 bytes are copied, because
 835          * unaligned copy functions require up to 15 bytes
 836          * backwards access.
 837          */
 838         dstofss = (uintptr_t)dst & 0x0F;
 839         if (dstofss > 0) {
 840                 dstofss = 16 - dstofss + 16;
 841                 n -= dstofss;
 842                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 843                 src = (const uint8_t *)src + dstofss;
 844                 dst = (uint8_t *)dst + dstofss;
 845         }
 846         srcofs = ((uintptr_t)src & 0x0F);
 847
 848         /**
 849          * For aligned copy
 850          */
 851         if (srcofs == 0) {
 852                 /**
 853                  * Copy 256-byte blocks
 854                  */
 855                 for (; n >= 256; n -= 256) {
 856                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 857                         dst = (uint8_t *)dst + 256;
 858                         src = (const uint8_t *)src + 256;
 859                 }
 860
 861                 /**
 862                  * Copy whatever left
 863                  */
 864                 goto COPY_BLOCK_255_BACK15;
 865         }
 866
 867         /**
 868          * For copy with unaligned load
 869          */
 870         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
 871
 872         /**
 873          * Copy whatever left
 874          */
 875         goto COPY_BLOCK_64_BACK15;
 876 }
 877
 878 #endif /* RTE_MACHINE_CPUFLAG */
 879
 880 #ifdef __cplusplus
 881 }
 882 #endif
 883
 884 #endif /* _RTE_MEMCPY_X86_64_H_ */