lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright(c) 2015 RehiveTech. All rights reserved.
   3  */
   4
   5 #ifndef _RTE_MEMCPY_ARM32_H_
   6 #define _RTE_MEMCPY_ARM32_H_
   7
   8 #include <stdint.h>
   9 #include <string.h>
  10
  11 #ifdef __cplusplus
  12 extern "C" {
  13 #endif
  14
  15 #include "generic/rte_memcpy.h"
  16
  17 #ifdef RTE_ARCH_ARM_NEON_MEMCPY
  18
  19 #ifndef RTE_MACHINE_CPUFLAG_NEON
  20 #error "Cannot optimize memcpy by NEON as the CPU seems to not support this"
  21 #endif
  22
  23 /* ARM NEON Intrinsics are used to copy data */
  24 #include <arm_neon.h>
  25
  26 static inline void
  27 rte_mov16(uint8_t *dst, const uint8_t *src)
  28 {
  29         vst1q_u8(dst, vld1q_u8(src));
  30 }
  31
  32 static inline void
  33 rte_mov32(uint8_t *dst, const uint8_t *src)
  34 {
  35         asm volatile (
  36                 "vld1.8 {d0-d3}, [%0]\n\t"
  37                 "vst1.8 {d0-d3}, [%1]\n\t"
  38                 : "+r" (src), "+r" (dst)
  39                 : : "memory", "d0", "d1", "d2", "d3");
  40 }
  41
  42 static inline void
  43 rte_mov48(uint8_t *dst, const uint8_t *src)
  44 {
  45         asm volatile (
  46                 "vld1.8 {d0-d3}, [%0]!\n\t"
  47                 "vld1.8 {d4-d5}, [%0]\n\t"
  48                 "vst1.8 {d0-d3}, [%1]!\n\t"
  49                 "vst1.8 {d4-d5}, [%1]\n\t"
  50                 : "+r" (src), "+r" (dst)
  51                 :
  52                 : "memory", "d0", "d1", "d2", "d3", "d4", "d5");
  53 }
  54
  55 static inline void
  56 rte_mov64(uint8_t *dst, const uint8_t *src)
  57 {
  58         asm volatile (
  59                 "vld1.8 {d0-d3}, [%0]!\n\t"
  60                 "vld1.8 {d4-d7}, [%0]\n\t"
  61                 "vst1.8 {d0-d3}, [%1]!\n\t"
  62                 "vst1.8 {d4-d7}, [%1]\n\t"
  63                 : "+r" (src), "+r" (dst)
  64                 :
  65                 : "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7");
  66 }
  67
  68 static inline void
  69 rte_mov128(uint8_t *dst, const uint8_t *src)
  70 {
  71         asm volatile ("pld [%0, #64]" : : "r" (src));
  72         asm volatile (
  73                 "vld1.8 {d0-d3},   [%0]!\n\t"
  74                 "vld1.8 {d4-d7},   [%0]!\n\t"
  75                 "vld1.8 {d8-d11},  [%0]!\n\t"
  76                 "vld1.8 {d12-d15}, [%0]\n\t"
  77                 "vst1.8 {d0-d3},   [%1]!\n\t"
  78                 "vst1.8 {d4-d7},   [%1]!\n\t"
  79                 "vst1.8 {d8-d11},  [%1]!\n\t"
  80                 "vst1.8 {d12-d15}, [%1]\n\t"
  81                 : "+r" (src), "+r" (dst)
  82                 :
  83                 : "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
  84                 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15");
  85 }
  86
  87 static inline void
  88 rte_mov256(uint8_t *dst, const uint8_t *src)
  89 {
  90         asm volatile ("pld [%0,  #64]" : : "r" (src));
  91         asm volatile ("pld [%0, #128]" : : "r" (src));
  92         asm volatile ("pld [%0, #192]" : : "r" (src));
  93         asm volatile ("pld [%0, #256]" : : "r" (src));
  94         asm volatile ("pld [%0, #320]" : : "r" (src));
  95         asm volatile ("pld [%0, #384]" : : "r" (src));
  96         asm volatile ("pld [%0, #448]" : : "r" (src));
  97         asm volatile (
  98                 "vld1.8 {d0-d3},   [%0]!\n\t"
  99                 "vld1.8 {d4-d7},   [%0]!\n\t"
 100                 "vld1.8 {d8-d11},  [%0]!\n\t"
 101                 "vld1.8 {d12-d15}, [%0]!\n\t"
 102                 "vld1.8 {d16-d19}, [%0]!\n\t"
 103                 "vld1.8 {d20-d23}, [%0]!\n\t"
 104                 "vld1.8 {d24-d27}, [%0]!\n\t"
 105                 "vld1.8 {d28-d31}, [%0]\n\t"
 106                 "vst1.8 {d0-d3},   [%1]!\n\t"
 107                 "vst1.8 {d4-d7},   [%1]!\n\t"
 108                 "vst1.8 {d8-d11},  [%1]!\n\t"
 109                 "vst1.8 {d12-d15}, [%1]!\n\t"
 110                 "vst1.8 {d16-d19}, [%1]!\n\t"
 111                 "vst1.8 {d20-d23}, [%1]!\n\t"
 112                 "vst1.8 {d24-d27}, [%1]!\n\t"
 113                 "vst1.8 {d28-d31}, [%1]!\n\t"
 114                 : "+r" (src), "+r" (dst)
 115                 :
 116                 : "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
 117                 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
 118                 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
 119                 "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");
 120 }
 121
 122 #define rte_memcpy(dst, src, n)              \
 123         __extension__ ({                     \
 124         (__builtin_constant_p(n)) ?          \
 125         memcpy((dst), (src), (n)) :          \
 126         rte_memcpy_func((dst), (src), (n)); })
 127
 128 static inline void *
 129 rte_memcpy_func(void *dst, const void *src, size_t n)
 130 {
 131         void *ret = dst;
 132
 133         /* We can't copy < 16 bytes using XMM registers so do it manually. */
 134         if (n < 16) {
 135                 if (n & 0x01) {
 136                         *(uint8_t *)dst = *(const uint8_t *)src;
 137                         dst = (uint8_t *)dst + 1;
 138                         src = (const uint8_t *)src + 1;
 139                 }
 140                 if (n & 0x02) {
 141                         *(uint16_t *)dst = *(const uint16_t *)src;
 142                         dst = (uint16_t *)dst + 1;
 143                         src = (const uint16_t *)src + 1;
 144                 }
 145                 if (n & 0x04) {
 146                         *(uint32_t *)dst = *(const uint32_t *)src;
 147                         dst = (uint32_t *)dst + 1;
 148                         src = (const uint32_t *)src + 1;
 149                 }
 150                 if (n & 0x08) {
 151                         /* ARMv7 can not handle unaligned access to long long
 152                          * (uint64_t). Therefore two uint32_t operations are
 153                          * used.
 154                          */
 155                         *(uint32_t *)dst = *(const uint32_t *)src;
 156                         dst = (uint32_t *)dst + 1;
 157                         src = (const uint32_t *)src + 1;
 158                         *(uint32_t *)dst = *(const uint32_t *)src;
 159                 }
 160                 return ret;
 161         }
 162
 163         /* Special fast cases for <= 128 bytes */
 164         if (n <= 32) {
 165                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 166                 rte_mov16((uint8_t *)dst - 16 + n,
 167                         (const uint8_t *)src - 16 + n);
 168                 return ret;
 169         }
 170
 171         if (n <= 64) {
 172                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 173                 rte_mov32((uint8_t *)dst - 32 + n,
 174                         (const uint8_t *)src - 32 + n);
 175                 return ret;
 176         }
 177
 178         if (n <= 128) {
 179                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 180                 rte_mov64((uint8_t *)dst - 64 + n,
 181                         (const uint8_t *)src - 64 + n);
 182                 return ret;
 183         }
 184
 185         /*
 186          * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
 187          * copies was found to be faster than doing 128 and 32 byte copies as
 188          * well.
 189          */
 190         for ( ; n >= 256; n -= 256) {
 191                 rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 192                 dst = (uint8_t *)dst + 256;
 193                 src = (const uint8_t *)src + 256;
 194         }
 195
 196         /*
 197          * We split the remaining bytes (which will be less than 256) into
 198          * 64byte (2^6) chunks.
 199          * Using incrementing integers in the case labels of a switch statement
 200          * encourages the compiler to use a jump table. To get incrementing
 201          * integers, we shift the 2 relevant bits to the LSB position to first
 202          * get decrementing integers, and then subtract.
 203          */
 204         switch (3 - (n >> 6)) {
 205         case 0x00:
 206                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 207                 n -= 64;
 208                 dst = (uint8_t *)dst + 64;
 209                 src = (const uint8_t *)src + 64;      /* fallthrough */
 210         case 0x01:
 211                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 212                 n -= 64;
 213                 dst = (uint8_t *)dst + 64;
 214                 src = (const uint8_t *)src + 64;      /* fallthrough */
 215         case 0x02:
 216                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 217                 n -= 64;
 218                 dst = (uint8_t *)dst + 64;
 219                 src = (const uint8_t *)src + 64;      /* fallthrough */
 220         default:
 221                 break;
 222         }
 223
 224         /*
 225          * We split the remaining bytes (which will be less than 64) into
 226          * 16byte (2^4) chunks, using the same switch structure as above.
 227          */
 228         switch (3 - (n >> 4)) {
 229         case 0x00:
 230                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 231                 n -= 16;
 232                 dst = (uint8_t *)dst + 16;
 233                 src = (const uint8_t *)src + 16;      /* fallthrough */
 234         case 0x01:
 235                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 236                 n -= 16;
 237                 dst = (uint8_t *)dst + 16;
 238                 src = (const uint8_t *)src + 16;      /* fallthrough */
 239         case 0x02:
 240                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 241                 n -= 16;
 242                 dst = (uint8_t *)dst + 16;
 243                 src = (const uint8_t *)src + 16;      /* fallthrough */
 244         default:
 245                 break;
 246         }
 247
 248         /* Copy any remaining bytes, without going beyond end of buffers */
 249         if (n != 0)
 250                 rte_mov16((uint8_t *)dst - 16 + n,
 251                         (const uint8_t *)src - 16 + n);
 252         return ret;
 253 }
 254
 255 #else
 256
 257 static inline void
 258 rte_mov16(uint8_t *dst, const uint8_t *src)
 259 {
 260         memcpy(dst, src, 16);
 261 }
 262
 263 static inline void
 264 rte_mov32(uint8_t *dst, const uint8_t *src)
 265 {
 266         memcpy(dst, src, 32);
 267 }
 268
 269 static inline void
 270 rte_mov48(uint8_t *dst, const uint8_t *src)
 271 {
 272         memcpy(dst, src, 48);
 273 }
 274
 275 static inline void
 276 rte_mov64(uint8_t *dst, const uint8_t *src)
 277 {
 278         memcpy(dst, src, 64);
 279 }
 280
 281 static inline void
 282 rte_mov128(uint8_t *dst, const uint8_t *src)
 283 {
 284         memcpy(dst, src, 128);
 285 }
 286
 287 static inline void
 288 rte_mov256(uint8_t *dst, const uint8_t *src)
 289 {
 290         memcpy(dst, src, 256);
 291 }
 292
 293 static inline void *
 294 rte_memcpy(void *dst, const void *src, size_t n)
 295 {
 296         return memcpy(dst, src, n);
 297 }
 298
 299 #endif /* RTE_ARCH_ARM_NEON_MEMCPY */
 300
 301 #ifdef __cplusplus
 302 }
 303 #endif
 304
 305 #endif /* _RTE_MEMCPY_ARM32_H_ */