#include <stdint.h>
#include <string.h>
#include <rte_vect.h>
+#include <rte_common.h>
+#include <rte_config.h>
#ifdef __cplusplus
extern "C" {
* @return
* Pointer to the destination data.
*/
-static inline void *
-rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline));
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n);
#ifdef RTE_MACHINE_CPUFLAG_AVX512F
+#define ALIGNMENT_MASK 0x3F
+
/**
* AVX512 implementation below
*/
}
static inline void *
-rte_memcpy(void *dst, const void *src, size_t n)
+rte_memcpy_generic(void *dst, const void *src, size_t n)
{
uintptr_t dstu = (uintptr_t)dst;
uintptr_t srcu = (uintptr_t)src;
#elif defined RTE_MACHINE_CPUFLAG_AVX2
+#define ALIGNMENT_MASK 0x1F
+
/**
* AVX2 implementation below
*/
}
/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
- rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
- rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
- rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
- rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
- rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
- rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
- rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
- rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
-}
-
-/**
- * Copy 64-byte blocks from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov64blocks(uint8_t *dst, const uint8_t *src, size_t n)
-{
- __m256i ymm0, ymm1;
-
- while (n >= 64) {
- ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
- n -= 64;
- ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
- src = (const uint8_t *)src + 64;
- _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
- _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
- dst = (uint8_t *)dst + 64;
- }
-}
-
-/**
- * Copy 256-byte blocks from one location to another,
+ * Copy 128-byte blocks from one location to another,
* locations should not overlap.
*/
static inline void
-rte_mov256blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
- __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
+ __m256i ymm0, ymm1, ymm2, ymm3;
- while (n >= 256) {
+ while (n >= 128) {
ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
- n -= 256;
+ n -= 128;
ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
- ymm4 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 4 * 32));
- ymm5 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 5 * 32));
- ymm6 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 6 * 32));
- ymm7 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 7 * 32));
- src = (const uint8_t *)src + 256;
+ src = (const uint8_t *)src + 128;
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
- _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 4 * 32), ymm4);
- _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 5 * 32), ymm5);
- _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 6 * 32), ymm6);
- _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 7 * 32), ymm7);
- dst = (uint8_t *)dst + 256;
+ dst = (uint8_t *)dst + 128;
}
}
static inline void *
-rte_memcpy(void *dst, const void *src, size_t n)
+rte_memcpy_generic(void *dst, const void *src, size_t n)
{
uintptr_t dstu = (uintptr_t)dst;
uintptr_t srcu = (uintptr_t)src;
}
/**
- * Fast way when copy size doesn't exceed 512 bytes
+ * Fast way when copy size doesn't exceed 256 bytes
*/
if (n <= 32) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
- rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+ rte_mov16((uint8_t *)dst - 16 + n,
+ (const uint8_t *)src - 16 + n);
+ return ret;
+ }
+ if (n <= 48) {
+ rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+ rte_mov16((uint8_t *)dst - 16 + n,
+ (const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 64) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+ rte_mov32((uint8_t *)dst - 32 + n,
+ (const uint8_t *)src - 32 + n);
return ret;
}
- if (n <= 512) {
- if (n >= 256) {
- n -= 256;
- rte_mov256((uint8_t *)dst, (const uint8_t *)src);
- src = (const uint8_t *)src + 256;
- dst = (uint8_t *)dst + 256;
- }
+ if (n <= 256) {
if (n >= 128) {
n -= 128;
rte_mov128((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 128;
dst = (uint8_t *)dst + 128;
}
+COPY_BLOCK_128_BACK31:
if (n >= 64) {
n -= 64;
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 64;
dst = (uint8_t *)dst + 64;
}
-COPY_BLOCK_64_BACK31:
if (n > 32) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
- rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+ rte_mov32((uint8_t *)dst - 32 + n,
+ (const uint8_t *)src - 32 + n);
return ret;
}
if (n > 0) {
- rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+ rte_mov32((uint8_t *)dst - 32 + n,
+ (const uint8_t *)src - 32 + n);
}
return ret;
}
/**
- * Make store aligned when copy size exceeds 512 bytes
+ * Make store aligned when copy size exceeds 256 bytes
*/
dstofss = (uintptr_t)dst & 0x1F;
if (dstofss > 0) {
}
/**
- * Copy 256-byte blocks.
- * Use copy block function for better instruction order control,
- * which is important when load is unaligned.
+ * Copy 128-byte blocks
*/
- rte_mov256blocks((uint8_t *)dst, (const uint8_t *)src, n);
+ rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
bits = n;
- n = n & 255;
+ n = n & 127;
bits -= n;
src = (const uint8_t *)src + bits;
dst = (uint8_t *)dst + bits;
- /**
- * Copy 64-byte blocks.
- * Use copy block function for better instruction order control,
- * which is important when load is unaligned.
- */
- if (n >= 64) {
- rte_mov64blocks((uint8_t *)dst, (const uint8_t *)src, n);
- bits = n;
- n = n & 63;
- bits -= n;
- src = (const uint8_t *)src + bits;
- dst = (uint8_t *)dst + bits;
- }
-
/**
* Copy whatever left
*/
- goto COPY_BLOCK_64_BACK31;
+ goto COPY_BLOCK_128_BACK31;
}
#else /* RTE_MACHINE_CPUFLAG */
+#define ALIGNMENT_MASK 0x0F
+
/**
* SSE & AVX implementation below
*/
* - __m128i <xmm0> ~ <xmm8> must be pre-defined
*/
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
-({ \
- int tmp; \
+__extension__ ({ \
+ size_t tmp; \
while (len >= 128 + 16 - offset) { \
xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
len -= 128; \
* - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
*/
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
-({ \
+__extension__ ({ \
switch (offset) { \
case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \
case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \
})
static inline void *
-rte_memcpy(void *dst, const void *src, size_t n)
+rte_memcpy_generic(void *dst, const void *src, size_t n)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
uintptr_t dstu = (uintptr_t)dst;
#endif /* RTE_MACHINE_CPUFLAG */
+static inline void *
+rte_memcpy_aligned(void *dst, const void *src, size_t n)
+{
+ void *ret = dst;
+
+ /* Copy size <= 16 bytes */
+ if (n < 16) {
+ if (n & 0x01) {
+ *(uint8_t *)dst = *(const uint8_t *)src;
+ src = (const uint8_t *)src + 1;
+ dst = (uint8_t *)dst + 1;
+ }
+ if (n & 0x02) {
+ *(uint16_t *)dst = *(const uint16_t *)src;
+ src = (const uint16_t *)src + 1;
+ dst = (uint16_t *)dst + 1;
+ }
+ if (n & 0x04) {
+ *(uint32_t *)dst = *(const uint32_t *)src;
+ src = (const uint32_t *)src + 1;
+ dst = (uint32_t *)dst + 1;
+ }
+ if (n & 0x08)
+ *(uint64_t *)dst = *(const uint64_t *)src;
+
+ return ret;
+ }
+
+ /* Copy 16 <= size <= 32 bytes */
+ if (n <= 32) {
+ rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov16((uint8_t *)dst - 16 + n,
+ (const uint8_t *)src - 16 + n);
+
+ return ret;
+ }
+
+ /* Copy 32 < size <= 64 bytes */
+ if (n <= 64) {
+ rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov32((uint8_t *)dst - 32 + n,
+ (const uint8_t *)src - 32 + n);
+
+ return ret;
+ }
+
+ /* Copy 64 bytes blocks */
+ for (; n >= 64; n -= 64) {
+ rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+ dst = (uint8_t *)dst + 64;
+ src = (const uint8_t *)src + 64;
+ }
+
+ /* Copy whatever left */
+ rte_mov64((uint8_t *)dst - 64 + n,
+ (const uint8_t *)src - 64 + n);
+
+ return ret;
+}
+
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+ if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
+ return rte_memcpy_aligned(dst, src, n);
+ else
+ return rte_memcpy_generic(dst, src, n);
+}
+
#ifdef __cplusplus
}
#endif