X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=lib%2Flibrte_eal%2Fcommon%2Finclude%2Farch%2Fx86%2Frte_memcpy.h;h=596d77791db0acbe8237ced616261d9289c367e7;hb=c3f15def2ebe9cc255cf0e5cf32aa171f5b4326d;hp=b3bfc23571680ac2496289e4644fd3f1588a23c8;hpb=6b3e017e5d25f15da73f7700f7f2ac553ef1a2e9;p=deb_dpdk.git diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h index b3bfc235..596d7779 100644 --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h @@ -44,6 +44,8 @@ #include #include #include +#include +#include #ifdef __cplusplus extern "C" { @@ -64,11 +66,13 @@ extern "C" { * @return * Pointer to the destination data. */ -static inline void * -rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline)); +static __rte_always_inline void * +rte_memcpy(void *dst, const void *src, size_t n); #ifdef RTE_MACHINE_CPUFLAG_AVX512F +#define ALIGNMENT_MASK 0x3F + /** * AVX512 implementation below */ @@ -189,7 +193,7 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) } static inline void * -rte_memcpy(void *dst, const void *src, size_t n) +rte_memcpy_generic(void *dst, const void *src, size_t n) { uintptr_t dstu = (uintptr_t)dst; uintptr_t srcu = (uintptr_t)src; @@ -308,6 +312,8 @@ COPY_BLOCK_128_BACK63: #elif defined RTE_MACHINE_CPUFLAG_AVX2 +#define ALIGNMENT_MASK 0x1F + /** * AVX2 implementation below */ @@ -387,7 +393,7 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) } static inline void * -rte_memcpy(void *dst, const void *src, size_t n) +rte_memcpy_generic(void *dst, const void *src, size_t n) { uintptr_t dstu = (uintptr_t)dst; uintptr_t srcu = (uintptr_t)src; @@ -499,6 +505,8 @@ COPY_BLOCK_128_BACK31: #else /* RTE_MACHINE_CPUFLAG */ +#define ALIGNMENT_MASK 0x0F + /** * SSE & AVX implementation below */ @@ -677,7 +685,7 @@ __extension__ ({ \ }) static inline void * -rte_memcpy(void *dst, const void *src, size_t n) +rte_memcpy_generic(void *dst, const void *src, size_t n) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; uintptr_t dstu = (uintptr_t)dst; @@ -821,6 +829,75 @@ COPY_BLOCK_64_BACK15: #endif /* RTE_MACHINE_CPUFLAG */ +static inline void * +rte_memcpy_aligned(void *dst, const void *src, size_t n) +{ + void *ret = dst; + + /* Copy size <= 16 bytes */ + if (n < 16) { + if (n & 0x01) { + *(uint8_t *)dst = *(const uint8_t *)src; + src = (const uint8_t *)src + 1; + dst = (uint8_t *)dst + 1; + } + if (n & 0x02) { + *(uint16_t *)dst = *(const uint16_t *)src; + src = (const uint16_t *)src + 1; + dst = (uint16_t *)dst + 1; + } + if (n & 0x04) { + *(uint32_t *)dst = *(const uint32_t *)src; + src = (const uint32_t *)src + 1; + dst = (uint32_t *)dst + 1; + } + if (n & 0x08) + *(uint64_t *)dst = *(const uint64_t *)src; + + return ret; + } + + /* Copy 16 <= size <= 32 bytes */ + if (n <= 32) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + + return ret; + } + + /* Copy 32 < size <= 64 bytes */ + if (n <= 64) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + + return ret; + } + + /* Copy 64 bytes blocks */ + for (; n >= 64; n -= 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; + } + + /* Copy whatever left */ + rte_mov64((uint8_t *)dst - 64 + n, + (const uint8_t *)src - 64 + n); + + return ret; +} + +static inline void * +rte_memcpy(void *dst, const void *src, size_t n) +{ + if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK)) + return rte_memcpy_aligned(dst, src, n); + else + return rte_memcpy_generic(dst, src, n); +} + #ifdef __cplusplus } #endif