X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvppinfra%2Fmemcpy_sse3.h;h=2dd9399d428f3345fc367d6f721eb4a5147044ba;hb=b7b929931a07fbb27b43d5cd105f366c3e29807e;hp=4fc48c86c8b18fb6b000cd784526cd5d40d636dc;hpb=658efd2fc986bca88336569b2e6fa8b537a660d8;p=vpp.git

diff --git a/src/vppinfra/memcpy_sse3.h b/src/vppinfra/memcpy_sse3.h
index 4fc48c86c8b..2dd9399d428 100644
--- a/src/vppinfra/memcpy_sse3.h
+++ b/src/vppinfra/memcpy_sse3.h
@@ -51,19 +51,20 @@
 #include <stdint.h>
 #include <x86intrin.h>
 
-typedef u8 u8x16u __attribute__ ((vector_size (16), aligned (1)));
-typedef u8 u8x32u __attribute__ ((vector_size (32), aligned (1)));
-
 static inline void
 clib_mov16 (u8 * dst, const u8 * src)
 {
-  *(u8x16u *) dst = *(u8x16u *) src;
+  __m128i xmm0;
+
+  xmm0 = _mm_loadu_si128 ((const __m128i *) src);
+  _mm_storeu_si128 ((__m128i *) dst, xmm0);
 }
 
 static inline void
 clib_mov32 (u8 * dst, const u8 * src)
 {
-  *(u8x32u *) dst = *(u8x32u *) src;
+  clib_mov16 ((u8 *) dst + 0 * 16, (const u8 *) src + 0 * 16);
+  clib_mov16 ((u8 *) dst + 1 * 16, (const u8 *) src + 1 * 16);
 }
 
 static inline void
@@ -182,7 +183,7 @@ clib_mov256 (u8 * dst, const u8 * src)
 })
 
 static inline void *
-clib_memcpy (void *dst, const void *src, size_t n)
+_clib_memcpy (void *dst, const void *src, size_t n)
 {
   __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
   uword dstu = (uword) dst;
@@ -221,9 +222,9 @@ clib_memcpy (void *dst, const void *src, size_t n)
       return ret;
     }
 
-	/**
-	 * Fast way when copy size doesn't exceed 512 bytes
-	 */
+  /**
+   * Fast way when copy size doesn't exceed 512 bytes
+   */
   if (n <= 32)
     {
       clib_mov16 ((u8 *) dst, (const u8 *) src);
@@ -294,27 +295,31 @@ clib_memcpy (void *dst, const void *src, size_t n)
       return ret;
     }
 
-	/**
-	 * Make store aligned when copy size exceeds 512 bytes,
-	 * and make sure the first 15 bytes are copied, because
-	 * unaligned copy functions require up to 15 bytes
-	 * backwards access.
-	 */
-  dstofss = 16 - ((uword) dst & 0x0F) + 16;
-  n -= dstofss;
-  clib_mov32 ((u8 *) dst, (const u8 *) src);
-  src = (const u8 *) src + dstofss;
-  dst = (u8 *) dst + dstofss;
+  /**
+   * Make store aligned when copy size exceeds 512 bytes,
+   * and make sure the first 15 bytes are copied, because
+   * unaligned copy functions require up to 15 bytes
+   * backwards access.
+   */
+  dstofss = (uword) dst & 0x0F;
+  if (dstofss > 0)
+    {
+      dstofss = 16 - dstofss + 16;
+      n -= dstofss;
+      clib_mov32 ((u8 *) dst, (const u8 *) src);
+      src = (const u8 *) src + dstofss;
+      dst = (u8 *) dst + dstofss;
+    }
   srcofs = ((uword) src & 0x0F);
 
-	/**
-	 * For aligned copy
-	 */
+  /**
+   * For aligned copy
+   */
   if (srcofs == 0)
     {
-	/**
-	 * Copy 256-byte blocks
-	 */
+      /**
+       * Copy 256-byte blocks
+       */
       for (; n >= 256; n -= 256)
 	{
 	  clib_mov256 ((u8 *) dst, (const u8 *) src);
@@ -322,20 +327,20 @@ clib_memcpy (void *dst, const void *src, size_t n)
 	  src = (const u8 *) src + 256;
 	}
 
-	/**
-	 * Copy whatever left
-	 */
+      /**
+       * Copy whatever left
+       */
       goto COPY_BLOCK_255_BACK15;
     }
 
-	/**
-	 * For copy with unaligned load
-	 */
+  /**
+   * For copy with unaligned load
+   */
   CLIB_MVUNALIGN_LEFT47 (dst, src, n, srcofs);
 
-	/**
-	 * Copy whatever left
-	 */
+  /**
+   * Copy whatever left
+   */
   goto COPY_BLOCK_64_BACK15;
 }
 
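
Note on the clib_mov16/clib_mov32 change above: the vector-union loads and stores are replaced with explicit SSE unaligned load/store intrinsics. Below is a minimal standalone sketch of the same 16-byte and 32-byte unaligned moves, assuming an SSE2-capable x86 target and <x86intrin.h>; the demo_* names and the main() harness are illustrative only and are not part of the patch.

#include <stdio.h>
#include <string.h>
#include <x86intrin.h>

typedef unsigned char u8;

/* Unaligned 16-byte move: one SSE load plus one SSE store,
   mirroring the intrinsic-based clib_mov16 added by the patch. */
static inline void
demo_mov16 (u8 * dst, const u8 * src)
{
  __m128i xmm0 = _mm_loadu_si128 ((const __m128i *) src);
  _mm_storeu_si128 ((__m128i *) dst, xmm0);
}

/* 32-byte move built from two 16-byte moves. */
static inline void
demo_mov32 (u8 * dst, const u8 * src)
{
  demo_mov16 (dst + 0 * 16, src + 0 * 16);
  demo_mov16 (dst + 1 * 16, src + 1 * 16);
}

int
main (void)
{
  u8 src[33], dst[33];
  int i;

  for (i = 0; i < 33; i++)
    src[i] = (u8) i;
  /* Use offset 1 on purpose so neither pointer is 16-byte aligned. */
  demo_mov32 (dst + 1, src + 1);
  printf ("equal: %d\n", memcmp (dst + 1, src + 1, 32) == 0);
  return 0;
}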
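
Note on the reworked large-copy path above: when n exceeds 512 bytes, the code copies 32 bytes up front and advances both pointers by dstofss so that the destination becomes 16-byte aligned; per the patch comment, the extra bytes are copied because the unaligned helpers may access up to 15 bytes backwards, and the new "if (dstofss > 0)" guard skips the adjustment when the destination is already aligned. The following self-contained sketch covers just that offset arithmetic; align_offset and the main() harness are hypothetical names, and uword is typedef'd locally to uintptr_t as a stand-in for vppinfra's pointer-sized unsigned type.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uintptr_t uword;

/* Bytes to advance so that dst becomes 16-byte aligned, plus one extra
   16-byte block, as computed in the patched greater-than-512-byte path.
   Returns 0 when dst is already aligned (the new guard). */
static uword
align_offset (const void *dst)
{
  uword dstofss = (uword) dst & 0x0F;

  if (dstofss > 0)
    dstofss = 16 - dstofss + 16;
  return dstofss;
}

int
main (void)
{
  char buf[64];
  int i;

  for (i = 0; i < 32; i++)
    {
      uword off = align_offset (buf + i);

      /* The offset never exceeds 31, so the single clib_mov32 issued
         before the main loop always covers the skipped bytes, and the
         adjusted destination is 16-byte aligned. */
      assert (off <= 31);
      assert ((((uword) (buf + i)) + off) % 16 == 0);
    }
  printf ("alignment offsets OK\n");
  return 0;
}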