vppinfra/vppinfra/memcpy_avx.h

   1 /*
   2  * Copyright (c) 2016 Cisco and/or its affiliates.
   3  * Licensed under the Apache License, Version 2.0 (the "License");
   4  * you may not use this file except in compliance with the License.
   5  * You may obtain a copy of the License at:
   6  *
   7  *     http://www.apache.org/licenses/LICENSE-2.0
   8  *
   9  * Unless required by applicable law or agreed to in writing, software
  10  * distributed under the License is distributed on an "AS IS" BASIS,
  11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12  * See the License for the specific language governing permissions and
  13  * limitations under the License.
  14  */
  15 /*-
  16  *   BSD LICENSE
  17  *
  18  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
  19  *   All rights reserved.
  20  *
  21  *   Redistribution and use in source and binary forms, with or without
  22  *   modification, are permitted provided that the following conditions
  23  *   are met:
  24  *
  25  *     * Redistributions of source code must retain the above copyright
  26  *       notice, this list of conditions and the following disclaimer.
  27  *     * Redistributions in binary form must reproduce the above copyright
  28  *       notice, this list of conditions and the following disclaimer in
  29  *       the documentation and/or other materials provided with the
  30  *       distribution.
  31  *     * Neither the name of Intel Corporation nor the names of its
  32  *       contributors may be used to endorse or promote products derived
  33  *       from this software without specific prior written permission.
  34  *
  35  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  36  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  37  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  38  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  39  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  40  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  41  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  42  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  43  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  44  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  45  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  46  */
  47
  48 #ifndef included_clib_memcpy_avx_h
  49 #define included_clib_memcpy_avx_h
  50
  51 #include <stdint.h>
  52 #include <x86intrin.h>
  53
/* 16-byte and 32-byte vectors of u8.  The aligned (1) attribute lowers the
   alignment requirement of the vector type to a single byte, so objects of
   these types may be loaded and stored through pointers that are not
   vector-aligned. */
typedef u8 u8x16u __attribute__ ((vector_size (16), aligned (1)));
typedef u8 u8x32u __attribute__ ((vector_size (32), aligned (1)));
  56
  57 static inline void
  58 clib_mov16 (u8 * dst, const u8 * src)
  59 {
  60   *(u8x16u *) dst = *(u8x16u *) src;
  61 }
  62
  63 static inline void
  64 clib_mov32 (u8 * dst, const u8 * src)
  65 {
  66   *(u8x32u *) dst = *(u8x32u *) src;
  67 }
  68
  69 static inline void
  70 clib_mov64 (u8 * dst, const u8 * src)
  71 {
  72   clib_mov32 ((u8 *) dst + 0 * 32, (const u8 *) src + 0 * 32);
  73   clib_mov32 ((u8 *) dst + 1 * 32, (const u8 *) src + 1 * 32);
  74 }
  75
  76 static inline void
  77 clib_mov128 (u8 * dst, const u8 * src)
  78 {
  79   clib_mov64 ((u8 *) dst + 0 * 64, (const u8 *) src + 0 * 64);
  80   clib_mov64 ((u8 *) dst + 1 * 64, (const u8 *) src + 1 * 64);
  81 }
  82
  83 static inline void
  84 clib_mov256 (u8 * dst, const u8 * src)
  85 {
  86   clib_mov128 ((u8 *) dst + 0 * 128, (const u8 *) src + 0 * 128);
  87   clib_mov128 ((u8 *) dst + 1 * 128, (const u8 *) src + 1 * 128);
  88 }
  89
  90 static inline void
  91 clib_mov64blocks (u8 * dst, const u8 * src, size_t n)
  92 {
  93   __m256i ymm0, ymm1;
  94
  95   while (n >= 64)
  96     {
  97       ymm0 =
  98         _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 0 * 32));
  99       n -= 64;
 100       ymm1 =
 101         _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 1 * 32));
 102       src = (const u8 *) src + 64;
 103       _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 0 * 32), ymm0);
 104       _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 1 * 32), ymm1);
 105       dst = (u8 *) dst + 64;
 106     }
 107 }
 108
 109 static inline void
 110 clib_mov256blocks (u8 * dst, const u8 * src, size_t n)
 111 {
 112   __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
 113
 114   while (n >= 256)
 115     {
 116       ymm0 =
 117         _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 0 * 32));
 118       n -= 256;
 119       ymm1 =
 120         _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 1 * 32));
 121       ymm2 =
 122         _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 2 * 32));
 123       ymm3 =
 124         _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 3 * 32));
 125       ymm4 =
 126         _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 4 * 32));
 127       ymm5 =
 128         _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 5 * 32));
 129       ymm6 =
 130         _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 6 * 32));
 131       ymm7 =
 132         _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 7 * 32));
 133       src = (const u8 *) src + 256;
 134       _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 0 * 32), ymm0);
 135       _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 1 * 32), ymm1);
 136       _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 2 * 32), ymm2);
 137       _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 3 * 32), ymm3);
 138       _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 4 * 32), ymm4);
 139       _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 5 * 32), ymm5);
 140       _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 6 * 32), ymm6);
 141       _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 7 * 32), ymm7);
 142       dst = (u8 *) dst + 256;
 143     }
 144 }
 145
 146 static inline void *
 147 clib_memcpy (void *dst, const void *src, size_t n)
 148 {
 149   uword dstu = (uword) dst;
 150   uword srcu = (uword) src;
 151   void *ret = dst;
 152   size_t dstofss;
 153   size_t bits;
 154
 155         /**
 156          * Copy less than 16 bytes
 157          */
 158   if (n < 16)
 159     {
 160       if (n & 0x01)
 161         {
 162           *(u8 *) dstu = *(const u8 *) srcu;
 163           srcu = (uword) ((const u8 *) srcu + 1);
 164           dstu = (uword) ((u8 *) dstu + 1);
 165         }
 166       if (n & 0x02)
 167         {
 168           *(uint16_t *) dstu = *(const uint16_t *) srcu;
 169           srcu = (uword) ((const uint16_t *) srcu + 1);
 170           dstu = (uword) ((uint16_t *) dstu + 1);
 171         }
 172       if (n & 0x04)
 173         {
 174           *(uint32_t *) dstu = *(const uint32_t *) srcu;
 175           srcu = (uword) ((const uint32_t *) srcu + 1);
 176           dstu = (uword) ((uint32_t *) dstu + 1);
 177         }
 178       if (n & 0x08)
 179         {
 180           *(uint64_t *) dstu = *(const uint64_t *) srcu;
 181         }
 182       return ret;
 183     }
 184
 185         /**
 186          * Fast way when copy size doesn't exceed 512 bytes
 187          */
 188   if (n <= 32)
 189     {
 190       clib_mov16 ((u8 *) dst, (const u8 *) src);
 191       clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
 192       return ret;
 193     }
 194   if (n <= 64)
 195     {
 196       clib_mov32 ((u8 *) dst, (const u8 *) src);
 197       clib_mov32 ((u8 *) dst - 32 + n, (const u8 *) src - 32 + n);
 198       return ret;
 199     }
 200   if (n <= 512)
 201     {
 202       if (n >= 256)
 203         {
 204           n -= 256;
 205           clib_mov256 ((u8 *) dst, (const u8 *) src);
 206           src = (const u8 *) src + 256;
 207           dst = (u8 *) dst + 256;
 208         }
 209       if (n >= 128)
 210         {
 211           n -= 128;
 212           clib_mov128 ((u8 *) dst, (const u8 *) src);
 213           src = (const u8 *) src + 128;
 214           dst = (u8 *) dst + 128;
 215         }
 216       if (n >= 64)
 217         {
 218           n -= 64;
 219           clib_mov64 ((u8 *) dst, (const u8 *) src);
 220           src = (const u8 *) src + 64;
 221           dst = (u8 *) dst + 64;
 222         }
 223     COPY_BLOCK_64_BACK31:
 224       if (n > 32)
 225         {
 226           clib_mov32 ((u8 *) dst, (const u8 *) src);
 227           clib_mov32 ((u8 *) dst - 32 + n, (const u8 *) src - 32 + n);
 228           return ret;
 229         }
 230       if (n > 0)
 231         {
 232           clib_mov32 ((u8 *) dst - 32 + n, (const u8 *) src - 32 + n);
 233         }
 234       return ret;
 235     }
 236
 237         /**
 238          * Make store aligned when copy size exceeds 512 bytes
 239          */
 240   dstofss = (uword) dst & 0x1F;
 241   if (dstofss > 0)
 242     {
 243       dstofss = 32 - dstofss;
 244       n -= dstofss;
 245       clib_mov32 ((u8 *) dst, (const u8 *) src);
 246       src = (const u8 *) src + dstofss;
 247       dst = (u8 *) dst + dstofss;
 248     }
 249
 250         /**
 251          * Copy 256-byte blocks.
 252          * Use copy block function for better instruction order control,
 253          * which is important when load is unaligned.
 254          */
 255   clib_mov256blocks ((u8 *) dst, (const u8 *) src, n);
 256   bits = n;
 257   n = n & 255;
 258   bits -= n;
 259   src = (const u8 *) src + bits;
 260   dst = (u8 *) dst + bits;
 261
 262         /**
 263          * Copy 64-byte blocks.
 264          * Use copy block function for better instruction order control,
 265          * which is important when load is unaligned.
 266          */
 267   if (n >= 64)
 268     {
 269       clib_mov64blocks ((u8 *) dst, (const u8 *) src, n);
 270       bits = n;
 271       n = n & 63;
 272       bits -= n;
 273       src = (const u8 *) src + bits;
 274       dst = (u8 *) dst + bits;
 275     }
 276
 277         /**
 278          * Copy whatever left
 279          */
 280   goto COPY_BLOCK_64_BACK31;
 281 }
 282
 283
#endif /* included_clib_memcpy_avx_h */
 285
 286
 287 /*
 288  * fd.io coding-style-patch-verification: ON
 289  *
 290  * Local Variables:
 291  * eval: (c-set-style "gnu")
 292  * End:
 293  */