lib/librte_eal/common/include/arch/arm/rte_vect.h

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright(c) 2015 Cavium, Inc
   3  */
   4
   5 #ifndef _RTE_VECT_ARM_H_
   6 #define _RTE_VECT_ARM_H_
   7
   8 #include <stdint.h>
   9 #include "generic/rte_vect.h"
  10 #include "rte_debug.h"
  11 #include "arm_neon.h"
  12
  13 #ifdef __cplusplus
  14 extern "C" {
  15 #endif
  16
  17 typedef int32x4_t xmm_t; /* 128-bit vector type used by this header: NEON int32x4_t */
  18
  19 #define XMM_SIZE        (sizeof(xmm_t)) /* size of xmm_t in bytes */
  20 #define XMM_MASK        (XMM_SIZE - 1) /* XMM_SIZE minus one; presumably a byte-offset mask - confirm at use sites */
  21
     /*
      * Overlay of an xmm_t with arrays of narrower element types, so the same
      * XMM_SIZE bytes can be accessed element by element. Each array length is
      * XMM_SIZE divided by the element size, so every member spans the whole
      * vector. The union type is aligned to 16 bytes.
      */
  22 typedef union rte_xmm {
  23         xmm_t    x; /* the vector as a whole */
  24         uint8_t  u8[XMM_SIZE / sizeof(uint8_t)];
  25         uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
  26         uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
  27         uint64_t u64[XMM_SIZE / sizeof(uint64_t)];
  28         double   pd[XMM_SIZE / sizeof(double)];
  29 } __attribute__((aligned(16))) rte_xmm_t;
  30
  31 #ifdef RTE_ARCH_ARM
  32 /* NEON intrinsic vqtbl1q_u8() is not supported in ARMv7-A(AArch32) */
  33 static __inline uint8x16_t
  34 vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
  35 {
  36         uint8_t i, pos;
  37         rte_xmm_t rte_a, rte_b, rte_ret;
  38
  39         vst1q_u8(rte_a.u8, a);
  40         vst1q_u8(rte_b.u8, b);
  41
  42         for (i = 0; i < 16; i++) {
  43                 pos = rte_b.u8[i];
  44                 if (pos < 16)
  45                         rte_ret.u8[i] = rte_a.u8[pos];
  46                 else
  47                         rte_ret.u8[i] = 0;
  48         }
  49
  50         return vld1q_u8(rte_ret.u8);
  51 }
  52
  53 static inline uint16_t
  54 vaddvq_u16(uint16x8_t a)
  55 {
  56         uint32x4_t m = vpaddlq_u16(a);
  57         uint64x2_t n = vpaddlq_u32(m);
  58         uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
  59
  60         return vget_lane_u32((uint32x2_t)o, 0);
  61 }
  62
  63 #endif
  64
  65 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70000)
  66 static inline uint32x4_t
  67 vcopyq_laneq_u32(uint32x4_t a, const int lane_a,
  68                  uint32x4_t b, const int lane_b)
  69 {
  70         return vsetq_lane_u32(vgetq_lane_u32(b, lane_b), a, lane_a);
  71 }
  72 #endif
  73
  74 #if defined(RTE_ARCH_ARM64)
  75 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70000)
  76
  77 #if (GCC_VERSION < 40900)
  78 typedef uint64_t poly64_t;
  79 typedef uint64x2_t poly64x2_t;
  80 typedef uint8_t poly128_t __attribute__((vector_size(16), aligned(16)));
  81
  82 static inline uint32x4_t
  83 vceqzq_u32(uint32x4_t a)
  84 {
  85         return (a == 0);
  86 }
  87 #endif
  88
  89 /* NEON intrinsic vreinterpretq_u64_p128() is supported since GCC version 7 */
  90 static inline uint64x2_t
  91 vreinterpretq_u64_p128(poly128_t x)
  92 {
  93         return (uint64x2_t)x;
  94 }
  95
  96 /* NEON intrinsic vreinterpretq_p64_u64() is supported since GCC version 7 */
  97 static inline poly64x2_t
  98 vreinterpretq_p64_u64(uint64x2_t x)
  99 {
 100         return (poly64x2_t)x;
 101 }
 102
 103 /* NEON intrinsic vgetq_lane_p64() is supported since GCC version 7 */
 104 static inline poly64_t
 105 vgetq_lane_p64(poly64x2_t x, const int lane)
 106 {
 107         RTE_ASSERT(lane >= 0 && lane <= 1);
 108
 109         poly64_t *p = (poly64_t *)&x;
 110
 111         return p[lane];
 112 }
 113 #endif
 114 #endif
 115
 116 /*
 117  * If (0 <= index <= 15), then call the ASIMD ext instruction on the
 118  * 128 bit regs v0 and v1 with the appropriate index.
 119  *
 120  * Else returns a zero vector.
 121  */
 122 static inline uint8x16_t
 123 vextract(uint8x16_t v0, uint8x16_t v1, const int index)
 124 {
 125         switch (index) {
 126         case 0: return vextq_u8(v0, v1, 0);
 127         case 1: return vextq_u8(v0, v1, 1);
 128         case 2: return vextq_u8(v0, v1, 2);
 129         case 3: return vextq_u8(v0, v1, 3);
 130         case 4: return vextq_u8(v0, v1, 4);
 131         case 5: return vextq_u8(v0, v1, 5);
 132         case 6: return vextq_u8(v0, v1, 6);
 133         case 7: return vextq_u8(v0, v1, 7);
 134         case 8: return vextq_u8(v0, v1, 8);
 135         case 9: return vextq_u8(v0, v1, 9);
 136         case 10: return vextq_u8(v0, v1, 10);
 137         case 11: return vextq_u8(v0, v1, 11);
 138         case 12: return vextq_u8(v0, v1, 12);
 139         case 13: return vextq_u8(v0, v1, 13);
 140         case 14: return vextq_u8(v0, v1, 14);
 141         case 15: return vextq_u8(v0, v1, 15);
 142         }
 143         return vdupq_n_u8(0);
 144 }
 145
 146 /**
 147  * Shifts right 128 bit register by specified number of bytes
 148  *
 149  * Value of shift parameter must be in range 0 - 16
 150  */
 151 static inline uint64x2_t
 152 vshift_bytes_right(uint64x2_t reg, const unsigned int shift)
 153 {
 154         return vreinterpretq_u64_u8(vextract(
 155                                 vreinterpretq_u8_u64(reg),
 156                                 vdupq_n_u8(0),
 157                                 shift));
 158 }
 159
 160 /**
 161  * Shifts left 128 bit register by specified number of bytes
 162  *
 163  * Value of shift parameter must be in range 0 - 16
 164  */
 165 static inline uint64x2_t
 166 vshift_bytes_left(uint64x2_t reg, const unsigned int shift)
 167 {
 168         return vreinterpretq_u64_u8(vextract(
 169                                 vdupq_n_u8(0),
 170                                 vreinterpretq_u8_u64(reg),
 171                                 16 - shift));
 172 }
 173
 174 #ifdef __cplusplus
 175 }
 176 #endif
 177
 178 #endif