dpdk: DPDK 20.05 iavf flow director backporting to DPDK 20.02
build/external/patches/dpdk_20.02/0010-net-iavf-flexible-Rx-descriptor-support-in-AVX-path.patch
1 From b1138c10d2cd5938f4c0316e0b132caeb7e869dd Mon Sep 17 00:00:00 2001
2 From: Leyi Rong <leyi.rong@intel.com>
3 Date: Wed, 8 Apr 2020 14:22:03 +0800
4 Subject: [DPDK 10/17] net/iavf: flexible Rx descriptor support in AVX path
5
6 Support flexible Rx descriptor format in AVX
7 path of iAVF PMD.
8
9 Signed-off-by: Leyi Rong <leyi.rong@intel.com>
10 ---
11  drivers/net/iavf/iavf_rxtx.c          |  24 +-
12  drivers/net/iavf/iavf_rxtx.h          |   6 +
13  drivers/net/iavf/iavf_rxtx_vec_avx2.c | 550 +++++++++++++++++++++++++-
14  3 files changed, 570 insertions(+), 10 deletions(-)
15
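[Editorial note, not part of the upstream commit] The iavf_rxtx.c hunks below wire the new flex-RxD routines into iavf_set_rx_function(): when the VF has negotiated VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC, the AVX2 burst function for the flexible descriptor layout is selected; otherwise the legacy AVX2 function is kept, and without AVX2 the existing SSE routines are used in either case. A minimal sketch of the resulting selection logic (hypothetical helper; driver-internal types and the burst functions are taken as declared in the diff, error handling omitted):

	static void
	iavf_pick_rx_burst(struct rte_eth_dev *dev, struct iavf_info *vf,
			   bool use_avx2, bool scattered)
	{
		/* flexible Rx descriptor format negotiated with the PF? */
		bool flex = !!(vf->vf_res->vf_cap_flags &
			       VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC);

		if (scattered)
			dev->rx_pkt_burst = use_avx2 ?
				(flex ? iavf_recv_scattered_pkts_vec_avx2_flex_rxd
				      : iavf_recv_scattered_pkts_vec_avx2) :
				iavf_recv_scattered_pkts_vec;
		else
			dev->rx_pkt_burst = use_avx2 ?
				(flex ? iavf_recv_pkts_vec_avx2_flex_rxd
				      : iavf_recv_pkts_vec_avx2) :
				iavf_recv_pkts_vec;
	}
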
16 diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
17 index 67297dcb7..34c41d104 100644
18 --- a/drivers/net/iavf/iavf_rxtx.c
19 +++ b/drivers/net/iavf/iavf_rxtx.c
20 @@ -2081,16 +2081,28 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
21                                     "Using %sVector Scattered Rx (port %d).",
22                                     use_avx2 ? "avx2 " : "",
23                                     dev->data->port_id);
24 -                       dev->rx_pkt_burst = use_avx2 ?
25 -                                           iavf_recv_scattered_pkts_vec_avx2 :
26 -                                           iavf_recv_scattered_pkts_vec;
27 +                       if (vf->vf_res->vf_cap_flags &
28 +                               VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
29 +                               dev->rx_pkt_burst = use_avx2 ?
30 +                                       iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
31 +                                       iavf_recv_scattered_pkts_vec;
32 +                       else
33 +                               dev->rx_pkt_burst = use_avx2 ?
34 +                                       iavf_recv_scattered_pkts_vec_avx2 :
35 +                                       iavf_recv_scattered_pkts_vec;
36                 } else {
37                         PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
38                                     use_avx2 ? "avx2 " : "",
39                                     dev->data->port_id);
40 -                       dev->rx_pkt_burst = use_avx2 ?
41 -                                           iavf_recv_pkts_vec_avx2 :
42 -                                           iavf_recv_pkts_vec;
43 +                       if (vf->vf_res->vf_cap_flags &
44 +                               VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
45 +                               dev->rx_pkt_burst = use_avx2 ?
46 +                                       iavf_recv_pkts_vec_avx2_flex_rxd :
47 +                                       iavf_recv_pkts_vec;
48 +                       else
49 +                               dev->rx_pkt_burst = use_avx2 ?
50 +                                       iavf_recv_pkts_vec_avx2 :
51 +                                       iavf_recv_pkts_vec;
52                 }
53  
54                 return;
55 diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
56 index f33d1df41..8e1db2588 100644
57 --- a/drivers/net/iavf/iavf_rxtx.h
58 +++ b/drivers/net/iavf/iavf_rxtx.h
59 @@ -413,9 +413,15 @@ uint16_t iavf_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
60                                   uint16_t nb_pkts);
61  uint16_t iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
62                                  uint16_t nb_pkts);
63 +uint16_t iavf_recv_pkts_vec_avx2_flex_rxd(void *rx_queue,
64 +                                         struct rte_mbuf **rx_pkts,
65 +                                         uint16_t nb_pkts);
66  uint16_t iavf_recv_scattered_pkts_vec_avx2(void *rx_queue,
67                                            struct rte_mbuf **rx_pkts,
68                                            uint16_t nb_pkts);
69 +uint16_t iavf_recv_scattered_pkts_vec_avx2_flex_rxd(void *rx_queue,
70 +                                                   struct rte_mbuf **rx_pkts,
71 +                                                   uint16_t nb_pkts);
72  uint16_t iavf_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
73                             uint16_t nb_pkts);
74  uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
75 diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx2.c b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
76 index 2587083d8..b23188fd3 100644
77 --- a/drivers/net/iavf/iavf_rxtx_vec_avx2.c
78 +++ b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
79 @@ -11,14 +11,16 @@
80  #endif
81  
82  static inline void
83 -iavf_rxq_rearm(struct iavf_rx_queue *rxq)
84 +iavf_rxq_rearm(struct iavf_rx_queue *rxq, volatile union iavf_rx_desc *rxdp)
85  {
86         int i;
87         uint16_t rx_id;
88 -       volatile union iavf_rx_desc *rxdp;
89         struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
90  
91 -       rxdp = rxq->rx_ring + rxq->rxrearm_start;
92 +       if (rxq->rxdid == IAVF_RXDID_COMMS_OVS_1) {
93 +               volatile union iavf_rx_flex_desc *rxdp =
94 +                       (union iavf_rx_flex_desc *)rxdp;
95 +       }
96  
97         /* Pull 'n' more MBUFs into the software ring */
98         if (rte_mempool_get_bulk(rxq->mp,
99 @@ -160,7 +162,7 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
100          * of time to act
101          */
102         if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
103 -               iavf_rxq_rearm(rxq);
104 +               iavf_rxq_rearm(rxq, rxq->rx_ring + rxq->rxrearm_start);
105  
106         /* Before we start moving massive data around, check to see if
107          * there is actually a packet available
108 @@ -614,6 +616,465 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
109         return received;
110  }
111  
112 +static inline uint16_t
113 +_iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
114 +                                     struct rte_mbuf **rx_pkts,
115 +                                     uint16_t nb_pkts, uint8_t *split_packet)
116 +{
117 +#define IAVF_DESCS_PER_LOOP_AVX 8
118 +
119 +       const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
120 +
121 +       const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
122 +                       0, rxq->mbuf_initializer);
123 +       struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
124 +       volatile union iavf_rx_flex_desc *rxdp =
125 +               (union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
126 +
127 +       rte_prefetch0(rxdp);
128 +
129 +       /* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
130 +       nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
131 +
132 +       /* See if we need to rearm the RX queue - gives the prefetch a bit
133 +        * of time to act
134 +        */
135 +       if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
136 +               /* iavf_rxq_rearm(rxq); */
137 +               iavf_rxq_rearm(rxq, rxq->rx_ring + rxq->rxrearm_start);
138 +
139 +       /* Before we start moving massive data around, check to see if
140 +        * there is actually a packet available
141 +        */
142 +       if (!(rxdp->wb.status_error0 &
143 +                       rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S)))
144 +               return 0;
145 +
146 +       /* constants used in processing loop */
147 +       const __m256i crc_adjust =
148 +               _mm256_set_epi16
149 +                       (/* first descriptor */
150 +                        0, 0, 0,       /* ignore non-length fields */
151 +                        -rxq->crc_len, /* sub crc on data_len */
152 +                        0,             /* ignore high-16bits of pkt_len */
153 +                        -rxq->crc_len, /* sub crc on pkt_len */
154 +                        0, 0,          /* ignore pkt_type field */
155 +                        /* second descriptor */
156 +                        0, 0, 0,       /* ignore non-length fields */
157 +                        -rxq->crc_len, /* sub crc on data_len */
158 +                        0,             /* ignore high-16bits of pkt_len */
159 +                        -rxq->crc_len, /* sub crc on pkt_len */
160 +                        0, 0           /* ignore pkt_type field */
161 +                       );
162 +
163 +       /* 8 packets DD mask, LSB in each 32-bit value */
164 +       const __m256i dd_check = _mm256_set1_epi32(1);
165 +
166 +       /* 8 packets EOP mask, second-LSB in each 32-bit value */
167 +       const __m256i eop_check = _mm256_slli_epi32(dd_check,
168 +                       IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
169 +
170 +       /* mask to shuffle from desc. to mbuf (2 descriptors)*/
171 +       const __m256i shuf_msk =
172 +               _mm256_set_epi8
173 +                       (/* first descriptor */
174 +                        15, 14,
175 +                        13, 12,        /* octet 12~15, 32 bits rss */
176 +                        11, 10,        /* octet 10~11, 16 bits vlan_macip */
177 +                        5, 4,          /* octet 4~5, 16 bits data_len */
178 +                        0xFF, 0xFF,    /* skip hi 16 bits pkt_len, zero out */
179 +                        5, 4,          /* octet 4~5, 16 bits pkt_len */
180 +                        0xFF, 0xFF,    /* pkt_type set as unknown */
181 +                        0xFF, 0xFF,    /*pkt_type set as unknown */
182 +                        /* second descriptor */
183 +                        15, 14,
184 +                        13, 12,        /* octet 12~15, 32 bits rss */
185 +                        11, 10,        /* octet 10~11, 16 bits vlan_macip */
186 +                        5, 4,          /* octet 4~5, 16 bits data_len */
187 +                        0xFF, 0xFF,    /* skip hi 16 bits pkt_len, zero out */
188 +                        5, 4,          /* octet 4~5, 16 bits pkt_len */
189 +                        0xFF, 0xFF,    /* pkt_type set as unknown */
190 +                        0xFF, 0xFF     /*pkt_type set as unknown */
191 +                       );
192 +       /**
193 +        * compile-time check the above crc and shuffle layout is correct.
194 +        * NOTE: the first field (lowest address) is given last in set_epi
195 +        * calls above.
196 +        */
197 +       RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
198 +                       offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
199 +       RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
200 +                       offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
201 +       RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
202 +                       offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
203 +       RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
204 +                       offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
205 +
206 +       /* Status/Error flag masks */
207 +       /**
208 +        * mask everything except Checksum Reports, RSS indication
209 +        * and VLAN indication.
210 +        * bit6:4 for IP/L4 checksum errors.
211 +        * bit12 is for RSS indication.
212 +        * bit13 is for VLAN indication.
213 +        */
214 +       const __m256i flags_mask =
215 +                _mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
216 +       /**
217 +        * data to be shuffled by the result of the flags mask shifted by 4
218 +        * bits.  This gives use the l3_l4 flags.
219 +        */
220 +       const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
221 +                       /* shift right 1 bit to make sure it not exceed 255 */
222 +                       (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
223 +                        PKT_RX_IP_CKSUM_BAD) >> 1,
224 +                       (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
225 +                        PKT_RX_IP_CKSUM_GOOD) >> 1,
226 +                       (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
227 +                        PKT_RX_IP_CKSUM_BAD) >> 1,
228 +                       (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
229 +                        PKT_RX_IP_CKSUM_GOOD) >> 1,
230 +                       (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
231 +                       (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
232 +                       (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
233 +                       (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
234 +                       /* second 128-bits */
235 +                       0, 0, 0, 0, 0, 0, 0, 0,
236 +                       (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
237 +                        PKT_RX_IP_CKSUM_BAD) >> 1,
238 +                       (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
239 +                        PKT_RX_IP_CKSUM_GOOD) >> 1,
240 +                       (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
241 +                        PKT_RX_IP_CKSUM_BAD) >> 1,
242 +                       (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
243 +                        PKT_RX_IP_CKSUM_GOOD) >> 1,
244 +                       (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
245 +                       (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
246 +                       (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
247 +                       (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
248 +       const __m256i cksum_mask =
249 +                _mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
250 +                                  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
251 +                                  PKT_RX_EIP_CKSUM_BAD);
252 +       /**
253 +        * data to be shuffled by result of flag mask, shifted down 12.
254 +        * If RSS(bit12)/VLAN(bit13) are set,
255 +        * shuffle moves appropriate flags in place.
256 +        */
257 +       const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
258 +                       0, 0, 0, 0,
259 +                       0, 0, 0, 0,
260 +                       PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
261 +                       PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
262 +                       PKT_RX_RSS_HASH, 0,
263 +                       /* end up 128-bits */
264 +                       0, 0, 0, 0,
265 +                       0, 0, 0, 0,
266 +                       0, 0, 0, 0,
267 +                       PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
268 +                       PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
269 +                       PKT_RX_RSS_HASH, 0);
270 +
271 +       uint16_t i, received;
272 +
273 +       for (i = 0, received = 0; i < nb_pkts;
274 +            i += IAVF_DESCS_PER_LOOP_AVX,
275 +            rxdp += IAVF_DESCS_PER_LOOP_AVX) {
276 +               /* step 1, copy over 8 mbuf pointers to rx_pkts array */
277 +               _mm256_storeu_si256((void *)&rx_pkts[i],
278 +                                   _mm256_loadu_si256((void *)&sw_ring[i]));
279 +#ifdef RTE_ARCH_X86_64
280 +               _mm256_storeu_si256
281 +                       ((void *)&rx_pkts[i + 4],
282 +                        _mm256_loadu_si256((void *)&sw_ring[i + 4]));
283 +#endif
284 +
285 +               __m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;
286 +
287 +               const __m128i raw_desc7 =
288 +                       _mm_load_si128((void *)(rxdp + 7));
289 +               rte_compiler_barrier();
290 +               const __m128i raw_desc6 =
291 +                       _mm_load_si128((void *)(rxdp + 6));
292 +               rte_compiler_barrier();
293 +               const __m128i raw_desc5 =
294 +                       _mm_load_si128((void *)(rxdp + 5));
295 +               rte_compiler_barrier();
296 +               const __m128i raw_desc4 =
297 +                       _mm_load_si128((void *)(rxdp + 4));
298 +               rte_compiler_barrier();
299 +               const __m128i raw_desc3 =
300 +                       _mm_load_si128((void *)(rxdp + 3));
301 +               rte_compiler_barrier();
302 +               const __m128i raw_desc2 =
303 +                       _mm_load_si128((void *)(rxdp + 2));
304 +               rte_compiler_barrier();
305 +               const __m128i raw_desc1 =
306 +                       _mm_load_si128((void *)(rxdp + 1));
307 +               rte_compiler_barrier();
308 +               const __m128i raw_desc0 =
309 +                       _mm_load_si128((void *)(rxdp + 0));
310 +
311 +               raw_desc6_7 =
312 +                       _mm256_inserti128_si256
313 +                               (_mm256_castsi128_si256(raw_desc6),
314 +                                raw_desc7, 1);
315 +               raw_desc4_5 =
316 +                       _mm256_inserti128_si256
317 +                               (_mm256_castsi128_si256(raw_desc4),
318 +                                raw_desc5, 1);
319 +               raw_desc2_3 =
320 +                       _mm256_inserti128_si256
321 +                               (_mm256_castsi128_si256(raw_desc2),
322 +                                raw_desc3, 1);
323 +               raw_desc0_1 =
324 +                       _mm256_inserti128_si256
325 +                               (_mm256_castsi128_si256(raw_desc0),
326 +                                raw_desc1, 1);
327 +
328 +               if (split_packet) {
329 +                       int j;
330 +
331 +                       for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
332 +                               rte_mbuf_prefetch_part2(rx_pkts[i + j]);
333 +               }
334 +
335 +               /**
336 +                * convert descriptors 4-7 into mbufs, re-arrange fields.
337 +                * Then write into the mbuf.
338 +                */
339 +               __m256i mb6_7 = _mm256_shuffle_epi8(raw_desc6_7, shuf_msk);
340 +               __m256i mb4_5 = _mm256_shuffle_epi8(raw_desc4_5, shuf_msk);
341 +
342 +               mb6_7 = _mm256_add_epi16(mb6_7, crc_adjust);
343 +               mb4_5 = _mm256_add_epi16(mb4_5, crc_adjust);
344 +               /**
345 +                * to get packet types, ptype is located in bit16-25
346 +                * of each 128bits
347 +                */
348 +               const __m256i ptype_mask =
349 +                       _mm256_set1_epi16(IAVF_RX_FLEX_DESC_PTYPE_M);
350 +               const __m256i ptypes6_7 =
351 +                       _mm256_and_si256(raw_desc6_7, ptype_mask);
352 +               const __m256i ptypes4_5 =
353 +                       _mm256_and_si256(raw_desc4_5, ptype_mask);
354 +               const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
355 +               const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
356 +               const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
357 +               const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);
358 +
359 +               mb6_7 = _mm256_insert_epi32(mb6_7, type_table[ptype7], 4);
360 +               mb6_7 = _mm256_insert_epi32(mb6_7, type_table[ptype6], 0);
361 +               mb4_5 = _mm256_insert_epi32(mb4_5, type_table[ptype5], 4);
362 +               mb4_5 = _mm256_insert_epi32(mb4_5, type_table[ptype4], 0);
363 +               /* merge the status bits into one register */
364 +               const __m256i status4_7 = _mm256_unpackhi_epi32(raw_desc6_7,
365 +                               raw_desc4_5);
366 +
367 +               /**
368 +                * convert descriptors 0-3 into mbufs, re-arrange fields.
369 +                * Then write into the mbuf.
370 +                */
371 +               __m256i mb2_3 = _mm256_shuffle_epi8(raw_desc2_3, shuf_msk);
372 +               __m256i mb0_1 = _mm256_shuffle_epi8(raw_desc0_1, shuf_msk);
373 +
374 +               mb2_3 = _mm256_add_epi16(mb2_3, crc_adjust);
375 +               mb0_1 = _mm256_add_epi16(mb0_1, crc_adjust);
376 +               /**
377 +                * to get packet types, ptype is located in bit16-25
378 +                * of each 128bits
379 +                */
380 +               const __m256i ptypes2_3 =
381 +                       _mm256_and_si256(raw_desc2_3, ptype_mask);
382 +               const __m256i ptypes0_1 =
383 +                       _mm256_and_si256(raw_desc0_1, ptype_mask);
384 +               const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
385 +               const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
386 +               const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
387 +               const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);
388 +
389 +               mb2_3 = _mm256_insert_epi32(mb2_3, type_table[ptype3], 4);
390 +               mb2_3 = _mm256_insert_epi32(mb2_3, type_table[ptype2], 0);
391 +               mb0_1 = _mm256_insert_epi32(mb0_1, type_table[ptype1], 4);
392 +               mb0_1 = _mm256_insert_epi32(mb0_1, type_table[ptype0], 0);
393 +               /* merge the status bits into one register */
394 +               const __m256i status0_3 = _mm256_unpackhi_epi32(raw_desc2_3,
395 +                                                               raw_desc0_1);
396 +
397 +               /**
398 +                * take the two sets of status bits and merge to one
399 +                * After merge, the packets status flags are in the
400 +                * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
401 +                */
402 +               __m256i status0_7 = _mm256_unpacklo_epi64(status4_7,
403 +                                                         status0_3);
404 +
405 +               /* now do flag manipulation */
406 +
407 +               /* get only flag/error bits we want */
408 +               const __m256i flag_bits =
409 +                       _mm256_and_si256(status0_7, flags_mask);
410 +               /**
411 +                * l3_l4_error flags, shuffle, then shift to correct adjustment
412 +                * of flags in flags_shuf, and finally mask out extra bits
413 +                */
414 +               __m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
415 +                               _mm256_srli_epi32(flag_bits, 4));
416 +               l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
417 +               l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
418 +               /* set rss and vlan flags */
419 +               const __m256i rss_vlan_flag_bits =
420 +                       _mm256_srli_epi32(flag_bits, 12);
421 +               const __m256i rss_vlan_flags =
422 +                       _mm256_shuffle_epi8(rss_vlan_flags_shuf,
423 +                                           rss_vlan_flag_bits);
424 +
425 +               /* merge flags */
426 +               const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
427 +                               rss_vlan_flags);
428 +               /**
429 +                * At this point, we have the 8 sets of flags in the low 16-bits
430 +                * of each 32-bit value in vlan0.
431 +                * We want to extract these, and merge them with the mbuf init
432 +                * data so we can do a single write to the mbuf to set the flags
433 +                * and all the other initialization fields. Extracting the
434 +                * appropriate flags means that we have to do a shift and blend
435 +                * for each mbuf before we do the write. However, we can also
436 +                * add in the previously computed rx_descriptor fields to
437 +                * make a single 256-bit write per mbuf
438 +                */
439 +               /* check the structure matches expectations */
440 +               RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
441 +                                offsetof(struct rte_mbuf, rearm_data) + 8);
442 +               RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
443 +                                RTE_ALIGN(offsetof(struct rte_mbuf,
444 +                                                   rearm_data),
445 +                                          16));
446 +               /* build up data and do writes */
447 +               __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
448 +                       rearm6, rearm7;
449 +               rearm6 = _mm256_blend_epi32(mbuf_init,
450 +                                           _mm256_slli_si256(mbuf_flags, 8),
451 +                                           0x04);
452 +               rearm4 = _mm256_blend_epi32(mbuf_init,
453 +                                           _mm256_slli_si256(mbuf_flags, 4),
454 +                                           0x04);
455 +               rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
456 +               rearm0 = _mm256_blend_epi32(mbuf_init,
457 +                                           _mm256_srli_si256(mbuf_flags, 4),
458 +                                           0x04);
459 +               /* permute to add in the rx_descriptor e.g. rss fields */
460 +               rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
461 +               rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
462 +               rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
463 +               rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
464 +               /* write to mbuf */
465 +               _mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
466 +                                   rearm6);
467 +               _mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
468 +                                   rearm4);
469 +               _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
470 +                                   rearm2);
471 +               _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
472 +                                   rearm0);
473 +
474 +               /* repeat for the odd mbufs */
475 +               const __m256i odd_flags =
476 +                       _mm256_castsi128_si256
477 +                               (_mm256_extracti128_si256(mbuf_flags, 1));
478 +               rearm7 = _mm256_blend_epi32(mbuf_init,
479 +                                           _mm256_slli_si256(odd_flags, 8),
480 +                                           0x04);
481 +               rearm5 = _mm256_blend_epi32(mbuf_init,
482 +                                           _mm256_slli_si256(odd_flags, 4),
483 +                                           0x04);
484 +               rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
485 +               rearm1 = _mm256_blend_epi32(mbuf_init,
486 +                                           _mm256_srli_si256(odd_flags, 4),
487 +                                           0x04);
488 +               /* since odd mbufs are already in hi 128-bits use blend */
489 +               rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
490 +               rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
491 +               rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
492 +               rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
493 +               /* again write to mbufs */
494 +               _mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
495 +                                   rearm7);
496 +               _mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
497 +                                   rearm5);
498 +               _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
499 +                                   rearm3);
500 +               _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
501 +                                   rearm1);
502 +
503 +               /* extract and record EOP bit */
504 +               if (split_packet) {
505 +                       const __m128i eop_mask =
506 +                               _mm_set1_epi16(1 <<
507 +                                              IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
508 +                       const __m256i eop_bits256 = _mm256_and_si256(status0_7,
509 +                                                                    eop_check);
510 +                       /* pack status bits into a single 128-bit register */
511 +                       const __m128i eop_bits =
512 +                               _mm_packus_epi32
513 +                                       (_mm256_castsi256_si128(eop_bits256),
514 +                                        _mm256_extractf128_si256(eop_bits256,
515 +                                                                 1));
516 +                       /**
517 +                        * flip bits, and mask out the EOP bit, which is now
518 +                        * a split-packet bit i.e. !EOP, rather than EOP one.
519 +                        */
520 +                       __m128i split_bits = _mm_andnot_si128(eop_bits,
521 +                                       eop_mask);
522 +                       /**
523 +                        * eop bits are out of order, so we need to shuffle them
524 +                        * back into order again. In doing so, only use low 8
525 +                        * bits, which acts like another pack instruction
526 +                        * The original order is (hi->lo): 1,3,5,7,0,2,4,6
527 +                        * [Since we use epi8, the 16-bit positions are
528 +                        * multiplied by 2 in the eop_shuffle value.]
529 +                        */
530 +                       __m128i eop_shuffle =
531 +                               _mm_set_epi8(/* zero hi 64b */
532 +                                            0xFF, 0xFF, 0xFF, 0xFF,
533 +                                            0xFF, 0xFF, 0xFF, 0xFF,
534 +                                            /* move values to lo 64b */
535 +                                            8, 0, 10, 2,
536 +                                            12, 4, 14, 6);
537 +                       split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
538 +                       *(uint64_t *)split_packet =
539 +                               _mm_cvtsi128_si64(split_bits);
540 +                       split_packet += IAVF_DESCS_PER_LOOP_AVX;
541 +               }
542 +
543 +               /* perform dd_check */
544 +               status0_7 = _mm256_and_si256(status0_7, dd_check);
545 +               status0_7 = _mm256_packs_epi32(status0_7,
546 +                                              _mm256_setzero_si256());
547 +
548 +               uint64_t burst = __builtin_popcountll
549 +                                       (_mm_cvtsi128_si64
550 +                                               (_mm256_extracti128_si256
551 +                                                       (status0_7, 1)));
552 +               burst += __builtin_popcountll
553 +                               (_mm_cvtsi128_si64
554 +                                       (_mm256_castsi256_si128(status0_7)));
555 +               received += burst;
556 +               if (burst != IAVF_DESCS_PER_LOOP_AVX)
557 +                       break;
558 +       }
559 +
560 +       /* update tail pointers */
561 +       rxq->rx_tail += received;
562 +       rxq->rx_tail &= (rxq->nb_rx_desc - 1);
563 +       if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
564 +               rxq->rx_tail--;
565 +               received--;
566 +       }
567 +       rxq->rxrearm_nb += received;
568 +       return received;
569 +}
570 +
571  /**
572   * Notice:
573   * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
574 @@ -625,6 +1086,18 @@ iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
575         return _iavf_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
576  }
577  
578 +/**
579 + * Notice:
580 + * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
581 + */
582 +uint16_t
583 +iavf_recv_pkts_vec_avx2_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
584 +                                uint16_t nb_pkts)
585 +{
586 +       return _iavf_recv_raw_pkts_vec_avx2_flex_rxd(rx_queue, rx_pkts,
587 +                                                    nb_pkts, NULL);
588 +}
589 +
590  /**
591   * vPMD receive routine that reassembles single burst of 32 scattered packets
592   * Notice:
593 @@ -690,6 +1163,75 @@ iavf_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
594                                 rx_pkts + retval, nb_pkts);
595  }
596  
597 +/**
598 + * vPMD receive routine that reassembles single burst of
599 + * 32 scattered packets for flex RxD
600 + * Notice:
601 + * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
602 + */
603 +static uint16_t
604 +iavf_recv_scattered_burst_vec_avx2_flex_rxd(void *rx_queue,
605 +                                           struct rte_mbuf **rx_pkts,
606 +                                           uint16_t nb_pkts)
607 +{
608 +       struct iavf_rx_queue *rxq = rx_queue;
609 +       uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
610 +
611 +       /* get some new buffers */
612 +       uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx2_flex_rxd(rxq,
613 +                                       rx_pkts, nb_pkts, split_flags);
614 +       if (nb_bufs == 0)
615 +               return 0;
616 +
617 +       /* happy day case, full burst + no packets to be joined */
618 +       const uint64_t *split_fl64 = (uint64_t *)split_flags;
619 +
620 +       if (!rxq->pkt_first_seg &&
621 +           split_fl64[0] == 0 && split_fl64[1] == 0 &&
622 +           split_fl64[2] == 0 && split_fl64[3] == 0)
623 +               return nb_bufs;
624 +
625 +       /* reassemble any packets that need reassembly*/
626 +       unsigned int i = 0;
627 +
628 +       if (!rxq->pkt_first_seg) {
629 +               /* find the first split flag, and only reassemble then*/
630 +               while (i < nb_bufs && !split_flags[i])
631 +                       i++;
632 +               if (i == nb_bufs)
633 +                       return nb_bufs;
634 +               rxq->pkt_first_seg = rx_pkts[i];
635 +       }
636 +       return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
637 +                                            &split_flags[i]);
638 +}
639 +
640 +/**
641 + * vPMD receive routine that reassembles scattered packets for flex RxD.
642 + * Main receive routine that can handle arbitrary burst sizes
643 + * Notice:
644 + * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
645 + */
646 +uint16_t
647 +iavf_recv_scattered_pkts_vec_avx2_flex_rxd(void *rx_queue,
648 +                                          struct rte_mbuf **rx_pkts,
649 +                                          uint16_t nb_pkts)
650 +{
651 +       uint16_t retval = 0;
652 +
653 +       while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
654 +               uint16_t burst =
655 +                       iavf_recv_scattered_burst_vec_avx2_flex_rxd
656 +                       (rx_queue, rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST);
657 +               retval += burst;
658 +               nb_pkts -= burst;
659 +               if (burst < IAVF_VPMD_RX_MAX_BURST)
660 +                       return retval;
661 +       }
662 +       return retval + iavf_recv_scattered_burst_vec_avx2_flex_rxd(rx_queue,
663 +                               rx_pkts + retval, nb_pkts);
664 +}
665 +
666  static inline void
667  iavf_vtx1(volatile struct iavf_tx_desc *txdp,
668           struct rte_mbuf *pkt, uint64_t flags)
669 -- 
670 2.17.1
671
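
[Editorial note, not part of the upstream commit] The new _iavf_recv_raw_pkts_vec_avx2_flex_rxd() handles eight 16-byte flexible write-back descriptors per loop iteration: it gates on the DD bit of status_error0, shuffles data_len/pkt_len (octets 4-5), the VLAN tag (octets 10-11) and the 32-bit RSS hash (octets 12-15) directly into the mbuf rx_descriptor_fields1 area, resolves the 10-bit ptype through the adapter ptype table, and builds ol_flags (checksum, RSS, VLAN) from the status bits. A rough scalar equivalent of what the shuffle/blend sequence computes for a single descriptor is sketched below; struct and field names are assumed from the flex descriptor definitions added earlier in this series and are illustrative only:

	static inline void
	iavf_flex_desc_to_mbuf(volatile union iavf_rx_flex_desc *rxd,
			       struct rte_mbuf *mb,
			       const uint32_t *type_table, uint16_t crc_len)
	{
		/* assumed OVS comms profile layout (IAVF_RXDID_COMMS_OVS_1) */
		volatile struct iavf_32b_rx_flex_desc_comms_ovs *ovs =
			(volatile struct iavf_32b_rx_flex_desc_comms_ovs *)rxd;
		/* length field, octets 4-5; reserved upper bits assumed clear */
		uint16_t len = rte_le_to_cpu_16(rxd->wb.pkt_len) - crc_len;

		mb->pkt_len = len;
		mb->data_len = len;
		mb->vlan_tci = rte_le_to_cpu_16(rxd->wb.l2tag1);  /* octets 10-11 */
		mb->hash.rss = rte_le_to_cpu_32(ovs->rss_hash);   /* octets 12-15 */
		mb->packet_type =
			type_table[rte_le_to_cpu_16(rxd->wb.ptype_flex_flags0) &
				   IAVF_RX_FLEX_DESC_PTYPE_M];
	}

The vector code performs this for eight descriptors at once and then popcounts the packed DD bits, so the burst stops at the first descriptor the hardware has not yet written back.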