/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2008-2018 Cisco Systems, Inc.  All rights reserved.
 * Copyright 2007 Nuova Systems, Inc.  All rights reserved.
 */

#include <rte_mbuf.h>
#include <rte_ethdev_driver.h>

#include "enic_compat.h"
#include "rq_enet_desc.h"
#include "enic.h"
#include "enic_rxtx_common.h"

#include <x86intrin.h>

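/*
 * Scalar helper: convert one completed CQ descriptor into mbuf metadata
 * (lengths, packet type, offload flags). It handles the unaligned head and
 * the leftover tail of each burst; the vector loop below covers 8
 * descriptors per iteration.
 */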
static struct rte_mbuf *
rx_one(struct cq_enet_rq_desc *cqd, struct rte_mbuf *mb, struct enic *enic)
{
	bool tnl;

	*(uint64_t *)&mb->rearm_data = enic->mbuf_initializer;
	mb->data_len = cqd->bytes_written_flags &
		CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
	mb->pkt_len = mb->data_len;
	tnl = enic->overlay_offload && (cqd->completed_index_flags &
					CQ_ENET_RQ_DESC_FLAGS_FCOE) != 0;
	mb->packet_type =
		enic_cq_rx_flags_to_pkt_type((struct cq_desc *)cqd, tnl);
	enic_cq_rx_to_pkt_flags((struct cq_desc *)cqd, mb);
	/* Wipe the outer types set by enic_cq_rx_flags_to_pkt_type() */
	if (tnl) {
		mb->packet_type &= ~(RTE_PTYPE_L3_MASK |
				     RTE_PTYPE_L4_MASK);
	}
	return mb;
}

static uint16_t
enic_noscatter_vec_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
			     uint16_t nb_pkts)
{
	struct rte_mbuf **rx, **rxmb;
	uint16_t cq_idx, nb_rx, max_rx;
	struct cq_enet_rq_desc *cqd;
	struct rq_enet_desc *rqd;
	struct vnic_cq *cq;
	struct vnic_rq *rq;
	struct enic *enic;
	uint8_t color;

	rq = rx_queue;
	enic = vnic_dev_priv(rq->vdev);
	cq = &enic->cq[enic_cq_rq(enic, rq->index)];
	cq_idx = cq->to_clean;

	/*
	 * Fill up the reserve of free mbufs. Below, we restock the receive
	 * ring with these mbufs to avoid allocation failures.
	 */
	if (rq->num_free_mbufs == 0) {
		if (rte_mempool_get_bulk(rq->mp, (void **)rq->free_mbufs,
					 ENIC_RX_BURST_MAX))
			return 0;
		rq->num_free_mbufs = ENIC_RX_BURST_MAX;
	}
	/* Receive until the end of the ring, at most. */
	max_rx = RTE_MIN(nb_pkts, rq->num_free_mbufs);
	max_rx = RTE_MIN(max_rx, cq->ring.desc_count - cq_idx);
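	/*
	 * Stopping at the ring end keeps cq_idx from wrapping mid-burst;
	 * the wrap and the color-bit flip are handled after the receive
	 * loops below.
	 */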

	rxmb = rq->mbuf_ring + cq_idx;
	color = cq->last_color;
	cqd = (struct cq_enet_rq_desc *)(cq->ring.descs) + cq_idx;
	rx = rx_pkts;
	if (max_rx == 0 ||
	    (cqd->type_color & CQ_DESC_COLOR_MASK_NOSHIFT) == color)
		return 0;

	/* Step 1: Process one packet to do aligned 256-bit load below */
	if (cq_idx & 0x1) {
		if (unlikely(cqd->bytes_written_flags &
			     CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
			rte_pktmbuf_free(*rxmb++);
			rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
		} else {
			*rx++ = rx_one(cqd, *rxmb++, enic);
		}
		cqd++;
		max_rx--;
	}
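	/*
	 * Each descriptor is 16B, so an odd cq_idx leaves cqd only
	 * 16B-aligned. Consuming one descriptor first makes the pointer
	 * 32B-aligned for the _mm256_load_si256() calls in the main loop
	 * (assuming the descriptor ring base itself is 32B-aligned).
	 */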

	const __m256i mask =
		_mm256_set_epi8(/* Second descriptor */
			0xff, /* type_color */
			(CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
			 CQ_ENET_RQ_DESC_FLAGS_IPV4 |
			 CQ_ENET_RQ_DESC_FLAGS_IPV6 |
			 CQ_ENET_RQ_DESC_FLAGS_TCP |
			 CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
			0, 0, /* checksum_fcoe */
			0xff, 0xff, /* vlan */
			0x3f, 0xff, /* bytes_written_flags */
			0xff, 0xff, 0xff, 0xff, /* rss_hash */
			0xff, 0xff, /* q_number_rss_type_flags */
			0, 0, /* completed_index_flags */
			/* First descriptor */
			0xff, /* type_color */
			(CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
			 CQ_ENET_RQ_DESC_FLAGS_IPV4 |
			 CQ_ENET_RQ_DESC_FLAGS_IPV6 |
			 CQ_ENET_RQ_DESC_FLAGS_TCP |
			 CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
			0, 0, /* checksum_fcoe */
			0xff, 0xff, /* vlan */
			0x3f, 0xff, /* bytes_written_flags */
			0xff, 0xff, 0xff, 0xff, /* rss_hash */
			0xff, 0xff, /* q_number_rss_type_flags */
			0, 0 /* completed_index_flags */
			);
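	/*
	 * A CQ descriptor is 16 bytes, so one 256-bit register holds two.
	 * The mask above and the shuffle constants below therefore repeat
	 * the same 16-byte pattern in both 128-bit lanes.
	 */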
	const __m256i shuffle_mask =
		_mm256_set_epi8(/* Second descriptor */
			7, 6, 5, 4,             /* rss = rss_hash */
			11, 10,                 /* vlan_tci = vlan */
			9, 8,                   /* data_len = bytes_written */
			0x80, 0x80, 9, 8,       /* pkt_len = bytes_written */
			0x80, 0x80, 0x80, 0x80, /* packet_type = 0 */
			/* First descriptor */
			7, 6, 5, 4,             /* rss = rss_hash */
			11, 10,                 /* vlan_tci = vlan */
			9, 8,                   /* data_len = bytes_written */
			0x80, 0x80, 9, 8,       /* pkt_len = bytes_written */
			0x80, 0x80, 0x80, 0x80  /* packet_type = 0 */
			);
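	/*
	 * A 0x80 control byte makes _mm256_shuffle_epi8() write zero to
	 * that destination byte; that is how packet_type and the upper
	 * half of pkt_len are cleared above.
	 */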
	/* Used to collect 8 flags from 8 desc into one register */
	const __m256i flags_shuffle_mask =
		_mm256_set_epi8(/* Second descriptor */
			1, 3, 9, 14,
			1, 3, 9, 14,
			1, 3, 9, 14,
			1, 3, 9, 14,
			/* First descriptor */
			1, 3, 9, 14,
			1, 3, 9, 14,
			1, 3, 9, 14,
			/*
			 * Byte 3: upper byte of completed_index_flags
			 *         bit 5 = fcoe (tunnel)
			 * Byte 2: upper byte of q_number_rss_type_flags
			 *         bits 2,3,4,5 = rss type
			 *         bit 6 = csum_not_calc
			 * Byte 1: upper byte of bytes_written_flags
			 *         bit 6 = truncated
			 *         bit 7 = vlan stripped
			 * Byte 0: flags
			 */
			1, 3, 9, 14
			);
	/* Used to collect 8 VLAN IDs from 8 desc into one register */
	const __m256i vlan_shuffle_mask =
		_mm256_set_epi8(/* Second descriptor */
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			/* First descriptor */
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10);
	/* PKT_RX_RSS_HASH is 1<<1 so fits in 8-bit integer */
	const __m256i rss_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			0, /* rss_types = 0 */
			/* first 128 bits */
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
			0 /* rss_types = 0 */);
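	/*
	 * Index 0 is rss_type NONE (the NIC computed no hash) and maps to
	 * zero; every other 4-bit rss type sets PKT_RX_RSS_HASH.
	 */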
	/*
	 * VLAN offload flags.
	 * shuffle index:
	 * vlan_stripped => bit 0
	 * vlan_id == 0  => bit 1
	 */
	const __m256i vlan_shuffle =
		_mm256_set_epi32(0, 0, 0, 0,
			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, PKT_RX_VLAN);
	/* Use the same shuffle index as vlan_shuffle */
	const __m256i vlan_ptype_shuffle =
		_mm256_set_epi32(0, 0, 0, 0,
				 RTE_PTYPE_L2_ETHER,
				 RTE_PTYPE_L2_ETHER,
				 RTE_PTYPE_L2_ETHER,
				 RTE_PTYPE_L2_ETHER_VLAN);
	/*
	 * CKSUM flags. Shift right so they fit in 8-bit integers.
	 * shuffle index:
	 * ipv4_csum_ok    => bit 3
	 * ip4             => bit 2
	 * tcp_or_udp      => bit 1
	 * tcp_udp_csum_ok => bit 0
	 */
	const __m256i csum_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			/* 1111 ip4+ip4_ok+l4+l4_ok */
			((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
			/* 1110 ip4_ok+ip4+l4+!l4_ok */
			((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1),
			(PKT_RX_IP_CKSUM_GOOD >> 1), /* 1101 ip4+ip4_ok */
			(PKT_RX_IP_CKSUM_GOOD >> 1), /* 1100 ip4_ok+ip4 */
			(PKT_RX_L4_CKSUM_GOOD >> 1), /* 1011 l4+l4_ok */
			(PKT_RX_L4_CKSUM_BAD >> 1),  /* 1010 l4+!l4_ok */
			0, /* 1001 */
			0, /* 1000 */
			/* 0111 !ip4_ok+ip4+l4+l4_ok */
			((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> 1),
			/* 0110 !ip4_ok+ip4+l4+!l4_ok */
			((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1),
			(PKT_RX_IP_CKSUM_BAD >> 1),  /* 0101 !ip4_ok+ip4 */
			(PKT_RX_IP_CKSUM_BAD >> 1),  /* 0100 !ip4_ok+ip4 */
			(PKT_RX_L4_CKSUM_GOOD >> 1), /* 0011 l4+l4_ok */
			(PKT_RX_L4_CKSUM_BAD >> 1),  /* 0010 l4+!l4_ok */
			0, /* 0001 */
			0, /* 0000 */
			/* first 128 bits */
			((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
			((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1),
			(PKT_RX_IP_CKSUM_GOOD >> 1),
			(PKT_RX_IP_CKSUM_GOOD >> 1),
			(PKT_RX_L4_CKSUM_GOOD >> 1),
			(PKT_RX_L4_CKSUM_BAD >> 1),
			0, 0,
			((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> 1),
			((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1),
			(PKT_RX_IP_CKSUM_BAD >> 1),
			(PKT_RX_IP_CKSUM_BAD >> 1),
			(PKT_RX_L4_CKSUM_GOOD >> 1),
			(PKT_RX_L4_CKSUM_BAD >> 1),
			0, 0);
	/*
	 * Non-fragment PTYPEs.
	 * Shuffle 4-bit index:
	 * ip6 => bit 0
	 * ip4 => bit 1
	 * udp => bit 2
	 * tcp => bit 3
	 *   bit
	 * 3 2 1 0
	 * -------
	 * 0 0 0 0 unknown
	 * 0 0 0 1 ip6 | nonfrag
	 * 0 0 1 0 ip4 | nonfrag
	 * 0 0 1 1 unknown
	 * 0 1 0 0 unknown
	 * 0 1 0 1 ip6 | udp
	 * 0 1 1 0 ip4 | udp
	 * 0 1 1 1 unknown
	 * 1 0 0 0 unknown
	 * 1 0 0 1 ip6 | tcp
	 * 1 0 1 0 ip4 | tcp
	 * 1 0 1 1 unknown
	 * 1 1 0 0 unknown
	 * 1 1 0 1 unknown
	 * 1 1 1 0 unknown
	 * 1 1 1 1 unknown
	 *
	 * PTYPEs do not fit in 8 bits, so shift right 4 bits.
	 */
	const __m256i nonfrag_ptype_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_NONFRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_NONFRAG) >> 4,
			RTE_PTYPE_UNKNOWN,
			/* first 128 bits */
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_NONFRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_NONFRAG) >> 4,
			RTE_PTYPE_UNKNOWN);
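	/*
	 * The entries above are stored shifted right by 4; the main loop
	 * restores them with _mm256_slli_epi32(ptype, 4) after the shuffle.
	 */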
	/* Fragment PTYPEs. Use the same shuffle index as above. */
	const __m256i frag_ptype_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN,
			/* first 128 bits */
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN);
	/*
	 * Tunnel PTYPEs. Use the same shuffle index as above.
	 * L4 types are not part of this table. They come from non-tunnel
	 * types above.
	 */
	const __m256i tnl_l3_ptype_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN,
			/* first 128 bits */
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN);
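	/*
	 * Inner L3 types are stored shifted right by 16 and restored with
	 * a 16-bit left shift in the loop, mirroring the 4-bit trick used
	 * for the tables above.
	 */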

	const __m256i mbuf_init = _mm256_set_epi64x(0, enic->mbuf_initializer,
						    0, enic->mbuf_initializer);
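	/*
	 * mbuf_initializer holds the constant rearm fields (data_off,
	 * refcnt, nb_segs, port; see the layout below). Duplicating it in
	 * both lanes lets one register seed a pair of mbufs.
	 */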

	/*
	 * --- cq desc fields ---    offset
	 * completed_index_flags    - 0   use: fcoe
	 * q_number_rss_type_flags  - 2   use: rss types, csum_not_calc
	 * rss_hash                 - 4   ==> mbuf.hash.rss
	 * bytes_written_flags      - 8   ==> mbuf.pkt_len,data_len
	 *                                use: truncated, vlan_stripped
	 * vlan                     - 10  ==> mbuf.vlan_tci
	 * checksum_fcoe            - 12  (unused)
	 * flags                    - 14  use: all bits
	 * type_color               - 15  (unused)
	 *
	 * --- mbuf fields ---       offset
	 * rearm_data              ---- 16
	 * data_off    - 0      (mbuf_init) -+
	 * refcnt      - 2      (mbuf_init)  |
	 * nb_segs     - 4      (mbuf_init)  | 16B 128b
	 * port        - 6      (mbuf_init)  |
	 * ol_flag     - 8      (from cqd)  -+
	 * rx_descriptor_fields1   ---- 32
	 * packet_type - 0      (from cqd)  -+
	 * pkt_len     - 4      (from cqd)   |
	 * data_len    - 8      (from cqd)   | 16B 128b
	 * vlan_tci    - 10     (from cqd)   |
	 * rss         - 12     (from cqd)  -+
	 */

	__m256i overlay_enabled =
		_mm256_set1_epi32((uint32_t)enic->overlay_offload);

	/* Step 2: Process 8 packets per loop using SIMD */
	while (max_rx > 7 && (((cqd + 7)->type_color &
			       CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
		/* Load 8 16B CQ descriptors */
		__m256i cqd01 = _mm256_load_si256((void *)cqd);
		__m256i cqd23 = _mm256_load_si256((void *)(cqd + 2));
		__m256i cqd45 = _mm256_load_si256((void *)(cqd + 4));
		__m256i cqd67 = _mm256_load_si256((void *)(cqd + 6));
		/* Copy 8 mbuf pointers to rx_pkts */
		_mm256_storeu_si256((void *)rx,
				    _mm256_loadu_si256((void *)rxmb));
		_mm256_storeu_si256((void *)(rx + 4),
				    _mm256_loadu_si256((void *)(rxmb + 4)));

		/*
		 * Collect 8 flags (each 32 bits) into one register.
		 * 4 shuffles, 3 blends, 1 permute for 8 desc: 1 inst/desc
		 */
		__m256i flags01 =
			_mm256_shuffle_epi8(cqd01, flags_shuffle_mask);
		/*
		 * Shuffle above produces 8 x 32-bit flags for 8 descriptors
		 * in this order: 0, 0, 0, 0, 1, 1, 1, 1
		 * The duplicates in each 128-bit lane simplify the blending
		 * below.
		 */
		__m256i flags23 =
			_mm256_shuffle_epi8(cqd23, flags_shuffle_mask);
		__m256i flags45 =
			_mm256_shuffle_epi8(cqd45, flags_shuffle_mask);
		__m256i flags67 =
			_mm256_shuffle_epi8(cqd67, flags_shuffle_mask);
		/* 1st blend produces flags for desc: 0, 2, 0, 0, 1, 3, 1, 1 */
		__m256i flags0_3 = _mm256_blend_epi32(flags01, flags23, 0x22);
		/* 2nd blend produces flags for desc: 4, 4, 4, 6, 5, 5, 5, 7 */
		__m256i flags4_7 = _mm256_blend_epi32(flags45, flags67, 0x88);
		/* 3rd blend produces flags for desc: 0, 2, 4, 6, 1, 3, 5, 7 */
		__m256i flags0_7 = _mm256_blend_epi32(flags0_3, flags4_7, 0xcc);
		/*
		 * Swap to reorder flags in this order: 1, 3, 5, 7, 0, 2, 4, 6
		 * This order simplifies the blend operations further below
		 * that produce the 'rearm' data for each mbuf.
		 */
		flags0_7 = _mm256_permute4x64_epi64(flags0_7,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);

		/*
		 * Check truncated bits and bail out early.
		 * 6 avx inst, 1 or, 1 if-then-else for 8 desc: 1 inst/desc
		 */
		__m256i trunc =
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 17), 31);
		trunc = _mm256_add_epi64(trunc, _mm256_permute4x64_epi64(trunc,
			(1 << 6) + (0 << 4) + (3 << 2) + 2));
		/* 0:63 contains 1+3+0+2 and 64:127 contains 5+7+4+6 */
		if (_mm256_extract_epi64(trunc, 0) ||
		    _mm256_extract_epi64(trunc, 1))
			break;
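		/*
		 * Truncated packets are left to the scalar tail loop
		 * (Step 3 below), which frees their mbufs and counts them
		 * as errors.
		 */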

		/*
		 * Compute PKT_RX_RSS_HASH.
		 * Use 2 shifts and 1 shuffle for 8 desc: 0.375 inst/desc
		 * RSS types in byte 0, 4, 8, 12, 16, 20, 24, 28
		 * Everything else is zero.
		 */
		__m256i rss_types =
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 10), 28);
		/*
		 * RSS flags (PKT_RX_RSS_HASH) are in
		 * byte 0, 4, 8, 12, 16, 20, 24, 28
		 * Everything else is zero.
		 */
		__m256i rss_flags = _mm256_shuffle_epi8(rss_shuffle, rss_types);

		/*
		 * Compute CKSUM flags. First build the index and then
		 * use it to shuffle csum_shuffle.
		 * 20 instructions including const loads: 2.5 inst/desc
		 */
		/*
		 * csum_not_calc (bit 22)
		 * csum_not_calc (0) => 0xffffffff
		 * csum_not_calc (1) => 0x0
		 */
		const __m256i zero4 = _mm256_setzero_si256();
		const __m256i mask22 = _mm256_set1_epi32(0x400000);
		__m256i csum_not_calc = _mm256_cmpeq_epi32(zero4,
			_mm256_and_si256(flags0_7, mask22));
		/*
		 * (tcp|udp) && !fragment => bit 1
		 * tcp = bit 2, udp = bit 1, frag = bit 6
		 */
		const __m256i mask1 = _mm256_set1_epi32(0x2);
		__m256i tcp_udp =
			_mm256_andnot_si256(_mm256_srli_epi32(flags0_7, 5),
				_mm256_or_si256(flags0_7,
					_mm256_srli_epi32(flags0_7, 1)));
		tcp_udp = _mm256_and_si256(tcp_udp, mask1);
		/* ipv4 (bit 5) => bit 2 */
		const __m256i mask2 = _mm256_set1_epi32(0x4);
		__m256i ipv4 = _mm256_and_si256(mask2,
			_mm256_srli_epi32(flags0_7, 3));
		/*
		 * ipv4_csum_ok (bit 3) => bit 3
		 * tcp_udp_csum_ok (bit 0) => bit 0
		 * 0x9
		 */
		const __m256i mask0_3 = _mm256_set1_epi32(0x9);
		__m256i csum_idx = _mm256_and_si256(flags0_7, mask0_3);
		csum_idx = _mm256_and_si256(csum_not_calc,
			_mm256_or_si256(_mm256_or_si256(csum_idx, ipv4),
				tcp_udp));
		__m256i csum_flags =
			_mm256_shuffle_epi8(csum_shuffle, csum_idx);
		/* Shift left to restore CKSUM flags. See csum_shuffle. */
		csum_flags = _mm256_slli_epi32(csum_flags, 1);
		/* Combine csum flags and offload flags: 0.125 inst/desc */
		rss_flags = _mm256_or_si256(rss_flags, csum_flags);

		/*
		 * Collect 8 VLAN IDs and compute vlan_id != 0 on each.
		 * 4 shuffles, 3 blends, 1 permute, 1 cmp, 1 sub for 8 desc:
		 * 1.25 inst/desc
		 */
		__m256i vlan01 = _mm256_shuffle_epi8(cqd01, vlan_shuffle_mask);
		__m256i vlan23 = _mm256_shuffle_epi8(cqd23, vlan_shuffle_mask);
		__m256i vlan45 = _mm256_shuffle_epi8(cqd45, vlan_shuffle_mask);
		__m256i vlan67 = _mm256_shuffle_epi8(cqd67, vlan_shuffle_mask);
		__m256i vlan0_3 = _mm256_blend_epi32(vlan01, vlan23, 0x22);
		__m256i vlan4_7 = _mm256_blend_epi32(vlan45, vlan67, 0x88);
		/* desc: 0, 2, 4, 6, 1, 3, 5, 7 */
		__m256i vlan0_7 = _mm256_blend_epi32(vlan0_3, vlan4_7, 0xcc);
		/* desc: 1, 3, 5, 7, 0, 2, 4, 6 */
		vlan0_7 = _mm256_permute4x64_epi64(vlan0_7,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
		/*
		 * Compare 0 == vlan_id produces 0xffffffff (-1) if
		 * vlan 0 and 0 if vlan non-0. Then subtracting the
		 * result from 0 produces 0 - (-1) = 1 for vlan 0, and
		 * 0 - 0 = 0 for vlan non-0.
		 */
		vlan0_7 = _mm256_cmpeq_epi32(zero4, vlan0_7);
		/* vlan_id != 0 => 0, vlan_id == 0 => 1 */
		vlan0_7 = _mm256_sub_epi32(zero4, vlan0_7);

		/*
		 * Compute PKT_RX_VLAN and PKT_RX_VLAN_STRIPPED.
		 * Use 3 shifts, 1 or, 1 shuffle for 8 desc: 0.625 inst/desc
		 * VLAN offload flags in byte 0, 4, 8, 12, 16, 20, 24, 28
		 * Everything else is zero.
		 */
		__m256i vlan_idx =
			_mm256_or_si256(/* vlan_stripped => bit 0 */
				_mm256_srli_epi32(_mm256_slli_epi32(flags0_7,
					16), 31),
				/* (vlan_id == 0) => bit 1 */
				_mm256_slli_epi32(vlan0_7, 1));
		/*
		 * The index captures 4 cases.
		 * stripped, id == 0  ==> 11b = 3
		 * stripped, id != 0  ==> 01b = 1
		 * not strip, id == 0 ==> 10b = 2
		 * not strip, id != 0 ==> 00b = 0
		 */
		__m256i vlan_flags = _mm256_permutevar8x32_epi32(vlan_shuffle,
			vlan_idx);
		/* Combine vlan and offload flags: 0.125 inst/desc */
		rss_flags = _mm256_or_si256(rss_flags, vlan_flags);

		/*
		 * Compute non-tunnel PTYPEs.
		 * 17 inst / 8 desc = 2.125 inst/desc
		 */
		/* ETHER and ETHER_VLAN */
		__m256i vlan_ptype =
			_mm256_permutevar8x32_epi32(vlan_ptype_shuffle,
				vlan_idx);
		/* Build the ptype index from flags */
		tcp_udp = _mm256_slli_epi32(flags0_7, 29);
		tcp_udp = _mm256_slli_epi32(_mm256_srli_epi32(tcp_udp, 30), 2);
		__m256i ip4_ip6 =
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 26), 30);
		__m256i ptype_idx = _mm256_or_si256(tcp_udp, ip4_ip6);
		__m256i frag_bit =
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 25), 31);
		__m256i nonfrag_ptype =
			_mm256_shuffle_epi8(nonfrag_ptype_shuffle, ptype_idx);
		__m256i frag_ptype =
			_mm256_shuffle_epi8(frag_ptype_shuffle, ptype_idx);
		/*
		 * Zero out the unwanted types and combine the remaining bits.
		 * The effect is the same as selecting non-frag or frag types
		 * depending on the frag bit.
		 */
		nonfrag_ptype = _mm256_and_si256(nonfrag_ptype,
			_mm256_cmpeq_epi32(zero4, frag_bit));
		frag_ptype = _mm256_and_si256(frag_ptype,
			_mm256_cmpgt_epi32(frag_bit, zero4));
		__m256i ptype = _mm256_or_si256(nonfrag_ptype, frag_ptype);
		ptype = _mm256_slli_epi32(ptype, 4);
		/*
		 * Compute tunnel PTYPEs.
		 * 15 inst / 8 desc = 1.875 inst/desc
		 */
		__m256i tnl_l3_ptype =
			_mm256_shuffle_epi8(tnl_l3_ptype_shuffle, ptype_idx);
		tnl_l3_ptype = _mm256_slli_epi32(tnl_l3_ptype, 16);
		/*
		 * Shift non-tunnel L4 types to make them tunnel types.
		 * RTE_PTYPE_L4_TCP << 16 == RTE_PTYPE_INNER_L4_TCP
		 */
		__m256i tnl_l4_ptype =
			_mm256_slli_epi32(_mm256_and_si256(ptype,
				_mm256_set1_epi32(RTE_PTYPE_L4_MASK)), 16);
		__m256i tnl_ptype =
			_mm256_or_si256(tnl_l3_ptype, tnl_l4_ptype);
		tnl_ptype = _mm256_or_si256(tnl_ptype,
			_mm256_set1_epi32(RTE_PTYPE_TUNNEL_GRENAT |
				RTE_PTYPE_INNER_L2_ETHER));
		/*
		 * Select non-tunnel or tunnel types by zeroing out the
		 * unwanted ones.
		 */
		__m256i tnl_flags = _mm256_and_si256(overlay_enabled,
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 2), 31));
		tnl_ptype = _mm256_and_si256(tnl_ptype,
			_mm256_sub_epi32(zero4, tnl_flags));
		ptype = _mm256_and_si256(ptype,
			_mm256_cmpeq_epi32(zero4, tnl_flags));
		/*
		 * Combine types and swap to have ptypes in the same order
		 * as desc.
		 * desc: 0 2 4 6 1 3 5 7
		 * 3 inst / 8 desc = 0.375 inst/desc
		 */
		ptype = _mm256_or_si256(ptype, tnl_ptype);
		ptype = _mm256_or_si256(ptype, vlan_ptype);
		ptype = _mm256_permute4x64_epi64(ptype,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);

		/*
		 * Mask packet length.
		 * Use 4 ands: 0.5 instructions/desc
		 */
		cqd01 = _mm256_and_si256(cqd01, mask);
		cqd23 = _mm256_and_si256(cqd23, mask);
		cqd45 = _mm256_and_si256(cqd45, mask);
		cqd67 = _mm256_and_si256(cqd67, mask);
		/*
		 * Shuffle. Two 16B sets of the mbuf fields.
		 * packet_type, pkt_len, data_len, vlan_tci, rss
		 */
		__m256i rearm01 = _mm256_shuffle_epi8(cqd01, shuffle_mask);
		__m256i rearm23 = _mm256_shuffle_epi8(cqd23, shuffle_mask);
		__m256i rearm45 = _mm256_shuffle_epi8(cqd45, shuffle_mask);
		__m256i rearm67 = _mm256_shuffle_epi8(cqd67, shuffle_mask);

		/*
		 * Blend in ptypes
		 * 4 blends and 3 shuffles for 8 desc: 0.875 inst/desc
		 */
		rearm01 = _mm256_blend_epi32(rearm01, ptype, 0x11);
		rearm23 = _mm256_blend_epi32(rearm23,
			_mm256_shuffle_epi32(ptype, 1), 0x11);
		rearm45 = _mm256_blend_epi32(rearm45,
			_mm256_shuffle_epi32(ptype, 2), 0x11);
		rearm67 = _mm256_blend_epi32(rearm67,
			_mm256_shuffle_epi32(ptype, 3), 0x11);

		/*
		 * Move rss_flags into ol_flags in mbuf_init.
		 * Use 1 shift and 1 blend for each desc: 2 inst/desc
		 */
		__m256i mbuf_init4_5 = _mm256_blend_epi32(mbuf_init,
			rss_flags, 0x44);
		__m256i mbuf_init2_3 = _mm256_blend_epi32(mbuf_init,
			_mm256_slli_si256(rss_flags, 4), 0x44);
		__m256i mbuf_init0_1 = _mm256_blend_epi32(mbuf_init,
			_mm256_slli_si256(rss_flags, 8), 0x44);
		__m256i mbuf_init6_7 = _mm256_blend_epi32(mbuf_init,
			_mm256_srli_si256(rss_flags, 4), 0x44);

		/*
		 * Build rearm, one per desc.
		 * 8 blends and 4 permutes: 1.5 inst/desc
		 */
		__m256i rearm0 = _mm256_blend_epi32(rearm01,
			mbuf_init0_1, 0xf0);
		__m256i rearm1 = _mm256_blend_epi32(mbuf_init0_1,
			rearm01, 0xf0);
		__m256i rearm2 = _mm256_blend_epi32(rearm23,
			mbuf_init2_3, 0xf0);
		__m256i rearm3 = _mm256_blend_epi32(mbuf_init2_3,
			rearm23, 0xf0);
		/* Swap the lower and upper 128 bits */
		rearm0 = _mm256_permute4x64_epi64(rearm0,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
		rearm2 = _mm256_permute4x64_epi64(rearm2,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
		/* Second set of 4 descriptors */
		__m256i rearm4 = _mm256_blend_epi32(rearm45,
			mbuf_init4_5, 0xf0);
		__m256i rearm5 = _mm256_blend_epi32(mbuf_init4_5,
			rearm45, 0xf0);
		__m256i rearm6 = _mm256_blend_epi32(rearm67,
			mbuf_init6_7, 0xf0);
		__m256i rearm7 = _mm256_blend_epi32(mbuf_init6_7,
			rearm67, 0xf0);
		rearm4 = _mm256_permute4x64_epi64(rearm4,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
		rearm6 = _mm256_permute4x64_epi64(rearm6,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);

		/*
		 * Write out 32B of mbuf fields.
		 * data_off    - off 0  (mbuf_init)
		 * refcnt      - 2      (mbuf_init)
		 * nb_segs     - 4      (mbuf_init)
		 * port        - 6      (mbuf_init)
		 * ol_flag     - 8      (from cqd)
		 * packet_type - 16     (from cqd)
		 * pkt_len     - 20     (from cqd)
		 * data_len    - 24     (from cqd)
		 * vlan_tci    - 26     (from cqd)
		 * rss         - 28     (from cqd)
		 */
		_mm256_storeu_si256((__m256i *)&rxmb[0]->rearm_data, rearm0);
		_mm256_storeu_si256((__m256i *)&rxmb[1]->rearm_data, rearm1);
		_mm256_storeu_si256((__m256i *)&rxmb[2]->rearm_data, rearm2);
		_mm256_storeu_si256((__m256i *)&rxmb[3]->rearm_data, rearm3);
		_mm256_storeu_si256((__m256i *)&rxmb[4]->rearm_data, rearm4);
		_mm256_storeu_si256((__m256i *)&rxmb[5]->rearm_data, rearm5);
		_mm256_storeu_si256((__m256i *)&rxmb[6]->rearm_data, rearm6);
		_mm256_storeu_si256((__m256i *)&rxmb[7]->rearm_data, rearm7);

		max_rx -= 8;
		cqd += 8;
		rx += 8;
		rxmb += 8;
	}

	/*
	 * Step 3: Slow path to handle a small (<8) number of packets and
	 * occasional truncated packets.
	 */
	while (max_rx && ((cqd->type_color &
			   CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
		if (unlikely(cqd->bytes_written_flags &
			     CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
			rte_pktmbuf_free(*rxmb++);
			rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
		} else {
			*rx++ = rx_one(cqd, *rxmb++, enic);
		}
		cqd++;
		max_rx--;
	}

	/*
	 * Number of descriptors visited, including any truncated ones that
	 * were dropped; the count of delivered packets is rx - rx_pkts,
	 * returned at the end.
	 */
	nb_rx = cqd - (struct cq_enet_rq_desc *)(cq->ring.descs) - cq_idx;
	if (nb_rx == 0)
		return 0;
	rqd = ((struct rq_enet_desc *)rq->ring.descs) + cq_idx;
	rxmb = rq->mbuf_ring + cq_idx;
	cq_idx += nb_rx;
	rq->rx_nb_hold += nb_rx;
	if (unlikely(cq_idx == cq->ring.desc_count)) {
		cq_idx = 0;
		cq->last_color ^= CQ_DESC_COLOR_MASK_NOSHIFT;
	}
	cq->to_clean = cq_idx;

	/* Step 4: Restock RQ with new mbufs */
	memcpy(rxmb, rq->free_mbufs + ENIC_RX_BURST_MAX - rq->num_free_mbufs,
	       sizeof(struct rte_mbuf *) * nb_rx);
	rq->num_free_mbufs -= nb_rx;
	while (nb_rx) {
		rqd->address = (*rxmb)->buf_iova + RTE_PKTMBUF_HEADROOM;
		nb_rx--;
		rqd++;
		rxmb++;
	}
	if (rq->rx_nb_hold > rq->rx_free_thresh) {
		rq->posted_index = enic_ring_add(rq->ring.desc_count,
						 rq->posted_index,
						 rq->rx_nb_hold);
		rq->rx_nb_hold = 0;
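		/*
		 * Make the descriptor writes above visible to the NIC
		 * before the doorbell: rte_wmb() orders them ahead of the
		 * posted_index write.
		 */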
		rte_wmb();
		iowrite32_relaxed(rq->posted_index,
				  &rq->ctrl->posted_index);
	}

	return rx - rx_pkts;
}

bool
enic_use_vector_rx_handler(struct enic *enic)
{
	struct rte_eth_dev *eth_dev;
	struct rte_fdir_conf *fconf;

	eth_dev = enic->rte_dev;
	/* The user must explicitly request the avx2 handler */
	if (!enic->enable_avx2_rx)
		return false;
	/* Do not support scatter Rx */
	if (!(enic->rq_count > 0 && enic->rq[0].data_queue_enable == 0))
		return false;
	/* Do not support fdir/flow */
	fconf = &eth_dev->data->dev_conf.fdir_conf;
	if (fconf->mode != RTE_FDIR_MODE_NONE)
		return false;
	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
		PMD_INIT_LOG(DEBUG, " use the non-scatter avx2 Rx handler");
		eth_dev->rx_pkt_burst = &enic_noscatter_vec_recv_pkts;
		return true;
	}
	return false;
}