1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <sys/queue.h>
35
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <errno.h>
40 #include <stdint.h>
41 #include <stdarg.h>
42 #include <inttypes.h>
43
44 #include <rte_interrupts.h>
45 #include <rte_byteorder.h>
46 #include <rte_common.h>
47 #include <rte_log.h>
48 #include <rte_debug.h>
49 #include <rte_pci.h>
50 #include <rte_memory.h>
51 #include <rte_memcpy.h>
52 #include <rte_memzone.h>
53 #include <rte_launch.h>
54 #include <rte_eal.h>
55 #include <rte_per_lcore.h>
56 #include <rte_lcore.h>
57 #include <rte_atomic.h>
58 #include <rte_branch_prediction.h>
59 #include <rte_ring.h>
60 #include <rte_mempool.h>
61 #include <rte_malloc.h>
62 #include <rte_mbuf.h>
63 #include <rte_ether.h>
64 #include <rte_ethdev.h>
65 #include <rte_prefetch.h>
66 #include <rte_udp.h>
67 #include <rte_tcp.h>
68 #include <rte_sctp.h>
69 #include <rte_string_fns.h>
70
71 #include "e1000_logs.h"
72 #include "base/e1000_api.h"
73 #include "e1000_ethdev.h"
74
75 /* Bit Mask to indicate what bits required for building TX context */
76 #define IGB_TX_OFFLOAD_MASK (                    \
77                 PKT_TX_VLAN_PKT |                \
78                 PKT_TX_IP_CKSUM |                \
79                 PKT_TX_L4_MASK |                 \
80                 PKT_TX_TCP_SEG)
81
82 /**
83  * Structure associated with each descriptor of the RX ring of a RX queue.
84  */
85 struct igb_rx_entry {
86         struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
87 };
88
89 /**
90  * Structure associated with each descriptor of the TX ring of a TX queue.
91  */
92 struct igb_tx_entry {
93         struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
94         uint16_t next_id; /**< Index of next descriptor in ring. */
95         uint16_t last_id; /**< Index of last scattered descriptor. */
96 };
97
98 /**
99  * Structure associated with each RX queue.
100  */
101 struct igb_rx_queue {
102         struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
103         volatile union e1000_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
104         uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
105         volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
106         volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
107         struct igb_rx_entry *sw_ring;   /**< address of RX software ring. */
108         struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
109         struct rte_mbuf *pkt_last_seg;  /**< Last segment of current packet. */
110         uint16_t            nb_rx_desc; /**< number of RX descriptors. */
111         uint16_t            rx_tail;    /**< current value of RDT register. */
112         uint16_t            nb_rx_hold; /**< number of held free RX desc. */
113         uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
114         uint16_t            queue_id;   /**< RX queue index. */
115         uint16_t            reg_idx;    /**< RX queue register index. */
116         uint8_t             port_id;    /**< Device port identifier. */
117         uint8_t             pthresh;    /**< Prefetch threshold register. */
118         uint8_t             hthresh;    /**< Host threshold register. */
119         uint8_t             wthresh;    /**< Write-back threshold register. */
120         uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
121         uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En. */
122 };
123
124 /**
125  * Hardware context number
126  */
127 enum igb_advctx_num {
128         IGB_CTX_0    = 0, /**< CTX0    */
129         IGB_CTX_1    = 1, /**< CTX1    */
130         IGB_CTX_NUM  = 2, /**< CTX_NUM */
131 };
132
133 /** Offload features */
134 union igb_tx_offload {
135         uint64_t data;
136         struct {
137                 uint64_t l3_len:9; /**< L3 (IP) Header Length. */
138                 uint64_t l2_len:7; /**< L2 (MAC) Header Length. */
139                 uint64_t vlan_tci:16;  /**< VLAN Tag Control Identifier (CPU order). */
140                 uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */
141                 uint64_t tso_segsz:16; /**< TCP TSO segment size. */
142
143                 /* uint64_t unused:8; */
144         };
145 };
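/*
 * Illustrative sketch (not part of the driver): the TX path fills this
 * union per packet from the mbuf metadata before consulting the context
 * cache, as eth_igb_xmit_pkts() does further below:
 *
 *   union igb_tx_offload tx_offload = {0};
 *   tx_offload.l2_len    = tx_pkt->l2_len;
 *   tx_offload.l3_len    = tx_pkt->l3_len;
 *   tx_offload.l4_len    = tx_pkt->l4_len;
 *   tx_offload.vlan_tci  = tx_pkt->vlan_tci;
 *   tx_offload.tso_segsz = tx_pkt->tso_segsz;
 */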
146
147 /*
148  * Compare masks for igb_tx_offload.data; they must be kept in sync
149  * with the igb_tx_offload layout above.
150  */
151 #define TX_MACIP_LEN_CMP_MASK   0x000000000000FFFFULL /**< L2L3 header mask. */
152 #define TX_VLAN_CMP_MASK                0x00000000FFFF0000ULL /**< Vlan mask. */
153 #define TX_TCP_LEN_CMP_MASK             0x000000FF00000000ULL /**< TCP header mask. */
154 #define TX_TSO_MSS_CMP_MASK             0x00FFFF0000000000ULL /**< TSO segsz mask. */
155 /** Mac + IP + TCP + Mss mask. */
156 #define TX_TSO_CMP_MASK \
157         (TX_MACIP_LEN_CMP_MASK | TX_TCP_LEN_CMP_MASK | TX_TSO_MSS_CMP_MASK)
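/*
 * Illustrative sketch (hypothetical locals "cached" and "wanted", not part
 * of the driver): what_advctx_update() below uses these masks so that only
 * the offload fields relevant to the requested flags take part in the
 * context-cache comparison, e.g. for a VLAN-only packet:
 *
 *   union igb_tx_offload mask = { .data = TX_VLAN_CMP_MASK };
 *   int match = (cached.tx_offload.data == (mask.data & wanted.data));
 */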
158
159 /**
160  * Structure used to check whether a new context descriptor needs to be built.
161  */
162 struct igb_advctx_info {
163         uint64_t flags;           /**< ol_flags related to context build. */
164         /** tx offload: vlan, tso, l2-l3-l4 lengths. */
165         union igb_tx_offload tx_offload;
166         /** compare mask for tx offload. */
167         union igb_tx_offload tx_offload_mask;
168 };
169
170 /**
171  * Structure associated with each TX queue.
172  */
173 struct igb_tx_queue {
174         volatile union e1000_adv_tx_desc *tx_ring; /**< TX ring address */
175         uint64_t               tx_ring_phys_addr; /**< TX ring DMA address. */
176         struct igb_tx_entry    *sw_ring; /**< virtual address of SW ring. */
177         volatile uint32_t      *tdt_reg_addr; /**< Address of TDT register. */
178         uint32_t               txd_type;      /**< Device-specific TXD type */
179         uint16_t               nb_tx_desc;    /**< number of TX descriptors. */
180         uint16_t               tx_tail; /**< Current value of TDT register. */
181         uint16_t               tx_head;
182         /**< Index of first used TX descriptor. */
183         uint16_t               queue_id; /**< TX queue index. */
184         uint16_t               reg_idx;  /**< TX queue register index. */
185         uint8_t                port_id;  /**< Device port identifier. */
186         uint8_t                pthresh;  /**< Prefetch threshold register. */
187         uint8_t                hthresh;  /**< Host threshold register. */
188         uint8_t                wthresh;  /**< Write-back threshold register. */
189         uint32_t               ctx_curr;
190         /**< Index of the currently used hardware context. */
191         uint32_t               ctx_start;
192         /**< Start context position for transmit queue. */
193         struct igb_advctx_info ctx_cache[IGB_CTX_NUM];
194         /**< Hardware context history.*/
195 };
196
197 #if 1
198 #define RTE_PMD_USE_PREFETCH
199 #endif
200
201 #ifdef RTE_PMD_USE_PREFETCH
202 #define rte_igb_prefetch(p)     rte_prefetch0(p)
203 #else
204 #define rte_igb_prefetch(p)     do {} while(0)
205 #endif
206
207 #ifdef RTE_PMD_PACKET_PREFETCH
208 #define rte_packet_prefetch(p) rte_prefetch1(p)
209 #else
210 #define rte_packet_prefetch(p)  do {} while(0)
211 #endif
212
213 /*
214  * Macro for VMDq feature for 1 GbE NIC.
215  */
216 #define E1000_VMOLR_SIZE                        (8)
217 #define IGB_TSO_MAX_HDRLEN                      (512)
218 #define IGB_TSO_MAX_MSS                         (9216)
219
220 /*********************************************************************
221  *
222  *  TX function
223  *
224  **********************************************************************/
225
226 /*
227  * The hardware has some limitations for TCP segmentation offload, so
228  * check whether the requested parameters are valid.
229  */
230 static inline uint64_t
231 check_tso_para(uint64_t ol_req, union igb_tx_offload ol_para)
232 {
233         if (!(ol_req & PKT_TX_TCP_SEG))
234                 return ol_req;
235         if ((ol_para.tso_segsz > IGB_TSO_MAX_MSS) || (ol_para.l2_len +
236                         ol_para.l3_len + ol_para.l4_len > IGB_TSO_MAX_HDRLEN)) {
237                 ol_req &= ~PKT_TX_TCP_SEG;
238                 ol_req |= PKT_TX_TCP_CKSUM;
239         }
240         return ol_req;
241 }
242
243 /*
244  * Advanced context descriptors are almost the same between igb and ixgbe.
245  * This is kept as a separate function to leave room for optimization here;
246  * rework is required to use the pre-defined values.
247  */
248
249 static inline void
250 igbe_set_xmit_ctx(struct igb_tx_queue* txq,
251                 volatile struct e1000_adv_tx_context_desc *ctx_txd,
252                 uint64_t ol_flags, union igb_tx_offload tx_offload)
253 {
254         uint32_t type_tucmd_mlhl;
255         uint32_t mss_l4len_idx;
256         uint32_t ctx_idx, ctx_curr;
257         uint32_t vlan_macip_lens;
258         union igb_tx_offload tx_offload_mask;
259
260         ctx_curr = txq->ctx_curr;
261         ctx_idx = ctx_curr + txq->ctx_start;
262
263         tx_offload_mask.data = 0;
264         type_tucmd_mlhl = 0;
265
266         /* Specify which HW CTX to upload. */
267         mss_l4len_idx = (ctx_idx << E1000_ADVTXD_IDX_SHIFT);
268
269         if (ol_flags & PKT_TX_VLAN_PKT)
270                 tx_offload_mask.data |= TX_VLAN_CMP_MASK;
271
272         /* Check if TCP segmentation is required for this packet. */
273         if (ol_flags & PKT_TX_TCP_SEG) {
274                 /* implies IP cksum in IPv4 */
275                 if (ol_flags & PKT_TX_IP_CKSUM)
276                         type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4 |
277                                 E1000_ADVTXD_TUCMD_L4T_TCP |
278                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
279                 else
280                         type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV6 |
281                                 E1000_ADVTXD_TUCMD_L4T_TCP |
282                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
283
284                 tx_offload_mask.data |= TX_TSO_CMP_MASK;
285                 mss_l4len_idx |= tx_offload.tso_segsz << E1000_ADVTXD_MSS_SHIFT;
286                 mss_l4len_idx |= tx_offload.l4_len << E1000_ADVTXD_L4LEN_SHIFT;
287         } else { /* no TSO, check if hardware checksum is needed */
288                 if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_L4_MASK))
289                         tx_offload_mask.data |= TX_MACIP_LEN_CMP_MASK;
290
291                 if (ol_flags & PKT_TX_IP_CKSUM)
292                         type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4;
293
294                 switch (ol_flags & PKT_TX_L4_MASK) {
295                 case PKT_TX_UDP_CKSUM:
296                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP |
297                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
298                         mss_l4len_idx |= sizeof(struct udp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
299                         break;
300                 case PKT_TX_TCP_CKSUM:
301                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP |
302                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
303                         mss_l4len_idx |= sizeof(struct tcp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
304                         break;
305                 case PKT_TX_SCTP_CKSUM:
306                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP |
307                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
308                         mss_l4len_idx |= sizeof(struct sctp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
309                         break;
310                 default:
311                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_RSV |
312                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
313                         break;
314                 }
315         }
316
317         txq->ctx_cache[ctx_curr].flags = ol_flags;
318         txq->ctx_cache[ctx_curr].tx_offload.data =
319                 tx_offload_mask.data & tx_offload.data;
320         txq->ctx_cache[ctx_curr].tx_offload_mask = tx_offload_mask;
321
322         ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
323         vlan_macip_lens = (uint32_t)tx_offload.data;
324         ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
325         ctx_txd->mss_l4len_idx = rte_cpu_to_le_32(mss_l4len_idx);
326         ctx_txd->seqnum_seed = 0;
327 }
328
329 /*
330  * Check which hardware context can be used. Use the existing match
331  * or create a new context descriptor.
332  */
333 static inline uint32_t
334 what_advctx_update(struct igb_tx_queue *txq, uint64_t flags,
335                 union igb_tx_offload tx_offload)
336 {
337         /* If match with the current context */
338         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
339                 (txq->ctx_cache[txq->ctx_curr].tx_offload.data ==
340                 (txq->ctx_cache[txq->ctx_curr].tx_offload_mask.data & tx_offload.data)))) {
341                         return txq->ctx_curr;
342         }
343
344         /* If match with the second context */
345         txq->ctx_curr ^= 1;
346         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
347                 (txq->ctx_cache[txq->ctx_curr].tx_offload.data ==
348                 (txq->ctx_cache[txq->ctx_curr].tx_offload_mask.data & tx_offload.data)))) {
349                         return txq->ctx_curr;
350         }
351
352         /* Mismatch: no cached context matches, a new one must be built. */
353         return IGB_CTX_NUM;
354 }
355
356 static inline uint32_t
357 tx_desc_cksum_flags_to_olinfo(uint64_t ol_flags)
358 {
359         static const uint32_t l4_olinfo[2] = {0, E1000_ADVTXD_POPTS_TXSM};
360         static const uint32_t l3_olinfo[2] = {0, E1000_ADVTXD_POPTS_IXSM};
361         uint32_t tmp;
362
363         tmp  = l4_olinfo[(ol_flags & PKT_TX_L4_MASK)  != PKT_TX_L4_NO_CKSUM];
364         tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
365         tmp |= l4_olinfo[(ol_flags & PKT_TX_TCP_SEG) != 0];
366         return tmp;
367 }
368
369 static inline uint32_t
370 tx_desc_vlan_flags_to_cmdtype(uint64_t ol_flags)
371 {
372         uint32_t cmdtype;
373         static uint32_t vlan_cmd[2] = {0, E1000_ADVTXD_DCMD_VLE};
374         static uint32_t tso_cmd[2] = {0, E1000_ADVTXD_DCMD_TSE};
375         cmdtype = vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
376         cmdtype |= tso_cmd[(ol_flags & PKT_TX_TCP_SEG) != 0];
377         return cmdtype;
378 }
379
380 uint16_t
381 eth_igb_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
382                uint16_t nb_pkts)
383 {
384         struct igb_tx_queue *txq;
385         struct igb_tx_entry *sw_ring;
386         struct igb_tx_entry *txe, *txn;
387         volatile union e1000_adv_tx_desc *txr;
388         volatile union e1000_adv_tx_desc *txd;
389         struct rte_mbuf     *tx_pkt;
390         struct rte_mbuf     *m_seg;
391         uint64_t buf_dma_addr;
392         uint32_t olinfo_status;
393         uint32_t cmd_type_len;
394         uint32_t pkt_len;
395         uint16_t slen;
396         uint64_t ol_flags;
397         uint16_t tx_end;
398         uint16_t tx_id;
399         uint16_t tx_last;
400         uint16_t nb_tx;
401         uint64_t tx_ol_req;
402         uint32_t new_ctx = 0;
403         uint32_t ctx = 0;
404         union igb_tx_offload tx_offload = {0};
405
406         txq = tx_queue;
407         sw_ring = txq->sw_ring;
408         txr     = txq->tx_ring;
409         tx_id   = txq->tx_tail;
410         txe = &sw_ring[tx_id];
411
412         for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
413                 tx_pkt = *tx_pkts++;
414                 pkt_len = tx_pkt->pkt_len;
415
416                 RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);
417
418                 /*
419                  * The number of descriptors that must be allocated for a
420                  * packet is the number of segments of that packet, plus 1
421                  * Context Descriptor for the VLAN Tag Identifier, if any.
422                  * Determine the last TX descriptor to allocate in the TX ring
423                  * for the packet, starting from the current position (tx_id)
424                  * in the ring.
425                  */
426                 tx_last = (uint16_t) (tx_id + tx_pkt->nb_segs - 1);
427
428                 ol_flags = tx_pkt->ol_flags;
429                 tx_ol_req = ol_flags & IGB_TX_OFFLOAD_MASK;
430
431                 /* Check whether a Context Descriptor needs to be built. */
432                 if (tx_ol_req) {
433                         tx_offload.l2_len = tx_pkt->l2_len;
434                         tx_offload.l3_len = tx_pkt->l3_len;
435                         tx_offload.l4_len = tx_pkt->l4_len;
436                         tx_offload.vlan_tci = tx_pkt->vlan_tci;
437                         tx_offload.tso_segsz = tx_pkt->tso_segsz;
438                         tx_ol_req = check_tso_para(tx_ol_req, tx_offload);
439
440                         ctx = what_advctx_update(txq, tx_ol_req, tx_offload);
441                         /* Only allocate a context descriptor if required. */
442                         new_ctx = (ctx == IGB_CTX_NUM);
443                         ctx = txq->ctx_curr + txq->ctx_start;
444                         tx_last = (uint16_t) (tx_last + new_ctx);
445                 }
446                 if (tx_last >= txq->nb_tx_desc)
447                         tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
448
449                 PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
450                            " tx_first=%u tx_last=%u",
451                            (unsigned) txq->port_id,
452                            (unsigned) txq->queue_id,
453                            (unsigned) pkt_len,
454                            (unsigned) tx_id,
455                            (unsigned) tx_last);
456
457                 /*
458                  * Check if there are enough free descriptors in the TX ring
459                  * to transmit the next packet.
460                  * This operation is based on the two following rules:
461                  *
462                  *   1- Only check that the last needed TX descriptor can be
463                  *      allocated (by construction, if that descriptor is free,
464                  *      all intermediate ones are also free).
465                  *
466                  *      For this purpose, the index of the last TX descriptor
467                  *      used for a packet (the "last descriptor" of a packet)
468                  *      is recorded in the TX entries (the last one included)
469                  *      that are associated with all TX descriptors allocated
470                  *      for that packet.
471                  *
472                  *   2- Avoid allocating the last free TX descriptor of the
473                  *      ring, in order to never set the TDT register with the
474                  *      same value stored in parallel by the NIC in the TDH
475                  *      register, which makes the TX engine of the NIC enter
476                  *      in a deadlock situation.
477                  *
478                  *      By extension, avoid allocating a free descriptor that
479                  *      belongs to the last set of free descriptors allocated
480                  *      to the same packet previously transmitted.
481                  */
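                /*
                 * Illustrative walk-through (hypothetical values, not part of
                 * the driver): if tx_last == 100 and the packet that previously
                 * owned descriptor 100 ended at descriptor 103, then:
                 *   tx_end = sw_ring[100].last_id;  -> 103
                 *   tx_end = sw_ring[103].next_id;  -> 104
                 *   tx_end = sw_ring[104].last_id;  -> last descriptor of the
                 *                                      packet owning entry 104
                 * and the packet is only transmitted if that descriptor has its
                 * DD (Descriptor Done) bit set.
                 */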
482
483                 /*
484                  * The "last descriptor" of the packet, if any, that previously
485                  * used the descriptor we want to allocate as our last one.
486                  */
487                 tx_end = sw_ring[tx_last].last_id;
488
489                 /*
490                  * The next descriptor following that "last descriptor" in the
491                  * ring.
492                  */
493                 tx_end = sw_ring[tx_end].next_id;
494
495                 /*
496                  * The "last descriptor" associated with that next descriptor.
497                  */
498                 tx_end = sw_ring[tx_end].last_id;
499
500                 /*
501                  * Check that this descriptor is free.
502                  */
503                 if (! (txr[tx_end].wb.status & E1000_TXD_STAT_DD)) {
504                         if (nb_tx == 0)
505                                 return 0;
506                         goto end_of_tx;
507                 }
508
509                 /*
510                  * Set common flags of all TX Data Descriptors.
511                  *
512                  * The following bits must be set in all Data Descriptors:
513                  *   - E1000_ADVTXD_DTYP_DATA
514                  *   - E1000_ADVTXD_DCMD_DEXT
515                  *
516                  * The following bits must be set in the first Data Descriptor
517                  * and are ignored in the other ones:
518                  *   - E1000_ADVTXD_DCMD_IFCS
519                  *   - E1000_ADVTXD_MAC_1588
520                  *   - E1000_ADVTXD_DCMD_VLE
521                  *
522                  * The following bits must only be set in the last Data
523                  * Descriptor:
524                  *   - E1000_TXD_CMD_EOP
525                  *
526                  * The following bits can be set in any Data Descriptor, but
527                  * are only set in the last Data Descriptor:
528                  *   - E1000_TXD_CMD_RS
529                  */
530                 cmd_type_len = txq->txd_type |
531                         E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT;
532                 if (tx_ol_req & PKT_TX_TCP_SEG)
533                         pkt_len -= (tx_pkt->l2_len + tx_pkt->l3_len + tx_pkt->l4_len);
534                 olinfo_status = (pkt_len << E1000_ADVTXD_PAYLEN_SHIFT);
535 #if defined(RTE_LIBRTE_IEEE1588)
536                 if (ol_flags & PKT_TX_IEEE1588_TMST)
537                         cmd_type_len |= E1000_ADVTXD_MAC_TSTAMP;
538 #endif
539                 if (tx_ol_req) {
540                         /* Setup TX Advanced context descriptor if required */
541                         if (new_ctx) {
542                                 volatile struct e1000_adv_tx_context_desc *
543                                     ctx_txd;
544
545                                 ctx_txd = (volatile struct
546                                     e1000_adv_tx_context_desc *)
547                                     &txr[tx_id];
548
549                                 txn = &sw_ring[txe->next_id];
550                                 RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
551
552                                 if (txe->mbuf != NULL) {
553                                         rte_pktmbuf_free_seg(txe->mbuf);
554                                         txe->mbuf = NULL;
555                                 }
556
557                                 igbe_set_xmit_ctx(txq, ctx_txd, tx_ol_req, tx_offload);
558
559                                 txe->last_id = tx_last;
560                                 tx_id = txe->next_id;
561                                 txe = txn;
562                         }
563
564                         /* Setup the TX Advanced Data Descriptor */
565                         cmd_type_len  |= tx_desc_vlan_flags_to_cmdtype(tx_ol_req);
566                         olinfo_status |= tx_desc_cksum_flags_to_olinfo(tx_ol_req);
567                         olinfo_status |= (ctx << E1000_ADVTXD_IDX_SHIFT);
568                 }
569
570                 m_seg = tx_pkt;
571                 do {
572                         txn = &sw_ring[txe->next_id];
573                         txd = &txr[tx_id];
574
575                         if (txe->mbuf != NULL)
576                                 rte_pktmbuf_free_seg(txe->mbuf);
577                         txe->mbuf = m_seg;
578
579                         /*
580                          * Set up transmit descriptor.
581                          */
582                         slen = (uint16_t) m_seg->data_len;
583                         buf_dma_addr = rte_mbuf_data_dma_addr(m_seg);
584                         txd->read.buffer_addr =
585                                 rte_cpu_to_le_64(buf_dma_addr);
586                         txd->read.cmd_type_len =
587                                 rte_cpu_to_le_32(cmd_type_len | slen);
588                         txd->read.olinfo_status =
589                                 rte_cpu_to_le_32(olinfo_status);
590                         txe->last_id = tx_last;
591                         tx_id = txe->next_id;
592                         txe = txn;
593                         m_seg = m_seg->next;
594                 } while (m_seg != NULL);
595
596                 /*
597                  * The last packet data descriptor needs End Of Packet (EOP)
598                  * and Report Status (RS).
599                  */
600                 txd->read.cmd_type_len |=
601                         rte_cpu_to_le_32(E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS);
602         }
603  end_of_tx:
604         rte_wmb();
605
606         /*
607          * Set the Transmit Descriptor Tail (TDT).
608          */
609         E1000_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
610         PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
611                    (unsigned) txq->port_id, (unsigned) txq->queue_id,
612                    (unsigned) tx_id, (unsigned) nb_tx);
613         txq->tx_tail = tx_id;
614
615         return nb_tx;
616 }
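/*
 * Usage note (illustrative, not part of the driver): applications never call
 * eth_igb_xmit_pkts() directly; it is installed as the device's tx_pkt_burst
 * callback and reached through the generic ethdev API, e.g.:
 *
 *   uint16_t nb_sent = rte_eth_tx_burst(port_id, queue_id, pkts, nb_pkts);
 *
 * The return value is the number of packets actually queued, which can be
 * smaller than nb_pkts when the ring runs out of free descriptors.
 */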
617
618 /*********************************************************************
619  *
620  *  RX functions
621  *
622  **********************************************************************/
623 #define IGB_PACKET_TYPE_IPV4              0X01
624 #define IGB_PACKET_TYPE_IPV4_TCP          0X11
625 #define IGB_PACKET_TYPE_IPV4_UDP          0X21
626 #define IGB_PACKET_TYPE_IPV4_SCTP         0X41
627 #define IGB_PACKET_TYPE_IPV4_EXT          0X03
628 #define IGB_PACKET_TYPE_IPV4_EXT_SCTP     0X43
629 #define IGB_PACKET_TYPE_IPV6              0X04
630 #define IGB_PACKET_TYPE_IPV6_TCP          0X14
631 #define IGB_PACKET_TYPE_IPV6_UDP          0X24
632 #define IGB_PACKET_TYPE_IPV6_EXT          0X0C
633 #define IGB_PACKET_TYPE_IPV6_EXT_TCP      0X1C
634 #define IGB_PACKET_TYPE_IPV6_EXT_UDP      0X2C
635 #define IGB_PACKET_TYPE_IPV4_IPV6         0X05
636 #define IGB_PACKET_TYPE_IPV4_IPV6_TCP     0X15
637 #define IGB_PACKET_TYPE_IPV4_IPV6_UDP     0X25
638 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT     0X0D
639 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT_TCP 0X1D
640 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT_UDP 0X2D
641 #define IGB_PACKET_TYPE_MAX               0X80
642 #define IGB_PACKET_TYPE_MASK              0X7F
643 #define IGB_PACKET_TYPE_SHIFT             0X04
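/*
 * Worked example (illustrative, assuming the ETQF bit is not set): the packet
 * type occupies bits 4..10 of the descriptor's pkt_info field, so a raw value
 * of 0x110 becomes
 *   (0x110 >> IGB_PACKET_TYPE_SHIFT) & IGB_PACKET_TYPE_MASK == 0x11,
 * i.e. IGB_PACKET_TYPE_IPV4_TCP, which the table below maps to
 * RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP.
 */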
644 static inline uint32_t
645 igb_rxd_pkt_info_to_pkt_type(uint16_t pkt_info)
646 {
647         static const uint32_t
648                 ptype_table[IGB_PACKET_TYPE_MAX] __rte_cache_aligned = {
649                 [IGB_PACKET_TYPE_IPV4] = RTE_PTYPE_L2_ETHER |
650                         RTE_PTYPE_L3_IPV4,
651                 [IGB_PACKET_TYPE_IPV4_EXT] = RTE_PTYPE_L2_ETHER |
652                         RTE_PTYPE_L3_IPV4_EXT,
653                 [IGB_PACKET_TYPE_IPV6] = RTE_PTYPE_L2_ETHER |
654                         RTE_PTYPE_L3_IPV6,
655                 [IGB_PACKET_TYPE_IPV4_IPV6] = RTE_PTYPE_L2_ETHER |
656                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
657                         RTE_PTYPE_INNER_L3_IPV6,
658                 [IGB_PACKET_TYPE_IPV6_EXT] = RTE_PTYPE_L2_ETHER |
659                         RTE_PTYPE_L3_IPV6_EXT,
660                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT] = RTE_PTYPE_L2_ETHER |
661                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
662                         RTE_PTYPE_INNER_L3_IPV6_EXT,
663                 [IGB_PACKET_TYPE_IPV4_TCP] = RTE_PTYPE_L2_ETHER |
664                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP,
665                 [IGB_PACKET_TYPE_IPV6_TCP] = RTE_PTYPE_L2_ETHER |
666                         RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L4_TCP,
667                 [IGB_PACKET_TYPE_IPV4_IPV6_TCP] = RTE_PTYPE_L2_ETHER |
668                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
669                         RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_TCP,
670                 [IGB_PACKET_TYPE_IPV6_EXT_TCP] = RTE_PTYPE_L2_ETHER |
671                         RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L4_TCP,
672                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT_TCP] = RTE_PTYPE_L2_ETHER |
673                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
674                         RTE_PTYPE_INNER_L3_IPV6_EXT | RTE_PTYPE_INNER_L4_TCP,
675                 [IGB_PACKET_TYPE_IPV4_UDP] = RTE_PTYPE_L2_ETHER |
676                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_UDP,
677                 [IGB_PACKET_TYPE_IPV6_UDP] = RTE_PTYPE_L2_ETHER |
678                         RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L4_UDP,
679                 [IGB_PACKET_TYPE_IPV4_IPV6_UDP] =  RTE_PTYPE_L2_ETHER |
680                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
681                         RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_UDP,
682                 [IGB_PACKET_TYPE_IPV6_EXT_UDP] = RTE_PTYPE_L2_ETHER |
683                         RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L4_UDP,
684                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT_UDP] = RTE_PTYPE_L2_ETHER |
685                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
686                         RTE_PTYPE_INNER_L3_IPV6_EXT | RTE_PTYPE_INNER_L4_UDP,
687                 [IGB_PACKET_TYPE_IPV4_SCTP] = RTE_PTYPE_L2_ETHER |
688                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_SCTP,
689                 [IGB_PACKET_TYPE_IPV4_EXT_SCTP] = RTE_PTYPE_L2_ETHER |
690                         RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L4_SCTP,
691         };
692         if (unlikely(pkt_info & E1000_RXDADV_PKTTYPE_ETQF))
693                 return RTE_PTYPE_UNKNOWN;
694
695         pkt_info = (pkt_info >> IGB_PACKET_TYPE_SHIFT) & IGB_PACKET_TYPE_MASK;
696
697         return ptype_table[pkt_info];
698 }
699
700 static inline uint64_t
701 rx_desc_hlen_type_rss_to_pkt_flags(struct igb_rx_queue *rxq, uint32_t hl_tp_rs)
702 {
703         uint64_t pkt_flags = ((hl_tp_rs & 0x0F) == 0) ?  0 : PKT_RX_RSS_HASH;
704
705 #if defined(RTE_LIBRTE_IEEE1588)
706         static uint32_t ip_pkt_etqf_map[8] = {
707                 0, 0, 0, PKT_RX_IEEE1588_PTP,
708                 0, 0, 0, 0,
709         };
710
711         struct rte_eth_dev dev = rte_eth_devices[rxq->port_id];
712         struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev.data->dev_private);
713
714         /* EtherType is in bits 8:10 in Packet Type, and not in the default 0:2 */
715         if (hw->mac.type == e1000_i210)
716                 pkt_flags |= ip_pkt_etqf_map[(hl_tp_rs >> 12) & 0x07];
717         else
718                 pkt_flags |= ip_pkt_etqf_map[(hl_tp_rs >> 4) & 0x07];
719 #else
720         RTE_SET_USED(rxq);
721 #endif
722
723         return pkt_flags;
724 }
725
726 static inline uint64_t
727 rx_desc_status_to_pkt_flags(uint32_t rx_status)
728 {
729         uint64_t pkt_flags;
730
731         /* Check if VLAN present */
732         pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ?
733                 PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED : 0);
734
735 #if defined(RTE_LIBRTE_IEEE1588)
736         if (rx_status & E1000_RXD_STAT_TMST)
737                 pkt_flags = pkt_flags | PKT_RX_IEEE1588_TMST;
738 #endif
739         return pkt_flags;
740 }
741
742 static inline uint64_t
743 rx_desc_error_to_pkt_flags(uint32_t rx_status)
744 {
745         /*
746          * Bit 30: IPE, IPv4 checksum error
747  * Bit 29: L4I, L4 integrity error
748          */
749
750         static uint64_t error_to_pkt_flags_map[4] = {
751                 0,  PKT_RX_L4_CKSUM_BAD, PKT_RX_IP_CKSUM_BAD,
752                 PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
753         };
754         return error_to_pkt_flags_map[(rx_status >>
755                 E1000_RXD_ERR_CKSUM_BIT) & E1000_RXD_ERR_CKSUM_MSK];
756 }
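/*
 * Illustrative sketch (not part of the driver): the two error bits form a
 * 2-bit index into the table above, so a descriptor reporting both the IP and
 * L4 errors yields index 3 and therefore
 * PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD, while a clean descriptor yields
 * index 0 and no error flags.
 */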
757
758 uint16_t
759 eth_igb_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
760                uint16_t nb_pkts)
761 {
762         struct igb_rx_queue *rxq;
763         volatile union e1000_adv_rx_desc *rx_ring;
764         volatile union e1000_adv_rx_desc *rxdp;
765         struct igb_rx_entry *sw_ring;
766         struct igb_rx_entry *rxe;
767         struct rte_mbuf *rxm;
768         struct rte_mbuf *nmb;
769         union e1000_adv_rx_desc rxd;
770         uint64_t dma_addr;
771         uint32_t staterr;
772         uint32_t hlen_type_rss;
773         uint16_t pkt_len;
774         uint16_t rx_id;
775         uint16_t nb_rx;
776         uint16_t nb_hold;
777         uint64_t pkt_flags;
778
779         nb_rx = 0;
780         nb_hold = 0;
781         rxq = rx_queue;
782         rx_id = rxq->rx_tail;
783         rx_ring = rxq->rx_ring;
784         sw_ring = rxq->sw_ring;
785         while (nb_rx < nb_pkts) {
786                 /*
787                  * The order of operations here is important as the DD status
788                  * bit must not be read after any other descriptor fields.
789                  * rx_ring and rxdp are pointing to volatile data so the order
790                  * of accesses cannot be reordered by the compiler. If they were
791                  * not volatile, they could be reordered which could lead to
792                  * using invalid descriptor fields when read from rxd.
793                  */
794                 rxdp = &rx_ring[rx_id];
795                 staterr = rxdp->wb.upper.status_error;
796                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
797                         break;
798                 rxd = *rxdp;
799
800                 /*
801                  * End of packet.
802                  *
803                  * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
804                  * likely to be invalid and to be dropped by the various
805                  * validation checks performed by the network stack.
806                  *
807                  * Allocate a new mbuf to replenish the RX ring descriptor.
808                  * If the allocation fails:
809                  *    - arrange for that RX descriptor to be the first one
810                  *      being parsed the next time the receive function is
811                  *      invoked [on the same queue].
812                  *
813                  *    - Stop parsing the RX ring and return immediately.
814                  *
815                  * This policy does not drop the packet received in the RX
816                  * descriptor for which the allocation of a new mbuf failed.
817                  * Thus, it allows that packet to be retrieved later if
818                  * mbufs have been freed in the meantime.
819                  * As a side effect, holding RX descriptors instead of
820                  * systematically giving them back to the NIC may lead to
821                  * RX ring exhaustion situations.
822                  * However, the NIC can gracefully prevent such situations
823                  * from happening by sending specific "back-pressure" flow control
824                  * frames to its peer(s).
825                  */
826                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
827                            "staterr=0x%x pkt_len=%u",
828                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
829                            (unsigned) rx_id, (unsigned) staterr,
830                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
831
832                 nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
833                 if (nmb == NULL) {
834                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
835                                    "queue_id=%u", (unsigned) rxq->port_id,
836                                    (unsigned) rxq->queue_id);
837                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
838                         break;
839                 }
840
841                 nb_hold++;
842                 rxe = &sw_ring[rx_id];
843                 rx_id++;
844                 if (rx_id == rxq->nb_rx_desc)
845                         rx_id = 0;
846
847                 /* Prefetch next mbuf while processing current one. */
848                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
849
850                 /*
851                  * When next RX descriptor is on a cache-line boundary,
852                  * prefetch the next 4 RX descriptors and the next 8 pointers
853                  * to mbufs.
854                  */
855                 if ((rx_id & 0x3) == 0) {
856                         rte_igb_prefetch(&rx_ring[rx_id]);
857                         rte_igb_prefetch(&sw_ring[rx_id]);
858                 }
859
860                 rxm = rxe->mbuf;
861                 rxe->mbuf = nmb;
862                 dma_addr =
863                         rte_cpu_to_le_64(rte_mbuf_data_dma_addr_default(nmb));
864                 rxdp->read.hdr_addr = 0;
865                 rxdp->read.pkt_addr = dma_addr;
866
867                 /*
868                  * Initialize the returned mbuf.
869                  * 1) setup generic mbuf fields:
870                  *    - number of segments,
871                  *    - next segment,
872                  *    - packet length,
873                  *    - RX port identifier.
874                  * 2) integrate hardware offload data, if any:
875                  *    - RSS flag & hash,
876                  *    - IP checksum flag,
877                  *    - VLAN TCI, if any,
878                  *    - error flags.
879                  */
880                 pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
881                                       rxq->crc_len);
882                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
883                 rte_packet_prefetch((char *)rxm->buf_addr + rxm->data_off);
884                 rxm->nb_segs = 1;
885                 rxm->next = NULL;
886                 rxm->pkt_len = pkt_len;
887                 rxm->data_len = pkt_len;
888                 rxm->port = rxq->port_id;
889
890                 rxm->hash.rss = rxd.wb.lower.hi_dword.rss;
891                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
892                 /* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
893                 rxm->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
894
895                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(rxq, hlen_type_rss);
896                 pkt_flags = pkt_flags | rx_desc_status_to_pkt_flags(staterr);
897                 pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
898                 rxm->ol_flags = pkt_flags;
899                 rxm->packet_type = igb_rxd_pkt_info_to_pkt_type(rxd.wb.lower.
900                                                 lo_dword.hs_rss.pkt_info);
901
902                 /*
903                  * Store the mbuf address into the next entry of the array
904                  * of returned packets.
905                  */
906                 rx_pkts[nb_rx++] = rxm;
907         }
908         rxq->rx_tail = rx_id;
909
910         /*
911          * If the number of free RX descriptors is greater than the RX free
912          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
913          * register.
914          * Update the RDT with the value of the last processed RX descriptor
915          * minus 1, to guarantee that the RDT register is never equal to the
916          * RDH register, which creates a "full" ring situation from the
917          * hardware point of view...
918          */
919         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
920         if (nb_hold > rxq->rx_free_thresh) {
921                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
922                            "nb_hold=%u nb_rx=%u",
923                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
924                            (unsigned) rx_id, (unsigned) nb_hold,
925                            (unsigned) nb_rx);
926                 rx_id = (uint16_t) ((rx_id == 0) ?
927                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
928                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
929                 nb_hold = 0;
930         }
931         rxq->nb_rx_hold = nb_hold;
932         return nb_rx;
933 }
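/*
 * Usage note (illustrative, not part of the driver): this function is
 * installed as the device's rx_pkt_burst callback and is reached through the
 * generic ethdev API, e.g.:
 *
 *   struct rte_mbuf *pkts[32];
 *   uint16_t nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts, 32);
 *
 * When scattered RX is enabled, eth_igb_recv_scattered_pkts() below is used
 * instead so that packets larger than a single mbuf can be reassembled.
 */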
934
935 uint16_t
936 eth_igb_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
937                          uint16_t nb_pkts)
938 {
939         struct igb_rx_queue *rxq;
940         volatile union e1000_adv_rx_desc *rx_ring;
941         volatile union e1000_adv_rx_desc *rxdp;
942         struct igb_rx_entry *sw_ring;
943         struct igb_rx_entry *rxe;
944         struct rte_mbuf *first_seg;
945         struct rte_mbuf *last_seg;
946         struct rte_mbuf *rxm;
947         struct rte_mbuf *nmb;
948         union e1000_adv_rx_desc rxd;
949         uint64_t dma; /* Physical address of mbuf data buffer */
950         uint32_t staterr;
951         uint32_t hlen_type_rss;
952         uint16_t rx_id;
953         uint16_t nb_rx;
954         uint16_t nb_hold;
955         uint16_t data_len;
956         uint64_t pkt_flags;
957
958         nb_rx = 0;
959         nb_hold = 0;
960         rxq = rx_queue;
961         rx_id = rxq->rx_tail;
962         rx_ring = rxq->rx_ring;
963         sw_ring = rxq->sw_ring;
964
965         /*
966          * Retrieve RX context of current packet, if any.
967          */
968         first_seg = rxq->pkt_first_seg;
969         last_seg = rxq->pkt_last_seg;
970
971         while (nb_rx < nb_pkts) {
972         next_desc:
973                 /*
974                  * The order of operations here is important as the DD status
975                  * bit must not be read after any other descriptor fields.
976                  * rx_ring and rxdp are pointing to volatile data so the order
977                  * of accesses cannot be reordered by the compiler. If they were
978                  * not volatile, they could be reordered which could lead to
979                  * using invalid descriptor fields when read from rxd.
980                  */
981                 rxdp = &rx_ring[rx_id];
982                 staterr = rxdp->wb.upper.status_error;
983                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
984                         break;
985                 rxd = *rxdp;
986
987                 /*
988                  * Descriptor done.
989                  *
990                  * Allocate a new mbuf to replenish the RX ring descriptor.
991                  * If the allocation fails:
992                  *    - arrange for that RX descriptor to be the first one
993                  *      being parsed the next time the receive function is
994                  *      invoked [on the same queue].
995                  *
996                  *    - Stop parsing the RX ring and return immediately.
997                  *
998                  * This policy does not drop the packet received in the RX
999                  * descriptor for which the allocation of a new mbuf failed.
1000                  * Thus, it allows that packet to be retrieved later if
1001                  * mbufs have been freed in the meantime.
1002                  * As a side effect, holding RX descriptors instead of
1003                  * systematically giving them back to the NIC may lead to
1004                  * RX ring exhaustion situations.
1005                  * However, the NIC can gracefully prevent such situations
1006                  * from happening by sending specific "back-pressure" flow control
1007                  * frames to its peer(s).
1008                  */
1009                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
1010                            "staterr=0x%x data_len=%u",
1011                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1012                            (unsigned) rx_id, (unsigned) staterr,
1013                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
1014
1015                 nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
1016                 if (nmb == NULL) {
1017                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
1018                                    "queue_id=%u", (unsigned) rxq->port_id,
1019                                    (unsigned) rxq->queue_id);
1020                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
1021                         break;
1022                 }
1023
1024                 nb_hold++;
1025                 rxe = &sw_ring[rx_id];
1026                 rx_id++;
1027                 if (rx_id == rxq->nb_rx_desc)
1028                         rx_id = 0;
1029
1030                 /* Prefetch next mbuf while processing current one. */
1031                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
1032
1033                 /*
1034                  * When next RX descriptor is on a cache-line boundary,
1035                  * prefetch the next 4 RX descriptors and the next 8 pointers
1036                  * to mbufs.
1037                  */
1038                 if ((rx_id & 0x3) == 0) {
1039                         rte_igb_prefetch(&rx_ring[rx_id]);
1040                         rte_igb_prefetch(&sw_ring[rx_id]);
1041                 }
1042
1043                 /*
1044                  * Update RX descriptor with the physical address of the new
1045                  * data buffer of the new allocated mbuf.
1046                  */
1047                 rxm = rxe->mbuf;
1048                 rxe->mbuf = nmb;
1049                 dma = rte_cpu_to_le_64(rte_mbuf_data_dma_addr_default(nmb));
1050                 rxdp->read.pkt_addr = dma;
1051                 rxdp->read.hdr_addr = 0;
1052
1053                 /*
1054                  * Set data length & data buffer address of mbuf.
1055                  */
1056                 data_len = rte_le_to_cpu_16(rxd.wb.upper.length);
1057                 rxm->data_len = data_len;
1058                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
1059
1060                 /*
1061                  * If this is the first buffer of the received packet,
1062                  * set the pointer to the first mbuf of the packet and
1063                  * initialize its context.
1064                  * Otherwise, update the total length and the number of segments
1065                  * of the current scattered packet, and update the pointer to
1066                  * the last mbuf of the current packet.
1067                  */
1068                 if (first_seg == NULL) {
1069                         first_seg = rxm;
1070                         first_seg->pkt_len = data_len;
1071                         first_seg->nb_segs = 1;
1072                 } else {
1073                         first_seg->pkt_len += data_len;
1074                         first_seg->nb_segs++;
1075                         last_seg->next = rxm;
1076                 }
1077
1078                 /*
1079                  * If this is not the last buffer of the received packet,
1080                  * update the pointer to the last mbuf of the current scattered
1081                  * packet and continue to parse the RX ring.
1082                  */
1083                 if (! (staterr & E1000_RXD_STAT_EOP)) {
1084                         last_seg = rxm;
1085                         goto next_desc;
1086                 }
1087
1088                 /*
1089                  * This is the last buffer of the received packet.
1090                  * If the CRC is not stripped by the hardware:
1091                  *   - Subtract the CRC length from the total packet length.
1092                  *   - If the last buffer only contains the whole CRC or a part
1093                  *     of it, free the mbuf associated to the last buffer.
1094                  *     If part of the CRC is also contained in the previous
1095                  *     mbuf, subtract the length of that CRC part from the
1096                  *     data length of the previous mbuf.
1097                  */
1098                 rxm->next = NULL;
1099                 if (unlikely(rxq->crc_len > 0)) {
1100                         first_seg->pkt_len -= ETHER_CRC_LEN;
1101                         if (data_len <= ETHER_CRC_LEN) {
1102                                 rte_pktmbuf_free_seg(rxm);
1103                                 first_seg->nb_segs--;
1104                                 last_seg->data_len = (uint16_t)
1105                                         (last_seg->data_len -
1106                                          (ETHER_CRC_LEN - data_len));
1107                                 last_seg->next = NULL;
1108                         } else
1109                                 rxm->data_len =
1110                                         (uint16_t) (data_len - ETHER_CRC_LEN);
1111                 }
1112
1113                 /*
1114                  * Initialize the first mbuf of the returned packet:
1115                  *    - RX port identifier,
1116                  *    - hardware offload data, if any:
1117                  *      - RSS flag & hash,
1118                  *      - IP checksum flag,
1119                  *      - VLAN TCI, if any,
1120                  *      - error flags.
1121                  */
1122                 first_seg->port = rxq->port_id;
1123                 first_seg->hash.rss = rxd.wb.lower.hi_dword.rss;
1124
1125                 /*
1126                  * The vlan_tci field is only valid when PKT_RX_VLAN_PKT is
1127                  * set in the pkt_flags field.
1128                  */
1129                 first_seg->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
1130                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
1131                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(rxq, hlen_type_rss);
1132                 pkt_flags = pkt_flags | rx_desc_status_to_pkt_flags(staterr);
1133                 pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
1134                 first_seg->ol_flags = pkt_flags;
1135                 first_seg->packet_type = igb_rxd_pkt_info_to_pkt_type(rxd.wb.
1136                                         lower.lo_dword.hs_rss.pkt_info);
1137
1138                 /* Prefetch data of first segment, if configured to do so. */
1139                 rte_packet_prefetch((char *)first_seg->buf_addr +
1140                         first_seg->data_off);
1141
1142                 /*
1143                  * Store the mbuf address into the next entry of the array
1144                  * of returned packets.
1145                  */
1146                 rx_pkts[nb_rx++] = first_seg;
1147
1148                 /*
1149                  * Set up the receive context for a new packet.
1150                  */
1151                 first_seg = NULL;
1152         }
1153
1154         /*
1155          * Record index of the next RX descriptor to probe.
1156          */
1157         rxq->rx_tail = rx_id;
1158
1159         /*
1160          * Save receive context.
1161          */
1162         rxq->pkt_first_seg = first_seg;
1163         rxq->pkt_last_seg = last_seg;
1164
1165         /*
1166          * If the number of free RX descriptors is greater than the RX free
1167          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
1168          * register.
1169          * Update the RDT with the value of the last processed RX descriptor
1170          * minus 1, to guarantee that the RDT register is never equal to the
1171          * RDH register, which creates a "full" ring situation from the
1172          * hardware point of view...
1173          */
1174         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
1175         if (nb_hold > rxq->rx_free_thresh) {
1176                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
1177                            "nb_hold=%u nb_rx=%u",
1178                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1179                            (unsigned) rx_id, (unsigned) nb_hold,
1180                            (unsigned) nb_rx);
1181                 rx_id = (uint16_t) ((rx_id == 0) ?
1182                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
1183                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
1184                 nb_hold = 0;
1185         }
1186         rxq->nb_rx_hold = nb_hold;
1187         return nb_rx;
1188 }
1189
1190 /*
1191  * Maximum number of Ring Descriptors.
1192  *
1193  * Since RDLEN/TDLEN should be a multiple of 128 bytes, the number of ring
1194  * descriptors should meet the following condition:
1195  *      (num_ring_desc * sizeof(struct e1000_rx/tx_desc)) % 128 == 0
1196  */
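/*
 * Worked example (illustrative, assuming the 16-byte advanced descriptor
 * layout): the condition above reduces to
 *      (num_ring_desc * 16) % 128 == 0  ==>  num_ring_desc % 8 == 0,
 * which is what the IGB_TXD_ALIGN check in eth_igb_tx_queue_setup() below
 * enforces for the transmit ring.
 */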
1197
1198 static void
1199 igb_tx_queue_release_mbufs(struct igb_tx_queue *txq)
1200 {
1201         unsigned i;
1202
1203         if (txq->sw_ring != NULL) {
1204                 for (i = 0; i < txq->nb_tx_desc; i++) {
1205                         if (txq->sw_ring[i].mbuf != NULL) {
1206                                 rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
1207                                 txq->sw_ring[i].mbuf = NULL;
1208                         }
1209                 }
1210         }
1211 }
1212
1213 static void
1214 igb_tx_queue_release(struct igb_tx_queue *txq)
1215 {
1216         if (txq != NULL) {
1217                 igb_tx_queue_release_mbufs(txq);
1218                 rte_free(txq->sw_ring);
1219                 rte_free(txq);
1220         }
1221 }
1222
1223 void
1224 eth_igb_tx_queue_release(void *txq)
1225 {
1226         igb_tx_queue_release(txq);
1227 }
1228
1229 static void
1230 igb_reset_tx_queue_stat(struct igb_tx_queue *txq)
1231 {
1232         txq->tx_head = 0;
1233         txq->tx_tail = 0;
1234         txq->ctx_curr = 0;
1235         memset((void*)&txq->ctx_cache, 0,
1236                 IGB_CTX_NUM * sizeof(struct igb_advctx_info));
1237 }
1238
1239 static void
1240 igb_reset_tx_queue(struct igb_tx_queue *txq, struct rte_eth_dev *dev)
1241 {
1242         static const union e1000_adv_tx_desc zeroed_desc = {{0}};
1243         struct igb_tx_entry *txe = txq->sw_ring;
1244         uint16_t i, prev;
1245         struct e1000_hw *hw;
1246
1247         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1248         /* Zero out HW ring memory */
1249         for (i = 0; i < txq->nb_tx_desc; i++) {
1250                 txq->tx_ring[i] = zeroed_desc;
1251         }
1252
1253         /* Initialize ring entries */
1254         prev = (uint16_t)(txq->nb_tx_desc - 1);
1255         for (i = 0; i < txq->nb_tx_desc; i++) {
1256                 volatile union e1000_adv_tx_desc *txd = &(txq->tx_ring[i]);
1257
1258                 txd->wb.status = E1000_TXD_STAT_DD;
1259                 txe[i].mbuf = NULL;
1260                 txe[i].last_id = i;
1261                 txe[prev].next_id = i;
1262                 prev = i;
1263         }
1264
1265         txq->txd_type = E1000_ADVTXD_DTYP_DATA;
1266         /* 82575 specific, each tx queue will use 2 hw contexts */
1267         if (hw->mac.type == e1000_82575)
1268                 txq->ctx_start = txq->queue_id * IGB_CTX_NUM;
1269
1270         igb_reset_tx_queue_stat(txq);
1271 }
1272
1273 int
1274 eth_igb_tx_queue_setup(struct rte_eth_dev *dev,
1275                          uint16_t queue_idx,
1276                          uint16_t nb_desc,
1277                          unsigned int socket_id,
1278                          const struct rte_eth_txconf *tx_conf)
1279 {
1280         const struct rte_memzone *tz;
1281         struct igb_tx_queue *txq;
1282         struct e1000_hw     *hw;
1283         uint32_t size;
1284
1285         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1286
1287         /*
1288          * Validate number of transmit descriptors.
1289          * It must not exceed the hardware maximum and must be a multiple
1290          * of IGB_TXD_ALIGN, so that the ring size is a multiple of E1000_ALIGN.
1291          */
1292         if (nb_desc % IGB_TXD_ALIGN != 0 ||
1293                         (nb_desc > E1000_MAX_RING_DESC) ||
1294                         (nb_desc < E1000_MIN_RING_DESC)) {
1295                 return -EINVAL;
1296         }
1297
1298         /*
1299          * The tx_free_thresh and tx_rs_thresh values are not used in the 1G
1300          * driver.
1301          */
1302         if (tx_conf->tx_free_thresh != 0)
1303                 PMD_INIT_LOG(INFO, "The tx_free_thresh parameter is not "
1304                              "used for the 1G driver.");
1305         if (tx_conf->tx_rs_thresh != 0)
1306                 PMD_INIT_LOG(INFO, "The tx_rs_thresh parameter is not "
1307                              "used for the 1G driver.");
1308         if (tx_conf->tx_thresh.wthresh == 0 && hw->mac.type != e1000_82576)
1309                 PMD_INIT_LOG(INFO, "To improve 1G driver performance, "
1310                              "consider setting the TX WTHRESH value to 4, 8, "
1311                              "or 16.");
1312
1313         /* Free memory prior to re-allocation if needed */
1314         if (dev->data->tx_queues[queue_idx] != NULL) {
1315                 igb_tx_queue_release(dev->data->tx_queues[queue_idx]);
1316                 dev->data->tx_queues[queue_idx] = NULL;
1317         }
1318
1319         /* First allocate the tx queue data structure */
1320         txq = rte_zmalloc("ethdev TX queue", sizeof(struct igb_tx_queue),
1321                                                         RTE_CACHE_LINE_SIZE);
1322         if (txq == NULL)
1323                 return -ENOMEM;
1324
1325         /*
1326          * Allocate TX ring hardware descriptors. A memzone large enough to
1327          * handle the maximum ring size is allocated in order to allow for
1328          * resizing in later calls to the queue setup function.
1329          */
1330         size = sizeof(union e1000_adv_tx_desc) * E1000_MAX_RING_DESC;
1331         tz = rte_eth_dma_zone_reserve(dev, "tx_ring", queue_idx, size,
1332                                       E1000_ALIGN, socket_id);
1333         if (tz == NULL) {
1334                 igb_tx_queue_release(txq);
1335                 return -ENOMEM;
1336         }
1337
1338         txq->nb_tx_desc = nb_desc;
1339         txq->pthresh = tx_conf->tx_thresh.pthresh;
1340         txq->hthresh = tx_conf->tx_thresh.hthresh;
1341         txq->wthresh = tx_conf->tx_thresh.wthresh;
1342         if (txq->wthresh > 0 && hw->mac.type == e1000_82576)
1343                 txq->wthresh = 1;
1344         txq->queue_id = queue_idx;
1345         txq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1346                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1347         txq->port_id = dev->data->port_id;
1348
1349         txq->tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(txq->reg_idx));
1350         txq->tx_ring_phys_addr = rte_mem_phy2mch(tz->memseg_id, tz->phys_addr);
1351
1352         txq->tx_ring = (union e1000_adv_tx_desc *) tz->addr;
1353         /* Allocate software ring */
1354         txq->sw_ring = rte_zmalloc("txq->sw_ring",
1355                                    sizeof(struct igb_tx_entry) * nb_desc,
1356                                    RTE_CACHE_LINE_SIZE);
1357         if (txq->sw_ring == NULL) {
1358                 igb_tx_queue_release(txq);
1359                 return -ENOMEM;
1360         }
1361         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
1362                      txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
1363
1364         igb_reset_tx_queue(txq, dev);
1365         dev->tx_pkt_burst = eth_igb_xmit_pkts;
1366         dev->data->tx_queues[queue_idx] = txq;
1367
1368         return 0;
1369 }
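/*
 * Editorial sketch (not part of the upstream driver; the function name and
 * parameter values are hypothetical): eth_igb_tx_queue_setup() is normally
 * reached through the generic ethdev call shown below.  Kept under "#if 0"
 * so it does not affect the build.
 */
#if 0
static int
example_setup_igb_tx_queue(uint8_t port_id, uint16_t queue_id,
			   unsigned int socket_id)
{
	struct rte_eth_txconf txconf = {
		/* WTHRESH of 16 follows the hint logged above for non-82576 */
		.tx_thresh = { .pthresh = 8, .hthresh = 1, .wthresh = 16 },
		.tx_free_thresh = 0,	/* not used by the 1G driver */
		.tx_rs_thresh = 0,	/* not used by the 1G driver */
	};

	/* 512 descriptors: a multiple of IGB_TXD_ALIGN and within limits */
	return rte_eth_tx_queue_setup(port_id, queue_id, 512, socket_id,
				      &txconf);
}
#endif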
1370
1371 static void
1372 igb_rx_queue_release_mbufs(struct igb_rx_queue *rxq)
1373 {
1374         unsigned i;
1375
1376         if (rxq->sw_ring != NULL) {
1377                 for (i = 0; i < rxq->nb_rx_desc; i++) {
1378                         if (rxq->sw_ring[i].mbuf != NULL) {
1379                                 rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
1380                                 rxq->sw_ring[i].mbuf = NULL;
1381                         }
1382                 }
1383         }
1384 }
1385
1386 static void
1387 igb_rx_queue_release(struct igb_rx_queue *rxq)
1388 {
1389         if (rxq != NULL) {
1390                 igb_rx_queue_release_mbufs(rxq);
1391                 rte_free(rxq->sw_ring);
1392                 rte_free(rxq);
1393         }
1394 }
1395
1396 void
1397 eth_igb_rx_queue_release(void *rxq)
1398 {
1399         igb_rx_queue_release(rxq);
1400 }
1401
1402 static void
1403 igb_reset_rx_queue(struct igb_rx_queue *rxq)
1404 {
1405         static const union e1000_adv_rx_desc zeroed_desc = {{0}};
1406         unsigned i;
1407
1408         /* Zero out HW ring memory */
1409         for (i = 0; i < rxq->nb_rx_desc; i++) {
1410                 rxq->rx_ring[i] = zeroed_desc;
1411         }
1412
1413         rxq->rx_tail = 0;
1414         rxq->pkt_first_seg = NULL;
1415         rxq->pkt_last_seg = NULL;
1416 }
1417
1418 int
1419 eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
1420                          uint16_t queue_idx,
1421                          uint16_t nb_desc,
1422                          unsigned int socket_id,
1423                          const struct rte_eth_rxconf *rx_conf,
1424                          struct rte_mempool *mp)
1425 {
1426         const struct rte_memzone *rz;
1427         struct igb_rx_queue *rxq;
1428         struct e1000_hw     *hw;
1429         unsigned int size;
1430
1431         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1432
1433         /*
1434          * Validate number of receive descriptors.
1435          * It must not exceed the hardware maximum and must be a multiple
1436          * of IGB_RXD_ALIGN, so that the ring size is a multiple of E1000_ALIGN.
1437          */
1438         if (nb_desc % IGB_RXD_ALIGN != 0 ||
1439                         (nb_desc > E1000_MAX_RING_DESC) ||
1440                         (nb_desc < E1000_MIN_RING_DESC)) {
1441                 return -EINVAL;
1442         }
1443
1444         /* Free memory prior to re-allocation if needed */
1445         if (dev->data->rx_queues[queue_idx] != NULL) {
1446                 igb_rx_queue_release(dev->data->rx_queues[queue_idx]);
1447                 dev->data->rx_queues[queue_idx] = NULL;
1448         }
1449
1450         /* First allocate the RX queue data structure. */
1451         rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
1452                           RTE_CACHE_LINE_SIZE);
1453         if (rxq == NULL)
1454                 return -ENOMEM;
1455         rxq->mb_pool = mp;
1456         rxq->nb_rx_desc = nb_desc;
1457         rxq->pthresh = rx_conf->rx_thresh.pthresh;
1458         rxq->hthresh = rx_conf->rx_thresh.hthresh;
1459         rxq->wthresh = rx_conf->rx_thresh.wthresh;
1460         if (rxq->wthresh > 0 &&
1461             (hw->mac.type == e1000_82576 || hw->mac.type == e1000_vfadapt_i350))
1462                 rxq->wthresh = 1;
1463         rxq->drop_en = rx_conf->rx_drop_en;
1464         rxq->rx_free_thresh = rx_conf->rx_free_thresh;
1465         rxq->queue_id = queue_idx;
1466         rxq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1467                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1468         rxq->port_id = dev->data->port_id;
1469         rxq->crc_len = (uint8_t) ((dev->data->dev_conf.rxmode.hw_strip_crc) ? 0 :
1470                                   ETHER_CRC_LEN);
1471
1472         /*
1473          *  Allocate RX ring hardware descriptors. A memzone large enough to
1474          *  handle the maximum ring size is allocated in order to allow for
1475          *  resizing in later calls to the queue setup function.
1476          */
1477         size = sizeof(union e1000_adv_rx_desc) * E1000_MAX_RING_DESC;
1478         rz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx, size,
1479                                       E1000_ALIGN, socket_id);
1480         if (rz == NULL) {
1481                 igb_rx_queue_release(rxq);
1482                 return -ENOMEM;
1483         }
1484         rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(rxq->reg_idx));
1485         rxq->rdh_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDH(rxq->reg_idx));
1486         rxq->rx_ring_phys_addr = rte_mem_phy2mch(rz->memseg_id, rz->phys_addr);
1487         rxq->rx_ring = (union e1000_adv_rx_desc *) rz->addr;
1488
1489         /* Allocate software ring. */
1490         rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
1491                                    sizeof(struct igb_rx_entry) * nb_desc,
1492                                    RTE_CACHE_LINE_SIZE);
1493         if (rxq->sw_ring == NULL) {
1494                 igb_rx_queue_release(rxq);
1495                 return -ENOMEM;
1496         }
1497         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
1498                      rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
1499
1500         dev->data->rx_queues[queue_idx] = rxq;
1501         igb_reset_rx_queue(rxq);
1502
1503         return 0;
1504 }
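/*
 * Editorial sketch (not part of the upstream driver; the function name and
 * parameter values are hypothetical): a typical application-side call chain
 * that ends up in eth_igb_rx_queue_setup().  Kept under "#if 0" so it does
 * not affect the build.
 */
#if 0
static int
example_setup_igb_rx_queue(uint8_t port_id, uint16_t queue_id,
			   unsigned int socket_id)
{
	struct rte_mempool *mb_pool;

	/*
	 * RTE_MBUF_DEFAULT_BUF_SIZE leaves 2048 bytes of data room after
	 * RTE_PKTMBUF_HEADROOM, so eth_igb_rx_init() can program
	 * SRRCTL.BSIZEPACKET instead of falling back to RCTL.BSIZE.
	 */
	mb_pool = rte_pktmbuf_pool_create("example_rx_pool", 8192, 256, 0,
					  RTE_MBUF_DEFAULT_BUF_SIZE,
					  socket_id);
	if (mb_pool == NULL)
		return -ENOMEM;

	/* A NULL rx_conf selects the defaults advertised by the PMD */
	return rte_eth_rx_queue_setup(port_id, queue_id, 512, socket_id,
				      NULL, mb_pool);
}
#endif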
1505
1506 uint32_t
1507 eth_igb_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1508 {
1509 #define IGB_RXQ_SCAN_INTERVAL 4
1510         volatile union e1000_adv_rx_desc *rxdp;
1511         struct igb_rx_queue *rxq;
1512         uint32_t desc = 0;
1513
1514         if (rx_queue_id >= dev->data->nb_rx_queues) {
1515                 PMD_RX_LOG(ERR, "Invalid RX queue id=%d", rx_queue_id);
1516                 return 0;
1517         }
1518
1519         rxq = dev->data->rx_queues[rx_queue_id];
1520         rxdp = &(rxq->rx_ring[rxq->rx_tail]);
1521
1522         while ((desc < rxq->nb_rx_desc) &&
1523                 (rxdp->wb.upper.status_error & E1000_RXD_STAT_DD)) {
1524                 desc += IGB_RXQ_SCAN_INTERVAL;
1525                 rxdp += IGB_RXQ_SCAN_INTERVAL;
1526                 if (rxq->rx_tail + desc >= rxq->nb_rx_desc)
1527                         rxdp = &(rxq->rx_ring[rxq->rx_tail +
1528                                 desc - rxq->nb_rx_desc]);
1529         }
1530
1531         return desc;
1532 }
1533
1534 int
1535 eth_igb_rx_descriptor_done(void *rx_queue, uint16_t offset)
1536 {
1537         volatile union e1000_adv_rx_desc *rxdp;
1538         struct igb_rx_queue *rxq = rx_queue;
1539         uint32_t desc;
1540
1541         if (unlikely(offset >= rxq->nb_rx_desc))
1542                 return 0;
1543         desc = rxq->rx_tail + offset;
1544         if (desc >= rxq->nb_rx_desc)
1545                 desc -= rxq->nb_rx_desc;
1546
1547         rxdp = &rxq->rx_ring[desc];
1548         return !!(rxdp->wb.upper.status_error & E1000_RXD_STAT_DD);
1549 }
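/*
 * Editorial sketch (not part of the upstream driver; the function name is
 * hypothetical): how an application would poll the two status helpers above
 * through the generic ethdev wrappers.  Kept under "#if 0".
 */
#if 0
static void
example_poll_rx_status(uint8_t port_id)
{
	/*
	 * Approximate count of completed (DD-set) descriptors on queue 0;
	 * eth_igb_rx_queue_count() scans in steps of IGB_RXQ_SCAN_INTERVAL.
	 */
	int ready = rte_eth_rx_queue_count(port_id, 0);

	/* Has the descriptor 32 slots past the current tail been written back? */
	int done = rte_eth_rx_descriptor_done(port_id, 0, 32);

	printf("port %u: ~%d descriptors ready, offset 32 done = %d\n",
	       (unsigned) port_id, ready, done);
}
#endif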
1550
1551 void
1552 igb_dev_clear_queues(struct rte_eth_dev *dev)
1553 {
1554         uint16_t i;
1555         struct igb_tx_queue *txq;
1556         struct igb_rx_queue *rxq;
1557
1558         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1559                 txq = dev->data->tx_queues[i];
1560                 if (txq != NULL) {
1561                         igb_tx_queue_release_mbufs(txq);
1562                         igb_reset_tx_queue(txq, dev);
1563                 }
1564         }
1565
1566         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1567                 rxq = dev->data->rx_queues[i];
1568                 if (rxq != NULL) {
1569                         igb_rx_queue_release_mbufs(rxq);
1570                         igb_reset_rx_queue(rxq);
1571                 }
1572         }
1573 }
1574
1575 void
1576 igb_dev_free_queues(struct rte_eth_dev *dev)
1577 {
1578         uint16_t i;
1579
1580         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1581                 eth_igb_rx_queue_release(dev->data->rx_queues[i]);
1582                 dev->data->rx_queues[i] = NULL;
1583         }
1584         dev->data->nb_rx_queues = 0;
1585
1586         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1587                 eth_igb_tx_queue_release(dev->data->tx_queues[i]);
1588                 dev->data->tx_queues[i] = NULL;
1589         }
1590         dev->data->nb_tx_queues = 0;
1591 }
1592
1593 /**
1594  * Receive Side Scaling (RSS).
1595  * See section 7.1.1.7 in the following document:
1596  *     "Intel 82576 GbE Controller Datasheet" - Revision 2.45 October 2009
1597  *
1598  * Principles:
1599  * The source and destination IP addresses of the IP header and the source and
1600  * destination ports of TCP/UDP headers, if any, of received packets are hashed
1601  * against a configurable random key to compute a 32-bit RSS hash result.
1602  * The seven (7) LSBs of the 32-bit hash result are used as an index into a
1603  * 128-entry redirection table (RETA).  Each entry of the RETA provides a 3-bit
1604  * RSS output index, which is used as the index of the RX queue in which the
1605  * received packet is stored.
1606  * The following output is supplied in the RX write-back descriptor:
1607  *     - 32-bit result of the Microsoft RSS hash function,
1608  *     - 4-bit RSS type field.
1609  */
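/*
 * Editorial worked example: with 4 RX queues and the RETA filled as in
 * igb_rss_configure() below, a packet whose 32-bit RSS hash is 0x1234abcd
 * uses its 7 LSBs (0x4d = 77) to index the 128-entry RETA; entry 77 holds
 * 77 % 4 = 1, so the packet is delivered to RX queue 1 and the hash value is
 * reported to software in the RX write-back descriptor.
 */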
1610
1611 /*
1612  * RSS random key supplied in section 7.1.1.7.3 of the Intel 82576 datasheet.
1613  * Used as the default key.
1614  */
1615 static uint8_t rss_intel_key[40] = {
1616         0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
1617         0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
1618         0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
1619         0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
1620         0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
1621 };
1622
1623 static void
1624 igb_rss_disable(struct rte_eth_dev *dev)
1625 {
1626         struct e1000_hw *hw;
1627         uint32_t mrqc;
1628
1629         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1630         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1631         mrqc &= ~E1000_MRQC_ENABLE_MASK;
1632         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1633 }
1634
1635 static void
1636 igb_hw_rss_hash_set(struct e1000_hw *hw, struct rte_eth_rss_conf *rss_conf)
1637 {
1638         uint8_t  *hash_key;
1639         uint32_t rss_key;
1640         uint32_t mrqc;
1641         uint64_t rss_hf;
1642         uint16_t i;
1643
1644         hash_key = rss_conf->rss_key;
1645         if (hash_key != NULL) {
1646                 /* Fill in RSS hash key */
1647                 for (i = 0; i < 10; i++) {
1648                         rss_key  = hash_key[(i * 4)];
1649                         rss_key |= hash_key[(i * 4) + 1] << 8;
1650                         rss_key |= hash_key[(i * 4) + 2] << 16;
1651                         rss_key |= hash_key[(i * 4) + 3] << 24;
1652                         E1000_WRITE_REG_ARRAY(hw, E1000_RSSRK(0), i, rss_key);
1653                 }
1654         }
1655
1656         /* Set configured hashing protocols in MRQC register */
1657         rss_hf = rss_conf->rss_hf;
1658         mrqc = E1000_MRQC_ENABLE_RSS_4Q; /* RSS enabled. */
1659         if (rss_hf & ETH_RSS_IPV4)
1660                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4;
1661         if (rss_hf & ETH_RSS_NONFRAG_IPV4_TCP)
1662                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_TCP;
1663         if (rss_hf & ETH_RSS_IPV6)
1664                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6;
1665         if (rss_hf & ETH_RSS_IPV6_EX)
1666                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_EX;
1667         if (rss_hf & ETH_RSS_NONFRAG_IPV6_TCP)
1668                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP;
1669         if (rss_hf & ETH_RSS_IPV6_TCP_EX)
1670                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP_EX;
1671         if (rss_hf & ETH_RSS_NONFRAG_IPV4_UDP)
1672                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_UDP;
1673         if (rss_hf & ETH_RSS_NONFRAG_IPV6_UDP)
1674                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP;
1675         if (rss_hf & ETH_RSS_IPV6_UDP_EX)
1676                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP_EX;
1677         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1678 }
1679
1680 int
1681 eth_igb_rss_hash_update(struct rte_eth_dev *dev,
1682                         struct rte_eth_rss_conf *rss_conf)
1683 {
1684         struct e1000_hw *hw;
1685         uint32_t mrqc;
1686         uint64_t rss_hf;
1687
1688         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1689
1690         /*
1691          * Before changing anything, first check that the update RSS operation
1692          * does not attempt to disable RSS, if RSS was enabled at
1693          * initialization time, or does not attempt to enable RSS, if RSS was
1694          * disabled at initialization time.
1695          */
1696         rss_hf = rss_conf->rss_hf & IGB_RSS_OFFLOAD_ALL;
1697         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1698         if (!(mrqc & E1000_MRQC_ENABLE_MASK)) { /* RSS disabled */
1699                 if (rss_hf != 0) /* Enable RSS */
1700                         return -(EINVAL);
1701                 return 0; /* Nothing to do */
1702         }
1703         /* RSS enabled */
1704         if (rss_hf == 0) /* Disable RSS */
1705                 return -(EINVAL);
1706         igb_hw_rss_hash_set(hw, rss_conf);
1707         return 0;
1708 }
1709
1710 int eth_igb_rss_hash_conf_get(struct rte_eth_dev *dev,
1711                               struct rte_eth_rss_conf *rss_conf)
1712 {
1713         struct e1000_hw *hw;
1714         uint8_t *hash_key;
1715         uint32_t rss_key;
1716         uint32_t mrqc;
1717         uint64_t rss_hf;
1718         uint16_t i;
1719
1720         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1721         hash_key = rss_conf->rss_key;
1722         if (hash_key != NULL) {
1723                 /* Return RSS hash key */
1724                 for (i = 0; i < 10; i++) {
1725                         rss_key = E1000_READ_REG_ARRAY(hw, E1000_RSSRK(0), i);
1726                         hash_key[(i * 4)] = rss_key & 0x000000FF;
1727                         hash_key[(i * 4) + 1] = (rss_key >> 8) & 0x000000FF;
1728                         hash_key[(i * 4) + 2] = (rss_key >> 16) & 0x000000FF;
1729                         hash_key[(i * 4) + 3] = (rss_key >> 24) & 0x000000FF;
1730                 }
1731         }
1732
1733         /* Get RSS functions configured in MRQC register */
1734         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1735         if ((mrqc & E1000_MRQC_ENABLE_RSS_4Q) == 0) { /* RSS is disabled */
1736                 rss_conf->rss_hf = 0;
1737                 return 0;
1738         }
1739         rss_hf = 0;
1740         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4)
1741                 rss_hf |= ETH_RSS_IPV4;
1742         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4_TCP)
1743                 rss_hf |= ETH_RSS_NONFRAG_IPV4_TCP;
1744         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6)
1745                 rss_hf |= ETH_RSS_IPV6;
1746         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_EX)
1747                 rss_hf |= ETH_RSS_IPV6_EX;
1748         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_TCP)
1749                 rss_hf |= ETH_RSS_NONFRAG_IPV6_TCP;
1750         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_TCP_EX)
1751                 rss_hf |= ETH_RSS_IPV6_TCP_EX;
1752         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4_UDP)
1753                 rss_hf |= ETH_RSS_NONFRAG_IPV4_UDP;
1754         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_UDP)
1755                 rss_hf |= ETH_RSS_NONFRAG_IPV6_UDP;
1756         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_UDP_EX)
1757                 rss_hf |= ETH_RSS_IPV6_UDP_EX;
1758         rss_conf->rss_hf = rss_hf;
1759         return 0;
1760 }
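/*
 * Editorial sketch (not part of the upstream driver; the function name is
 * hypothetical): runtime RSS reconfiguration through the generic ethdev API,
 * which lands in eth_igb_rss_hash_update() / eth_igb_rss_hash_conf_get()
 * above.  Kept under "#if 0".
 */
#if 0
static int
example_update_rss(uint8_t port_id)
{
	static uint8_t key[40];		/* same 40-byte key size as RSSRK */
	struct rte_eth_rss_conf conf = {
		.rss_key = NULL,	/* keep the currently programmed key */
		.rss_hf = ETH_RSS_IPV4 | ETH_RSS_NONFRAG_IPV4_TCP |
			  ETH_RSS_NONFRAG_IPV4_UDP,
	};
	int ret = rte_eth_dev_rss_hash_update(port_id, &conf);

	if (ret == 0) {
		/* Read back the programmed key and enabled protocols */
		conf.rss_key = key;
		conf.rss_key_len = sizeof(key);
		ret = rte_eth_dev_rss_hash_conf_get(port_id, &conf);
	}
	return ret;
}
#endif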
1761
1762 static void
1763 igb_rss_configure(struct rte_eth_dev *dev)
1764 {
1765         struct rte_eth_rss_conf rss_conf;
1766         struct e1000_hw *hw;
1767         uint32_t shift;
1768         uint16_t i;
1769
1770         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1771
1772         /* Fill in redirection table. */
1773         shift = (hw->mac.type == e1000_82575) ? 6 : 0;
1774         for (i = 0; i < 128; i++) {
1775                 union e1000_reta {
1776                         uint32_t dword;
1777                         uint8_t  bytes[4];
1778                 } reta;
1779                 uint8_t q_idx;
1780
1781                 q_idx = (uint8_t) ((dev->data->nb_rx_queues > 1) ?
1782                                    i % dev->data->nb_rx_queues : 0);
1783                 reta.bytes[i & 3] = (uint8_t) (q_idx << shift);
1784                 if ((i & 3) == 3)
1785                         E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
1786         }
1787
1788         /*
1789          * Configure the RSS key and the RSS protocols used to compute
1790          * the RSS hash of input packets.
1791          */
1792         rss_conf = dev->data->dev_conf.rx_adv_conf.rss_conf;
1793         if ((rss_conf.rss_hf & IGB_RSS_OFFLOAD_ALL) == 0) {
1794                 igb_rss_disable(dev);
1795                 return;
1796         }
1797         if (rss_conf.rss_key == NULL)
1798                 rss_conf.rss_key = rss_intel_key; /* Default hash key */
1799         igb_hw_rss_hash_set(hw, &rss_conf);
1800 }
1801
1802 /*
1803  * Check whether the MAC type supports VMDq.
1804  * Return 1 if it does, 0 otherwise.
1805  */
1806 static int
1807 igb_is_vmdq_supported(const struct rte_eth_dev *dev)
1808 {
1809         const struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1810
1811         switch (hw->mac.type) {
1812         case e1000_82576:
1813         case e1000_82580:
1814         case e1000_i350:
1815                 return 1;
1816         case e1000_82540:
1817         case e1000_82541:
1818         case e1000_82542:
1819         case e1000_82543:
1820         case e1000_82544:
1821         case e1000_82545:
1822         case e1000_82546:
1823         case e1000_82547:
1824         case e1000_82571:
1825         case e1000_82572:
1826         case e1000_82573:
1827         case e1000_82574:
1828         case e1000_82583:
1829         case e1000_i210:
1830         case e1000_i211:
1831         default:
1832                 PMD_INIT_LOG(ERR, "Cannot support VMDq feature");
1833                 return 0;
1834         }
1835 }
1836
1837 static int
1838 igb_vmdq_rx_hw_configure(struct rte_eth_dev *dev)
1839 {
1840         struct rte_eth_vmdq_rx_conf *cfg;
1841         struct e1000_hw *hw;
1842         uint32_t mrqc, vt_ctl, vmolr, rctl;
1843         int i;
1844
1845         PMD_INIT_FUNC_TRACE();
1846
1847         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1848         cfg = &dev->data->dev_conf.rx_adv_conf.vmdq_rx_conf;
1849
1850         /* Check whether the MAC type supports VMDq; a return value of 0 means it does not */
1851         if (igb_is_vmdq_supported(dev) == 0)
1852                 return -1;
1853
1854         igb_rss_disable(dev);
1855
1856         /* RCTL: enable VLAN filter */
1857         rctl = E1000_READ_REG(hw, E1000_RCTL);
1858         rctl |= E1000_RCTL_VFE;
1859         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
1860
1861         /* MRQC: enable vmdq */
1862         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1863         mrqc |= E1000_MRQC_ENABLE_VMDQ;
1864         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1865
1866         /* VTCTL:  pool selection according to VLAN tag */
1867         vt_ctl = E1000_READ_REG(hw, E1000_VT_CTL);
1868         if (cfg->enable_default_pool)
1869                 vt_ctl |= (cfg->default_pool << E1000_VT_CTL_DEFAULT_POOL_SHIFT);
1870         vt_ctl |= E1000_VT_CTL_IGNORE_MAC;
1871         E1000_WRITE_REG(hw, E1000_VT_CTL, vt_ctl);
1872
1873         for (i = 0; i < E1000_VMOLR_SIZE; i++) {
1874                 vmolr = E1000_READ_REG(hw, E1000_VMOLR(i));
1875                 vmolr &= ~(E1000_VMOLR_AUPE | E1000_VMOLR_ROMPE |
1876                         E1000_VMOLR_ROPE | E1000_VMOLR_BAM |
1877                         E1000_VMOLR_MPME);
1878
1879                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_UNTAG)
1880                         vmolr |= E1000_VMOLR_AUPE;
1881                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_HASH_MC)
1882                         vmolr |= E1000_VMOLR_ROMPE;
1883                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_HASH_UC)
1884                         vmolr |= E1000_VMOLR_ROPE;
1885                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_BROADCAST)
1886                         vmolr |= E1000_VMOLR_BAM;
1887                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_MULTICAST)
1888                         vmolr |= E1000_VMOLR_MPME;
1889
1890                 E1000_WRITE_REG(hw, E1000_VMOLR(i), vmolr);
1891         }
1892
1893         /*
1894          * VMOLR: set STRVLAN to 1 when IGMAC in VT_CTL is set to 1.
1895          * Both 82576 and 82580 support it.
1896          */
1897         if (hw->mac.type != e1000_i350) {
1898                 for (i = 0; i < E1000_VMOLR_SIZE; i++) {
1899                         vmolr = E1000_READ_REG(hw, E1000_VMOLR(i));
1900                         vmolr |= E1000_VMOLR_STRVLAN;
1901                         E1000_WRITE_REG(hw, E1000_VMOLR(i), vmolr);
1902                 }
1903         }
1904
1905         /* VFTA - enable all vlan filters */
1906         for (i = 0; i < IGB_VFTA_SIZE; i++)
1907                 E1000_WRITE_REG(hw, (E1000_VFTA+(i*4)), UINT32_MAX);
1908
1909         /* VFRE: enable RX for all 8 pools; both 82576 and i350 support it */
1910         if (hw->mac.type != e1000_82580)
1911                 E1000_WRITE_REG(hw, E1000_VFRE, E1000_MBVFICR_VFREQ_MASK);
1912
1913         /*
1914          * RAH/RAL - allow pools to read specific mac addresses
1915          * In this case, all pools should be able to read from mac addr 0
1916          */
1917         E1000_WRITE_REG(hw, E1000_RAH(0), (E1000_RAH_AV | UINT16_MAX));
1918         E1000_WRITE_REG(hw, E1000_RAL(0), UINT32_MAX);
1919
1920         /* VLVF: set up filters for vlan tags as configured */
1921         for (i = 0; i < cfg->nb_pool_maps; i++) {
1922                 /* set vlan id in VF register and set the valid bit */
1923                 E1000_WRITE_REG(hw, E1000_VLVF(i), (E1000_VLVF_VLANID_ENABLE | \
1924                         (cfg->pool_map[i].vlan_id & ETH_VLAN_ID_MAX) | \
1925                         ((cfg->pool_map[i].pools << E1000_VLVF_POOLSEL_SHIFT ) & \
1926                         E1000_VLVF_POOLSEL_MASK)));
1927         }
1928
1929         E1000_WRITE_FLUSH(hw);
1930
1931         return 0;
1932 }
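/*
 * Editorial sketch (not part of the upstream driver; the variable name and
 * the field values are hypothetical): a device configuration that makes
 * igb_dev_mq_rx_configure() call igb_vmdq_rx_hw_configure() above, steering
 * VLAN 100 to pool 1.  Kept under "#if 0".
 */
#if 0
static const struct rte_eth_conf example_vmdq_conf = {
	.rxmode = {
		.mq_mode = ETH_MQ_RX_VMDQ_ONLY,
	},
	.rx_adv_conf = {
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.nb_pool_maps = 1,
			.pool_map = {
				{ .vlan_id = 100, .pools = 1ULL << 1 },
			},
			.rx_mode = ETH_VMDQ_ACCEPT_UNTAG |
				   ETH_VMDQ_ACCEPT_BROADCAST,
		},
	},
};
#endif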
1933
1934
1935 /*********************************************************************
1936  *
1937  *  Enable receive unit.
1938  *
1939  **********************************************************************/
1940
1941 static int
1942 igb_alloc_rx_queue_mbufs(struct igb_rx_queue *rxq)
1943 {
1944         struct igb_rx_entry *rxe = rxq->sw_ring;
1945         uint64_t dma_addr;
1946         unsigned i;
1947
1948         /* Initialize software ring entries. */
1949         for (i = 0; i < rxq->nb_rx_desc; i++) {
1950                 volatile union e1000_adv_rx_desc *rxd;
1951                 struct rte_mbuf *mbuf = rte_mbuf_raw_alloc(rxq->mb_pool);
1952
1953                 if (mbuf == NULL) {
1954                         PMD_INIT_LOG(ERR, "RX mbuf alloc failed "
1955                                      "queue_id=%hu", rxq->queue_id);
1956                         return -ENOMEM;
1957                 }
1958                 dma_addr =
1959                         rte_cpu_to_le_64(rte_mbuf_data_dma_addr_default(mbuf));
1960                 rxd = &rxq->rx_ring[i];
1961                 rxd->read.hdr_addr = 0;
1962                 rxd->read.pkt_addr = dma_addr;
1963                 rxe[i].mbuf = mbuf;
1964         }
1965
1966         return 0;
1967 }
1968
1969 #define E1000_MRQC_DEF_Q_SHIFT               (3)
1970 static int
1971 igb_dev_mq_rx_configure(struct rte_eth_dev *dev)
1972 {
1973         struct e1000_hw *hw =
1974                 E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1975         uint32_t mrqc;
1976
1977         if (RTE_ETH_DEV_SRIOV(dev).active == ETH_8_POOLS) {
1978                 /*
1979                  * SRIOV active scheme
1980                  * FIXME if support RSS together with VMDq & SRIOV
1981                  */
1982                 mrqc = E1000_MRQC_ENABLE_VMDQ;
1983                 /* 011b Def_Q ignore, according to VT_CTL.DEF_PL */
1984                 mrqc |= 0x3 << E1000_MRQC_DEF_Q_SHIFT;
1985                 E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1986         } else if(RTE_ETH_DEV_SRIOV(dev).active == 0) {
1987                 /*
1988                  * SRIOV inactive scheme
1989                  */
1990                 switch (dev->data->dev_conf.rxmode.mq_mode) {
1991                         case ETH_MQ_RX_RSS:
1992                                 igb_rss_configure(dev);
1993                                 break;
1994                         case ETH_MQ_RX_VMDQ_ONLY:
1995                                 /*Configure general VMDQ only RX parameters*/
1996                                 igb_vmdq_rx_hw_configure(dev);
1997                                 break;
1998                         case ETH_MQ_RX_NONE:
1999                                 /* if mq_mode is none, disable rss mode.*/
2000                         default:
2001                                 igb_rss_disable(dev);
2002                                 break;
2003                 }
2004         }
2005
2006         return 0;
2007 }
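/*
 * Editorial sketch (not part of the upstream driver; the variable name is
 * hypothetical): the device-level configuration that makes the function
 * above take the ETH_MQ_RX_RSS branch in the non-SR-IOV case.  Kept under
 * "#if 0".
 */
#if 0
static const struct rte_eth_conf example_rss_conf = {
	.rxmode = {
		.mq_mode = ETH_MQ_RX_RSS,
	},
	.rx_adv_conf = {
		.rss_conf = {
			.rss_key = NULL,	/* fall back to rss_intel_key */
			.rss_hf = ETH_RSS_IPV4 | ETH_RSS_NONFRAG_IPV4_TCP,
		},
	},
};
/* Passed as: rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &example_rss_conf); */
#endif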
2008
2009 int
2010 eth_igb_rx_init(struct rte_eth_dev *dev)
2011 {
2012         struct e1000_hw     *hw;
2013         struct igb_rx_queue *rxq;
2014         uint32_t rctl;
2015         uint32_t rxcsum;
2016         uint32_t srrctl;
2017         uint16_t buf_size;
2018         uint16_t rctl_bsize;
2019         uint16_t i;
2020         int ret;
2021
2022         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2023         srrctl = 0;
2024
2025         /*
2026          * Make sure receives are disabled while setting
2027          * up the descriptor ring.
2028          */
2029         rctl = E1000_READ_REG(hw, E1000_RCTL);
2030         E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
2031
2032         /*
2033          * Configure support of jumbo frames, if any.
2034          */
2035         if (dev->data->dev_conf.rxmode.jumbo_frame == 1) {
2036                 rctl |= E1000_RCTL_LPE;
2037
2038                 /*
2039                  * Set the maximum packet length by default; it may be updated
2040                  * later when dual VLAN is enabled or disabled.
2041                  */
2042                 E1000_WRITE_REG(hw, E1000_RLPML,
2043                         dev->data->dev_conf.rxmode.max_rx_pkt_len +
2044                                                 VLAN_TAG_SIZE);
2045         } else
2046                 rctl &= ~E1000_RCTL_LPE;
2047
2048         /* Configure and enable each RX queue. */
2049         rctl_bsize = 0;
2050         dev->rx_pkt_burst = eth_igb_recv_pkts;
2051         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2052                 uint64_t bus_addr;
2053                 uint32_t rxdctl;
2054
2055                 rxq = dev->data->rx_queues[i];
2056
2057                 /* Allocate buffers for descriptor rings and set up queue */
2058                 ret = igb_alloc_rx_queue_mbufs(rxq);
2059                 if (ret)
2060                         return ret;
2061
2062                 /*
2063                  * Reset crc_len in case it was changed after queue setup by a
2064                  *  call to configure
2065                  */
2066                 rxq->crc_len =
2067                         (uint8_t)(dev->data->dev_conf.rxmode.hw_strip_crc ?
2068                                                         0 : ETHER_CRC_LEN);
2069
2070                 bus_addr = rxq->rx_ring_phys_addr;
2071                 E1000_WRITE_REG(hw, E1000_RDLEN(rxq->reg_idx),
2072                                 rxq->nb_rx_desc *
2073                                 sizeof(union e1000_adv_rx_desc));
2074                 E1000_WRITE_REG(hw, E1000_RDBAH(rxq->reg_idx),
2075                                 (uint32_t)(bus_addr >> 32));
2076                 E1000_WRITE_REG(hw, E1000_RDBAL(rxq->reg_idx), (uint32_t)bus_addr);
2077
2078                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
2079
2080                 /*
2081                  * Configure RX buffer size.
2082                  */
2083                 buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
2084                         RTE_PKTMBUF_HEADROOM);
2085                 if (buf_size >= 1024) {
2086                         /*
2087                          * Configure the BSIZEPACKET field of the SRRCTL
2088                          * register of the queue.
2089                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
2090                          * If this field is equal to 0b, then RCTL.BSIZE
2091                          * determines the RX packet buffer size.
2092                          */
2093                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
2094                                    E1000_SRRCTL_BSIZEPKT_MASK);
2095                         buf_size = (uint16_t) ((srrctl &
2096                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
2097                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
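			/*
			 * Editorial worked example: a 1920-byte buffer gives
			 * BSIZEPACKET = 1920 >> 10 = 1, i.e. 1 KB hardware
			 * buffers, and buf_size is rounded down to 1024 here;
			 * the check below then forces scattered RX if
			 * max_rx_pkt_len plus two VLAN tags does not fit.
			 */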
2098
2099                         /* Add the dual VLAN tag length to support dual VLAN */
2100                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
2101                                                 2 * VLAN_TAG_SIZE) > buf_size){
2102                                 if (!dev->data->scattered_rx)
2103                                         PMD_INIT_LOG(DEBUG,
2104                                                      "forcing scatter mode");
2105                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2106                                 dev->data->scattered_rx = 1;
2107                         }
2108                 } else {
2109                         /*
2110                          * Use BSIZE field of the device RCTL register.
2111                          */
2112                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
2113                                 rctl_bsize = buf_size;
2114                         if (!dev->data->scattered_rx)
2115                                 PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2116                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2117                         dev->data->scattered_rx = 1;
2118                 }
2119
2120                 /* Set if packets are dropped when no descriptors available */
2121                 if (rxq->drop_en)
2122                         srrctl |= E1000_SRRCTL_DROP_EN;
2123
2124                 E1000_WRITE_REG(hw, E1000_SRRCTL(rxq->reg_idx), srrctl);
2125
2126                 /* Enable this RX queue. */
2127                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(rxq->reg_idx));
2128                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2129                 rxdctl &= 0xFFF00000;
2130                 rxdctl |= (rxq->pthresh & 0x1F);
2131                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
2132                 rxdctl |= ((rxq->wthresh & 0x1F) << 16);
2133                 E1000_WRITE_REG(hw, E1000_RXDCTL(rxq->reg_idx), rxdctl);
2134         }
2135
2136         if (dev->data->dev_conf.rxmode.enable_scatter) {
2137                 if (!dev->data->scattered_rx)
2138                         PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2139                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2140                 dev->data->scattered_rx = 1;
2141         }
2142
2143         /*
2144          * Setup BSIZE field of RCTL register, if needed.
2145          * Buffer sizes >= 1024 are not [supposed to be] set up in the RCTL
2146          * register, since the code above configures the SRRCTL register of
2147          * the RX queue in such a case.
2148          * All configurable sizes are:
2149          * 16384: rctl |= (E1000_RCTL_SZ_16384 | E1000_RCTL_BSEX);
2150          *  8192: rctl |= (E1000_RCTL_SZ_8192  | E1000_RCTL_BSEX);
2151          *  4096: rctl |= (E1000_RCTL_SZ_4096  | E1000_RCTL_BSEX);
2152          *  2048: rctl |= E1000_RCTL_SZ_2048;
2153          *  1024: rctl |= E1000_RCTL_SZ_1024;
2154          *   512: rctl |= E1000_RCTL_SZ_512;
2155          *   256: rctl |= E1000_RCTL_SZ_256;
2156          */
2157         if (rctl_bsize > 0) {
2158                 if (rctl_bsize >= 512) /* 512 <= buf_size < 1024 - use 512 */
2159                         rctl |= E1000_RCTL_SZ_512;
2160                 else /* 256 <= buf_size < 512 - use 256 */
2161                         rctl |= E1000_RCTL_SZ_256;
2162         }
2163
2164         /*
2165          * Configure RSS if device configured with multiple RX queues.
2166          */
2167         igb_dev_mq_rx_configure(dev);
2168
2169         /* Update the rctl since igb_dev_mq_rx_configure may change its value */
2170         rctl |= E1000_READ_REG(hw, E1000_RCTL);
2171
2172         /*
2173          * Setup the Checksum Register.
2174          * Receive Full-Packet Checksum Offload is mutually exclusive with RSS.
2175          */
2176         rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
2177         rxcsum |= E1000_RXCSUM_PCSD;
2178
2179         /* Enable both L3/L4 rx checksum offload */
2180         if (dev->data->dev_conf.rxmode.hw_ip_checksum)
2181                 rxcsum |= (E1000_RXCSUM_IPOFL  | E1000_RXCSUM_TUOFL);
2182         else
2183                 rxcsum &= ~(E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL);
2184         E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
2185
2186         /* Setup the Receive Control Register. */
2187         if (dev->data->dev_conf.rxmode.hw_strip_crc) {
2188                 rctl |= E1000_RCTL_SECRC; /* Strip Ethernet CRC. */
2189
2190                 /* set STRCRC bit in all queues */
2191                 if (hw->mac.type == e1000_i350 ||
2192                     hw->mac.type == e1000_i210 ||
2193                     hw->mac.type == e1000_i211 ||
2194                     hw->mac.type == e1000_i354) {
2195                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2196                                 rxq = dev->data->rx_queues[i];
2197                                 uint32_t dvmolr = E1000_READ_REG(hw,
2198                                         E1000_DVMOLR(rxq->reg_idx));
2199                                 dvmolr |= E1000_DVMOLR_STRCRC;
2200                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
2201                         }
2202                 }
2203         } else {
2204                 rctl &= ~E1000_RCTL_SECRC; /* Do not Strip Ethernet CRC. */
2205
2206                 /* clear STRCRC bit in all queues */
2207                 if (hw->mac.type == e1000_i350 ||
2208                     hw->mac.type == e1000_i210 ||
2209                     hw->mac.type == e1000_i211 ||
2210                     hw->mac.type == e1000_i354) {
2211                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2212                                 rxq = dev->data->rx_queues[i];
2213                                 uint32_t dvmolr = E1000_READ_REG(hw,
2214                                         E1000_DVMOLR(rxq->reg_idx));
2215                                 dvmolr &= ~E1000_DVMOLR_STRCRC;
2216                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
2217                         }
2218                 }
2219         }
2220
2221         rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
2222         rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
2223                 E1000_RCTL_RDMTS_HALF |
2224                 (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
2225
2226         /* Make sure VLAN Filters are off. */
2227         if (dev->data->dev_conf.rxmode.mq_mode != ETH_MQ_RX_VMDQ_ONLY)
2228                 rctl &= ~E1000_RCTL_VFE;
2229         /* Don't store bad packets. */
2230         rctl &= ~E1000_RCTL_SBP;
2231
2232         /* Enable Receives. */
2233         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
2234
2235         /*
2236          * Setup the HW Rx Head and Tail Descriptor Pointers.
2237          * This needs to be done after enable.
2238          */
2239         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2240                 rxq = dev->data->rx_queues[i];
2241                 E1000_WRITE_REG(hw, E1000_RDH(rxq->reg_idx), 0);
2242                 E1000_WRITE_REG(hw, E1000_RDT(rxq->reg_idx), rxq->nb_rx_desc - 1);
2243         }
2244
2245         return 0;
2246 }
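/*
 * Editorial sketch (not part of the upstream driver; the variable name and
 * values are hypothetical): an rxmode configuration exercising the branches
 * of eth_igb_rx_init() above -- jumbo frames, L3/L4 checksum offload, CRC
 * stripping and explicitly scattered RX.  Kept under "#if 0".
 */
#if 0
static const struct rte_eth_rxmode example_rxmode = {
	.mq_mode = ETH_MQ_RX_NONE,
	.max_rx_pkt_len = 9018,	/* programmed into RLPML (+ VLAN_TAG_SIZE) */
	.jumbo_frame = 1,	/* sets RCTL.LPE */
	.hw_ip_checksum = 1,	/* sets RXCSUM.IPOFL | RXCSUM.TUOFL */
	.hw_strip_crc = 1,	/* sets RCTL.SECRC (and DVMOLR.STRCRC) */
	.enable_scatter = 1,	/* selects eth_igb_recv_scattered_pkts */
};
#endif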
2247
2248 /*********************************************************************
2249  *
2250  *  Enable transmit unit.
2251  *
2252  **********************************************************************/
2253 void
2254 eth_igb_tx_init(struct rte_eth_dev *dev)
2255 {
2256         struct e1000_hw     *hw;
2257         struct igb_tx_queue *txq;
2258         uint32_t tctl;
2259         uint32_t txdctl;
2260         uint16_t i;
2261
2262         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2263
2264         /* Setup the Base and Length of the Tx Descriptor Rings. */
2265         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2266                 uint64_t bus_addr;
2267                 txq = dev->data->tx_queues[i];
2268                 bus_addr = txq->tx_ring_phys_addr;
2269
2270                 E1000_WRITE_REG(hw, E1000_TDLEN(txq->reg_idx),
2271                                 txq->nb_tx_desc *
2272                                 sizeof(union e1000_adv_tx_desc));
2273                 E1000_WRITE_REG(hw, E1000_TDBAH(txq->reg_idx),
2274                                 (uint32_t)(bus_addr >> 32));
2275                 E1000_WRITE_REG(hw, E1000_TDBAL(txq->reg_idx), (uint32_t)bus_addr);
2276
2277                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2278                 E1000_WRITE_REG(hw, E1000_TDT(txq->reg_idx), 0);
2279                 E1000_WRITE_REG(hw, E1000_TDH(txq->reg_idx), 0);
2280
2281                 /* Setup Transmit threshold registers. */
2282                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(txq->reg_idx));
2283                 txdctl |= txq->pthresh & 0x1F;
2284                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2285                 txdctl |= ((txq->wthresh & 0x1F) << 16);
2286                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2287                 E1000_WRITE_REG(hw, E1000_TXDCTL(txq->reg_idx), txdctl);
2288         }
2289
2290         /* Program the Transmit Control Register. */
2291         tctl = E1000_READ_REG(hw, E1000_TCTL);
2292         tctl &= ~E1000_TCTL_CT;
2293         tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
2294                  (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));
2295
2296         e1000_config_collision_dist(hw);
2297
2298         /* This write will effectively turn on the transmit unit. */
2299         E1000_WRITE_REG(hw, E1000_TCTL, tctl);
2300 }
2301
2302 /*********************************************************************
2303  *
2304  *  Enable VF receive unit.
2305  *
2306  **********************************************************************/
2307 int
2308 eth_igbvf_rx_init(struct rte_eth_dev *dev)
2309 {
2310         struct e1000_hw     *hw;
2311         struct igb_rx_queue *rxq;
2312         uint32_t srrctl;
2313         uint16_t buf_size;
2314         uint16_t rctl_bsize;
2315         uint16_t i;
2316         int ret;
2317
2318         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2319
2320         /* setup MTU */
2321         e1000_rlpml_set_vf(hw,
2322                 (uint16_t)(dev->data->dev_conf.rxmode.max_rx_pkt_len +
2323                 VLAN_TAG_SIZE));
2324
2325         /* Configure and enable each RX queue. */
2326         rctl_bsize = 0;
2327         dev->rx_pkt_burst = eth_igb_recv_pkts;
2328         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2329                 uint64_t bus_addr;
2330                 uint32_t rxdctl;
2331
2332                 rxq = dev->data->rx_queues[i];
2333
2334                 /* Allocate buffers for descriptor rings and set up queue */
2335                 ret = igb_alloc_rx_queue_mbufs(rxq);
2336                 if (ret)
2337                         return ret;
2338
2339                 bus_addr = rxq->rx_ring_phys_addr;
2340                 E1000_WRITE_REG(hw, E1000_RDLEN(i),
2341                                 rxq->nb_rx_desc *
2342                                 sizeof(union e1000_adv_rx_desc));
2343                 E1000_WRITE_REG(hw, E1000_RDBAH(i),
2344                                 (uint32_t)(bus_addr >> 32));
2345                 E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);
2346
2347                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
2348
2349                 /*
2350                  * Configure RX buffer size.
2351                  */
2352                 buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
2353                         RTE_PKTMBUF_HEADROOM);
2354                 if (buf_size >= 1024) {
2355                         /*
2356                          * Configure the BSIZEPACKET field of the SRRCTL
2357                          * register of the queue.
2358                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
2359                          * If this field is equal to 0b, then RCTL.BSIZE
2360                          * determines the RX packet buffer size.
2361                          */
2362                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
2363                                    E1000_SRRCTL_BSIZEPKT_MASK);
2364                         buf_size = (uint16_t) ((srrctl &
2365                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
2366                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
2367
2368                         /* Add the dual VLAN tag length to support dual VLAN */
2369                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
2370                                                 2 * VLAN_TAG_SIZE) > buf_size){
2371                                 if (!dev->data->scattered_rx)
2372                                         PMD_INIT_LOG(DEBUG,
2373                                                      "forcing scatter mode");
2374                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2375                                 dev->data->scattered_rx = 1;
2376                         }
2377                 } else {
2378                         /*
2379                          * Use BSIZE field of the device RCTL register.
2380                          */
2381                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
2382                                 rctl_bsize = buf_size;
2383                         if (!dev->data->scattered_rx)
2384                                 PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2385                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2386                         dev->data->scattered_rx = 1;
2387                 }
2388
2389                 /* Set if packets are dropped when no descriptors available */
2390                 if (rxq->drop_en)
2391                         srrctl |= E1000_SRRCTL_DROP_EN;
2392
2393                 E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);
2394
2395                 /* Enable this RX queue. */
2396                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
2397                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2398                 rxdctl &= 0xFFF00000;
2399                 rxdctl |= (rxq->pthresh & 0x1F);
2400                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
2401                 if (hw->mac.type == e1000_vfadapt) {
2402                         /*
2403                          * Workaround for the 82576 VF erratum:
2404                          * force WTHRESH to 1 to avoid descriptor
2405                          * write-back sometimes not being triggered.
2406                          */
2407                         rxdctl |= 0x10000;
2408                         PMD_INIT_LOG(DEBUG, "Force set RX WTHRESH to 1 !");
2409                 }
2410                 else
2411                         rxdctl |= ((rxq->wthresh & 0x1F) << 16);
2412                 E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
2413         }
2414
2415         if (dev->data->dev_conf.rxmode.enable_scatter) {
2416                 if (!dev->data->scattered_rx)
2417                         PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2418                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2419                 dev->data->scattered_rx = 1;
2420         }
2421
2422         /*
2423          * Setup the HW Rx Head and Tail Descriptor Pointers.
2424          * This needs to be done after enable.
2425          */
2426         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2427                 rxq = dev->data->rx_queues[i];
2428                 E1000_WRITE_REG(hw, E1000_RDH(i), 0);
2429                 E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);
2430         }
2431
2432         return 0;
2433 }
2434
2435 /*********************************************************************
2436  *
2437  *  Enable VF transmit unit.
2438  *
2439  **********************************************************************/
2440 void
2441 eth_igbvf_tx_init(struct rte_eth_dev *dev)
2442 {
2443         struct e1000_hw     *hw;
2444         struct igb_tx_queue *txq;
2445         uint32_t txdctl;
2446         uint16_t i;
2447
2448         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2449
2450         /* Setup the Base and Length of the Tx Descriptor Rings. */
2451         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2452                 uint64_t bus_addr;
2453
2454                 txq = dev->data->tx_queues[i];
2455                 bus_addr = txq->tx_ring_phys_addr;
2456                 E1000_WRITE_REG(hw, E1000_TDLEN(i),
2457                                 txq->nb_tx_desc *
2458                                 sizeof(union e1000_adv_tx_desc));
2459                 E1000_WRITE_REG(hw, E1000_TDBAH(i),
2460                                 (uint32_t)(bus_addr >> 32));
2461                 E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);
2462
2463                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2464                 E1000_WRITE_REG(hw, E1000_TDT(i), 0);
2465                 E1000_WRITE_REG(hw, E1000_TDH(i), 0);
2466
2467                 /* Setup Transmit threshold registers. */
2468                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
2469                 txdctl |= txq->pthresh & 0x1F;
2470                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2471                 if (hw->mac.type == e1000_82576) {
2472                         /*
2473                          * Workaround for the 82576 VF erratum:
2474                          * force WTHRESH to 1 to avoid descriptor
2475                          * write-back sometimes not being triggered.
2476                          */
2477                         txdctl |= 0x10000;
2478                         PMD_INIT_LOG(DEBUG, "Force set TX WTHRESH to 1 !");
2479                 }
2480                 else
2481                         txdctl |= ((txq->wthresh & 0x1F) << 16);
2482                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2483                 E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
2484         }
2485
2486 }
2487
2488 void
2489 igb_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
2490         struct rte_eth_rxq_info *qinfo)
2491 {
2492         struct igb_rx_queue *rxq;
2493
2494         rxq = dev->data->rx_queues[queue_id];
2495
2496         qinfo->mp = rxq->mb_pool;
2497         qinfo->scattered_rx = dev->data->scattered_rx;
2498         qinfo->nb_desc = rxq->nb_rx_desc;
2499
2500         qinfo->conf.rx_free_thresh = rxq->rx_free_thresh;
2501         qinfo->conf.rx_drop_en = rxq->drop_en;
2502 }
2503
2504 void
2505 igb_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
2506         struct rte_eth_txq_info *qinfo)
2507 {
2508         struct igb_tx_queue *txq;
2509
2510         txq = dev->data->tx_queues[queue_id];
2511
2512         qinfo->nb_desc = txq->nb_tx_desc;
2513
2514         qinfo->conf.tx_thresh.pthresh = txq->pthresh;
2515         qinfo->conf.tx_thresh.hthresh = txq->hthresh;
2516         qinfo->conf.tx_thresh.wthresh = txq->wthresh;
2517 }