1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <sys/queue.h>
35
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <errno.h>
40 #include <stdint.h>
41 #include <stdarg.h>
42 #include <inttypes.h>
43
44 #include <rte_interrupts.h>
45 #include <rte_byteorder.h>
46 #include <rte_common.h>
47 #include <rte_log.h>
48 #include <rte_debug.h>
49 #include <rte_pci.h>
50 #include <rte_memory.h>
51 #include <rte_memcpy.h>
52 #include <rte_memzone.h>
53 #include <rte_launch.h>
54 #include <rte_eal.h>
55 #include <rte_per_lcore.h>
56 #include <rte_lcore.h>
57 #include <rte_atomic.h>
58 #include <rte_branch_prediction.h>
59 #include <rte_mempool.h>
60 #include <rte_malloc.h>
61 #include <rte_mbuf.h>
62 #include <rte_ether.h>
63 #include <rte_ethdev.h>
64 #include <rte_prefetch.h>
65 #include <rte_udp.h>
66 #include <rte_tcp.h>
67 #include <rte_sctp.h>
68 #include <rte_net.h>
69 #include <rte_string_fns.h>
70
71 #include "e1000_logs.h"
72 #include "base/e1000_api.h"
73 #include "e1000_ethdev.h"
74
75 #ifdef RTE_LIBRTE_IEEE1588
76 #define IGB_TX_IEEE1588_TMST PKT_TX_IEEE1588_TMST
77 #else
78 #define IGB_TX_IEEE1588_TMST 0
79 #endif
80 /* Bit mask indicating which bits are required for building the TX context. */
81 #define IGB_TX_OFFLOAD_MASK (                    \
82                 PKT_TX_VLAN_PKT |                \
83                 PKT_TX_IP_CKSUM |                \
84                 PKT_TX_L4_MASK |                 \
85                 PKT_TX_TCP_SEG |                 \
86                 IGB_TX_IEEE1588_TMST)
87
88 #define IGB_TX_OFFLOAD_NOTSUP_MASK \
89                 (PKT_TX_OFFLOAD_MASK ^ IGB_TX_OFFLOAD_MASK)
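/*
 * IGB_TX_OFFLOAD_NOTSUP_MASK is the set difference between the generic
 * PKT_TX_OFFLOAD_MASK and the offload flags handled by this driver.  Since
 * IGB_TX_OFFLOAD_MASK is a subset of PKT_TX_OFFLOAD_MASK, the XOR leaves
 * exactly the unsupported bits set; eth_igb_prep_pkts() rejects any mbuf
 * that carries one of them.
 */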
90
91 /**
92  * Structure associated with each descriptor of the RX ring of a RX queue.
93  */
94 struct igb_rx_entry {
95         struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
96 };
97
98 /**
99  * Structure associated with each descriptor of the TX ring of a TX queue.
100  */
101 struct igb_tx_entry {
102         struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
103         uint16_t next_id; /**< Index of next descriptor in ring. */
104         uint16_t last_id; /**< Index of last scattered descriptor. */
105 };
106
107 /**
108  * rx queue flags
109  */
110 enum igb_rxq_flags {
111         IGB_RXQ_FLAG_LB_BSWAP_VLAN = 0x01,
112 };
113
114 /**
115  * Structure associated with each RX queue.
116  */
117 struct igb_rx_queue {
118         struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
119         volatile union e1000_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
120         uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
121         volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
122         volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
123         struct igb_rx_entry *sw_ring;   /**< address of RX software ring. */
124         struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
125         struct rte_mbuf *pkt_last_seg;  /**< Last segment of current packet. */
126         uint16_t            nb_rx_desc; /**< number of RX descriptors. */
127         uint16_t            rx_tail;    /**< current value of RDT register. */
128         uint16_t            nb_rx_hold; /**< number of held free RX desc. */
129         uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
130         uint16_t            queue_id;   /**< RX queue index. */
131         uint16_t            reg_idx;    /**< RX queue register index. */
132         uint16_t            port_id;    /**< Device port identifier. */
133         uint8_t             pthresh;    /**< Prefetch threshold register. */
134         uint8_t             hthresh;    /**< Host threshold register. */
135         uint8_t             wthresh;    /**< Write-back threshold register. */
136         uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
137         uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En. */
138         uint32_t            flags;      /**< RX flags. */
139 };
140
141 /**
142  * Hardware context number
143  */
144 enum igb_advctx_num {
145         IGB_CTX_0    = 0, /**< CTX0    */
146         IGB_CTX_1    = 1, /**< CTX1    */
147         IGB_CTX_NUM  = 2, /**< CTX_NUM */
148 };
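/*
 * The driver caches two hardware TX contexts per queue (IGB_CTX_0 and
 * IGB_CTX_1); IGB_CTX_NUM doubles as the "no matching cached context"
 * return value of what_advctx_update() below.
 */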
149
150 /** Offload features */
151 union igb_tx_offload {
152         uint64_t data;
153         struct {
154                 uint64_t l3_len:9; /**< L3 (IP) Header Length. */
155                 uint64_t l2_len:7; /**< L2 (MAC) Header Length. */
156                 uint64_t vlan_tci:16;  /**< VLAN Tag Control Identifier (CPU order). */
157                 uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */
158                 uint64_t tso_segsz:16; /**< TCP TSO segment size. */
159
160                 /* uint64_t unused:8; */
161         };
162 };
163
164 /*
165  * Compare masks for igb_tx_offload.data;
166  * they must be kept in sync with the igb_tx_offload bit-field layout.
167  */
168 #define TX_MACIP_LEN_CMP_MASK   0x000000000000FFFFULL /**< L2L3 header mask. */
169 #define TX_VLAN_CMP_MASK                0x00000000FFFF0000ULL /**< Vlan mask. */
170 #define TX_TCP_LEN_CMP_MASK             0x000000FF00000000ULL /**< TCP header mask. */
171 #define TX_TSO_MSS_CMP_MASK             0x00FFFF0000000000ULL /**< TSO segsz mask. */
172 /** Mac + IP + TCP + Mss mask. */
173 #define TX_TSO_CMP_MASK \
174         (TX_MACIP_LEN_CMP_MASK | TX_TCP_LEN_CMP_MASK | TX_TSO_MSS_CMP_MASK)
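/*
 * The compare masks above mirror the igb_tx_offload bit-field layout:
 * l3_len (9 bits) and l2_len (7 bits) occupy bits 0-15 (TX_MACIP_LEN_CMP_MASK),
 * vlan_tci bits 16-31 (TX_VLAN_CMP_MASK), l4_len bits 32-39
 * (TX_TCP_LEN_CMP_MASK) and tso_segsz bits 40-55 (TX_TSO_MSS_CMP_MASK).
 * Any change to the union above must be reflected in these masks.
 */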
175
176 /**
177  * Structure used to check whether a new context descriptor needs to be built
178  */
179 struct igb_advctx_info {
180         uint64_t flags;           /**< ol_flags related to context build. */
181         /** tx offload: vlan, tso, l2-l3-l4 lengths. */
182         union igb_tx_offload tx_offload;
183         /** compare mask for tx offload. */
184         union igb_tx_offload tx_offload_mask;
185 };
186
187 /**
188  * Structure associated with each TX queue.
189  */
190 struct igb_tx_queue {
191         volatile union e1000_adv_tx_desc *tx_ring; /**< TX ring address */
192         uint64_t               tx_ring_phys_addr; /**< TX ring DMA address. */
193         struct igb_tx_entry    *sw_ring; /**< virtual address of SW ring. */
194         volatile uint32_t      *tdt_reg_addr; /**< Address of TDT register. */
195         uint32_t               txd_type;      /**< Device-specific TXD type */
196         uint16_t               nb_tx_desc;    /**< number of TX descriptors. */
197         uint16_t               tx_tail; /**< Current value of TDT register. */
198         uint16_t               tx_head;
199         /**< Index of first used TX descriptor. */
200         uint16_t               queue_id; /**< TX queue index. */
201         uint16_t               reg_idx;  /**< TX queue register index. */
202         uint16_t               port_id;  /**< Device port identifier. */
203         uint8_t                pthresh;  /**< Prefetch threshold register. */
204         uint8_t                hthresh;  /**< Host threshold register. */
205         uint8_t                wthresh;  /**< Write-back threshold register. */
206         uint32_t               ctx_curr;
207         /**< Index of the hardware context currently in use. */
208         uint32_t               ctx_start;
209         /**< Start context position for transmit queue. */
210         struct igb_advctx_info ctx_cache[IGB_CTX_NUM];
211         /**< Hardware context history.*/
212 };
213
214 #if 1
215 #define RTE_PMD_USE_PREFETCH
216 #endif
217
218 #ifdef RTE_PMD_USE_PREFETCH
219 #define rte_igb_prefetch(p)     rte_prefetch0(p)
220 #else
221 #define rte_igb_prefetch(p)     do {} while(0)
222 #endif
223
224 #ifdef RTE_PMD_PACKET_PREFETCH
225 #define rte_packet_prefetch(p) rte_prefetch1(p)
226 #else
227 #define rte_packet_prefetch(p)  do {} while(0)
228 #endif
229
230 /*
231  * Macros for the VMDq feature of 1 GbE NICs and for TSO limits.
232  */
233 #define E1000_VMOLR_SIZE                        (8)
234 #define IGB_TSO_MAX_HDRLEN                      (512)
235 #define IGB_TSO_MAX_MSS                         (9216)
236
237 /*********************************************************************
238  *
239  *  TX function
240  *
241  **********************************************************************/
242
243 /*
244  * There are some hardware limitations for TCP segmentation offload (TSO),
245  * so we should check whether the parameters are valid.
246  */
247 static inline uint64_t
248 check_tso_para(uint64_t ol_req, union igb_tx_offload ol_para)
249 {
250         if (!(ol_req & PKT_TX_TCP_SEG))
251                 return ol_req;
252         if ((ol_para.tso_segsz > IGB_TSO_MAX_MSS) || (ol_para.l2_len +
253                         ol_para.l3_len + ol_para.l4_len > IGB_TSO_MAX_HDRLEN)) {
254                 ol_req &= ~PKT_TX_TCP_SEG;
255                 ol_req |= PKT_TX_TCP_CKSUM;
256         }
257         return ol_req;
258 }
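/*
 * Note that, unlike eth_igb_prep_pkts() below, this helper does not reject an
 * oversized TSO request: it silently downgrades it to a plain TCP checksum
 * offload for that packet.
 */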
259
260 /*
261  * Advanced context descriptors are almost the same between igb and ixgbe.
262  * This is kept as a separate function, leaving room for optimization here;
263  * it could be reworked to use pre-defined values.
264  */
265
266 static inline void
267 igbe_set_xmit_ctx(struct igb_tx_queue* txq,
268                 volatile struct e1000_adv_tx_context_desc *ctx_txd,
269                 uint64_t ol_flags, union igb_tx_offload tx_offload)
270 {
271         uint32_t type_tucmd_mlhl;
272         uint32_t mss_l4len_idx;
273         uint32_t ctx_idx, ctx_curr;
274         uint32_t vlan_macip_lens;
275         union igb_tx_offload tx_offload_mask;
276
277         ctx_curr = txq->ctx_curr;
278         ctx_idx = ctx_curr + txq->ctx_start;
279
280         tx_offload_mask.data = 0;
281         type_tucmd_mlhl = 0;
282
283         /* Specify which HW CTX to upload. */
284         mss_l4len_idx = (ctx_idx << E1000_ADVTXD_IDX_SHIFT);
285
286         if (ol_flags & PKT_TX_VLAN_PKT)
287                 tx_offload_mask.data |= TX_VLAN_CMP_MASK;
288
289         /* check if TCP segmentation required for this packet */
290         if (ol_flags & PKT_TX_TCP_SEG) {
291                 /* implies IP cksum in IPv4 */
292                 if (ol_flags & PKT_TX_IP_CKSUM)
293                         type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4 |
294                                 E1000_ADVTXD_TUCMD_L4T_TCP |
295                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
296                 else
297                         type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV6 |
298                                 E1000_ADVTXD_TUCMD_L4T_TCP |
299                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
300
301                 tx_offload_mask.data |= TX_TSO_CMP_MASK;
302                 mss_l4len_idx |= tx_offload.tso_segsz << E1000_ADVTXD_MSS_SHIFT;
303                 mss_l4len_idx |= tx_offload.l4_len << E1000_ADVTXD_L4LEN_SHIFT;
304         } else { /* no TSO, check if hardware checksum is needed */
305                 if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_L4_MASK))
306                         tx_offload_mask.data |= TX_MACIP_LEN_CMP_MASK;
307
308                 if (ol_flags & PKT_TX_IP_CKSUM)
309                         type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4;
310
311                 switch (ol_flags & PKT_TX_L4_MASK) {
312                 case PKT_TX_UDP_CKSUM:
313                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP |
314                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
315                         mss_l4len_idx |= sizeof(struct udp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
316                         break;
317                 case PKT_TX_TCP_CKSUM:
318                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP |
319                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
320                         mss_l4len_idx |= sizeof(struct tcp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
321                         break;
322                 case PKT_TX_SCTP_CKSUM:
323                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP |
324                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
325                         mss_l4len_idx |= sizeof(struct sctp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
326                         break;
327                 default:
328                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_RSV |
329                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
330                         break;
331                 }
332         }
333
334         txq->ctx_cache[ctx_curr].flags = ol_flags;
335         txq->ctx_cache[ctx_curr].tx_offload.data =
336                 tx_offload_mask.data & tx_offload.data;
337         txq->ctx_cache[ctx_curr].tx_offload_mask = tx_offload_mask;
338
339         ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
340         vlan_macip_lens = (uint32_t)tx_offload.data;
341         ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
342         ctx_txd->mss_l4len_idx = rte_cpu_to_le_32(mss_l4len_idx);
343         ctx_txd->seqnum_seed = 0;
344 }
345
346 /*
347  * Check which hardware context can be used. Use the existing match
348  * or create a new context descriptor.
349  */
350 static inline uint32_t
351 what_advctx_update(struct igb_tx_queue *txq, uint64_t flags,
352                 union igb_tx_offload tx_offload)
353 {
354         /* If it matches the current context */
355         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
356                 (txq->ctx_cache[txq->ctx_curr].tx_offload.data ==
357                 (txq->ctx_cache[txq->ctx_curr].tx_offload_mask.data & tx_offload.data)))) {
358                         return txq->ctx_curr;
359         }
360
361         /* Otherwise, check whether the second context matches */
362         txq->ctx_curr ^= 1;
363         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
364                 (txq->ctx_cache[txq->ctx_curr].tx_offload.data ==
365                 (txq->ctx_cache[txq->ctx_curr].tx_offload_mask.data & tx_offload.data)))) {
366                         return txq->ctx_curr;
367         }
368
369         /* Mismatch: the caller must build a new context descriptor (IGB_CTX_NUM) */
370         return IGB_CTX_NUM;
371 }
372
373 static inline uint32_t
374 tx_desc_cksum_flags_to_olinfo(uint64_t ol_flags)
375 {
376         static const uint32_t l4_olinfo[2] = {0, E1000_ADVTXD_POPTS_TXSM};
377         static const uint32_t l3_olinfo[2] = {0, E1000_ADVTXD_POPTS_IXSM};
378         uint32_t tmp;
379
380         tmp  = l4_olinfo[(ol_flags & PKT_TX_L4_MASK)  != PKT_TX_L4_NO_CKSUM];
381         tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
382         tmp |= l4_olinfo[(ol_flags & PKT_TX_TCP_SEG) != 0];
383         return tmp;
384 }
385
386 static inline uint32_t
387 tx_desc_vlan_flags_to_cmdtype(uint64_t ol_flags)
388 {
389         uint32_t cmdtype;
390         static uint32_t vlan_cmd[2] = {0, E1000_ADVTXD_DCMD_VLE};
391         static uint32_t tso_cmd[2] = {0, E1000_ADVTXD_DCMD_TSE};
392         cmdtype = vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
393         cmdtype |= tso_cmd[(ol_flags & PKT_TX_TCP_SEG) != 0];
394         return cmdtype;
395 }
396
397 uint16_t
398 eth_igb_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
399                uint16_t nb_pkts)
400 {
401         struct igb_tx_queue *txq;
402         struct igb_tx_entry *sw_ring;
403         struct igb_tx_entry *txe, *txn;
404         volatile union e1000_adv_tx_desc *txr;
405         volatile union e1000_adv_tx_desc *txd;
406         struct rte_mbuf     *tx_pkt;
407         struct rte_mbuf     *m_seg;
408         uint64_t buf_dma_addr;
409         uint32_t olinfo_status;
410         uint32_t cmd_type_len;
411         uint32_t pkt_len;
412         uint16_t slen;
413         uint64_t ol_flags;
414         uint16_t tx_end;
415         uint16_t tx_id;
416         uint16_t tx_last;
417         uint16_t nb_tx;
418         uint64_t tx_ol_req;
419         uint32_t new_ctx = 0;
420         uint32_t ctx = 0;
421         union igb_tx_offload tx_offload = {0};
422
423         txq = tx_queue;
424         sw_ring = txq->sw_ring;
425         txr     = txq->tx_ring;
426         tx_id   = txq->tx_tail;
427         txe = &sw_ring[tx_id];
428
429         for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
430                 tx_pkt = *tx_pkts++;
431                 pkt_len = tx_pkt->pkt_len;
432
433                 RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);
434
435                 /*
436                  * The number of descriptors that must be allocated for a
437                  * packet is the number of segments of that packet, plus 1
438                  * Context Descriptor for the VLAN Tag Identifier, if any.
439                  * Determine the last TX descriptor to allocate in the TX ring
440                  * for the packet, starting from the current position (tx_id)
441                  * in the ring.
442                  */
443                 tx_last = (uint16_t) (tx_id + tx_pkt->nb_segs - 1);
444
445                 ol_flags = tx_pkt->ol_flags;
446                 tx_ol_req = ol_flags & IGB_TX_OFFLOAD_MASK;
447
448                 /* Check whether a Context Descriptor needs to be built. */
449                 if (tx_ol_req) {
450                         tx_offload.l2_len = tx_pkt->l2_len;
451                         tx_offload.l3_len = tx_pkt->l3_len;
452                         tx_offload.l4_len = tx_pkt->l4_len;
453                         tx_offload.vlan_tci = tx_pkt->vlan_tci;
454                         tx_offload.tso_segsz = tx_pkt->tso_segsz;
455                         tx_ol_req = check_tso_para(tx_ol_req, tx_offload);
456
457                         ctx = what_advctx_update(txq, tx_ol_req, tx_offload);
458                         /* Only allocate a context descriptor if required. */
459                         new_ctx = (ctx == IGB_CTX_NUM);
460                         ctx = txq->ctx_curr + txq->ctx_start;
461                         tx_last = (uint16_t) (tx_last + new_ctx);
462                 }
463                 if (tx_last >= txq->nb_tx_desc)
464                         tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
465
466                 PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
467                            " tx_first=%u tx_last=%u",
468                            (unsigned) txq->port_id,
469                            (unsigned) txq->queue_id,
470                            (unsigned) pkt_len,
471                            (unsigned) tx_id,
472                            (unsigned) tx_last);
473
474                 /*
475                  * Check if there are enough free descriptors in the TX ring
476                  * to transmit the next packet.
477                  * This operation is based on the two following rules:
478                  *
479                  *   1- Only check that the last needed TX descriptor can be
480                  *      allocated (by construction, if that descriptor is free,
481                  *      all intermediate ones are also free).
482                  *
483                  *      For this purpose, the index of the last TX descriptor
484                  *      used for a packet (the "last descriptor" of a packet)
485                  *      is recorded in the TX entries (the last one included)
486                  *      that are associated with all TX descriptors allocated
487                  *      for that packet.
488                  *
489                  *   2- Avoid allocating the last free TX descriptor of the
490                  *      ring, in order to never set the TDT register with the
491                  *      same value stored in parallel by the NIC in the TDH
492                  *      register, which would make the TX engine of the NIC
493                  *      enter a deadlock situation.
494                  *
495                  *      By extension, avoid allocating a free descriptor that
496                  *      belongs to the last set of free descriptors allocated
497                  *      to the same packet previously transmitted.
498                  */
499
500                 /*
501                  * The "last descriptor" of the packet previously sent, if any,
502                  * which used the descriptor that we now want as our last one.
503                  */
504                 tx_end = sw_ring[tx_last].last_id;
505
506                 /*
507                  * The next descriptor following that "last descriptor" in the
508                  * ring.
509                  */
510                 tx_end = sw_ring[tx_end].next_id;
511
512                 /*
513                  * The "last descriptor" associated with that next descriptor.
514                  */
515                 tx_end = sw_ring[tx_end].last_id;
516
517                 /*
518                  * Check that this descriptor is free.
519                  */
520                 if (! (txr[tx_end].wb.status & E1000_TXD_STAT_DD)) {
521                         if (nb_tx == 0)
522                                 return 0;
523                         goto end_of_tx;
524                 }
525
526                 /*
527                  * Set common flags of all TX Data Descriptors.
528                  *
529                  * The following bits must be set in all Data Descriptors:
530                  *   - E1000_ADVTXD_DTYP_DATA
531                  *   - E1000_ADVTXD_DCMD_DEXT
532                  *
533                  * The following bits must be set in the first Data Descriptor
534                  * and are ignored in the other ones:
535                  *   - E1000_ADVTXD_DCMD_IFCS
536                  *   - E1000_ADVTXD_MAC_1588
537                  *   - E1000_ADVTXD_DCMD_VLE
538                  *
539                  * The following bits must only be set in the last Data
540                  * Descriptor:
541                  *   - E1000_TXD_CMD_EOP
542                  *
543                  * The following bits can be set in any Data Descriptor, but
544                  * are only set in the last Data Descriptor:
545                  *   - E1000_TXD_CMD_RS
546                  */
547                 cmd_type_len = txq->txd_type |
548                         E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT;
549                 if (tx_ol_req & PKT_TX_TCP_SEG)
550                         pkt_len -= (tx_pkt->l2_len + tx_pkt->l3_len + tx_pkt->l4_len);
551                 olinfo_status = (pkt_len << E1000_ADVTXD_PAYLEN_SHIFT);
552 #if defined(RTE_LIBRTE_IEEE1588)
553                 if (ol_flags & PKT_TX_IEEE1588_TMST)
554                         cmd_type_len |= E1000_ADVTXD_MAC_TSTAMP;
555 #endif
556                 if (tx_ol_req) {
557                         /* Setup TX Advanced context descriptor if required */
558                         if (new_ctx) {
559                                 volatile struct e1000_adv_tx_context_desc *
560                                     ctx_txd;
561
562                                 ctx_txd = (volatile struct
563                                     e1000_adv_tx_context_desc *)
564                                     &txr[tx_id];
565
566                                 txn = &sw_ring[txe->next_id];
567                                 RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
568
569                                 if (txe->mbuf != NULL) {
570                                         rte_pktmbuf_free_seg(txe->mbuf);
571                                         txe->mbuf = NULL;
572                                 }
573
574                                 igbe_set_xmit_ctx(txq, ctx_txd, tx_ol_req, tx_offload);
575
576                                 txe->last_id = tx_last;
577                                 tx_id = txe->next_id;
578                                 txe = txn;
579                         }
580
581                         /* Setup the TX Advanced Data Descriptor */
582                         cmd_type_len  |= tx_desc_vlan_flags_to_cmdtype(tx_ol_req);
583                         olinfo_status |= tx_desc_cksum_flags_to_olinfo(tx_ol_req);
584                         olinfo_status |= (ctx << E1000_ADVTXD_IDX_SHIFT);
585                 }
586
587                 m_seg = tx_pkt;
588                 do {
589                         txn = &sw_ring[txe->next_id];
590                         txd = &txr[tx_id];
591
592                         if (txe->mbuf != NULL)
593                                 rte_pktmbuf_free_seg(txe->mbuf);
594                         txe->mbuf = m_seg;
595
596                         /*
597                          * Set up transmit descriptor.
598                          */
599                         slen = (uint16_t) m_seg->data_len;
600                         buf_dma_addr = rte_mbuf_data_iova(m_seg);
601                         txd->read.buffer_addr =
602                                 rte_cpu_to_le_64(buf_dma_addr);
603                         txd->read.cmd_type_len =
604                                 rte_cpu_to_le_32(cmd_type_len | slen);
605                         txd->read.olinfo_status =
606                                 rte_cpu_to_le_32(olinfo_status);
607                         txe->last_id = tx_last;
608                         tx_id = txe->next_id;
609                         txe = txn;
610                         m_seg = m_seg->next;
611                 } while (m_seg != NULL);
612
613                 /*
614                  * The last packet data descriptor needs End Of Packet (EOP)
615                  * and Report Status (RS).
616                  */
617                 txd->read.cmd_type_len |=
618                         rte_cpu_to_le_32(E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS);
619         }
620  end_of_tx:
621         rte_wmb();
622
623         /*
624          * Set the Transmit Descriptor Tail (TDT).
625          */
626         E1000_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
627         PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
628                    (unsigned) txq->port_id, (unsigned) txq->queue_id,
629                    (unsigned) tx_id, (unsigned) nb_tx);
630         txq->tx_tail = tx_id;
631
632         return nb_tx;
633 }
634
635 /*********************************************************************
636  *
637  *  TX prep functions
638  *
639  **********************************************************************/
640 uint16_t
641 eth_igb_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
642                 uint16_t nb_pkts)
643 {
644         int i, ret;
645         struct rte_mbuf *m;
646
647         for (i = 0; i < nb_pkts; i++) {
648                 m = tx_pkts[i];
649
650                 /* Check some limitations for TSO in hardware */
651                 if (m->ol_flags & PKT_TX_TCP_SEG)
652                         if ((m->tso_segsz > IGB_TSO_MAX_MSS) ||
653                                         (m->l2_len + m->l3_len + m->l4_len >
654                                         IGB_TSO_MAX_HDRLEN)) {
655                                 rte_errno = EINVAL; /* rte_errno takes positive errno values */
656                                 return i;
657                         }
658
659                 if (m->ol_flags & IGB_TX_OFFLOAD_NOTSUP_MASK) {
660                         rte_errno = ENOTSUP;
661                         return i;
662                 }
663
664 #ifdef RTE_LIBRTE_ETHDEV_DEBUG
665                 ret = rte_validate_tx_offload(m);
666                 if (ret != 0) {
667                         rte_errno = -ret; /* ret is a negative errno value */
668                         return i;
669                 }
670 #endif
671                 ret = rte_net_intel_cksum_prepare(m);
672                 if (ret != 0) {
673                         rte_errno = -ret;
674                         return i;
675                 }
676         }
677
678         return i;
679 }
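/*
 * Usage note: this function is registered as the device's tx_pkt_prepare
 * callback, so applications normally reach it through the generic ethdev
 * API before transmitting, e.g. (illustrative sketch only; error handling
 * and handle_tx_prep_error() are placeholders for application logic):
 *
 *      uint16_t nb = rte_eth_tx_prepare(port_id, queue_id, pkts, nb_pkts);
 *      if (nb < nb_pkts)
 *              handle_tx_prep_error(rte_errno, pkts[nb]);
 *      nb = rte_eth_tx_burst(port_id, queue_id, pkts, nb);
 */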
680
681 /*********************************************************************
682  *
683  *  RX functions
684  *
685  **********************************************************************/
686 #define IGB_PACKET_TYPE_IPV4              0X01
687 #define IGB_PACKET_TYPE_IPV4_TCP          0X11
688 #define IGB_PACKET_TYPE_IPV4_UDP          0X21
689 #define IGB_PACKET_TYPE_IPV4_SCTP         0X41
690 #define IGB_PACKET_TYPE_IPV4_EXT          0X03
691 #define IGB_PACKET_TYPE_IPV4_EXT_SCTP     0X43
692 #define IGB_PACKET_TYPE_IPV6              0X04
693 #define IGB_PACKET_TYPE_IPV6_TCP          0X14
694 #define IGB_PACKET_TYPE_IPV6_UDP          0X24
695 #define IGB_PACKET_TYPE_IPV6_EXT          0X0C
696 #define IGB_PACKET_TYPE_IPV6_EXT_TCP      0X1C
697 #define IGB_PACKET_TYPE_IPV6_EXT_UDP      0X2C
698 #define IGB_PACKET_TYPE_IPV4_IPV6         0X05
699 #define IGB_PACKET_TYPE_IPV4_IPV6_TCP     0X15
700 #define IGB_PACKET_TYPE_IPV4_IPV6_UDP     0X25
701 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT     0X0D
702 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT_TCP 0X1D
703 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT_UDP 0X2D
704 #define IGB_PACKET_TYPE_MAX               0X80
705 #define IGB_PACKET_TYPE_MASK              0X7F
706 #define IGB_PACKET_TYPE_SHIFT             0X04
707 static inline uint32_t
708 igb_rxd_pkt_info_to_pkt_type(uint16_t pkt_info)
709 {
710         static const uint32_t
711                 ptype_table[IGB_PACKET_TYPE_MAX] __rte_cache_aligned = {
712                 [IGB_PACKET_TYPE_IPV4] = RTE_PTYPE_L2_ETHER |
713                         RTE_PTYPE_L3_IPV4,
714                 [IGB_PACKET_TYPE_IPV4_EXT] = RTE_PTYPE_L2_ETHER |
715                         RTE_PTYPE_L3_IPV4_EXT,
716                 [IGB_PACKET_TYPE_IPV6] = RTE_PTYPE_L2_ETHER |
717                         RTE_PTYPE_L3_IPV6,
718                 [IGB_PACKET_TYPE_IPV4_IPV6] = RTE_PTYPE_L2_ETHER |
719                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
720                         RTE_PTYPE_INNER_L3_IPV6,
721                 [IGB_PACKET_TYPE_IPV6_EXT] = RTE_PTYPE_L2_ETHER |
722                         RTE_PTYPE_L3_IPV6_EXT,
723                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT] = RTE_PTYPE_L2_ETHER |
724                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
725                         RTE_PTYPE_INNER_L3_IPV6_EXT,
726                 [IGB_PACKET_TYPE_IPV4_TCP] = RTE_PTYPE_L2_ETHER |
727                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP,
728                 [IGB_PACKET_TYPE_IPV6_TCP] = RTE_PTYPE_L2_ETHER |
729                         RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L4_TCP,
730                 [IGB_PACKET_TYPE_IPV4_IPV6_TCP] = RTE_PTYPE_L2_ETHER |
731                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
732                         RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_TCP,
733                 [IGB_PACKET_TYPE_IPV6_EXT_TCP] = RTE_PTYPE_L2_ETHER |
734                         RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L4_TCP,
735                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT_TCP] = RTE_PTYPE_L2_ETHER |
736                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
737                         RTE_PTYPE_INNER_L3_IPV6_EXT | RTE_PTYPE_INNER_L4_TCP,
738                 [IGB_PACKET_TYPE_IPV4_UDP] = RTE_PTYPE_L2_ETHER |
739                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_UDP,
740                 [IGB_PACKET_TYPE_IPV6_UDP] = RTE_PTYPE_L2_ETHER |
741                         RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L4_UDP,
742                 [IGB_PACKET_TYPE_IPV4_IPV6_UDP] =  RTE_PTYPE_L2_ETHER |
743                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
744                         RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_UDP,
745                 [IGB_PACKET_TYPE_IPV6_EXT_UDP] = RTE_PTYPE_L2_ETHER |
746                         RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L4_UDP,
747                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT_UDP] = RTE_PTYPE_L2_ETHER |
748                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
749                         RTE_PTYPE_INNER_L3_IPV6_EXT | RTE_PTYPE_INNER_L4_UDP,
750                 [IGB_PACKET_TYPE_IPV4_SCTP] = RTE_PTYPE_L2_ETHER |
751                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_SCTP,
752                 [IGB_PACKET_TYPE_IPV4_EXT_SCTP] = RTE_PTYPE_L2_ETHER |
753                         RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L4_SCTP,
754         };
755         if (unlikely(pkt_info & E1000_RXDADV_PKTTYPE_ETQF))
756                 return RTE_PTYPE_UNKNOWN;
757
758         pkt_info = (pkt_info >> IGB_PACKET_TYPE_SHIFT) & IGB_PACKET_TYPE_MASK;
759
760         return ptype_table[pkt_info];
761 }
762
763 static inline uint64_t
764 rx_desc_hlen_type_rss_to_pkt_flags(struct igb_rx_queue *rxq, uint32_t hl_tp_rs)
765 {
766         uint64_t pkt_flags = ((hl_tp_rs & 0x0F) == 0) ?  0 : PKT_RX_RSS_HASH;
767
768 #if defined(RTE_LIBRTE_IEEE1588)
769         static uint32_t ip_pkt_etqf_map[8] = {
770                 0, 0, 0, PKT_RX_IEEE1588_PTP,
771                 0, 0, 0, 0,
772         };
773
774         struct rte_eth_dev dev = rte_eth_devices[rxq->port_id];
775         struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev.data->dev_private);
776
777         /* EtherType is in bits 8:10 in Packet Type, and not in the default 0:2 */
778         if (hw->mac.type == e1000_i210)
779                 pkt_flags |= ip_pkt_etqf_map[(hl_tp_rs >> 12) & 0x07];
780         else
781                 pkt_flags |= ip_pkt_etqf_map[(hl_tp_rs >> 4) & 0x07];
782 #else
783         RTE_SET_USED(rxq);
784 #endif
785
786         return pkt_flags;
787 }
788
789 static inline uint64_t
790 rx_desc_status_to_pkt_flags(uint32_t rx_status)
791 {
792         uint64_t pkt_flags;
793
794         /* Check if VLAN present */
795         pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ?
796                 PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED : 0);
797
798 #if defined(RTE_LIBRTE_IEEE1588)
799         if (rx_status & E1000_RXD_STAT_TMST)
800                 pkt_flags = pkt_flags | PKT_RX_IEEE1588_TMST;
801 #endif
802         return pkt_flags;
803 }
804
805 static inline uint64_t
806 rx_desc_error_to_pkt_flags(uint32_t rx_status)
807 {
808         /*
809          * Bit 30: IPE, IPv4 checksum error
810          * Bit 29: L4I, L4I integrity error
811          */
812
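        /*
         * The map below is indexed by ((rx_status >> E1000_RXD_ERR_CKSUM_BIT) &
         * E1000_RXD_ERR_CKSUM_MSK): bit 0 corresponds to the L4 checksum error
         * and bit 1 to the IPv4 checksum error, matching the four GOOD/BAD
         * combinations listed in order.
         */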
813         static uint64_t error_to_pkt_flags_map[4] = {
814                 PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD,
815                 PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD,
816                 PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD,
817                 PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
818         };
819         return error_to_pkt_flags_map[(rx_status >>
820                 E1000_RXD_ERR_CKSUM_BIT) & E1000_RXD_ERR_CKSUM_MSK];
821 }
822
823 uint16_t
824 eth_igb_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
825                uint16_t nb_pkts)
826 {
827         struct igb_rx_queue *rxq;
828         volatile union e1000_adv_rx_desc *rx_ring;
829         volatile union e1000_adv_rx_desc *rxdp;
830         struct igb_rx_entry *sw_ring;
831         struct igb_rx_entry *rxe;
832         struct rte_mbuf *rxm;
833         struct rte_mbuf *nmb;
834         union e1000_adv_rx_desc rxd;
835         uint64_t dma_addr;
836         uint32_t staterr;
837         uint32_t hlen_type_rss;
838         uint16_t pkt_len;
839         uint16_t rx_id;
840         uint16_t nb_rx;
841         uint16_t nb_hold;
842         uint64_t pkt_flags;
843
844         nb_rx = 0;
845         nb_hold = 0;
846         rxq = rx_queue;
847         rx_id = rxq->rx_tail;
848         rx_ring = rxq->rx_ring;
849         sw_ring = rxq->sw_ring;
850         while (nb_rx < nb_pkts) {
851                 /*
852                  * The order of operations here is important as the DD status
853                  * bit must not be read after any other descriptor fields.
854                  * rx_ring and rxdp are pointing to volatile data so the order
855                  * of accesses cannot be reordered by the compiler. If they were
856                  * not volatile, they could be reordered which could lead to
857                  * using invalid descriptor fields when read from rxd.
858                  */
859                 rxdp = &rx_ring[rx_id];
860                 staterr = rxdp->wb.upper.status_error;
861                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
862                         break;
863                 rxd = *rxdp;
864
865                 /*
866                  * End of packet.
867                  *
868                  * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
869                  * likely to be invalid and to be dropped by the various
870                  * validation checks performed by the network stack.
871                  *
872                  * Allocate a new mbuf to replenish the RX ring descriptor.
873                  * If the allocation fails:
874                  *    - arrange for that RX descriptor to be the first one
875                  *      being parsed the next time the receive function is
876                  *      invoked [on the same queue].
877                  *
878                  *    - Stop parsing the RX ring and return immediately.
879                  *
880                  * This policy does not drop the packet received in the RX
881                  * descriptor for which the allocation of a new mbuf failed.
882                  * Thus, it allows that packet to be retrieved later if
883                  * mbufs have been freed in the meantime.
884                  * As a side effect, holding RX descriptors instead of
885                  * systematically giving them back to the NIC may lead to
886                  * RX ring exhaustion situations.
887                  * However, the NIC can gracefully prevent such situations
888                  * from happening by sending specific "back-pressure" flow
889                  * control frames to its peer(s).
890                  */
891                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
892                            "staterr=0x%x pkt_len=%u",
893                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
894                            (unsigned) rx_id, (unsigned) staterr,
895                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
896
897                 nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
898                 if (nmb == NULL) {
899                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
900                                    "queue_id=%u", (unsigned) rxq->port_id,
901                                    (unsigned) rxq->queue_id);
902                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
903                         break;
904                 }
905
906                 nb_hold++;
907                 rxe = &sw_ring[rx_id];
908                 rx_id++;
909                 if (rx_id == rxq->nb_rx_desc)
910                         rx_id = 0;
911
912                 /* Prefetch next mbuf while processing current one. */
913                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
914
915                 /*
916                  * When next RX descriptor is on a cache-line boundary,
917                  * prefetch the next 4 RX descriptors and the next 8 pointers
918                  * to mbufs.
919                  */
920                 if ((rx_id & 0x3) == 0) {
921                         rte_igb_prefetch(&rx_ring[rx_id]);
922                         rte_igb_prefetch(&sw_ring[rx_id]);
923                 }
924
925                 rxm = rxe->mbuf;
926                 rxe->mbuf = nmb;
927                 dma_addr =
928                         rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
929                 rxdp->read.hdr_addr = 0;
930                 rxdp->read.pkt_addr = dma_addr;
931
932                 /*
933                  * Initialize the returned mbuf.
934                  * 1) setup generic mbuf fields:
935                  *    - number of segments,
936                  *    - next segment,
937                  *    - packet length,
938                  *    - RX port identifier.
939                  * 2) integrate hardware offload data, if any:
940                  *    - RSS flag & hash,
941                  *    - IP checksum flag,
942                  *    - VLAN TCI, if any,
943                  *    - error flags.
944                  */
945                 pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
946                                       rxq->crc_len);
947                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
948                 rte_packet_prefetch((char *)rxm->buf_addr + rxm->data_off);
949                 rxm->nb_segs = 1;
950                 rxm->next = NULL;
951                 rxm->pkt_len = pkt_len;
952                 rxm->data_len = pkt_len;
953                 rxm->port = rxq->port_id;
954
955                 rxm->hash.rss = rxd.wb.lower.hi_dword.rss;
956                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
957
958                 /*
959                  * The vlan_tci field is only valid when PKT_RX_VLAN is
960                  * set in the pkt_flags field and must be in CPU byte order.
961                  */
962                 if ((staterr & rte_cpu_to_le_32(E1000_RXDEXT_STATERR_LB)) &&
963                                 (rxq->flags & IGB_RXQ_FLAG_LB_BSWAP_VLAN)) {
964                         rxm->vlan_tci = rte_be_to_cpu_16(rxd.wb.upper.vlan);
965                 } else {
966                         rxm->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
967                 }
968                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(rxq, hlen_type_rss);
969                 pkt_flags = pkt_flags | rx_desc_status_to_pkt_flags(staterr);
970                 pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
971                 rxm->ol_flags = pkt_flags;
972                 rxm->packet_type = igb_rxd_pkt_info_to_pkt_type(rxd.wb.lower.
973                                                 lo_dword.hs_rss.pkt_info);
974
975                 /*
976                  * Store the mbuf address into the next entry of the array
977                  * of returned packets.
978                  */
979                 rx_pkts[nb_rx++] = rxm;
980         }
981         rxq->rx_tail = rx_id;
982
983         /*
984          * If the number of free RX descriptors is greater than the RX free
985          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
986          * register.
987          * Update the RDT with the value of the last processed RX descriptor
988          * minus 1, to guarantee that the RDT register is never equal to the
989          * RDH register, which creates a "full" ring situation from the
990          * hardware point of view...
991          */
992         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
993         if (nb_hold > rxq->rx_free_thresh) {
994                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
995                            "nb_hold=%u nb_rx=%u",
996                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
997                            (unsigned) rx_id, (unsigned) nb_hold,
998                            (unsigned) nb_rx);
999                 rx_id = (uint16_t) ((rx_id == 0) ?
1000                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
1001                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
1002                 nb_hold = 0;
1003         }
1004         rxq->nb_rx_hold = nb_hold;
1005         return nb_rx;
1006 }
1007
1008 uint16_t
1009 eth_igb_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
1010                          uint16_t nb_pkts)
1011 {
1012         struct igb_rx_queue *rxq;
1013         volatile union e1000_adv_rx_desc *rx_ring;
1014         volatile union e1000_adv_rx_desc *rxdp;
1015         struct igb_rx_entry *sw_ring;
1016         struct igb_rx_entry *rxe;
1017         struct rte_mbuf *first_seg;
1018         struct rte_mbuf *last_seg;
1019         struct rte_mbuf *rxm;
1020         struct rte_mbuf *nmb;
1021         union e1000_adv_rx_desc rxd;
1022         uint64_t dma; /* Physical address of mbuf data buffer */
1023         uint32_t staterr;
1024         uint32_t hlen_type_rss;
1025         uint16_t rx_id;
1026         uint16_t nb_rx;
1027         uint16_t nb_hold;
1028         uint16_t data_len;
1029         uint64_t pkt_flags;
1030
1031         nb_rx = 0;
1032         nb_hold = 0;
1033         rxq = rx_queue;
1034         rx_id = rxq->rx_tail;
1035         rx_ring = rxq->rx_ring;
1036         sw_ring = rxq->sw_ring;
1037
1038         /*
1039          * Retrieve RX context of current packet, if any.
1040          */
1041         first_seg = rxq->pkt_first_seg;
1042         last_seg = rxq->pkt_last_seg;
1043
1044         while (nb_rx < nb_pkts) {
1045         next_desc:
1046                 /*
1047                  * The order of operations here is important as the DD status
1048                  * bit must not be read after any other descriptor fields.
1049                  * rx_ring and rxdp are pointing to volatile data so the order
1050                  * of accesses cannot be reordered by the compiler. If they were
1051                  * not volatile, they could be reordered which could lead to
1052                  * using invalid descriptor fields when read from rxd.
1053                  */
1054                 rxdp = &rx_ring[rx_id];
1055                 staterr = rxdp->wb.upper.status_error;
1056                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
1057                         break;
1058                 rxd = *rxdp;
1059
1060                 /*
1061                  * Descriptor done.
1062                  *
1063                  * Allocate a new mbuf to replenish the RX ring descriptor.
1064                  * If the allocation fails:
1065                  *    - arrange for that RX descriptor to be the first one
1066                  *      being parsed the next time the receive function is
1067                  *      invoked [on the same queue].
1068                  *
1069                  *    - Stop parsing the RX ring and return immediately.
1070                  *
1071                  * This policy does not drop the packet received in the RX
1072                  * descriptor for which the allocation of a new mbuf failed.
1073                  * Thus, it allows that packet to be retrieved later if
1074                  * mbufs have been freed in the meantime.
1075                  * As a side effect, holding RX descriptors instead of
1076                  * systematically giving them back to the NIC may lead to
1077                  * RX ring exhaustion situations.
1078                  * However, the NIC can gracefully prevent such situations
1079                  * from happening by sending specific "back-pressure" flow
1080                  * control frames to its peer(s).
1081                  */
1082                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
1083                            "staterr=0x%x data_len=%u",
1084                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1085                            (unsigned) rx_id, (unsigned) staterr,
1086                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
1087
1088                 nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
1089                 if (nmb == NULL) {
1090                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
1091                                    "queue_id=%u", (unsigned) rxq->port_id,
1092                                    (unsigned) rxq->queue_id);
1093                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
1094                         break;
1095                 }
1096
1097                 nb_hold++;
1098                 rxe = &sw_ring[rx_id];
1099                 rx_id++;
1100                 if (rx_id == rxq->nb_rx_desc)
1101                         rx_id = 0;
1102
1103                 /* Prefetch next mbuf while processing current one. */
1104                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
1105
1106                 /*
1107                  * When next RX descriptor is on a cache-line boundary,
1108                  * prefetch the next 4 RX descriptors and the next 8 pointers
1109                  * to mbufs.
1110                  */
1111                 if ((rx_id & 0x3) == 0) {
1112                         rte_igb_prefetch(&rx_ring[rx_id]);
1113                         rte_igb_prefetch(&sw_ring[rx_id]);
1114                 }
1115
1116                 /*
1117                  * Update RX descriptor with the physical address of the new
1118                  * data buffer of the newly allocated mbuf.
1119                  */
1120                 rxm = rxe->mbuf;
1121                 rxe->mbuf = nmb;
1122                 dma = rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
1123                 rxdp->read.pkt_addr = dma;
1124                 rxdp->read.hdr_addr = 0;
1125
1126                 /*
1127                  * Set data length & data buffer address of mbuf.
1128                  */
1129                 data_len = rte_le_to_cpu_16(rxd.wb.upper.length);
1130                 rxm->data_len = data_len;
1131                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
1132
1133                 /*
1134                  * If this is the first buffer of the received packet,
1135                  * set the pointer to the first mbuf of the packet and
1136                  * initialize its context.
1137                  * Otherwise, update the total length and the number of segments
1138                  * of the current scattered packet, and update the pointer to
1139                  * the last mbuf of the current packet.
1140                  */
1141                 if (first_seg == NULL) {
1142                         first_seg = rxm;
1143                         first_seg->pkt_len = data_len;
1144                         first_seg->nb_segs = 1;
1145                 } else {
1146                         first_seg->pkt_len += data_len;
1147                         first_seg->nb_segs++;
1148                         last_seg->next = rxm;
1149                 }
1150
1151                 /*
1152                  * If this is not the last buffer of the received packet,
1153                  * update the pointer to the last mbuf of the current scattered
1154                  * packet and continue to parse the RX ring.
1155                  */
1156                 if (! (staterr & E1000_RXD_STAT_EOP)) {
1157                         last_seg = rxm;
1158                         goto next_desc;
1159                 }
1160
1161                 /*
1162                  * This is the last buffer of the received packet.
1163                  * If the CRC is not stripped by the hardware:
1164                  *   - Subtract the CRC length from the total packet length.
1165                  *   - If the last buffer only contains the whole CRC or a part
1166                  *     of it, free the mbuf associated to the last buffer.
1167                  *     If part of the CRC is also contained in the previous
1168                  *     mbuf, subtract the length of that CRC part from the
1169                  *     data length of the previous mbuf.
1170                  */
1171                 rxm->next = NULL;
1172                 if (unlikely(rxq->crc_len > 0)) {
1173                         first_seg->pkt_len -= ETHER_CRC_LEN;
1174                         if (data_len <= ETHER_CRC_LEN) {
1175                                 rte_pktmbuf_free_seg(rxm);
1176                                 first_seg->nb_segs--;
1177                                 last_seg->data_len = (uint16_t)
1178                                         (last_seg->data_len -
1179                                          (ETHER_CRC_LEN - data_len));
1180                                 last_seg->next = NULL;
1181                         } else
1182                                 rxm->data_len =
1183                                         (uint16_t) (data_len - ETHER_CRC_LEN);
1184                 }
1185
1186                 /*
1187                  * Initialize the first mbuf of the returned packet:
1188                  *    - RX port identifier,
1189                  *    - hardware offload data, if any:
1190                  *      - RSS flag & hash,
1191                  *      - IP checksum flag,
1192                  *      - VLAN TCI, if any,
1193                  *      - error flags.
1194                  */
1195                 first_seg->port = rxq->port_id;
1196                 first_seg->hash.rss = rxd.wb.lower.hi_dword.rss;
1197
1198                 /*
1199                  * The vlan_tci field is only valid when PKT_RX_VLAN is
1200                  * set in the pkt_flags field and must be in CPU byte order.
1201                  */
1202                 if ((staterr & rte_cpu_to_le_32(E1000_RXDEXT_STATERR_LB)) &&
1203                                 (rxq->flags & IGB_RXQ_FLAG_LB_BSWAP_VLAN)) {
1204                         first_seg->vlan_tci =
1205                                 rte_be_to_cpu_16(rxd.wb.upper.vlan);
1206                 } else {
1207                         first_seg->vlan_tci =
1208                                 rte_le_to_cpu_16(rxd.wb.upper.vlan);
1209                 }
1210                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
1211                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(rxq, hlen_type_rss);
1212                 pkt_flags = pkt_flags | rx_desc_status_to_pkt_flags(staterr);
1213                 pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
1214                 first_seg->ol_flags = pkt_flags;
1215                 first_seg->packet_type = igb_rxd_pkt_info_to_pkt_type(rxd.wb.
1216                                         lower.lo_dword.hs_rss.pkt_info);
1217
1218                 /* Prefetch data of first segment, if configured to do so. */
1219                 rte_packet_prefetch((char *)first_seg->buf_addr +
1220                         first_seg->data_off);
1221
1222                 /*
1223                  * Store the mbuf address into the next entry of the array
1224                  * of returned packets.
1225                  */
1226                 rx_pkts[nb_rx++] = first_seg;
1227
1228                 /*
1229                  * Setup receipt context for a new packet.
1230                  */
1231                 first_seg = NULL;
1232         }
1233
1234         /*
1235          * Record index of the next RX descriptor to probe.
1236          */
1237         rxq->rx_tail = rx_id;
1238
1239         /*
1240          * Save receive context.
1241          */
1242         rxq->pkt_first_seg = first_seg;
1243         rxq->pkt_last_seg = last_seg;
1244
1245         /*
1246          * If the number of free RX descriptors is greater than the RX free
1247          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
1248          * register.
1249          * Update the RDT with the value of the last processed RX descriptor
1250          * minus 1, to guarantee that the RDT register is never equal to the
1251          * RDH register, which creates a "full" ring situtation from the
1252          * RDH register, which creates a "full" ring situation from the
1253          */
1254         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
1255         if (nb_hold > rxq->rx_free_thresh) {
1256                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
1257                            "nb_hold=%u nb_rx=%u",
1258                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1259                            (unsigned) rx_id, (unsigned) nb_hold,
1260                            (unsigned) nb_rx);
1261                 rx_id = (uint16_t) ((rx_id == 0) ?
1262                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
1263                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
1264                 nb_hold = 0;
1265         }
1266         rxq->nb_rx_hold = nb_hold;
1267         return nb_rx;
1268 }
1269
1270 /*
1271  * Maximum number of Ring Descriptors.
1272  *
1273  * Since RDLEN/TDLEN should be a multiple of 128 bytes, the number of ring
1274  * descriptors should meet the following condition:
1275  *      (num_ring_desc * sizeof(struct e1000_rx/tx_desc)) % 128 == 0
1276  */
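
/*
 * Illustrative sketch only (not used by the driver): both advanced RX and
 * TX descriptors are 16 bytes, so the 128-byte rule above reduces to the
 * descriptor count being a multiple of 8 (e.g. 512 * 16 = 8192 bytes and
 * 8192 % 128 == 0). The helper name below is hypothetical.
 */
static inline int
igb_ring_len_is_aligned(uint16_t nb_desc, size_t desc_size)
{
        /* The ring length in bytes must be a multiple of 128. */
        return (((size_t)nb_desc * desc_size) % 128) == 0;
}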
1277
1278 static void
1279 igb_tx_queue_release_mbufs(struct igb_tx_queue *txq)
1280 {
1281         unsigned i;
1282
1283         if (txq->sw_ring != NULL) {
1284                 for (i = 0; i < txq->nb_tx_desc; i++) {
1285                         if (txq->sw_ring[i].mbuf != NULL) {
1286                                 rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
1287                                 txq->sw_ring[i].mbuf = NULL;
1288                         }
1289                 }
1290         }
1291 }
1292
1293 static void
1294 igb_tx_queue_release(struct igb_tx_queue *txq)
1295 {
1296         if (txq != NULL) {
1297                 igb_tx_queue_release_mbufs(txq);
1298                 rte_free(txq->sw_ring);
1299                 rte_free(txq);
1300         }
1301 }
1302
1303 void
1304 eth_igb_tx_queue_release(void *txq)
1305 {
1306         igb_tx_queue_release(txq);
1307 }
1308
1309 static int
1310 igb_tx_done_cleanup(struct igb_tx_queue *txq, uint32_t free_cnt)
1311 {
1312         struct igb_tx_entry *sw_ring;
1313         volatile union e1000_adv_tx_desc *txr;
1314         uint16_t tx_first; /* First segment analyzed. */
1315         uint16_t tx_id;    /* Current segment being processed. */
1316         uint16_t tx_last;  /* Last segment in the current packet. */
1317         uint16_t tx_next;  /* First segment of the next packet. */
1318         int count;
1319
1320         if (txq != NULL) {
1321                 count = 0;
1322                 sw_ring = txq->sw_ring;
1323                 txr = txq->tx_ring;
1324
1325                 /*
1326                  * tx_tail is the last sent packet on the sw_ring. Go to the end
1327                  * of that packet (the last segment in the packet chain); the
1328                  * segment after it is the start of the oldest packet still on
1329                  * the sw_ring. This is the first packet that will be
1330                  * attempted to be freed.
1331                  */
1332
1333                 /* Get last segment in most recently added packet. */
1334                 tx_first = sw_ring[txq->tx_tail].last_id;
1335
1336                 /* Get the next segment, which is the oldest segment in ring. */
1337                 tx_first = sw_ring[tx_first].next_id;
1338
1339                 /* Set the current index to the first. */
1340                 tx_id = tx_first;
1341
1342                 /*
1343                  * Loop through each packet. For each packet, verify that an
1344                  * mbuf exists and that the last segment is free. If so, free
1345                  * it and move on.
1346                  */
1347                 while (1) {
1348                         tx_last = sw_ring[tx_id].last_id;
1349
1350                         if (sw_ring[tx_last].mbuf) {
1351                                 if (txr[tx_last].wb.status &
1352                                                 E1000_TXD_STAT_DD) {
1353                                         /*
1354                                          * Increment the number of packets
1355                                          * freed.
1356                                          */
1357                                         count++;
1358
1359                                         /* Get the start of the next packet. */
1360                                         tx_next = sw_ring[tx_last].next_id;
1361
1362                                         /*
1363                                          * Loop through all segments in a
1364                                          * packet.
1365                                          */
1366                                         do {
1367                                                 rte_pktmbuf_free_seg(sw_ring[tx_id].mbuf);
1368                                                 sw_ring[tx_id].mbuf = NULL;
1369                                                 sw_ring[tx_id].last_id = tx_id;
1370
1371                                                 /* Move to the next segment. */
1372                                                 tx_id = sw_ring[tx_id].next_id;
1373
1374                                         } while (tx_id != tx_next);
1375
1376                                         if (unlikely(count == (int)free_cnt))
1377                                                 break;
1378                                 } else
1379                                         /*
1380                                          * mbuf still in use, nothing left to
1381                                          * free.
1382                                          */
1383                                         break;
1384                         } else {
1385                                 /*
1386                                  * There are multiple reasons to be here:
1387                                  * 1) All the packets on the ring have been
1388                                  *    freed - tx_id is equal to tx_first
1389                                  *    and some packets have been freed.
1390                                  *    - Done, exit
1391                                  * 2) The interface has not sent a ring's worth of
1392                                  *    packets yet, so the segment after the tail is
1393                                  *    still empty. Or a previous call to this
1394                                  *    function freed some of the segments but not
1395                                  *    all of them, so there is a hole in the list.
1396                                  *    Hopefully this is a rare case.
1397                                  *    - Walk the list and find the next mbuf. If
1398                                  *      there isn't one, then done.
1399                                  */
1400                                 if (likely((tx_id == tx_first) && (count != 0)))
1401                                         break;
1402
1403                                 /*
1404                                  * Walk the list and find the next mbuf, if any.
1405                                  */
1406                                 do {
1407                                         /* Move to the next segment. */
1408                                         tx_id = sw_ring[tx_id].next_id;
1409
1410                                         if (sw_ring[tx_id].mbuf)
1411                                                 break;
1412
1413                                 } while (tx_id != tx_first);
1414
1415                                 /*
1416                                  * Determine why the previous loop bailed. If there
1417                                  * is no mbuf, we are done.
1418                                  */
1419                                 if (sw_ring[tx_id].mbuf == NULL)
1420                                         break;
1421                         }
1422                 }
1423         } else
1424                 count = -ENODEV;
1425
1426         return count;
1427 }
1428
1429 int
1430 eth_igb_tx_done_cleanup(void *txq, uint32_t free_cnt)
1431 {
1432         return igb_tx_done_cleanup(txq, free_cnt);
1433 }
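
/*
 * Usage sketch (application side, for illustration only): the cleanup above
 * is normally reached through the generic ethdev call
 * rte_eth_tx_done_cleanup(), e.g.
 *
 *     int freed = rte_eth_tx_done_cleanup(port_id, queue_id, 0);
 *
 * where port_id and queue_id are placeholders, a free_cnt of 0 asks the
 * driver to free as many transmitted mbufs as possible, and a negative
 * return value indicates an error.
 */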
1434
1435 static void
1436 igb_reset_tx_queue_stat(struct igb_tx_queue *txq)
1437 {
1438         txq->tx_head = 0;
1439         txq->tx_tail = 0;
1440         txq->ctx_curr = 0;
1441         memset((void*)&txq->ctx_cache, 0,
1442                 IGB_CTX_NUM * sizeof(struct igb_advctx_info));
1443 }
1444
1445 static void
1446 igb_reset_tx_queue(struct igb_tx_queue *txq, struct rte_eth_dev *dev)
1447 {
1448         static const union e1000_adv_tx_desc zeroed_desc = {{0}};
1449         struct igb_tx_entry *txe = txq->sw_ring;
1450         uint16_t i, prev;
1451         struct e1000_hw *hw;
1452
1453         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1454         /* Zero out HW ring memory */
1455         for (i = 0; i < txq->nb_tx_desc; i++) {
1456                 txq->tx_ring[i] = zeroed_desc;
1457         }
1458
1459         /* Initialize ring entries */
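        /*
         * The next_id/last_id links below chain the entries into a circular
         * list, and the DD (descriptor done) bit is pre-set so that every
         * descriptor initially reads back as already completed.
         */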
1460         prev = (uint16_t)(txq->nb_tx_desc - 1);
1461         for (i = 0; i < txq->nb_tx_desc; i++) {
1462                 volatile union e1000_adv_tx_desc *txd = &(txq->tx_ring[i]);
1463
1464                 txd->wb.status = E1000_TXD_STAT_DD;
1465                 txe[i].mbuf = NULL;
1466                 txe[i].last_id = i;
1467                 txe[prev].next_id = i;
1468                 prev = i;
1469         }
1470
1471         txq->txd_type = E1000_ADVTXD_DTYP_DATA;
1472         /* 82575 specific, each tx queue will use 2 hw contexts */
1473         if (hw->mac.type == e1000_82575)
1474                 txq->ctx_start = txq->queue_id * IGB_CTX_NUM;
1475
1476         igb_reset_tx_queue_stat(txq);
1477 }
1478
1479 int
1480 eth_igb_tx_queue_setup(struct rte_eth_dev *dev,
1481                          uint16_t queue_idx,
1482                          uint16_t nb_desc,
1483                          unsigned int socket_id,
1484                          const struct rte_eth_txconf *tx_conf)
1485 {
1486         const struct rte_memzone *tz;
1487         struct igb_tx_queue *txq;
1488         struct e1000_hw     *hw;
1489         uint32_t size;
1490
1491         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1492
1493         /*
1494          * Validate number of transmit descriptors.
1495          * It must not exceed the hardware maximum and must be a multiple
1496          * of IGB_TXD_ALIGN.
1497          */
1498         if (nb_desc % IGB_TXD_ALIGN != 0 ||
1499                         (nb_desc > E1000_MAX_RING_DESC) ||
1500                         (nb_desc < E1000_MIN_RING_DESC)) {
1501                 return -EINVAL;
1502         }
1503
1504         /*
1505          * The tx_free_thresh and tx_rs_thresh values are not used in the 1G
1506          * driver.
1507          */
1508         if (tx_conf->tx_free_thresh != 0)
1509                 PMD_INIT_LOG(INFO, "The tx_free_thresh parameter is not "
1510                              "used for the 1G driver.");
1511         if (tx_conf->tx_rs_thresh != 0)
1512                 PMD_INIT_LOG(INFO, "The tx_rs_thresh parameter is not "
1513                              "used for the 1G driver.");
1514         if (tx_conf->tx_thresh.wthresh == 0 && hw->mac.type != e1000_82576)
1515                 PMD_INIT_LOG(INFO, "To improve 1G driver performance, "
1516                              "consider setting the TX WTHRESH value to 4, 8, "
1517                              "or 16.");
1518
1519         /* Free memory prior to re-allocation if needed */
1520         if (dev->data->tx_queues[queue_idx] != NULL) {
1521                 igb_tx_queue_release(dev->data->tx_queues[queue_idx]);
1522                 dev->data->tx_queues[queue_idx] = NULL;
1523         }
1524
1525         /* First allocate the tx queue data structure */
1526         txq = rte_zmalloc("ethdev TX queue", sizeof(struct igb_tx_queue),
1527                                                         RTE_CACHE_LINE_SIZE);
1528         if (txq == NULL)
1529                 return -ENOMEM;
1530
1531         /*
1532          * Allocate TX ring hardware descriptors. A memzone large enough to
1533          * handle the maximum ring size is allocated in order to allow for
1534          * resizing in later calls to the queue setup function.
1535          */
1536         size = sizeof(union e1000_adv_tx_desc) * E1000_MAX_RING_DESC;
1537         tz = rte_eth_dma_zone_reserve(dev, "tx_ring", queue_idx, size,
1538                                       E1000_ALIGN, socket_id);
1539         if (tz == NULL) {
1540                 igb_tx_queue_release(txq);
1541                 return -ENOMEM;
1542         }
1543
1544         txq->nb_tx_desc = nb_desc;
1545         txq->pthresh = tx_conf->tx_thresh.pthresh;
1546         txq->hthresh = tx_conf->tx_thresh.hthresh;
1547         txq->wthresh = tx_conf->tx_thresh.wthresh;
1548         if (txq->wthresh > 0 && hw->mac.type == e1000_82576)
1549                 txq->wthresh = 1;
1550         txq->queue_id = queue_idx;
1551         txq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1552                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1553         txq->port_id = dev->data->port_id;
1554
1555         txq->tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(txq->reg_idx));
1556         txq->tx_ring_phys_addr = tz->iova;
1557
1558         txq->tx_ring = (union e1000_adv_tx_desc *) tz->addr;
1559         /* Allocate software ring */
1560         txq->sw_ring = rte_zmalloc("txq->sw_ring",
1561                                    sizeof(struct igb_tx_entry) * nb_desc,
1562                                    RTE_CACHE_LINE_SIZE);
1563         if (txq->sw_ring == NULL) {
1564                 igb_tx_queue_release(txq);
1565                 return -ENOMEM;
1566         }
1567         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
1568                      txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
1569
1570         igb_reset_tx_queue(txq, dev);
1571         dev->tx_pkt_burst = eth_igb_xmit_pkts;
1572         dev->tx_pkt_prepare = &eth_igb_prep_pkts;
1573         dev->data->tx_queues[queue_idx] = txq;
1574
1575         return 0;
1576 }
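
/*
 * Usage sketch (application side, for illustration only): this routine is
 * reached through rte_eth_tx_queue_setup(), e.g.
 *
 *     struct rte_eth_txconf txconf = { .tx_thresh = { .wthresh = 4 } };
 *     ret = rte_eth_tx_queue_setup(port_id, 0, 512, rte_socket_id(),
 *                                  &txconf);
 *
 * 512 descriptors satisfy the IGB_TXD_ALIGN and E1000_MIN/MAX_RING_DESC
 * checks above; port_id, ret and the chosen WTHRESH are placeholders.
 */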
1577
1578 static void
1579 igb_rx_queue_release_mbufs(struct igb_rx_queue *rxq)
1580 {
1581         unsigned i;
1582
1583         if (rxq->sw_ring != NULL) {
1584                 for (i = 0; i < rxq->nb_rx_desc; i++) {
1585                         if (rxq->sw_ring[i].mbuf != NULL) {
1586                                 rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
1587                                 rxq->sw_ring[i].mbuf = NULL;
1588                         }
1589                 }
1590         }
1591 }
1592
1593 static void
1594 igb_rx_queue_release(struct igb_rx_queue *rxq)
1595 {
1596         if (rxq != NULL) {
1597                 igb_rx_queue_release_mbufs(rxq);
1598                 rte_free(rxq->sw_ring);
1599                 rte_free(rxq);
1600         }
1601 }
1602
1603 void
1604 eth_igb_rx_queue_release(void *rxq)
1605 {
1606         igb_rx_queue_release(rxq);
1607 }
1608
1609 static void
1610 igb_reset_rx_queue(struct igb_rx_queue *rxq)
1611 {
1612         static const union e1000_adv_rx_desc zeroed_desc = {{0}};
1613         unsigned i;
1614
1615         /* Zero out HW ring memory */
1616         for (i = 0; i < rxq->nb_rx_desc; i++) {
1617                 rxq->rx_ring[i] = zeroed_desc;
1618         }
1619
1620         rxq->rx_tail = 0;
1621         rxq->pkt_first_seg = NULL;
1622         rxq->pkt_last_seg = NULL;
1623 }
1624
1625 int
1626 eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
1627                          uint16_t queue_idx,
1628                          uint16_t nb_desc,
1629                          unsigned int socket_id,
1630                          const struct rte_eth_rxconf *rx_conf,
1631                          struct rte_mempool *mp)
1632 {
1633         const struct rte_memzone *rz;
1634         struct igb_rx_queue *rxq;
1635         struct e1000_hw     *hw;
1636         unsigned int size;
1637
1638         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1639
1640         /*
1641          * Validate number of receive descriptors.
1642          * It must not exceed the hardware maximum and must be a multiple
1643          * of IGB_RXD_ALIGN.
1644          */
1645         if (nb_desc % IGB_RXD_ALIGN != 0 ||
1646                         (nb_desc > E1000_MAX_RING_DESC) ||
1647                         (nb_desc < E1000_MIN_RING_DESC)) {
1648                 return -EINVAL;
1649         }
1650
1651         /* Free memory prior to re-allocation if needed */
1652         if (dev->data->rx_queues[queue_idx] != NULL) {
1653                 igb_rx_queue_release(dev->data->rx_queues[queue_idx]);
1654                 dev->data->rx_queues[queue_idx] = NULL;
1655         }
1656
1657         /* First allocate the RX queue data structure. */
1658         rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
1659                           RTE_CACHE_LINE_SIZE);
1660         if (rxq == NULL)
1661                 return -ENOMEM;
1662         rxq->mb_pool = mp;
1663         rxq->nb_rx_desc = nb_desc;
1664         rxq->pthresh = rx_conf->rx_thresh.pthresh;
1665         rxq->hthresh = rx_conf->rx_thresh.hthresh;
1666         rxq->wthresh = rx_conf->rx_thresh.wthresh;
1667         if (rxq->wthresh > 0 &&
1668             (hw->mac.type == e1000_82576 || hw->mac.type == e1000_vfadapt_i350))
1669                 rxq->wthresh = 1;
1670         rxq->drop_en = rx_conf->rx_drop_en;
1671         rxq->rx_free_thresh = rx_conf->rx_free_thresh;
1672         rxq->queue_id = queue_idx;
1673         rxq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1674                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1675         rxq->port_id = dev->data->port_id;
1676         rxq->crc_len = (uint8_t) ((dev->data->dev_conf.rxmode.hw_strip_crc) ? 0 :
1677                                   ETHER_CRC_LEN);
1678
1679         /*
1680          *  Allocate RX ring hardware descriptors. A memzone large enough to
1681          *  handle the maximum ring size is allocated in order to allow for
1682          *  resizing in later calls to the queue setup function.
1683          */
1684         size = sizeof(union e1000_adv_rx_desc) * E1000_MAX_RING_DESC;
1685         rz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx, size,
1686                                       E1000_ALIGN, socket_id);
1687         if (rz == NULL) {
1688                 igb_rx_queue_release(rxq);
1689                 return -ENOMEM;
1690         }
1691         rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(rxq->reg_idx));
1692         rxq->rdh_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDH(rxq->reg_idx));
1693         rxq->rx_ring_phys_addr = rz->iova;
1694         rxq->rx_ring = (union e1000_adv_rx_desc *) rz->addr;
1695
1696         /* Allocate software ring. */
1697         rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
1698                                    sizeof(struct igb_rx_entry) * nb_desc,
1699                                    RTE_CACHE_LINE_SIZE);
1700         if (rxq->sw_ring == NULL) {
1701                 igb_rx_queue_release(rxq);
1702                 return -ENOMEM;
1703         }
1704         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
1705                      rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
1706
1707         dev->data->rx_queues[queue_idx] = rxq;
1708         igb_reset_rx_queue(rxq);
1709
1710         return 0;
1711 }
1712
1713 uint32_t
1714 eth_igb_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1715 {
1716 #define IGB_RXQ_SCAN_INTERVAL 4
1717         volatile union e1000_adv_rx_desc *rxdp;
1718         struct igb_rx_queue *rxq;
1719         uint32_t desc = 0;
1720
1721         rxq = dev->data->rx_queues[rx_queue_id];
1722         rxdp = &(rxq->rx_ring[rxq->rx_tail]);
1723
1724         while ((desc < rxq->nb_rx_desc) &&
1725                 (rxdp->wb.upper.status_error & E1000_RXD_STAT_DD)) {
1726                 desc += IGB_RXQ_SCAN_INTERVAL;
1727                 rxdp += IGB_RXQ_SCAN_INTERVAL;
1728                 if (rxq->rx_tail + desc >= rxq->nb_rx_desc)
1729                         rxdp = &(rxq->rx_ring[rxq->rx_tail +
1730                                 desc - rxq->nb_rx_desc]);
1731         }
1732
1733         return desc;
1734 }
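
/*
 * Note: the scan above advances in steps of IGB_RXQ_SCAN_INTERVAL, so the
 * value reported through rte_eth_rx_queue_count() is an estimate rounded to
 * a multiple of 4 rather than an exact fill level.
 */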
1735
1736 int
1737 eth_igb_rx_descriptor_done(void *rx_queue, uint16_t offset)
1738 {
1739         volatile union e1000_adv_rx_desc *rxdp;
1740         struct igb_rx_queue *rxq = rx_queue;
1741         uint32_t desc;
1742
1743         if (unlikely(offset >= rxq->nb_rx_desc))
1744                 return 0;
1745         desc = rxq->rx_tail + offset;
1746         if (desc >= rxq->nb_rx_desc)
1747                 desc -= rxq->nb_rx_desc;
1748
1749         rxdp = &rxq->rx_ring[desc];
1750         return !!(rxdp->wb.upper.status_error & E1000_RXD_STAT_DD);
1751 }
1752
1753 int
1754 eth_igb_rx_descriptor_status(void *rx_queue, uint16_t offset)
1755 {
1756         struct igb_rx_queue *rxq = rx_queue;
1757         volatile uint32_t *status;
1758         uint32_t desc;
1759
1760         if (unlikely(offset >= rxq->nb_rx_desc))
1761                 return -EINVAL;
1762
1763         if (offset >= rxq->nb_rx_desc - rxq->nb_rx_hold)
1764                 return RTE_ETH_RX_DESC_UNAVAIL;
1765
1766         desc = rxq->rx_tail + offset;
1767         if (desc >= rxq->nb_rx_desc)
1768                 desc -= rxq->nb_rx_desc;
1769
1770         status = &rxq->rx_ring[desc].wb.upper.status_error;
1771         if (*status & rte_cpu_to_le_32(E1000_RXD_STAT_DD))
1772                 return RTE_ETH_RX_DESC_DONE;
1773
1774         return RTE_ETH_RX_DESC_AVAIL;
1775 }
1776
1777 int
1778 eth_igb_tx_descriptor_status(void *tx_queue, uint16_t offset)
1779 {
1780         struct igb_tx_queue *txq = tx_queue;
1781         volatile uint32_t *status;
1782         uint32_t desc;
1783
1784         if (unlikely(offset >= txq->nb_tx_desc))
1785                 return -EINVAL;
1786
1787         desc = txq->tx_tail + offset;
1788         if (desc >= txq->nb_tx_desc)
1789                 desc -= txq->nb_tx_desc;
1790
1791         status = &txq->tx_ring[desc].wb.status;
1792         if (*status & rte_cpu_to_le_32(E1000_TXD_STAT_DD))
1793                 return RTE_ETH_TX_DESC_DONE;
1794
1795         return RTE_ETH_TX_DESC_FULL;
1796 }
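
/*
 * Usage sketch (application side, for illustration only): the two status
 * handlers above back the generic rte_eth_rx_descriptor_status() and
 * rte_eth_tx_descriptor_status() calls, e.g.
 *
 *     rc = rte_eth_rx_descriptor_status(port_id, queue_id, 32);
 *
 * A result of RTE_ETH_RX_DESC_DONE means the descriptor 32 slots ahead of
 * the next one to be received has already been written back, i.e. at least
 * 33 packets are ready. port_id, queue_id and rc are placeholders.
 */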
1797
1798 void
1799 igb_dev_clear_queues(struct rte_eth_dev *dev)
1800 {
1801         uint16_t i;
1802         struct igb_tx_queue *txq;
1803         struct igb_rx_queue *rxq;
1804
1805         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1806                 txq = dev->data->tx_queues[i];
1807                 if (txq != NULL) {
1808                         igb_tx_queue_release_mbufs(txq);
1809                         igb_reset_tx_queue(txq, dev);
1810                 }
1811         }
1812
1813         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1814                 rxq = dev->data->rx_queues[i];
1815                 if (rxq != NULL) {
1816                         igb_rx_queue_release_mbufs(rxq);
1817                         igb_reset_rx_queue(rxq);
1818                 }
1819         }
1820 }
1821
1822 void
1823 igb_dev_free_queues(struct rte_eth_dev *dev)
1824 {
1825         uint16_t i;
1826
1827         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1828                 eth_igb_rx_queue_release(dev->data->rx_queues[i]);
1829                 dev->data->rx_queues[i] = NULL;
1830         }
1831         dev->data->nb_rx_queues = 0;
1832
1833         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1834                 eth_igb_tx_queue_release(dev->data->tx_queues[i]);
1835                 dev->data->tx_queues[i] = NULL;
1836         }
1837         dev->data->nb_tx_queues = 0;
1838 }
1839
1840 /**
1841  * Receive Side Scaling (RSS).
1842  * See section 7.1.1.7 in the following document:
1843  *     "Intel 82576 GbE Controller Datasheet" - Revision 2.45 October 2009
1844  *
1845  * Principles:
1846  * The source and destination IP addresses of the IP header and the source and
1847  * destination ports of TCP/UDP headers, if any, of received packets are hashed
1848  * against a configurable random key to compute a 32-bit RSS hash result.
1849  * The seven (7) LSBs of the 32-bit hash result are used as an index into a
1850  * 128-entry redirection table (RETA).  Each entry of the RETA provides a 3-bit
1851  * RSS output index, which is used as the RX queue index in which to store
1852  * the received packets.
1853  * The following output is supplied in the RX write-back descriptor:
1854  *     - 32-bit result of the Microsoft RSS hash function,
1855  *     - 4-bit RSS type field.
1856  */
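
/*
 * Minimal sketch of the lookup described above (illustration only, not used
 * by the driver; the function name and the plain byte-array view of the
 * RETA are hypothetical).
 */
static inline uint8_t
igb_rss_queue_from_hash(uint32_t rss_hash, const uint8_t reta[128])
{
        /* The seven LSBs of the 32-bit hash select one of 128 RETA entries. */
        return reta[rss_hash & 0x7F];
}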
1857
1858 /*
1859  * RSS random key supplied in section 7.1.1.7.3 of the Intel 82576 datasheet.
1860  * Used as the default key.
1861  */
1862 static uint8_t rss_intel_key[40] = {
1863         0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
1864         0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
1865         0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
1866         0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
1867         0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
1868 };
1869
1870 static void
1871 igb_rss_disable(struct rte_eth_dev *dev)
1872 {
1873         struct e1000_hw *hw;
1874         uint32_t mrqc;
1875
1876         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1877         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1878         mrqc &= ~E1000_MRQC_ENABLE_MASK;
1879         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1880 }
1881
1882 static void
1883 igb_hw_rss_hash_set(struct e1000_hw *hw, struct rte_eth_rss_conf *rss_conf)
1884 {
1885         uint8_t  *hash_key;
1886         uint32_t rss_key;
1887         uint32_t mrqc;
1888         uint64_t rss_hf;
1889         uint16_t i;
1890
1891         hash_key = rss_conf->rss_key;
1892         if (hash_key != NULL) {
1893                 /* Fill in RSS hash key */
1894                 for (i = 0; i < 10; i++) {
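                /*
                 * Each 32-bit RSSRK register takes four consecutive key
                 * bytes, with key byte (i * 4) placed in bits 7:0.
                 */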
1895                         rss_key  = hash_key[(i * 4)];
1896                         rss_key |= hash_key[(i * 4) + 1] << 8;
1897                         rss_key |= hash_key[(i * 4) + 2] << 16;
1898                         rss_key |= hash_key[(i * 4) + 3] << 24;
1899                         E1000_WRITE_REG_ARRAY(hw, E1000_RSSRK(0), i, rss_key);
1900                 }
1901         }
1902
1903         /* Set configured hashing protocols in MRQC register */
1904         rss_hf = rss_conf->rss_hf;
1905         mrqc = E1000_MRQC_ENABLE_RSS_4Q; /* RSS enabled. */
1906         if (rss_hf & ETH_RSS_IPV4)
1907                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4;
1908         if (rss_hf & ETH_RSS_NONFRAG_IPV4_TCP)
1909                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_TCP;
1910         if (rss_hf & ETH_RSS_IPV6)
1911                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6;
1912         if (rss_hf & ETH_RSS_IPV6_EX)
1913                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_EX;
1914         if (rss_hf & ETH_RSS_NONFRAG_IPV6_TCP)
1915                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP;
1916         if (rss_hf & ETH_RSS_IPV6_TCP_EX)
1917                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP_EX;
1918         if (rss_hf & ETH_RSS_NONFRAG_IPV4_UDP)
1919                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_UDP;
1920         if (rss_hf & ETH_RSS_NONFRAG_IPV6_UDP)
1921                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP;
1922         if (rss_hf & ETH_RSS_IPV6_UDP_EX)
1923                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP_EX;
1924         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1925 }
1926
1927 int
1928 eth_igb_rss_hash_update(struct rte_eth_dev *dev,
1929                         struct rte_eth_rss_conf *rss_conf)
1930 {
1931         struct e1000_hw *hw;
1932         uint32_t mrqc;
1933         uint64_t rss_hf;
1934
1935         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1936
1937         /*
1938          * Before changing anything, check that the requested update does not
1939          * attempt to disable RSS when RSS was enabled at initialization
1940          * time, nor attempt to enable RSS when RSS was disabled at
1941          * initialization time.
1942          */
1943         rss_hf = rss_conf->rss_hf & IGB_RSS_OFFLOAD_ALL;
1944         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1945         if (!(mrqc & E1000_MRQC_ENABLE_MASK)) { /* RSS disabled */
1946                 if (rss_hf != 0) /* Enable RSS */
1947                         return -(EINVAL);
1948                 return 0; /* Nothing to do */
1949         }
1950         /* RSS enabled */
1951         if (rss_hf == 0) /* Disable RSS */
1952                 return -(EINVAL);
1953         igb_hw_rss_hash_set(hw, rss_conf);
1954         return 0;
1955 }
1956
1957 int eth_igb_rss_hash_conf_get(struct rte_eth_dev *dev,
1958                               struct rte_eth_rss_conf *rss_conf)
1959 {
1960         struct e1000_hw *hw;
1961         uint8_t *hash_key;
1962         uint32_t rss_key;
1963         uint32_t mrqc;
1964         uint64_t rss_hf;
1965         uint16_t i;
1966
1967         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1968         hash_key = rss_conf->rss_key;
1969         if (hash_key != NULL) {
1970                 /* Return RSS hash key */
1971                 for (i = 0; i < 10; i++) {
1972                         rss_key = E1000_READ_REG_ARRAY(hw, E1000_RSSRK(0), i);
1973                         hash_key[(i * 4)] = rss_key & 0x000000FF;
1974                         hash_key[(i * 4) + 1] = (rss_key >> 8) & 0x000000FF;
1975                         hash_key[(i * 4) + 2] = (rss_key >> 16) & 0x000000FF;
1976                         hash_key[(i * 4) + 3] = (rss_key >> 24) & 0x000000FF;
1977                 }
1978         }
1979
1980         /* Get RSS functions configured in MRQC register */
1981         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1982         if ((mrqc & E1000_MRQC_ENABLE_RSS_4Q) == 0) { /* RSS is disabled */
1983                 rss_conf->rss_hf = 0;
1984                 return 0;
1985         }
1986         rss_hf = 0;
1987         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4)
1988                 rss_hf |= ETH_RSS_IPV4;
1989         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4_TCP)
1990                 rss_hf |= ETH_RSS_NONFRAG_IPV4_TCP;
1991         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6)
1992                 rss_hf |= ETH_RSS_IPV6;
1993         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_EX)
1994                 rss_hf |= ETH_RSS_IPV6_EX;
1995         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_TCP)
1996                 rss_hf |= ETH_RSS_NONFRAG_IPV6_TCP;
1997         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_TCP_EX)
1998                 rss_hf |= ETH_RSS_IPV6_TCP_EX;
1999         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4_UDP)
2000                 rss_hf |= ETH_RSS_NONFRAG_IPV4_UDP;
2001         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_UDP)
2002                 rss_hf |= ETH_RSS_NONFRAG_IPV6_UDP;
2003         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_UDP_EX)
2004                 rss_hf |= ETH_RSS_IPV6_UDP_EX;
2005         rss_conf->rss_hf = rss_hf;
2006         return 0;
2007 }
2008
2009 static void
2010 igb_rss_configure(struct rte_eth_dev *dev)
2011 {
2012         struct rte_eth_rss_conf rss_conf;
2013         struct e1000_hw *hw;
2014         uint32_t shift;
2015         uint16_t i;
2016
2017         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2018
2019         /* Fill in redirection table. */
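        /*
         * Each 32-bit RETA register holds four one-byte entries, so entries
         * are accumulated in the local union and written to hardware on
         * every fourth index; the 82575 expects the queue index shifted
         * left within each entry, hence the shift.
         */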
2020         shift = (hw->mac.type == e1000_82575) ? 6 : 0;
2021         for (i = 0; i < 128; i++) {
2022                 union e1000_reta {
2023                         uint32_t dword;
2024                         uint8_t  bytes[4];
2025                 } reta;
2026                 uint8_t q_idx;
2027
2028                 q_idx = (uint8_t) ((dev->data->nb_rx_queues > 1) ?
2029                                    i % dev->data->nb_rx_queues : 0);
2030                 reta.bytes[i & 3] = (uint8_t) (q_idx << shift);
2031                 if ((i & 3) == 3)
2032                         E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
2033         }
2034
2035         /*
2036          * Configure the RSS key and the RSS protocols used to compute
2037          * the RSS hash of input packets.
2038          */
2039         rss_conf = dev->data->dev_conf.rx_adv_conf.rss_conf;
2040         if ((rss_conf.rss_hf & IGB_RSS_OFFLOAD_ALL) == 0) {
2041                 igb_rss_disable(dev);
2042                 return;
2043         }
2044         if (rss_conf.rss_key == NULL)
2045                 rss_conf.rss_key = rss_intel_key; /* Default hash key */
2046         igb_hw_rss_hash_set(hw, &rss_conf);
2047 }
2048
2049 /*
2050  * Check whether the MAC type supports VMDq.
2051  * Return 1 if it does, otherwise return 0.
2052  */
2053 static int
2054 igb_is_vmdq_supported(const struct rte_eth_dev *dev)
2055 {
2056         const struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2057
2058         switch (hw->mac.type) {
2059         case e1000_82576:
2060         case e1000_82580:
2061         case e1000_i350:
2062                 return 1;
2063         case e1000_82540:
2064         case e1000_82541:
2065         case e1000_82542:
2066         case e1000_82543:
2067         case e1000_82544:
2068         case e1000_82545:
2069         case e1000_82546:
2070         case e1000_82547:
2071         case e1000_82571:
2072         case e1000_82572:
2073         case e1000_82573:
2074         case e1000_82574:
2075         case e1000_82583:
2076         case e1000_i210:
2077         case e1000_i211:
2078         default:
2079                 PMD_INIT_LOG(ERR, "Cannot support VMDq feature");
2080                 return 0;
2081         }
2082 }
2083
2084 static int
2085 igb_vmdq_rx_hw_configure(struct rte_eth_dev *dev)
2086 {
2087         struct rte_eth_vmdq_rx_conf *cfg;
2088         struct e1000_hw *hw;
2089         uint32_t mrqc, vt_ctl, vmolr, rctl;
2090         int i;
2091
2092         PMD_INIT_FUNC_TRACE();
2093
2094         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2095         cfg = &dev->data->dev_conf.rx_adv_conf.vmdq_rx_conf;
2096
2097         /* Check if the MAC type supports VMDq; a return value of 0 means it does not */
2098         if (igb_is_vmdq_supported(dev) == 0)
2099                 return -1;
2100
2101         igb_rss_disable(dev);
2102
2103         /* RCTL: enable VLAN filter */
2104         rctl = E1000_READ_REG(hw, E1000_RCTL);
2105         rctl |= E1000_RCTL_VFE;
2106         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
2107
2108         /* MRQC: enable vmdq */
2109         mrqc = E1000_READ_REG(hw, E1000_MRQC);
2110         mrqc |= E1000_MRQC_ENABLE_VMDQ;
2111         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
2112
2113         /* VTCTL:  pool selection according to VLAN tag */
2114         vt_ctl = E1000_READ_REG(hw, E1000_VT_CTL);
2115         if (cfg->enable_default_pool)
2116                 vt_ctl |= (cfg->default_pool << E1000_VT_CTL_DEFAULT_POOL_SHIFT);
2117         vt_ctl |= E1000_VT_CTL_IGNORE_MAC;
2118         E1000_WRITE_REG(hw, E1000_VT_CTL, vt_ctl);
2119
2120         for (i = 0; i < E1000_VMOLR_SIZE; i++) {
2121                 vmolr = E1000_READ_REG(hw, E1000_VMOLR(i));
2122                 vmolr &= ~(E1000_VMOLR_AUPE | E1000_VMOLR_ROMPE |
2123                         E1000_VMOLR_ROPE | E1000_VMOLR_BAM |
2124                         E1000_VMOLR_MPME);
2125
2126                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_UNTAG)
2127                         vmolr |= E1000_VMOLR_AUPE;
2128                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_HASH_MC)
2129                         vmolr |= E1000_VMOLR_ROMPE;
2130                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_HASH_UC)
2131                         vmolr |= E1000_VMOLR_ROPE;
2132                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_BROADCAST)
2133                         vmolr |= E1000_VMOLR_BAM;
2134                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_MULTICAST)
2135                         vmolr |= E1000_VMOLR_MPME;
2136
2137                 E1000_WRITE_REG(hw, E1000_VMOLR(i), vmolr);
2138         }
2139
2140         /*
2141          * VMOLR: set STRVLAN to 1 if IGMAC in VTCTL is set to 1.
2142          * Both 82576 and 82580 support it.
2143          */
2144         if (hw->mac.type != e1000_i350) {
2145                 for (i = 0; i < E1000_VMOLR_SIZE; i++) {
2146                         vmolr = E1000_READ_REG(hw, E1000_VMOLR(i));
2147                         vmolr |= E1000_VMOLR_STRVLAN;
2148                         E1000_WRITE_REG(hw, E1000_VMOLR(i), vmolr);
2149                 }
2150         }
2151
2152         /* VFTA - enable all vlan filters */
2153         for (i = 0; i < IGB_VFTA_SIZE; i++)
2154                 E1000_WRITE_REG(hw, (E1000_VFTA+(i*4)), UINT32_MAX);
2155
2156         /* VFRE: enable 8 pools for RX; both 82576 and i350 support it */
2157         if (hw->mac.type != e1000_82580)
2158                 E1000_WRITE_REG(hw, E1000_VFRE, E1000_MBVFICR_VFREQ_MASK);
2159
2160         /*
2161          * RAH/RAL - allow pools to read specific mac addresses
2162          * In this case, all pools should be able to read from mac addr 0
2163          */
2164         E1000_WRITE_REG(hw, E1000_RAH(0), (E1000_RAH_AV | UINT16_MAX));
2165         E1000_WRITE_REG(hw, E1000_RAL(0), UINT32_MAX);
2166
2167         /* VLVF: set up filters for vlan tags as configured */
2168         for (i = 0; i < cfg->nb_pool_maps; i++) {
2169                 /* set vlan id in VF register and set the valid bit */
2170                 E1000_WRITE_REG(hw, E1000_VLVF(i), (E1000_VLVF_VLANID_ENABLE | \
2171                         (cfg->pool_map[i].vlan_id & ETH_VLAN_ID_MAX) | \
2172                         ((cfg->pool_map[i].pools << E1000_VLVF_POOLSEL_SHIFT ) & \
2173                         E1000_VLVF_POOLSEL_MASK)));
2174         }
2175
2176         E1000_WRITE_FLUSH(hw);
2177
2178         return 0;
2179 }
2180
2181
2182 /*********************************************************************
2183  *
2184  *  Enable receive unit.
2185  *
2186  **********************************************************************/
2187
2188 static int
2189 igb_alloc_rx_queue_mbufs(struct igb_rx_queue *rxq)
2190 {
2191         struct igb_rx_entry *rxe = rxq->sw_ring;
2192         uint64_t dma_addr;
2193         unsigned i;
2194
2195         /* Initialize software ring entries. */
2196         for (i = 0; i < rxq->nb_rx_desc; i++) {
2197                 volatile union e1000_adv_rx_desc *rxd;
2198                 struct rte_mbuf *mbuf = rte_mbuf_raw_alloc(rxq->mb_pool);
2199
2200                 if (mbuf == NULL) {
2201                         PMD_INIT_LOG(ERR, "RX mbuf alloc failed "
2202                                      "queue_id=%hu", rxq->queue_id);
2203                         return -ENOMEM;
2204                 }
2205                 dma_addr =
2206                         rte_cpu_to_le_64(rte_mbuf_data_iova_default(mbuf));
2207                 rxd = &rxq->rx_ring[i];
2208                 rxd->read.hdr_addr = 0;
2209                 rxd->read.pkt_addr = dma_addr;
2210                 rxe[i].mbuf = mbuf;
2211         }
2212
2213         return 0;
2214 }
2215
2216 #define E1000_MRQC_DEF_Q_SHIFT               (3)
2217 static int
2218 igb_dev_mq_rx_configure(struct rte_eth_dev *dev)
2219 {
2220         struct e1000_hw *hw =
2221                 E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2222         uint32_t mrqc;
2223
2224         if (RTE_ETH_DEV_SRIOV(dev).active == ETH_8_POOLS) {
2225                 /*
2226                  * SRIOV active scheme
2227                  * FIXME if support RSS together with VMDq & SRIOV
2228                  * FIXME: add support for RSS together with VMDq & SRIOV
2229                 mrqc = E1000_MRQC_ENABLE_VMDQ;
2230                 /* 011b Def_Q ignore, according to VT_CTL.DEF_PL */
2231                 mrqc |= 0x3 << E1000_MRQC_DEF_Q_SHIFT;
2232                 E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
2233         } else if(RTE_ETH_DEV_SRIOV(dev).active == 0) {
2234                 /*
2235                  * SRIOV inactive scheme
2236                  */
2237                 switch (dev->data->dev_conf.rxmode.mq_mode) {
2238                         case ETH_MQ_RX_RSS:
2239                                 igb_rss_configure(dev);
2240                                 break;
2241                         case ETH_MQ_RX_VMDQ_ONLY:
2242                                 /*Configure general VMDQ only RX parameters*/
2243                                 igb_vmdq_rx_hw_configure(dev);
2244                                 break;
2245                         case ETH_MQ_RX_NONE:
2246                                 /* if mq_mode is none, disable RSS mode. */
2247                         default:
2248                                 igb_rss_disable(dev);
2249                                 break;
2250                 }
2251         }
2252
2253         return 0;
2254 }
2255
2256 int
2257 eth_igb_rx_init(struct rte_eth_dev *dev)
2258 {
2259         struct e1000_hw     *hw;
2260         struct igb_rx_queue *rxq;
2261         uint32_t rctl;
2262         uint32_t rxcsum;
2263         uint32_t srrctl;
2264         uint16_t buf_size;
2265         uint16_t rctl_bsize;
2266         uint16_t i;
2267         int ret;
2268
2269         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2270         srrctl = 0;
2271
2272         /*
2273          * Make sure receives are disabled while setting
2274          * up the descriptor ring.
2275          */
2276         rctl = E1000_READ_REG(hw, E1000_RCTL);
2277         E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
2278
2279         /*
2280          * Configure support of jumbo frames, if any.
2281          */
2282         if (dev->data->dev_conf.rxmode.jumbo_frame == 1) {
2283                 rctl |= E1000_RCTL_LPE;
2284
2285                 /*
2286                  * Set the maximum packet length by default; it may be updated
2287                  * later when dual VLAN is enabled or disabled.
2288                  */
2289                 E1000_WRITE_REG(hw, E1000_RLPML,
2290                         dev->data->dev_conf.rxmode.max_rx_pkt_len +
2291                                                 VLAN_TAG_SIZE);
2292         } else
2293                 rctl &= ~E1000_RCTL_LPE;
2294
2295         /* Configure and enable each RX queue. */
2296         rctl_bsize = 0;
2297         dev->rx_pkt_burst = eth_igb_recv_pkts;
2298         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2299                 uint64_t bus_addr;
2300                 uint32_t rxdctl;
2301
2302                 rxq = dev->data->rx_queues[i];
2303
2304                 rxq->flags = 0;
2305                 /*
2306                  * i350 and i354 vlan packets have vlan tags byte swapped.
2307                  */
2308                 if (hw->mac.type == e1000_i350 || hw->mac.type == e1000_i354) {
2309                         rxq->flags |= IGB_RXQ_FLAG_LB_BSWAP_VLAN;
2310                         PMD_INIT_LOG(DEBUG, "IGB rx vlan bswap required");
2311                 } else {
2312                         PMD_INIT_LOG(DEBUG, "IGB rx vlan bswap not required");
2313                 }
2314
2315                 /* Allocate buffers for descriptor rings and set up queue */
2316                 ret = igb_alloc_rx_queue_mbufs(rxq);
2317                 if (ret)
2318                         return ret;
2319
2320                 /*
2321                  * Reset crc_len in case it was changed after queue setup by a
2322                  *  call to configure
2323                  */
2324                 rxq->crc_len =
2325                         (uint8_t)(dev->data->dev_conf.rxmode.hw_strip_crc ?
2326                                                         0 : ETHER_CRC_LEN);
2327
2328                 bus_addr = rxq->rx_ring_phys_addr;
2329                 E1000_WRITE_REG(hw, E1000_RDLEN(rxq->reg_idx),
2330                                 rxq->nb_rx_desc *
2331                                 sizeof(union e1000_adv_rx_desc));
2332                 E1000_WRITE_REG(hw, E1000_RDBAH(rxq->reg_idx),
2333                                 (uint32_t)(bus_addr >> 32));
2334                 E1000_WRITE_REG(hw, E1000_RDBAL(rxq->reg_idx), (uint32_t)bus_addr);
2335
2336                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
2337
2338                 /*
2339                  * Configure RX buffer size.
2340                  */
2341                 buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
2342                         RTE_PKTMBUF_HEADROOM);
2343                 if (buf_size >= 1024) {
2344                         /*
2345                          * Configure the BSIZEPACKET field of the SRRCTL
2346                          * register of the queue.
2347                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
2348                          * If this field is equal to 0b, then RCTL.BSIZE
2349                          * determines the RX packet buffer size.
2350                          */
2351                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
2352                                    E1000_SRRCTL_BSIZEPKT_MASK);
2353                         buf_size = (uint16_t) ((srrctl &
2354                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
2355                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
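                        /*
                         * Worked example (illustrative): buf_size = 2048
                         * gives 2048 >> 10 = 2 in BSIZEPACKET, and is then
                         * rounded back to 2 << 10 = 2048 bytes.
                         */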
2356
2357                         /* Add the dual VLAN tag length to support dual VLAN */
2358                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
2359                                                 2 * VLAN_TAG_SIZE) > buf_size){
2360                                 if (!dev->data->scattered_rx)
2361                                         PMD_INIT_LOG(DEBUG,
2362                                                      "forcing scatter mode");
2363                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2364                                 dev->data->scattered_rx = 1;
2365                         }
2366                 } else {
2367                         /*
2368                          * Use BSIZE field of the device RCTL register.
2369                          */
2370                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
2371                                 rctl_bsize = buf_size;
2372                         if (!dev->data->scattered_rx)
2373                                 PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2374                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2375                         dev->data->scattered_rx = 1;
2376                 }
2377
2378                 /* Set if packets are dropped when no descriptors available */
2379                 if (rxq->drop_en)
2380                         srrctl |= E1000_SRRCTL_DROP_EN;
2381
2382                 E1000_WRITE_REG(hw, E1000_SRRCTL(rxq->reg_idx), srrctl);
2383
2384                 /* Enable this RX queue. */
2385                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(rxq->reg_idx));
2386                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2387                 rxdctl &= 0xFFF00000;
2388                 rxdctl |= (rxq->pthresh & 0x1F);
2389                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
2390                 rxdctl |= ((rxq->wthresh & 0x1F) << 16);
2391                 E1000_WRITE_REG(hw, E1000_RXDCTL(rxq->reg_idx), rxdctl);
2392         }
2393
2394         if (dev->data->dev_conf.rxmode.enable_scatter) {
2395                 if (!dev->data->scattered_rx)
2396                         PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2397                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2398                 dev->data->scattered_rx = 1;
2399         }
2400
2401         /*
2402          * Set up the BSIZE field of the RCTL register, if needed.
2403          * Buffer sizes >= 1024 are not [supposed to be] set up in the RCTL
2404          * register, since the code above configures the SRRCTL register of
2405          * the RX queue in such a case.
2406          * All configurable sizes are:
2407          * 16384: rctl |= (E1000_RCTL_SZ_16384 | E1000_RCTL_BSEX);
2408          *  8192: rctl |= (E1000_RCTL_SZ_8192  | E1000_RCTL_BSEX);
2409          *  4096: rctl |= (E1000_RCTL_SZ_4096  | E1000_RCTL_BSEX);
2410          *  2048: rctl |= E1000_RCTL_SZ_2048;
2411          *  1024: rctl |= E1000_RCTL_SZ_1024;
2412          *   512: rctl |= E1000_RCTL_SZ_512;
2413          *   256: rctl |= E1000_RCTL_SZ_256;
2414          */
2415         if (rctl_bsize > 0) {
2416                 if (rctl_bsize >= 512) /* 512 <= buf_size < 1024 - use 512 */
2417                         rctl |= E1000_RCTL_SZ_512;
2418                 else /* 256 <= buf_size < 512 - use 256 */
2419                         rctl |= E1000_RCTL_SZ_256;
2420         }
2421
2422         /*
2423          * Configure RSS if device configured with multiple RX queues.
2424          */
2425         igb_dev_mq_rx_configure(dev);
2426
2427         /* Update the rctl since igb_dev_mq_rx_configure may change its value */
2428         rctl |= E1000_READ_REG(hw, E1000_RCTL);
2429
2430         /*
2431          * Setup the Checksum Register.
2432          * Receive Full-Packet Checksum Offload is mutually exclusive with RSS.
2433          */
2434         rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
2435         rxcsum |= E1000_RXCSUM_PCSD;
2436
2437         /* Enable both L3/L4 rx checksum offload */
2438         if (dev->data->dev_conf.rxmode.hw_ip_checksum)
2439                 rxcsum |= (E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL |
2440                                 E1000_RXCSUM_CRCOFL);
2441         else
2442                 rxcsum &= ~(E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL |
2443                                 E1000_RXCSUM_CRCOFL);
2444         E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
2445
2446         /* Setup the Receive Control Register. */
2447         if (dev->data->dev_conf.rxmode.hw_strip_crc) {
2448                 rctl |= E1000_RCTL_SECRC; /* Strip Ethernet CRC. */
2449
2450                 /* set STRCRC bit in all queues */
2451                 if (hw->mac.type == e1000_i350 ||
2452                     hw->mac.type == e1000_i210 ||
2453                     hw->mac.type == e1000_i211 ||
2454                     hw->mac.type == e1000_i354) {
2455                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2456                                 rxq = dev->data->rx_queues[i];
2457                                 uint32_t dvmolr = E1000_READ_REG(hw,
2458                                         E1000_DVMOLR(rxq->reg_idx));
2459                                 dvmolr |= E1000_DVMOLR_STRCRC;
2460                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
2461                         }
2462                 }
2463         } else {
2464                 rctl &= ~E1000_RCTL_SECRC; /* Do not Strip Ethernet CRC. */
2465
2466                 /* clear STRCRC bit in all queues */
2467                 if (hw->mac.type == e1000_i350 ||
2468                     hw->mac.type == e1000_i210 ||
2469                     hw->mac.type == e1000_i211 ||
2470                     hw->mac.type == e1000_i354) {
2471                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2472                                 rxq = dev->data->rx_queues[i];
2473                                 uint32_t dvmolr = E1000_READ_REG(hw,
2474                                         E1000_DVMOLR(rxq->reg_idx));
2475                                 dvmolr &= ~E1000_DVMOLR_STRCRC;
2476                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
2477                         }
2478                 }
2479         }
2480
2481         rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
2482         rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
2483                 E1000_RCTL_RDMTS_HALF |
2484                 (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
2485
2486         /* Make sure VLAN Filters are off. */
2487         if (dev->data->dev_conf.rxmode.mq_mode != ETH_MQ_RX_VMDQ_ONLY)
2488                 rctl &= ~E1000_RCTL_VFE;
2489         /* Don't store bad packets. */
2490         rctl &= ~E1000_RCTL_SBP;
2491
2492         /* Enable Receives. */
2493         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
2494
2495         /*
2496          * Setup the HW Rx Head and Tail Descriptor Pointers.
2497          * This needs to be done after enable.
2498          */
2499         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2500                 rxq = dev->data->rx_queues[i];
2501                 E1000_WRITE_REG(hw, E1000_RDH(rxq->reg_idx), 0);
2502                 E1000_WRITE_REG(hw, E1000_RDT(rxq->reg_idx), rxq->nb_rx_desc - 1);
2503         }
2504
2505         return 0;
2506 }
2507
2508 /*********************************************************************
2509  *
2510  *  Enable transmit unit.
2511  *
2512  **********************************************************************/
2513 void
2514 eth_igb_tx_init(struct rte_eth_dev *dev)
2515 {
2516         struct e1000_hw     *hw;
2517         struct igb_tx_queue *txq;
2518         uint32_t tctl;
2519         uint32_t txdctl;
2520         uint16_t i;
2521
2522         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2523
2524         /* Setup the Base and Length of the Tx Descriptor Rings. */
2525         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2526                 uint64_t bus_addr;
2527                 txq = dev->data->tx_queues[i];
2528                 bus_addr = txq->tx_ring_phys_addr;
2529
2530                 E1000_WRITE_REG(hw, E1000_TDLEN(txq->reg_idx),
2531                                 txq->nb_tx_desc *
2532                                 sizeof(union e1000_adv_tx_desc));
2533                 E1000_WRITE_REG(hw, E1000_TDBAH(txq->reg_idx),
2534                                 (uint32_t)(bus_addr >> 32));
2535                 E1000_WRITE_REG(hw, E1000_TDBAL(txq->reg_idx), (uint32_t)bus_addr);
2536
2537                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2538                 E1000_WRITE_REG(hw, E1000_TDT(txq->reg_idx), 0);
2539                 E1000_WRITE_REG(hw, E1000_TDH(txq->reg_idx), 0);
2540
2541                 /* Setup Transmit threshold registers. */
2542                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(txq->reg_idx));
2543                 txdctl |= txq->pthresh & 0x1F;
2544                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2545                 txdctl |= ((txq->wthresh & 0x1F) << 16);
2546                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2547                 E1000_WRITE_REG(hw, E1000_TXDCTL(txq->reg_idx), txdctl);
2548         }
2549
2550         /* Program the Transmit Control Register. */
2551         tctl = E1000_READ_REG(hw, E1000_TCTL);
2552         tctl &= ~E1000_TCTL_CT;
2553         tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
2554                  (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));
2555
2556         e1000_config_collision_dist(hw);
2557
2558         /* This write will effectively turn on the transmit unit. */
2559         E1000_WRITE_REG(hw, E1000_TCTL, tctl);
2560 }
2561
2562 /*********************************************************************
2563  *
2564  *  Enable VF receive unit.
2565  *
2566  **********************************************************************/
2567 int
2568 eth_igbvf_rx_init(struct rte_eth_dev *dev)
2569 {
2570         struct e1000_hw     *hw;
2571         struct igb_rx_queue *rxq;
2572         uint32_t srrctl;
2573         uint16_t buf_size;
2574         uint16_t rctl_bsize;
2575         uint16_t i;
2576         int ret;
2577
2578         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2579
2580         /* setup MTU */
2581         e1000_rlpml_set_vf(hw,
2582                 (uint16_t)(dev->data->dev_conf.rxmode.max_rx_pkt_len +
2583                 VLAN_TAG_SIZE));
2584
2585         /* Configure and enable each RX queue. */
2586         rctl_bsize = 0;
2587         dev->rx_pkt_burst = eth_igb_recv_pkts;
2588         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2589                 uint64_t bus_addr;
2590                 uint32_t rxdctl;
2591
2592                 rxq = dev->data->rx_queues[i];
2593
2594                 rxq->flags = 0;
2595                 /*
2596                  * i350VF LB vlan packets have vlan tags byte swapped.
2597                  */
                if (hw->mac.type == e1000_vfadapt_i350) {
                        rxq->flags |= IGB_RXQ_FLAG_LB_BSWAP_VLAN;
                        PMD_INIT_LOG(DEBUG, "IGB rx vlan bswap required");
                } else {
                        PMD_INIT_LOG(DEBUG, "IGB rx vlan bswap not required");
                }

                /* Allocate buffers for descriptor rings and set up queue */
                ret = igb_alloc_rx_queue_mbufs(rxq);
                if (ret)
                        return ret;

                bus_addr = rxq->rx_ring_phys_addr;
                E1000_WRITE_REG(hw, E1000_RDLEN(i),
                                rxq->nb_rx_desc *
                                sizeof(union e1000_adv_rx_desc));
                E1000_WRITE_REG(hw, E1000_RDBAH(i),
                                (uint32_t)(bus_addr >> 32));
                E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);

                srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;

                /*
                 * Configure RX buffer size.
                 */
                buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
                        RTE_PKTMBUF_HEADROOM);
                if (buf_size >= 1024) {
                        /*
                         * Configure the BSIZEPACKET field of the SRRCTL
                         * register of the queue.
                         * The value is in 1 KB resolution, from 1 KB to
                         * 127 KB.  If this field is zero, RCTL.BSIZE
                         * determines the RX packet buffer size instead.
                         */
                        srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
                                   E1000_SRRCTL_BSIZEPKT_MASK);
                        buf_size = (uint16_t) ((srrctl &
                                                E1000_SRRCTL_BSIZEPKT_MASK) <<
                                               E1000_SRRCTL_BSIZEPKT_SHIFT);
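
                        /*
                         * Worked example (assuming the default 128-byte
                         * RTE_PKTMBUF_HEADROOM): a mempool created with
                         * RTE_MBUF_DEFAULT_BUF_SIZE (2048 + 128) yields
                         * buf_size = 2048, so BSIZEPKT = 2048 >> 10 = 2 and
                         * the effective size written back is 2 << 10 = 2048.
                         * A 2048-byte data room would instead give
                         * buf_size = 1920, which rounds down to 1024.
                         */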

                        /*
                         * Add the length of two VLAN tags so double-tagged
                         * (QinQ) frames are accounted for; if the largest
                         * frame no longer fits in one buffer, switch to
                         * scattered RX.
                         */
                        if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
                                        2 * VLAN_TAG_SIZE) > buf_size) {
                                if (!dev->data->scattered_rx)
                                        PMD_INIT_LOG(DEBUG,
                                                     "forcing scatter mode");
                                dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
                                dev->data->scattered_rx = 1;
                        }
                } else {
                        /*
                         * Use BSIZE field of the device RCTL register.
                         */
                        if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
                                rctl_bsize = buf_size;
                        if (!dev->data->scattered_rx)
                                PMD_INIT_LOG(DEBUG, "forcing scatter mode");
                        dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
                        dev->data->scattered_rx = 1;
                }

                /* Set if packets are dropped when no descriptors available */
                if (rxq->drop_en)
                        srrctl |= E1000_SRRCTL_DROP_EN;

                E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);

                /* Enable this RX queue. */
                rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
                rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
                rxdctl &= 0xFFF00000;
                rxdctl |= (rxq->pthresh & 0x1F);
                rxdctl |= ((rxq->hthresh & 0x1F) << 8);
                if (hw->mac.type == e1000_vfadapt) {
                        /*
                         * Workaround for an 82576 VF erratum: force WTHRESH
                         * to 1 (bits 20:16, hence 0x10000) to avoid descriptor
                         * write-back occasionally not being triggered.
                         */
                        rxdctl |= 0x10000;
                        PMD_INIT_LOG(DEBUG, "Force set RX WTHRESH to 1!");
                } else {
                        rxdctl |= ((rxq->wthresh & 0x1F) << 16);
                }
                E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
        }

        if (dev->data->dev_conf.rxmode.enable_scatter) {
                if (!dev->data->scattered_rx)
                        PMD_INIT_LOG(DEBUG, "forcing scatter mode");
                dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
                dev->data->scattered_rx = 1;
        }

        /*
         * Setup the HW Rx Head and Tail Descriptor Pointers.
         * This needs to be done after enable.
         */
        for (i = 0; i < dev->data->nb_rx_queues; i++) {
                rxq = dev->data->rx_queues[i];
                E1000_WRITE_REG(hw, E1000_RDH(i), 0);
                E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);
        }

        return 0;
}
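
/*
 * Illustrative sketch (not part of the driver): the buffer size used above
 * comes from the mempool passed to rte_eth_rx_queue_setup().  Assuming an
 * igbvf port, a typical RX-side setup looks roughly like this; names and
 * sizes are examples only and error handling is omitted.
 *
 *      struct rte_mempool *mp;
 *
 *      mp = rte_pktmbuf_pool_create("vf_rx_pool", 8192, 256, 0,
 *                                   RTE_MBUF_DEFAULT_BUF_SIZE,
 *                                   rte_socket_id());
 *      rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(), NULL, mp);
 *      rte_eth_dev_start(port_id);    // eventually calls eth_igbvf_rx_init()
 */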

/*********************************************************************
 *
 *  Enable VF transmit unit.
 *
 **********************************************************************/
void
eth_igbvf_tx_init(struct rte_eth_dev *dev)
{
        struct e1000_hw     *hw;
        struct igb_tx_queue *txq;
        uint32_t txdctl;
        uint16_t i;

        hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);

        /* Setup the Base and Length of the Tx Descriptor Rings. */
        for (i = 0; i < dev->data->nb_tx_queues; i++) {
                uint64_t bus_addr;

                txq = dev->data->tx_queues[i];
                bus_addr = txq->tx_ring_phys_addr;
                E1000_WRITE_REG(hw, E1000_TDLEN(i),
                                txq->nb_tx_desc *
                                sizeof(union e1000_adv_tx_desc));
                E1000_WRITE_REG(hw, E1000_TDBAH(i),
                                (uint32_t)(bus_addr >> 32));
                E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);

                /* Setup the HW Tx Head and Tail descriptor pointers. */
                E1000_WRITE_REG(hw, E1000_TDT(i), 0);
                E1000_WRITE_REG(hw, E1000_TDH(i), 0);

                /* Setup Transmit threshold registers. */
                txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
                txdctl |= txq->pthresh & 0x1F;
                txdctl |= ((txq->hthresh & 0x1F) << 8);
                if (hw->mac.type == e1000_82576) {
                        /*
                         * Workaround for an 82576 VF erratum: force WTHRESH
                         * to 1 (bits 20:16, hence 0x10000) to avoid descriptor
                         * write-back occasionally not being triggered.
                         */
                        txdctl |= 0x10000;
                        PMD_INIT_LOG(DEBUG, "Force set TX WTHRESH to 1!");
                } else {
                        txdctl |= ((txq->wthresh & 0x1F) << 16);
                }
                txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
                E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
        }
}

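/*
 * Fill the generic rte_eth_rxq_info structure for the given RX queue;
 * reached through the ethdev rxq_info_get operation.
 */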
void
igb_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
        struct rte_eth_rxq_info *qinfo)
{
        struct igb_rx_queue *rxq;

        rxq = dev->data->rx_queues[queue_id];

        qinfo->mp = rxq->mb_pool;
        qinfo->scattered_rx = dev->data->scattered_rx;
        qinfo->nb_desc = rxq->nb_rx_desc;

        qinfo->conf.rx_free_thresh = rxq->rx_free_thresh;
        qinfo->conf.rx_drop_en = rxq->drop_en;
}

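/*
 * Fill the generic rte_eth_txq_info structure for the given TX queue;
 * reached through the ethdev txq_info_get operation.
 */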
void
igb_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
        struct rte_eth_txq_info *qinfo)
{
        struct igb_tx_queue *txq;

        txq = dev->data->tx_queues[queue_id];

        qinfo->nb_desc = txq->nb_tx_desc;

        qinfo->conf.tx_thresh.pthresh = txq->pthresh;
        qinfo->conf.tx_thresh.hthresh = txq->hthresh;
        qinfo->conf.tx_thresh.wthresh = txq->wthresh;
}
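
/*
 * Illustrative sketch (not part of the driver): applications reach the two
 * helpers above through the generic ethdev API rather than calling them
 * directly.  Queue id and fields printed are examples; error handling is
 * omitted.
 *
 *      struct rte_eth_rxq_info rx_info;
 *      struct rte_eth_txq_info tx_info;
 *
 *      rte_eth_rx_queue_info_get(port_id, 0, &rx_info);
 *      rte_eth_tx_queue_info_get(port_id, 0, &tx_info);
 *      printf("rxq0: %u descriptors, txq0 wthresh %u\n",
 *             (unsigned)rx_info.nb_desc,
 *             (unsigned)tx_info.conf.tx_thresh.wthresh);
 */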