deb_dpdk.git: drivers/net/e1000/igb_rxtx.c (New upstream version 16.11.9)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <sys/queue.h>
35
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <errno.h>
40 #include <stdint.h>
41 #include <stdarg.h>
42 #include <inttypes.h>
43
44 #include <rte_interrupts.h>
45 #include <rte_byteorder.h>
46 #include <rte_common.h>
47 #include <rte_log.h>
48 #include <rte_debug.h>
49 #include <rte_pci.h>
50 #include <rte_memory.h>
51 #include <rte_memcpy.h>
52 #include <rte_memzone.h>
53 #include <rte_launch.h>
54 #include <rte_eal.h>
55 #include <rte_per_lcore.h>
56 #include <rte_lcore.h>
57 #include <rte_atomic.h>
58 #include <rte_branch_prediction.h>
59 #include <rte_mempool.h>
60 #include <rte_malloc.h>
61 #include <rte_mbuf.h>
62 #include <rte_ether.h>
63 #include <rte_ethdev.h>
64 #include <rte_prefetch.h>
65 #include <rte_udp.h>
66 #include <rte_tcp.h>
67 #include <rte_sctp.h>
68 #include <rte_string_fns.h>
69
70 #include "e1000_logs.h"
71 #include "base/e1000_api.h"
72 #include "e1000_ethdev.h"
73
74 /* Bit mask indicating which offload bits require building a TX context. */
75 #define IGB_TX_OFFLOAD_MASK (                    \
76                 PKT_TX_OUTER_IPV6 |      \
77                 PKT_TX_OUTER_IPV4 |      \
78                 PKT_TX_IPV6 |            \
79                 PKT_TX_IPV4 |            \
80                 PKT_TX_VLAN_PKT |                \
81                 PKT_TX_IP_CKSUM |                \
82                 PKT_TX_L4_MASK |                 \
83                 PKT_TX_TCP_SEG)
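
/*
 * Illustrative note (not part of the driver): eth_igb_xmit_pkts() below
 * reduces each mbuf's offload flags with this mask to decide whether a TX
 * context descriptor may have to be built for the packet, roughly:
 *
 *	uint64_t tx_ol_req = mbuf->ol_flags & IGB_TX_OFFLOAD_MASK;
 *	if (tx_ol_req)
 *		;	// VLAN/checksum/TSO requested, a context may be needed
 */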
84
85 /**
86  * Structure associated with each descriptor of the RX ring of a RX queue.
87  */
88 struct igb_rx_entry {
89         struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
90 };
91
92 /**
93  * Structure associated with each descriptor of the TX ring of a TX queue.
94  */
95 struct igb_tx_entry {
96         struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
97         uint16_t next_id; /**< Index of next descriptor in ring. */
98         uint16_t last_id; /**< Index of last scattered descriptor. */
99 };
100
101 /**
102  * Structure associated with each RX queue.
103  */
104 struct igb_rx_queue {
105         struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
106         volatile union e1000_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
107         uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
108         volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
109         volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
110         struct igb_rx_entry *sw_ring;   /**< address of RX software ring. */
111         struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
112         struct rte_mbuf *pkt_last_seg;  /**< Last segment of current packet. */
113         uint16_t            nb_rx_desc; /**< number of RX descriptors. */
114         uint16_t            rx_tail;    /**< current value of RDT register. */
115         uint16_t            nb_rx_hold; /**< number of held free RX desc. */
116         uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
117         uint16_t            queue_id;   /**< RX queue index. */
118         uint16_t            reg_idx;    /**< RX queue register index. */
119         uint8_t             port_id;    /**< Device port identifier. */
120         uint8_t             pthresh;    /**< Prefetch threshold register. */
121         uint8_t             hthresh;    /**< Host threshold register. */
122         uint8_t             wthresh;    /**< Write-back threshold register. */
123         uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
124         uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En. */
125 };
126
127 /**
128  * Hardware context number
129  */
130 enum igb_advctx_num {
131         IGB_CTX_0    = 0, /**< CTX0    */
132         IGB_CTX_1    = 1, /**< CTX1    */
133         IGB_CTX_NUM  = 2, /**< CTX_NUM */
134 };
135
136 /** Offload features */
137 union igb_tx_offload {
138         uint64_t data;
139         struct {
140                 uint64_t l3_len:9; /**< L3 (IP) Header Length. */
141                 uint64_t l2_len:7; /**< L2 (MAC) Header Length. */
142                 uint64_t vlan_tci:16;  /**< VLAN Tag Control Identifier(CPU order). */
143                 uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */
144                 uint64_t tso_segsz:16; /**< TCP TSO segment size. */
145
146                 /* uint64_t unused:8; */
147         };
148 };
149
150 /*
151  * Compare masks for igb_tx_offload.data; they must be kept in sync with
152  * the igb_tx_offload bit layout above.
153  */
154 #define TX_MACIP_LEN_CMP_MASK   0x000000000000FFFFULL /**< L2L3 header mask. */
155 #define TX_VLAN_CMP_MASK                0x00000000FFFF0000ULL /**< Vlan mask. */
156 #define TX_TCP_LEN_CMP_MASK             0x000000FF00000000ULL /**< TCP header mask. */
157 #define TX_TSO_MSS_CMP_MASK             0x00FFFF0000000000ULL /**< TSO segsz mask. */
158 /** Mac + IP + TCP + Mss mask. */
159 #define TX_TSO_CMP_MASK \
160         (TX_MACIP_LEN_CMP_MASK | TX_TCP_LEN_CMP_MASK | TX_TSO_MSS_CMP_MASK)
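
/*
 * Illustrative example (assumes the bit layout above, as the driver itself
 * does): l3_len and l2_len together occupy bits 0..15 of igb_tx_offload.data,
 * exactly the range covered by TX_MACIP_LEN_CMP_MASK; vlan_tci maps to
 * TX_VLAN_CMP_MASK, l4_len to TX_TCP_LEN_CMP_MASK, tso_segsz to
 * TX_TSO_MSS_CMP_MASK.
 *
 *	union igb_tx_offload ofl = { .data = 0 };
 *	ofl.l2_len = 14;	// Ethernet header
 *	ofl.l3_len = 20;	// IPv4 header without options
 *	// Only the L2/L3 compare range is populated:
 *	// (ofl.data & ~TX_MACIP_LEN_CMP_MASK) == 0
 */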
161
162 /**
163  * Structure used to check whether a new context descriptor needs to be built.
164  */
165 struct igb_advctx_info {
166         uint64_t flags;           /**< ol_flags related to context build. */
167         /** tx offload: vlan, tso, l2-l3-l4 lengths. */
168         union igb_tx_offload tx_offload;
169         /** compare mask for tx offload. */
170         union igb_tx_offload tx_offload_mask;
171 };
172
173 /**
174  * Structure associated with each TX queue.
175  */
176 struct igb_tx_queue {
177         volatile union e1000_adv_tx_desc *tx_ring; /**< TX ring address */
178         uint64_t               tx_ring_phys_addr; /**< TX ring DMA address. */
179         struct igb_tx_entry    *sw_ring; /**< virtual address of SW ring. */
180         volatile uint32_t      *tdt_reg_addr; /**< Address of TDT register. */
181         uint32_t               txd_type;      /**< Device-specific TXD type */
182         uint16_t               nb_tx_desc;    /**< number of TX descriptors. */
183         uint16_t               tx_tail; /**< Current value of TDT register. */
184         uint16_t               tx_head;
185         /**< Index of first used TX descriptor. */
186         uint16_t               queue_id; /**< TX queue index. */
187         uint16_t               reg_idx;  /**< TX queue register index. */
188         uint8_t                port_id;  /**< Device port identifier. */
189         uint8_t                pthresh;  /**< Prefetch threshold register. */
190         uint8_t                hthresh;  /**< Host threshold register. */
191         uint8_t                wthresh;  /**< Write-back threshold register. */
192         uint32_t               ctx_curr;
193         /**< Current used hardware descriptor. */
194         uint32_t               ctx_start;
195         /**< Start context position for transmit queue. */
196         struct igb_advctx_info ctx_cache[IGB_CTX_NUM];
197         /**< Hardware context history.*/
198 };
199
200 #if 1
201 #define RTE_PMD_USE_PREFETCH
202 #endif
203
204 #ifdef RTE_PMD_USE_PREFETCH
205 #define rte_igb_prefetch(p)     rte_prefetch0(p)
206 #else
207 #define rte_igb_prefetch(p)     do {} while(0)
208 #endif
209
210 #ifdef RTE_PMD_PACKET_PREFETCH
211 #define rte_packet_prefetch(p) rte_prefetch1(p)
212 #else
213 #define rte_packet_prefetch(p)  do {} while(0)
214 #endif
215
216 /*
217  * Macros for the VMDq feature and TSO limits of the 1 GbE NIC.
218  */
219 #define E1000_VMOLR_SIZE                        (8)
220 #define IGB_TSO_MAX_HDRLEN                      (512)
221 #define IGB_TSO_MAX_MSS                         (9216)
222
223 /*********************************************************************
224  *
225  *  TX function
226  *
227  **********************************************************************/
228
229 /*
230  * The hardware has some limitations for TCP segmentation offload (TSO), so
231  * check whether the requested parameters are valid.
232  */
233 static inline uint64_t
234 check_tso_para(uint64_t ol_req, union igb_tx_offload ol_para)
235 {
236         if (!(ol_req & PKT_TX_TCP_SEG))
237                 return ol_req;
238         if ((ol_para.tso_segsz > IGB_TSO_MAX_MSS) || (ol_para.l2_len +
239                         ol_para.l3_len + ol_para.l4_len > IGB_TSO_MAX_HDRLEN)) {
240                 ol_req &= ~PKT_TX_TCP_SEG;
241                 ol_req |= PKT_TX_TCP_CKSUM;
242         }
243         return ol_req;
244 }
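
/*
 * Illustrative example (not part of the driver): a TSO request whose MSS
 * exceeds IGB_TSO_MAX_MSS, or whose combined L2+L3+L4 header exceeds
 * IGB_TSO_MAX_HDRLEN, is silently downgraded to a plain TCP checksum offload.
 *
 *	union igb_tx_offload ofl = { .data = 0 };
 *	ofl.l2_len = 14;
 *	ofl.l3_len = 20;
 *	ofl.l4_len = 20;
 *	ofl.tso_segsz = 16000;	// larger than IGB_TSO_MAX_MSS (9216)
 *	uint64_t req = check_tso_para(PKT_TX_TCP_SEG, ofl);
 *	// req == PKT_TX_TCP_CKSUM: segmentation was dropped from the request
 */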
245
246 /*
247  * Advanced context descriptors are almost the same between igb and ixgbe.
248  * This is kept as a separate function to leave room for optimization;
249  * rework is required to go with the pre-defined values.
250  */
251
252 static inline void
253 igbe_set_xmit_ctx(struct igb_tx_queue* txq,
254                 volatile struct e1000_adv_tx_context_desc *ctx_txd,
255                 uint64_t ol_flags, union igb_tx_offload tx_offload)
256 {
257         uint32_t type_tucmd_mlhl;
258         uint32_t mss_l4len_idx;
259         uint32_t ctx_idx, ctx_curr;
260         uint32_t vlan_macip_lens;
261         union igb_tx_offload tx_offload_mask;
262
263         ctx_curr = txq->ctx_curr;
264         ctx_idx = ctx_curr + txq->ctx_start;
265
266         tx_offload_mask.data = 0;
267         type_tucmd_mlhl = 0;
268
269         /* Specify which HW CTX to upload. */
270         mss_l4len_idx = (ctx_idx << E1000_ADVTXD_IDX_SHIFT);
271
272         if (ol_flags & PKT_TX_VLAN_PKT)
273                 tx_offload_mask.data |= TX_VLAN_CMP_MASK;
274
275         /* check if TCP segmentation required for this packet */
276         if (ol_flags & PKT_TX_TCP_SEG) {
277                 /* implies IP cksum in IPv4 */
278                 if (ol_flags & PKT_TX_IP_CKSUM)
279                         type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4 |
280                                 E1000_ADVTXD_TUCMD_L4T_TCP |
281                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
282                 else
283                         type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV6 |
284                                 E1000_ADVTXD_TUCMD_L4T_TCP |
285                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
286
287                 tx_offload_mask.data |= TX_TSO_CMP_MASK;
288                 mss_l4len_idx |= tx_offload.tso_segsz << E1000_ADVTXD_MSS_SHIFT;
289                 mss_l4len_idx |= tx_offload.l4_len << E1000_ADVTXD_L4LEN_SHIFT;
290         } else { /* no TSO, check if hardware checksum is needed */
291                 if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_L4_MASK))
292                         tx_offload_mask.data |= TX_MACIP_LEN_CMP_MASK;
293
294                 if (ol_flags & PKT_TX_IP_CKSUM)
295                         type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4;
296
297                 switch (ol_flags & PKT_TX_L4_MASK) {
298                 case PKT_TX_UDP_CKSUM:
299                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP |
300                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
301                         mss_l4len_idx |= sizeof(struct udp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
302                         break;
303                 case PKT_TX_TCP_CKSUM:
304                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP |
305                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
306                         mss_l4len_idx |= sizeof(struct tcp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
307                         break;
308                 case PKT_TX_SCTP_CKSUM:
309                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP |
310                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
311                         mss_l4len_idx |= sizeof(struct sctp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
312                         break;
313                 default:
314                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_RSV |
315                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
316                         break;
317                 }
318         }
319
320         txq->ctx_cache[ctx_curr].flags = ol_flags;
321         txq->ctx_cache[ctx_curr].tx_offload.data =
322                 tx_offload_mask.data & tx_offload.data;
323         txq->ctx_cache[ctx_curr].tx_offload_mask = tx_offload_mask;
324
325         ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
326         vlan_macip_lens = (uint32_t)tx_offload.data;
327         ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
328         ctx_txd->mss_l4len_idx = rte_cpu_to_le_32(mss_l4len_idx);
329         ctx_txd->seqnum_seed = 0;
330 }
331
332 /*
333  * Check which hardware context can be used: return the index of a cached
334  * match, or IGB_CTX_NUM when a new context descriptor has to be built.
335  */
336 static inline uint32_t
337 what_advctx_update(struct igb_tx_queue *txq, uint64_t flags,
338                 union igb_tx_offload tx_offload)
339 {
340         /* If match with the current context */
341         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
342                 (txq->ctx_cache[txq->ctx_curr].tx_offload.data ==
343                 (txq->ctx_cache[txq->ctx_curr].tx_offload_mask.data & tx_offload.data)))) {
344                         return txq->ctx_curr;
345         }
346
347         /* If match with the second context */
348         txq->ctx_curr ^= 1;
349         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
350                 (txq->ctx_cache[txq->ctx_curr].tx_offload.data ==
351                 (txq->ctx_cache[txq->ctx_curr].tx_offload_mask.data & tx_offload.data)))) {
352                         return txq->ctx_curr;
353         }
354
355         /* Mismatch: a new context descriptor has to be built */
356         return IGB_CTX_NUM;
357 }
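
/*
 * Illustrative sketch (not part of the driver): eth_igb_xmit_pkts() uses the
 * lookup above to reuse one of the two cached hardware contexts when
 * possible, and only emits a new context descriptor on a miss:
 *
 *	uint32_t ctx = what_advctx_update(txq, tx_ol_req, tx_offload);
 *	uint32_t new_ctx = (ctx == IGB_CTX_NUM);	// miss: build a new one
 *	ctx = txq->ctx_curr + txq->ctx_start;	// HW context index to reference
 */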
358
359 static inline uint32_t
360 tx_desc_cksum_flags_to_olinfo(uint64_t ol_flags)
361 {
362         static const uint32_t l4_olinfo[2] = {0, E1000_ADVTXD_POPTS_TXSM};
363         static const uint32_t l3_olinfo[2] = {0, E1000_ADVTXD_POPTS_IXSM};
364         uint32_t tmp;
365
366         tmp  = l4_olinfo[(ol_flags & PKT_TX_L4_MASK)  != PKT_TX_L4_NO_CKSUM];
367         tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
368         tmp |= l4_olinfo[(ol_flags & PKT_TX_TCP_SEG) != 0];
369         return tmp;
370 }
371
372 static inline uint32_t
373 tx_desc_vlan_flags_to_cmdtype(uint64_t ol_flags)
374 {
375         uint32_t cmdtype;
376         static uint32_t vlan_cmd[2] = {0, E1000_ADVTXD_DCMD_VLE};
377         static uint32_t tso_cmd[2] = {0, E1000_ADVTXD_DCMD_TSE};
378         cmdtype = vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
379         cmdtype |= tso_cmd[(ol_flags & PKT_TX_TCP_SEG) != 0];
380         return cmdtype;
381 }
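
/*
 * Illustrative example (not part of the driver): both helpers above use a
 * branchless idiom where a boolean expression indexes a two-entry table.
 *
 *	uint32_t cmd = tx_desc_vlan_flags_to_cmdtype(PKT_TX_VLAN_PKT);
 *	// cmd == E1000_ADVTXD_DCMD_VLE (VLAN insertion, no TSO)
 */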
382
383 uint16_t
384 eth_igb_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
385                uint16_t nb_pkts)
386 {
387         struct igb_tx_queue *txq;
388         struct igb_tx_entry *sw_ring;
389         struct igb_tx_entry *txe, *txn;
390         volatile union e1000_adv_tx_desc *txr;
391         volatile union e1000_adv_tx_desc *txd;
392         struct rte_mbuf     *tx_pkt;
393         struct rte_mbuf     *m_seg;
394         uint64_t buf_dma_addr;
395         uint32_t olinfo_status;
396         uint32_t cmd_type_len;
397         uint32_t pkt_len;
398         uint16_t slen;
399         uint64_t ol_flags;
400         uint16_t tx_end;
401         uint16_t tx_id;
402         uint16_t tx_last;
403         uint16_t nb_tx;
404         uint64_t tx_ol_req;
405         uint32_t new_ctx = 0;
406         uint32_t ctx = 0;
407         union igb_tx_offload tx_offload = {0};
408
409         txq = tx_queue;
410         sw_ring = txq->sw_ring;
411         txr     = txq->tx_ring;
412         tx_id   = txq->tx_tail;
413         txe = &sw_ring[tx_id];
414
415         for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
416                 tx_pkt = *tx_pkts++;
417                 pkt_len = tx_pkt->pkt_len;
418
419                 RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);
420
421                 /*
422                  * The number of descriptors that must be allocated for a
423                  * packet is the number of segments of that packet, plus 1
424                  * Context Descriptor for the VLAN Tag Identifier, if any.
425                  * Determine the last TX descriptor to allocate in the TX ring
426                  * for the packet, starting from the current position (tx_id)
427                  * in the ring.
428                  */
429                 tx_last = (uint16_t) (tx_id + tx_pkt->nb_segs - 1);
430
431                 ol_flags = tx_pkt->ol_flags;
432                 tx_ol_req = ol_flags & IGB_TX_OFFLOAD_MASK;
433
434                 /* Check whether a Context Descriptor needs to be built. */
435                 if (tx_ol_req) {
436                         tx_offload.l2_len = tx_pkt->l2_len;
437                         tx_offload.l3_len = tx_pkt->l3_len;
438                         tx_offload.l4_len = tx_pkt->l4_len;
439                         tx_offload.vlan_tci = tx_pkt->vlan_tci;
440                         tx_offload.tso_segsz = tx_pkt->tso_segsz;
441                         tx_ol_req = check_tso_para(tx_ol_req, tx_offload);
442
443                         ctx = what_advctx_update(txq, tx_ol_req, tx_offload);
444                         /* Only allocate context descriptor if required */
445                         new_ctx = (ctx == IGB_CTX_NUM);
446                         ctx = txq->ctx_curr + txq->ctx_start;
447                         tx_last = (uint16_t) (tx_last + new_ctx);
448                 }
449                 if (tx_last >= txq->nb_tx_desc)
450                         tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
451
452                 PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
453                            " tx_first=%u tx_last=%u",
454                            (unsigned) txq->port_id,
455                            (unsigned) txq->queue_id,
456                            (unsigned) pkt_len,
457                            (unsigned) tx_id,
458                            (unsigned) tx_last);
459
460                 /*
461                  * Check if there are enough free descriptors in the TX ring
462                  * to transmit the next packet.
463                  * This operation is based on the two following rules:
464                  *
465                  *   1- Only check that the last needed TX descriptor can be
466                  *      allocated (by construction, if that descriptor is free,
467                  *      all intermediate ones are also free).
468                  *
469                  *      For this purpose, the index of the last TX descriptor
470                  *      used for a packet (the "last descriptor" of a packet)
471                  *      is recorded in the TX entries (the last one included)
472                  *      that are associated with all TX descriptors allocated
473                  *      for that packet.
474                  *
475                  *   2- Avoid allocating the last free TX descriptor of the
476                  *      ring, in order to never set the TDT register with the
477                  *      same value stored in parallel by the NIC in the TDH
478                  *      register, which would make the TX engine of the NIC
479                  *      enter a deadlock situation.
480                  *
481                  *      By extension, avoid allocating a free descriptor that
482                  *      belongs to the last set of free descriptors allocated
483                  *      to the same packet previously transmitted.
484                  */
485
486                 /*
487                  * The "last descriptor" of the packet that previously used the
488                  * descriptor slot we now intend to allocate as our last one.
489                  */
490                 tx_end = sw_ring[tx_last].last_id;
491
492                 /*
493                  * The next descriptor following that "last descriptor" in the
494                  * ring.
495                  */
496                 tx_end = sw_ring[tx_end].next_id;
497
498                 /*
499                  * The "last descriptor" associated with that next descriptor.
500                  */
501                 tx_end = sw_ring[tx_end].last_id;
502
503                 /*
504                  * Check that this descriptor is free.
505                  */
506                 if (! (txr[tx_end].wb.status & E1000_TXD_STAT_DD)) {
507                         if (nb_tx == 0)
508                                 return 0;
509                         goto end_of_tx;
510                 }
511
512                 /*
513                  * Set common flags of all TX Data Descriptors.
514                  *
515                  * The following bits must be set in all Data Descriptors:
516                  *   - E1000_ADVTXD_DTYP_DATA
517                  *   - E1000_ADVTXD_DCMD_DEXT
518                  *
519                  * The following bits must be set in the first Data Descriptor
520                  * and are ignored in the other ones:
521                  *   - E1000_ADVTXD_DCMD_IFCS
522                  *   - E1000_ADVTXD_MAC_1588
523                  *   - E1000_ADVTXD_DCMD_VLE
524                  *
525                  * The following bits must only be set in the last Data
526                  * Descriptor:
527                  *   - E1000_TXD_CMD_EOP
528                  *
529                  * The following bits can be set in any Data Descriptor, but
530                  * are only set in the last Data Descriptor:
531                  *   - E1000_TXD_CMD_RS
532                  */
533                 cmd_type_len = txq->txd_type |
534                         E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT;
535                 if (tx_ol_req & PKT_TX_TCP_SEG)
536                         pkt_len -= (tx_pkt->l2_len + tx_pkt->l3_len + tx_pkt->l4_len);
537                 olinfo_status = (pkt_len << E1000_ADVTXD_PAYLEN_SHIFT);
538 #if defined(RTE_LIBRTE_IEEE1588)
539                 if (ol_flags & PKT_TX_IEEE1588_TMST)
540                         cmd_type_len |= E1000_ADVTXD_MAC_TSTAMP;
541 #endif
542                 if (tx_ol_req) {
543                         /* Setup TX Advanced context descriptor if required */
544                         if (new_ctx) {
545                                 volatile struct e1000_adv_tx_context_desc *
546                                     ctx_txd;
547
548                                 ctx_txd = (volatile struct
549                                     e1000_adv_tx_context_desc *)
550                                     &txr[tx_id];
551
552                                 txn = &sw_ring[txe->next_id];
553                                 RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
554
555                                 if (txe->mbuf != NULL) {
556                                         rte_pktmbuf_free_seg(txe->mbuf);
557                                         txe->mbuf = NULL;
558                                 }
559
560                                 igbe_set_xmit_ctx(txq, ctx_txd, tx_ol_req, tx_offload);
561
562                                 txe->last_id = tx_last;
563                                 tx_id = txe->next_id;
564                                 txe = txn;
565                         }
566
567                         /* Setup the TX Advanced Data Descriptor */
568                         cmd_type_len  |= tx_desc_vlan_flags_to_cmdtype(tx_ol_req);
569                         olinfo_status |= tx_desc_cksum_flags_to_olinfo(tx_ol_req);
570                         olinfo_status |= (ctx << E1000_ADVTXD_IDX_SHIFT);
571                 }
572
573                 m_seg = tx_pkt;
574                 do {
575                         txn = &sw_ring[txe->next_id];
576                         txd = &txr[tx_id];
577
578                         if (txe->mbuf != NULL)
579                                 rte_pktmbuf_free_seg(txe->mbuf);
580                         txe->mbuf = m_seg;
581
582                         /*
583                          * Set up transmit descriptor.
584                          */
585                         slen = (uint16_t) m_seg->data_len;
586                         buf_dma_addr = rte_mbuf_data_dma_addr(m_seg);
587                         txd->read.buffer_addr =
588                                 rte_cpu_to_le_64(buf_dma_addr);
589                         txd->read.cmd_type_len =
590                                 rte_cpu_to_le_32(cmd_type_len | slen);
591                         txd->read.olinfo_status =
592                                 rte_cpu_to_le_32(olinfo_status);
593                         txe->last_id = tx_last;
594                         tx_id = txe->next_id;
595                         txe = txn;
596                         m_seg = m_seg->next;
597                 } while (m_seg != NULL);
598
599                 /*
600                  * The last packet data descriptor needs End Of Packet (EOP)
601                  * and Report Status (RS).
602                  */
603                 txd->read.cmd_type_len |=
604                         rte_cpu_to_le_32(E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS);
605         }
606  end_of_tx:
607         rte_wmb();
608
609         /*
610          * Set the Transmit Descriptor Tail (TDT).
611          */
612         E1000_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
613         PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
614                    (unsigned) txq->port_id, (unsigned) txq->queue_id,
615                    (unsigned) tx_id, (unsigned) nb_tx);
616         txq->tx_tail = tx_id;
617
618         return nb_tx;
619 }
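
/*
 * Usage sketch (illustrative only; assumes an application-initialized port
 * and queue 0 bound to this PMD): the burst API may accept fewer packets
 * than requested when the ring runs out of free descriptors, so the caller
 * must deal with the remainder.
 *
 *	uint16_t sent = rte_eth_tx_burst(port_id, 0, pkts, nb_pkts);
 *	while (sent < nb_pkts)
 *		rte_pktmbuf_free(pkts[sent++]);	// or retry transmission later
 */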
620
621 /*********************************************************************
622  *
623  *  RX functions
624  *
625  **********************************************************************/
626 #define IGB_PACKET_TYPE_IPV4              0X01
627 #define IGB_PACKET_TYPE_IPV4_TCP          0X11
628 #define IGB_PACKET_TYPE_IPV4_UDP          0X21
629 #define IGB_PACKET_TYPE_IPV4_SCTP         0X41
630 #define IGB_PACKET_TYPE_IPV4_EXT          0X03
631 #define IGB_PACKET_TYPE_IPV4_EXT_SCTP     0X43
632 #define IGB_PACKET_TYPE_IPV6              0X04
633 #define IGB_PACKET_TYPE_IPV6_TCP          0X14
634 #define IGB_PACKET_TYPE_IPV6_UDP          0X24
635 #define IGB_PACKET_TYPE_IPV6_EXT          0X0C
636 #define IGB_PACKET_TYPE_IPV6_EXT_TCP      0X1C
637 #define IGB_PACKET_TYPE_IPV6_EXT_UDP      0X2C
638 #define IGB_PACKET_TYPE_IPV4_IPV6         0X05
639 #define IGB_PACKET_TYPE_IPV4_IPV6_TCP     0X15
640 #define IGB_PACKET_TYPE_IPV4_IPV6_UDP     0X25
641 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT     0X0D
642 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT_TCP 0X1D
643 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT_UDP 0X2D
644 #define IGB_PACKET_TYPE_MAX               0X80
645 #define IGB_PACKET_TYPE_MASK              0X7F
646 #define IGB_PACKET_TYPE_SHIFT             0X04
647 static inline uint32_t
648 igb_rxd_pkt_info_to_pkt_type(uint16_t pkt_info)
649 {
650         static const uint32_t
651                 ptype_table[IGB_PACKET_TYPE_MAX] __rte_cache_aligned = {
652                 [IGB_PACKET_TYPE_IPV4] = RTE_PTYPE_L2_ETHER |
653                         RTE_PTYPE_L3_IPV4,
654                 [IGB_PACKET_TYPE_IPV4_EXT] = RTE_PTYPE_L2_ETHER |
655                         RTE_PTYPE_L3_IPV4_EXT,
656                 [IGB_PACKET_TYPE_IPV6] = RTE_PTYPE_L2_ETHER |
657                         RTE_PTYPE_L3_IPV6,
658                 [IGB_PACKET_TYPE_IPV4_IPV6] = RTE_PTYPE_L2_ETHER |
659                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
660                         RTE_PTYPE_INNER_L3_IPV6,
661                 [IGB_PACKET_TYPE_IPV6_EXT] = RTE_PTYPE_L2_ETHER |
662                         RTE_PTYPE_L3_IPV6_EXT,
663                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT] = RTE_PTYPE_L2_ETHER |
664                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
665                         RTE_PTYPE_INNER_L3_IPV6_EXT,
666                 [IGB_PACKET_TYPE_IPV4_TCP] = RTE_PTYPE_L2_ETHER |
667                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP,
668                 [IGB_PACKET_TYPE_IPV6_TCP] = RTE_PTYPE_L2_ETHER |
669                         RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L4_TCP,
670                 [IGB_PACKET_TYPE_IPV4_IPV6_TCP] = RTE_PTYPE_L2_ETHER |
671                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
672                         RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_TCP,
673                 [IGB_PACKET_TYPE_IPV6_EXT_TCP] = RTE_PTYPE_L2_ETHER |
674                         RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L4_TCP,
675                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT_TCP] = RTE_PTYPE_L2_ETHER |
676                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
677                         RTE_PTYPE_INNER_L3_IPV6_EXT | RTE_PTYPE_INNER_L4_TCP,
678                 [IGB_PACKET_TYPE_IPV4_UDP] = RTE_PTYPE_L2_ETHER |
679                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_UDP,
680                 [IGB_PACKET_TYPE_IPV6_UDP] = RTE_PTYPE_L2_ETHER |
681                         RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L4_UDP,
682                 [IGB_PACKET_TYPE_IPV4_IPV6_UDP] =  RTE_PTYPE_L2_ETHER |
683                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
684                         RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_UDP,
685                 [IGB_PACKET_TYPE_IPV6_EXT_UDP] = RTE_PTYPE_L2_ETHER |
686                         RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L4_UDP,
687                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT_UDP] = RTE_PTYPE_L2_ETHER |
688                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
689                         RTE_PTYPE_INNER_L3_IPV6_EXT | RTE_PTYPE_INNER_L4_UDP,
690                 [IGB_PACKET_TYPE_IPV4_SCTP] = RTE_PTYPE_L2_ETHER |
691                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_SCTP,
692                 [IGB_PACKET_TYPE_IPV4_EXT_SCTP] = RTE_PTYPE_L2_ETHER |
693                         RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L4_SCTP,
694         };
695         if (unlikely(pkt_info & E1000_RXDADV_PKTTYPE_ETQF))
696                 return RTE_PTYPE_UNKNOWN;
697
698         pkt_info = (pkt_info >> IGB_PACKET_TYPE_SHIFT) & IGB_PACKET_TYPE_MASK;
699
700         return ptype_table[pkt_info];
701 }
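
/*
 * Illustrative example (not part of the driver): an IPv4/TCP descriptor maps
 * to RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP, which an
 * application can test on the returned mbuf:
 *
 *	uint32_t ptype = mbuf->packet_type;
 *	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4)
 *		;	// plain IPv4 header, no options/extensions
 */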
702
703 static inline uint64_t
704 rx_desc_hlen_type_rss_to_pkt_flags(struct igb_rx_queue *rxq, uint32_t hl_tp_rs)
705 {
706         uint64_t pkt_flags = ((hl_tp_rs & 0x0F) == 0) ?  0 : PKT_RX_RSS_HASH;
707
708 #if defined(RTE_LIBRTE_IEEE1588)
709         static uint32_t ip_pkt_etqf_map[8] = {
710                 0, 0, 0, PKT_RX_IEEE1588_PTP,
711                 0, 0, 0, 0,
712         };
713
714         struct rte_eth_dev dev = rte_eth_devices[rxq->port_id];
715         struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev.data->dev_private);
716
717         /* EtherType is in bits 8:10 in Packet Type, and not in the default 0:2 */
718         if (hw->mac.type == e1000_i210)
719                 pkt_flags |= ip_pkt_etqf_map[(hl_tp_rs >> 12) & 0x07];
720         else
721                 pkt_flags |= ip_pkt_etqf_map[(hl_tp_rs >> 4) & 0x07];
722 #else
723         RTE_SET_USED(rxq);
724 #endif
725
726         return pkt_flags;
727 }
728
729 static inline uint64_t
730 rx_desc_status_to_pkt_flags(uint32_t rx_status)
731 {
732         uint64_t pkt_flags;
733
734         /* Check if VLAN present */
735         pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ?
736                 PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED : 0);
737
738 #if defined(RTE_LIBRTE_IEEE1588)
739         if (rx_status & E1000_RXD_STAT_TMST)
740                 pkt_flags = pkt_flags | PKT_RX_IEEE1588_TMST;
741 #endif
742         return pkt_flags;
743 }
744
745 static inline uint64_t
746 rx_desc_error_to_pkt_flags(uint32_t rx_status)
747 {
748         /*
749          * Bit 30: IPE, IPv4 checksum error
750          * Bit 29: L4I, L4I integrity error
751          */
752
753         static uint64_t error_to_pkt_flags_map[4] = {
754                 PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD,
755                 PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD,
756                 PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD,
757                 PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
758         };
759         return error_to_pkt_flags_map[(rx_status >>
760                 E1000_RXD_ERR_CKSUM_BIT) & E1000_RXD_ERR_CKSUM_MSK];
761 }
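
/*
 * Illustrative example (not part of the driver): applications normally test
 * the resulting mbuf flags instead of the raw descriptor bits.
 *
 *	if (mbuf->ol_flags & PKT_RX_IP_CKSUM_BAD)
 *		;	// IPv4 header checksum failed, e.g. drop or count it
 */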
762
763 uint16_t
764 eth_igb_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
765                uint16_t nb_pkts)
766 {
767         struct igb_rx_queue *rxq;
768         volatile union e1000_adv_rx_desc *rx_ring;
769         volatile union e1000_adv_rx_desc *rxdp;
770         struct igb_rx_entry *sw_ring;
771         struct igb_rx_entry *rxe;
772         struct rte_mbuf *rxm;
773         struct rte_mbuf *nmb;
774         union e1000_adv_rx_desc rxd;
775         uint64_t dma_addr;
776         uint32_t staterr;
777         uint32_t hlen_type_rss;
778         uint16_t pkt_len;
779         uint16_t rx_id;
780         uint16_t nb_rx;
781         uint16_t nb_hold;
782         uint64_t pkt_flags;
783
784         nb_rx = 0;
785         nb_hold = 0;
786         rxq = rx_queue;
787         rx_id = rxq->rx_tail;
788         rx_ring = rxq->rx_ring;
789         sw_ring = rxq->sw_ring;
790         while (nb_rx < nb_pkts) {
791                 /*
792                  * The order of operations here is important as the DD status
793                  * bit must not be read after any other descriptor fields.
794                  * rx_ring and rxdp are pointing to volatile data so the order
795                  * of accesses cannot be reordered by the compiler. If they were
796                  * not volatile, they could be reordered which could lead to
797                  * using invalid descriptor fields when read from rxd.
798                  */
799                 rxdp = &rx_ring[rx_id];
800                 staterr = rxdp->wb.upper.status_error;
801                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
802                         break;
803                 rxd = *rxdp;
804
805                 /*
806                  * End of packet.
807                  *
808                  * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
809                  * likely to be invalid and to be dropped by the various
810                  * validation checks performed by the network stack.
811                  *
812                  * Allocate a new mbuf to replenish the RX ring descriptor.
813                  * If the allocation fails:
814                  *    - arrange for that RX descriptor to be the first one
815                  *      being parsed the next time the receive function is
816                  *      invoked [on the same queue].
817                  *
818                  *    - Stop parsing the RX ring and return immediately.
819                  *
820                  * This policy does not drop the packet received in the RX
821                  * descriptor for which the allocation of a new mbuf failed.
822                  * Thus, it allows that packet to be retrieved later if
823                  * mbufs have been freed in the meantime.
824                  * As a side effect, holding RX descriptors instead of
825                  * systematically giving them back to the NIC may lead to
826                  * RX ring exhaustion situations.
827                  * However, the NIC can gracefully prevent such situations
828                  * from happening by sending specific "back-pressure" flow
829                  * control frames to its peer(s).
830                  */
831                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
832                            "staterr=0x%x pkt_len=%u",
833                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
834                            (unsigned) rx_id, (unsigned) staterr,
835                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
836
837                 nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
838                 if (nmb == NULL) {
839                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
840                                    "queue_id=%u", (unsigned) rxq->port_id,
841                                    (unsigned) rxq->queue_id);
842                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
843                         break;
844                 }
845
846                 nb_hold++;
847                 rxe = &sw_ring[rx_id];
848                 rx_id++;
849                 if (rx_id == rxq->nb_rx_desc)
850                         rx_id = 0;
851
852                 /* Prefetch next mbuf while processing current one. */
853                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
854
855                 /*
856                  * When next RX descriptor is on a cache-line boundary,
857                  * prefetch the next 4 RX descriptors and the next 8 pointers
858                  * to mbufs.
859                  */
860                 if ((rx_id & 0x3) == 0) {
861                         rte_igb_prefetch(&rx_ring[rx_id]);
862                         rte_igb_prefetch(&sw_ring[rx_id]);
863                 }
864
865                 rxm = rxe->mbuf;
866                 rxe->mbuf = nmb;
867                 dma_addr =
868                         rte_cpu_to_le_64(rte_mbuf_data_dma_addr_default(nmb));
869                 rxdp->read.hdr_addr = 0;
870                 rxdp->read.pkt_addr = dma_addr;
871
872                 /*
873                  * Initialize the returned mbuf.
874                  * 1) setup generic mbuf fields:
875                  *    - number of segments,
876                  *    - next segment,
877                  *    - packet length,
878                  *    - RX port identifier.
879                  * 2) integrate hardware offload data, if any:
880                  *    - RSS flag & hash,
881                  *    - IP checksum flag,
882                  *    - VLAN TCI, if any,
883                  *    - error flags.
884                  */
885                 pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
886                                       rxq->crc_len);
887                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
888                 rte_packet_prefetch((char *)rxm->buf_addr + rxm->data_off);
889                 rxm->nb_segs = 1;
890                 rxm->next = NULL;
891                 rxm->pkt_len = pkt_len;
892                 rxm->data_len = pkt_len;
893                 rxm->port = rxq->port_id;
894
895                 rxm->hash.rss = rxd.wb.lower.hi_dword.rss;
896                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
897                 /* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
898                 rxm->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
899
900                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(rxq, hlen_type_rss);
901                 pkt_flags = pkt_flags | rx_desc_status_to_pkt_flags(staterr);
902                 pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
903                 rxm->ol_flags = pkt_flags;
904                 rxm->packet_type = igb_rxd_pkt_info_to_pkt_type(rxd.wb.lower.
905                                                 lo_dword.hs_rss.pkt_info);
906
907                 /*
908                  * Store the mbuf address into the next entry of the array
909                  * of returned packets.
910                  */
911                 rx_pkts[nb_rx++] = rxm;
912         }
913         rxq->rx_tail = rx_id;
914
915         /*
916          * If the number of free RX descriptors is greater than the RX free
917          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
918          * register.
919          * Update the RDT with the value of the last processed RX descriptor
920          * minus 1, to guarantee that the RDT register is never equal to the
921          * RDH register, which creates a "full" ring situation from the
922          * hardware point of view...
923          */
924         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
925         if (nb_hold > rxq->rx_free_thresh) {
926                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
927                            "nb_hold=%u nb_rx=%u",
928                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
929                            (unsigned) rx_id, (unsigned) nb_hold,
930                            (unsigned) nb_rx);
931                 rx_id = (uint16_t) ((rx_id == 0) ?
932                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
933                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
934                 nb_hold = 0;
935         }
936         rxq->nb_rx_hold = nb_hold;
937         return nb_rx;
938 }
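
/*
 * Usage sketch (illustrative only; assumes an application-initialized port
 * and queue 0 bound to this PMD): a return value of 0 simply means that no
 * descriptor was ready.
 *
 *	struct rte_mbuf *burst[32];
 *	uint16_t i, nb;
 *
 *	nb = rte_eth_rx_burst(port_id, 0, burst, 32);
 *	for (i = 0; i < nb; i++)
 *		rte_pktmbuf_free(burst[i]);	// ... process the packet, then free
 */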
939
940 uint16_t
941 eth_igb_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
942                          uint16_t nb_pkts)
943 {
944         struct igb_rx_queue *rxq;
945         volatile union e1000_adv_rx_desc *rx_ring;
946         volatile union e1000_adv_rx_desc *rxdp;
947         struct igb_rx_entry *sw_ring;
948         struct igb_rx_entry *rxe;
949         struct rte_mbuf *first_seg;
950         struct rte_mbuf *last_seg;
951         struct rte_mbuf *rxm;
952         struct rte_mbuf *nmb;
953         union e1000_adv_rx_desc rxd;
954         uint64_t dma; /* Physical address of mbuf data buffer */
955         uint32_t staterr;
956         uint32_t hlen_type_rss;
957         uint16_t rx_id;
958         uint16_t nb_rx;
959         uint16_t nb_hold;
960         uint16_t data_len;
961         uint64_t pkt_flags;
962
963         nb_rx = 0;
964         nb_hold = 0;
965         rxq = rx_queue;
966         rx_id = rxq->rx_tail;
967         rx_ring = rxq->rx_ring;
968         sw_ring = rxq->sw_ring;
969
970         /*
971          * Retrieve RX context of current packet, if any.
972          */
973         first_seg = rxq->pkt_first_seg;
974         last_seg = rxq->pkt_last_seg;
975
976         while (nb_rx < nb_pkts) {
977         next_desc:
978                 /*
979                  * The order of operations here is important as the DD status
980                  * bit must not be read after any other descriptor fields.
981                  * rx_ring and rxdp are pointing to volatile data so the order
982                  * of accesses cannot be reordered by the compiler. If they were
983                  * not volatile, they could be reordered which could lead to
984                  * using invalid descriptor fields when read from rxd.
985                  */
986                 rxdp = &rx_ring[rx_id];
987                 staterr = rxdp->wb.upper.status_error;
988                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
989                         break;
990                 rxd = *rxdp;
991
992                 /*
993                  * Descriptor done.
994                  *
995                  * Allocate a new mbuf to replenish the RX ring descriptor.
996                  * If the allocation fails:
997                  *    - arrange for that RX descriptor to be the first one
998                  *      being parsed the next time the receive function is
999                  *      invoked [on the same queue].
1000                  *
1001                  *    - Stop parsing the RX ring and return immediately.
1002                  *
1003                  * This policy does not drop the packet received in the RX
1004                  * descriptor for which the allocation of a new mbuf failed.
1005                  * Thus, it allows that packet to be later retrieved if
1006                  * mbufs have been freed in the meantime.
1007                  * As a side effect, holding RX descriptors instead of
1008                  * systematically giving them back to the NIC may lead to
1009                  * RX ring exhaustion situations.
1010                  * However, the NIC can gracefully prevent such situations
1011                  * from happening by sending specific "back-pressure" flow
1012                  * control frames to its peer(s).
1013                  */
1014                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
1015                            "staterr=0x%x data_len=%u",
1016                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1017                            (unsigned) rx_id, (unsigned) staterr,
1018                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
1019
1020                 nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
1021                 if (nmb == NULL) {
1022                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
1023                                    "queue_id=%u", (unsigned) rxq->port_id,
1024                                    (unsigned) rxq->queue_id);
1025                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
1026                         break;
1027                 }
1028
1029                 nb_hold++;
1030                 rxe = &sw_ring[rx_id];
1031                 rx_id++;
1032                 if (rx_id == rxq->nb_rx_desc)
1033                         rx_id = 0;
1034
1035                 /* Prefetch next mbuf while processing current one. */
1036                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
1037
1038                 /*
1039                  * When next RX descriptor is on a cache-line boundary,
1040                  * prefetch the next 4 RX descriptors and the next 8 pointers
1041                  * to mbufs.
1042                  */
1043                 if ((rx_id & 0x3) == 0) {
1044                         rte_igb_prefetch(&rx_ring[rx_id]);
1045                         rte_igb_prefetch(&sw_ring[rx_id]);
1046                 }
1047
1048                 /*
1049                  * Update RX descriptor with the physical address of the new
1050                  * data buffer of the newly allocated mbuf.
1051                  */
1052                 rxm = rxe->mbuf;
1053                 rxe->mbuf = nmb;
1054                 dma = rte_cpu_to_le_64(rte_mbuf_data_dma_addr_default(nmb));
1055                 rxdp->read.pkt_addr = dma;
1056                 rxdp->read.hdr_addr = 0;
1057
1058                 /*
1059                  * Set data length & data buffer address of mbuf.
1060                  */
1061                 data_len = rte_le_to_cpu_16(rxd.wb.upper.length);
1062                 rxm->data_len = data_len;
1063                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
1064
1065                 /*
1066                  * If this is the first buffer of the received packet,
1067                  * set the pointer to the first mbuf of the packet and
1068                  * initialize its context.
1069                  * Otherwise, update the total length and the number of segments
1070                  * of the current scattered packet, and update the pointer to
1071                  * the last mbuf of the current packet.
1072                  */
1073                 if (first_seg == NULL) {
1074                         first_seg = rxm;
1075                         first_seg->pkt_len = data_len;
1076                         first_seg->nb_segs = 1;
1077                 } else {
1078                         first_seg->pkt_len += data_len;
1079                         first_seg->nb_segs++;
1080                         last_seg->next = rxm;
1081                 }
1082
1083                 /*
1084                  * If this is not the last buffer of the received packet,
1085                  * update the pointer to the last mbuf of the current scattered
1086                  * packet and continue to parse the RX ring.
1087                  */
1088                 if (! (staterr & E1000_RXD_STAT_EOP)) {
1089                         last_seg = rxm;
1090                         goto next_desc;
1091                 }
1092
1093                 /*
1094                  * This is the last buffer of the received packet.
1095                  * If the CRC is not stripped by the hardware:
1096                  *   - Subtract the CRC length from the total packet length.
1097                  *   - If the last buffer only contains the whole CRC or a part
1098                  *     of it, free the mbuf associated to the last buffer.
1099                  *     If part of the CRC is also contained in the previous
1100                  *     mbuf, subtract the length of that CRC part from the
1101                  *     data length of the previous mbuf.
1102                  */
1103                 rxm->next = NULL;
1104                 if (unlikely(rxq->crc_len > 0)) {
1105                         first_seg->pkt_len -= ETHER_CRC_LEN;
1106                         if (data_len <= ETHER_CRC_LEN) {
1107                                 rte_pktmbuf_free_seg(rxm);
1108                                 first_seg->nb_segs--;
1109                                 last_seg->data_len = (uint16_t)
1110                                         (last_seg->data_len -
1111                                          (ETHER_CRC_LEN - data_len));
1112                                 last_seg->next = NULL;
1113                         } else
1114                                 rxm->data_len =
1115                                         (uint16_t) (data_len - ETHER_CRC_LEN);
1116                 }
1117
1118                 /*
1119                  * Initialize the first mbuf of the returned packet:
1120                  *    - RX port identifier,
1121                  *    - hardware offload data, if any:
1122                  *      - RSS flag & hash,
1123                  *      - IP checksum flag,
1124                  *      - VLAN TCI, if any,
1125                  *      - error flags.
1126                  */
1127                 first_seg->port = rxq->port_id;
1128                 first_seg->hash.rss = rxd.wb.lower.hi_dword.rss;
1129
1130                 /*
1131                  * The vlan_tci field is only valid when PKT_RX_VLAN_PKT is
1132                  * set in the pkt_flags field.
1133                  */
1134                 first_seg->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
1135                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
1136                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(rxq, hlen_type_rss);
1137                 pkt_flags = pkt_flags | rx_desc_status_to_pkt_flags(staterr);
1138                 pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
1139                 first_seg->ol_flags = pkt_flags;
1140                 first_seg->packet_type = igb_rxd_pkt_info_to_pkt_type(rxd.wb.
1141                                         lower.lo_dword.hs_rss.pkt_info);
1142
1143                 /* Prefetch data of first segment, if configured to do so. */
1144                 rte_packet_prefetch((char *)first_seg->buf_addr +
1145                         first_seg->data_off);
1146
1147                 /*
1148                  * Store the mbuf address into the next entry of the array
1149                  * of returned packets.
1150                  */
1151                 rx_pkts[nb_rx++] = first_seg;
1152
1153                 /*
1154          * Set up the receive context for a new packet.
1155                  */
1156                 first_seg = NULL;
1157         }
1158
1159         /*
1160          * Record index of the next RX descriptor to probe.
1161          */
1162         rxq->rx_tail = rx_id;
1163
1164         /*
1165          * Save receive context.
1166          */
1167         rxq->pkt_first_seg = first_seg;
1168         rxq->pkt_last_seg = last_seg;
1169
1170         /*
1171          * If the number of free RX descriptors is greater than the RX free
1172          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
1173          * register.
1174          * Update the RDT with the value of the last processed RX descriptor
1175          * minus 1, to guarantee that the RDT register is never equal to the
1176          * RDH register, which creates a "full" ring situation from the
1177          * hardware point of view...
1178          */
1179         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
1180         if (nb_hold > rxq->rx_free_thresh) {
1181                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
1182                            "nb_hold=%u nb_rx=%u",
1183                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1184                            (unsigned) rx_id, (unsigned) nb_hold,
1185                            (unsigned) nb_rx);
1186                 rx_id = (uint16_t) ((rx_id == 0) ?
1187                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
1188                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
1189                 nb_hold = 0;
1190         }
1191         rxq->nb_rx_hold = nb_hold;
1192         return nb_rx;
1193 }
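
/*
 * Illustrative example (not part of the driver): packets returned by the
 * scattered receive path may span several chained mbufs; pkt_len covers the
 * whole chain while data_len is per segment.
 *
 *	uint32_t total = 0;
 *	struct rte_mbuf *seg;
 *
 *	for (seg = m; seg != NULL; seg = seg->next)	// m: a received mbuf
 *		total += seg->data_len;
 *	// total == m->pkt_len, and m->nb_segs segments were visited
 */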
1194
1195 /*
1196  * Maximum number of Ring Descriptors.
1197  *
1198  * Since RDLEN/TDLEN must be a multiple of 128 bytes, the number of ring
1199  * descriptors should meet the following condition:
1200  *      (num_ring_desc * sizeof(struct e1000_rx/tx_desc)) % 128 == 0
1201  */
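
/*
 * Worked example (illustrative only): the advanced RX/TX descriptors are
 * 16 bytes each, so any count that is a multiple of 8 satisfies the rule,
 * e.g. 512 descriptors -> 512 * 16 = 8192 bytes, and 8192 % 128 == 0.
 */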
1202
1203 static void
1204 igb_tx_queue_release_mbufs(struct igb_tx_queue *txq)
1205 {
1206         unsigned i;
1207
1208         if (txq->sw_ring != NULL) {
1209                 for (i = 0; i < txq->nb_tx_desc; i++) {
1210                         if (txq->sw_ring[i].mbuf != NULL) {
1211                                 rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
1212                                 txq->sw_ring[i].mbuf = NULL;
1213                         }
1214                 }
1215         }
1216 }
1217
1218 static void
1219 igb_tx_queue_release(struct igb_tx_queue *txq)
1220 {
1221         if (txq != NULL) {
1222                 igb_tx_queue_release_mbufs(txq);
1223                 rte_free(txq->sw_ring);
1224                 rte_free(txq);
1225         }
1226 }
1227
1228 void
1229 eth_igb_tx_queue_release(void *txq)
1230 {
1231         igb_tx_queue_release(txq);
1232 }
1233
1234 static void
1235 igb_reset_tx_queue_stat(struct igb_tx_queue *txq)
1236 {
1237         txq->tx_head = 0;
1238         txq->tx_tail = 0;
1239         txq->ctx_curr = 0;
1240         memset((void*)&txq->ctx_cache, 0,
1241                 IGB_CTX_NUM * sizeof(struct igb_advctx_info));
1242 }
1243
1244 static void
1245 igb_reset_tx_queue(struct igb_tx_queue *txq, struct rte_eth_dev *dev)
1246 {
1247         static const union e1000_adv_tx_desc zeroed_desc = {{0}};
1248         struct igb_tx_entry *txe = txq->sw_ring;
1249         uint16_t i, prev;
1250         struct e1000_hw *hw;
1251
1252         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1253         /* Zero out HW ring memory */
1254         for (i = 0; i < txq->nb_tx_desc; i++) {
1255                 txq->tx_ring[i] = zeroed_desc;
1256         }
1257
1258         /* Initialize ring entries */
1259         prev = (uint16_t)(txq->nb_tx_desc - 1);
1260         for (i = 0; i < txq->nb_tx_desc; i++) {
1261                 volatile union e1000_adv_tx_desc *txd = &(txq->tx_ring[i]);
1262
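                /*
                 * Mark each descriptor as already "done" (DD set) so that the
                 * transmit path treats the whole ring as free after reset.
                 */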
1263                 txd->wb.status = E1000_TXD_STAT_DD;
1264                 txe[i].mbuf = NULL;
1265                 txe[i].last_id = i;
1266                 txe[prev].next_id = i;
1267                 prev = i;
1268         }
1269
1270         txq->txd_type = E1000_ADVTXD_DTYP_DATA;
1271         /* 82575 specific, each tx queue will use 2 hw contexts */
1272         if (hw->mac.type == e1000_82575)
1273                 txq->ctx_start = txq->queue_id * IGB_CTX_NUM;
1274
1275         igb_reset_tx_queue_stat(txq);
1276 }
1277
1278 int
1279 eth_igb_tx_queue_setup(struct rte_eth_dev *dev,
1280                          uint16_t queue_idx,
1281                          uint16_t nb_desc,
1282                          unsigned int socket_id,
1283                          const struct rte_eth_txconf *tx_conf)
1284 {
1285         const struct rte_memzone *tz;
1286         struct igb_tx_queue *txq;
1287         struct e1000_hw     *hw;
1288         uint32_t size;
1289
1290         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1291
1292         /*
1293          * Validate number of transmit descriptors.
1294          * It must not exceed the hardware maximum, and must be a multiple of
1295          * IGB_TXD_ALIGN (so the ring size in bytes is a multiple of E1000_ALIGN).
1296          */
1297         if (nb_desc % IGB_TXD_ALIGN != 0 ||
1298                         (nb_desc > E1000_MAX_RING_DESC) ||
1299                         (nb_desc < E1000_MIN_RING_DESC)) {
1300                 return -EINVAL;
1301         }
1302
1303         /*
1304          * The tx_free_thresh and tx_rs_thresh values are not used in the 1G
1305          * driver.
1306          */
1307         if (tx_conf->tx_free_thresh != 0)
1308                 PMD_INIT_LOG(INFO, "The tx_free_thresh parameter is not "
1309                              "used for the 1G driver.");
1310         if (tx_conf->tx_rs_thresh != 0)
1311                 PMD_INIT_LOG(INFO, "The tx_rs_thresh parameter is not "
1312                              "used for the 1G driver.");
1313         if (tx_conf->tx_thresh.wthresh == 0 && hw->mac.type != e1000_82576)
1314                 PMD_INIT_LOG(INFO, "To improve 1G driver performance, "
1315                              "consider setting the TX WTHRESH value to 4, 8, "
1316                              "or 16.");
1317
1318         /* Free memory prior to re-allocation if needed */
1319         if (dev->data->tx_queues[queue_idx] != NULL) {
1320                 igb_tx_queue_release(dev->data->tx_queues[queue_idx]);
1321                 dev->data->tx_queues[queue_idx] = NULL;
1322         }
1323
1324         /* First allocate the tx queue data structure */
1325         txq = rte_zmalloc("ethdev TX queue", sizeof(struct igb_tx_queue),
1326                                                         RTE_CACHE_LINE_SIZE);
1327         if (txq == NULL)
1328                 return -ENOMEM;
1329
1330         /*
1331          * Allocate TX ring hardware descriptors. A memzone large enough to
1332          * handle the maximum ring size is allocated in order to allow for
1333          * resizing in later calls to the queue setup function.
1334          */
1335         size = sizeof(union e1000_adv_tx_desc) * E1000_MAX_RING_DESC;
1336         tz = rte_eth_dma_zone_reserve(dev, "tx_ring", queue_idx, size,
1337                                       E1000_ALIGN, socket_id);
1338         if (tz == NULL) {
1339                 igb_tx_queue_release(txq);
1340                 return -ENOMEM;
1341         }
1342
1343         txq->nb_tx_desc = nb_desc;
1344         txq->pthresh = tx_conf->tx_thresh.pthresh;
1345         txq->hthresh = tx_conf->tx_thresh.hthresh;
1346         txq->wthresh = tx_conf->tx_thresh.wthresh;
1347         if (txq->wthresh > 0 && hw->mac.type == e1000_82576)
1348                 txq->wthresh = 1;
1349         txq->queue_id = queue_idx;
1350         txq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1351                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1352         txq->port_id = dev->data->port_id;
1353
1354         txq->tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(txq->reg_idx));
1355         txq->tx_ring_phys_addr = rte_mem_phy2mch(tz->memseg_id, tz->phys_addr);
1356
1357         txq->tx_ring = (union e1000_adv_tx_desc *) tz->addr;
1358         /* Allocate software ring */
1359         txq->sw_ring = rte_zmalloc("txq->sw_ring",
1360                                    sizeof(struct igb_tx_entry) * nb_desc,
1361                                    RTE_CACHE_LINE_SIZE);
1362         if (txq->sw_ring == NULL) {
1363                 igb_tx_queue_release(txq);
1364                 return -ENOMEM;
1365         }
1366         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
1367                      txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
1368
1369         igb_reset_tx_queue(txq, dev);
1370         dev->tx_pkt_burst = eth_igb_xmit_pkts;
1371         dev->data->tx_queues[queue_idx] = txq;
1372
1373         return 0;
1374 }
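/*
 * Illustrative usage sketch (not part of the driver): an application does not
 * call eth_igb_tx_queue_setup() directly; it goes through the ethdev API,
 * which dispatches here for igb ports.  The helper name, port id, descriptor
 * count and threshold values below are arbitrary example choices.
 */
static inline int
igb_example_setup_txq(uint8_t port_id)
{
        struct rte_eth_txconf txconf = {
                .tx_thresh = {
                        .pthresh = 8,
                        .hthresh = 1,
                        .wthresh = 16, /* see the WTHRESH hint logged above */
                },
        };

        /* 512 descriptors: a multiple of IGB_TXD_ALIGN, within HW limits. */
        return rte_eth_tx_queue_setup(port_id, 0, 512, rte_socket_id(),
                                      &txconf);
}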
1375
1376 static void
1377 igb_rx_queue_release_mbufs(struct igb_rx_queue *rxq)
1378 {
1379         unsigned i;
1380
1381         if (rxq->sw_ring != NULL) {
1382                 for (i = 0; i < rxq->nb_rx_desc; i++) {
1383                         if (rxq->sw_ring[i].mbuf != NULL) {
1384                                 rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
1385                                 rxq->sw_ring[i].mbuf = NULL;
1386                         }
1387                 }
1388         }
1389 }
1390
1391 static void
1392 igb_rx_queue_release(struct igb_rx_queue *rxq)
1393 {
1394         if (rxq != NULL) {
1395                 igb_rx_queue_release_mbufs(rxq);
1396                 rte_free(rxq->sw_ring);
1397                 rte_free(rxq);
1398         }
1399 }
1400
1401 void
1402 eth_igb_rx_queue_release(void *rxq)
1403 {
1404         igb_rx_queue_release(rxq);
1405 }
1406
1407 static void
1408 igb_reset_rx_queue(struct igb_rx_queue *rxq)
1409 {
1410         static const union e1000_adv_rx_desc zeroed_desc = {{0}};
1411         unsigned i;
1412
1413         /* Zero out HW ring memory */
1414         for (i = 0; i < rxq->nb_rx_desc; i++) {
1415                 rxq->rx_ring[i] = zeroed_desc;
1416         }
1417
1418         rxq->rx_tail = 0;
1419         rxq->pkt_first_seg = NULL;
1420         rxq->pkt_last_seg = NULL;
1421 }
1422
1423 int
1424 eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
1425                          uint16_t queue_idx,
1426                          uint16_t nb_desc,
1427                          unsigned int socket_id,
1428                          const struct rte_eth_rxconf *rx_conf,
1429                          struct rte_mempool *mp)
1430 {
1431         const struct rte_memzone *rz;
1432         struct igb_rx_queue *rxq;
1433         struct e1000_hw     *hw;
1434         unsigned int size;
1435
1436         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1437
1438         /*
1439          * Validate number of receive descriptors.
1440          * It must not exceed the hardware maximum, and must be a multiple of
1441          * IGB_RXD_ALIGN (so the ring size in bytes is a multiple of E1000_ALIGN).
1442          */
1443         if (nb_desc % IGB_RXD_ALIGN != 0 ||
1444                         (nb_desc > E1000_MAX_RING_DESC) ||
1445                         (nb_desc < E1000_MIN_RING_DESC)) {
1446                 return -EINVAL;
1447         }
1448
1449         /* Free memory prior to re-allocation if needed */
1450         if (dev->data->rx_queues[queue_idx] != NULL) {
1451                 igb_rx_queue_release(dev->data->rx_queues[queue_idx]);
1452                 dev->data->rx_queues[queue_idx] = NULL;
1453         }
1454
1455         /* First allocate the RX queue data structure. */
1456         rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
1457                           RTE_CACHE_LINE_SIZE);
1458         if (rxq == NULL)
1459                 return -ENOMEM;
1460         rxq->mb_pool = mp;
1461         rxq->nb_rx_desc = nb_desc;
1462         rxq->pthresh = rx_conf->rx_thresh.pthresh;
1463         rxq->hthresh = rx_conf->rx_thresh.hthresh;
1464         rxq->wthresh = rx_conf->rx_thresh.wthresh;
1465         if (rxq->wthresh > 0 &&
1466             (hw->mac.type == e1000_82576 || hw->mac.type == e1000_vfadapt_i350))
1467                 rxq->wthresh = 1;
1468         rxq->drop_en = rx_conf->rx_drop_en;
1469         rxq->rx_free_thresh = rx_conf->rx_free_thresh;
1470         rxq->queue_id = queue_idx;
1471         rxq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1472                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1473         rxq->port_id = dev->data->port_id;
1474         rxq->crc_len = (uint8_t) ((dev->data->dev_conf.rxmode.hw_strip_crc) ? 0 :
1475                                   ETHER_CRC_LEN);
1476
1477         /*
1478          *  Allocate RX ring hardware descriptors. A memzone large enough to
1479          *  handle the maximum ring size is allocated in order to allow for
1480          *  resizing in later calls to the queue setup function.
1481          */
1482         size = sizeof(union e1000_adv_rx_desc) * E1000_MAX_RING_DESC;
1483         rz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx, size,
1484                                       E1000_ALIGN, socket_id);
1485         if (rz == NULL) {
1486                 igb_rx_queue_release(rxq);
1487                 return -ENOMEM;
1488         }
1489         rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(rxq->reg_idx));
1490         rxq->rdh_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDH(rxq->reg_idx));
1491         rxq->rx_ring_phys_addr = rte_mem_phy2mch(rz->memseg_id, rz->phys_addr);
1492         rxq->rx_ring = (union e1000_adv_rx_desc *) rz->addr;
1493
1494         /* Allocate software ring. */
1495         rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
1496                                    sizeof(struct igb_rx_entry) * nb_desc,
1497                                    RTE_CACHE_LINE_SIZE);
1498         if (rxq->sw_ring == NULL) {
1499                 igb_rx_queue_release(rxq);
1500                 return -ENOMEM;
1501         }
1502         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
1503                      rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
1504
1505         dev->data->rx_queues[queue_idx] = rxq;
1506         igb_reset_rx_queue(rxq);
1507
1508         return 0;
1509 }
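/*
 * Illustrative usage sketch (not part of the driver): RX queue setup is
 * likewise reached through the ethdev API.  The helper name, mempool name,
 * sizes and descriptor count below are arbitrary example values.
 */
static inline int
igb_example_setup_rxq(uint8_t port_id)
{
        struct rte_mempool *mp;

        /* 8192 mbufs, default data room, no per-mbuf private area. */
        mp = rte_pktmbuf_pool_create("igb_example_rx_pool", 8192, 256, 0,
                                     RTE_MBUF_DEFAULT_BUF_SIZE,
                                     rte_socket_id());
        if (mp == NULL)
                return -ENOMEM;

        /* NULL rx_conf: let the PMD apply its default thresholds. */
        return rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(),
                                      NULL, mp);
}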
1510
1511 uint32_t
1512 eth_igb_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1513 {
1514 #define IGB_RXQ_SCAN_INTERVAL 4
1515         volatile union e1000_adv_rx_desc *rxdp;
1516         struct igb_rx_queue *rxq;
1517         uint32_t desc = 0;
1518
1519         if (rx_queue_id >= dev->data->nb_rx_queues) {
1520                 PMD_RX_LOG(ERR, "Invalid RX queue id=%d", rx_queue_id);
1521                 return 0;
1522         }
1523
1524         rxq = dev->data->rx_queues[rx_queue_id];
1525         rxdp = &(rxq->rx_ring[rxq->rx_tail]);
1526
1527         while ((desc < rxq->nb_rx_desc) &&
1528                 (rxdp->wb.upper.status_error & E1000_RXD_STAT_DD)) {
1529                 desc += IGB_RXQ_SCAN_INTERVAL;
1530                 rxdp += IGB_RXQ_SCAN_INTERVAL;
1531                 if (rxq->rx_tail + desc >= rxq->nb_rx_desc)
1532                         rxdp = &(rxq->rx_ring[rxq->rx_tail +
1533                                 desc - rxq->nb_rx_desc]);
1534         }
1535
1536         return desc;
1537 }
1538
1539 int
1540 eth_igb_rx_descriptor_done(void *rx_queue, uint16_t offset)
1541 {
1542         volatile union e1000_adv_rx_desc *rxdp;
1543         struct igb_rx_queue *rxq = rx_queue;
1544         uint32_t desc;
1545
1546         if (unlikely(offset >= rxq->nb_rx_desc))
1547                 return 0;
1548         desc = rxq->rx_tail + offset;
1549         if (desc >= rxq->nb_rx_desc)
1550                 desc -= rxq->nb_rx_desc;
1551
1552         rxdp = &rxq->rx_ring[desc];
1553         return !!(rxdp->wb.upper.status_error & E1000_RXD_STAT_DD);
1554 }
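/*
 * Illustrative usage sketch (not part of the driver): from an application,
 * the two helpers above are reached through rte_eth_rx_queue_count() and
 * rte_eth_rx_descriptor_done().  The helper name and "burst" parameter are
 * ours; burst must be at least 1.
 */
static inline int
igb_example_rx_backlog_ready(uint8_t port_id, uint16_t burst)
{
        /* Non-zero when the descriptor at offset (burst - 1) has DD set. */
        return rte_eth_rx_descriptor_done(port_id, 0, burst - 1);
}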
1555
1556 void
1557 igb_dev_clear_queues(struct rte_eth_dev *dev)
1558 {
1559         uint16_t i;
1560         struct igb_tx_queue *txq;
1561         struct igb_rx_queue *rxq;
1562
1563         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1564                 txq = dev->data->tx_queues[i];
1565                 if (txq != NULL) {
1566                         igb_tx_queue_release_mbufs(txq);
1567                         igb_reset_tx_queue(txq, dev);
1568                 }
1569         }
1570
1571         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1572                 rxq = dev->data->rx_queues[i];
1573                 if (rxq != NULL) {
1574                         igb_rx_queue_release_mbufs(rxq);
1575                         igb_reset_rx_queue(rxq);
1576                 }
1577         }
1578 }
1579
1580 void
1581 igb_dev_free_queues(struct rte_eth_dev *dev)
1582 {
1583         uint16_t i;
1584
1585         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1586                 eth_igb_rx_queue_release(dev->data->rx_queues[i]);
1587                 dev->data->rx_queues[i] = NULL;
1588         }
1589         dev->data->nb_rx_queues = 0;
1590
1591         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1592                 eth_igb_tx_queue_release(dev->data->tx_queues[i]);
1593                 dev->data->tx_queues[i] = NULL;
1594         }
1595         dev->data->nb_tx_queues = 0;
1596 }
1597
1598 /**
1599  * Receive Side Scaling (RSS).
1600  * See section 7.1.1.7 in the following document:
1601  *     "Intel 82576 GbE Controller Datasheet" - Revision 2.45 October 2009
1602  *
1603  * Principles:
1604  * The source and destination IP addresses of the IP header and the source and
1605  * destination ports of TCP/UDP headers, if any, of received packets are hashed
1606  * against a configurable random key to compute a 32-bit RSS hash result.
1607  * The seven (7) LSBs of the 32-bit hash result are used as an index into a
1608  * 128-entry redirection table (RETA).  Each entry of the RETA provides a 3-bit
1609  * RSS output index, which is used as the index of the RX queue in which the
1610  * received packet is stored.
1611  * The following output is supplied in the RX write-back descriptor:
1612  *     - 32-bit result of the Microsoft RSS hash function,
1613  *     - 4-bit RSS type field.
1614  */
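/*
 * Editorial sketch (not part of the driver): how the RETA lookup described
 * above selects a queue.  The helper name and the "sw_reta" array (a
 * hypothetical software mirror of the 128-entry hardware table) are ours.
 */
static inline uint8_t
igb_example_rss_queue(uint32_t rss_hash, const uint8_t sw_reta[128])
{
        /* The 7 least significant bits of the hash index the 128 entries. */
        return sw_reta[rss_hash & 0x7F];
}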
1615
1616 /*
1617  * RSS random key supplied in section 7.1.1.7.3 of the Intel 82576 datasheet.
1618  * Used as the default key.
1619  */
1620 static uint8_t rss_intel_key[40] = {
1621         0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
1622         0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
1623         0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
1624         0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
1625         0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
1626 };
1627
1628 static void
1629 igb_rss_disable(struct rte_eth_dev *dev)
1630 {
1631         struct e1000_hw *hw;
1632         uint32_t mrqc;
1633
1634         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1635         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1636         mrqc &= ~E1000_MRQC_ENABLE_MASK;
1637         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1638 }
1639
1640 static void
1641 igb_hw_rss_hash_set(struct e1000_hw *hw, struct rte_eth_rss_conf *rss_conf)
1642 {
1643         uint8_t  *hash_key;
1644         uint32_t rss_key;
1645         uint32_t mrqc;
1646         uint64_t rss_hf;
1647         uint16_t i;
1648
1649         hash_key = rss_conf->rss_key;
1650         if (hash_key != NULL) {
1651                 /* Fill in RSS hash key */
1652                 for (i = 0; i < 10; i++) {
1653                         rss_key  = hash_key[(i * 4)];
1654                         rss_key |= hash_key[(i * 4) + 1] << 8;
1655                         rss_key |= hash_key[(i * 4) + 2] << 16;
1656                         rss_key |= hash_key[(i * 4) + 3] << 24;
1657                         E1000_WRITE_REG_ARRAY(hw, E1000_RSSRK(0), i, rss_key);
1658                 }
1659         }
1660
1661         /* Set configured hashing protocols in MRQC register */
1662         rss_hf = rss_conf->rss_hf;
1663         mrqc = E1000_MRQC_ENABLE_RSS_4Q; /* RSS enabled. */
1664         if (rss_hf & ETH_RSS_IPV4)
1665                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4;
1666         if (rss_hf & ETH_RSS_NONFRAG_IPV4_TCP)
1667                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_TCP;
1668         if (rss_hf & ETH_RSS_IPV6)
1669                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6;
1670         if (rss_hf & ETH_RSS_IPV6_EX)
1671                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_EX;
1672         if (rss_hf & ETH_RSS_NONFRAG_IPV6_TCP)
1673                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP;
1674         if (rss_hf & ETH_RSS_IPV6_TCP_EX)
1675                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP_EX;
1676         if (rss_hf & ETH_RSS_NONFRAG_IPV4_UDP)
1677                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_UDP;
1678         if (rss_hf & ETH_RSS_NONFRAG_IPV6_UDP)
1679                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP;
1680         if (rss_hf & ETH_RSS_IPV6_UDP_EX)
1681                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP_EX;
1682         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1683 }
1684
1685 int
1686 eth_igb_rss_hash_update(struct rte_eth_dev *dev,
1687                         struct rte_eth_rss_conf *rss_conf)
1688 {
1689         struct e1000_hw *hw;
1690         uint32_t mrqc;
1691         uint64_t rss_hf;
1692
1693         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1694
1695         /*
1696          * Before changing anything, check that the requested update does not
1697          * attempt to disable RSS when RSS was enabled at initialization
1698          * time, or to enable RSS when RSS was disabled at initialization
1699          * time.
1700          */
1701         rss_hf = rss_conf->rss_hf & IGB_RSS_OFFLOAD_ALL;
1702         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1703         if (!(mrqc & E1000_MRQC_ENABLE_MASK)) { /* RSS disabled */
1704                 if (rss_hf != 0) /* Enable RSS */
1705                         return -(EINVAL);
1706                 return 0; /* Nothing to do */
1707         }
1708         /* RSS enabled */
1709         if (rss_hf == 0) /* Disable RSS */
1710                 return -(EINVAL);
1711         igb_hw_rss_hash_set(hw, rss_conf);
1712         return 0;
1713 }
1714
1715 int eth_igb_rss_hash_conf_get(struct rte_eth_dev *dev,
1716                               struct rte_eth_rss_conf *rss_conf)
1717 {
1718         struct e1000_hw *hw;
1719         uint8_t *hash_key;
1720         uint32_t rss_key;
1721         uint32_t mrqc;
1722         uint64_t rss_hf;
1723         uint16_t i;
1724
1725         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1726         hash_key = rss_conf->rss_key;
1727         if (hash_key != NULL) {
1728                 /* Return RSS hash key */
1729                 for (i = 0; i < 10; i++) {
1730                         rss_key = E1000_READ_REG_ARRAY(hw, E1000_RSSRK(0), i);
1731                         hash_key[(i * 4)] = rss_key & 0x000000FF;
1732                         hash_key[(i * 4) + 1] = (rss_key >> 8) & 0x000000FF;
1733                         hash_key[(i * 4) + 2] = (rss_key >> 16) & 0x000000FF;
1734                         hash_key[(i * 4) + 3] = (rss_key >> 24) & 0x000000FF;
1735                 }
1736         }
1737
1738         /* Get RSS functions configured in MRQC register */
1739         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1740         if ((mrqc & E1000_MRQC_ENABLE_RSS_4Q) == 0) { /* RSS is disabled */
1741                 rss_conf->rss_hf = 0;
1742                 return 0;
1743         }
1744         rss_hf = 0;
1745         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4)
1746                 rss_hf |= ETH_RSS_IPV4;
1747         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4_TCP)
1748                 rss_hf |= ETH_RSS_NONFRAG_IPV4_TCP;
1749         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6)
1750                 rss_hf |= ETH_RSS_IPV6;
1751         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_EX)
1752                 rss_hf |= ETH_RSS_IPV6_EX;
1753         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_TCP)
1754                 rss_hf |= ETH_RSS_NONFRAG_IPV6_TCP;
1755         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_TCP_EX)
1756                 rss_hf |= ETH_RSS_IPV6_TCP_EX;
1757         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4_UDP)
1758                 rss_hf |= ETH_RSS_NONFRAG_IPV4_UDP;
1759         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_UDP)
1760                 rss_hf |= ETH_RSS_NONFRAG_IPV6_UDP;
1761         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_UDP_EX)
1762                 rss_hf |= ETH_RSS_IPV6_UDP_EX;
1763         rss_conf->rss_hf = rss_hf;
1764         return 0;
1765 }
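/*
 * Illustrative usage sketch (not part of the driver): querying the current
 * RSS configuration through the ethdev API.  The helper name is ours; note
 * that the key buffer must be large enough for the 40-byte key written back
 * above.
 */
static inline int
igb_example_get_rss_conf(uint8_t port_id)
{
        uint8_t key[40];
        struct rte_eth_rss_conf conf = {
                .rss_key = key,
                .rss_key_len = sizeof(key),
        };

        return rte_eth_dev_rss_hash_conf_get(port_id, &conf);
}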
1766
1767 static void
1768 igb_rss_configure(struct rte_eth_dev *dev)
1769 {
1770         struct rte_eth_rss_conf rss_conf;
1771         struct e1000_hw *hw;
1772         uint32_t shift;
1773         uint16_t i;
1774
1775         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1776
1777         /* Fill in redirection table. */
1778         shift = (hw->mac.type == e1000_82575) ? 6 : 0;
1779         for (i = 0; i < 128; i++) {
1780                 union e1000_reta {
1781                         uint32_t dword;
1782                         uint8_t  bytes[4];
1783                 } reta;
1784                 uint8_t q_idx;
1785
1786                 q_idx = (uint8_t) ((dev->data->nb_rx_queues > 1) ?
1787                                    i % dev->data->nb_rx_queues : 0);
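                /*
                 * Pack four one-byte RETA entries into one 32-bit word and
                 * write the word out once its last byte has been filled.
                 */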
1788                 reta.bytes[i & 3] = (uint8_t) (q_idx << shift);
1789                 if ((i & 3) == 3)
1790                         E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
1791         }
1792
1793         /*
1794          * Configure the RSS key and the RSS protocols used to compute
1795          * the RSS hash of input packets.
1796          */
1797         rss_conf = dev->data->dev_conf.rx_adv_conf.rss_conf;
1798         if ((rss_conf.rss_hf & IGB_RSS_OFFLOAD_ALL) == 0) {
1799                 igb_rss_disable(dev);
1800                 return;
1801         }
1802         if (rss_conf.rss_key == NULL)
1803                 rss_conf.rss_key = rss_intel_key; /* Default hash key */
1804         igb_hw_rss_hash_set(hw, &rss_conf);
1805 }
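/*
 * Illustrative usage sketch (not part of the driver): the RSS path above is
 * taken when the application configures the port for RSS before starting it.
 * The helper name, queue counts and hash-function mask are arbitrary
 * examples; with rss_key left NULL the default Intel key above is used.
 */
static inline int
igb_example_enable_rss(uint8_t port_id)
{
        struct rte_eth_conf conf = {
                .rxmode = {
                        .mq_mode = ETH_MQ_RX_RSS,
                },
                .rx_adv_conf = {
                        .rss_conf = {
                                .rss_key = NULL,
                                .rss_hf = ETH_RSS_IPV4 | ETH_RSS_IPV6,
                        },
                },
        };

        /* 4 RX queues fed by RSS, 1 TX queue. */
        return rte_eth_dev_configure(port_id, 4, 1, &conf);
}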
1806
1807 /*
1808  * Check whether the MAC type supports VMDq.
1809  * Return 1 if it does, 0 otherwise.
1810  */
1811 static int
1812 igb_is_vmdq_supported(const struct rte_eth_dev *dev)
1813 {
1814         const struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1815
1816         switch (hw->mac.type) {
1817         case e1000_82576:
1818         case e1000_82580:
1819         case e1000_i350:
1820                 return 1;
1821         case e1000_82540:
1822         case e1000_82541:
1823         case e1000_82542:
1824         case e1000_82543:
1825         case e1000_82544:
1826         case e1000_82545:
1827         case e1000_82546:
1828         case e1000_82547:
1829         case e1000_82571:
1830         case e1000_82572:
1831         case e1000_82573:
1832         case e1000_82574:
1833         case e1000_82583:
1834         case e1000_i210:
1835         case e1000_i211:
1836         default:
1837                 PMD_INIT_LOG(ERR, "Cannot support VMDq feature");
1838                 return 0;
1839         }
1840 }
1841
1842 static int
1843 igb_vmdq_rx_hw_configure(struct rte_eth_dev *dev)
1844 {
1845         struct rte_eth_vmdq_rx_conf *cfg;
1846         struct e1000_hw *hw;
1847         uint32_t mrqc, vt_ctl, vmolr, rctl;
1848         int i;
1849
1850         PMD_INIT_FUNC_TRACE();
1851
1852         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1853         cfg = &dev->data->dev_conf.rx_adv_conf.vmdq_rx_conf;
1854
1855         /* Check whether the MAC type supports VMDq; a return value of 0 means it does not */
1856         if (igb_is_vmdq_supported(dev) == 0)
1857                 return -1;
1858
1859         igb_rss_disable(dev);
1860
1861         /* RCTL: enable VLAN filter */
1862         rctl = E1000_READ_REG(hw, E1000_RCTL);
1863         rctl |= E1000_RCTL_VFE;
1864         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
1865
1866         /* MRQC: enable vmdq */
1867         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1868         mrqc |= E1000_MRQC_ENABLE_VMDQ;
1869         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1870
1871         /* VTCTL:  pool selection according to VLAN tag */
1872         vt_ctl = E1000_READ_REG(hw, E1000_VT_CTL);
1873         if (cfg->enable_default_pool)
1874                 vt_ctl |= (cfg->default_pool << E1000_VT_CTL_DEFAULT_POOL_SHIFT);
1875         vt_ctl |= E1000_VT_CTL_IGNORE_MAC;
1876         E1000_WRITE_REG(hw, E1000_VT_CTL, vt_ctl);
1877
1878         for (i = 0; i < E1000_VMOLR_SIZE; i++) {
1879                 vmolr = E1000_READ_REG(hw, E1000_VMOLR(i));
1880                 vmolr &= ~(E1000_VMOLR_AUPE | E1000_VMOLR_ROMPE |
1881                         E1000_VMOLR_ROPE | E1000_VMOLR_BAM |
1882                         E1000_VMOLR_MPME);
1883
1884                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_UNTAG)
1885                         vmolr |= E1000_VMOLR_AUPE;
1886                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_HASH_MC)
1887                         vmolr |= E1000_VMOLR_ROMPE;
1888                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_HASH_UC)
1889                         vmolr |= E1000_VMOLR_ROPE;
1890                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_BROADCAST)
1891                         vmolr |= E1000_VMOLR_BAM;
1892                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_MULTICAST)
1893                         vmolr |= E1000_VMOLR_MPME;
1894
1895                 E1000_WRITE_REG(hw, E1000_VMOLR(i), vmolr);
1896         }
1897
1898         /*
1899          * VMOLR: set STRVLAN to 1 when IGMAC in VT_CTL is set to 1.
1900          * Both the 82576 and the 82580 support it.
1901          */
1902         if (hw->mac.type != e1000_i350) {
1903                 for (i = 0; i < E1000_VMOLR_SIZE; i++) {
1904                         vmolr = E1000_READ_REG(hw, E1000_VMOLR(i));
1905                         vmolr |= E1000_VMOLR_STRVLAN;
1906                         E1000_WRITE_REG(hw, E1000_VMOLR(i), vmolr);
1907                 }
1908         }
1909
1910         /* VFTA - enable all vlan filters */
1911         for (i = 0; i < IGB_VFTA_SIZE; i++)
1912                 E1000_WRITE_REG(hw, (E1000_VFTA+(i*4)), UINT32_MAX);
1913
1914         /* VFRE: enable RX for all 8 pools; both 82576 and i350 support it */
1915         if (hw->mac.type != e1000_82580)
1916                 E1000_WRITE_REG(hw, E1000_VFRE, E1000_MBVFICR_VFREQ_MASK);
1917
1918         /*
1919          * RAH/RAL - allow pools to read specific mac addresses
1920          * In this case, all pools should be able to read from mac addr 0
1921          */
1922         E1000_WRITE_REG(hw, E1000_RAH(0), (E1000_RAH_AV | UINT16_MAX));
1923         E1000_WRITE_REG(hw, E1000_RAL(0), UINT32_MAX);
1924
1925         /* VLVF: set up filters for vlan tags as configured */
1926         for (i = 0; i < cfg->nb_pool_maps; i++) {
1927                 /* set vlan id in VF register and set the valid bit */
1928                 E1000_WRITE_REG(hw, E1000_VLVF(i), (E1000_VLVF_VLANID_ENABLE | \
1929                         (cfg->pool_map[i].vlan_id & ETH_VLAN_ID_MAX) | \
1930                         ((cfg->pool_map[i].pools << E1000_VLVF_POOLSEL_SHIFT ) & \
1931                         E1000_VLVF_POOLSEL_MASK)));
1932         }
1933
1934         E1000_WRITE_FLUSH(hw);
1935
1936         return 0;
1937 }
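/*
 * Illustrative usage sketch (not part of the driver): the VMDq RX setup above
 * is driven by the rte_eth_vmdq_rx_conf the application passes at configure
 * time.  The helper name, pool count, VLAN id and pool mapping are arbitrary
 * example values.
 */
static inline int
igb_example_enable_vmdq(uint8_t port_id)
{
        struct rte_eth_conf conf = {
                .rxmode = {
                        .mq_mode = ETH_MQ_RX_VMDQ_ONLY,
                },
                .rx_adv_conf = {
                        .vmdq_rx_conf = {
                                .nb_queue_pools = ETH_8_POOLS,
                                .enable_default_pool = 0,
                                .nb_pool_maps = 1,
                                .pool_map = {
                                        /* VLAN 100 -> pool 0 */
                                        { .vlan_id = 100, .pools = 1ULL << 0 },
                                },
                                .rx_mode = ETH_VMDQ_ACCEPT_UNTAG |
                                           ETH_VMDQ_ACCEPT_BROADCAST,
                        },
                },
        };

        return rte_eth_dev_configure(port_id, 8, 1, &conf);
}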
1938
1939
1940 /*********************************************************************
1941  *
1942  *  Enable receive unit.
1943  *
1944  **********************************************************************/
1945
1946 static int
1947 igb_alloc_rx_queue_mbufs(struct igb_rx_queue *rxq)
1948 {
1949         struct igb_rx_entry *rxe = rxq->sw_ring;
1950         uint64_t dma_addr;
1951         unsigned i;
1952
1953         /* Initialize software ring entries. */
1954         for (i = 0; i < rxq->nb_rx_desc; i++) {
1955                 volatile union e1000_adv_rx_desc *rxd;
1956                 struct rte_mbuf *mbuf = rte_mbuf_raw_alloc(rxq->mb_pool);
1957
1958                 if (mbuf == NULL) {
1959                         PMD_INIT_LOG(ERR, "RX mbuf alloc failed "
1960                                      "queue_id=%hu", rxq->queue_id);
1961                         return -ENOMEM;
1962                 }
1963                 dma_addr =
1964                         rte_cpu_to_le_64(rte_mbuf_data_dma_addr_default(mbuf));
1965                 rxd = &rxq->rx_ring[i];
1966                 rxd->read.hdr_addr = 0;
1967                 rxd->read.pkt_addr = dma_addr;
1968                 rxe[i].mbuf = mbuf;
1969         }
1970
1971         return 0;
1972 }
1973
1974 #define E1000_MRQC_DEF_Q_SHIFT               (3)
1975 static int
1976 igb_dev_mq_rx_configure(struct rte_eth_dev *dev)
1977 {
1978         struct e1000_hw *hw =
1979                 E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1980         uint32_t mrqc;
1981
1982         if (RTE_ETH_DEV_SRIOV(dev).active == ETH_8_POOLS) {
1983                 /*
1984                  * SRIOV active scheme
1985                  * FIXME if support RSS together with VMDq & SRIOV
1986                  */
1987                 mrqc = E1000_MRQC_ENABLE_VMDQ;
1988                 /* 011b Def_Q ignore, according to VT_CTL.DEF_PL */
1989                 mrqc |= 0x3 << E1000_MRQC_DEF_Q_SHIFT;
1990                 E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1991         } else if(RTE_ETH_DEV_SRIOV(dev).active == 0) {
1992                 /*
1993                  * SRIOV inactive scheme
1994                  */
1995                 switch (dev->data->dev_conf.rxmode.mq_mode) {
1996                         case ETH_MQ_RX_RSS:
1997                                 igb_rss_configure(dev);
1998                                 break;
1999                         case ETH_MQ_RX_VMDQ_ONLY:
2000                                 /* Configure general VMDq-only RX parameters */
2001                                 igb_vmdq_rx_hw_configure(dev);
2002                                 break;
2003                         case ETH_MQ_RX_NONE:
2004                                 /* If mq_mode is none, disable RSS. */
2005                         default:
2006                                 igb_rss_disable(dev);
2007                                 break;
2008                 }
2009         }
2010
2011         return 0;
2012 }
2013
2014 int
2015 eth_igb_rx_init(struct rte_eth_dev *dev)
2016 {
2017         struct e1000_hw     *hw;
2018         struct igb_rx_queue *rxq;
2019         uint32_t rctl;
2020         uint32_t rxcsum;
2021         uint32_t srrctl;
2022         uint16_t buf_size;
2023         uint16_t rctl_bsize;
2024         uint16_t i;
2025         int ret;
2026
2027         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2028         srrctl = 0;
2029
2030         /*
2031          * Make sure receives are disabled while setting
2032          * up the descriptor ring.
2033          */
2034         rctl = E1000_READ_REG(hw, E1000_RCTL);
2035         E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
2036
2037         /*
2038          * Configure support of jumbo frames, if any.
2039          */
2040         if (dev->data->dev_conf.rxmode.jumbo_frame == 1) {
2041                 rctl |= E1000_RCTL_LPE;
2042
2043                 /*
2044                  * Set the maximum packet length by default; it may be updated
2045                  * later when dual VLAN is enabled or disabled.
2046                  */
2047                 E1000_WRITE_REG(hw, E1000_RLPML,
2048                         dev->data->dev_conf.rxmode.max_rx_pkt_len +
2049                                                 VLAN_TAG_SIZE);
2050         } else
2051                 rctl &= ~E1000_RCTL_LPE;
2052
2053         /* Configure and enable each RX queue. */
2054         rctl_bsize = 0;
2055         dev->rx_pkt_burst = eth_igb_recv_pkts;
2056         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2057                 uint64_t bus_addr;
2058                 uint32_t rxdctl;
2059
2060                 rxq = dev->data->rx_queues[i];
2061
2062                 /* Allocate buffers for descriptor rings and set up queue */
2063                 ret = igb_alloc_rx_queue_mbufs(rxq);
2064                 if (ret)
2065                         return ret;
2066
2067                 /*
2068                  * Reset crc_len in case it was changed after queue setup by a
2069                  *  call to configure
2070                  */
2071                 rxq->crc_len =
2072                         (uint8_t)(dev->data->dev_conf.rxmode.hw_strip_crc ?
2073                                                         0 : ETHER_CRC_LEN);
2074
2075                 bus_addr = rxq->rx_ring_phys_addr;
2076                 E1000_WRITE_REG(hw, E1000_RDLEN(rxq->reg_idx),
2077                                 rxq->nb_rx_desc *
2078                                 sizeof(union e1000_adv_rx_desc));
2079                 E1000_WRITE_REG(hw, E1000_RDBAH(rxq->reg_idx),
2080                                 (uint32_t)(bus_addr >> 32));
2081                 E1000_WRITE_REG(hw, E1000_RDBAL(rxq->reg_idx), (uint32_t)bus_addr);
2082
2083                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
2084
2085                 /*
2086                  * Configure RX buffer size.
2087                  */
2088                 buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
2089                         RTE_PKTMBUF_HEADROOM);
2090                 if (buf_size >= 1024) {
2091                         /*
2092                          * Configure the BSIZEPACKET field of the SRRCTL
2093                          * register of the queue.
2094                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
2095                          * If this field is equal to 0b, then RCTL.BSIZE
2096                          * determines the RX packet buffer size.
2097                          */
2098                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
2099                                    E1000_SRRCTL_BSIZEPKT_MASK);
2100                         buf_size = (uint16_t) ((srrctl &
2101                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
2102                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
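                        /*
                         * Worked example: a 2048-byte data room yields
                         * 2048 >> 10 = 2 in BSIZEPACKET, which the hardware
                         * interprets as a 2 KB packet buffer (2 << 10).
                         */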
2103
2104                         /* Account for two VLAN tags (dual VLAN) when sizing the buffer */
2105                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
2106                                                 2 * VLAN_TAG_SIZE) > buf_size){
2107                                 if (!dev->data->scattered_rx)
2108                                         PMD_INIT_LOG(DEBUG,
2109                                                      "forcing scatter mode");
2110                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2111                                 dev->data->scattered_rx = 1;
2112                         }
2113                 } else {
2114                         /*
2115                          * Use BSIZE field of the device RCTL register.
2116                          */
2117                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
2118                                 rctl_bsize = buf_size;
2119                         if (!dev->data->scattered_rx)
2120                                 PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2121                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2122                         dev->data->scattered_rx = 1;
2123                 }
2124
2125                 /* Set whether packets are dropped when no descriptors are available */
2126                 if (rxq->drop_en)
2127                         srrctl |= E1000_SRRCTL_DROP_EN;
2128
2129                 E1000_WRITE_REG(hw, E1000_SRRCTL(rxq->reg_idx), srrctl);
2130
2131                 /* Enable this RX queue. */
2132                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(rxq->reg_idx));
2133                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2134                 rxdctl &= 0xFFF00000;
2135                 rxdctl |= (rxq->pthresh & 0x1F);
2136                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
2137                 rxdctl |= ((rxq->wthresh & 0x1F) << 16);
2138                 E1000_WRITE_REG(hw, E1000_RXDCTL(rxq->reg_idx), rxdctl);
2139         }
2140
2141         if (dev->data->dev_conf.rxmode.enable_scatter) {
2142                 if (!dev->data->scattered_rx)
2143                         PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2144                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2145                 dev->data->scattered_rx = 1;
2146         }
2147
2148         /*
2149          * Setup BSIZE field of RCTL register, if needed.
2150          * Buffer sizes >= 1024 are not [supposed to be] set up in the RCTL
2151          * register, since the code above configures the SRRCTL register of
2152          * the RX queue in such a case.
2153          * All configurable sizes are:
2154          * 16384: rctl |= (E1000_RCTL_SZ_16384 | E1000_RCTL_BSEX);
2155          *  8192: rctl |= (E1000_RCTL_SZ_8192  | E1000_RCTL_BSEX);
2156          *  4096: rctl |= (E1000_RCTL_SZ_4096  | E1000_RCTL_BSEX);
2157          *  2048: rctl |= E1000_RCTL_SZ_2048;
2158          *  1024: rctl |= E1000_RCTL_SZ_1024;
2159          *   512: rctl |= E1000_RCTL_SZ_512;
2160          *   256: rctl |= E1000_RCTL_SZ_256;
2161          */
2162         if (rctl_bsize > 0) {
2163                 if (rctl_bsize >= 512) /* 512 <= buf_size < 1024 - use 512 */
2164                         rctl |= E1000_RCTL_SZ_512;
2165                 else /* 256 <= buf_size < 512 - use 256 */
2166                         rctl |= E1000_RCTL_SZ_256;
2167         }
2168
2169         /*
2170          * Configure RSS if device configured with multiple RX queues.
2171          */
2172         igb_dev_mq_rx_configure(dev);
2173
2174         /* Update the rctl since igb_dev_mq_rx_configure may change its value */
2175         rctl |= E1000_READ_REG(hw, E1000_RCTL);
2176
2177         /*
2178          * Setup the Checksum Register.
2179          * Receive Full-Packet Checksum Offload is mutually exclusive with RSS.
2180          */
2181         rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
2182         rxcsum |= E1000_RXCSUM_PCSD;
2183
2184         /* Enable both L3/L4 rx checksum offload */
2185         if (dev->data->dev_conf.rxmode.hw_ip_checksum)
2186                 rxcsum |= (E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL |
2187                                 E1000_RXCSUM_CRCOFL);
2188         else
2189                 rxcsum &= ~(E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL |
2190                                 E1000_RXCSUM_CRCOFL);
2191         E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
2192
2193         /* Setup the Receive Control Register. */
2194         if (dev->data->dev_conf.rxmode.hw_strip_crc) {
2195                 rctl |= E1000_RCTL_SECRC; /* Strip Ethernet CRC. */
2196
2197                 /* set STRCRC bit in all queues */
2198                 if (hw->mac.type == e1000_i350 ||
2199                     hw->mac.type == e1000_i210 ||
2200                     hw->mac.type == e1000_i211 ||
2201                     hw->mac.type == e1000_i354) {
2202                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2203                                 rxq = dev->data->rx_queues[i];
2204                                 uint32_t dvmolr = E1000_READ_REG(hw,
2205                                         E1000_DVMOLR(rxq->reg_idx));
2206                                 dvmolr |= E1000_DVMOLR_STRCRC;
2207                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
2208                         }
2209                 }
2210         } else {
2211                 rctl &= ~E1000_RCTL_SECRC; /* Do not Strip Ethernet CRC. */
2212
2213                 /* clear STRCRC bit in all queues */
2214                 if (hw->mac.type == e1000_i350 ||
2215                     hw->mac.type == e1000_i210 ||
2216                     hw->mac.type == e1000_i211 ||
2217                     hw->mac.type == e1000_i354) {
2218                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2219                                 rxq = dev->data->rx_queues[i];
2220                                 uint32_t dvmolr = E1000_READ_REG(hw,
2221                                         E1000_DVMOLR(rxq->reg_idx));
2222                                 dvmolr &= ~E1000_DVMOLR_STRCRC;
2223                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
2224                         }
2225                 }
2226         }
2227
2228         rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
2229         rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
2230                 E1000_RCTL_RDMTS_HALF |
2231                 (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
2232
2233         /* Make sure VLAN Filters are off. */
2234         if (dev->data->dev_conf.rxmode.mq_mode != ETH_MQ_RX_VMDQ_ONLY)
2235                 rctl &= ~E1000_RCTL_VFE;
2236         /* Don't store bad packets. */
2237         rctl &= ~E1000_RCTL_SBP;
2238
2239         /* Enable Receives. */
2240         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
2241
2242         /*
2243          * Setup the HW Rx Head and Tail Descriptor Pointers.
2244          * This needs to be done after enable.
2245          */
2246         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2247                 rxq = dev->data->rx_queues[i];
2248                 E1000_WRITE_REG(hw, E1000_RDH(rxq->reg_idx), 0);
2249                 E1000_WRITE_REG(hw, E1000_RDT(rxq->reg_idx), rxq->nb_rx_desc - 1);
2250         }
2251
2252         return 0;
2253 }
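/*
 * Illustrative usage sketch (not part of the driver): eth_igb_rx_init() and
 * eth_igb_tx_init() below run from rte_eth_dev_start(), after the port has
 * been configured and its queues set up.  The helper name and the single
 * RX/TX queue with 512 descriptors are arbitrary example choices; "mp" is a
 * mempool the caller is assumed to have created.
 */
static inline int
igb_example_bring_up(uint8_t port_id, struct rte_mempool *mp)
{
        struct rte_eth_conf conf = {
                .rxmode = {
                        .mq_mode = ETH_MQ_RX_NONE,
                },
        };
        int ret;

        ret = rte_eth_dev_configure(port_id, 1, 1, &conf);
        if (ret == 0)
                ret = rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(),
                                             NULL, mp);
        if (ret == 0)
                ret = rte_eth_tx_queue_setup(port_id, 0, 512, rte_socket_id(),
                                             NULL);
        if (ret == 0)
                ret = rte_eth_dev_start(port_id); /* triggers rx/tx init */
        return ret;
}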
2254
2255 /*********************************************************************
2256  *
2257  *  Enable transmit unit.
2258  *
2259  **********************************************************************/
2260 void
2261 eth_igb_tx_init(struct rte_eth_dev *dev)
2262 {
2263         struct e1000_hw     *hw;
2264         struct igb_tx_queue *txq;
2265         uint32_t tctl;
2266         uint32_t txdctl;
2267         uint16_t i;
2268
2269         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2270
2271         /* Setup the Base and Length of the Tx Descriptor Rings. */
2272         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2273                 uint64_t bus_addr;
2274                 txq = dev->data->tx_queues[i];
2275                 bus_addr = txq->tx_ring_phys_addr;
2276
2277                 E1000_WRITE_REG(hw, E1000_TDLEN(txq->reg_idx),
2278                                 txq->nb_tx_desc *
2279                                 sizeof(union e1000_adv_tx_desc));
2280                 E1000_WRITE_REG(hw, E1000_TDBAH(txq->reg_idx),
2281                                 (uint32_t)(bus_addr >> 32));
2282                 E1000_WRITE_REG(hw, E1000_TDBAL(txq->reg_idx), (uint32_t)bus_addr);
2283
2284                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2285                 E1000_WRITE_REG(hw, E1000_TDT(txq->reg_idx), 0);
2286                 E1000_WRITE_REG(hw, E1000_TDH(txq->reg_idx), 0);
2287
2288                 /* Setup Transmit threshold registers. */
2289                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(txq->reg_idx));
2290                 txdctl |= txq->pthresh & 0x1F;
2291                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2292                 txdctl |= ((txq->wthresh & 0x1F) << 16);
2293                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2294                 E1000_WRITE_REG(hw, E1000_TXDCTL(txq->reg_idx), txdctl);
2295         }
2296
2297         /* Program the Transmit Control Register. */
2298         tctl = E1000_READ_REG(hw, E1000_TCTL);
2299         tctl &= ~E1000_TCTL_CT;
2300         tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
2301                  (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));
2302
2303         e1000_config_collision_dist(hw);
2304
2305         /* This write will effectively turn on the transmit unit. */
2306         E1000_WRITE_REG(hw, E1000_TCTL, tctl);
2307 }
2308
2309 /*********************************************************************
2310  *
2311  *  Enable VF receive unit.
2312  *
2313  **********************************************************************/
2314 int
2315 eth_igbvf_rx_init(struct rte_eth_dev *dev)
2316 {
2317         struct e1000_hw     *hw;
2318         struct igb_rx_queue *rxq;
2319         uint32_t srrctl;
2320         uint16_t buf_size;
2321         uint16_t rctl_bsize;
2322         uint16_t i;
2323         int ret;
2324
2325         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2326
2327         /* setup MTU */
2328         e1000_rlpml_set_vf(hw,
2329                 (uint16_t)(dev->data->dev_conf.rxmode.max_rx_pkt_len +
2330                 VLAN_TAG_SIZE));
2331
2332         /* Configure and enable each RX queue. */
2333         rctl_bsize = 0;
2334         dev->rx_pkt_burst = eth_igb_recv_pkts;
2335         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2336                 uint64_t bus_addr;
2337                 uint32_t rxdctl;
2338
2339                 rxq = dev->data->rx_queues[i];
2340
2341                 /* Allocate buffers for descriptor rings and set up queue */
2342                 ret = igb_alloc_rx_queue_mbufs(rxq);
2343                 if (ret)
2344                         return ret;
2345
2346                 bus_addr = rxq->rx_ring_phys_addr;
2347                 E1000_WRITE_REG(hw, E1000_RDLEN(i),
2348                                 rxq->nb_rx_desc *
2349                                 sizeof(union e1000_adv_rx_desc));
2350                 E1000_WRITE_REG(hw, E1000_RDBAH(i),
2351                                 (uint32_t)(bus_addr >> 32));
2352                 E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);
2353
2354                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
2355
2356                 /*
2357                  * Configure RX buffer size.
2358                  */
2359                 buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
2360                         RTE_PKTMBUF_HEADROOM);
2361                 if (buf_size >= 1024) {
2362                         /*
2363                          * Configure the BSIZEPACKET field of the SRRCTL
2364                          * register of the queue.
2365                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
2366                          * If this field is equal to 0b, then RCTL.BSIZE
2367                          * determines the RX packet buffer size.
2368                          */
2369                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
2370                                    E1000_SRRCTL_BSIZEPKT_MASK);
2371                         buf_size = (uint16_t) ((srrctl &
2372                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
2373                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
2374
2375                         /* Account for two VLAN tags (dual VLAN) when sizing the buffer */
2376                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
2377                                                 2 * VLAN_TAG_SIZE) > buf_size){
2378                                 if (!dev->data->scattered_rx)
2379                                         PMD_INIT_LOG(DEBUG,
2380                                                      "forcing scatter mode");
2381                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2382                                 dev->data->scattered_rx = 1;
2383                         }
2384                 } else {
2385                         /*
2386                          * Use BSIZE field of the device RCTL register.
2387                          */
2388                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
2389                                 rctl_bsize = buf_size;
2390                         if (!dev->data->scattered_rx)
2391                                 PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2392                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2393                         dev->data->scattered_rx = 1;
2394                 }
2395
2396                 /* Set whether packets are dropped when no descriptors are available */
2397                 if (rxq->drop_en)
2398                         srrctl |= E1000_SRRCTL_DROP_EN;
2399
2400                 E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);
2401
2402                 /* Enable this RX queue. */
2403                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
2404                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2405                 rxdctl &= 0xFFF00000;
2406                 rxdctl |= (rxq->pthresh & 0x1F);
2407                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
2408                 if (hw->mac.type == e1000_vfadapt) {
2409                         /*
2410                          * Workaround for the 82576 VF erratum:
2411                          * force WTHRESH to 1 to avoid descriptor
2412                          * write-back sometimes not being triggered.
2413                          */
2414                         rxdctl |= 0x10000;
2415                         PMD_INIT_LOG(DEBUG, "Force set RX WTHRESH to 1 !");
2416                 }
2417                 else
2418                         rxdctl |= ((rxq->wthresh & 0x1F) << 16);
2419                 E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
2420         }
2421
2422         if (dev->data->dev_conf.rxmode.enable_scatter) {
2423                 if (!dev->data->scattered_rx)
2424                         PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2425                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2426                 dev->data->scattered_rx = 1;
2427         }
2428
2429         /*
2430          * Setup the HW Rx Head and Tail Descriptor Pointers.
2431          * This needs to be done after enable.
2432          */
2433         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2434                 rxq = dev->data->rx_queues[i];
2435                 E1000_WRITE_REG(hw, E1000_RDH(i), 0);
2436                 E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);
2437         }
2438
2439         return 0;
2440 }
2441
2442 /*********************************************************************
2443  *
2444  *  Enable VF transmit unit.
2445  *
2446  **********************************************************************/
2447 void
2448 eth_igbvf_tx_init(struct rte_eth_dev *dev)
2449 {
2450         struct e1000_hw     *hw;
2451         struct igb_tx_queue *txq;
2452         uint32_t txdctl;
2453         uint16_t i;
2454
2455         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2456
2457         /* Setup the Base and Length of the Tx Descriptor Rings. */
2458         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2459                 uint64_t bus_addr;
2460
2461                 txq = dev->data->tx_queues[i];
2462                 bus_addr = txq->tx_ring_phys_addr;
2463                 E1000_WRITE_REG(hw, E1000_TDLEN(i),
2464                                 txq->nb_tx_desc *
2465                                 sizeof(union e1000_adv_tx_desc));
2466                 E1000_WRITE_REG(hw, E1000_TDBAH(i),
2467                                 (uint32_t)(bus_addr >> 32));
2468                 E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);
2469
2470                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2471                 E1000_WRITE_REG(hw, E1000_TDT(i), 0);
2472                 E1000_WRITE_REG(hw, E1000_TDH(i), 0);
2473
2474                 /* Setup Transmit threshold registers. */
2475                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
2476                 txdctl |= txq->pthresh & 0x1F;
2477                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2478                 if (hw->mac.type == e1000_82576) {
2479                         /*
2480                          * Workaround for the 82576 VF erratum:
2481                          * force WTHRESH to 1 to avoid descriptor
2482                          * write-back sometimes not being triggered.
2483                          */
2484                         txdctl |= 0x10000;
2485                         PMD_INIT_LOG(DEBUG, "Force set TX WTHRESH to 1 !");
2486                 }
2487                 else
2488                         txdctl |= ((txq->wthresh & 0x1F) << 16);
2489                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2490                 E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
2491         }
2492
2493 }
2494
2495 void
2496 igb_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
2497         struct rte_eth_rxq_info *qinfo)
2498 {
2499         struct igb_rx_queue *rxq;
2500
2501         rxq = dev->data->rx_queues[queue_id];
2502
2503         qinfo->mp = rxq->mb_pool;
2504         qinfo->scattered_rx = dev->data->scattered_rx;
2505         qinfo->nb_desc = rxq->nb_rx_desc;
2506
2507         qinfo->conf.rx_free_thresh = rxq->rx_free_thresh;
2508         qinfo->conf.rx_drop_en = rxq->drop_en;
2509 }
2510
2511 void
2512 igb_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
2513         struct rte_eth_txq_info *qinfo)
2514 {
2515         struct igb_tx_queue *txq;
2516
2517         txq = dev->data->tx_queues[queue_id];
2518
2519         qinfo->nb_desc = txq->nb_tx_desc;
2520
2521         qinfo->conf.tx_thresh.pthresh = txq->pthresh;
2522         qinfo->conf.tx_thresh.hthresh = txq->hthresh;
2523         qinfo->conf.tx_thresh.wthresh = txq->wthresh;
2524 }