4 * Copyright 2015 6WIND S.A.
5 * Copyright 2015 Mellanox.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of 6WIND S.A. nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
42 #pragma GCC diagnostic ignored "-Wpedantic"
44 #include <infiniband/verbs.h>
45 #include <infiniband/mlx5_hw.h>
46 #include <infiniband/arch.h>
48 #pragma GCC diagnostic error "-Wpedantic"
51 /* DPDK headers don't like -pedantic. */
53 #pragma GCC diagnostic ignored "-Wpedantic"
56 #include <rte_mempool.h>
57 #include <rte_prefetch.h>
58 #include <rte_common.h>
59 #include <rte_branch_prediction.h>
60 #include <rte_ether.h>
62 #pragma GCC diagnostic error "-Wpedantic"
66 #include "mlx5_utils.h"
67 #include "mlx5_rxtx.h"
68 #include "mlx5_autoconf.h"
69 #include "mlx5_defs.h"
73 check_cqe(volatile struct mlx5_cqe *cqe,
74 unsigned int cqes_n, const uint16_t ci)
75 __attribute__((always_inline));
77 static inline uint32_t
78 txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
79 __attribute__((always_inline));
82 mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe)
83 __attribute__((always_inline));
85 static inline uint32_t
86 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
87 __attribute__((always_inline));
90 mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
91 uint16_t cqe_cnt, uint32_t *rss_hash)
92 __attribute__((always_inline));
94 static inline uint32_t
95 rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
96 __attribute__((always_inline));
101 * Verify or set magic value in CQE.
check_cqe_seen(volatile struct mlx5_cqe *cqe)
	/* NOTE(review): source elided here -- the opening brace and the
	 * declarations of "ret" and "i" are not visible in this chunk. */
	static const uint8_t magic[] = "seen";
	/* View the CQE's reserved bytes as the area where the marker lives. */
	volatile uint8_t (*buf)[sizeof(cqe->rsvd3)] = &cqe->rsvd3;
	/* Compare the reserved area against "seen"; presumably on the first
	 * mismatch "ret" is cleared and the marker is written so the same
	 * erroneous CQE is reported only once -- confirm with full source. */
	for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i)
		if (!ret || (*buf)[i] != magic[i]) {
			(*buf)[i] = magic[i];
128 * Check whether CQE is valid.
133 * Size of completion queue.
138 * 0 on success, 1 on failure.
check_cqe(volatile struct mlx5_cqe *cqe,
	  unsigned int cqes_n, const uint16_t ci)
	/* NOTE(review): interior lines are elided in this chunk (opening
	 * brace, some braces/returns below). Code kept as visible. */
	uint16_t idx = ci & cqes_n;
	uint8_t op_own = cqe->op_own;
	uint8_t op_owner = MLX5_CQE_OWNER(op_own);
	uint8_t op_code = MLX5_CQE_OPCODE(op_own);
	/* Ownership bit must match the parity of the consumer index pass,
	 * otherwise the CQE has not been written by hardware yet. */
	if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
		return 1; /* No CQE. */
	if ((op_code == MLX5_CQE_RESP_ERR) ||
	    (op_code == MLX5_CQE_REQ_ERR)) {
		/* Reinterpret the CQE as an error CQE to read the syndrome. */
		volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe;
		uint8_t syndrome = err_cqe->syndrome;
		if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
		    (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
		/* Report unexpected error CQEs only once per CQE. */
		if (!check_cqe_seen(cqe))
			ERROR("unexpected CQE error %u (0x%02x)"
			      op_code, op_code, syndrome);
	} else if ((op_code != MLX5_CQE_RESP_SEND) &&
		   (op_code != MLX5_CQE_REQ)) {
		if (!check_cqe_seen(cqe))
			ERROR("unexpected CQE opcode %u (0x%02x)",
177 txq_complete(struct txq *txq) __attribute__((always_inline));
180 * Manage TX completions.
182 * When sending a burst, mlx5_tx_burst() posts several WRs.
185 * Pointer to TX queue structure.
txq_complete(struct txq *txq)
	/* NOTE(review): elided chunk -- loop header around the CQE scan and
	 * several braces/continues are not visible; code kept as seen. */
	const unsigned int elts_n = 1 << txq->elts_n;
	const unsigned int cqe_n = 1 << txq->cqe_n;
	const unsigned int cqe_cnt = cqe_n - 1;
	uint16_t elts_free = txq->elts_tail;
	uint16_t cq_ci = txq->cq_ci;
	volatile struct mlx5_cqe *cqe = NULL;
	volatile struct mlx5_wqe *wqe;
		volatile struct mlx5_cqe *tmp;
		/* Fetch the next CQE at the current consumer index. */
		tmp = &(*txq->cqes)[cq_ci & cqe_cnt];
		if (check_cqe(tmp, cqe_n, cq_ci))
		/* Compressed CQEs are not expected on the TX path. */
		if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
			if (!check_cqe_seen(cqe))
				ERROR("unexpected compressed CQE, TX stopped");
		if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
		    (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
			if (!check_cqe_seen(cqe))
				ERROR("unexpected error CQE, TX stopped");
	/* No completion consumed: nothing to free. */
	if (unlikely(cqe == NULL))
	/* Locate the WQE the completion refers to; its ctrl[3] carries the
	 * elts_head saved by the burst function (see mlx5_tx_burst). */
	wqe = &(*txq->wqes)[ntohs(cqe->wqe_counter) &
			    ((1 << txq->wqe_n) - 1)].hdr;
	elts_tail = wqe->ctrl[3];
	assert(elts_tail < (1 << txq->wqe_n));
	/* Free all mbufs completed by hardware up to elts_tail. */
	while (elts_free != elts_tail) {
		struct rte_mbuf *elt = (*txq->elts)[elts_free];
		unsigned int elts_free_next =
			(elts_free + 1) & (elts_n - 1);
		struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next];
		/* Poison the freed slot (debug aid). */
		memset(&(*txq->elts)[elts_free],
		       sizeof((*txq->elts)[elts_free]));
		RTE_MBUF_PREFETCH_TO_FREE(elt_next);
		/* Only one segment needs to be freed. */
		rte_pktmbuf_free_seg(elt);
		elts_free = elts_free_next;
	txq->elts_tail = elts_tail;
	/* Update the consumer index. */
	*txq->cq_db = htonl(cq_ci);
253 * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
254 * the cloned mbuf is allocated is returned instead.
260 * Memory pool where data is located for given mbuf.
static struct rte_mempool *
txq_mb2mp(struct rte_mbuf *buf)
	/* Indirect mbufs do not own their data; resolve the pool of the
	 * direct mbuf actually holding the buffer.
	 * NOTE(review): the fall-through return for direct mbufs is elided
	 * from this chunk. */
	if (unlikely(RTE_MBUF_INDIRECT(buf)))
		return rte_mbuf_from_indirect(buf)->pool;
271 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
272 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
273 * remove an entry first.
276 * Pointer to TX queue structure.
278 * Memory Pool for which a Memory Region lkey must be returned.
281 * mr->lkey on success, (uint32_t)-1 on failure.
static inline uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
	/* NOTE(review): opening brace, "i" declaration and loop breaks are
	 * elided in this chunk. */
	uint32_t lkey = (uint32_t)-1;
	/* Linear scan of the small per-queue MP->MR cache. */
	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
		if (unlikely(txq->mp2mr[i].mp == NULL)) {
			/* Unknown MP, add a new MR for it. */
		if (txq->mp2mr[i].mp == mp) {
			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
			assert(htonl(txq->mp2mr[i].mr->lkey) ==
			lkey = txq->mp2mr[i].lkey;
	/* Cache miss: register the mempool at slot i (slow path). */
	if (unlikely(lkey == (uint32_t)-1))
		lkey = txq_mp2mr_reg(txq, mp, i);
308 * Ring TX queue doorbell.
311 * Pointer to TX queue structure.
313 * Pointer to the last WQE posted in the NIC.
mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe)
	/* NOTE(review): the BlueFlame copy loop and memory barriers are
	 * elided from this chunk. */
	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
	volatile uint64_t *src = ((volatile uint64_t *)wqe);
	/* Publish the producer index in the doorbell record. */
	*txq->qp_db = htonl(txq->wqe_ci);
	/* Ensure ordering between DB record and BF copy. */
332 * Pointer to TX queue structure.
334 * CQE consumer index.
tx_prefetch_cqe(struct txq *txq, uint16_t ci)
	/* NOTE(review): the rte_prefetch0() call on "cqe" is elided here. */
	volatile struct mlx5_cqe *cqe;
	/* Compute the CQE address for index "ci" (ring size is 2^cqe_n). */
	cqe = &(*txq->cqes)[ci & ((1 << txq->cqe_n) - 1)];
349 * Pointer to TX queue structure.
351 * WQE consumer index.
tx_prefetch_wqe(struct txq *txq, uint16_t ci)
	/* NOTE(review): the rte_prefetch0() call(s) on "wqe" are elided. */
	volatile struct mlx5_wqe64 *wqe;
	/* Compute the WQE address for index "ci" (ring size is 2^wqe_n). */
	wqe = &(*txq->wqes)[ci & ((1 << txq->wqe_n) - 1)];
363 * DPDK callback for TX.
366 * Generic pointer to TX queue structure.
368 * Packets to transmit.
370 * Number of packets in array.
373 * Number of packets successfully transmitted (<= pkts_n).
mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
	/* NOTE(review): this chunk is elided -- loop headers, several
	 * declarations (i, j, max, comp, addr, length, ds, raw, end, room)
	 * and various braces/gotos are missing; code kept as visible. */
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const unsigned int elts_n = 1 << txq->elts_n;
	volatile struct mlx5_wqe *wqe = NULL;
	unsigned int segs_n = 0;
	struct rte_mbuf *buf = NULL;
	if (unlikely(!pkts_n))
	/* Prefetch first packet cacheline. */
	tx_prefetch_cqe(txq, txq->cq_ci);
	tx_prefetch_cqe(txq, txq->cq_ci + 1);
	rte_prefetch0(*pkts);
	/* Start processing. */
	/* Free slots available in the elts ring. */
	max = (elts_n - (elts_head - txq->elts_tail));
		volatile struct mlx5_wqe_data_seg *dseg = NULL;
#ifdef MLX5_PMD_SOFT_COUNTERS
		uint32_t total_length = 0;
		segs_n = buf->nb_segs;
		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		if (max < segs_n + 1)
		/* Current WQE slot for this packet. */
		wqe = &(*txq->wqes)[txq->wqe_ci &
				    ((1 << txq->wqe_n) - 1)].hdr;
		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
			rte_prefetch0(*pkts);
		addr = rte_pktmbuf_mtod(buf, uintptr_t);
		length = DATA_LEN(buf);
#ifdef MLX5_PMD_SOFT_COUNTERS
		total_length = length;
		/* Packets shorter than the Ethernet-header copy below cannot
		 * be sent; count them as output errors. */
		if (length < (MLX5_WQE_DWORD_SIZE + 2)) {
			txq->stats.oerrors++;
		/* Update element. */
		(*txq->elts)[elts_head] = buf;
		elts_head = (elts_head + 1) & (elts_n - 1);
		/* Prefetch next buffer data. */
			volatile void *pkt_addr;
			pkt_addr = rte_pktmbuf_mtod(*pkts, volatile void *);
			rte_prefetch0(pkt_addr);
		/* Should we enable HW CKSUM offload */
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
				MLX5_ETH_WQE_L3_CSUM |
				MLX5_ETH_WQE_L4_CSUM;
			wqe->eseg.cs_flags = 0;
		raw = (uint8_t *)(uintptr_t)&wqe->eseg.inline_hdr[0];
		/* Start the know and common part of the WQE structure. */
		wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
		/* Start by copying the Ethernet Header. */
		memcpy((uint8_t *)raw, ((uint8_t *)addr), 16);
		length -= MLX5_WQE_DWORD_SIZE;
		addr += MLX5_WQE_DWORD_SIZE;
		/* Replace the Ethernet type by the VLAN if necessary. */
		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
			/* 0x8100 is the 802.1Q TPID, placed before the TCI. */
			uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
			memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE -
			       &vlan, sizeof(vlan));
			addr -= sizeof(vlan);
			length += sizeof(vlan);
		/* Inline if enough room. */
		if (txq->max_inline != 0) {
				(uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
			uint16_t max_inline =
				txq->max_inline * RTE_CACHE_LINE_SIZE;
			uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
			raw += MLX5_WQE_DWORD_SIZE;
			room = end - (uintptr_t)raw;
			if (room > max_inline) {
				/* Round inline end down to a cache line. */
				uintptr_t addr_end = (addr + max_inline) &
					~(RTE_CACHE_LINE_SIZE - 1);
				uint16_t copy_b = ((addr_end - addr) > length) ?
				rte_memcpy((void *)raw, (void *)addr, copy_b);
				pkt_inline_sz += copy_b;
				assert(addr <= addr_end);
			/* Store the inlined packet size in the WQE. */
			wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
			 * 2 DWORDs consumed by the WQE header + 1 DSEG +
			 * the size of the inline part of the packet.
			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
				dseg = (struct mlx5_wqe_data_seg *)
					(ds * MLX5_WQE_DWORD_SIZE));
				/* Wrap dseg to the start of the WQE ring. */
				if ((uintptr_t)dseg >= end)
					dseg = (struct mlx5_wqe_data_seg *)
						((uintptr_t)&(*txq->wqes)[0]);
		} else if (!segs_n) {
			/*
			 * No inline has been done in the packet, only the
			 * Ethernet Header as been stored.
			 */
			wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
			dseg = (struct mlx5_wqe_data_seg *)
				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
		/* Add the remaining packet as a simple ds. */
		*dseg = (struct mlx5_wqe_data_seg) {
			.addr = htonll(addr),
			.byte_count = htonl(length),
			.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
		/*
		 * Spill on next WQE when the current one does not have
		 * enough room left. Size of WQE must a be a multiple
		 * of data segment size.
		 */
		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
			unsigned int n = (txq->wqe_ci + ((ds + 3) / 4)) &
				((1 << txq->wqe_n) - 1);
			dseg = (struct mlx5_wqe_data_seg *)
				((uintptr_t)&(*txq->wqes)[n]);
			tx_prefetch_wqe(txq, n + 1);
		length = DATA_LEN(buf);
#ifdef MLX5_PMD_SOFT_COUNTERS
		total_length += length;
		/* Store segment information. */
		*dseg = (struct mlx5_wqe_data_seg) {
			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
			.byte_count = htonl(length),
			.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
		(*txq->elts)[elts_head] = buf;
		elts_head = (elts_head + 1) & (elts_n - 1);
		/* Finalize the control segment: QP number and DS count. */
		wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
		/* One WQEBB holds 4 DWORD groups; round up. */
		txq->wqe_ci += (ds + 3) / 4;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += total_length;
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
	/* Check whether completion threshold has been reached. */
	comp = txq->elts_comp + i + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		/* Request completion on last WQE. */
		wqe->ctrl[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->ctrl[3] = elts_head;
	txq->elts_comp = comp;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
	/* Ring QP doorbell. */
	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)wqe);
	txq->elts_head = elts_head;
620 * Open a MPW session.
623 * Pointer to TX queue structure.
625 * Pointer to MPW session structure.
mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
	/* NOTE(review): opening brace and assignments of mpw->pkts_n/len
	 * (and ctrl[1]) are elided in this chunk. */
	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
	/* Extra data segments live in the WQE slot following the header. */
	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
		(volatile struct mlx5_wqe_data_seg (*)[])
		(uintptr_t)&(*txq->wqes)[(idx + 1) & ((1 << txq->wqe_n) - 1)];
	mpw->state = MLX5_MPW_STATE_OPENED;
	mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
	/* All packets in an MPW session share the same length (mss). */
	mpw->wqe->eseg.mss = htons(length);
	mpw->wqe->eseg.inline_hdr_sz = 0;
	mpw->wqe->eseg.rsvd0 = 0;
	mpw->wqe->eseg.rsvd1 = 0;
	mpw->wqe->eseg.rsvd2 = 0;
	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
				  (txq->wqe_ci << 8) | MLX5_OPCODE_TSO);
	mpw->wqe->ctrl[2] = 0;
	mpw->wqe->ctrl[3] = 0;
	/* First two data segments fit inside the header WQE... */
	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
	/* ...the remaining ones spill into the next WQE slot. */
	mpw->data.dseg[2] = &(*dseg)[0];
	mpw->data.dseg[3] = &(*dseg)[1];
	mpw->data.dseg[4] = &(*dseg)[2];
661 * Close a MPW session.
664 * Pointer to TX queue structure.
666 * Pointer to MPW session structure.
mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
	/* NOTE(review): opening brace and the wqe_ci advance between the
	 * state change and the prefetches are elided in this chunk. */
	unsigned int num = mpw->pkts_n;
	/*
	 * Store size in multiple of 16 bytes. Control and Ethernet segments
	 */
	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | (2 + num));
	mpw->state = MLX5_MPW_STATE_CLOSED;
	/* Warm up the WQE slots the next session will use. */
	tx_prefetch_wqe(txq, txq->wqe_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
688 * DPDK callback for TX with MPW support.
691 * Generic pointer to TX queue structure.
693 * Packets to transmit.
695 * Number of packets in array.
698 * Number of packets successfully transmitted (<= pkts_n).
mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
	/* NOTE(review): elided chunk -- loop headers and declarations of
	 * i, j, max, comp, length, addr and the per-segment do/while are
	 * not visible; code kept as seen. */
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const unsigned int elts_n = 1 << txq->elts_n;
	struct mlx5_mpw mpw = {
		.state = MLX5_MPW_STATE_CLOSED,
	if (unlikely(!pkts_n))
	/* Prefetch first packet cacheline. */
	tx_prefetch_cqe(txq, txq->cq_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
	/* Start processing. */
	max = (elts_n - (elts_head - txq->elts_tail));
		struct rte_mbuf *buf = *(pkts++);
		unsigned int elts_head_next;
		unsigned int segs_n = buf->nb_segs;
		uint32_t cs_flags = 0;
		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		if (max < segs_n + 1)
		/* Do not bother with large packets MPW cannot handle. */
		if (segs_n > MLX5_MPW_DSEG_MAX) {
			txq->stats.oerrors++;
		/* Should we enable HW CKSUM offload */
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
		/* Retrieve packet information. */
		length = PKT_LEN(buf);
		/* Start new session if packet differs. */
		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
		    ((mpw.len != length) ||
		     (mpw.wqe->eseg.cs_flags != cs_flags)))
			mlx5_mpw_close(txq, &mpw);
		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
			mlx5_mpw_new(txq, &mpw, length);
			mpw.wqe->eseg.cs_flags = cs_flags;
		/* Multi-segment packets must be alone in their MPW. */
		assert((segs_n == 1) || (mpw.pkts_n == 0));
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
			volatile struct mlx5_wqe_data_seg *dseg;
			elts_head_next = (elts_head + 1) & (elts_n - 1);
			(*txq->elts)[elts_head] = buf;
			/* Fill the next free data segment of the session. */
			dseg = mpw.data.dseg[mpw.pkts_n];
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			*dseg = (struct mlx5_wqe_data_seg){
				.byte_count = htonl(DATA_LEN(buf)),
				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
				.addr = htonll(addr),
			elts_head = elts_head_next;
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
			length += DATA_LEN(buf);
		assert(length == mpw.len);
		/* Session full: close it so the next packet opens a new one. */
		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
			mlx5_mpw_close(txq, &mpw);
		elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += length;
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
	/* Check whether completion threshold has been reached. */
	/* "j" includes both packets and segments. */
	comp = txq->elts_comp + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		volatile struct mlx5_wqe *wqe = mpw.wqe;
		/* Request completion on last WQE. */
		wqe->ctrl[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->ctrl[3] = elts_head;
	txq->elts_comp = comp;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
	/* Ring QP doorbell. */
	if (mpw.state == MLX5_MPW_STATE_OPENED)
		mlx5_mpw_close(txq, &mpw);
	mlx5_tx_dbrec(txq, mpw.wqe);
	txq->elts_head = elts_head;
830 * Open a MPW inline session.
833 * Pointer to TX queue structure.
835 * Pointer to MPW session structure.
mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
	/* NOTE(review): opening brace, pkts_n/len/total_len initialization
	 * and part of the ctrl[0] expression are elided in this chunk. */
	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
	struct mlx5_wqe_inl_small *inl;
	mpw->state = MLX5_MPW_INL_STATE_OPENED;
	mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
	mpw->wqe->ctrl[2] = 0;
	mpw->wqe->ctrl[3] = 0;
	/* All packets in an inline MPW session share the same length. */
	mpw->wqe->eseg.mss = htons(length);
	mpw->wqe->eseg.inline_hdr_sz = 0;
	mpw->wqe->eseg.cs_flags = 0;
	mpw->wqe->eseg.rsvd0 = 0;
	mpw->wqe->eseg.rsvd1 = 0;
	mpw->wqe->eseg.rsvd2 = 0;
	/* Inline data starts right after the control + Ethernet segments. */
	inl = (struct mlx5_wqe_inl_small *)
		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
	mpw->data.raw = (uint8_t *)&inl->raw;
867 * Close a MPW inline session.
870 * Pointer to TX queue structure.
872 * Pointer to MPW session structure.
mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
	/* NOTE(review): opening brace and the "size" declaration are elided
	 * in this chunk. */
	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
	/* Total WQE size: fixed header part plus the inlined payload. */
	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
	/*
	 * Store size in multiple of 16 bytes. Control and Ethernet segments
	 */
	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(size));
	mpw->state = MLX5_MPW_STATE_CLOSED;
	/* Byte count carries the inline flag expected by hardware. */
	inl->byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
	/* Advance by as many WQEBBs as the inline session consumed. */
	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
893 * DPDK callback for TX with MPW inline support.
896 * Generic pointer to TX queue structure.
898 * Packets to transmit.
900 * Number of packets in array.
903 * Number of packets successfully transmitted (<= pkts_n).
mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
	/* NOTE(review): elided chunk -- the pkts_n parameter line, loop
	 * headers, i/j/max/comp/length/addr declarations and several
	 * braces are not visible; code kept as seen. */
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const unsigned int elts_n = 1 << txq->elts_n;
	/* Remaining inline budget of the currently open inline session. */
	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
	struct mlx5_mpw mpw = {
		.state = MLX5_MPW_STATE_CLOSED,
	if (unlikely(!pkts_n))
	/* Prefetch first packet cacheline. */
	tx_prefetch_cqe(txq, txq->cq_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
	/* Start processing. */
	max = (elts_n - (elts_head - txq->elts_tail));
		struct rte_mbuf *buf = *(pkts++);
		unsigned int elts_head_next;
		unsigned int segs_n = buf->nb_segs;
		uint32_t cs_flags = 0;
		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		if (max < segs_n + 1)
		/* Do not bother with large packets MPW cannot handle. */
		if (segs_n > MLX5_MPW_DSEG_MAX) {
			txq->stats.oerrors++;
		/* Should we enable HW CKSUM offload */
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
		/* Retrieve packet information. */
		length = PKT_LEN(buf);
		/* Start new session if packet differs. */
		if (mpw.state == MLX5_MPW_STATE_OPENED) {
			if ((mpw.len != length) ||
			    (mpw.wqe->eseg.cs_flags != cs_flags))
				mlx5_mpw_close(txq, &mpw);
		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
			if ((mpw.len != length) ||
			    (length > inline_room) ||
			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
				mlx5_mpw_inline_close(txq, &mpw);
				txq->max_inline * RTE_CACHE_LINE_SIZE;
		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
			/* Too large to inline: fall back to a regular MPW. */
				(length > inline_room)) {
				mlx5_mpw_new(txq, &mpw, length);
				mpw.wqe->eseg.cs_flags = cs_flags;
				mlx5_mpw_inline_new(txq, &mpw, length);
				mpw.wqe->eseg.cs_flags = cs_flags;
		/* Multi-segment packets must be alone in their MPW. */
		assert((segs_n == 1) || (mpw.pkts_n == 0));
		if (mpw.state == MLX5_MPW_STATE_OPENED) {
			/* Non-inline path: reference the mbuf data by DSEG. */
			assert(inline_room ==
			       txq->max_inline * RTE_CACHE_LINE_SIZE);
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
				volatile struct mlx5_wqe_data_seg *dseg;
					(elts_head + 1) & (elts_n - 1);
				(*txq->elts)[elts_head] = buf;
				dseg = mpw.data.dseg[mpw.pkts_n];
				addr = rte_pktmbuf_mtod(buf, uintptr_t);
				*dseg = (struct mlx5_wqe_data_seg){
					.byte_count = htonl(DATA_LEN(buf)),
					.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
					.addr = htonll(addr),
				elts_head = elts_head_next;
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
				length += DATA_LEN(buf);
			assert(length == mpw.len);
			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
				mlx5_mpw_close(txq, &mpw);
			/* Inline path: copy packet data into the WQE ring. */
			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
			assert(length <= inline_room);
			assert(length == DATA_LEN(buf));
			elts_head_next = (elts_head + 1) & (elts_n - 1);
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			(*txq->elts)[elts_head] = buf;
			/* Maximum number of bytes before wrapping. */
			max = ((uintptr_t)&(*txq->wqes)[1 << txq->wqe_n] -
			       (uintptr_t)mpw.data.raw);
			/* Copy in two parts when the ring end is crossed. */
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					(volatile void *)&(*txq->wqes)[0];
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)(addr + max),
				mpw.data.raw += length - max;
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
				mpw.data.raw += length;
			/* Wrap the write pointer at the end of the ring. */
			if ((uintptr_t)mpw.data.raw ==
			    (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n])
					(volatile void *)&(*txq->wqes)[0];
			mpw.total_len += length;
			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
				mlx5_mpw_inline_close(txq, &mpw);
					txq->max_inline * RTE_CACHE_LINE_SIZE;
				inline_room -= length;
		elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += length;
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
	/* Check whether completion threshold has been reached. */
	/* "j" includes both packets and segments. */
	comp = txq->elts_comp + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		volatile struct mlx5_wqe *wqe = mpw.wqe;
		/* Request completion on last WQE. */
		wqe->ctrl[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->ctrl[3] = elts_head;
	txq->elts_comp = comp;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
	/* Ring QP doorbell. */
	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
		mlx5_mpw_inline_close(txq, &mpw);
	else if (mpw.state == MLX5_MPW_STATE_OPENED)
		mlx5_mpw_close(txq, &mpw);
	mlx5_tx_dbrec(txq, mpw.wqe);
	txq->elts_head = elts_head;
1101 * Translate RX completion flags to packet type.
1106 * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
1109 * Packet type for struct rte_mbuf.
static inline uint32_t
rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
	/* NOTE(review): opening brace, the pkt_type declaration, the
	 * TRANSPOSE() expressions and the return are partially elided. */
	uint16_t flags = ntohs(cqe->hdr_type_etc);
	/* Tunnelled packet: derive inner and outer L3 types separately. */
	if (cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) {
			  MLX5_CQE_RX_IPV4_PACKET,
			  RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN) |
			  MLX5_CQE_RX_IPV6_PACKET,
			  RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN);
		pkt_type |= ((cqe->pkt_info & MLX5_CQE_RX_OUTER_PACKET) ?
			     RTE_PTYPE_L3_IPV6_EXT_UNKNOWN :
			     RTE_PTYPE_L3_IPV4_EXT_UNKNOWN);
		/* Plain packet: map the L3 header type bits directly. */
			  MLX5_CQE_L3_HDR_TYPE_IPV6,
			  RTE_PTYPE_L3_IPV6_EXT_UNKNOWN) |
			  MLX5_CQE_L3_HDR_TYPE_IPV4,
			  RTE_PTYPE_L3_IPV4_EXT_UNKNOWN);
1141 * Get size of the next packet for a given CQE. For compressed CQEs, the
1142 * consumer index is updated only once all packets of the current one have
1146 * Pointer to RX queue.
1149 * @param[out] rss_hash
1150 * Packet RSS Hash result.
1153 * Packet size in bytes (0 if there is none), -1 in case of completion
mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
		 uint16_t cqe_cnt, uint32_t *rss_hash)
	/* NOTE(review): elided chunk -- len/ret/op_own declarations, some
	 * branch bodies and the return are not visible; code kept as seen. */
	struct rxq_zip *zip = &rxq->zip;
	uint16_t cqe_n = cqe_cnt + 1;
	/* Process compressed data in the CQE and mini arrays. */
		/* Mini-CQE array lives in the CQE slot at zip->ca. */
		volatile struct mlx5_mini_cqe8 (*mc)[8] =
			(volatile struct mlx5_mini_cqe8 (*)[8])
			(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt]);
		len = ntohl((*mc)[zip->ai & 7].byte_cnt);
		*rss_hash = ntohl((*mc)[zip->ai & 7].rx_hash_result);
		if ((++zip->ai & 7) == 0) {
			/*
			 * Increment consumer index to skip the number of
			 * CQEs consumed. Hardware leaves holes in the CQ
			 * ring for software use.
			 */
		/* End of the compressed session: invalidate the CQEs the
		 * session covered and resynchronize the consumer index. */
		if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
			uint16_t idx = rxq->cq_ci;
			uint16_t end = zip->cq_ci;
			while (idx != end) {
				(*rxq->cqes)[idx & cqe_cnt].op_own =
					MLX5_CQE_INVALIDATE;
			rxq->cq_ci = zip->cq_ci;
	/* No compressed data, get next CQE and verify if it is compressed. */
		ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
		if (unlikely(ret == 1))
		op_own = cqe->op_own;
		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
			volatile struct mlx5_mini_cqe8 (*mc)[8] =
				(volatile struct mlx5_mini_cqe8 (*)[8])
				(uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
			/* Fix endianness. */
			zip->cqe_cnt = ntohl(cqe->byte_cnt);
			/*
			 * Current mini array position is the one returned by
			 * If completion comprises several mini arrays, as a
			 * special case the second one is located 7 CQEs after
			 * the initial CQE instead of 8 for subsequent ones.
			 */
			zip->ca = rxq->cq_ci & cqe_cnt;
			zip->na = zip->ca + 7;
			/* Compute the next non compressed CQE. */
			zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
			/* Get packet size to return. */
			len = ntohl((*mc)[0].byte_cnt);
			*rss_hash = ntohl((*mc)[0].rx_hash_result);
			/* Regular (uncompressed) CQE. */
			len = ntohl(cqe->byte_cnt);
			*rss_hash = ntohl(cqe->rx_hash_res);
		/* Error while receiving packet. */
		if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
1240 * Translate RX completion flags to offload flags.
1243 * Pointer to RX queue structure.
1248 * Offload flags (ol_flags) for struct rte_mbuf.
static inline uint32_t
rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
	/* NOTE(review): opening brace, TRANSPOSE() call heads and the
	 * return statement are elided in this chunk. */
	uint32_t ol_flags = 0;
	uint16_t flags = ntohs(cqe->hdr_type_etc);
	/* Map HW header-valid bits to mbuf checksum-good flags. */
		  MLX5_CQE_RX_L3_HDR_VALID,
		  PKT_RX_IP_CKSUM_GOOD) |
		  MLX5_CQE_RX_L4_HDR_VALID,
		  PKT_RX_L4_CKSUM_GOOD);
	/* Tunnelled packets also report inner checksum validity when the
	 * queue has L2 tunnel checksum offload enabled. */
	if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
			  MLX5_CQE_RX_L3_HDR_VALID,
			  PKT_RX_IP_CKSUM_GOOD) |
			  MLX5_CQE_RX_L4_HDR_VALID,
			  PKT_RX_L4_CKSUM_GOOD);
1275 * DPDK callback for RX.
1278 * Generic pointer to RX queue structure.
1280 * Array to store received packets.
1282 * Maximum number of packets in array.
1285 * Number of packets successfully received (<= pkts_n).
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
	/* NOTE(review): elided chunk -- the main while-loop header, "i"
	 * declaration, several braces/breaks/continues and segment-chain
	 * bookkeeping lines are not visible; code kept as seen. */
	struct rxq *rxq = dpdk_rxq;
	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
	const unsigned int sges_n = rxq->sges_n;
	struct rte_mbuf *pkt = NULL;
	struct rte_mbuf *seg = NULL;
	volatile struct mlx5_cqe *cqe =
		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
	/* RQ index counts strides; each packet may span 2^sges_n strides. */
	unsigned int rq_ci = rxq->rq_ci << sges_n;
	int len = 0; /* keep its value across iterations. */
		unsigned int idx = rq_ci & wqe_cnt;
		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
		struct rte_mbuf *rep = (*rxq->elts)[idx];
		uint32_t rss_hash_res = 0;
		/* Allocate a replacement mbuf for the ring slot. */
		rep = rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(rep == NULL)) {
			++rxq->stats.rx_nombuf;
			/*
			 * no buffers before we even started,
			 * bail out silently.
			 */
			/* Drop the partially built segment chain. */
			while (pkt != seg) {
				assert(pkt != (*rxq->elts)[idx]);
				rte_mbuf_refcnt_set(pkt, 0);
				__rte_mbuf_raw_free(pkt);
			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt,
				/* No more completions: return the spare. */
				rte_mbuf_refcnt_set(rep, 0);
				__rte_mbuf_raw_free(rep);
			if (unlikely(len == -1)) {
				/* RX error, packet is likely too large. */
				rte_mbuf_refcnt_set(rep, 0);
				__rte_mbuf_raw_free(rep);
				++rxq->stats.idropped;
			assert(len >= (rxq->crc_present << 2));
			/* Update packet information. */
			pkt->packet_type = 0;
			if (rss_hash_res && rxq->rss_hash) {
				pkt->hash.rss = rss_hash_res;
				pkt->ol_flags = PKT_RX_RSS_HASH;
			/* Only compute ptype/ol_flags when some offload
			 * actually needs them. */
			if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
					rxq_cq_to_pkt_type(cqe);
					rxq_cq_to_ol_flags(rxq, cqe);
				if (ntohs(cqe->hdr_type_etc) &
				    MLX5_CQE_VLAN_STRIPPED) {
					pkt->ol_flags |= PKT_RX_VLAN_PKT |
						PKT_RX_VLAN_STRIPPED;
					pkt->vlan_tci = ntohs(cqe->vlan_info);
			if (rxq->crc_present)
				len -= ETHER_CRC_LEN;
		/* Copy descriptor metadata from the consumed mbuf to its
		 * replacement before swapping them in the ring. */
		DATA_LEN(rep) = DATA_LEN(seg);
		PKT_LEN(rep) = PKT_LEN(seg);
		SET_DATA_OFF(rep, DATA_OFF(seg));
		NB_SEGS(rep) = NB_SEGS(seg);
		PORT(rep) = PORT(seg);
		(*rxq->elts)[idx] = rep;
		/*
		 * Fill NIC descriptor with the new buffer. The lkey and size
		 * of the buffers are already known, only the buffer address
		 */
		wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
		/* More data than this segment holds: continue the chain. */
		if (len > DATA_LEN(seg)) {
			len -= DATA_LEN(seg);
		DATA_LEN(seg) = len;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += PKT_LEN(pkt);
		/* Return packet. */
	/* Align consumer index to the next stride. */
	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
	/* Update the consumer index. */
	rxq->rq_ci = rq_ci >> sges_n;
	*rxq->cq_db = htonl(rxq->cq_ci);
	*rxq->rq_db = htonl(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += i;
1428 * Dummy DPDK callback for TX.
1430 * This function is used to temporarily replace the real callback during
1431 * unsafe control operations on the queue, or in case of error.
1434 * Generic pointer to TX queue structure.
1436 * Packets to transmit.
1438 * Number of packets in array.
1441 * Number of packets successfully transmitted (<= pkts_n).
1444 removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1453 * Dummy DPDK callback for RX.
1455 * This function is used to temporarily replace the real callback during
1456 * unsafe control operations on the queue, or in case of error.
1459 * Generic pointer to RX queue structure.
1461 * Array to store received packets.
1463 * Maximum number of packets in array.
1466 * Number of packets successfully received (<= pkts_n).
1469 removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)