X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=drivers%2Fnet%2Fmlx5%2Fmlx5_rxtx.c;h=1bbce3b7544c0e1d41457287d755bf212ffd0f59;hb=6e7cbd63706f3435b9d9a2057a37db1da01db9a7;hp=9d1380a02132116f44e5064d4ee18e8ac62040ee;hpb=97f17497d162afdb82c8704bf097f0fee3724b2e;p=deb_dpdk.git diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index 9d1380a0..1bbce3b7 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -39,486 +39,1308 @@ /* Verbs header. */ /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ #ifdef PEDANTIC -#pragma GCC diagnostic ignored "-pedantic" +#pragma GCC diagnostic ignored "-Wpedantic" #endif #include +#include #ifdef PEDANTIC -#pragma GCC diagnostic error "-pedantic" +#pragma GCC diagnostic error "-Wpedantic" #endif -/* DPDK headers don't like -pedantic. */ -#ifdef PEDANTIC -#pragma GCC diagnostic ignored "-pedantic" -#endif #include #include #include #include #include -#include -#ifdef PEDANTIC -#pragma GCC diagnostic error "-pedantic" -#endif +#include #include "mlx5.h" #include "mlx5_utils.h" #include "mlx5_rxtx.h" #include "mlx5_autoconf.h" #include "mlx5_defs.h" +#include "mlx5_prm.h" + +static __rte_always_inline uint32_t +rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe); + +static __rte_always_inline int +mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, + uint16_t cqe_cnt, uint32_t *rss_hash); + +static __rte_always_inline uint32_t +rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe); + +uint32_t mlx5_ptype_table[] __rte_cache_aligned = { + [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */ +}; /** - * Manage TX completions. + * Build a table to translate Rx completion flags to packet type. * - * When sending a burst, mlx5_tx_burst() posts several WRs. - * To improve performance, a completion event is only required once every - * MLX5_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information - * for other WRs, but this information would not be used anyway. + * @note: fix mlx5_dev_supported_ptypes_get() if any change here. + */ +void +mlx5_set_ptype_table(void) +{ + unsigned int i; + uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table; + + /* Last entry must not be overwritten, reserved for errored packet. 
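+ * Entry 0xff is initialized to RTE_PTYPE_ALL_MASK above and is returned
+ * for CQEs reporting an error, so the loop below only resets indexes
+ * 0x00..0xfe before the known flag combinations are filled in.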
*/ + for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i) + (*p)[i] = RTE_PTYPE_UNKNOWN; + /* + * The index to the array should have: + * bit[1:0] = l3_hdr_type + * bit[4:2] = l4_hdr_type + * bit[5] = ip_frag + * bit[6] = tunneled + * bit[7] = outer_l3_type + */ + /* L2 */ + (*p)[0x00] = RTE_PTYPE_L2_ETHER; + /* L3 */ + (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG; + (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG; + /* Fragmented */ + (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG; + (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG; + /* TCP */ + (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + /* UDP */ + (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + /* Repeat with outer_l3_type being set. Just in case. */ + (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG; + (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG; + (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG; + (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG; + (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + /* Tunneled - L3 */ + (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG; + (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG; + (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG; + (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG; + /* Tunneled - Fragmented */ + (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG; + (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG; + (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + 
RTE_PTYPE_INNER_L4_FRAG; + (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG; + /* Tunneled - TCP */ + (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + /* Tunneled - UDP */ + (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP; + (*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP; + (*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP; + (*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP; +} + +/** + * Return the size of tailroom of WQ. * * @param txq * Pointer to TX queue structure. + * @param addr + * Pointer to tail of WQ. * * @return - * 0 on success, -1 on failure. + * Size of tailroom. */ -static int -txq_complete(struct txq *txq) +static inline size_t +tx_mlx5_wq_tailroom(struct mlx5_txq_data *txq, void *addr) { - unsigned int elts_comp = txq->elts_comp; - unsigned int elts_tail = txq->elts_tail; - unsigned int elts_free = txq->elts_tail; - const unsigned int elts_n = txq->elts_n; - int wcs_n; + size_t tailroom; + tailroom = (uintptr_t)(txq->wqes) + + (1 << txq->wqe_n) * MLX5_WQE_SIZE - + (uintptr_t)addr; + return tailroom; +} - if (unlikely(elts_comp == 0)) - return 0; -#ifdef DEBUG_SEND - DEBUG("%p: processing %u work requests completions", - (void *)txq, elts_comp); -#endif - wcs_n = txq->poll_cnt(txq->cq, elts_comp); - if (unlikely(wcs_n == 0)) - return 0; - if (unlikely(wcs_n < 0)) { - DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)", - (void *)txq, wcs_n); - return -1; - } - elts_comp -= wcs_n; - assert(elts_comp <= txq->elts_comp); - /* - * Assume WC status is successful as nothing can be done about it - * anyway. 
- */ - elts_tail += wcs_n * txq->elts_comp_cd_init; - if (elts_tail >= elts_n) - elts_tail -= elts_n; - - while (elts_free != elts_tail) { - struct txq_elt *elt = &(*txq->elts)[elts_free]; - unsigned int elts_free_next = - (((elts_free + 1) == elts_n) ? 0 : elts_free + 1); - struct rte_mbuf *tmp = elt->buf; - struct txq_elt *elt_next = &(*txq->elts)[elts_free_next]; - -#ifndef NDEBUG - /* Poisoning. */ - memset(elt, 0x66, sizeof(*elt)); -#endif - RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); - /* Faster than rte_pktmbuf_free(). */ - do { - struct rte_mbuf *next = NEXT(tmp); +/** + * Copy data to tailroom of circular queue. + * + * @param dst + * Pointer to destination. + * @param src + * Pointer to source. + * @param n + * Number of bytes to copy. + * @param base + * Pointer to head of queue. + * @param tailroom + * Size of tailroom from dst. + * + * @return + * Pointer after copied data. + */ +static inline void * +mlx5_copy_to_wq(void *dst, const void *src, size_t n, + void *base, size_t tailroom) +{ + void *ret; - rte_pktmbuf_free_seg(tmp); - tmp = next; - } while (tmp != NULL); - elts_free = elts_free_next; + if (n > tailroom) { + rte_memcpy(dst, src, tailroom); + rte_memcpy(base, (void *)((uintptr_t)src + tailroom), + n - tailroom); + ret = (uint8_t *)base + n - tailroom; + } else { + rte_memcpy(dst, src, n); + ret = (n == tailroom) ? base : (uint8_t *)dst + n; } - - txq->elts_tail = elts_tail; - txq->elts_comp = elts_comp; - return 0; + return ret; } -/* For best performance, this function should not be inlined. */ -struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, const struct rte_mempool *) - __attribute__((noinline)); - /** - * Register mempool as a memory region. + * DPDK callback to check the status of a tx descriptor. * - * @param pd - * Pointer to protection domain. - * @param mp - * Pointer to memory pool. + * @param tx_queue + * The tx queue. + * @param[in] offset + * The index of the descriptor in the ring. * * @return - * Memory region pointer, NULL in case of error. + * The status of the tx descriptor. */ -struct ibv_mr * -mlx5_mp2mr(struct ibv_pd *pd, const struct rte_mempool *mp) +int +mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) { - const struct rte_memseg *ms = rte_eal_get_physmem_layout(); - uintptr_t start = mp->elt_va_start; - uintptr_t end = mp->elt_va_end; - unsigned int i; + struct mlx5_txq_data *txq = tx_queue; + uint16_t used; - DEBUG("mempool %p area start=%p end=%p size=%zu", - (const void *)mp, (void *)start, (void *)end, - (size_t)(end - start)); - /* Round start and end to page boundary if found in memory segments. */ - for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) { - uintptr_t addr = (uintptr_t)ms[i].addr; - size_t len = ms[i].len; - unsigned int align = ms[i].hugepage_sz; - - if ((start > addr) && (start < addr + len)) - start = RTE_ALIGN_FLOOR(start, align); - if ((end > addr) && (end < addr + len)) - end = RTE_ALIGN_CEIL(end, align); - } - DEBUG("mempool %p using start=%p end=%p size=%zu for MR", - (const void *)mp, (void *)start, (void *)end, - (size_t)(end - start)); - return ibv_reg_mr(pd, - (void *)start, - end - start, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); + mlx5_tx_complete(txq); + used = txq->elts_head - txq->elts_tail; + if (offset < used) + return RTE_ETH_TX_DESC_FULL; + return RTE_ETH_TX_DESC_DONE; } /** - * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which - * the cloned mbuf is allocated is returned instead. + * DPDK callback to check the status of a rx descriptor. 
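+ * Walks valid CQEs from the current consumer index, including packets
+ * still held in a compressed CQE session, to count how many ring entries
+ * are in use before comparing against the requested offset.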
* - * @param buf - * Pointer to mbuf. + * @param rx_queue + * The rx queue. + * @param[in] offset + * The index of the descriptor in the ring. * * @return - * Memory pool where data is located for given mbuf. + * The status of the tx descriptor. */ -static struct rte_mempool * -txq_mb2mp(struct rte_mbuf *buf) +int +mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset) { - if (unlikely(RTE_MBUF_INDIRECT(buf))) - return rte_mbuf_from_indirect(buf)->pool; - return buf->pool; + struct mlx5_rxq_data *rxq = rx_queue; + struct rxq_zip *zip = &rxq->zip; + volatile struct mlx5_cqe *cqe; + const unsigned int cqe_n = (1 << rxq->cqe_n); + const unsigned int cqe_cnt = cqe_n - 1; + unsigned int cq_ci; + unsigned int used; + + /* if we are processing a compressed cqe */ + if (zip->ai) { + used = zip->cqe_cnt - zip->ca; + cq_ci = zip->cq_ci; + } else { + used = 0; + cq_ci = rxq->cq_ci; + } + cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; + while (check_cqe(cqe, cqe_n, cq_ci) == 0) { + int8_t op_own; + unsigned int n; + + op_own = cqe->op_own; + if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) + n = rte_be_to_cpu_32(cqe->byte_cnt); + else + n = 1; + cq_ci += n; + used += n; + cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; + } + used = RTE_MIN(used, (1U << rxq->elts_n) - 1); + if (offset < used) + return RTE_ETH_RX_DESC_DONE; + return RTE_ETH_RX_DESC_AVAIL; } /** - * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[]. - * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full, - * remove an entry first. + * DPDK callback for TX. * - * @param txq - * Pointer to TX queue structure. - * @param[in] mp - * Memory Pool for which a Memory Region lkey must be returned. + * @param dpdk_txq + * Generic pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. * * @return - * mr->lkey on success, (uint32_t)-1 on failure. + * Number of packets successfully transmitted (<= pkts_n). */ -static uint32_t -txq_mp2mr(struct txq *txq, const struct rte_mempool *mp) +uint16_t +mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) { - unsigned int i; - struct ibv_mr *mr; + struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq; + uint16_t elts_head = txq->elts_head; + const uint16_t elts_n = 1 << txq->elts_n; + const uint16_t elts_m = elts_n - 1; + unsigned int i = 0; + unsigned int j = 0; + unsigned int k = 0; + uint16_t max_elts; + unsigned int max_inline = txq->max_inline; + const unsigned int inline_en = !!max_inline && txq->inline_en; + uint16_t max_wqe; + unsigned int comp; + volatile struct mlx5_wqe_v *wqe = NULL; + volatile struct mlx5_wqe_ctrl *last_wqe = NULL; + unsigned int segs_n = 0; + struct rte_mbuf *buf = NULL; + uint8_t *raw; - for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) { - if (unlikely(txq->mp2mr[i].mp == NULL)) { - /* Unknown MP, add a new MR for it. */ + if (unlikely(!pkts_n)) + return 0; + /* Prefetch first packet cacheline. */ + rte_prefetch0(*pkts); + /* Start processing. */ + mlx5_tx_complete(txq); + max_elts = (elts_n - (elts_head - txq->elts_tail)); + max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); + if (unlikely(!max_wqe)) + return 0; + do { + volatile rte_v128u32_t *dseg = NULL; + uint32_t length; + unsigned int ds = 0; + unsigned int sg = 0; /* counter of additional segs attached. 
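+ * Extra mbuf segments beyond the first one; the ring accounting below
+ * advances by this amount only once every segment has been posted.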
*/ + uintptr_t addr; + uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2; + uint16_t tso_header_sz = 0; + uint16_t ehdr; + uint8_t cs_flags; + uint64_t tso = 0; + uint16_t tso_segsz = 0; +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t total_length = 0; +#endif + + /* first_seg */ + buf = *pkts; + segs_n = buf->nb_segs; + /* + * Make sure there is enough room to store this packet and + * that one ring entry remains unused. + */ + assert(segs_n); + if (max_elts < segs_n) + break; + max_elts -= segs_n; + --segs_n; + if (unlikely(--max_wqe == 0)) + break; + wqe = (volatile struct mlx5_wqe_v *) + tx_mlx5_wqe(txq, txq->wqe_ci); + rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); + if (pkts_n - i > 1) + rte_prefetch0(*(pkts + 1)); + addr = rte_pktmbuf_mtod(buf, uintptr_t); + length = DATA_LEN(buf); + ehdr = (((uint8_t *)addr)[1] << 8) | + ((uint8_t *)addr)[0]; +#ifdef MLX5_PMD_SOFT_COUNTERS + total_length = length; +#endif + if (length < (MLX5_WQE_DWORD_SIZE + 2)) { + txq->stats.oerrors++; break; } - if (txq->mp2mr[i].mp == mp) { - assert(txq->mp2mr[i].lkey != (uint32_t)-1); - assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey); - return txq->mp2mr[i].lkey; + /* Update element. */ + (*txq->elts)[elts_head & elts_m] = buf; + /* Prefetch next buffer data. */ + if (pkts_n - i > 1) + rte_prefetch0( + rte_pktmbuf_mtod(*(pkts + 1), volatile void *)); + cs_flags = txq_ol_cksum_to_cs(txq, buf); + raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE; + /* Replace the Ethernet type by the VLAN if necessary. */ + if (buf->ol_flags & PKT_TX_VLAN_PKT) { + uint32_t vlan = rte_cpu_to_be_32(0x81000000 | + buf->vlan_tci); + unsigned int len = 2 * ETHER_ADDR_LEN - 2; + + addr += 2; + length -= 2; + /* Copy Destination and source mac address. */ + memcpy((uint8_t *)raw, ((uint8_t *)addr), len); + /* Copy VLAN. */ + memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan)); + /* Copy missing two bytes to end the DSeg. */ + memcpy((uint8_t *)raw + len + sizeof(vlan), + ((uint8_t *)addr) + len, 2); + addr += len + 2; + length -= (len + 2); + } else { + memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2, + MLX5_WQE_DWORD_SIZE); + length -= pkt_inline_sz; + addr += pkt_inline_sz; } + raw += MLX5_WQE_DWORD_SIZE; + if (txq->tso_en) { + tso = buf->ol_flags & PKT_TX_TCP_SEG; + if (tso) { + uintptr_t end = (uintptr_t) + (((uintptr_t)txq->wqes) + + (1 << txq->wqe_n) * + MLX5_WQE_SIZE); + unsigned int copy_b; + uint8_t vlan_sz = (buf->ol_flags & + PKT_TX_VLAN_PKT) ? 4 : 0; + const uint64_t is_tunneled = + buf->ol_flags & + (PKT_TX_TUNNEL_GRE | + PKT_TX_TUNNEL_VXLAN); + + tso_header_sz = buf->l2_len + vlan_sz + + buf->l3_len + buf->l4_len; + tso_segsz = buf->tso_segsz; + if (unlikely(tso_segsz == 0)) { + txq->stats.oerrors++; + break; + } + if (is_tunneled && txq->tunnel_en) { + tso_header_sz += buf->outer_l2_len + + buf->outer_l3_len; + cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM; + } else { + cs_flags |= MLX5_ETH_WQE_L4_CSUM; + } + if (unlikely(tso_header_sz > + MLX5_MAX_TSO_HEADER)) { + txq->stats.oerrors++; + break; + } + copy_b = tso_header_sz - pkt_inline_sz; + /* First seg must contain all headers. */ + assert(copy_b <= length); + if (copy_b && + ((end - (uintptr_t)raw) > copy_b)) { + uint16_t n = (MLX5_WQE_DS(copy_b) - + 1 + 3) / 4; + + if (unlikely(max_wqe < n)) + break; + max_wqe -= n; + rte_memcpy((void *)raw, + (void *)addr, copy_b); + addr += copy_b; + length -= copy_b; + /* Include padding for TSO header. */ + copy_b = MLX5_WQE_DS(copy_b) * + MLX5_WQE_DWORD_SIZE; + pkt_inline_sz += copy_b; + raw += copy_b; + } else { + /* NOP WQE. 
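+ * Posted as a single-DS filler when the remaining TSO header cannot be
+ * copied contiguously into the tailroom of the WQ; counted in k so the
+ * completion bookkeeping still accounts for the consumed WQE.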
*/ + wqe->ctrl = (rte_v128u32_t){ + rte_cpu_to_be_32( + txq->wqe_ci << 8), + rte_cpu_to_be_32( + txq->qp_num_8s | 1), + 0, + 0, + }; + ds = 1; +#ifdef MLX5_PMD_SOFT_COUNTERS + total_length = 0; +#endif + k++; + goto next_wqe; + } + } + } + /* Inline if enough room. */ + if (inline_en || tso) { + uint32_t inl; + uintptr_t end = (uintptr_t) + (((uintptr_t)txq->wqes) + + (1 << txq->wqe_n) * MLX5_WQE_SIZE); + unsigned int inline_room = max_inline * + RTE_CACHE_LINE_SIZE - + (pkt_inline_sz - 2) - + !!tso * sizeof(inl); + uintptr_t addr_end = (addr + inline_room) & + ~(RTE_CACHE_LINE_SIZE - 1); + unsigned int copy_b = (addr_end > addr) ? + RTE_MIN((addr_end - addr), length) : + 0; + + if (copy_b && ((end - (uintptr_t)raw) > copy_b)) { + /* + * One Dseg remains in the current WQE. To + * keep the computation positive, it is + * removed after the bytes to Dseg conversion. + */ + uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4; + + if (unlikely(max_wqe < n)) + break; + max_wqe -= n; + if (tso) { + inl = rte_cpu_to_be_32(copy_b | + MLX5_INLINE_SEG); + rte_memcpy((void *)raw, + (void *)&inl, sizeof(inl)); + raw += sizeof(inl); + pkt_inline_sz += sizeof(inl); + } + rte_memcpy((void *)raw, (void *)addr, copy_b); + addr += copy_b; + length -= copy_b; + pkt_inline_sz += copy_b; + } + /* + * 2 DWORDs consumed by the WQE header + ETH segment + + * the size of the inline part of the packet. + */ + ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2); + if (length > 0) { + if (ds % (MLX5_WQE_SIZE / + MLX5_WQE_DWORD_SIZE) == 0) { + if (unlikely(--max_wqe == 0)) + break; + dseg = (volatile rte_v128u32_t *) + tx_mlx5_wqe(txq, txq->wqe_ci + + ds / 4); + } else { + dseg = (volatile rte_v128u32_t *) + ((uintptr_t)wqe + + (ds * MLX5_WQE_DWORD_SIZE)); + } + goto use_dseg; + } else if (!segs_n) { + goto next_pkt; + } else { + /* dseg will be advance as part of next_seg */ + dseg = (volatile rte_v128u32_t *) + ((uintptr_t)wqe + + ((ds - 1) * MLX5_WQE_DWORD_SIZE)); + goto next_seg; + } + } else { + /* + * No inline has been done in the packet, only the + * Ethernet Header as been stored. + */ + dseg = (volatile rte_v128u32_t *) + ((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE)); + ds = 3; +use_dseg: + /* Add the remaining packet as a simple ds. */ + addr = rte_cpu_to_be_64(addr); + *dseg = (rte_v128u32_t){ + rte_cpu_to_be_32(length), + mlx5_tx_mb2mr(txq, buf), + addr, + addr >> 32, + }; + ++ds; + if (!segs_n) + goto next_pkt; + } +next_seg: + assert(buf); + assert(ds); + assert(wqe); + /* + * Spill on next WQE when the current one does not have + * enough room left. Size of WQE must a be a multiple + * of data segment size. + */ + assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE)); + if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) { + if (unlikely(--max_wqe == 0)) + break; + dseg = (volatile rte_v128u32_t *) + tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4); + rte_prefetch0(tx_mlx5_wqe(txq, + txq->wqe_ci + ds / 4 + 1)); + } else { + ++dseg; + } + ++ds; + buf = buf->next; + assert(buf); + length = DATA_LEN(buf); +#ifdef MLX5_PMD_SOFT_COUNTERS + total_length += length; +#endif + /* Store segment information. */ + addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t)); + *dseg = (rte_v128u32_t){ + rte_cpu_to_be_32(length), + mlx5_tx_mb2mr(txq, buf), + addr, + addr >> 32, + }; + (*txq->elts)[++elts_head & elts_m] = buf; + ++sg; + /* Advance counter only if all segs are successfully posted. 
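+ * j then feeds both elts_head and the completion threshold below, so
+ * every consumed ring entry is accounted for.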
*/ + if (sg < segs_n) + goto next_seg; + else + j += sg; +next_pkt: + if (ds > MLX5_DSEG_MAX) { + txq->stats.oerrors++; + break; + } + ++elts_head; + ++pkts; + ++i; + /* Initialize known and common part of the WQE structure. */ + if (tso) { + wqe->ctrl = (rte_v128u32_t){ + rte_cpu_to_be_32((txq->wqe_ci << 8) | + MLX5_OPCODE_TSO), + rte_cpu_to_be_32(txq->qp_num_8s | ds), + 0, + 0, + }; + wqe->eseg = (rte_v128u32_t){ + 0, + cs_flags | (rte_cpu_to_be_16(tso_segsz) << 16), + 0, + (ehdr << 16) | rte_cpu_to_be_16(tso_header_sz), + }; + } else { + wqe->ctrl = (rte_v128u32_t){ + rte_cpu_to_be_32((txq->wqe_ci << 8) | + MLX5_OPCODE_SEND), + rte_cpu_to_be_32(txq->qp_num_8s | ds), + 0, + 0, + }; + wqe->eseg = (rte_v128u32_t){ + 0, + cs_flags, + 0, + (ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz), + }; + } +next_wqe: + txq->wqe_ci += (ds + 3) / 4; + /* Save the last successful WQE for completion request */ + last_wqe = (volatile struct mlx5_wqe_ctrl *)wqe; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment sent bytes counter. */ + txq->stats.obytes += total_length; +#endif + } while (i < pkts_n); + /* Take a shortcut if nothing must be sent. */ + if (unlikely((i + k) == 0)) + return 0; + txq->elts_head += (i + j); + /* Check whether completion threshold has been reached. */ + comp = txq->elts_comp + i + j + k; + if (comp >= MLX5_TX_COMP_THRESH) { + /* A CQE slot must always be available. */ + assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci)); + /* Request completion on last WQE. */ + last_wqe->ctrl2 = rte_cpu_to_be_32(8); + /* Save elts_head in unused "immediate" field of WQE. */ + last_wqe->ctrl3 = txq->elts_head; + txq->elts_comp = 0; + } else { + txq->elts_comp = comp; } - /* Add a new entry, register MR first. */ - DEBUG("%p: discovered new memory pool \"%s\" (%p)", - (void *)txq, mp->name, (const void *)mp); - mr = mlx5_mp2mr(txq->priv->pd, mp); - if (unlikely(mr == NULL)) { - DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.", - (void *)txq); - return (uint32_t)-1; - } - if (unlikely(i == RTE_DIM(txq->mp2mr))) { - /* Table is full, remove oldest entry. */ - DEBUG("%p: MR <-> MP table full, dropping oldest entry.", - (void *)txq); - --i; - claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr)); - memmove(&txq->mp2mr[0], &txq->mp2mr[1], - (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0]))); - } - /* Store the new entry. */ - txq->mp2mr[i].mp = mp; - txq->mp2mr[i].mr = mr; - txq->mp2mr[i].lkey = mr->lkey; - DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32, - (void *)txq, mp->name, (const void *)mp, txq->mp2mr[i].lkey); - return txq->mp2mr[i].lkey; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment sent packets counter. */ + txq->stats.opackets += i; +#endif + /* Ring QP doorbell. */ + mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe); + return i; } -struct txq_mp2mr_mbuf_check_data { - const struct rte_mempool *mp; - int ret; -}; - /** - * Callback function for rte_mempool_obj_iter() to check whether a given - * mempool object looks like a mbuf. + * Open a MPW session. * - * @param[in, out] arg - * Context data (struct txq_mp2mr_mbuf_check_data). Contains mempool pointer - * and return value. - * @param[in] start - * Object start address. - * @param[in] end - * Object end address. - * @param index - * Unused. - * - * @return - * Nonzero value when object is not a mbuf. + * @param txq + * Pointer to TX queue structure. + * @param mpw + * Pointer to MPW session structure. + * @param length + * Packet length. 
*/ -static void -txq_mp2mr_mbuf_check(void *arg, void *start, void *end, - uint32_t index __rte_unused) +static inline void +mlx5_mpw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, uint32_t length) { - struct txq_mp2mr_mbuf_check_data *data = arg; - struct rte_mbuf *buf = - (void *)((uintptr_t)start + data->mp->header_size); - - (void)index; - /* Check whether mbuf structure fits element size and whether mempool - * pointer is valid. */ - if (((uintptr_t)end >= (uintptr_t)(buf + 1)) && - (buf->pool == data->mp)) - data->ret = 0; - else - data->ret = -1; + uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1); + volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] = + (volatile struct mlx5_wqe_data_seg (*)[]) + tx_mlx5_wqe(txq, idx + 1); + + mpw->state = MLX5_MPW_STATE_OPENED; + mpw->pkts_n = 0; + mpw->len = length; + mpw->total_len = 0; + mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); + mpw->wqe->eseg.mss = rte_cpu_to_be_16(length); + mpw->wqe->eseg.inline_hdr_sz = 0; + mpw->wqe->eseg.rsvd0 = 0; + mpw->wqe->eseg.rsvd1 = 0; + mpw->wqe->eseg.rsvd2 = 0; + mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) | + (txq->wqe_ci << 8) | + MLX5_OPCODE_TSO); + mpw->wqe->ctrl[2] = 0; + mpw->wqe->ctrl[3] = 0; + mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *) + (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE)); + mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *) + (((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE)); + mpw->data.dseg[2] = &(*dseg)[0]; + mpw->data.dseg[3] = &(*dseg)[1]; + mpw->data.dseg[4] = &(*dseg)[2]; } /** - * Iterator function for rte_mempool_walk() to register existing mempools and - * fill the MP to MR cache of a TX queue. + * Close a MPW session. * - * @param[in] mp - * Memory Pool to register. - * @param *arg + * @param txq * Pointer to TX queue structure. + * @param mpw + * Pointer to MPW session structure. */ -void -txq_mp2mr_iter(const struct rte_mempool *mp, void *arg) +static inline void +mlx5_mpw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw) { - struct txq *txq = arg; - struct txq_mp2mr_mbuf_check_data data = { - .mp = mp, - .ret = -1, - }; + unsigned int num = mpw->pkts_n; - /* Discard empty mempools. */ - if (mp->size == 0) - return; - /* Register mempool only if the first element looks like a mbuf. */ - rte_mempool_obj_iter((void *)mp->elt_va_start, - 1, - mp->header_size + mp->elt_size + mp->trailer_size, - 1, - mp->elt_pa, - mp->pg_num, - mp->pg_shift, - txq_mp2mr_mbuf_check, - &data); - if (data.ret) - return; - txq_mp2mr(txq, mp); + /* + * Store size in multiple of 16 bytes. Control and Ethernet segments + * count as 2. + */ + mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | (2 + num)); + mpw->state = MLX5_MPW_STATE_CLOSED; + if (num < 3) + ++txq->wqe_ci; + else + txq->wqe_ci += 2; + rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); + rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); } /** - * Insert VLAN using mbuf headroom space. + * DPDK callback for TX with MPW support. * - * @param buf - * Buffer for VLAN insertion. + * @param dpdk_txq + * Generic pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. * * @return - * 0 on success, errno value on failure. + * Number of packets successfully transmitted (<= pkts_n). 
*/ -static inline int -insert_vlan_sw(struct rte_mbuf *buf) +uint16_t +mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) { - uintptr_t addr; - uint32_t vlan; - uint16_t head_room_len = rte_pktmbuf_headroom(buf); + struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq; + uint16_t elts_head = txq->elts_head; + const uint16_t elts_n = 1 << txq->elts_n; + const uint16_t elts_m = elts_n - 1; + unsigned int i = 0; + unsigned int j = 0; + uint16_t max_elts; + uint16_t max_wqe; + unsigned int comp; + struct mlx5_mpw mpw = { + .state = MLX5_MPW_STATE_CLOSED, + }; - if (head_room_len < 4) - return EINVAL; + if (unlikely(!pkts_n)) + return 0; + /* Prefetch first packet cacheline. */ + rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); + rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); + /* Start processing. */ + mlx5_tx_complete(txq); + max_elts = (elts_n - (elts_head - txq->elts_tail)); + max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); + if (unlikely(!max_wqe)) + return 0; + do { + struct rte_mbuf *buf = *(pkts++); + uint32_t length; + unsigned int segs_n = buf->nb_segs; + uint32_t cs_flags; - addr = rte_pktmbuf_mtod(buf, uintptr_t); - vlan = htonl(0x81000000 | buf->vlan_tci); - memmove((void *)(addr - 4), (void *)addr, 12); - memcpy((void *)(addr + 8), &vlan, sizeof(vlan)); + /* + * Make sure there is enough room to store this packet and + * that one ring entry remains unused. + */ + assert(segs_n); + if (max_elts < segs_n) + break; + /* Do not bother with large packets MPW cannot handle. */ + if (segs_n > MLX5_MPW_DSEG_MAX) { + txq->stats.oerrors++; + break; + } + max_elts -= segs_n; + --pkts_n; + cs_flags = txq_ol_cksum_to_cs(txq, buf); + /* Retrieve packet information. */ + length = PKT_LEN(buf); + assert(length); + /* Start new session if packet differs. */ + if ((mpw.state == MLX5_MPW_STATE_OPENED) && + ((mpw.len != length) || + (segs_n != 1) || + (mpw.wqe->eseg.cs_flags != cs_flags))) + mlx5_mpw_close(txq, &mpw); + if (mpw.state == MLX5_MPW_STATE_CLOSED) { + /* + * Multi-Packet WQE consumes at most two WQE. + * mlx5_mpw_new() expects to be able to use such + * resources. + */ + if (unlikely(max_wqe < 2)) + break; + max_wqe -= 2; + mlx5_mpw_new(txq, &mpw, length); + mpw.wqe->eseg.cs_flags = cs_flags; + } + /* Multi-segment packets must be alone in their MPW. */ + assert((segs_n == 1) || (mpw.pkts_n == 0)); +#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) + length = 0; +#endif + do { + volatile struct mlx5_wqe_data_seg *dseg; + uintptr_t addr; - SET_DATA_OFF(buf, head_room_len - 4); - DATA_LEN(buf) += 4; + assert(buf); + (*txq->elts)[elts_head++ & elts_m] = buf; + dseg = mpw.data.dseg[mpw.pkts_n]; + addr = rte_pktmbuf_mtod(buf, uintptr_t); + *dseg = (struct mlx5_wqe_data_seg){ + .byte_count = rte_cpu_to_be_32(DATA_LEN(buf)), + .lkey = mlx5_tx_mb2mr(txq, buf), + .addr = rte_cpu_to_be_64(addr), + }; +#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) + length += DATA_LEN(buf); +#endif + buf = buf->next; + ++mpw.pkts_n; + ++j; + } while (--segs_n); + assert(length == mpw.len); + if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) + mlx5_mpw_close(txq, &mpw); +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment sent bytes counter. */ + txq->stats.obytes += length; +#endif + ++i; + } while (pkts_n); + /* Take a shortcut if nothing must be sent. */ + if (unlikely(i == 0)) + return 0; + /* Check whether completion threshold has been reached. */ + /* "j" includes both packets and segments. 
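+ * A completion is requested only once MLX5_TX_COMP_THRESH entries have
+ * accumulated; elts_head is stashed in the otherwise unused immediate
+ * field so the completion handler can tell how far the ring may be freed.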
*/ + comp = txq->elts_comp + j; + if (comp >= MLX5_TX_COMP_THRESH) { + volatile struct mlx5_wqe *wqe = mpw.wqe; - return 0; + /* A CQE slot must always be available. */ + assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci)); + /* Request completion on last WQE. */ + wqe->ctrl[2] = rte_cpu_to_be_32(8); + /* Save elts_head in unused "immediate" field of WQE. */ + wqe->ctrl[3] = elts_head; + txq->elts_comp = 0; + } else { + txq->elts_comp = comp; + } +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment sent packets counter. */ + txq->stats.opackets += i; +#endif + /* Ring QP doorbell. */ + if (mpw.state == MLX5_MPW_STATE_OPENED) + mlx5_mpw_close(txq, &mpw); + mlx5_tx_dbrec(txq, mpw.wqe); + txq->elts_head = elts_head; + return i; } -#if MLX5_PMD_SGE_WR_N > 1 - /** - * Copy scattered mbuf contents to a single linear buffer. + * Open a MPW inline session. * - * @param[out] linear - * Linear output buffer. - * @param[in] buf - * Scattered input buffer. - * - * @return - * Number of bytes copied to the output buffer or 0 if not large enough. + * @param txq + * Pointer to TX queue structure. + * @param mpw + * Pointer to MPW session structure. + * @param length + * Packet length. */ -static unsigned int -linearize_mbuf(linear_t *linear, struct rte_mbuf *buf) +static inline void +mlx5_mpw_inline_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, + uint32_t length) { - unsigned int size = 0; - unsigned int offset; - - do { - unsigned int len = DATA_LEN(buf); + uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1); + struct mlx5_wqe_inl_small *inl; - offset = size; - size += len; - if (unlikely(size > sizeof(*linear))) - return 0; - memcpy(&(*linear)[offset], - rte_pktmbuf_mtod(buf, uint8_t *), - len); - buf = NEXT(buf); - } while (buf != NULL); - return size; + mpw->state = MLX5_MPW_INL_STATE_OPENED; + mpw->pkts_n = 0; + mpw->len = length; + mpw->total_len = 0; + mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); + mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) | + (txq->wqe_ci << 8) | + MLX5_OPCODE_TSO); + mpw->wqe->ctrl[2] = 0; + mpw->wqe->ctrl[3] = 0; + mpw->wqe->eseg.mss = rte_cpu_to_be_16(length); + mpw->wqe->eseg.inline_hdr_sz = 0; + mpw->wqe->eseg.cs_flags = 0; + mpw->wqe->eseg.rsvd0 = 0; + mpw->wqe->eseg.rsvd1 = 0; + mpw->wqe->eseg.rsvd2 = 0; + inl = (struct mlx5_wqe_inl_small *) + (((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE); + mpw->data.raw = (uint8_t *)&inl->raw; } /** - * Handle scattered buffers for mlx5_tx_burst(). + * Close a MPW inline session. * * @param txq - * TX queue structure. - * @param segs - * Number of segments in buf. - * @param elt - * TX queue element to fill. - * @param[in] buf - * Buffer to process. - * @param elts_head - * Index of the linear buffer to use if necessary (normally txq->elts_head). - * @param[out] sges - * Array filled with SGEs on success. + * Pointer to TX queue structure. + * @param mpw + * Pointer to MPW session structure. + */ +static inline void +mlx5_mpw_inline_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw) +{ + unsigned int size; + struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *) + (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE)); + + size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len; + /* + * Store size in multiple of 16 bytes. Control and Ethernet segments + * count as 2. 
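+ * MLX5_WQE_DS() rounds the byte count up to a number of 16-byte data
+ * segments, the unit expected next to the QP number in ctrl[1].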
+ */ + mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | + MLX5_WQE_DS(size)); + mpw->state = MLX5_MPW_STATE_CLOSED; + inl->byte_cnt = rte_cpu_to_be_32(mpw->total_len | MLX5_INLINE_SEG); + txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE; +} + +/** + * DPDK callback for TX with MPW inline support. + * + * @param dpdk_txq + * Generic pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. * * @return - * A structure containing the processed packet size in bytes and the - * number of SGEs. Both fields are set to (unsigned int)-1 in case of - * failure. + * Number of packets successfully transmitted (<= pkts_n). */ -static struct tx_burst_sg_ret { - unsigned int length; - unsigned int num; -} -tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt, - struct rte_mbuf *buf, unsigned int elts_head, - struct ibv_sge (*sges)[MLX5_PMD_SGE_WR_N]) +uint16_t +mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n) { - unsigned int sent_size = 0; - unsigned int j; - int linearize = 0; - - /* When there are too many segments, extra segments are - * linearized in the last SGE. */ - if (unlikely(segs > RTE_DIM(*sges))) { - segs = (RTE_DIM(*sges) - 1); - linearize = 1; - } - /* Update element. */ - elt->buf = buf; - /* Register segments as SGEs. */ - for (j = 0; (j != segs); ++j) { - struct ibv_sge *sge = &(*sges)[j]; - uint32_t lkey; - - /* Retrieve Memory Region key for this memory pool. */ - lkey = txq_mp2mr(txq, txq_mb2mp(buf)); - if (unlikely(lkey == (uint32_t)-1)) { - /* MR does not exist. */ - DEBUG("%p: unable to get MP <-> MR association", - (void *)txq); - /* Clean up TX element. */ - elt->buf = NULL; - goto stop; + struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq; + uint16_t elts_head = txq->elts_head; + const uint16_t elts_n = 1 << txq->elts_n; + const uint16_t elts_m = elts_n - 1; + unsigned int i = 0; + unsigned int j = 0; + uint16_t max_elts; + uint16_t max_wqe; + unsigned int comp; + unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE; + struct mlx5_mpw mpw = { + .state = MLX5_MPW_STATE_CLOSED, + }; + /* + * Compute the maximum number of WQE which can be consumed by inline + * code. + * - 2 DSEG for: + * - 1 control segment, + * - 1 Ethernet segment, + * - N Dseg from the inline request. + */ + const unsigned int wqe_inl_n = + ((2 * MLX5_WQE_DWORD_SIZE + + txq->max_inline * RTE_CACHE_LINE_SIZE) + + RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE; + + if (unlikely(!pkts_n)) + return 0; + /* Prefetch first packet cacheline. */ + rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); + rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); + /* Start processing. */ + mlx5_tx_complete(txq); + max_elts = (elts_n - (elts_head - txq->elts_tail)); + do { + struct rte_mbuf *buf = *(pkts++); + uintptr_t addr; + uint32_t length; + unsigned int segs_n = buf->nb_segs; + uint8_t cs_flags; + + /* + * Make sure there is enough room to store this packet and + * that one ring entry remains unused. + */ + assert(segs_n); + if (max_elts < segs_n) + break; + /* Do not bother with large packets MPW cannot handle. */ + if (segs_n > MLX5_MPW_DSEG_MAX) { + txq->stats.oerrors++; + break; } - /* Update SGE. 
*/ - sge->addr = rte_pktmbuf_mtod(buf, uintptr_t); - if (txq->priv->vf) - rte_prefetch0((volatile void *) - (uintptr_t)sge->addr); - sge->length = DATA_LEN(buf); - sge->lkey = lkey; - sent_size += sge->length; - buf = NEXT(buf); - } - /* If buf is not NULL here and is not going to be linearized, - * nb_segs is not valid. */ - assert(j == segs); - assert((buf == NULL) || (linearize)); - /* Linearize extra segments. */ - if (linearize) { - struct ibv_sge *sge = &(*sges)[segs]; - linear_t *linear = &(*txq->elts_linear)[elts_head]; - unsigned int size = linearize_mbuf(linear, buf); - - assert(segs == (RTE_DIM(*sges) - 1)); - if (size == 0) { - /* Invalid packet. */ - DEBUG("%p: packet too large to be linearized.", - (void *)txq); - /* Clean up TX element. */ - elt->buf = NULL; - goto stop; + max_elts -= segs_n; + --pkts_n; + /* + * Compute max_wqe in case less WQE were consumed in previous + * iteration. + */ + max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); + cs_flags = txq_ol_cksum_to_cs(txq, buf); + /* Retrieve packet information. */ + length = PKT_LEN(buf); + /* Start new session if packet differs. */ + if (mpw.state == MLX5_MPW_STATE_OPENED) { + if ((mpw.len != length) || + (segs_n != 1) || + (mpw.wqe->eseg.cs_flags != cs_flags)) + mlx5_mpw_close(txq, &mpw); + } else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) { + if ((mpw.len != length) || + (segs_n != 1) || + (length > inline_room) || + (mpw.wqe->eseg.cs_flags != cs_flags)) { + mlx5_mpw_inline_close(txq, &mpw); + inline_room = + txq->max_inline * RTE_CACHE_LINE_SIZE; + } } - /* If MLX5_PMD_SGE_WR_N is 1, free mbuf immediately. */ - if (RTE_DIM(*sges) == 1) { + if (mpw.state == MLX5_MPW_STATE_CLOSED) { + if ((segs_n != 1) || + (length > inline_room)) { + /* + * Multi-Packet WQE consumes at most two WQE. + * mlx5_mpw_new() expects to be able to use + * such resources. + */ + if (unlikely(max_wqe < 2)) + break; + max_wqe -= 2; + mlx5_mpw_new(txq, &mpw, length); + mpw.wqe->eseg.cs_flags = cs_flags; + } else { + if (unlikely(max_wqe < wqe_inl_n)) + break; + max_wqe -= wqe_inl_n; + mlx5_mpw_inline_new(txq, &mpw, length); + mpw.wqe->eseg.cs_flags = cs_flags; + } + } + /* Multi-segment packets must be alone in their MPW. */ + assert((segs_n == 1) || (mpw.pkts_n == 0)); + if (mpw.state == MLX5_MPW_STATE_OPENED) { + assert(inline_room == + txq->max_inline * RTE_CACHE_LINE_SIZE); +#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) + length = 0; +#endif do { - struct rte_mbuf *next = NEXT(buf); + volatile struct mlx5_wqe_data_seg *dseg; + + assert(buf); + (*txq->elts)[elts_head++ & elts_m] = buf; + dseg = mpw.data.dseg[mpw.pkts_n]; + addr = rte_pktmbuf_mtod(buf, uintptr_t); + *dseg = (struct mlx5_wqe_data_seg){ + .byte_count = + rte_cpu_to_be_32(DATA_LEN(buf)), + .lkey = mlx5_tx_mb2mr(txq, buf), + .addr = rte_cpu_to_be_64(addr), + }; +#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) + length += DATA_LEN(buf); +#endif + buf = buf->next; + ++mpw.pkts_n; + ++j; + } while (--segs_n); + assert(length == mpw.len); + if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) + mlx5_mpw_close(txq, &mpw); + } else { + unsigned int max; + + assert(mpw.state == MLX5_MPW_INL_STATE_OPENED); + assert(length <= inline_room); + assert(length == DATA_LEN(buf)); + addr = rte_pktmbuf_mtod(buf, uintptr_t); + (*txq->elts)[elts_head++ & elts_m] = buf; + /* Maximum number of bytes before wrapping. 
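+ * If the packet does not fit in the tailroom it is copied in two chunks,
+ * wrapping around to the start of the WQ ring (txq->wqes) below.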
*/ + max = ((((uintptr_t)(txq->wqes)) + + (1 << txq->wqe_n) * + MLX5_WQE_SIZE) - + (uintptr_t)mpw.data.raw); + if (length > max) { + rte_memcpy((void *)(uintptr_t)mpw.data.raw, + (void *)addr, + max); + mpw.data.raw = (volatile void *)txq->wqes; + rte_memcpy((void *)(uintptr_t)mpw.data.raw, + (void *)(addr + max), + length - max); + mpw.data.raw += length - max; + } else { + rte_memcpy((void *)(uintptr_t)mpw.data.raw, + (void *)addr, + length); - rte_pktmbuf_free_seg(buf); - buf = next; - } while (buf != NULL); - elt->buf = NULL; + if (length == max) + mpw.data.raw = + (volatile void *)txq->wqes; + else + mpw.data.raw += length; + } + ++mpw.pkts_n; + mpw.total_len += length; + ++j; + if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) { + mlx5_mpw_inline_close(txq, &mpw); + inline_room = + txq->max_inline * RTE_CACHE_LINE_SIZE; + } else { + inline_room -= length; + } } - /* Update SGE. */ - sge->addr = (uintptr_t)&(*linear)[0]; - sge->length = size; - sge->lkey = txq->mr_linear->lkey; - sent_size += size; - /* Include last segment. */ - segs++; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment sent bytes counter. */ + txq->stats.obytes += length; +#endif + ++i; + } while (pkts_n); + /* Take a shortcut if nothing must be sent. */ + if (unlikely(i == 0)) + return 0; + /* Check whether completion threshold has been reached. */ + /* "j" includes both packets and segments. */ + comp = txq->elts_comp + j; + if (comp >= MLX5_TX_COMP_THRESH) { + volatile struct mlx5_wqe *wqe = mpw.wqe; + + /* A CQE slot must always be available. */ + assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci)); + /* Request completion on last WQE. */ + wqe->ctrl[2] = rte_cpu_to_be_32(8); + /* Save elts_head in unused "immediate" field of WQE. */ + wqe->ctrl[3] = elts_head; + txq->elts_comp = 0; + } else { + txq->elts_comp = comp; } - return (struct tx_burst_sg_ret){ - .length = sent_size, - .num = segs, - }; -stop: - return (struct tx_burst_sg_ret){ - .length = -1, - .num = -1, - }; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment sent packets counter. */ + txq->stats.opackets += i; +#endif + /* Ring QP doorbell. */ + if (mpw.state == MLX5_MPW_INL_STATE_OPENED) + mlx5_mpw_inline_close(txq, &mpw); + else if (mpw.state == MLX5_MPW_STATE_OPENED) + mlx5_mpw_close(txq, &mpw); + mlx5_tx_dbrec(txq, mpw.wqe); + txq->elts_head = elts_head; + return i; } -#endif /* MLX5_PMD_SGE_WR_N > 1 */ +/** + * Open an Enhanced MPW session. + * + * @param txq + * Pointer to TX queue structure. + * @param mpw + * Pointer to MPW session structure. + * @param length + * Packet length. + */ +static inline void +mlx5_empw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, int padding) +{ + uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1); + + mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED; + mpw->pkts_n = 0; + mpw->total_len = sizeof(struct mlx5_wqe); + mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); + mpw->wqe->ctrl[0] = + rte_cpu_to_be_32((MLX5_OPC_MOD_ENHANCED_MPSW << 24) | + (txq->wqe_ci << 8) | + MLX5_OPCODE_ENHANCED_MPSW); + mpw->wqe->ctrl[2] = 0; + mpw->wqe->ctrl[3] = 0; + memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE); + if (unlikely(padding)) { + uintptr_t addr = (uintptr_t)(mpw->wqe + 1); + + /* Pad the first 2 DWORDs with zero-length inline header. */ + *(volatile uint32_t *)addr = rte_cpu_to_be_32(MLX5_INLINE_SEG); + *(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) = + rte_cpu_to_be_32(MLX5_INLINE_SEG); + mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE; + /* Start from the next WQEBB. 
*/ + mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1)); + } else { + mpw->data.raw = (volatile void *)(mpw->wqe + 1); + } +} /** - * DPDK callback for TX. + * Close an Enhanced MPW session. + * + * @param txq + * Pointer to TX queue structure. + * @param mpw + * Pointer to MPW session structure. + * + * @return + * Number of consumed WQEs. + */ +static inline uint16_t +mlx5_empw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw) +{ + uint16_t ret; + + /* Store size in multiple of 16 bytes. Control and Ethernet segments + * count as 2. + */ + mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | + MLX5_WQE_DS(mpw->total_len)); + mpw->state = MLX5_MPW_STATE_CLOSED; + ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE; + txq->wqe_ci += ret; + return ret; +} + +/** + * DPDK callback for TX with Enhanced MPW support. * * @param dpdk_txq * Generic pointer to TX queue structure. @@ -531,224 +1353,271 @@ stop: * Number of packets successfully transmitted (<= pkts_n). */ uint16_t -mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) +mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) { - struct txq *txq = (struct txq *)dpdk_txq; - unsigned int elts_head = txq->elts_head; - const unsigned int elts_n = txq->elts_n; - unsigned int elts_comp_cd = txq->elts_comp_cd; - unsigned int elts_comp = 0; - unsigned int i; - unsigned int max; - int err; - struct rte_mbuf *buf = pkts[0]; + struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq; + uint16_t elts_head = txq->elts_head; + const uint16_t elts_n = 1 << txq->elts_n; + const uint16_t elts_m = elts_n - 1; + unsigned int i = 0; + unsigned int j = 0; + uint16_t max_elts; + uint16_t max_wqe; + unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE; + unsigned int mpw_room = 0; + unsigned int inl_pad = 0; + uint32_t inl_hdr; + struct mlx5_mpw mpw = { + .state = MLX5_MPW_STATE_CLOSED, + }; - assert(elts_comp_cd != 0); - /* Prefetch first packet cacheline. */ - rte_prefetch0(buf); - txq_complete(txq); - max = (elts_n - (elts_head - txq->elts_tail)); - if (max > elts_n) - max -= elts_n; - assert(max >= 1); - assert(max <= elts_n); - /* Always leave one free entry in the ring. */ - --max; - if (max == 0) + if (unlikely(!pkts_n)) return 0; - if (max > pkts_n) - max = pkts_n; - for (i = 0; (i != max); ++i) { - struct rte_mbuf *buf_next = pkts[i + 1]; - unsigned int elts_head_next = - (((elts_head + 1) == elts_n) ? 0 : elts_head + 1); - struct txq_elt *elt = &(*txq->elts)[elts_head]; - unsigned int segs = NB_SEGS(buf); -#ifdef MLX5_PMD_SOFT_COUNTERS - unsigned int sent_size = 0; -#endif - uint32_t send_flags = 0; -#ifdef HAVE_VERBS_VLAN_INSERTION - int insert_vlan = 0; -#endif /* HAVE_VERBS_VLAN_INSERTION */ - - if (i + 1 < max) - rte_prefetch0(buf_next); - /* Request TX completion. */ - if (unlikely(--elts_comp_cd == 0)) { - elts_comp_cd = txq->elts_comp_cd_init; - ++elts_comp; - send_flags |= IBV_EXP_QP_BURST_SIGNALED; + /* Start processing. */ + mlx5_tx_complete(txq); + max_elts = (elts_n - (elts_head - txq->elts_tail)); + max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); + if (unlikely(!max_wqe)) + return 0; + do { + struct rte_mbuf *buf = *(pkts++); + uintptr_t addr; + unsigned int do_inline = 0; /* Whether inline is possible. */ + uint32_t length; + unsigned int segs_n = buf->nb_segs; + uint8_t cs_flags; + + /* + * Make sure there is enough room to store this packet and + * that one ring entry remains unused. 
+ */ + assert(segs_n); + if (max_elts - j < segs_n) + break; + /* Do not bother with large packets MPW cannot handle. */ + if (segs_n > MLX5_MPW_DSEG_MAX) { + txq->stats.oerrors++; + break; } - /* Should we enable HW CKSUM offload */ - if (buf->ol_flags & - (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { - send_flags |= IBV_EXP_QP_BURST_IP_CSUM; - /* HW does not support checksum offloads at arbitrary - * offsets but automatically recognizes the packet - * type. For inner L3/L4 checksums, only VXLAN (UDP) - * tunnels are currently supported. */ - if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type)) - send_flags |= IBV_EXP_QP_BURST_TUNNEL; + cs_flags = txq_ol_cksum_to_cs(txq, buf); + /* Retrieve packet information. */ + length = PKT_LEN(buf); + /* Start new session if: + * - multi-segment packet + * - no space left even for a dseg + * - next packet can be inlined with a new WQE + * - cs_flag differs + * It can't be MLX5_MPW_STATE_OPENED as always have a single + * segmented packet. + */ + if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) { + if ((segs_n != 1) || + (inl_pad + sizeof(struct mlx5_wqe_data_seg) > + mpw_room) || + (length <= txq->inline_max_packet_sz && + inl_pad + sizeof(inl_hdr) + length > + mpw_room) || + (mpw.wqe->eseg.cs_flags != cs_flags)) + max_wqe -= mlx5_empw_close(txq, &mpw); } - if (buf->ol_flags & PKT_TX_VLAN_PKT) { -#ifdef HAVE_VERBS_VLAN_INSERTION - if (!txq->priv->mps) - insert_vlan = 1; - else -#endif /* HAVE_VERBS_VLAN_INSERTION */ - { - err = insert_vlan_sw(buf); - if (unlikely(err)) - goto stop; + if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) { + if (unlikely(segs_n != 1)) { + /* Fall back to legacy MPW. + * A MPW session consumes 2 WQEs at most to + * include MLX5_MPW_DSEG_MAX pointers. + */ + if (unlikely(max_wqe < 2)) + break; + mlx5_mpw_new(txq, &mpw, length); + } else { + /* In Enhanced MPW, inline as much as the budget + * is allowed. The remaining space is to be + * filled with dsegs. If the title WQEBB isn't + * padded, it will have 2 dsegs there. + */ + mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX, + (max_inline ? max_inline : + pkts_n * MLX5_WQE_DWORD_SIZE) + + MLX5_WQE_SIZE); + if (unlikely(max_wqe * MLX5_WQE_SIZE < + mpw_room)) + break; + /* Don't pad the title WQEBB to not waste WQ. */ + mlx5_empw_new(txq, &mpw, 0); + mpw_room -= mpw.total_len; + inl_pad = 0; + do_inline = + length <= txq->inline_max_packet_sz && + sizeof(inl_hdr) + length <= mpw_room && + !txq->mpw_hdr_dseg; } + mpw.wqe->eseg.cs_flags = cs_flags; + } else { + /* Evaluate whether the next packet can be inlined. + * Inlininig is possible when: + * - length is less than configured value + * - length fits for remaining space + * - not required to fill the title WQEBB with dsegs + */ + do_inline = + length <= txq->inline_max_packet_sz && + inl_pad + sizeof(inl_hdr) + length <= + mpw_room && + (!txq->mpw_hdr_dseg || + mpw.total_len >= MLX5_WQE_SIZE); } - if (likely(segs == 1)) { - uintptr_t addr; - uint32_t length; - uint32_t lkey; - uintptr_t buf_next_addr; - - /* Retrieve buffer information. */ - addr = rte_pktmbuf_mtod(buf, uintptr_t); - length = DATA_LEN(buf); - /* Update element. */ - elt->buf = buf; - if (txq->priv->vf) - rte_prefetch0((volatile void *) - (uintptr_t)addr); - /* Prefetch next buffer data. */ - if (i + 1 < max) { - buf_next_addr = - rte_pktmbuf_mtod(buf_next, uintptr_t); - rte_prefetch0((volatile void *) - (uintptr_t)buf_next_addr); - } - /* Put packet into send queue. 
*/ -#if MLX5_PMD_MAX_INLINE > 0 - if (length <= txq->max_inline) { -#ifdef HAVE_VERBS_VLAN_INSERTION - if (insert_vlan) - err = txq->send_pending_inline_vlan - (txq->qp, - (void *)addr, - length, - send_flags, - &buf->vlan_tci); - else -#endif /* HAVE_VERBS_VLAN_INSERTION */ - err = txq->send_pending_inline - (txq->qp, - (void *)addr, - length, - send_flags); - } else + /* Multi-segment packets must be alone in their MPW. */ + assert((segs_n == 1) || (mpw.pkts_n == 0)); + if (unlikely(mpw.state == MLX5_MPW_STATE_OPENED)) { +#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) + length = 0; #endif - { - /* Retrieve Memory Region key for this - * memory pool. */ - lkey = txq_mp2mr(txq, txq_mb2mp(buf)); - if (unlikely(lkey == (uint32_t)-1)) { - /* MR does not exist. */ - DEBUG("%p: unable to get MP <-> MR" - " association", (void *)txq); - /* Clean up TX element. */ - elt->buf = NULL; - goto stop; - } -#ifdef HAVE_VERBS_VLAN_INSERTION - if (insert_vlan) - err = txq->send_pending_vlan - (txq->qp, - addr, - length, - lkey, - send_flags, - &buf->vlan_tci); - else -#endif /* HAVE_VERBS_VLAN_INSERTION */ - err = txq->send_pending - (txq->qp, - addr, - length, - lkey, - send_flags); - } - if (unlikely(err)) - goto stop; -#ifdef MLX5_PMD_SOFT_COUNTERS - sent_size += length; + do { + volatile struct mlx5_wqe_data_seg *dseg; + + assert(buf); + (*txq->elts)[elts_head++ & elts_m] = buf; + dseg = mpw.data.dseg[mpw.pkts_n]; + addr = rte_pktmbuf_mtod(buf, uintptr_t); + *dseg = (struct mlx5_wqe_data_seg){ + .byte_count = rte_cpu_to_be_32( + DATA_LEN(buf)), + .lkey = mlx5_tx_mb2mr(txq, buf), + .addr = rte_cpu_to_be_64(addr), + }; +#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) + length += DATA_LEN(buf); #endif + buf = buf->next; + ++j; + ++mpw.pkts_n; + } while (--segs_n); + /* A multi-segmented packet takes one MPW session. + * TODO: Pack more multi-segmented packets if possible. + */ + mlx5_mpw_close(txq, &mpw); + if (mpw.pkts_n < 3) + max_wqe--; + else + max_wqe -= 2; + } else if (max_inline && do_inline) { + /* Inline packet into WQE. */ + unsigned int max; + + assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED); + assert(length == DATA_LEN(buf)); + inl_hdr = rte_cpu_to_be_32(length | MLX5_INLINE_SEG); + addr = rte_pktmbuf_mtod(buf, uintptr_t); + mpw.data.raw = (volatile void *) + ((uintptr_t)mpw.data.raw + inl_pad); + max = tx_mlx5_wq_tailroom(txq, + (void *)(uintptr_t)mpw.data.raw); + /* Copy inline header. */ + mpw.data.raw = (volatile void *) + mlx5_copy_to_wq( + (void *)(uintptr_t)mpw.data.raw, + &inl_hdr, + sizeof(inl_hdr), + (void *)(uintptr_t)txq->wqes, + max); + max = tx_mlx5_wq_tailroom(txq, + (void *)(uintptr_t)mpw.data.raw); + /* Copy packet data. */ + mpw.data.raw = (volatile void *) + mlx5_copy_to_wq( + (void *)(uintptr_t)mpw.data.raw, + (void *)addr, + length, + (void *)(uintptr_t)txq->wqes, + max); + ++mpw.pkts_n; + mpw.total_len += (inl_pad + sizeof(inl_hdr) + length); + /* No need to get completion as the entire packet is + * copied to WQ. Free the buf right away. + */ + rte_pktmbuf_free_seg(buf); + mpw_room -= (inl_pad + sizeof(inl_hdr) + length); + /* Add pad in the next packet if any. */ + inl_pad = (((uintptr_t)mpw.data.raw + + (MLX5_WQE_DWORD_SIZE - 1)) & + ~(MLX5_WQE_DWORD_SIZE - 1)) - + (uintptr_t)mpw.data.raw; } else { -#if MLX5_PMD_SGE_WR_N > 1 - struct ibv_sge sges[MLX5_PMD_SGE_WR_N]; - struct tx_burst_sg_ret ret; - - ret = tx_burst_sg(txq, segs, elt, buf, elts_head, - &sges); - if (ret.length == (unsigned int)-1) - goto stop; - /* Put SG list into send queue. 
*/ -#ifdef HAVE_VERBS_VLAN_INSERTION - if (insert_vlan) - err = txq->send_pending_sg_list_vlan - (txq->qp, - sges, - ret.num, - send_flags, - &buf->vlan_tci); + /* No inline. Load a dseg of packet pointer. */ + volatile rte_v128u32_t *dseg; + + assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED); + assert((inl_pad + sizeof(*dseg)) <= mpw_room); + assert(length == DATA_LEN(buf)); + if (!tx_mlx5_wq_tailroom(txq, + (void *)((uintptr_t)mpw.data.raw + + inl_pad))) + dseg = (volatile void *)txq->wqes; else -#endif /* HAVE_VERBS_VLAN_INSERTION */ - err = txq->send_pending_sg_list - (txq->qp, - sges, - ret.num, - send_flags); - if (unlikely(err)) - goto stop; -#ifdef MLX5_PMD_SOFT_COUNTERS - sent_size += ret.length; -#endif -#else /* MLX5_PMD_SGE_WR_N > 1 */ - DEBUG("%p: TX scattered buffers support not" - " compiled in", (void *)txq); - goto stop; -#endif /* MLX5_PMD_SGE_WR_N > 1 */ + dseg = (volatile void *) + ((uintptr_t)mpw.data.raw + + inl_pad); + (*txq->elts)[elts_head++ & elts_m] = buf; + addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, + uintptr_t)); + *dseg = (rte_v128u32_t) { + rte_cpu_to_be_32(length), + mlx5_tx_mb2mr(txq, buf), + addr, + addr >> 32, + }; + mpw.data.raw = (volatile void *)(dseg + 1); + mpw.total_len += (inl_pad + sizeof(*dseg)); + ++j; + ++mpw.pkts_n; + mpw_room -= (inl_pad + sizeof(*dseg)); + inl_pad = 0; } - elts_head = elts_head_next; - buf = buf_next; #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment sent bytes counter. */ - txq->stats.obytes += sent_size; + txq->stats.obytes += length; #endif - } -stop: + ++i; + } while (i < pkts_n); /* Take a shortcut if nothing must be sent. */ if (unlikely(i == 0)) return 0; + /* Check whether completion threshold has been reached. */ + if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH || + (uint16_t)(txq->wqe_ci - txq->mpw_comp) >= + (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) { + volatile struct mlx5_wqe *wqe = mpw.wqe; + + /* A CQE slot must always be available. */ + assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci)); + /* Request completion on last WQE. */ + wqe->ctrl[2] = rte_cpu_to_be_32(8); + /* Save elts_head in unused "immediate" field of WQE. */ + wqe->ctrl[3] = elts_head; + txq->elts_comp = 0; + txq->mpw_comp = txq->wqe_ci; + } else { + txq->elts_comp += j; + } #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment sent packets counter. */ txq->stats.opackets += i; #endif + if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) + mlx5_empw_close(txq, &mpw); + else if (mpw.state == MLX5_MPW_STATE_OPENED) + mlx5_mpw_close(txq, &mpw); /* Ring QP doorbell. */ - err = txq->send_flush(txq->qp); - if (unlikely(err)) { - /* A nonzero value is not supposed to be returned. - * Nothing can be done about it. */ - DEBUG("%p: send_flush() failed with error %d", - (void *)txq, err); - } + mlx5_tx_dbrec(txq, mpw.wqe); txq->elts_head = elts_head; - txq->elts_comp += elts_comp; - txq->elts_comp_cd = elts_comp_cd; return i; } /** * Translate RX completion flags to packet type. * - * @param flags - * RX completion flags returned by poll_length_flags(). + * @param[in] cqe + * Pointer to CQE. * * @note: fix mlx5_dev_supported_ptypes_get() if any change here. * @@ -756,33 +1625,139 @@ stop: * Packet type for struct rte_mbuf. 
*/ static inline uint32_t -rxq_cq_to_pkt_type(uint32_t flags) +rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe) { - uint32_t pkt_type; + uint8_t idx; + uint8_t pinfo = cqe->pkt_info; + uint16_t ptype = cqe->hdr_type_etc; - if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) - pkt_type = - TRANSPOSE(flags, - IBV_EXP_CQ_RX_OUTER_IPV4_PACKET, - RTE_PTYPE_L3_IPV4) | - TRANSPOSE(flags, - IBV_EXP_CQ_RX_OUTER_IPV6_PACKET, - RTE_PTYPE_L3_IPV6) | - TRANSPOSE(flags, - IBV_EXP_CQ_RX_IPV4_PACKET, - RTE_PTYPE_INNER_L3_IPV4) | - TRANSPOSE(flags, - IBV_EXP_CQ_RX_IPV6_PACKET, - RTE_PTYPE_INNER_L3_IPV6); - else - pkt_type = - TRANSPOSE(flags, - IBV_EXP_CQ_RX_IPV4_PACKET, - RTE_PTYPE_L3_IPV4) | - TRANSPOSE(flags, - IBV_EXP_CQ_RX_IPV6_PACKET, - RTE_PTYPE_L3_IPV6); - return pkt_type; + /* + * The index to the array should have: + * bit[1:0] = l3_hdr_type + * bit[4:2] = l4_hdr_type + * bit[5] = ip_frag + * bit[6] = tunneled + * bit[7] = outer_l3_type + */ + idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10); + return mlx5_ptype_table[idx]; +} + +/** + * Get size of the next packet for a given CQE. For compressed CQEs, the + * consumer index is updated only once all packets of the current one have + * been processed. + * + * @param rxq + * Pointer to RX queue. + * @param cqe + * CQE to process. + * @param[out] rss_hash + * Packet RSS Hash result. + * + * @return + * Packet size in bytes (0 if there is none), -1 in case of completion + * with error. + */ +static inline int +mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, + uint16_t cqe_cnt, uint32_t *rss_hash) +{ + struct rxq_zip *zip = &rxq->zip; + uint16_t cqe_n = cqe_cnt + 1; + int len = 0; + uint16_t idx, end; + + /* Process compressed data in the CQE and mini arrays. */ + if (zip->ai) { + volatile struct mlx5_mini_cqe8 (*mc)[8] = + (volatile struct mlx5_mini_cqe8 (*)[8]) + (uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].pkt_info); + + len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt); + *rss_hash = rte_be_to_cpu_32((*mc)[zip->ai & 7].rx_hash_result); + if ((++zip->ai & 7) == 0) { + /* Invalidate consumed CQEs */ + idx = zip->ca; + end = zip->na; + while (idx != end) { + (*rxq->cqes)[idx & cqe_cnt].op_own = + MLX5_CQE_INVALIDATE; + ++idx; + } + /* + * Increment consumer index to skip the number of + * CQEs consumed. Hardware leaves holes in the CQ + * ring for software use. + */ + zip->ca = zip->na; + zip->na += 8; + } + if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { + /* Invalidate the rest */ + idx = zip->ca; + end = zip->cq_ci; + + while (idx != end) { + (*rxq->cqes)[idx & cqe_cnt].op_own = + MLX5_CQE_INVALIDATE; + ++idx; + } + rxq->cq_ci = zip->cq_ci; + zip->ai = 0; + } + /* No compressed data, get next CQE and verify if it is compressed. */ + } else { + int ret; + int8_t op_own; + + ret = check_cqe(cqe, cqe_n, rxq->cq_ci); + if (unlikely(ret == 1)) + return 0; + ++rxq->cq_ci; + op_own = cqe->op_own; + rte_io_rmb(); + if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { + volatile struct mlx5_mini_cqe8 (*mc)[8] = + (volatile struct mlx5_mini_cqe8 (*)[8]) + (uintptr_t)(&(*rxq->cqes)[rxq->cq_ci & + cqe_cnt].pkt_info); + + /* Fix endianness. */ + zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt); + /* + * Current mini array position is the one returned by + * check_cqe64(). + * + * If completion comprises several mini arrays, as a + * special case the second one is located 7 CQEs after + * the initial CQE instead of 8 for subsequent ones. + */ + zip->ca = rxq->cq_ci; + zip->na = zip->ca + 7; + /* Compute the next non compressed CQE. 
*/ + --rxq->cq_ci; + zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; + /* Get packet size to return. */ + len = rte_be_to_cpu_32((*mc)[0].byte_cnt); + *rss_hash = rte_be_to_cpu_32((*mc)[0].rx_hash_result); + zip->ai = 1; + /* Prefetch all the entries to be invalidated */ + idx = zip->ca; + end = zip->cq_ci; + while (idx != end) { + rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]); + ++idx; + } + } else { + len = rte_be_to_cpu_32(cqe->byte_cnt); + *rss_hash = rte_be_to_cpu_32(cqe->rx_hash_res); + } + /* Error while receiving packet. */ + if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR)) + return -1; + } + return len; } /** @@ -790,53 +1765,38 @@ rxq_cq_to_pkt_type(uint32_t flags) * * @param[in] rxq * Pointer to RX queue structure. - * @param flags - * RX completion flags returned by poll_length_flags(). + * @param[in] cqe + * Pointer to CQE. * * @return * Offload flags (ol_flags) for struct rte_mbuf. */ static inline uint32_t -rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags) +rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) { uint32_t ol_flags = 0; + uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc); - if (rxq->csum) { - /* Set IP checksum flag only for IPv4/IPv6 packets. */ - if (flags & - (IBV_EXP_CQ_RX_IPV4_PACKET | IBV_EXP_CQ_RX_IPV6_PACKET)) - ol_flags |= - TRANSPOSE(~flags, - IBV_EXP_CQ_RX_IP_CSUM_OK, - PKT_RX_IP_CKSUM_BAD); -#ifdef HAVE_EXP_CQ_RX_TCP_PACKET - /* Set L4 checksum flag only for TCP/UDP packets. */ - if (flags & - (IBV_EXP_CQ_RX_TCP_PACKET | IBV_EXP_CQ_RX_UDP_PACKET)) -#endif /* HAVE_EXP_CQ_RX_TCP_PACKET */ - ol_flags |= - TRANSPOSE(~flags, - IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK, - PKT_RX_L4_CKSUM_BAD); - } - /* - * PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place - * of PKT_RX_EIP_CKSUM_BAD because the latter is not functional - * (its value is 0). - */ - if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun)) + ol_flags = + TRANSPOSE(flags, + MLX5_CQE_RX_L3_HDR_VALID, + PKT_RX_IP_CKSUM_GOOD) | + TRANSPOSE(flags, + MLX5_CQE_RX_L4_HDR_VALID, + PKT_RX_L4_CKSUM_GOOD); + if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun)) ol_flags |= - TRANSPOSE(~flags, - IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK, - PKT_RX_IP_CKSUM_BAD) | - TRANSPOSE(~flags, - IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK, - PKT_RX_L4_CKSUM_BAD); + TRANSPOSE(flags, + MLX5_CQE_RX_L3_HDR_VALID, + PKT_RX_IP_CKSUM_GOOD) | + TRANSPOSE(flags, + MLX5_CQE_RX_L4_HDR_VALID, + PKT_RX_L4_CKSUM_GOOD); return ol_flags; } /** - * DPDK callback for RX with scattered packets support. + * DPDK callback for RX. * * @param dpdk_rxq * Generic pointer to RX queue structure. @@ -849,353 +1809,152 @@ rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags) * Number of packets successfully received (<= pkts_n). 
*/ uint16_t -mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) { - struct rxq *rxq = (struct rxq *)dpdk_rxq; - struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp; - const unsigned int elts_n = rxq->elts_n; - unsigned int elts_head = rxq->elts_head; - unsigned int i; - unsigned int pkts_ret = 0; - int ret; + struct mlx5_rxq_data *rxq = dpdk_rxq; + const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1; + const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1; + const unsigned int sges_n = rxq->sges_n; + struct rte_mbuf *pkt = NULL; + struct rte_mbuf *seg = NULL; + volatile struct mlx5_cqe *cqe = + &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; + unsigned int i = 0; + unsigned int rq_ci = rxq->rq_ci << sges_n; + int len = 0; /* keep its value across iterations. */ - if (unlikely(!rxq->sp)) - return mlx5_rx_burst(dpdk_rxq, pkts, pkts_n); - if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */ - return 0; - for (i = 0; (i != pkts_n); ++i) { - struct rxq_elt_sp *elt = &(*elts)[elts_head]; - unsigned int len; - unsigned int pkt_buf_len; - struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */ - struct rte_mbuf **pkt_buf_next = &pkt_buf; - unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM; - unsigned int j = 0; - uint32_t flags; - uint16_t vlan_tci; - - /* Sanity checks. */ - assert(elts_head < rxq->elts_n); - assert(rxq->elts_head < rxq->elts_n); - ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci); - if (unlikely(ret < 0)) { - struct ibv_wc wc; - int wcs_n; - - DEBUG("rxq=%p, poll_length() failed (ret=%d)", - (void *)rxq, ret); - /* ibv_poll_cq() must be used in case of failure. */ - wcs_n = ibv_poll_cq(rxq->cq, 1, &wc); - if (unlikely(wcs_n == 0)) + while (pkts_n) { + unsigned int idx = rq_ci & wqe_cnt; + volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx]; + struct rte_mbuf *rep = (*rxq->elts)[idx]; + uint32_t rss_hash_res = 0; + + if (pkt) + NEXT(seg) = rep; + seg = rep; + rte_prefetch0(seg); + rte_prefetch0(cqe); + rte_prefetch0(wqe); + rep = rte_mbuf_raw_alloc(rxq->mp); + if (unlikely(rep == NULL)) { + ++rxq->stats.rx_nombuf; + if (!pkt) { + /* + * no buffers before we even started, + * bail out silently. + */ break; - if (unlikely(wcs_n < 0)) { - DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)", - (void *)rxq, wcs_n); + } + while (pkt != seg) { + assert(pkt != (*rxq->elts)[idx]); + rep = NEXT(pkt); + NEXT(pkt) = NULL; + NB_SEGS(pkt) = 1; + rte_mbuf_raw_free(pkt); + pkt = rep; + } + break; + } + if (!pkt) { + cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; + len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, + &rss_hash_res); + if (!len) { + rte_mbuf_raw_free(rep); break; } - assert(wcs_n == 1); - if (unlikely(wc.status != IBV_WC_SUCCESS)) { - /* Whatever, just repost the offending WR. */ - DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work" - " completion status (%d): %s", - (void *)rxq, wc.wr_id, wc.status, - ibv_wc_status_str(wc.status)); -#ifdef MLX5_PMD_SOFT_COUNTERS - /* Increment dropped packets counter. */ + if (unlikely(len == -1)) { + /* RX error, packet is likely too large. */ + rte_mbuf_raw_free(rep); ++rxq->stats.idropped; -#endif - goto repost; + goto skip; } - ret = wc.byte_len; - } - if (ret == 0) - break; - assert(ret >= (rxq->crc_present << 2)); - len = ret - (rxq->crc_present << 2); - pkt_buf_len = len; - /* - * Replace spent segments with new ones, concatenate and - * return them as pkt_buf. 
- */ - while (1) { - struct ibv_sge *sge = &elt->sges[j]; - struct rte_mbuf *seg = elt->bufs[j]; - struct rte_mbuf *rep; - unsigned int seg_tailroom; + pkt = seg; + assert(len >= (rxq->crc_present << 2)); + /* Update packet information. */ + pkt->packet_type = rxq_cq_to_pkt_type(cqe); + pkt->ol_flags = 0; + if (rss_hash_res && rxq->rss_hash) { + pkt->hash.rss = rss_hash_res; + pkt->ol_flags = PKT_RX_RSS_HASH; + } + if (rxq->mark && + MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { + pkt->ol_flags |= PKT_RX_FDIR; + if (cqe->sop_drop_qpn != + rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { + uint32_t mark = cqe->sop_drop_qpn; - assert(seg != NULL); - /* - * Fetch initial bytes of packet descriptor into a - * cacheline while allocating rep. - */ - rte_prefetch0(seg); - rep = __rte_mbuf_raw_alloc(rxq->mp); - if (unlikely(rep == NULL)) { - /* - * Unable to allocate a replacement mbuf, - * repost WR. - */ - DEBUG("rxq=%p: can't allocate a new mbuf", - (void *)rxq); - if (pkt_buf != NULL) { - *pkt_buf_next = NULL; - rte_pktmbuf_free(pkt_buf); + pkt->ol_flags |= PKT_RX_FDIR_ID; + pkt->hash.fdir.hi = + mlx5_flow_mark_get(mark); } - /* Increment out of memory counters. */ - ++rxq->stats.rx_nombuf; - ++rxq->priv->dev->data->rx_mbuf_alloc_failed; - goto repost; } -#ifndef NDEBUG - /* Poison user-modifiable fields in rep. */ - NEXT(rep) = (void *)((uintptr_t)-1); - SET_DATA_OFF(rep, 0xdead); - DATA_LEN(rep) = 0xd00d; - PKT_LEN(rep) = 0xdeadd00d; - NB_SEGS(rep) = 0x2a; - PORT(rep) = 0x2a; - rep->ol_flags = -1; -#endif - assert(rep->buf_len == seg->buf_len); - assert(rep->buf_len == rxq->mb_len); - /* Reconfigure sge to use rep instead of seg. */ - assert(sge->lkey == rxq->mr->lkey); - sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom); - elt->bufs[j] = rep; - ++j; - /* Update pkt_buf if it's the first segment, or link - * seg to the previous one and update pkt_buf_next. */ - *pkt_buf_next = seg; - pkt_buf_next = &NEXT(seg); - /* Update seg information. */ - seg_tailroom = (seg->buf_len - seg_headroom); - assert(sge->length == seg_tailroom); - SET_DATA_OFF(seg, seg_headroom); - if (likely(len <= seg_tailroom)) { - /* Last segment. */ - DATA_LEN(seg) = len; - PKT_LEN(seg) = len; - /* Sanity check. */ - assert(rte_pktmbuf_headroom(seg) == - seg_headroom); - assert(rte_pktmbuf_tailroom(seg) == - (seg_tailroom - len)); - break; + if (rxq->csum | rxq->csum_l2tun) + pkt->ol_flags |= rxq_cq_to_ol_flags(rxq, cqe); + if (rxq->vlan_strip && + (cqe->hdr_type_etc & + rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { + pkt->ol_flags |= PKT_RX_VLAN | + PKT_RX_VLAN_STRIPPED; + pkt->vlan_tci = + rte_be_to_cpu_16(cqe->vlan_info); } - DATA_LEN(seg) = seg_tailroom; - PKT_LEN(seg) = seg_tailroom; - /* Sanity check. */ - assert(rte_pktmbuf_headroom(seg) == seg_headroom); - assert(rte_pktmbuf_tailroom(seg) == 0); - /* Fix len and clear headroom for next segments. */ - len -= seg_tailroom; - seg_headroom = 0; - } - /* Update head and tail segments. 
*/ - *pkt_buf_next = NULL; - assert(pkt_buf != NULL); - assert(j != 0); - NB_SEGS(pkt_buf) = j; - PORT(pkt_buf) = rxq->port_id; - PKT_LEN(pkt_buf) = pkt_buf_len; - if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) { - pkt_buf->packet_type = rxq_cq_to_pkt_type(flags); - pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags); -#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS - if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) { - pkt_buf->ol_flags |= PKT_RX_VLAN_PKT; - pkt_buf->vlan_tci = vlan_tci; + if (rxq->hw_timestamp) { + pkt->timestamp = + rte_be_to_cpu_64(cqe->timestamp); + pkt->ol_flags |= PKT_RX_TIMESTAMP; } -#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ + if (rxq->crc_present) + len -= ETHER_CRC_LEN; + PKT_LEN(pkt) = len; } - - /* Return packet. */ - *(pkts++) = pkt_buf; - ++pkts_ret; -#ifdef MLX5_PMD_SOFT_COUNTERS - /* Increment bytes counter. */ - rxq->stats.ibytes += pkt_buf_len; -#endif -repost: - ret = rxq->recv(rxq->wq, elt->sges, RTE_DIM(elt->sges)); - if (unlikely(ret)) { - /* Inability to repost WRs is fatal. */ - DEBUG("%p: recv_sg_list(): failed (ret=%d)", - (void *)rxq->priv, - ret); - abort(); - } - if (++elts_head >= elts_n) - elts_head = 0; - continue; - } - if (unlikely(i == 0)) - return 0; - rxq->elts_head = elts_head; -#ifdef MLX5_PMD_SOFT_COUNTERS - /* Increment packets counter. */ - rxq->stats.ipackets += pkts_ret; -#endif - return pkts_ret; -} - -/** - * DPDK callback for RX. - * - * The following function is the same as mlx5_rx_burst_sp(), except it doesn't - * manage scattered packets. Improves performance when MRU is lower than the - * size of the first segment. - * - * @param dpdk_rxq - * Generic pointer to RX queue structure. - * @param[out] pkts - * Array to store received packets. - * @param pkts_n - * Maximum number of packets in array. - * - * @return - * Number of packets successfully received (<= pkts_n). - */ -uint16_t -mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) -{ - struct rxq *rxq = (struct rxq *)dpdk_rxq; - struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp; - const unsigned int elts_n = rxq->elts_n; - unsigned int elts_head = rxq->elts_head; - struct ibv_sge sges[pkts_n]; - unsigned int i; - unsigned int pkts_ret = 0; - int ret; - - if (unlikely(rxq->sp)) - return mlx5_rx_burst_sp(dpdk_rxq, pkts, pkts_n); - for (i = 0; (i != pkts_n); ++i) { - struct rxq_elt *elt = &(*elts)[elts_head]; - unsigned int len; - struct rte_mbuf *seg = elt->buf; - struct rte_mbuf *rep; - uint32_t flags; - uint16_t vlan_tci; - - /* Sanity checks. */ - assert(seg != NULL); - assert(elts_head < rxq->elts_n); - assert(rxq->elts_head < rxq->elts_n); + DATA_LEN(rep) = DATA_LEN(seg); + PKT_LEN(rep) = PKT_LEN(seg); + SET_DATA_OFF(rep, DATA_OFF(seg)); + PORT(rep) = PORT(seg); + (*rxq->elts)[idx] = rep; /* - * Fetch initial bytes of packet descriptor into a - * cacheline while allocating rep. + * Fill NIC descriptor with the new buffer. The lkey and size + * of the buffers are already known, only the buffer address + * changes. */ - rte_prefetch0(seg); - rte_prefetch0(&seg->cacheline1); - ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci); - if (unlikely(ret < 0)) { - struct ibv_wc wc; - int wcs_n; - - DEBUG("rxq=%p, poll_length() failed (ret=%d)", - (void *)rxq, ret); - /* ibv_poll_cq() must be used in case of failure. 
*/ - wcs_n = ibv_poll_cq(rxq->cq, 1, &wc); - if (unlikely(wcs_n == 0)) - break; - if (unlikely(wcs_n < 0)) { - DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)", - (void *)rxq, wcs_n); - break; - } - assert(wcs_n == 1); - if (unlikely(wc.status != IBV_WC_SUCCESS)) { - /* Whatever, just repost the offending WR. */ - DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work" - " completion status (%d): %s", - (void *)rxq, wc.wr_id, wc.status, - ibv_wc_status_str(wc.status)); -#ifdef MLX5_PMD_SOFT_COUNTERS - /* Increment dropped packets counter. */ - ++rxq->stats.idropped; -#endif - /* Add SGE to array for repost. */ - sges[i] = elt->sge; - goto repost; - } - ret = wc.byte_len; - } - if (ret == 0) - break; - assert(ret >= (rxq->crc_present << 2)); - len = ret - (rxq->crc_present << 2); - rep = __rte_mbuf_raw_alloc(rxq->mp); - if (unlikely(rep == NULL)) { - /* - * Unable to allocate a replacement mbuf, - * repost WR. - */ - DEBUG("rxq=%p: can't allocate a new mbuf", - (void *)rxq); - /* Increment out of memory counters. */ - ++rxq->stats.rx_nombuf; - ++rxq->priv->dev->data->rx_mbuf_alloc_failed; - goto repost; + wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); + if (len > DATA_LEN(seg)) { + len -= DATA_LEN(seg); + ++NB_SEGS(pkt); + ++rq_ci; + continue; } - - /* Reconfigure sge to use rep instead of seg. */ - elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM; - assert(elt->sge.lkey == rxq->mr->lkey); - elt->buf = rep; - - /* Add SGE to array for repost. */ - sges[i] = elt->sge; - - /* Update seg information. */ - SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM); - NB_SEGS(seg) = 1; - PORT(seg) = rxq->port_id; - NEXT(seg) = NULL; - PKT_LEN(seg) = len; DATA_LEN(seg) = len; - if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) { - seg->packet_type = rxq_cq_to_pkt_type(flags); - seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags); -#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS - if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) { - seg->ol_flags |= PKT_RX_VLAN_PKT; - seg->vlan_tci = vlan_tci; - } -#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ - } - /* Return packet. */ - *(pkts++) = seg; - ++pkts_ret; #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment bytes counter. */ - rxq->stats.ibytes += len; + rxq->stats.ibytes += PKT_LEN(pkt); #endif -repost: - if (++elts_head >= elts_n) - elts_head = 0; - continue; + /* Return packet. */ + *(pkts++) = pkt; + pkt = NULL; + --pkts_n; + ++i; +skip: + /* Align consumer index to the next stride. */ + rq_ci >>= sges_n; + ++rq_ci; + rq_ci <<= sges_n; } - if (unlikely(i == 0)) + if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci))) return 0; - /* Repost WRs. */ -#ifdef DEBUG_RECV - DEBUG("%p: reposting %u WRs", (void *)rxq, i); -#endif - ret = rxq->recv(rxq->wq, sges, i); - if (unlikely(ret)) { - /* Inability to repost WRs is fatal. */ - DEBUG("%p: recv_burst(): failed (ret=%d)", - (void *)rxq->priv, - ret); - abort(); - } - rxq->elts_head = elts_head; + /* Update the consumer index. */ + rxq->rq_ci = rq_ci >> sges_n; + rte_io_wmb(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + rte_io_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment packets counter. */ - rxq->stats.ipackets += pkts_ret; + rxq->stats.ipackets += i; #endif - return pkts_ret; + return i; } /** @@ -1215,11 +1974,10 @@ repost: * Number of packets successfully transmitted (<= pkts_n). 
*/ uint16_t -removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) +removed_tx_burst(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) { - (void)dpdk_txq; - (void)pkts; - (void)pkts_n; return 0; } @@ -1240,10 +1998,64 @@ removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) * Number of packets successfully received (<= pkts_n). */ uint16_t -removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +removed_rx_burst(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) { - (void)dpdk_rxq; - (void)pkts; - (void)pkts_n; return 0; } + +/* + * Vectorized Rx/Tx routines are not compiled in when required vector + * instructions are not supported on a target architecture. The following null + * stubs are needed for linkage when those are not included outside of this file + * (e.g. mlx5_rxtx_vec_sse.c for x86). + */ + +uint16_t __attribute__((weak)) +mlx5_tx_burst_raw_vec(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) +{ + return 0; +} + +uint16_t __attribute__((weak)) +mlx5_tx_burst_vec(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) +{ + return 0; +} + +uint16_t __attribute__((weak)) +mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) +{ + return 0; +} + +int __attribute__((weak)) +mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev __rte_unused) +{ + return -ENOTSUP; +} + +int __attribute__((weak)) +mlx5_check_vec_tx_support(struct rte_eth_dev *dev __rte_unused) +{ + return -ENOTSUP; +} + +int __attribute__((weak)) +mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) +{ + return -ENOTSUP; +} + +int __attribute__((weak)) +mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused) +{ + return -ENOTSUP; +}
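
The Enhanced MPW branch in the Tx hunk above folds its inlining policy into one long condition. A minimal standalone sketch of that test, assuming only the fields visible in the hunk (inline_max_packet_sz, mpw_hdr_dseg, mpw_room, inl_pad) and the 4-byte inline header; the helper name and parameter list are illustrative, not part of the driver:

#include <stdbool.h>
#include <stdint.h>

/* Can the next packet be copied inline into the open eMPW session?
 * Mirrors the do_inline computation above: the packet must be short
 * enough, it must fit in the remaining session room together with its
 * padding and inline header, and the title WQEBB must not still be
 * reserved for dsegs. */
static inline bool
empw_can_inline(uint32_t pkt_len, uint32_t inline_max_packet_sz,
		uint32_t inl_pad, uint32_t inl_hdr_sz, uint32_t mpw_room,
		bool mpw_hdr_dseg, uint32_t mpw_total_len, uint32_t wqe_size)
{
	return pkt_len <= inline_max_packet_sz &&
	       inl_pad + inl_hdr_sz + pkt_len <= mpw_room &&
	       (!mpw_hdr_dseg || mpw_total_len >= wqe_size);
}

As the hunk shows, inlined packets are copied straight into the work queue (mlx5_copy_to_wq() handles the ring wrap) and freed immediately, while non-inlined packets consume a data segment and stay referenced in (*txq->elts) until a completion arrives.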
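
The completion request at the end of that burst fires on either of two thresholds. A sketch with the two constants passed in explicitly; they stand for MLX5_TX_COMP_THRESH and MLX5_TX_COMP_THRESH_INLINE_DIV, and the function name is hypothetical:

#include <stdint.h>

/* Request a CQE when enough mbufs have been queued since the last
 * completion, or when enough WQEs have been consumed since then. */
static inline int
tx_completion_due(uint16_t elts_comp, uint16_t new_elts, uint16_t wqe_ci,
		  uint16_t mpw_comp, unsigned int wqe_n,
		  uint16_t comp_thresh, unsigned int inline_div)
{
	return elts_comp + new_elts >= comp_thresh ||
	       (uint16_t)(wqe_ci - mpw_comp) >= (1u << wqe_n) / inline_div;
}

When the test passes, the burst requests a CQE on the last WQE via ctrl[2] and saves elts_head in the unused "immediate" field (ctrl[3]) for later use by the completion handling.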
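
On the Rx side, rxq_cq_to_pkt_type() builds an 8-bit index into mlx5_ptype_table[] from two CQE fields. The composition, restated on its own (the helper name is illustrative):

#include <stdint.h>

/* Index layout, as documented in the hunk:
 *   bit[1:0] = l3_hdr_type   bit[4:2] = l4_hdr_type   bit[5] = ip_frag
 *   bit[6]   = tunneled      bit[7]   = outer_l3_type
 * pkt_info contributes the two high bits, hdr_type_etc bits 15:10 the
 * rest. */
static inline uint8_t
cqe_to_ptype_idx(uint8_t pkt_info, uint16_t hdr_type_etc)
{
	return ((pkt_info & 0x3) << 6) | ((hdr_type_etc & 0xfc00) >> 10);
}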
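
rxq_cq_to_ol_flags() relies on the driver's TRANSPOSE() helper (defined elsewhere in the PMD, not shown in this diff) to move a validity bit from its CQE position to the corresponding mbuf ol_flags position. For the single-bit masks used here it behaves like the following stand-in, whose name is illustrative:

#include <stdint.h>

/* Single-bit stand-in for TRANSPOSE(val, from, to): if the `from` bit
 * is set in `val`, emit the `to` bit, otherwise nothing. */
static inline uint32_t
transpose_bit(uint32_t val, uint32_t from, uint32_t to)
{
	return (val & from) ? to : 0;
}

So a CQE with MLX5_CQE_RX_L3_HDR_VALID and MLX5_CQE_RX_L4_HDR_VALID set yields PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD, and the same pair is OR'ed in again for tunneled packets when rxq->csum_l2tun is enabled.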
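
Finally, in mlx5_rx_burst() the local consumer index rq_ci counts individual SGEs while each posted receive spans a stride of (1 << sges_n) of them. The "skip:" arithmetic that rounds the index up to the next stride is, in isolation (illustrative name):

#include <stdint.h>

/* Round the SGE-granular consumer index up to the start of the next
 * stride of (1 << sges_n) descriptors, as done after each delivered or
 * dropped packet. */
static inline uint32_t
rq_ci_next_stride(uint32_t rq_ci, unsigned int sges_n)
{
	rq_ci >>= sges_n;
	++rq_ci;
	return rq_ci << sges_n;
}

Multi-segment packets keep incrementing rq_ci once per segment inside the stride; only when the packet is complete is the index realigned, and the stride-granular rxq->rq_ci (rq_ci >> sges_n) is written back before the CQ and RQ doorbell records are updated.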