+ if (PREDICT_FALSE ((op_own >> 4) != MLX5_CQE_REQ))
+ vlib_error_count (vm, node->node_index, RDMA_TX_ERROR_COMPLETION, 1);
+ idx++;
+ cur = cqes + (idx & cq_mask);
+ }
+
+ if (idx == txq->dv_cq_idx)
+ return; /* nothing to do */
+
+ cur = cqes + ((idx - 1) & cq_mask);
+ saved = cur->op_own;
+ (void) saved;
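+ /* mark the last consumed CQE as invalid (opcode 0xf) so it is not seen
+ as a new completion on the next poll */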
+ cur->op_own = 0xf0;
+ txq->dv_cq_idx = idx;
+
+ /* retrieve original WQE and get new tail counter */
+ wqe = txq->dv_sq_wqes + (be16toh (cur->wqe_counter) & sq_mask);
+ if (PREDICT_FALSE (wqe->ctrl.imm == RDMA_TXQ_DV_INVALID_ID))
+ return; /* can happen if CQE reports error for an intermediate WQE */
+
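+ /* wqe->ctrl.imm was set to the new tail when the WQE was posted
+ (see rdma_device_output_tx_mlx5_doorbell): everything between the current
+ head and that value has been sent and can be freed */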
+ ASSERT (RDMA_TXQ_USED_SZ (txq->head, wqe->ctrl.imm) <= buf_sz &&
+ RDMA_TXQ_USED_SZ (wqe->ctrl.imm, txq->tail) < buf_sz);
+
+ /* free sent buffers and update txq head */
+ vlib_buffer_free_from_ring (vm, txq->bufs, txq->head & mask, buf_sz,
+ RDMA_TXQ_USED_SZ (txq->head, wqe->ctrl.imm));
+ txq->head = wqe->ctrl.imm;
+
+ /* update the CQ doorbell record with the new consumer index */
+ CLIB_MEMORY_STORE_BARRIER ();
+ txq->dv_cq_dbrec[0] = htobe32 (idx);
+}
+
+static_always_inline void
+rdma_device_output_tx_mlx5_doorbell (rdma_txq_t * txq, rdma_mlx5_wqe_t * last,
+ const u16 tail, u32 sq_mask)
+{
+ last->ctrl.imm = tail; /* register item to free */
+ last->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; /* generate a CQE so we can free buffers */
+
+ ASSERT (tail != txq->tail &&
+ RDMA_TXQ_AVAIL_SZ (txq, txq->head, txq->tail) >=
+ RDMA_TXQ_USED_SZ (txq->tail, tail));
+
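+ /* make the WQE stores visible before publishing the new producer index in
+ the doorbell record, then write the first 8 bytes (ctrl segment) of the
+ first WQE of this batch to the doorbell register */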
+ CLIB_MEMORY_STORE_BARRIER ();
+ txq->dv_sq_dbrec[MLX5_SND_DBR] = htobe32 (tail);
+ CLIB_COMPILER_BARRIER ();
+ txq->dv_sq_db[0] = *(u64 *) (txq->dv_sq_wqes + (txq->tail & sq_mask));
+}
+
+static_always_inline void
+rdma_mlx5_wqe_init (rdma_mlx5_wqe_t * wqe, const void *tmpl,
+ vlib_buffer_t * b, const u16 tail)
+{
+ u16 sz = b->current_length;
+ const void *cur = vlib_buffer_get_current (b);
+ uword addr = pointer_to_uword (cur);
+
+ clib_memcpy_fast (wqe, tmpl, RDMA_MLX5_WQE_SZ);
+ /* speculatively copy at least MLX5_ETH_L2_INLINE_HEADER_SIZE (18 bytes) */
+ STATIC_ASSERT (STRUCT_SIZE_OF (struct mlx5_wqe_eth_seg, inline_hdr_start) +
+ STRUCT_SIZE_OF (struct mlx5_wqe_eth_seg,
+ inline_hdr) >=
+ MLX5_ETH_L2_INLINE_HEADER_SIZE, "wrong size");
+ clib_memcpy_fast (wqe->eseg.inline_hdr_start, cur,
+ MLX5_ETH_L2_INLINE_HEADER_SIZE);
+
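+ /* the 16-bit WQE index is stored big-endian in the ctrl segment */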
+ wqe->wqe_index_lo = tail;
+ wqe->wqe_index_hi = tail >> 8;
+ if (PREDICT_TRUE (sz >= MLX5_ETH_L2_INLINE_HEADER_SIZE))
+ {
+ /* inline_hdr_sz is set to MLX5_ETH_L2_INLINE_HEADER_SIZE
+ in the template */
+ wqe->dseg.byte_count = htobe32 (sz - MLX5_ETH_L2_INLINE_HEADER_SIZE);
+ wqe->dseg.addr = htobe64 (addr + MLX5_ETH_L2_INLINE_HEADER_SIZE);
+ }
+ else
+ {
+ /* dseg.byte_count and dseg.addr are set to 0 in the template */
+ wqe->eseg.inline_hdr_sz = htobe16 (sz);
+ }
+}
+
+/*
+ * specific data path for chained buffers, supporting ring wrap-around
+ * contrary to the normal path - otherwise we may fail to enqueue chained
+ * buffers because we are close to the end of the ring while we still have
+ * plenty of descriptors available
+ */
+static_always_inline u32
+rdma_device_output_tx_mlx5_chained (vlib_main_t * vm,
+ const vlib_node_runtime_t * node,
+ const rdma_device_t * rd,
+ rdma_txq_t * txq, u32 n_left_from, u32 n,
+ u32 * bi, vlib_buffer_t ** b,
+ rdma_mlx5_wqe_t * wqe, u16 tail)
+{
+ rdma_mlx5_wqe_t *last = wqe;
+ u32 wqe_n = RDMA_TXQ_AVAIL_SZ (txq, txq->head, tail);
+ u32 sq_mask = pow2_mask (txq->dv_sq_log2sz);
+ u32 mask = pow2_mask (txq->bufs_log2sz);
+ u32 dseg_mask = RDMA_TXQ_DV_DSEG_SZ (txq) - 1;
+ const u32 lkey = wqe[0].dseg.lkey;
+
+ vlib_buffer_copy_indices (txq->bufs + (txq->tail & mask), bi,
+ n_left_from - n);
+ /* advance bi past the indices already stored so it stays in sync with b */
+ bi += n_left_from - n;
+
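+ /* one packet per iteration: the head WQE is built at 'tail', chained
+ fragments consume extra data segments in the following WQEBBs */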
+ while (n >= 1 && wqe_n >= 1)
+ {
+ u32 *bufs = txq->bufs + (tail & mask);
+ rdma_mlx5_wqe_t *wqe = txq->dv_sq_wqes + (tail & sq_mask);
+
+ /* setup the head WQE */
+ rdma_mlx5_wqe_init (wqe, txq->dv_wqe_tmpl, b[0], tail);
+
+ bufs[0] = bi[0];
+
+ if (b[0]->flags & VLIB_BUFFER_NEXT_PRESENT)
+ {
+ /*
+ * max number of available dseg:
+ * - 4 dseg per WQEBB available
+ * - max 32 dseg per WQE (5-bit length field in WQE ctrl)
+ */
+#define RDMA_MLX5_WQE_DS_MAX (1 << 5)
+ const u32 dseg_max =
+ clib_min (RDMA_MLX5_WQE_DS * (wqe_n - 1), RDMA_MLX5_WQE_DS_MAX);
+ vlib_buffer_t *chained_b = b[0];
+ u32 chained_n = 0;
+
+ /* there are exactly 4 dseg per WQEBB and we rely on that */
+ STATIC_ASSERT (RDMA_MLX5_WQE_DS *
+ sizeof (struct mlx5_wqe_data_seg) ==
+ MLX5_SEND_WQE_BB, "wrong size");
+
+ /*
+ * iterate over fragments, supporting ring wrap-around contrary to
+ * the normal path - otherwise we may fail to enqueue chained
+ * buffers because we are close to the end of the ring while we
+ * still have plenty of descriptors available
+ */
+ while (chained_n < dseg_max
+ && chained_b->flags & VLIB_BUFFER_NEXT_PRESENT)
+ {
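+ /* the SQ is also addressed as a flat ring of 16-byte data segments;
+ fragment dsegs are placed in the WQEBB(s) immediately following the
+ head WQE (hence the 'tail + 1' below) */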
+ struct mlx5_wqe_data_seg *dseg = (void *) txq->dv_sq_wqes;
+ dseg += ((tail + 1) * RDMA_MLX5_WQE_DS + chained_n) & dseg_mask;
+ if (((clib_address_t) dseg & (MLX5_SEND_WQE_BB - 1)) == 0)
+ {
+ /*
+ * start of new WQEBB
+ * head/tail are shared between the buffer ring and the descriptor ring.
+ * In order to maintain a 1:1 correspondence between
+ * buffer index and descriptor index, we build
+ * 4-fragment chains and save the head
+ */
+ chained_b->flags &= ~(VLIB_BUFFER_NEXT_PRESENT |
+ VLIB_BUFFER_TOTAL_LENGTH_VALID);
+ u32 idx = tail + 1 + RDMA_TXQ_DV_DSEG2WQE (chained_n);
+ idx &= mask;
+ txq->bufs[idx] = chained_b->next_buffer;
+ }
+
+ chained_b = vlib_get_buffer (vm, chained_b->next_buffer);
+ dseg->byte_count = htobe32 (chained_b->current_length);
+ dseg->lkey = lkey;
+ dseg->addr = htobe64 (vlib_buffer_get_current_va (chained_b));
+
+ chained_n += 1;
+ }
+
+ if (chained_b->flags & VLIB_BUFFER_NEXT_PRESENT)
+ {
+ /*
+ * no descriptors left: drop the chain including 1st WQE
+ * skip the problematic packet and continue
+ */
+ vlib_buffer_free_from_ring (vm, txq->bufs, tail & mask,
+ RDMA_TXQ_BUF_SZ (txq), 1 +
+ RDMA_TXQ_DV_DSEG2WQE (chained_n));
+ vlib_error_count (vm, node->node_index,
+ dseg_max == chained_n ?
+ RDMA_TX_ERROR_SEGMENT_SIZE_EXCEEDED :
+ RDMA_TX_ERROR_NO_FREE_SLOTS, 1);
+
+ /* fixup tail to overwrite wqe head with next packet */
+ tail -= 1;
+ }
+ else
+ {
+ /* update the WQE ds field (WQE size in 16-byte units): it is the
+ low byte of the big-endian qpn_ds word */
+ ((u8 *) & wqe[0].ctrl.qpn_ds)[3] = RDMA_MLX5_WQE_DS + chained_n;
+
+ tail += RDMA_TXQ_DV_DSEG2WQE (chained_n);
+ wqe_n -= RDMA_TXQ_DV_DSEG2WQE (chained_n);
+
+ last = wqe;
+ }
+ }
+ else
+ {
+ /* not chained */
+ last = wqe;
+ }
+
+ tail += 1;
+ bi += 1;
+ b += 1;
+ wqe_n -= 1;
+ n -= 1;
+ }
+
+ if (n == n_left_from)
+ return 0; /* we failed to enqueue even a single packet */
+
+ rdma_device_output_tx_mlx5_doorbell (txq, last, tail, sq_mask);
+ return n_left_from - n;
+}
+
+static_always_inline u32
+rdma_device_output_tx_mlx5 (vlib_main_t * vm,
+ const vlib_node_runtime_t * node,
+ const rdma_device_t * rd, rdma_txq_t * txq,
+ const u32 n_left_from, u32 * bi,
+ vlib_buffer_t ** b)
+{
+ u32 sq_mask = pow2_mask (txq->dv_sq_log2sz);
+ u32 mask = pow2_mask (txq->bufs_log2sz);
+ rdma_mlx5_wqe_t *wqe;
+ u32 n, n_wrap;
+ u16 tail = txq->tail;
+
+ ASSERT (RDMA_TXQ_BUF_SZ (txq) <= RDMA_TXQ_DV_SQ_SZ (txq));
+
+ /* avoid wrap-around logic in core loop */
+ n = clib_min (n_left_from, RDMA_TXQ_BUF_SZ (txq) - (tail & mask));
+ n_wrap = n_left_from - n;
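+ /* first pass handles packets up to the end of the buffer ring; any n_wrap
+ remainder is handled by a second pass through wrap_around */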
+
+wrap_around:
+ wqe = txq->dv_sq_wqes + (tail & sq_mask);
+
+ while (n >= 4)
+ {
+ u32 flags = b[0]->flags | b[1]->flags | b[2]->flags | b[3]->flags;
+ if (PREDICT_FALSE (flags & VLIB_BUFFER_NEXT_PRESENT))
+ return rdma_device_output_tx_mlx5_chained (vm, node, rd, txq,
+ n_left_from, n, bi, b, wqe,
+ tail);
+
+ if (PREDICT_TRUE (n >= 8))
+ {
+ vlib_prefetch_buffer_header (b[4], LOAD);
+ vlib_prefetch_buffer_header (b[5], LOAD);
+ vlib_prefetch_buffer_header (b[6], LOAD);
+ vlib_prefetch_buffer_header (b[7], LOAD);
+ CLIB_PREFETCH (wqe + 4, 4 * sizeof (wqe[0]), STORE);
+ }
+
+ rdma_mlx5_wqe_init (wqe + 0, txq->dv_wqe_tmpl, b[0], tail + 0);
+ rdma_mlx5_wqe_init (wqe + 1, txq->dv_wqe_tmpl, b[1], tail + 1);
+ rdma_mlx5_wqe_init (wqe + 2, txq->dv_wqe_tmpl, b[2], tail + 2);
+ rdma_mlx5_wqe_init (wqe + 3, txq->dv_wqe_tmpl, b[3], tail + 3);
+
+ b += 4;
+ tail += 4;
+ wqe += 4;
+ n -= 4;
+ }
+
+ while (n >= 1)
+ {
+ if (PREDICT_FALSE (b[0]->flags & VLIB_BUFFER_NEXT_PRESENT))
+ return rdma_device_output_tx_mlx5_chained (vm, node, rd, txq,
+ n_left_from, n, bi, b, wqe,
+ tail);
+
+ rdma_mlx5_wqe_init (wqe, txq->dv_wqe_tmpl, b[0], tail);
+
+ b += 1;
+ tail += 1;
+ wqe += 1;
+ n -= 1;
+ }