From 4df9f737a24be94c2988f18337a4ad845b1b0186 Mon Sep 17 00:00:00 2001 From: Mohammed Hawari Date: Wed, 21 Oct 2020 14:48:38 +0200 Subject: [PATCH] rdma: implement striding rq for multiseg rx This change leverages the striding RQ feature of ConnectX-5 adapters to support chained buffers on the RX path. In striding RQ mode, WQEs are SG lists of data segments, each mapped to a vlib_buffer. When a packet is received, it can consume one or more data segments belonging to the WQE, without wasting the whole WQE. Change-Id: I74eba5b2c2c66538e75e046335058ba011cb27fd Type: improvement Signed-off-by: Mohammed Hawari --- src/plugins/rdma/device.c | 88 ++++++++- src/plugins/rdma/input.c | 437 ++++++++++++++++++++++++++++++----------- src/plugins/rdma/rdma.h | 16 +- src/plugins/rdma/rdma_mlx5dv.h | 22 ++- 4 files changed, 433 insertions(+), 130 deletions(-) diff --git a/src/plugins/rdma/device.c b/src/plugins/rdma/device.c index c2eb0006217..9b6fda982ca 100644 --- a/src/plugins/rdma/device.c +++ b/src/plugins/rdma/device.c @@ -426,10 +426,14 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) struct ibv_cq_init_attr_ex cqa = { }; struct ibv_wq_attr wqa; struct ibv_cq_ex *cqex; + struct mlx5dv_wq_init_attr dv_wqia = { }; vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES); rxq = vec_elt_at_index (rd->rxqs, qid); rxq->size = n_desc; + rxq->log_wqe_sz = 0; + rxq->log_stride_per_wqe = 0; + rxq->buf_sz = vlib_buffer_get_default_data_size (vm); vec_validate_aligned (rxq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES); cqa.cqe = n_desc; @@ -456,7 +460,54 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) wqia.max_sge = 1; wqia.pd = rd->pd; wqia.cq = rxq->cq; - if ((rxq->wq = ibv_create_wq (rd->ctx, &wqia)) == 0) + if (rd->flags & RDMA_DEVICE_F_MLX5DV) + { + if (rd->flags & RDMA_DEVICE_F_STRIDING_RQ) + { + /* In STRIDING_RQ mode, map a descriptor to a stride, not a full WQE buffer */ + uword data_seg_log2_sz = + min_log2 (vlib_buffer_get_default_data_size (vm)); + + /* The trick is also to map a descriptor to a data segment in the WQE SG list. + The number of strides per WQE and the size of a WQE (in 16-byte words) both + must be powers of two. + Moreover, in striding RQ mode, WQEs must include the SRQ header, which occupies + one 16-byte word.
That is why WQEs have 2*RDMA_RXQ_MAX_CHAIN_SZ 16-byte words: + - One for the SRQ header + - RDMA_RXQ_MAX_CHAIN_SZ for the different data segments (each mapped to + a stride and a vlib_buffer) + - RDMA_RXQ_MAX_CHAIN_SZ-1 null data segments + */ + + wqia.max_sge = RDMA_RXQ_MAX_CHAIN_SZ; + dv_wqia.comp_mask = MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ; + dv_wqia.striding_rq_attrs.two_byte_shift_en = 0; + dv_wqia.striding_rq_attrs.single_wqe_log_num_of_strides = + RDMA_RXQ_MAX_CHAIN_LOG_SZ; + dv_wqia.striding_rq_attrs.single_stride_log_num_of_bytes = + data_seg_log2_sz; + wqia.max_wr >>= RDMA_RXQ_MAX_CHAIN_LOG_SZ; + rxq->log_wqe_sz = RDMA_RXQ_MAX_CHAIN_LOG_SZ + 1; + rxq->log_stride_per_wqe = RDMA_RXQ_MAX_CHAIN_LOG_SZ; + rxq->buf_sz = 1 << data_seg_log2_sz; + } + else + { + /* For now, in non-STRIDING_RQ mode, SG operations/chained buffers + are not supported */ + wqia.max_sge = 1; + } + + if ((rxq->wq = mlx5dv_create_wq (rd->ctx, &wqia, &dv_wqia))) + { + rxq->wq->events_completed = 0; + pthread_mutex_init (&rxq->wq->mutex, NULL); + pthread_cond_init (&rxq->wq->cond, NULL); + } + else + return clib_error_return_unix (0, "Create WQ Failed"); + } + else if ((rxq->wq = ibv_create_wq (rd->ctx, &wqia)) == 0) return clib_error_return_unix (0, "Create WQ Failed"); memset (&wqa, 0, sizeof (wqa)); @@ -471,6 +522,7 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) struct mlx5dv_cq dv_cq; struct mlx5dv_rwq dv_rwq; u64 qw0; + u64 qw0_nullseg; obj.cq.in = rxq->cq; obj.cq.out = &dv_cq; @@ -488,16 +540,26 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) rxq->cq_db = (volatile u32 *) dv_cq.dbrec; rxq->cqn = dv_cq.cqn; - rxq->wqes = (mlx5dv_rwq_t *) dv_rwq.buf; + rxq->wqes = (mlx5dv_wqe_ds_t *) dv_rwq.buf; rxq->wq_db = (volatile u32 *) dv_rwq.dbrec; rxq->wq_stride = dv_rwq.stride; rxq->wqe_cnt = dv_rwq.wqe_cnt; - qw0 = clib_host_to_net_u32 (vlib_buffer_get_default_data_size (vm)); + qw0 = clib_host_to_net_u32 (rxq->buf_sz); + qw0_nullseg = 0; qw0 |= (u64) clib_host_to_net_u32 (rd->lkey) << 32; - - for (int i = 0; i < rxq->size; i++) - rxq->wqes[i].dsz_and_lkey = qw0; + qw0_nullseg |= (u64) clib_host_to_net_u32 (rd->lkey) << 32; + +/* Prefill the different 16-byte words of the WQ. If not in striding RQ mode, + init with qw0 only, i.e. with data segments of rxq->buf_sz bytes.
Otherwise, for each WQE, the + first RDMA_RXQ_MAX_CHAIN_SZ + 1 16-byte words are initialised with qw0 and the rest + are null segments */ + for (int i = 0; i < rxq->wqe_cnt << rxq->log_wqe_sz; i++) + if (!(rd->flags & RDMA_DEVICE_F_STRIDING_RQ) + || (i == 0) || !(((i - 1) >> rxq->log_stride_per_wqe) & 0x1)) + rxq->wqes[i].dsz_and_lkey = qw0; + else + rxq->wqes[i].dsz_and_lkey = qw0_nullseg; for (int i = 0; i < (1 << rxq->log2_cq_size); i++) rxq->cqes[i].opcode_cqefmt_se_owner = 0xff; @@ -824,11 +886,25 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args) if (args->mode != RDMA_MODE_IBV) { struct mlx5dv_context mlx5dv_attrs = { }; + mlx5dv_attrs.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; if (mlx5dv_query_device (rd->ctx, &mlx5dv_attrs) == 0) { + uword data_seg_log2_sz = + min_log2 (vlib_buffer_get_default_data_size (vm)); + if ((mlx5dv_attrs.flags & MLX5DV_CONTEXT_FLAGS_CQE_V1)) rd->flags |= RDMA_DEVICE_F_MLX5DV; + + if (data_seg_log2_sz <= + mlx5dv_attrs.striding_rq_caps.max_single_stride_log_num_of_bytes + && data_seg_log2_sz >= + mlx5dv_attrs.striding_rq_caps.min_single_stride_log_num_of_bytes + && RDMA_RXQ_MAX_CHAIN_LOG_SZ >= + mlx5dv_attrs.striding_rq_caps.min_single_wqe_log_num_of_strides + && RDMA_RXQ_MAX_CHAIN_LOG_SZ <= + mlx5dv_attrs.striding_rq_caps.max_single_wqe_log_num_of_strides) + rd->flags |= RDMA_DEVICE_F_STRIDING_RQ; } else { diff --git a/src/plugins/rdma/input.c b/src/plugins/rdma/input.c index 3842a58a4ab..1d267ad6cc0 100644 --- a/src/plugins/rdma/input.c +++ b/src/plugins/rdma/input.c @@ -55,28 +55,35 @@ ibv_set_recv_wr_and_sge (struct ibv_recv_wr *w, struct ibv_sge *s, u64 va, static_always_inline void rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd, - rdma_rxq_t * rxq, int is_mlx5dv) + rdma_rxq_t * rxq, int is_mlx5dv, int is_striding) { u32 n_alloc, n; + u16 ring_space; struct ibv_recv_wr wr[VLIB_FRAME_SIZE], *w = wr; struct ibv_sge sge[VLIB_FRAME_SIZE], *s = sge; u32 mask = rxq->size - 1; u32 slot = rxq->tail & mask; u32 *bufs = rxq->bufs + slot; - u32 data_size = vlib_buffer_get_default_data_size (vm); + u32 data_size = rxq->buf_sz; u32 lkey = rd->lkey; + int log_stride_per_wqe = rxq->log_stride_per_wqe; + int log_wqe_sz = rxq->log_wqe_sz; - /* do not enqueue more packet than ring space */ - n_alloc = clib_min (VLIB_FRAME_SIZE, rxq->size - (rxq->tail - rxq->head)); + /* the number of refilled buffers must be a multiple of 8 and of the strides per WQE */ + u32 alloc_multiple = 1 << (clib_max (3, log_stride_per_wqe)); + + ring_space = rxq->size - (rxq->tail - rxq->head); + + n_alloc = clib_min (VLIB_FRAME_SIZE, ring_space); /* do not bother to allocate if too small */ if (n_alloc < 2 * alloc_multiple) return; /* avoid wrap-around logic in core loop */ n_alloc = clib_min (n_alloc, rxq->size - slot); - n_alloc &= ~7; /* round to 8 */ + n_alloc &= ~(alloc_multiple - 1); /* round to alloc_multiple */ n = vlib_buffer_alloc_to_ring_from_pool (vm, rxq->bufs, slot, rxq->size, n_alloc, rd->pool); @@ -84,7 +91,7 @@ rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd, if (PREDICT_FALSE (n != n_alloc)) { u32 n_free; - if (n < 8) + if (n < alloc_multiple) { if (n) vlib_buffer_free_from_ring (vm, rxq->bufs, slot, rxq->size, n); @@ -92,7 +99,7 @@ rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd, } /* partial allocation, round and return rest */ - n_free = n & 7; + n_free = n & (alloc_multiple - 1); n -= n_free; if (n_free) vlib_buffer_free_from_ring (vm, rxq->bufs, (slot + n) & mask, @@ -104,7 +111,15 @@
rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd, if (is_mlx5dv) { u64 __clib_aligned (32) va[8]; - mlx5dv_rwq_t *wqe = rxq->wqes + slot; + + /* slot does not necessarily correspond to the slot + in the wqes ring (which is in 16-byte words) */ + u32 wqes_slot = slot << (log_wqe_sz - log_stride_per_wqe); + u32 wqe_cnt = rxq->wqe_cnt; + mlx5dv_wqe_ds_t *wqe = rxq->wqes + wqes_slot; + int wqe_sz = 1 << log_wqe_sz; + int stride_per_wqe = 1 << log_stride_per_wqe; + int current_data_seg = 0; while (n >= 1) { @@ -117,22 +132,52 @@ rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd, for (int i = 0; i < 8; i++) va[i] = clib_host_to_net_u64 (va[i]); #endif - wqe[0].addr = va[0]; - wqe[1].addr = va[1]; - wqe[2].addr = va[2]; - wqe[3].addr = va[3]; - wqe[4].addr = va[4]; - wqe[5].addr = va[5]; - wqe[6].addr = va[6]; - wqe[7].addr = va[7]; + + /* In striding RQ mode, the first 16-byte word of the WQE is the SRQ header. + It is initialised as if it were a LINKED_LIST, as we have no guarantee + about what RDMA core does (CYCLIC_RQ or LINKED_LIST_RQ). In cyclic + mode, the SRQ header is ignored anyway. */ + +/* *INDENT-OFF* */ + if (is_striding && !(current_data_seg & (wqe_sz - 1))) + *(mlx5dv_wqe_srq_next_t *) wqe = (mlx5dv_wqe_srq_next_t) + { + .rsvd0 = {0}, + .next_wqe_index = clib_host_to_net_u16 (((wqes_slot >> log_wqe_sz) + 1) & (wqe_cnt - 1)), + .signature = 0, + .rsvd1 = {0} + }; +/* *INDENT-ON* */ + + if (!is_striding || !(current_data_seg & ~(stride_per_wqe - 1))) + { + wqe[0 + is_striding].addr = va[0]; + wqe[1 + is_striding].addr = va[1]; + wqe[2 + is_striding].addr = va[2]; + wqe[3 + is_striding].addr = va[3]; + wqe[4 + is_striding].addr = va[4]; + wqe[5 + is_striding].addr = va[5]; + wqe[6 + is_striding].addr = va[6]; + wqe[7 + is_striding].addr = va[7]; + slot += 8; + n -= 8; + } wqe += 8; - slot += 8; - n -= 8; + wqes_slot += 8; + current_data_seg += 8; + current_data_seg &= wqe_sz - 1; } CLIB_MEMORY_STORE_BARRIER (); rxq->tail += n_alloc; - rxq->wq_db[MLX5_RCV_DBR] = clib_host_to_net_u32 (rxq->tail); + if (is_striding) + { + rxq->striding_wqe_tail += n_alloc >> log_stride_per_wqe; + rxq->wq_db[MLX5_RCV_DBR] = + clib_host_to_net_u32 (rxq->striding_wqe_tail); + } + else + rxq->wq_db[MLX5_RCV_DBR] = clib_host_to_net_u32 (rxq->tail); return; } @@ -176,8 +221,9 @@ rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd, static_always_inline void rdma_device_input_trace (vlib_main_t * vm, vlib_node_runtime_t * node, - const rdma_device_t * rd, u32 n_left, const u32 * bi, - u32 next_index, u16 * cqe_flags, int is_mlx5dv) + const rdma_device_t * rd, u32 n_left, + const u32 * bi, u32 next_index, u16 * cqe_flags, + int is_mlx5dv) { u32 n_trace, i; @@ -424,9 +470,9 @@ rdma_device_poll_cq_mlx5dv (rdma_device_t * rd, rdma_rxq_t * rxq, if ((cqe_last_byte & 0x1) != owner) break; - cqe_last_byte &= 0xfe; /* remove owner bit */ + cqe_last_byte &= 0xfc; /* remove owner and solicited bits */ - if (cqe_last_byte == 0x2c) + if (cqe_last_byte == 0x2c) /* OPCODE = 0x2 (Responder Send), Format = 0x3 (Compressed CQE) */ { u32 n_mini_cqes = clib_net_to_host_u32 (cqe->mini_cqe_num); u32 n_left = VLIB_FRAME_SIZE - n_rx_packets; @@ -456,7 +502,7 @@ rdma_device_poll_cq_mlx5dv (rdma_device_t * rd, rdma_rxq_t * rxq, continue; } - if (cqe_last_byte == 0x20) + if (cqe_last_byte == 0x20) /* OPCODE = 0x2 (Responder Send), Format = 0x0 (no inline data) */ { byte_cnt[0] = cqe->byte_cnt; cqe_flags[0] = cqe->flags; @@ -476,17 +522,223 @@ done: return n_rx_packets; } +static_always_inline int
+rdma_device_mlx5dv_striding_rq_parse_bc (int n_rx_packets, int *n_rx_segs, + u32 * bc) +{ +/* Determine if slow path is needed */ + int filler = 0; + for (int i = 0; i < n_rx_packets; i++) + { + *n_rx_segs += + (bc[i] & CQE_BC_CONSUMED_STRIDES_MASK) >> + CQE_BC_CONSUMED_STRIDES_SHIFT; + filler |= ! !(bc[i] & CQE_BC_FILLER_MASK); + } + return n_rx_packets != *n_rx_segs || filler; +} + +static_always_inline int +rdma_device_mlx5dv_l3_validate_and_swap_bc (rdma_per_thread_data_t + * ptd, int n_rx_packets, u32 * bc) +{ + u16 mask = CQE_FLAG_L3_HDR_TYPE_MASK | CQE_FLAG_L3_OK; + u16 match = CQE_FLAG_L3_HDR_TYPE_IP4 << CQE_FLAG_L3_HDR_TYPE_SHIFT; + + /* verify that all ip4 packets have l3_ok flag set and convert packet + length from network to host byte order */ + int skip_ip4_cksum = 1; + +#if defined CLIB_HAVE_VEC256 + u16x16 mask16 = u16x16_splat (mask); + u16x16 match16 = u16x16_splat (match); + u16x16 r = { }; + + for (int i = 0; i * 16 < n_rx_packets; i++) + r |= (ptd->cqe_flags16[i] & mask16) != match16; + + if (!u16x16_is_all_zero (r)) + skip_ip4_cksum = 0; + + for (int i = 0; i < n_rx_packets; i += 8) + *(u32x8 *) (bc + i) = u32x8_byte_swap (*(u32x8 *) (bc + i)); +#elif defined CLIB_HAVE_VEC128 + u16x8 mask8 = u16x8_splat (mask); + u16x8 match8 = u16x8_splat (match); + u16x8 r = { }; + + for (int i = 0; i * 8 < n_rx_packets; i++) + r |= (ptd->cqe_flags8[i] & mask8) != match8; + + if (!u16x8_is_all_zero (r)) + skip_ip4_cksum = 0; + + for (int i = 0; i < n_rx_packets; i += 4) + *(u32x4 *) (bc + i) = u32x4_byte_swap (*(u32x4 *) (bc + i)); +#else + for (int i = 0; i < n_rx_packets; i++) + if ((ptd->cqe_flags[i] & mask) == match) + skip_ip4_cksum = 0; + + for (int i = 0; i < n_rx_packets; i++) + bc[i] = clib_net_to_host_u32 (bc[i]); +#endif + return skip_ip4_cksum; +} + +static_always_inline u32 +rdma_device_mlx5dv_fast_input (vlib_main_t * vm, rdma_rxq_t * rxq, + u32 qs_mask, vlib_buffer_t * bt, + u32 * to_next, u32 n_rx_segs, u32 * bc, + u32 bc_mask) +{ + vlib_buffer_t *bufs[VLIB_FRAME_SIZE]; + vlib_buffer_t **b = bufs; + u32 n_left = n_rx_segs; + u32 n_rx_bytes = 0; + vlib_buffer_copy_indices_from_ring (to_next, rxq->bufs, + rxq->head & qs_mask, rxq->size, + n_rx_segs); + rxq->head += n_rx_segs; + vlib_get_buffers (vm, to_next, bufs, n_rx_segs); + while (n_left >= 8) + { + clib_prefetch_store (b[4]); + vlib_buffer_copy_template (b[0], bt); + n_rx_bytes += b[0]->current_length = bc[0] & bc_mask; + clib_prefetch_store (b[5]); + vlib_buffer_copy_template (b[1], bt); + n_rx_bytes += b[1]->current_length = bc[1] & bc_mask; + clib_prefetch_store (b[6]); + vlib_buffer_copy_template (b[2], bt); + n_rx_bytes += b[2]->current_length = bc[2] & bc_mask; + clib_prefetch_store (b[7]); + vlib_buffer_copy_template (b[3], bt); + n_rx_bytes += b[3]->current_length = bc[3] & bc_mask; + /* next */ + bc += 4; + b += 4; + n_left -= 4; + } + while (n_left) + { + vlib_buffer_copy_template (b[0], bt); + n_rx_bytes += b[0]->current_length = bc[0] & bc_mask; + /* next */ + bc++; + b++; + n_left--; + } + return n_rx_bytes; +} + +static_always_inline u32 +rdma_device_mlx5dv_striding_rq_input (vlib_main_t * vm, + rdma_per_thread_data_t * ptd, + rdma_rxq_t * rxq, + vlib_buffer_t * bt, u32 * to_next, + int n_rx_segs, int *n_rx_packets, + u32 * bc, int slow_path_needed) +{ + u32 mask = rxq->size - 1; + u32 n_rx_bytes = 0; + if (PREDICT_TRUE (!slow_path_needed)) + { + n_rx_bytes += + rdma_device_mlx5dv_fast_input (vm, rxq, mask, bt, to_next, + n_rx_segs, bc, CQE_BC_BYTE_COUNT_MASK); + } + else /* Slow path with multiseg 
*/ + { + vlib_buffer_t *pkt_head; /*Current head buffer */ + vlib_buffer_t *pkt_prev; /* Buffer processed at the previous iteration */ + u32 pkt_head_idx; + vlib_buffer_t **pkt; + uword n_segs_remaining = 0; /*Remaining strides in current buffer */ + u32 n_bytes_remaining = 0; /*Remaining bytes in current buffer */ + u32 *next_in_frame = to_next; + u32 *next_to_free = ptd->to_free_buffers; + bt->current_length = vlib_buffer_get_default_data_size (vm); + do + { + vlib_buffer_t *bufs[VLIB_FRAME_SIZE]; + u32 n_left = clib_min (n_rx_segs, VLIB_FRAME_SIZE); + n_rx_segs -= n_left; + vlib_buffer_copy_indices_from_ring (ptd->current_segs, + rxq->bufs, rxq->head & mask, + rxq->size, n_left); + rxq->head += n_left; + vlib_get_buffers (vm, ptd->current_segs, bufs, n_left); + pkt = bufs; + while (n_left > 0) + { + /* Initialize the current buffer as full size */ + vlib_buffer_copy_template (pkt[0], bt); + if (!n_segs_remaining) /* No pending chain */ + { + n_segs_remaining = + (bc[0] & CQE_BC_CONSUMED_STRIDES_MASK) >> + CQE_BC_CONSUMED_STRIDES_SHIFT; + pkt_head = pkt[0]; + pkt_head_idx = ptd->current_segs[pkt - bufs]; + n_bytes_remaining = bc[0] & CQE_BC_BYTE_COUNT_MASK; + pkt_head->total_length_not_including_first_buffer = + n_segs_remaining > + 1 ? n_bytes_remaining - pkt[0]->current_length : 0; + } + else /* Perform chaining if it's a continuation buffer */ + { + pkt_prev->next_buffer = ptd->current_segs[pkt - bufs]; + pkt_prev->flags |= VLIB_BUFFER_NEXT_PRESENT; + pkt[0]->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID; + } + if (n_segs_remaining == 1) /* Last buffer of the chain */ + { + pkt[0]->current_length = n_bytes_remaining; + if (bc[0] & CQE_BC_FILLER_MASK) + { + (next_to_free++)[0] = pkt_head_idx; + (*n_rx_packets)--; + } + + else + { + (next_in_frame++)[0] = pkt_head_idx; + n_rx_bytes += + pkt_head->current_length + + pkt_head->total_length_not_including_first_buffer; + } + /*Go to next CQE */ + bc++; + } + else + { + n_bytes_remaining -= pkt[0]->current_length; + pkt_prev = pkt[0]; + } + n_segs_remaining--; + n_left--; + pkt++; + } + + } + while (n_rx_segs > 0); + vlib_buffer_free (vm, ptd->to_free_buffers, + next_to_free - ptd->to_free_buffers); + } + return n_rx_bytes; +} + static_always_inline uword rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, - vlib_frame_t * frame, rdma_device_t * rd, u16 qid, - int use_mlx5dv) + vlib_frame_t * frame, rdma_device_t * rd, + u16 qid, int use_mlx5dv) { rdma_main_t *rm = &rdma_main; vnet_main_t *vnm = vnet_get_main (); rdma_per_thread_data_t *ptd = vec_elt_at_index (rm->per_thread_data, vm->thread_index); rdma_rxq_t *rxq = vec_elt_at_index (rd->rxqs, qid); - vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs; struct ibv_wc wc[VLIB_FRAME_SIZE]; u32 __clib_aligned (32) byte_cnts[VLIB_FRAME_SIZE]; vlib_buffer_t bt; @@ -515,112 +767,61 @@ rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next); - vlib_buffer_copy_indices_from_ring (to_next, rxq->bufs, rxq->head & mask, - rxq->size, n_rx_packets); - - vlib_get_buffers (vm, to_next, bufs, n_rx_packets); - if (use_mlx5dv) { - u16 mask = CQE_FLAG_L3_HDR_TYPE_MASK | CQE_FLAG_L3_OK; - u16 match = CQE_FLAG_L3_HDR_TYPE_IP4 << CQE_FLAG_L3_HDR_TYPE_SHIFT; - u32 n_left = n_rx_packets; u32 *bc = byte_cnts; - - /* verify that all ip4 packets have l3_ok flag set and convert packet - length from network to host byte order */ - skip_ip4_cksum = 1; - -#if defined CLIB_HAVE_VEC256 - u16x16 mask16 = u16x16_splat (mask); 
- u16x16 match16 = u16x16_splat (match); - u16x16 r = { }; - - for (int i = 0; i * 16 < n_rx_packets; i++) - r |= (ptd->cqe_flags16[i] & mask16) != match16; - - if (!u16x16_is_all_zero (r)) - skip_ip4_cksum = 0; - - for (int i = 0; i < n_rx_packets; i += 8) - *(u32x8 *) (bc + i) = u32x8_byte_swap (*(u32x8 *) (bc + i)); -#elif defined CLIB_HAVE_VEC128 - u16x8 mask8 = u16x8_splat (mask); - u16x8 match8 = u16x8_splat (match); - u16x8 r = { }; - - for (int i = 0; i * 8 < n_rx_packets; i++) - r |= (ptd->cqe_flags8[i] & mask8) != match8; - - if (!u16x8_is_all_zero (r)) - skip_ip4_cksum = 0; - - for (int i = 0; i < n_rx_packets; i += 4) - *(u32x4 *) (bc + i) = u32x4_byte_swap (*(u32x4 *) (bc + i)); -#else - for (int i = 0; i < n_rx_packets; i++) - if ((ptd->cqe_flags[i] & mask) == match) - skip_ip4_cksum = 0; - - for (int i = 0; i < n_rx_packets; i++) - bc[i] = clib_net_to_host_u32 (bc[i]); -#endif - - while (n_left >= 8) + int slow_path_needed; + skip_ip4_cksum = + rdma_device_mlx5dv_l3_validate_and_swap_bc (ptd, n_rx_packets, bc); + if (rd->flags & RDMA_DEVICE_F_STRIDING_RQ) { - clib_prefetch_store (b[4]); - vlib_buffer_copy_template (b[0], &bt); - n_rx_bytes += b[0]->current_length = bc[0]; - clib_prefetch_store (b[5]); - vlib_buffer_copy_template (b[1], &bt); - n_rx_bytes += b[1]->current_length = bc[1]; - clib_prefetch_store (b[6]); - vlib_buffer_copy_template (b[2], &bt); - n_rx_bytes += b[2]->current_length = bc[2]; - clib_prefetch_store (b[7]); - vlib_buffer_copy_template (b[3], &bt); - n_rx_bytes += b[3]->current_length = bc[3]; - - /* next */ - bc += 4; - b += 4; - n_left -= 4; + int n_rx_segs = 0; + slow_path_needed = + rdma_device_mlx5dv_striding_rq_parse_bc (n_rx_packets, + &n_rx_segs, bc); + n_rx_bytes = + rdma_device_mlx5dv_striding_rq_input (vm, ptd, rxq, &bt, + to_next, n_rx_segs, + &n_rx_packets, bc, + slow_path_needed); } - while (n_left) + else { - vlib_buffer_copy_template (b[0], &bt); - n_rx_bytes += b[0]->current_length = bc[0]; - - /* next */ - bc++; - b++; - n_left--; + /*For now, legacy path doesn't support multiseg */ + n_rx_bytes = + rdma_device_mlx5dv_fast_input (vm, rxq, mask, &bt, to_next, + n_rx_packets, bc, ~1); } + } else - n_rx_bytes = rdma_device_input_bufs (vm, rd, bufs, wc, n_rx_packets, &bt); + { + vlib_buffer_t *bufs[VLIB_FRAME_SIZE]; + vlib_buffer_copy_indices_from_ring (to_next, rxq->bufs, + rxq->head & mask, + rxq->size, n_rx_packets); + vlib_get_buffers (vm, to_next, bufs, n_rx_packets); + rxq->head += n_rx_packets; + n_rx_bytes = + rdma_device_input_bufs (vm, rd, bufs, wc, n_rx_packets, &bt); - rdma_device_input_ethernet (vm, node, rd, next_index, skip_ip4_cksum); + } + rdma_device_input_ethernet (vm, node, rd, next_index, skip_ip4_cksum); vlib_put_next_frame (vm, node, next_index, n_left_to_next - n_rx_packets); - - rxq->head += n_rx_packets; - - rdma_device_input_trace (vm, node, rd, n_rx_packets, to_next, next_index, - ptd->cqe_flags, use_mlx5dv); - + rdma_device_input_trace (vm, node, rd, n_rx_packets, to_next, + next_index, ptd->cqe_flags, use_mlx5dv); /* reset flags to zero for the next run */ if (use_mlx5dv) clib_memset_u16 (ptd->cqe_flags, 0, VLIB_FRAME_SIZE); - - vlib_increment_combined_counter - (vnm->interface_main.combined_sw_if_counters + - VNET_INTERFACE_COUNTER_RX, vm->thread_index, - rd->hw_if_index, n_rx_packets, n_rx_bytes); - + vlib_increment_combined_counter (vnm->interface_main. 
+ combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + vm->thread_index, rd->hw_if_index, + n_rx_packets, n_rx_bytes); refill: - rdma_device_input_refill (vm, rd, rxq, use_mlx5dv); - + rdma_device_input_refill (vm, rd, rxq, use_mlx5dv, + ! !(rd->flags & RDMA_DEVICE_F_STRIDING_RQ)); return n_rx_packets; } diff --git a/src/plugins/rdma/rdma.h b/src/plugins/rdma/rdma.h index 19bfb8b11e5..db8f740d946 100644 --- a/src/plugins/rdma/rdma.h +++ b/src/plugins/rdma/rdma.h @@ -30,7 +30,8 @@ _(1, ADMIN_UP, "admin-up") \ _(2, LINK_UP, "link-up") \ _(3, PROMISC, "promiscuous") \ - _(4, MLX5DV, "mlx5dv") + _(4, MLX5DV, "mlx5dv") \ + _(5, STRIDING_RQ, "striding-rq") enum { @@ -81,12 +82,17 @@ typedef struct u16 n_mini_cqes_left; u16 last_cqe_flags; mlx5dv_cqe_t *cqes; - mlx5dv_rwq_t *wqes; + mlx5dv_wqe_ds_t *wqes; + CLIB_CACHE_LINE_ALIGN_MARK (cacheline1); volatile u32 *wq_db; volatile u32 *cq_db; u32 cqn; u32 wqe_cnt; u32 wq_stride; + u32 buf_sz; + u32 striding_wqe_tail; + u8 log_wqe_sz; /* log-size of a single WQE (in data segments) */ + u8 log_stride_per_wqe; /* Striding RQ: number of strides in a single WQE */ } rdma_rxq_t; typedef struct @@ -146,7 +152,9 @@ STATIC_ASSERT_OFFSET_OF (rdma_txq_t, cacheline2, 128); #define RDMA_TXQ_USED_SZ(head, tail) ((u16)((u16)(tail) - (u16)(head))) #define RDMA_TXQ_AVAIL_SZ(txq, head, tail) ((u16)(RDMA_TXQ_BUF_SZ (txq) - RDMA_TXQ_USED_SZ (head, tail))) - +#define RDMA_RXQ_MAX_CHAIN_LOG_SZ 3 /* This should NOT be lower than 3! */ +#define RDMA_RXQ_MAX_CHAIN_SZ (1U << RDMA_RXQ_MAX_CHAIN_LOG_SZ) +#define RDMA_RXQ_LEGACY_MODE_MAX_CHAIN_SZ 5 typedef struct { CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); @@ -193,6 +201,8 @@ typedef struct u16x16 cqe_flags16[VLIB_FRAME_SIZE / 16]; }; vlib_buffer_t buffer_template; + u32 current_segs[VLIB_FRAME_SIZE]; + u32 to_free_buffers[VLIB_FRAME_SIZE]; } rdma_per_thread_data_t; typedef struct diff --git a/src/plugins/rdma/rdma_mlx5dv.h b/src/plugins/rdma/rdma_mlx5dv.h index 43d9002d050..efcefe7fbf7 100644 --- a/src/plugins/rdma/rdma_mlx5dv.h +++ b/src/plugins/rdma/rdma_mlx5dv.h @@ -21,7 +21,8 @@ #undef always_inline #include #define always_inline static_always_inline - +#include +#include /* CQE flags - bits 16-31 of qword at offset 0x1c */ #define CQE_FLAG_L4_OK 10 #define CQE_FLAG_L3_OK 9 @@ -35,6 +36,11 @@ #define CQE_FLAG_L3_HDR_TYPE_IP6 2 #define CQE_FLAG_IP_EXT_OPTS 1 +/* CQE byte count (Striding RQ) */ +#define CQE_BC_FILLER_MASK (1 << 31) +#define CQE_BC_CONSUMED_STRIDES_SHIFT (16) +#define CQE_BC_CONSUMED_STRIDES_MASK (0x3fff << CQE_BC_CONSUMED_STRIDES_SHIFT) +#define CQE_BC_BYTE_COUNT_MASK (0xffff) typedef struct { struct @@ -47,7 +53,9 @@ typedef struct u32 byte_cnt; u32 mini_cqe_num; }; - u8 pad3[15]; + u8 pad3[12]; + u16 wqe_counter; + u8 signature; u8 opcode_cqefmt_se_owner; }; } mlx5dv_cqe_t; @@ -68,7 +76,15 @@ typedef struct { u64 dsz_and_lkey; u64 addr; -} mlx5dv_rwq_t; +} mlx5dv_wqe_ds_t; /* a WQE data segment */ + +typedef struct +{ + u8 rsvd0[2]; + u16 next_wqe_index; + u8 signature; + u8 rsvd1[11]; +} mlx5dv_wqe_srq_next_t; #define foreach_cqe_rx_field \ _(0x1c, 26, 26, l4_ok) \ -- 2.16.6
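Editor's illustration (not part of the patch): the stand-alone sketch below shows how the CQE_BC_* masks introduced in the rdma_mlx5dv.h hunk decompose a striding-RQ completion's byte_cnt word into consumed strides, byte count and filler flag, which is what rdma_device_mlx5dv_striding_rq_parse_bc and the multiseg slow path rely on. The helper name and the example value are hypothetical, the mask constants are adapted with unsigned suffixes, and the byte_cnt word is assumed to already be in host byte order.

#include <stdint.h>
#include <stdio.h>

/* Masks adapted from the rdma_mlx5dv.h hunk above (unsigned suffixes added) */
#define CQE_BC_FILLER_MASK (1u << 31)
#define CQE_BC_CONSUMED_STRIDES_SHIFT (16)
#define CQE_BC_CONSUMED_STRIDES_MASK (0x3fffu << CQE_BC_CONSUMED_STRIDES_SHIFT)
#define CQE_BC_BYTE_COUNT_MASK (0xffffu)

/* Hypothetical helper: split one host-order byte_cnt word into its fields. */
static void
decode_striding_bc (uint32_t bc, uint32_t * n_strides, uint32_t * n_bytes,
                    int *is_filler)
{
  *n_strides =
    (bc & CQE_BC_CONSUMED_STRIDES_MASK) >> CQE_BC_CONSUMED_STRIDES_SHIFT;
  *n_bytes = bc & CQE_BC_BYTE_COUNT_MASK;
  *is_filler = !!(bc & CQE_BC_FILLER_MASK);
}

int
main (void)
{
  /* Hypothetical example: 3 strides consumed, 2944 bytes, not a filler.
     A packet spanning more than one stride is what triggers the
     multiseg slow path in rdma_device_mlx5dv_striding_rq_input. */
  uint32_t bc = (3u << CQE_BC_CONSUMED_STRIDES_SHIFT) | 2944u;
  uint32_t n_strides, n_bytes;
  int is_filler;

  decode_striding_bc (bc, &n_strides, &n_bytes, &is_filler);
  printf ("strides=%u bytes=%u filler=%d\n", n_strides, n_bytes, is_filler);
  return 0;
}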