+static_always_inline int
+rdma_device_mlx5dv_striding_rq_parse_bc (int n_rx_packets, int *n_rx_segs,
+ u32 * bc)
+{
+ /* determine whether the slow path is needed: sum the strides consumed
+    by each CQE and detect any filler CQEs */
+ int filler = 0;
+ for (int i = 0; i < n_rx_packets; i++)
+ {
+ *n_rx_segs +=
+ (bc[i] & CQE_BC_CONSUMED_STRIDES_MASK) >>
+ CQE_BC_CONSUMED_STRIDES_SHIFT;
+ filler |= !!(bc[i] & CQE_BC_FILLER_MASK);
+ }
+ return n_rx_packets != *n_rx_segs || filler;
+}
+
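+/* Returns 1 when the IPv4 checksum check can be skipped, i.e. every IPv4
+   packet has the L3_OK flag set in its CQE. Also converts CQE byte counts
+   from network to host byte order, vectorized when the platform allows. */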
+static_always_inline int
+rdma_device_mlx5dv_l3_validate_and_swap_bc (rdma_per_thread_data_t
+ * ptd, int n_rx_packets, u32 * bc)
+{
+ u16 mask = CQE_FLAG_L3_HDR_TYPE_MASK | CQE_FLAG_L3_OK;
+ u16 match =
+   CQE_FLAG_L3_HDR_TYPE_IP4 << CQE_FLAG_L3_HDR_TYPE_SHIFT | CQE_FLAG_L3_OK;
+
+ /* verify that all ip4 packets have l3_ok flag set and convert packet
+ length from network to host byte order */
+ int skip_ip4_cksum = 1;
+
+#if defined CLIB_HAVE_VEC256
+ u16x16 mask16 = u16x16_splat (mask);
+ u16x16 match16 = u16x16_splat (match);
+ u16x16 r = { };
+
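+ /* accumulate flag mismatches 16 CQEs at a time; any non-zero lane means
+    at least one packet is not an IPv4 packet with a valid L3 checksum */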
+ for (int i = 0; i * 16 < n_rx_packets; i++)
+ r |= (ptd->cqe_flags16[i] & mask16) != match16;
+
+ if (!u16x16_is_all_zero (r))
+ skip_ip4_cksum = 0;
+
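+ /* CQE byte counts are big-endian; swap 8 at a time with 256-bit vectors */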
+ for (int i = 0; i < n_rx_packets; i += 8)
+ *(u32x8 *) (bc + i) = u32x8_byte_swap (*(u32x8 *) (bc + i));
+#elif defined CLIB_HAVE_VEC128
+ u16x8 mask8 = u16x8_splat (mask);
+ u16x8 match8 = u16x8_splat (match);
+ u16x8 r = { };
+
+ for (int i = 0; i * 8 < n_rx_packets; i++)
+ r |= (ptd->cqe_flags8[i] & mask8) != match8;
+
+ if (!u16x8_is_all_zero (r))
+ skip_ip4_cksum = 0;
+
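+ /* same as above with 128-bit vectors: swap 4 byte counts at a time */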
+ for (int i = 0; i < n_rx_packets; i += 4)
+ *(u32x4 *) (bc + i) = u32x4_byte_swap (*(u32x4 *) (bc + i));
+#else
+ for (int i = 0; i < n_rx_packets; i++)
+ if ((ptd->cqe_flags[i] & mask) != match)
+ skip_ip4_cksum = 0;
+
+ for (int i = 0; i < n_rx_packets; i++)
+ bc[i] = clib_net_to_host_u32 (bc[i]);
+#endif
+ return skip_ip4_cksum;
+}
+
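+/* Fast path: every packet is a single segment, so buffer indices can be
+   copied straight from the RX ring and each buffer only needs the template
+   and the byte count from its CQE applied. */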
+static_always_inline u32
+rdma_device_mlx5dv_fast_input (vlib_main_t * vm, rdma_rxq_t * rxq,
+ u32 qs_mask, vlib_buffer_t * bt,
+ u32 * to_next, u32 n_rx_segs, u32 * bc,
+ u32 bc_mask)
+{
+ vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
+ vlib_buffer_t **b = bufs;
+ u32 n_left = n_rx_segs;
+ u32 n_rx_bytes = 0;
+ vlib_buffer_copy_indices_from_ring (to_next, rxq->bufs,
+ rxq->head & qs_mask, rxq->size,
+ n_rx_segs);
+ rxq->head += n_rx_segs;
+ vlib_get_buffers (vm, to_next, bufs, n_rx_segs);
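+ /* 4x unrolled; prefetching 4 buffers ahead is why at least 8 entries
+    must remain in the loop below */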
+ while (n_left >= 8)
+ {
+ clib_prefetch_store (b[4]);
+ vlib_buffer_copy_template (b[0], bt);
+ n_rx_bytes += b[0]->current_length = bc[0] & bc_mask;
+ clib_prefetch_store (b[5]);
+ vlib_buffer_copy_template (b[1], bt);
+ n_rx_bytes += b[1]->current_length = bc[1] & bc_mask;
+ clib_prefetch_store (b[6]);
+ vlib_buffer_copy_template (b[2], bt);
+ n_rx_bytes += b[2]->current_length = bc[2] & bc_mask;
+ clib_prefetch_store (b[7]);
+ vlib_buffer_copy_template (b[3], bt);
+ n_rx_bytes += b[3]->current_length = bc[3] & bc_mask;
+ /* next */
+ bc += 4;
+ b += 4;
+ n_left -= 4;
+ }
+ while (n_left)
+ {
+ vlib_buffer_copy_template (b[0], bt);
+ n_rx_bytes += b[0]->current_length = bc[0] & bc_mask;
+ /* next */
+ bc++;
+ b++;
+ n_left--;
+ }
+ return n_rx_bytes;
+}
+
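+/* Striding RQ input: batches of single-segment packets take the fast path
+   above, while multi-segment packets and filler CQEs are chained on the
+   slow path below. */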
+static_always_inline u32
+rdma_device_mlx5dv_striding_rq_input (vlib_main_t * vm,
+ rdma_per_thread_data_t * ptd,
+ rdma_rxq_t * rxq,
+ vlib_buffer_t * bt, u32 * to_next,
+ int n_rx_segs, int *n_rx_packets,
+ u32 * bc, int slow_path_needed)
+{
+ u32 mask = rxq->size - 1;
+ u32 n_rx_bytes = 0;
+ if (PREDICT_TRUE (!slow_path_needed))
+ {
+ n_rx_bytes +=
+ rdma_device_mlx5dv_fast_input (vm, rxq, mask, bt, to_next,
+ n_rx_segs, bc, CQE_BC_BYTE_COUNT_MASK);
+ }
+ else /* Slow path for multi-segment packets and filler CQEs */
+ {
+ vlib_buffer_t *pkt_head; /* Current head buffer */
+ vlib_buffer_t *pkt_prev; /* Buffer processed at the previous iteration */
+ u32 pkt_head_idx;
+ vlib_buffer_t **pkt;
+ uword n_segs_remaining = 0; /* Remaining segments of current packet */
+ u32 n_bytes_remaining = 0; /* Remaining bytes of current packet */
+ u32 *next_in_frame = to_next;
+ u32 *next_to_free = ptd->to_free_buffers;
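+ /* every non-final segment of a chain fills its buffer completely, so
+    preset the template length to the full buffer data size */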
+ bt->current_length = vlib_buffer_get_default_data_size (vm);
+ do
+ {
+ vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
+ u32 n_left = clib_min (n_rx_segs, VLIB_FRAME_SIZE);
+ n_rx_segs -= n_left;
+ vlib_buffer_copy_indices_from_ring (ptd->current_segs,
+ rxq->bufs, rxq->head & mask,
+ rxq->size, n_left);
+ rxq->head += n_left;
+ vlib_get_buffers (vm, ptd->current_segs, bufs, n_left);
+ pkt = bufs;
+ while (n_left > 0)
+ {
+ /* Initialize the current buffer as full size */
+ vlib_buffer_copy_template (pkt[0], bt);
+ if (!n_segs_remaining) /* No pending chain */
+ {
+ n_segs_remaining =
+ (bc[0] & CQE_BC_CONSUMED_STRIDES_MASK) >>
+ CQE_BC_CONSUMED_STRIDES_SHIFT;
+ pkt_head = pkt[0];
+ pkt_head_idx = ptd->current_segs[pkt - bufs];
+ n_bytes_remaining = bc[0] & CQE_BC_BYTE_COUNT_MASK;
+ pkt_head->total_length_not_including_first_buffer =
+   n_segs_remaining > 1 ?
+   n_bytes_remaining - pkt[0]->current_length : 0;
+ }
+ else /* Perform chaining if it's a continuation buffer */
+ {
+ pkt_prev->next_buffer = ptd->current_segs[pkt - bufs];
+ pkt_prev->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ pkt[0]->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID;
+ }
+ if (n_segs_remaining == 1) /* Last buffer of the chain */
+ {
+ pkt[0]->current_length = n_bytes_remaining;
+ if (bc[0] & CQE_BC_FILLER_MASK)
+ {
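+ /* filler CQE: its strides consumed ring slots but carry no packet,
+    so free the whole chain and remove it from the packet count */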
+ (next_to_free++)[0] = pkt_head_idx;
+ (*n_rx_packets)--;
+ }
+ else
+ {
+ (next_in_frame++)[0] = pkt_head_idx;
+ n_rx_bytes +=
+ pkt_head->current_length +
+ pkt_head->total_length_not_including_first_buffer;
+ }
+ /* Go to next CQE */
+ bc++;
+ }
+ else
+ {
+ n_bytes_remaining -= pkt[0]->current_length;
+ pkt_prev = pkt[0];
+ }
+ n_segs_remaining--;
+ n_left--;
+ pkt++;
+ }
+ }
+ while (n_rx_segs > 0);
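+ /* free the buffer chains that belonged to filler CQEs */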
+ vlib_buffer_free (vm, ptd->to_free_buffers,
+ next_to_free - ptd->to_free_buffers);
+ }
+ return n_rx_bytes;
+}
+
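+/* Sketch of the expected call sequence from the device input node, assuming
+   a stack-allocated buffer template bt and CQE byte counts already collected
+   into bc (the exact caller lives in rdma_device_input_inline and may
+   differ):
+
+     int n_rx_segs = 0;
+     int slow_path_needed =
+       rdma_device_mlx5dv_striding_rq_parse_bc (n_rx_packets, &n_rx_segs, bc);
+     int skip_ip4_cksum =
+       rdma_device_mlx5dv_l3_validate_and_swap_bc (ptd, n_rx_packets, bc);
+     u32 n_rx_bytes =
+       rdma_device_mlx5dv_striding_rq_input (vm, ptd, rxq, &bt, to_next,
+                                             n_rx_segs, &n_rx_packets, bc,
+                                             slow_path_needed);
+*/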