X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fplugins%2Frdma%2Fdevice.c;h=eb13f855b1a89e7c841b2d34e79b41b84acd1657;hb=dc812d9a71f2f5105e4aaba50fd98ea3b0b50a9b;hp=ea6d2785845128159b010aa176fd937f8ac7542f;hpb=90ea5dc1bd7adcedd0d5765c3ee04c98c4853337;p=vpp.git diff --git a/src/plugins/rdma/device.c b/src/plugins/rdma/device.c index ea6d2785845..eb13f855b1a 100644 --- a/src/plugins/rdma/device.c +++ b/src/plugins/rdma/device.c @@ -399,15 +399,32 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) { rdma_rxq_t *rxq; struct ibv_wq_init_attr wqia; + struct ibv_cq_init_attr_ex cqa = { }; struct ibv_wq_attr wqa; + struct ibv_cq_ex *cqex; vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES); rxq = vec_elt_at_index (rd->rxqs, qid); rxq->size = n_desc; vec_validate_aligned (rxq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES); - if ((rxq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0) - return clib_error_return_unix (0, "Create CQ Failed"); + cqa.cqe = n_desc; + if (rd->flags & RDMA_DEVICE_F_MLX5DV) + { + struct mlx5dv_cq_init_attr dvcq = { }; + dvcq.comp_mask = MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE; + dvcq.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH; + + if ((cqex = mlx5dv_create_cq (rd->ctx, &cqa, &dvcq)) == 0) + return clib_error_return_unix (0, "Create mlx5dv rx CQ Failed"); + } + else + { + if ((cqex = ibv_create_cq_ex (rd->ctx, &cqa)) == 0) + return clib_error_return_unix (0, "Create CQ Failed"); + } + + rxq->cq = ibv_cq_ex_to_cq (cqex); memset (&wqia, 0, sizeof (wqia)); wqia.wq_type = IBV_WQT_RQ; @@ -424,6 +441,44 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) if (ibv_modify_wq (rxq->wq, &wqa) != 0) return clib_error_return_unix (0, "Modify WQ (RDY) Failed"); + if (rd->flags & RDMA_DEVICE_F_MLX5DV) + { + struct mlx5dv_obj obj = { }; + struct mlx5dv_cq dv_cq; + struct mlx5dv_rwq dv_rwq; + u64 qw0; + + obj.cq.in = rxq->cq; + obj.cq.out = &dv_cq; + obj.rwq.in = rxq->wq; + obj.rwq.out = &dv_rwq; + + if ((mlx5dv_init_obj (&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_RWQ))) + return clib_error_return_unix (0, "mlx5dv: failed to init rx obj"); + + if (dv_cq.cqe_size != sizeof (mlx5dv_cqe_t)) + return clib_error_return_unix (0, "mlx5dv: incompatible rx CQE size"); + + rxq->log2_cq_size = max_log2 (dv_cq.cqe_cnt); + rxq->cqes = (mlx5dv_cqe_t *) dv_cq.buf; + rxq->cq_db = (volatile u32 *) dv_cq.dbrec; + rxq->cqn = dv_cq.cqn; + + rxq->wqes = (mlx5dv_rwq_t *) dv_rwq.buf; + rxq->wq_db = (volatile u32 *) dv_rwq.dbrec; + rxq->wq_stride = dv_rwq.stride; + rxq->wqe_cnt = dv_rwq.wqe_cnt; + + qw0 = clib_host_to_net_u32 (vlib_buffer_get_default_data_size (vm)); + qw0 |= (u64) clib_host_to_net_u32 (rd->lkey) << 32; + + for (int i = 0; i < rxq->size; i++) + rxq->wqes[i].dsz_and_lkey = qw0; + + for (int i = 0; i < (1 << rxq->log2_cq_size); i++) + rxq->cqes[i].opcode_cqefmt_se_owner = 0xff; + } + return 0; } @@ -480,7 +535,8 @@ rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) vec_validate_aligned (rd->txqs, qid, CLIB_CACHE_LINE_BYTES); txq = vec_elt_at_index (rd->txqs, qid); - txq->size = n_desc; + ASSERT (is_pow2 (n_desc)); + txq->bufs_log2sz = min_log2 (n_desc); vec_validate_aligned (txq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES); if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0) @@ -514,6 +570,57 @@ rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) qpa.qp_state = IBV_QPS_RTS; if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0) return clib_error_return_unix (0, "Modify QP (send) Failed"); + + txq->ibv_cq = txq->cq; + txq->ibv_qp = txq->qp; + + if (rd->flags & RDMA_DEVICE_F_MLX5DV) + { + rdma_mlx5_wqe_t *tmpl = (void *) txq->dv_wqe_tmpl; + struct mlx5dv_cq dv_cq; + struct mlx5dv_qp dv_qp; + struct mlx5dv_obj obj = { }; + + obj.cq.in = txq->cq; + obj.cq.out = &dv_cq; + obj.qp.in = txq->qp; + obj.qp.out = &dv_qp; + + if (mlx5dv_init_obj (&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_QP)) + return clib_error_return_unix (0, "DV init obj failed"); + + if (RDMA_TXQ_BUF_SZ (txq) > dv_qp.sq.wqe_cnt + || !is_pow2 (dv_qp.sq.wqe_cnt) + || sizeof (rdma_mlx5_wqe_t) != dv_qp.sq.stride + || (uword) dv_qp.sq.buf % sizeof (rdma_mlx5_wqe_t)) + return clib_error_return (0, "Unsupported DV SQ parameters"); + + if (RDMA_TXQ_BUF_SZ (txq) > dv_cq.cqe_cnt + || !is_pow2 (dv_cq.cqe_cnt) + || sizeof (struct mlx5_cqe64) != dv_cq.cqe_size + || (uword) dv_cq.buf % sizeof (struct mlx5_cqe64)) + return clib_error_return (0, "Unsupported DV CQ parameters"); + + /* get SQ and doorbell addresses */ + txq->dv_sq_wqes = dv_qp.sq.buf; + txq->dv_sq_dbrec = dv_qp.dbrec; + txq->dv_sq_db = dv_qp.bf.reg; + txq->dv_sq_log2sz = min_log2 (dv_qp.sq.wqe_cnt); + + /* get CQ and doorbell addresses */ + txq->dv_cq_cqes = dv_cq.buf; + txq->dv_cq_dbrec = dv_cq.dbrec; + txq->dv_cq_log2sz = min_log2 (dv_cq.cqe_cnt); + + /* init tx desc template */ + STATIC_ASSERT_SIZEOF (txq->dv_wqe_tmpl, sizeof (*tmpl)); + mlx5dv_set_ctrl_seg (&tmpl->ctrl, 0, MLX5_OPCODE_SEND, 0, + txq->qp->qp_num, 0, RDMA_MLX5_WQE_DS, 0, + RDMA_TXQ_DV_INVALID_ID); + /* FIXME: mlx5dv_set_eth_seg(&tmpl->eseg, MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM, 0, 0, 0); */ + mlx5dv_set_data_seg (&tmpl->dseg, 0, rd->lkey, 0); + } + return 0; } @@ -532,8 +639,21 @@ rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, u32 rxq_size, if ((rd->pd = ibv_alloc_pd (rd->ctx)) == 0) return clib_error_return_unix (0, "PD Alloc Failed"); + if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start, + bm->buffer_mem_size, + IBV_ACCESS_LOCAL_WRITE)) == 0) + return clib_error_return_unix (0, "Register MR Failed"); + + rd->lkey = rd->mr->lkey; /* avoid indirection in datapath */ + ethernet_mac_address_generate (rd->hwaddr.bytes); + if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start, + bm->buffer_mem_size, + IBV_ACCESS_LOCAL_WRITE)) == 0) + return clib_error_return_unix (0, "Register MR Failed"); + rd->lkey = rd->mr->lkey; /* avoid indirection in datapath */ + /* * /!\ WARNING /!\ creation order is important * We *must* create TX queues *before* RX queues, otherwise we will receive @@ -549,12 +669,6 @@ rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, u32 rxq_size, if ((err = rdma_rxq_finalize (vm, rd))) return err; - if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start, - bm->buffer_mem_size, - IBV_ACCESS_LOCAL_WRITE)) == 0) - return clib_error_return_unix (0, "Register MR Failed"); - rd->lkey = rd->mr->lkey; /* avoid indirection in datapath */ - return 0; } @@ -566,6 +680,9 @@ sysfs_path_to_pci_addr (char *path, vlib_pci_addr_t * addr) u8 *s; s = clib_sysfs_link_to_name (path); + if (!s) + return 0; + unformat_init_string (&in, (char *) s, strlen ((char *) s)); rv = unformat (&in, "%U", unformat_vlib_pci_addr, addr); unformat_free (&in); @@ -586,8 +703,8 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args) u16 qid; int i; - args->rxq_size = args->rxq_size ? args->rxq_size : 2 * VLIB_FRAME_SIZE; - args->txq_size = args->txq_size ? args->txq_size : 2 * VLIB_FRAME_SIZE; + args->rxq_size = args->rxq_size ? args->rxq_size : 1024; + args->txq_size = args->txq_size ? args->txq_size : 1024; args->rxq_num = args->rxq_num ? args->rxq_num : 1; if (!is_pow2 (args->rxq_num)) @@ -599,12 +716,13 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args) } if (args->rxq_size < VLIB_FRAME_SIZE || args->txq_size < VLIB_FRAME_SIZE || + args->rxq_size > 65535 || args->txq_size > 65535 || !is_pow2 (args->rxq_size) || !is_pow2 (args->txq_size)) { args->rv = VNET_API_ERROR_INVALID_VALUE; - args->error = - clib_error_return (0, "queue size must be a power of two >= %i", - VLIB_FRAME_SIZE); + args->error = clib_error_return (0, "queue size must be a power of two " + "between %i and 65535", + VLIB_FRAME_SIZE); goto err0; } @@ -629,9 +747,13 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args) pool_get_zero (rm->devices, rd); rd->dev_instance = rd - rm->devices; rd->per_interface_next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; - rd->name = format (0, "%s", args->name); rd->linux_ifname = format (0, "%s", args->ifname); + if (!args->name || 0 == args->name[0]) + rd->name = format (0, "%s/%d", args->ifname, rd->dev_instance); + else + rd->name = format (0, "%s", args->name); + rd->pci = vlib_pci_get_device_info (vm, &pci_addr, &args->error); if (!rd->pci) goto err2; @@ -667,8 +789,28 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args) break; } - if ((args->error = - rdma_dev_init (vm, rd, args->rxq_size, args->txq_size, args->rxq_num))) + if (args->mode != RDMA_MODE_IBV) + { + struct mlx5dv_context mlx5dv_attrs = { }; + + if (mlx5dv_query_device (rd->ctx, &mlx5dv_attrs) == 0) + { + if ((mlx5dv_attrs.flags & MLX5DV_CONTEXT_FLAGS_CQE_V1)) + rd->flags |= RDMA_DEVICE_F_MLX5DV; + } + else + { + if (args->mode == RDMA_MODE_DV) + { + args->error = clib_error_return (0, "Direct Verbs mode not " + "supported on this interface"); + goto err2; + } + } + } + + if ((args->error = rdma_dev_init (vm, rd, args->rxq_size, args->txq_size, + args->rxq_num))) goto err2; if ((args->error = rdma_register_interface (vnm, rd))) @@ -746,15 +888,9 @@ rdma_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index, rdma_main_t *rm = &rdma_main; vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); rdma_device_t *rd = pool_elt_at_index (rm->devices, hw->dev_instance); - - /* Shut off redirection */ - if (node_index == ~0) - { - rd->per_interface_next_index = node_index; - return; - } - rd->per_interface_next_index = + ~0 == + node_index ? VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT : vlib_node_add_next (vlib_get_main (), rdma_input_node.index, node_index); } @@ -782,9 +918,23 @@ clib_error_t * rdma_init (vlib_main_t * vm) { rdma_main_t *rm = &rdma_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); rm->log_class = vlib_log_register_class ("rdma", 0); + /* vlib_buffer_t template */ + vec_validate_aligned (rm->per_thread_data, tm->n_vlib_mains - 1, + CLIB_CACHE_LINE_BYTES); + + for (int i = 0; i < tm->n_vlib_mains; i++) + { + rdma_per_thread_data_t *ptd = vec_elt_at_index (rm->per_thread_data, i); + clib_memset (&ptd->buffer_template, 0, sizeof (vlib_buffer_t)); + ptd->buffer_template.flags = VLIB_BUFFER_TOTAL_LENGTH_VALID; + ptd->buffer_template.ref_count = 1; + vnet_buffer (&ptd->buffer_template)->sw_if_index[VLIB_TX] = (u32) ~ 0; + } + return 0; }