X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fplugins%2Frdma%2Fdevice.c;h=7316edbaa25deee5f6b40c32cb3fd36afa20823e;hb=2e9a06fa6013e4ac9be3325c2755e0631d9c6ae4;hp=0fddc3a3a5deca00a78a3dc477193ead8c6b203d;hpb=f2d5cdbfa674a2ac9e81fd49d69594f0cdbcffd3;p=vpp.git diff --git a/src/plugins/rdma/device.c b/src/plugins/rdma/device.c index 0fddc3a3a5d..7316edbaa25 100644 --- a/src/plugins/rdma/device.c +++ b/src/plugins/rdma/device.c @@ -153,6 +153,20 @@ rdma_dev_set_ucast (rdma_device_t * rd) return 0; } +static clib_error_t * +rdma_mac_change (vnet_hw_interface_t * hw, const u8 * old, const u8 * new) +{ + rdma_main_t *rm = &rdma_main; + rdma_device_t *rd = vec_elt_at_index (rm->devices, hw->dev_instance); + mac_address_from_bytes (&rd->hwaddr, new); + if (!(rd->flags & RDMA_DEVICE_F_PROMISC) && rdma_dev_set_ucast (rd)) + { + mac_address_from_bytes (&rd->hwaddr, old); + return clib_error_return_unix (0, "MAC update failed"); + } + return 0; +} + static u32 rdma_dev_change_mtu (rdma_device_t * rd) { @@ -252,8 +266,7 @@ rdma_async_event_error_ready (clib_file_t * f) { rdma_main_t *rm = &rdma_main; rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data); - return clib_error_return (0, "RDMA async event error for device %U", - format_vlib_pci_addr, &rd->pci_addr); + return clib_error_return (0, "RDMA: %s: async event error", rd->name); } static clib_error_t * @@ -279,8 +292,7 @@ rdma_async_event_read_ready (clib_file_t * f) case IBV_EVENT_DEVICE_FATAL: rd->flags &= ~RDMA_DEVICE_F_LINK_UP; vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0); - vlib_log_emerg (rm->log_class, "Fatal RDMA error for device %U", - format_vlib_pci_addr, &rd->pci_addr); + vlib_log_emerg (rm->log_class, "%s: fatal error", rd->name); break; default: rdma_log__ (VLIB_LOG_LEVEL_ERR, rd, "unhandeld RDMA async event %i", @@ -312,8 +324,7 @@ rdma_async_event_init (rdma_device_t * rd) t.file_descriptor = rd->ctx->async_fd; t.error_function = rdma_async_event_error_ready; 
t.private_data = rd->dev_instance; - t.description = - format (0, "RMDA %U async event", format_vlib_pci_addr, &rd->pci_addr); + t.description = format (0, "%v async event", rd->name); rd->async_event_clib_file_index = clib_file_add (&file_main, &t); return 0; @@ -379,6 +390,7 @@ rdma_dev_cleanup (rdma_device_t * rd) vec_free (rd->rxqs); vec_free (rd->txqs); vec_free (rd->name); + vlib_pci_free_device_info (rd->pci); pool_put (rm->devices, rd); } @@ -392,6 +404,7 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES); rxq = vec_elt_at_index (rd->rxqs, qid); rxq->size = n_desc; + vec_validate_aligned (rxq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES); if ((rxq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0) return clib_error_return_unix (0, "Create CQ Failed"); @@ -468,6 +481,7 @@ rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) vec_validate_aligned (rd->txqs, qid, CLIB_CACHE_LINE_BYTES); txq = vec_elt_at_index (rd->txqs, qid); txq->size = n_desc; + vec_validate_aligned (txq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES); if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0) return clib_error_return_unix (0, "Create CQ Failed"); @@ -478,7 +492,6 @@ rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) qpia.cap.max_send_wr = n_desc; qpia.cap.max_send_sge = 1; qpia.qp_type = IBV_QPT_RAW_PACKET; - qpia.sq_sig_all = 1; if ((txq->qp = ibv_create_qp (rd->pd, &qpia)) == 0) return clib_error_return_unix (0, "Queue Pair create failed"); @@ -521,20 +534,26 @@ rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, u32 rxq_size, ethernet_mac_address_generate (rd->hwaddr.bytes); + /* + * /!\ WARNING /!\ creation order is important + * We *must* create TX queues *before* RX queues, otherwise we will receive + * the broadcast packets we sent + */ + for (i = 0; i < tm->n_vlib_mains; i++) + if ((err = rdma_txq_init (vm, rd, i, txq_size))) 
return err; + for (i = 0; i < rxq_num; i++) if ((err = rdma_rxq_init (vm, rd, i, rxq_size))) return err; if ((err = rdma_rxq_finalize (vm, rd))) return err; - for (i = 0; i < tm->n_vlib_mains; i++) - if ((err = rdma_txq_init (vm, rd, i, txq_size))) - return err; - if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start, bm->buffer_mem_size, IBV_ACCESS_LOCAL_WRITE)) == 0) return clib_error_return_unix (0, "Register MR Failed"); + rd->lkey = rd->mr->lkey; /* avoid indirection in datapath */ return 0; } @@ -547,6 +566,9 @@ sysfs_path_to_pci_addr (char *path, vlib_pci_addr_t * addr) u8 *s; s = clib_sysfs_link_to_name (path); + if (!s) + return 0; + unformat_init_string (&in, (char *) s, strlen ((char *) s)); rv = unformat (&in, "%U", unformat_vlib_pci_addr, addr); unformat_free (&in); @@ -559,14 +581,16 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args) { vnet_main_t *vnm = vnet_get_main (); rdma_main_t *rm = &rdma_main; - rdma_device_t *rd = 0; - struct ibv_device **dev_list = 0; + rdma_device_t *rd; + vlib_pci_addr_t pci_addr; + struct ibv_device **dev_list; int n_devs; - u8 *s = 0, *s2 = 0; + u8 *s; u16 qid; + int i; - args->rxq_size = args->rxq_size ? args->rxq_size : 2 * VLIB_FRAME_SIZE; - args->txq_size = args->txq_size ? args->txq_size : 2 * VLIB_FRAME_SIZE; + args->rxq_size = args->rxq_size ? args->rxq_size : 1024; + args->txq_size = args->txq_size ? args->txq_size : 1024; args->rxq_num = args->rxq_num ? 
args->rxq_num : 1; if (!is_pow2 (args->rxq_num)) @@ -574,41 +598,30 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args) args->rv = VNET_API_ERROR_INVALID_VALUE; args->error = clib_error_return (0, "rx queue number must be a power of two"); - return; + goto err0; } - if (!is_pow2 (args->rxq_size) || !is_pow2 (args->txq_size)) + if (args->rxq_size < VLIB_FRAME_SIZE || args->txq_size < VLIB_FRAME_SIZE || + !is_pow2 (args->rxq_size) || !is_pow2 (args->txq_size)) { args->rv = VNET_API_ERROR_INVALID_VALUE; args->error = - clib_error_return (0, "queue size must be a power of two"); - return; - } - - pool_get_zero (rm->devices, rd); - rd->dev_instance = rd - rm->devices; - rd->per_interface_next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; - rd->name = vec_dup (args->name); - - /* check if device exist and if it is bound to mlx5_core */ - s = format (s, "/sys/class/net/%s/device/driver/module%c", args->ifname, 0); - s2 = clib_sysfs_link_to_name ((char *) s); - - if (s2 == 0 || strncmp ((char *) s2, "mlx5_core", 9) != 0) - { - args->error = - clib_error_return (0, - "invalid interface (only mlx5 supported for now)"); + clib_error_return (0, "queue size must be a power of two >= %i", + VLIB_FRAME_SIZE); goto err0; } - /* extract PCI address */ - vec_reset_length (s); - s = format (s, "/sys/class/net/%s/device%c", args->ifname, 0); - if (sysfs_path_to_pci_addr ((char *) s, &rd->pci_addr) == 0) + switch (args->mode) { - args->error = clib_error_return (0, "cannot find PCI address"); + case RDMA_MODE_AUTO: + break; + case RDMA_MODE_IBV: + break; + case RDMA_MODE_DV: + args->rv = VNET_API_ERROR_INVALID_VALUE; + args->error = clib_error_return (0, "unsupported mode"); goto err0; + break; } dev_list = ibv_get_device_list (&n_devs); @@ -616,12 +629,48 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args) { args->error = clib_error_return_unix (0, - "no RDMA devices available, errno = %d. 
" - "Is the ib_uverbs module loaded?", errno); + "no RDMA devices available. Is the ib_uverbs module loaded?"); goto err0; } - for (int i = 0; i < n_devs; i++) + /* get PCI address */ + s = format (0, "/sys/class/net/%s/device%c", args->ifname, 0); + if (sysfs_path_to_pci_addr ((char *) s, &pci_addr) == 0) + { + args->error = + clib_error_return (0, "cannot find PCI address for device "); + goto err1; + } + + pool_get_zero (rm->devices, rd); + rd->dev_instance = rd - rm->devices; + rd->per_interface_next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; + rd->linux_ifname = format (0, "%s", args->ifname); + + if (!args->name || 0 == args->name[0]) + rd->name = format (0, "%s/%d", args->ifname, rd->dev_instance); + else + rd->name = format (0, "%s", args->name); + + rd->pci = vlib_pci_get_device_info (vm, &pci_addr, &args->error); + if (!rd->pci) + goto err2; + + /* if we failed to parse NUMA node, default to 0 */ + if (-1 == rd->pci->numa_node) + rd->pci->numa_node = 0; + + rd->pool = vlib_buffer_pool_get_default_for_numa (vm, rd->pci->numa_node); + + if (strncmp ((char *) rd->pci->driver_name, "mlx5_core", 9)) + { + args->error = + clib_error_return (0, + "invalid interface (only mlx5 supported for now)"); + goto err2; + } + + for (i = 0; i < n_devs; i++) { vlib_pci_addr_t addr; @@ -631,7 +680,7 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args) if (sysfs_path_to_pci_addr ((char *) s, &addr) == 0) continue; - if (addr.as_u32 != rd->pci_addr.as_u32) + if (addr.as_u32 != rd->pci->addr.as_u32) continue; if ((rd->ctx = ibv_open_device (dev_list[i]))) @@ -640,7 +689,7 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args) if ((args->error = rdma_dev_init (vm, rd, args->rxq_size, args->txq_size, args->rxq_num))) - goto err1; + goto err2; if ((args->error = rdma_register_interface (vnm, rd))) goto err2; @@ -661,6 +710,8 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args) rdma_input_node.index); vec_foreach_index (qid, rd->rxqs) 
vnet_hw_interface_assign_rx_thread (vnm, rd->hw_if_index, qid, ~0); + + vec_free (s); return; err3: @@ -669,10 +720,9 @@ err2: rdma_dev_cleanup (rd); err1: ibv_free_device_list (dev_list); -err0: - vec_free (s2); vec_free (s); args->rv = VNET_API_ERROR_INVALID_INTERFACE; +err0: vlib_log_err (rm->log_class, "%U", format_clib_error, args->error); } @@ -716,15 +766,9 @@ rdma_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index, rdma_main_t *rm = &rdma_main; vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); rdma_device_t *rd = pool_elt_at_index (rm->devices, hw->dev_instance); - - /* Shut off redirection */ - if (node_index == ~0) - { - rd->per_interface_next_index = node_index; - return; - } - rd->per_interface_next_index = + ~0 == + node_index ? VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT : vlib_node_add_next (vlib_get_main (), rdma_input_node.index, node_index); } @@ -735,7 +779,7 @@ static char *rdma_tx_func_error_strings[] = { }; /* *INDENT-OFF* */ -VNET_DEVICE_CLASS (rdma_device_class,) = +VNET_DEVICE_CLASS (rdma_device_class) = { .name = "RDMA interface", .format_device = format_rdma_device, @@ -744,6 +788,7 @@ VNET_DEVICE_CLASS (rdma_device_class,) = .rx_redirect_to_node = rdma_set_interface_next_node, .tx_function_n_errors = RDMA_TX_N_ERROR, .tx_function_error_strings = rdma_tx_func_error_strings, + .mac_addr_change_function = rdma_mac_change, }; /* *INDENT-ON* */ @@ -757,7 +802,12 @@ rdma_init (vlib_main_t * vm) return 0; } -VLIB_INIT_FUNCTION (rdma_init); +/* *INDENT-OFF* */ +VLIB_INIT_FUNCTION (rdma_init) = +{ + .runs_after = VLIB_INITS ("pci_bus_init"), +}; +/* *INDENT-ON* */ /* * fd.io coding-style-patch-verification: ON