X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fplugins%2Faf_xdp%2Fdevice.c;h=30d3c9571e68b3fbbb069b4a1b07004332447d63;hb=1ae9819908ca2ba78c6fa0c0051bb1d4b2066be8;hp=48e61dd03d16cb203ee85a159293549a483b4b17;hpb=b85b0df2a039b694fb2f3c09a01decfb89d7bce2;p=vpp.git

diff --git a/src/plugins/af_xdp/device.c b/src/plugins/af_xdp/device.c
index 48e61dd03d1..30d3c9571e6 100644
--- a/src/plugins/af_xdp/device.c
+++ b/src/plugins/af_xdp/device.c
@@ -17,7 +17,10 @@
 
 #include <...>
 #include <...>
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
 #include <...>
+#include <...>
 #include <...>
 #include <...>
 #include <...>
@@ -93,8 +96,7 @@ af_xdp_delete_if (vlib_main_t * vm, af_xdp_device_t * ad)
   af_xdp_main_t *axm = &af_xdp_main;
   struct xsk_socket **xsk;
   struct xsk_umem **umem;
-  af_xdp_rxq_t *rxq;
-  af_xdp_txq_t *txq;
+  int i;
 
   if (ad->hw_if_index)
     {
@@ -102,11 +104,17 @@ af_xdp_delete_if (vlib_main_t * vm, af_xdp_device_t * ad)
       ethernet_delete_interface (vnm, ad->hw_if_index);
     }
 
-  vec_foreach (rxq, ad->rxqs) clib_file_del_by_index (&file_main,
-						      rxq->file_index);
-  vec_foreach (txq, ad->txqs) clib_spinlock_free (&txq->lock);
-  vec_foreach (xsk, ad->xsk) xsk_socket__delete (*xsk);
-  vec_foreach (umem, ad->umem) xsk_umem__delete (*umem);
+  for (i = 0; i < ad->txq_num; i++)
+    clib_spinlock_free (&vec_elt (ad->txqs, i).lock);
+
+  vec_foreach (xsk, ad->xsk)
+    xsk_socket__delete (*xsk);
+
+  vec_foreach (umem, ad->umem)
+    xsk_umem__delete (*umem);
+
+  for (i = 0; i < ad->rxq_num; i++)
+    clib_file_del_by_index (&file_main, vec_elt (ad->rxqs, i).file_index);
 
   if (ad->bpf_obj)
     {
@@ -169,8 +177,8 @@ err0:
 }
 
 static int
-af_xdp_create_queue (vlib_main_t * vm, af_xdp_create_if_args_t * args,
-		     af_xdp_device_t * ad, int qid, int rxq_num, int txq_num)
+af_xdp_create_queue (vlib_main_t *vm, af_xdp_create_if_args_t *args,
+		     af_xdp_device_t *ad, int qid)
 {
   struct xsk_umem **umem;
   struct xsk_socket **xsk;
@@ -180,26 +188,21 @@ af_xdp_create_queue (vlib_main_t * vm, af_xdp_create_if_args_t * args,
   struct xsk_socket_config sock_config;
   struct xdp_options opt;
   socklen_t optlen;
+  const int is_rx = qid < ad->rxq_num;
+  const int is_tx = qid < ad->txq_num;
 
-  vec_validate_aligned (ad->umem, qid, CLIB_CACHE_LINE_BYTES);
   umem = vec_elt_at_index (ad->umem, qid);
-
-  vec_validate_aligned (ad->xsk, qid, CLIB_CACHE_LINE_BYTES);
   xsk = vec_elt_at_index (ad->xsk, qid);
-
-  vec_validate_aligned (ad->rxqs, qid, CLIB_CACHE_LINE_BYTES);
   rxq = vec_elt_at_index (ad->rxqs, qid);
-
-  vec_validate_aligned (ad->txqs, qid, CLIB_CACHE_LINE_BYTES);
   txq = vec_elt_at_index (ad->txqs, qid);
 
   /*
    * fq and cq must always be allocated even if unused
    * whereas rx and tx indicates whether we want rxq, txq, or both
    */
-  struct xsk_ring_cons *rx = qid < rxq_num ? &rxq->rx : 0;
+  struct xsk_ring_cons *rx = is_rx ? &rxq->rx : 0;
   struct xsk_ring_prod *fq = &rxq->fq;
-  struct xsk_ring_prod *tx = qid < txq_num ? &txq->tx : 0;
+  struct xsk_ring_prod *tx = is_tx ? &txq->tx : 0;
   struct xsk_ring_cons *cq = &txq->cq;
   int fd;
 
@@ -208,6 +211,7 @@ af_xdp_create_queue (vlib_main_t * vm, af_xdp_create_if_args_t * args,
   umem_config.comp_size = args->txq_size;
   umem_config.frame_size =
     sizeof (vlib_buffer_t) + vlib_buffer_get_default_data_size (vm);
+  umem_config.frame_headroom = sizeof (vlib_buffer_t);
   umem_config.flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG;
   if (xsk_umem__create (umem,
			 uword_to_pointer (vm->buffer_main->buffer_mem_start, void *),
@@ -256,8 +260,32 @@ af_xdp_create_queue (vlib_main_t * vm, af_xdp_create_if_args_t * args,
   if (opt.flags & XDP_OPTIONS_ZEROCOPY)
     ad->flags |= AF_XDP_DEVICE_F_ZEROCOPY;
 
-  rxq->xsk_fd = qid < rxq_num ? fd : -1;
-  txq->xsk_fd = qid < txq_num ? fd : -1;
+  rxq->xsk_fd = is_rx ? fd : -1;
+
+  if (is_tx)
+    {
+      txq->xsk_fd = fd;
+      if (is_rx && (ad->flags & AF_XDP_DEVICE_F_SYSCALL_LOCK))
+	{
+	  /* This is a shared rx+tx queue and we need to lock before
+	   * syscalls. Prior to Linux 5.6 there is a race condition that
+	   * prevents calling poll() and sendto() concurrently on AF_XDP
+	   * sockets. This was fixed with commit
+	   * 11cc2d21499cabe7e7964389634ed1de3ee91d33. To work around the
+	   * issue on older kernels, we protect the syscalls with a
+	   * spinlock. Note that this also prevents the use of interrupt
+	   * mode in multi-worker setups, because in that case poll() is
+	   * done by the framework w/o any possibility to protect it.
+	   * See
+	   * https://lore.kernel.org/bpf/BYAPR11MB365382C5DB1E5FCC53242609C1549@BYAPR11MB3653.namprd11.prod.outlook.com/
+	   */
+	  clib_spinlock_init (&rxq->syscall_lock);
+	  txq->syscall_lock = rxq->syscall_lock;
+	}
+    }
+  else
+    {
+      txq->xsk_fd = -1;
+    }
 
   return 0;
 
@@ -289,6 +317,31 @@ af_xdp_get_numa (const char *ifname)
   return numa;
 }
 
+static void
+af_xdp_get_q_count (const char *ifname, int *rxq_num, int *txq_num)
+{
+  struct ethtool_channels ec = { .cmd = ETHTOOL_GCHANNELS };
+  struct ifreq ifr = { .ifr_data = (void *) &ec };
+  int fd, err;
+
+  *rxq_num = *txq_num = 1;
+
+  fd = socket (AF_INET, SOCK_DGRAM, 0);
+  if (fd < 0)
+    return;
+
+  snprintf (ifr.ifr_name, sizeof (ifr.ifr_name), "%s", ifname);
+  err = ioctl (fd, SIOCETHTOOL, &ifr);
+
+  close (fd);
+
+  if (err)
+    return;
+
+  *rxq_num = clib_max (ec.combined_count, ec.rx_count);
+  *txq_num = clib_max (ec.combined_count, ec.tx_count);
+}
+
 static clib_error_t *
 af_xdp_device_rxq_read_ready (clib_file_t * f)
 {
@@ -296,19 +349,37 @@ af_xdp_device_rxq_read_ready (clib_file_t * f)
   return 0;
 }
 
-static void
-af_xdp_device_set_rxq_mode (af_xdp_rxq_t *rxq, int is_polling)
+static clib_error_t *
+af_xdp_device_set_rxq_mode (const af_xdp_device_t *ad, af_xdp_rxq_t *rxq,
+			    const af_xdp_rxq_mode_t mode)
 {
   clib_file_main_t *fm = &file_main;
+  clib_file_update_type_t update;
   clib_file_t *f;
 
-  if (rxq->is_polling == is_polling)
-    return;
+  if (rxq->mode == mode)
+    return 0;
+
+  switch (mode)
+    {
+    case AF_XDP_RXQ_MODE_POLLING:
+      update = UNIX_FILE_UPDATE_DELETE;
+      break;
+    case AF_XDP_RXQ_MODE_INTERRUPT:
+      if (ad->flags & AF_XDP_DEVICE_F_SYSCALL_LOCK)
+	return clib_error_create (
+	  "kernel workaround incompatible with interrupt mode");
+      update = UNIX_FILE_UPDATE_ADD;
+      break;
+    default:
+      ASSERT (0);
+      return clib_error_create ("unknown rxq mode %i", mode);
+    }
 
   f = clib_file_get (fm, rxq->file_index);
-  fm->file_update (f, is_polling ? UNIX_FILE_UPDATE_DELETE :
-		   UNIX_FILE_UPDATE_ADD);
-  rxq->is_polling = !!is_polling;
+  fm->file_update (f, update);
+  rxq->mode = mode;
+  return 0;
 }
 
 void
@@ -325,8 +396,7 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args)
 
   args->rxq_size = args->rxq_size ? args->rxq_size : 2 * VLIB_FRAME_SIZE;
   args->txq_size = args->txq_size ? args->txq_size : 2 * VLIB_FRAME_SIZE;
-  rxq_num = args->rxq_num ? args->rxq_num : 1;
-  txq_num = tm->n_vlib_mains;
+  args->rxq_num = args->rxq_num ? args->rxq_num : 1;
 
   if (!args->linux_ifname)
     {
@@ -347,8 +417,23 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args)
       goto err0;
     }
 
+  af_xdp_get_q_count (args->linux_ifname, &rxq_num, &txq_num);
+  if (args->rxq_num > rxq_num && AF_XDP_NUM_RX_QUEUES_ALL != args->rxq_num)
+    {
+      args->rv = VNET_API_ERROR_INVALID_VALUE;
+      args->error = clib_error_create ("too many rxq requested (%d > %d)",
+				       args->rxq_num, rxq_num);
+      goto err0;
+    }
+  rxq_num = clib_min (rxq_num, args->rxq_num);
+  txq_num = clib_min (txq_num, tm->n_vlib_mains);
+
   pool_get_zero (am->devices, ad);
 
+  if (tm->n_vlib_mains > 1 &&
+      0 == (args->flags & AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK))
+    ad->flags |= AF_XDP_DEVICE_F_SYSCALL_LOCK;
+
   ad->linux_ifname = (char *) format (0, "%s", args->linux_ifname);
   vec_validate (ad->linux_ifname, IFNAMSIZ - 1); /* libbpf expects ifname to be at least IFNAMSIZ */
@@ -356,10 +441,17 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args)
     goto err1;
 
   q_num = clib_max (rxq_num, txq_num);
+  ad->rxq_num = rxq_num;
   ad->txq_num = txq_num;
+
+  vec_validate_aligned (ad->umem, q_num - 1, CLIB_CACHE_LINE_BYTES);
+  vec_validate_aligned (ad->xsk, q_num - 1, CLIB_CACHE_LINE_BYTES);
+  vec_validate_aligned (ad->rxqs, q_num - 1, CLIB_CACHE_LINE_BYTES);
+  vec_validate_aligned (ad->txqs, q_num - 1, CLIB_CACHE_LINE_BYTES);
+
   for (i = 0; i < q_num; i++)
     {
-      if (af_xdp_create_queue (vm, args, ad, i, rxq_num, txq_num))
+      if (af_xdp_create_queue (vm, args, ad, i))
	{
	  /*
	   * queue creation failed
@@ -375,23 +467,30 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args)
 
	  vec_set_len (ad->rxqs, i);
	  vec_set_len (ad->txqs, i);
 
-	  if (i < rxq_num && AF_XDP_NUM_RX_QUEUES_ALL != rxq_num)
-	    goto err1; /* failed creating requested rxq: fatal error, bailing out */
+	  ad->rxq_num = clib_min (i, rxq_num);
+	  ad->txq_num = clib_min (i, txq_num);
 
-	  if (i < txq_num)
+	  if (i < rxq_num && AF_XDP_NUM_RX_QUEUES_ALL != args->rxq_num)
	    {
-	      /* we created less txq than threads not an error but initialize lock for shared txq */
-	      af_xdp_txq_t *txq;
-	      ad->txq_num = i;
-	      vec_foreach (txq, ad->txqs) clib_spinlock_init (&txq->lock);
+	      ad->rxq_num = ad->txq_num = 0;
+	      goto err1; /* failed creating requested rxq: fatal error, bailing
+			    out */
	    }
 
+	  args->rv = 0;
	  clib_error_free (args->error);
	  break;
	}
     }
 
+  if (ad->txq_num < tm->n_vlib_mains)
+    {
+      /* initialize lock for shared txq */
+      for (i = 0; i < ad->txq_num; i++)
+	clib_spinlock_init (&vec_elt (ad->txqs, i).lock);
+    }
+
   ad->dev_instance = ad - am->devices;
   ad->per_interface_next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
   ad->pool =
@@ -420,11 +519,11 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args)
   sw = vnet_get_hw_sw_interface (vnm, ad->hw_if_index);
   hw = vnet_get_hw_interface (vnm, ad->hw_if_index);
   args->sw_if_index = ad->sw_if_index = sw->sw_if_index;
-  hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE;
+  hw->caps |= VNET_HW_INTERFACE_CAP_SUPPORTS_INT_MODE;
 
   vnet_hw_if_set_input_node (vnm, ad->hw_if_index, af_xdp_input_node.index);
 
-  for (i = 0; i < vec_len (ad->rxqs); i++)
+  for (i = 0; i < ad->rxq_num; i++)
     {
       af_xdp_rxq_t *rxq = vec_elt_at_index (ad->rxqs, i);
       rxq->queue_index = vnet_hw_if_register_rx_queue (
@@ -433,7 +532,6 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args)
	 ad->dev_instance, i);
       clib_file_t f = {
	 .file_descriptor = rxq->xsk_fd,
-	 .flags = UNIX_FILE_EVENT_EDGE_TRIGGERED,
	 .private_data = rxq->queue_index,
	 .read_function = af_xdp_device_rxq_read_ready,
	 .description = desc,
@@ -441,7 +539,8 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args)
       rxq->file_index = clib_file_add (&file_main, &f);
       vnet_hw_if_set_rx_queue_file_index (vnm, rxq->queue_index,
					   rxq->file_index);
-      af_xdp_device_set_rxq_mode (rxq, 1 /* polling */);
+      if (af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_POLLING))
+	goto err1;
     }
 
   vnet_hw_if_update_runtime_data (vnm, ad->hw_if_index);
@@ -478,6 +577,7 @@ af_xdp_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
       vnet_hw_interface_set_flags (vnm, ad->hw_if_index,
				    VNET_HW_INTERFACE_FLAG_LINK_UP);
       ad->flags |= AF_XDP_DEVICE_F_ADMIN_UP;
+      af_xdp_device_input_refill (ad);
     }
   else
     {
@@ -498,24 +598,20 @@ af_xdp_interface_rx_mode_change (vnet_main_t *vnm, u32 hw_if_index, u32 qid,
 
   switch (mode)
     {
-    case VNET_HW_IF_RX_MODE_UNKNOWN:
-    case VNET_HW_IF_NUM_RX_MODES: /* fallthrough */
+    default:			       /* fallthrough */
+    case VNET_HW_IF_RX_MODE_UNKNOWN:   /* fallthrough */
+    case VNET_HW_IF_NUM_RX_MODES:
       return clib_error_create ("unknown rx mode - doing nothing");
-    case VNET_HW_IF_RX_MODE_DEFAULT:
-    case VNET_HW_IF_RX_MODE_POLLING: /* fallthrough */
-      if (rxq->is_polling)
-	break;
-      af_xdp_device_set_rxq_mode (rxq, 1 /* polling */);
-      break;
-    case VNET_HW_IF_RX_MODE_INTERRUPT:
-    case VNET_HW_IF_RX_MODE_ADAPTIVE: /* fallthrough */
-      if (0 == rxq->is_polling)
-	break;
-      af_xdp_device_set_rxq_mode (rxq, 0 /* interrupt */);
-      break;
+    case VNET_HW_IF_RX_MODE_DEFAULT:   /* fallthrough */
+    case VNET_HW_IF_RX_MODE_POLLING:
+      return af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_POLLING);
+    case VNET_HW_IF_RX_MODE_INTERRUPT: /* fallthrough */
+    case VNET_HW_IF_RX_MODE_ADAPTIVE:
+      return af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_INTERRUPT);
     }
 
-  return 0;
+  ASSERT (0 && "unreachable");
+  return clib_error_create ("unreachable");
 }
 
 static void
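
Note on the umem layout configured in af_xdp_create_queue() above: with
frame_size = sizeof (vlib_buffer_t) + vlib_buffer_get_default_data_size (vm)
and the new frame_headroom = sizeof (vlib_buffer_t), every umem frame
coincides with one VPP buffer inside buffer_mem_start, and the kernel writes
received packet data past the headroom, i.e. straight into the buffer's data
area. The layout sketch below is inferred from that configuration, not
quoted from the plugin:

    frame start (umem addr)       frame start + frame_headroom
    |                             |
    v                             v
    +-----------------------------+----------------------------------+
    | vlib_buffer_t metadata      | packet data                      |
    | (frame_headroom bytes)      | (default buffer data size bytes) |
    +-----------------------------+----------------------------------+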
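
The syscall_lock shared between an rx+tx queue pair (see the comment added
in af_xdp_create_queue() above) is meant to serialize poll() and sendto() on
the same AF_XDP socket. A minimal sketch of a tx kick under that lock
follows; the helper name af_xdp_kick_tx is hypothetical, the real data-path
code is not part of this diff, and the sketch only assumes the af_xdp_txq_t
fields introduced here plus VPP's clib_spinlock API:

    #include <sys/socket.h>	/* sendto() */
    #include <vppinfra/lock.h>	/* clib_spinlock_{lock,unlock}_if_init() */
    #include "af_xdp.h"		/* af_xdp_txq_t (this plugin) */

    /* Hypothetical helper: wake the kernel tx path of an AF_XDP socket.
     * Pre-5.6 kernels race when poll() and sendto() run concurrently on
     * the same socket, so the wakeup goes through the lock shared with
     * the paired rxq. */
    static_always_inline void
    af_xdp_kick_tx (af_xdp_txq_t *txq)
    {
      /* the *_if_init variants are no-ops when the lock was never
       * created (single worker, or AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK) */
      clib_spinlock_lock_if_init (&txq->syscall_lock);
      sendto (txq->xsk_fd, 0, 0, MSG_DONTWAIT, 0, 0);
      clib_spinlock_unlock_if_init (&txq->syscall_lock);
    }

This is also why af_xdp_device_set_rxq_mode() above refuses interrupt mode
when AF_XDP_DEVICE_F_SYSCALL_LOCK is set: in that mode the framework calls
poll() itself, outside the reach of this lock.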