From a42c41be4eed3e1ce2a42038b07ce1d3420891cd Mon Sep 17 00:00:00 2001 From: =?utf8?q?Beno=C3=AEt=20Ganne?= Date: Thu, 29 Apr 2021 18:24:24 +0200 Subject: [PATCH] af_xdp: workaround kernel race between poll() and sendmsg() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Prior to Linux 5.6 there is a race condition between poll() and sendmsg() in the kernel. This patch protects the syscalls with a lock to prevent it, unless the NO_SYSCALL_LOCK flag is set at create time. See https://lore.kernel.org/bpf/BYAPR11MB365382C5DB1E5FCC53242609C1549@BYAPR11MB3653.namprd11.prod.outlook.com/ Type: fix Change-Id: Ie7d4f5cb41f697b11a09b6046e54d190430d76df Signed-off-by: Benoît Ganne --- src/plugins/af_xdp/af_xdp.api | 10 ++- src/plugins/af_xdp/af_xdp.h | 30 +++++++-- src/plugins/af_xdp/af_xdp_doc.md | 28 +++++++- src/plugins/af_xdp/api.c | 12 ++++ src/plugins/af_xdp/cli.c | 5 +- src/plugins/af_xdp/device.c | 142 +++++++++++++++++++++++++++------------ src/plugins/af_xdp/input.c | 35 +++++----- src/plugins/af_xdp/output.c | 33 +++++---- src/plugins/af_xdp/test_api.c | 2 + src/plugins/af_xdp/unformat.c | 2 + 10 files changed, 213 insertions(+), 86 deletions(-) diff --git a/src/plugins/af_xdp/af_xdp.api b/src/plugins/af_xdp/af_xdp.api index 14f51d87d6a..c6716123703 100644 --- a/src/plugins/af_xdp/af_xdp.api +++ b/src/plugins/af_xdp/af_xdp.api @@ -15,7 +15,7 @@ *------------------------------------------------------------------ */ -option version = "0.1.0"; +option version = "0.2.0"; import "vnet/interface_types.api"; enum af_xdp_mode @@ -25,6 +25,10 @@ enum af_xdp_mode AF_XDP_API_MODE_ZERO_COPY = 2, }; +enumflag af_xdp_flag : u8 +{ + AF_XDP_API_FLAGS_NO_SYSCALL_LOCK = 1, +}; /** \brief @param client_index - opaque cookie to identify the sender @@ -35,6 +39,7 @@ enum af_xdp_mode @param rxq_size - receive queue size (optional) @param txq_size - transmit queue size (optional) @param mode - operation mode (optional) + @param flags - flags (optional) @param prog - eBPF program path (optional) */ @@ -49,8 +54,9 @@ define af_xdp_create u16 rxq_size [default=0]; u16 txq_size [default=0]; vl_api_af_xdp_mode_t mode [default=0]; + vl_api_af_xdp_flag_t flags [default=0]; string prog[256]; - option vat_help = " [name ifname] [rx-queue-size size] [tx-queue-size size] [num-rx-queues ] [prog pathname] [zero-copy|no-zero-copy]"; + option vat_help = " [name ifname] [rx-queue-size size] [tx-queue-size size] [num-rx-queues ] [prog pathname] [zero-copy|no-zero-copy] [no-syscall-lock]"; option status="in_progress"; }; diff --git a/src/plugins/af_xdp/af_xdp.h b/src/plugins/af_xdp/af_xdp.h index 568380baa56..91895ced23b 100644 --- a/src/plugins/af_xdp/af_xdp.h +++ b/src/plugins/af_xdp/af_xdp.h @@ -32,7 +32,8 @@ _ (1, ERROR, "error") \ _ (2, ADMIN_UP, "admin-up") \ _ (3, LINK_UP, "link-up") \ - _ (4, ZEROCOPY, "zero-copy") + _ (4, ZEROCOPY, "zero-copy") \ + _ (5, SYSCALL_LOCK, "syscall-lock") enum { @@ -49,12 +50,20 @@ enum clib_error_free(err_); \ } +typedef enum +{ + AF_XDP_RXQ_MODE_UNKNOWN, + AF_XDP_RXQ_MODE_POLLING, + AF_XDP_RXQ_MODE_INTERRUPT, +} __clib_packed af_xdp_rxq_mode_t; + typedef struct { CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); /* fields below are accessed in data-plane (hot) */ + clib_spinlock_t syscall_lock; struct xsk_ring_cons rx; struct xsk_ring_prod fq; int xsk_fd; @@ -63,7 +72,7 @@ typedef struct uword file_index; u32 queue_index; - u8 is_polling; + af_xdp_rxq_mode_t mode; } af_xdp_rxq_t; typedef struct @@ -73,6 +82,7 @@ typedef struct /* fields below are accessed in data-plane (hot) */ clib_spinlock_t lock; + clib_spinlock_t syscall_lock; struct xsk_ring_prod tx; struct xsk_ring_cons cq; int xsk_fd; @@ -101,6 +111,8 @@ typedef struct u32 dev_instance; u8 hwaddr[6]; + u8 rxq_num; + struct xsk_umem **umem; struct xsk_socket **xsk; @@ -127,12 +139,18 @@ typedef enum AF_XDP_MODE_ZERO_COPY = 2, } af_xdp_mode_t; +typedef enum +{ + AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK = 1, +} af_xdp_create_flag_t; + typedef struct { char *linux_ifname; char *name; char *prog; af_xdp_mode_t mode; + af_xdp_create_flag_t flags; u32 rxq_size; u32 txq_size; u32 rxq_num; @@ -163,10 +181,10 @@ typedef struct u32 hw_if_index; } af_xdp_input_trace_t; -#define foreach_af_xdp_tx_func_error \ -_(NO_FREE_SLOTS, "no free tx slots") \ -_(SENDTO_REQUIRED, "sendto required") \ -_(SENDTO_FAILURES, "sendto failures") +#define foreach_af_xdp_tx_func_error \ + _ (NO_FREE_SLOTS, "no free tx slots") \ + _ (SYSCALL_REQUIRED, "syscall required") \ + _ (SYSCALL_FAILURES, "syscall failures") typedef enum { diff --git a/src/plugins/af_xdp/af_xdp_doc.md b/src/plugins/af_xdp/af_xdp_doc.md index 7d83d712918..f5859dbb901 100644 --- a/src/plugins/af_xdp/af_xdp_doc.md +++ b/src/plugins/af_xdp/af_xdp_doc.md @@ -12,11 +12,15 @@ Under development: it should work, but has not been thoroughly tested. - custom eBPF program - polling, interrupt and adaptive mode -## Limitations +## Known limitations + +### MTU Because of AF_XDP restrictions, the MTU is limited to below PAGE_SIZE (4096-bytes on most systems) minus 256-bytes, and they are additional limitations depending upon specific Linux device drivers. As a rule of thumb, a MTU of 3000-bytes or less should be safe. + +### Number of buffers Furthermore, upon UMEM creation, the kernel allocates a physically-contiguous structure, whose size is proportional to the number of 4KB pages contained in the UMEM. That allocation might fail when @@ -25,7 +29,29 @@ controlled with the `buffers { buffers-per-numa }` configuration option. Finally, note that because of this limitation, this plugin is unlikely to be compatible with the use of 1GB hugepages. +### Interrupt mode +Interrupt and adaptive mode are supported but is limited by default to single +threaded (no worker) configurations because of a kernel limitation prior to +5.6. You can bypass the limitation at interface creation time by adding the +`no-syscall-lock` parameter, but you must be sure that your kernel can +support it, otherwise you will experience double-frees. +See +https://lore.kernel.org/bpf/BYAPR11MB365382C5DB1E5FCC53242609C1549@BYAPR11MB3653.namprd11.prod.outlook.com/ +for more details. + +### Mellanox +When setting the number of queues on Mellanox NIC with `ethtool -L`, you must +use twice the amount of configured queues: it looks like the Linux driver will +create separate RX queues and TX queues (but all queues can be used for both +RX and TX, the NIC will just not sent any packet on "pure" TX queues. +Confused? So I am.). For example if you set `combined 2` you will effectively +have to create 4 rx queues in AF_XDP if you want to be sure to receive all +packets. + ## Requirements +This drivers supports Linux kernel 5.4 and later. Kernels older than 5.4 are +missing unaligned buffers support. + The Linux kernel interface must be up and have enough queues before creating the VPP AF_XDP interface, otherwise Linux will deny creating the AF_XDP socket. diff --git a/src/plugins/af_xdp/api.c b/src/plugins/af_xdp/api.c index 7592aa4ffda..1864c4c2ee9 100644 --- a/src/plugins/af_xdp/api.c +++ b/src/plugins/af_xdp/api.c @@ -44,6 +44,17 @@ af_xdp_api_mode (vl_api_af_xdp_mode_t mode) return AF_XDP_MODE_AUTO; } +static af_xdp_create_flag_t +af_xdp_api_flags (vl_api_af_xdp_flag_t flags) +{ + af_xdp_create_flag_t cflags = 0; + + if (flags & AF_XDP_API_FLAGS_NO_SYSCALL_LOCK) + cflags |= AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK; + + return cflags; +} + static void vl_api_af_xdp_create_t_handler (vl_api_af_xdp_create_t * mp) { @@ -59,6 +70,7 @@ vl_api_af_xdp_create_t_handler (vl_api_af_xdp_create_t * mp) args.name = mp->name[0] ? (char *) mp->name : 0; args.prog = mp->prog[0] ? (char *) mp->prog : 0; args.mode = af_xdp_api_mode (mp->mode); + args.flags = af_xdp_api_flags (mp->flags); args.rxq_size = ntohs (mp->rxq_size); args.txq_size = ntohs (mp->txq_size); args.rxq_num = ntohs (mp->rxq_num); diff --git a/src/plugins/af_xdp/cli.c b/src/plugins/af_xdp/cli.c index d5f21d4c391..2f3deffaaee 100644 --- a/src/plugins/af_xdp/cli.c +++ b/src/plugins/af_xdp/cli.c @@ -47,7 +47,10 @@ af_xdp_create_command_fn (vlib_main_t * vm, unformat_input_t * input, /* *INDENT-OFF* */ VLIB_CLI_COMMAND (af_xdp_create_command, static) = { .path = "create interface af_xdp", - .short_help = "create interface af_xdp [name ifname] [rx-queue-size size] [tx-queue-size size] [num-rx-queues ] [prog pathname] [zero-copy|no-zero-copy]", + .short_help = + "create interface af_xdp [name ifname] " + "[rx-queue-size size] [tx-queue-size size] [num-rx-queues ] " + "[prog pathname] [zero-copy|no-zero-copy] [no-syscall-lock]", .function = af_xdp_create_command_fn, }; /* *INDENT-ON* */ diff --git a/src/plugins/af_xdp/device.c b/src/plugins/af_xdp/device.c index 35ba617d6e3..5bc7e308173 100644 --- a/src/plugins/af_xdp/device.c +++ b/src/plugins/af_xdp/device.c @@ -93,8 +93,7 @@ af_xdp_delete_if (vlib_main_t * vm, af_xdp_device_t * ad) af_xdp_main_t *axm = &af_xdp_main; struct xsk_socket **xsk; struct xsk_umem **umem; - af_xdp_rxq_t *rxq; - af_xdp_txq_t *txq; + int i; if (ad->hw_if_index) { @@ -102,11 +101,17 @@ af_xdp_delete_if (vlib_main_t * vm, af_xdp_device_t * ad) ethernet_delete_interface (vnm, ad->hw_if_index); } - vec_foreach (rxq, ad->rxqs) clib_file_del_by_index (&file_main, - rxq->file_index); - vec_foreach (txq, ad->txqs) clib_spinlock_free (&txq->lock); - vec_foreach (xsk, ad->xsk) xsk_socket__delete (*xsk); - vec_foreach (umem, ad->umem) xsk_umem__delete (*umem); + for (i = 0; i < ad->rxq_num; i++) + clib_file_del_by_index (&file_main, vec_elt (ad->rxqs, i).file_index); + + for (i = 0; i < ad->txq_num; i++) + clib_spinlock_free (&vec_elt (ad->txqs, i).lock); + + vec_foreach (xsk, ad->xsk) + xsk_socket__delete (*xsk); + + vec_foreach (umem, ad->umem) + xsk_umem__delete (*umem); if (ad->bpf_obj) { @@ -169,8 +174,8 @@ err0: } static int -af_xdp_create_queue (vlib_main_t * vm, af_xdp_create_if_args_t * args, - af_xdp_device_t * ad, int qid, int rxq_num, int txq_num) +af_xdp_create_queue (vlib_main_t *vm, af_xdp_create_if_args_t *args, + af_xdp_device_t *ad, int qid) { struct xsk_umem **umem; struct xsk_socket **xsk; @@ -180,6 +185,8 @@ af_xdp_create_queue (vlib_main_t * vm, af_xdp_create_if_args_t * args, struct xsk_socket_config sock_config; struct xdp_options opt; socklen_t optlen; + const int is_rx = qid < ad->rxq_num; + const int is_tx = qid < ad->txq_num; vec_validate_aligned (ad->umem, qid, CLIB_CACHE_LINE_BYTES); umem = vec_elt_at_index (ad->umem, qid); @@ -197,9 +204,9 @@ af_xdp_create_queue (vlib_main_t * vm, af_xdp_create_if_args_t * args, * fq and cq must always be allocated even if unused * whereas rx and tx indicates whether we want rxq, txq, or both */ - struct xsk_ring_cons *rx = qid < rxq_num ? &rxq->rx : 0; + struct xsk_ring_cons *rx = is_rx ? &rxq->rx : 0; struct xsk_ring_prod *fq = &rxq->fq; - struct xsk_ring_prod *tx = qid < txq_num ? &txq->tx : 0; + struct xsk_ring_prod *tx = is_tx ? &txq->tx : 0; struct xsk_ring_cons *cq = &txq->cq; int fd; @@ -268,8 +275,32 @@ af_xdp_create_queue (vlib_main_t * vm, af_xdp_create_if_args_t * args, if (opt.flags & XDP_OPTIONS_ZEROCOPY) ad->flags |= AF_XDP_DEVICE_F_ZEROCOPY; - rxq->xsk_fd = qid < rxq_num ? fd : -1; - txq->xsk_fd = qid < txq_num ? fd : -1; + rxq->xsk_fd = is_rx ? fd : -1; + + if (is_tx) + { + txq->xsk_fd = fd; + if (is_rx && (ad->flags & AF_XDP_DEVICE_F_SYSCALL_LOCK)) + { + /* This is a shared rx+tx queue and we need to lock before syscalls. + * Prior to Linux 5.6 there is a race condition preventing to call + * poll() and sendto() concurrently on AF_XDP sockets. This was + * fixed with commit 11cc2d21499cabe7e7964389634ed1de3ee91d33 + * to workaround this issue, we protect the syscalls with a + * spinlock. Note that it also prevents to use interrupt mode in + * multi workers setup, because in this case the poll() is done in + * the framework w/o any possibility to protect it. + * See + * https://lore.kernel.org/bpf/BYAPR11MB365382C5DB1E5FCC53242609C1549@BYAPR11MB3653.namprd11.prod.outlook.com/ + */ + clib_spinlock_init (&rxq->syscall_lock); + txq->syscall_lock = rxq->syscall_lock; + } + } + else + { + txq->xsk_fd = -1; + } return 0; @@ -308,19 +339,37 @@ af_xdp_device_rxq_read_ready (clib_file_t * f) return 0; } -static void -af_xdp_device_set_rxq_mode (af_xdp_rxq_t *rxq, int is_polling) +static clib_error_t * +af_xdp_device_set_rxq_mode (const af_xdp_device_t *ad, af_xdp_rxq_t *rxq, + const af_xdp_rxq_mode_t mode) { clib_file_main_t *fm = &file_main; + clib_file_update_type_t update; clib_file_t *f; - if (rxq->is_polling == is_polling) - return; + if (rxq->mode == mode) + return 0; + + switch (mode) + { + case AF_XDP_RXQ_MODE_POLLING: + update = UNIX_FILE_UPDATE_DELETE; + break; + case AF_XDP_RXQ_MODE_INTERRUPT: + if (ad->flags & AF_XDP_DEVICE_F_SYSCALL_LOCK) + return clib_error_create ( + "kernel workaround incompatible with interrupt mode"); + update = UNIX_FILE_UPDATE_ADD; + break; + default: + ASSERT (0); + return clib_error_create ("unknown rxq mode %i", mode); + } f = clib_file_get (fm, rxq->file_index); - fm->file_update (f, is_polling ? UNIX_FILE_UPDATE_DELETE : - UNIX_FILE_UPDATE_ADD); - rxq->is_polling = !!is_polling; + fm->file_update (f, update); + rxq->mode = mode; + return 0; } void @@ -361,6 +410,10 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args) pool_get_zero (am->devices, ad); + if (tm->n_vlib_mains > 1 && + 0 == (args->flags & AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK)) + ad->flags |= AF_XDP_DEVICE_F_SYSCALL_LOCK; + ad->linux_ifname = (char *) format (0, "%s", args->linux_ifname); vec_validate (ad->linux_ifname, IFNAMSIZ - 1); /* libbpf expects ifname to be at least IFNAMSIZ */ @@ -368,10 +421,11 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args) goto err1; q_num = clib_max (rxq_num, txq_num); + ad->rxq_num = rxq_num; ad->txq_num = txq_num; for (i = 0; i < q_num; i++) { - if (af_xdp_create_queue (vm, args, ad, i, rxq_num, txq_num)) + if (af_xdp_create_queue (vm, args, ad, i)) { /* * queue creation failed @@ -387,15 +441,21 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args) vec_set_len (ad->rxqs, i); vec_set_len (ad->txqs, i); + ad->rxq_num = clib_min (i, rxq_num); + ad->txq_num = clib_min (i, txq_num); + if (i < rxq_num && AF_XDP_NUM_RX_QUEUES_ALL != rxq_num) - goto err1; /* failed creating requested rxq: fatal error, bailing out */ + { + ad->rxq_num = ad->txq_num = 0; + goto err1; /* failed creating requested rxq: fatal error, bailing + out */ + } if (i < txq_num) { /* we created less txq than threads not an error but initialize lock for shared txq */ - af_xdp_txq_t *txq; - ad->txq_num = i; - vec_foreach (txq, ad->txqs) clib_spinlock_init (&txq->lock); + for (i = 0; i < ad->txq_num; i++) + clib_spinlock_init (&vec_elt (ad->txqs, i).lock); } args->rv = 0; @@ -436,7 +496,7 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args) vnet_hw_if_set_input_node (vnm, ad->hw_if_index, af_xdp_input_node.index); - for (i = 0; i < vec_len (ad->rxqs); i++) + for (i = 0; i < ad->rxq_num; i++) { af_xdp_rxq_t *rxq = vec_elt_at_index (ad->rxqs, i); rxq->queue_index = vnet_hw_if_register_rx_queue ( @@ -445,7 +505,6 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args) ad->dev_instance, i); clib_file_t f = { .file_descriptor = rxq->xsk_fd, - .flags = UNIX_FILE_EVENT_EDGE_TRIGGERED, .private_data = rxq->queue_index, .read_function = af_xdp_device_rxq_read_ready, .description = desc, @@ -453,7 +512,8 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args) rxq->file_index = clib_file_add (&file_main, &f); vnet_hw_if_set_rx_queue_file_index (vnm, rxq->queue_index, rxq->file_index); - af_xdp_device_set_rxq_mode (rxq, 1 /* polling */); + if (af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_POLLING)) + goto err1; } vnet_hw_if_update_runtime_data (vnm, ad->hw_if_index); @@ -510,24 +570,20 @@ af_xdp_interface_rx_mode_change (vnet_main_t *vnm, u32 hw_if_index, u32 qid, switch (mode) { - case VNET_HW_IF_RX_MODE_UNKNOWN: - case VNET_HW_IF_NUM_RX_MODES: /* fallthrough */ + default: /* fallthrough */ + case VNET_HW_IF_RX_MODE_UNKNOWN: /* fallthrough */ + case VNET_HW_IF_NUM_RX_MODES: return clib_error_create ("uknown rx mode - doing nothing"); - case VNET_HW_IF_RX_MODE_DEFAULT: - case VNET_HW_IF_RX_MODE_POLLING: /* fallthrough */ - if (rxq->is_polling) - break; - af_xdp_device_set_rxq_mode (rxq, 1 /* polling */); - break; - case VNET_HW_IF_RX_MODE_INTERRUPT: - case VNET_HW_IF_RX_MODE_ADAPTIVE: /* fallthrough */ - if (0 == rxq->is_polling) - break; - af_xdp_device_set_rxq_mode (rxq, 0 /* interrupt */); - break; + case VNET_HW_IF_RX_MODE_DEFAULT: /* fallthrough */ + case VNET_HW_IF_RX_MODE_POLLING: + return af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_POLLING); + case VNET_HW_IF_RX_MODE_INTERRUPT: /* fallthrough */ + case VNET_HW_IF_RX_MODE_ADAPTIVE: + return af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_INTERRUPT); } - return 0; + ASSERT (0 && "unreachable"); + return clib_error_create ("unreachable"); } static void diff --git a/src/plugins/af_xdp/input.c b/src/plugins/af_xdp/input.c index da422bdccaf..dcbf5a4b587 100644 --- a/src/plugins/af_xdp/input.c +++ b/src/plugins/af_xdp/input.c @@ -24,9 +24,9 @@ #include #include "af_xdp.h" -#define foreach_af_xdp_input_error \ - _(POLL_REQUIRED, "poll required") \ - _(POLL_FAILURES, "poll failures") +#define foreach_af_xdp_input_error \ + _ (SYSCALL_REQUIRED, "syscall required") \ + _ (SYSCALL_FAILURES, "syscall failures") typedef enum { @@ -77,25 +77,28 @@ af_xdp_device_input_refill_db (vlib_main_t * vm, af_xdp_device_t * ad, af_xdp_rxq_t * rxq, const u32 n_alloc) { - int ret; - xsk_ring_prod__submit (&rxq->fq, n_alloc); - if (!xsk_ring_prod__needs_wakeup (&rxq->fq)) + if (AF_XDP_RXQ_MODE_INTERRUPT == rxq->mode || + !xsk_ring_prod__needs_wakeup (&rxq->fq)) return; - vlib_error_count (vm, node->node_index, AF_XDP_INPUT_ERROR_POLL_REQUIRED, + vlib_error_count (vm, node->node_index, AF_XDP_INPUT_ERROR_SYSCALL_REQUIRED, 1); - struct pollfd fd = {.fd = rxq->xsk_fd,.events = POLLIN }; - ret = poll (&fd, 1, 0); - if (PREDICT_TRUE (ret >= 0)) - return; - - /* something bad is happening */ - vlib_error_count (vm, node->node_index, AF_XDP_INPUT_ERROR_POLL_FAILURES, - 1); - af_xdp_device_error (ad, "poll() failed"); + if (clib_spinlock_trylock_if_init (&rxq->syscall_lock)) + { + struct pollfd fd = { .fd = rxq->xsk_fd, .events = POLLIN | POLLOUT }; + int ret = poll (&fd, 1, 0); + clib_spinlock_unlock_if_init (&rxq->syscall_lock); + if (PREDICT_FALSE (ret < 0)) + { + /* something bad is happening */ + vlib_error_count (vm, node->node_index, + AF_XDP_INPUT_ERROR_SYSCALL_FAILURES, 1); + af_xdp_device_error (ad, "rx poll() failed"); + } + } } static_always_inline void diff --git a/src/plugins/af_xdp/output.c b/src/plugins/af_xdp/output.c index 52c34e00d95..51a56ed866d 100644 --- a/src/plugins/af_xdp/output.c +++ b/src/plugins/af_xdp/output.c @@ -1,4 +1,4 @@ -#include +#include #include #include #include @@ -90,31 +90,29 @@ af_xdp_device_output_tx_db (vlib_main_t * vm, af_xdp_device_t * ad, af_xdp_txq_t * txq, const u32 n_tx) { - int ret; - xsk_ring_prod__submit (&txq->tx, n_tx); if (!xsk_ring_prod__needs_wakeup (&txq->tx)) return; - vlib_error_count (vm, node->node_index, AF_XDP_TX_ERROR_SENDTO_REQUIRED, 1); + vlib_error_count (vm, node->node_index, AF_XDP_TX_ERROR_SYSCALL_REQUIRED, 1); - ret = sendto (txq->xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0); - if (PREDICT_TRUE (ret >= 0)) - return; + clib_spinlock_lock_if_init (&txq->syscall_lock); - /* those errors are fine */ - switch (errno) + if (xsk_ring_prod__needs_wakeup (&txq->tx)) { - case ENOBUFS: - case EAGAIN: - case EBUSY: - return; + struct pollfd fd = { .fd = txq->xsk_fd, .events = POLLIN | POLLOUT }; + int ret = poll (&fd, 1, 0); + if (PREDICT_FALSE (ret < 0)) + { + /* something bad is happening */ + vlib_error_count (vm, node->node_index, + AF_XDP_TX_ERROR_SYSCALL_FAILURES, 1); + af_xdp_device_error (ad, "tx poll() failed"); + } } - /* something bad is happening */ - vlib_error_count (vm, node->node_index, AF_XDP_TX_ERROR_SENDTO_FAILURES, 1); - af_xdp_device_error (ad, "sendto() failed"); + clib_spinlock_unlock_if_init (&txq->syscall_lock); } static_always_inline u32 @@ -218,7 +216,8 @@ VNET_DEVICE_CLASS_TX_FN (af_xdp_device_class) (vlib_main_t * vm, vnet_interface_output_runtime_t *ord = (void *) node->runtime_data; af_xdp_device_t *ad = pool_elt_at_index (rm->devices, ord->dev_instance); u32 thread_index = vm->thread_index; - af_xdp_txq_t *txq = vec_elt_at_index (ad->txqs, thread_index % ad->txq_num); + af_xdp_txq_t *txq = + vec_elt_at_index (ad->txqs, (thread_index - 1) % ad->txq_num); u32 *from; u32 n, n_tx; int i; diff --git a/src/plugins/af_xdp/test_api.c b/src/plugins/af_xdp/test_api.c index 270db4e2973..6dffa29bdd1 100644 --- a/src/plugins/af_xdp/test_api.c +++ b/src/plugins/af_xdp/test_api.c @@ -81,6 +81,8 @@ api_af_xdp_create (vat_main_t * vam) mp->rxq_size = clib_host_to_net_u16 (args.rxq_size); mp->txq_size = clib_host_to_net_u16 (args.txq_size); mp->mode = api_af_xdp_mode (args.mode); + if (args.flags & AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK) + mp->flags |= AF_XDP_API_FLAGS_NO_SYSCALL_LOCK; snprintf ((char *) mp->prog, sizeof (mp->prog), "%s", args.prog ? : ""); S (mp); diff --git a/src/plugins/af_xdp/unformat.c b/src/plugins/af_xdp/unformat.c index b2292464112..bb4c3048d23 100644 --- a/src/plugins/af_xdp/unformat.c +++ b/src/plugins/af_xdp/unformat.c @@ -50,6 +50,8 @@ unformat_af_xdp_create_if_args (unformat_input_t * input, va_list * vargs) args->mode = AF_XDP_MODE_COPY; else if (unformat (line_input, "zero-copy")) args->mode = AF_XDP_MODE_ZERO_COPY; + else if (unformat (line_input, "no-syscall-lock")) + args->flags |= AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK; else { /* return failure on unknown input */ -- 2.16.6