*------------------------------------------------------------------
*/
-option version = "0.1.0";
+option version = "0.2.0";
import "vnet/interface_types.api";
enum af_xdp_mode
AF_XDP_API_MODE_ZERO_COPY = 2,
};
+enumflag af_xdp_flag : u8
+{
+ AF_XDP_API_FLAGS_NO_SYSCALL_LOCK = 1,
+};
/** \brief
@param client_index - opaque cookie to identify the sender
@param rxq_size - receive queue size (optional)
@param txq_size - transmit queue size (optional)
@param mode - operation mode (optional)
+ @param flags - flags (optional)
@param prog - eBPF program path (optional)
*/
u16 rxq_size [default=0];
u16 txq_size [default=0];
vl_api_af_xdp_mode_t mode [default=0];
+ vl_api_af_xdp_flag_t flags [default=0];
string prog[256];
- option vat_help = "<host-if linux-ifname> [name ifname] [rx-queue-size size] [tx-queue-size size] [num-rx-queues <num|all>] [prog pathname] [zero-copy|no-zero-copy]";
+ option vat_help = "<host-if linux-ifname> [name ifname] [rx-queue-size size] [tx-queue-size size] [num-rx-queues <num|all>] [prog pathname] [zero-copy|no-zero-copy] [no-syscall-lock]";
option status="in_progress";
};
_ (1, ERROR, "error") \
_ (2, ADMIN_UP, "admin-up") \
_ (3, LINK_UP, "link-up") \
- _ (4, ZEROCOPY, "zero-copy")
+ _ (4, ZEROCOPY, "zero-copy") \
+ _ (5, SYSCALL_LOCK, "syscall-lock")
enum
{
clib_error_free(err_); \
}
+typedef enum
+{
+ AF_XDP_RXQ_MODE_UNKNOWN,
+ AF_XDP_RXQ_MODE_POLLING,
+ AF_XDP_RXQ_MODE_INTERRUPT,
+} __clib_packed af_xdp_rxq_mode_t;
+
typedef struct
{
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
/* fields below are accessed in data-plane (hot) */
+ clib_spinlock_t syscall_lock;
struct xsk_ring_cons rx;
struct xsk_ring_prod fq;
int xsk_fd;
uword file_index;
u32 queue_index;
- u8 is_polling;
+ af_xdp_rxq_mode_t mode;
} af_xdp_rxq_t;
typedef struct
/* fields below are accessed in data-plane (hot) */
clib_spinlock_t lock;
+ clib_spinlock_t syscall_lock;
struct xsk_ring_prod tx;
struct xsk_ring_cons cq;
int xsk_fd;
u32 dev_instance;
u8 hwaddr[6];
+ u8 rxq_num;
+
struct xsk_umem **umem;
struct xsk_socket **xsk;
AF_XDP_MODE_ZERO_COPY = 2,
} af_xdp_mode_t;
+typedef enum
+{
+ AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK = 1,
+} af_xdp_create_flag_t;
+
typedef struct
{
char *linux_ifname;
char *name;
char *prog;
af_xdp_mode_t mode;
+ af_xdp_create_flag_t flags;
u32 rxq_size;
u32 txq_size;
u32 rxq_num;
u32 hw_if_index;
} af_xdp_input_trace_t;
-#define foreach_af_xdp_tx_func_error \
-_(NO_FREE_SLOTS, "no free tx slots") \
-_(SENDTO_REQUIRED, "sendto required") \
-_(SENDTO_FAILURES, "sendto failures")
+#define foreach_af_xdp_tx_func_error \
+ _ (NO_FREE_SLOTS, "no free tx slots") \
+ _ (SYSCALL_REQUIRED, "syscall required") \
+ _ (SYSCALL_FAILURES, "syscall failures")
typedef enum
{
- custom eBPF program
- polling, interrupt and adaptive mode
-## Limitations
+## Known limitations
+
+### MTU
Because of AF_XDP restrictions, the MTU is limited to below PAGE_SIZE
(4096-bytes on most systems) minus 256-bytes, and they are additional
limitations depending upon specific Linux device drivers.
As a rule of thumb, a MTU of 3000-bytes or less should be safe.
+
+### Number of buffers
Furthermore, upon UMEM creation, the kernel allocates a
physically-contiguous structure, whose size is proportional to the number
of 4KB pages contained in the UMEM. That allocation might fail when
Finally, note that because of this limitation, this plugin is unlikely
to be compatible with the use of 1GB hugepages.
+### Interrupt mode
+Interrupt and adaptive mode are supported but are limited by default to
+single-threaded (no worker) configurations because of a kernel limitation
+prior to 5.6. You can bypass the limitation at interface creation time by
+adding the `no-syscall-lock` parameter, but you must be sure that your
+kernel can support it, otherwise you will experience double-frees.
+See
+https://lore.kernel.org/bpf/BYAPR11MB365382C5DB1E5FCC53242609C1549@BYAPR11MB3653.namprd11.prod.outlook.com/
+for more details.
+
+### Mellanox
+When setting the number of queues on Mellanox NIC with `ethtool -L`, you must
+use twice the amount of configured queues: it looks like the Linux driver will
+create separate RX queues and TX queues (but all queues can be used for both
+RX and TX, the NIC will just not send any packet on "pure" TX queues.
+Confused? So I am.). For example if you set `combined 2` you will effectively
+have to create 4 rx queues in AF_XDP if you want to be sure to receive all
+packets.
+
## Requirements
+This driver supports Linux kernel 5.4 and later. Kernels older than 5.4 are
+missing unaligned buffers support.
+
The Linux kernel interface must be up and have enough queues before
creating the VPP AF_XDP interface, otherwise Linux will deny creating
the AF_XDP socket.
return AF_XDP_MODE_AUTO;
}
+static af_xdp_create_flag_t
+af_xdp_api_flags (vl_api_af_xdp_flag_t flags)
+{
+ af_xdp_create_flag_t cflags = 0;
+
+ if (flags & AF_XDP_API_FLAGS_NO_SYSCALL_LOCK)
+ cflags |= AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK;
+
+ return cflags;
+}
+
static void
vl_api_af_xdp_create_t_handler (vl_api_af_xdp_create_t * mp)
{
args.name = mp->name[0] ? (char *) mp->name : 0;
args.prog = mp->prog[0] ? (char *) mp->prog : 0;
args.mode = af_xdp_api_mode (mp->mode);
+ args.flags = af_xdp_api_flags (mp->flags);
args.rxq_size = ntohs (mp->rxq_size);
args.txq_size = ntohs (mp->txq_size);
args.rxq_num = ntohs (mp->rxq_num);
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (af_xdp_create_command, static) = {
.path = "create interface af_xdp",
- .short_help = "create interface af_xdp <host-if linux-ifname> [name ifname] [rx-queue-size size] [tx-queue-size size] [num-rx-queues <num|all>] [prog pathname] [zero-copy|no-zero-copy]",
+ .short_help =
+ "create interface af_xdp <host-if linux-ifname> [name ifname] "
+ "[rx-queue-size size] [tx-queue-size size] [num-rx-queues <num|all>] "
+ "[prog pathname] [zero-copy|no-zero-copy] [no-syscall-lock]",
.function = af_xdp_create_command_fn,
};
/* *INDENT-ON* */
af_xdp_main_t *axm = &af_xdp_main;
struct xsk_socket **xsk;
struct xsk_umem **umem;
- af_xdp_rxq_t *rxq;
- af_xdp_txq_t *txq;
+ int i;
if (ad->hw_if_index)
{
ethernet_delete_interface (vnm, ad->hw_if_index);
}
- vec_foreach (rxq, ad->rxqs) clib_file_del_by_index (&file_main,
- rxq->file_index);
- vec_foreach (txq, ad->txqs) clib_spinlock_free (&txq->lock);
- vec_foreach (xsk, ad->xsk) xsk_socket__delete (*xsk);
- vec_foreach (umem, ad->umem) xsk_umem__delete (*umem);
+ for (i = 0; i < ad->rxq_num; i++)
+ clib_file_del_by_index (&file_main, vec_elt (ad->rxqs, i).file_index);
+
+ for (i = 0; i < ad->txq_num; i++)
+ clib_spinlock_free (&vec_elt (ad->txqs, i).lock);
+
+ vec_foreach (xsk, ad->xsk)
+ xsk_socket__delete (*xsk);
+
+ vec_foreach (umem, ad->umem)
+ xsk_umem__delete (*umem);
if (ad->bpf_obj)
{
}
static int
-af_xdp_create_queue (vlib_main_t * vm, af_xdp_create_if_args_t * args,
- af_xdp_device_t * ad, int qid, int rxq_num, int txq_num)
+af_xdp_create_queue (vlib_main_t *vm, af_xdp_create_if_args_t *args,
+ af_xdp_device_t *ad, int qid)
{
struct xsk_umem **umem;
struct xsk_socket **xsk;
struct xsk_socket_config sock_config;
struct xdp_options opt;
socklen_t optlen;
+ const int is_rx = qid < ad->rxq_num;
+ const int is_tx = qid < ad->txq_num;
vec_validate_aligned (ad->umem, qid, CLIB_CACHE_LINE_BYTES);
umem = vec_elt_at_index (ad->umem, qid);
* fq and cq must always be allocated even if unused
* whereas rx and tx indicates whether we want rxq, txq, or both
*/
- struct xsk_ring_cons *rx = qid < rxq_num ? &rxq->rx : 0;
+ struct xsk_ring_cons *rx = is_rx ? &rxq->rx : 0;
struct xsk_ring_prod *fq = &rxq->fq;
- struct xsk_ring_prod *tx = qid < txq_num ? &txq->tx : 0;
+ struct xsk_ring_prod *tx = is_tx ? &txq->tx : 0;
struct xsk_ring_cons *cq = &txq->cq;
int fd;
if (opt.flags & XDP_OPTIONS_ZEROCOPY)
ad->flags |= AF_XDP_DEVICE_F_ZEROCOPY;
- rxq->xsk_fd = qid < rxq_num ? fd : -1;
- txq->xsk_fd = qid < txq_num ? fd : -1;
+ rxq->xsk_fd = is_rx ? fd : -1;
+
+ if (is_tx)
+ {
+ txq->xsk_fd = fd;
+ if (is_rx && (ad->flags & AF_XDP_DEVICE_F_SYSCALL_LOCK))
+ {
+      /* This is a shared rx+tx queue and we need to lock before syscalls.
+       * Prior to Linux 5.6 there is a race condition that prevents calling
+       * poll() and sendto() concurrently on AF_XDP sockets. This was
+       * fixed with commit 11cc2d21499cabe7e7964389634ed1de3ee91d33.
+       * To work around this issue, we protect the syscalls with a
+       * spinlock. Note that it also prevents using interrupt mode in
+       * multi-worker setups, because in this case the poll() is done in
+       * the framework w/o any possibility to protect it.
+ * See
+ * https://lore.kernel.org/bpf/BYAPR11MB365382C5DB1E5FCC53242609C1549@BYAPR11MB3653.namprd11.prod.outlook.com/
+ */
+ clib_spinlock_init (&rxq->syscall_lock);
+ txq->syscall_lock = rxq->syscall_lock;
+ }
+ }
+ else
+ {
+ txq->xsk_fd = -1;
+ }
return 0;
return 0;
}
-static void
-af_xdp_device_set_rxq_mode (af_xdp_rxq_t *rxq, int is_polling)
+static clib_error_t *
+af_xdp_device_set_rxq_mode (const af_xdp_device_t *ad, af_xdp_rxq_t *rxq,
+ const af_xdp_rxq_mode_t mode)
{
clib_file_main_t *fm = &file_main;
+ clib_file_update_type_t update;
clib_file_t *f;
- if (rxq->is_polling == is_polling)
- return;
+ if (rxq->mode == mode)
+ return 0;
+
+ switch (mode)
+ {
+ case AF_XDP_RXQ_MODE_POLLING:
+ update = UNIX_FILE_UPDATE_DELETE;
+ break;
+ case AF_XDP_RXQ_MODE_INTERRUPT:
+ if (ad->flags & AF_XDP_DEVICE_F_SYSCALL_LOCK)
+ return clib_error_create (
+ "kernel workaround incompatible with interrupt mode");
+ update = UNIX_FILE_UPDATE_ADD;
+ break;
+ default:
+ ASSERT (0);
+ return clib_error_create ("unknown rxq mode %i", mode);
+ }
f = clib_file_get (fm, rxq->file_index);
- fm->file_update (f, is_polling ? UNIX_FILE_UPDATE_DELETE :
- UNIX_FILE_UPDATE_ADD);
- rxq->is_polling = !!is_polling;
+ fm->file_update (f, update);
+ rxq->mode = mode;
+ return 0;
}
void
pool_get_zero (am->devices, ad);
+ if (tm->n_vlib_mains > 1 &&
+ 0 == (args->flags & AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK))
+ ad->flags |= AF_XDP_DEVICE_F_SYSCALL_LOCK;
+
ad->linux_ifname = (char *) format (0, "%s", args->linux_ifname);
vec_validate (ad->linux_ifname, IFNAMSIZ - 1); /* libbpf expects ifname to be at least IFNAMSIZ */
goto err1;
q_num = clib_max (rxq_num, txq_num);
+ ad->rxq_num = rxq_num;
ad->txq_num = txq_num;
for (i = 0; i < q_num; i++)
{
- if (af_xdp_create_queue (vm, args, ad, i, rxq_num, txq_num))
+ if (af_xdp_create_queue (vm, args, ad, i))
{
/*
* queue creation failed
vec_set_len (ad->rxqs, i);
vec_set_len (ad->txqs, i);
+ ad->rxq_num = clib_min (i, rxq_num);
+ ad->txq_num = clib_min (i, txq_num);
+
if (i < rxq_num && AF_XDP_NUM_RX_QUEUES_ALL != rxq_num)
- goto err1; /* failed creating requested rxq: fatal error, bailing out */
+ {
+ ad->rxq_num = ad->txq_num = 0;
+ goto err1; /* failed creating requested rxq: fatal error, bailing
+ out */
+ }
if (i < txq_num)
{
/* we created less txq than threads not an error but initialize lock for shared txq */
- af_xdp_txq_t *txq;
- ad->txq_num = i;
- vec_foreach (txq, ad->txqs) clib_spinlock_init (&txq->lock);
+ for (i = 0; i < ad->txq_num; i++)
+ clib_spinlock_init (&vec_elt (ad->txqs, i).lock);
}
args->rv = 0;
vnet_hw_if_set_input_node (vnm, ad->hw_if_index, af_xdp_input_node.index);
- for (i = 0; i < vec_len (ad->rxqs); i++)
+ for (i = 0; i < ad->rxq_num; i++)
{
af_xdp_rxq_t *rxq = vec_elt_at_index (ad->rxqs, i);
rxq->queue_index = vnet_hw_if_register_rx_queue (
ad->dev_instance, i);
clib_file_t f = {
.file_descriptor = rxq->xsk_fd,
- .flags = UNIX_FILE_EVENT_EDGE_TRIGGERED,
.private_data = rxq->queue_index,
.read_function = af_xdp_device_rxq_read_ready,
.description = desc,
rxq->file_index = clib_file_add (&file_main, &f);
vnet_hw_if_set_rx_queue_file_index (vnm, rxq->queue_index,
rxq->file_index);
- af_xdp_device_set_rxq_mode (rxq, 1 /* polling */);
+ if (af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_POLLING))
+ goto err1;
}
vnet_hw_if_update_runtime_data (vnm, ad->hw_if_index);
switch (mode)
{
- case VNET_HW_IF_RX_MODE_UNKNOWN:
- case VNET_HW_IF_NUM_RX_MODES: /* fallthrough */
+ default: /* fallthrough */
+ case VNET_HW_IF_RX_MODE_UNKNOWN: /* fallthrough */
+ case VNET_HW_IF_NUM_RX_MODES:
return clib_error_create ("uknown rx mode - doing nothing");
- case VNET_HW_IF_RX_MODE_DEFAULT:
- case VNET_HW_IF_RX_MODE_POLLING: /* fallthrough */
- if (rxq->is_polling)
- break;
- af_xdp_device_set_rxq_mode (rxq, 1 /* polling */);
- break;
- case VNET_HW_IF_RX_MODE_INTERRUPT:
- case VNET_HW_IF_RX_MODE_ADAPTIVE: /* fallthrough */
- if (0 == rxq->is_polling)
- break;
- af_xdp_device_set_rxq_mode (rxq, 0 /* interrupt */);
- break;
+ case VNET_HW_IF_RX_MODE_DEFAULT: /* fallthrough */
+ case VNET_HW_IF_RX_MODE_POLLING:
+ return af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_POLLING);
+ case VNET_HW_IF_RX_MODE_INTERRUPT: /* fallthrough */
+ case VNET_HW_IF_RX_MODE_ADAPTIVE:
+ return af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_INTERRUPT);
}
- return 0;
+ ASSERT (0 && "unreachable");
+ return clib_error_create ("unreachable");
}
static void
#include <vnet/interface/rx_queue_funcs.h>
#include "af_xdp.h"
-#define foreach_af_xdp_input_error \
- _(POLL_REQUIRED, "poll required") \
- _(POLL_FAILURES, "poll failures")
+#define foreach_af_xdp_input_error \
+ _ (SYSCALL_REQUIRED, "syscall required") \
+ _ (SYSCALL_FAILURES, "syscall failures")
typedef enum
{
af_xdp_device_t * ad, af_xdp_rxq_t * rxq,
const u32 n_alloc)
{
- int ret;
-
xsk_ring_prod__submit (&rxq->fq, n_alloc);
- if (!xsk_ring_prod__needs_wakeup (&rxq->fq))
+ if (AF_XDP_RXQ_MODE_INTERRUPT == rxq->mode ||
+ !xsk_ring_prod__needs_wakeup (&rxq->fq))
return;
- vlib_error_count (vm, node->node_index, AF_XDP_INPUT_ERROR_POLL_REQUIRED,
+ vlib_error_count (vm, node->node_index, AF_XDP_INPUT_ERROR_SYSCALL_REQUIRED,
1);
- struct pollfd fd = {.fd = rxq->xsk_fd,.events = POLLIN };
- ret = poll (&fd, 1, 0);
- if (PREDICT_TRUE (ret >= 0))
- return;
-
- /* something bad is happening */
- vlib_error_count (vm, node->node_index, AF_XDP_INPUT_ERROR_POLL_FAILURES,
- 1);
- af_xdp_device_error (ad, "poll() failed");
+ if (clib_spinlock_trylock_if_init (&rxq->syscall_lock))
+ {
+ struct pollfd fd = { .fd = rxq->xsk_fd, .events = POLLIN | POLLOUT };
+ int ret = poll (&fd, 1, 0);
+ clib_spinlock_unlock_if_init (&rxq->syscall_lock);
+ if (PREDICT_FALSE (ret < 0))
+ {
+ /* something bad is happening */
+ vlib_error_count (vm, node->node_index,
+ AF_XDP_INPUT_ERROR_SYSCALL_FAILURES, 1);
+ af_xdp_device_error (ad, "rx poll() failed");
+ }
+ }
}
static_always_inline void
-#include <errno.h>
+#include <poll.h>
#include <string.h>
#include <vlib/vlib.h>
#include <vlib/unix/unix.h>
af_xdp_device_t * ad,
af_xdp_txq_t * txq, const u32 n_tx)
{
- int ret;
-
xsk_ring_prod__submit (&txq->tx, n_tx);
if (!xsk_ring_prod__needs_wakeup (&txq->tx))
return;
- vlib_error_count (vm, node->node_index, AF_XDP_TX_ERROR_SENDTO_REQUIRED, 1);
+ vlib_error_count (vm, node->node_index, AF_XDP_TX_ERROR_SYSCALL_REQUIRED, 1);
- ret = sendto (txq->xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
- if (PREDICT_TRUE (ret >= 0))
- return;
+ clib_spinlock_lock_if_init (&txq->syscall_lock);
- /* those errors are fine */
- switch (errno)
+ if (xsk_ring_prod__needs_wakeup (&txq->tx))
{
- case ENOBUFS:
- case EAGAIN:
- case EBUSY:
- return;
+ struct pollfd fd = { .fd = txq->xsk_fd, .events = POLLIN | POLLOUT };
+ int ret = poll (&fd, 1, 0);
+ if (PREDICT_FALSE (ret < 0))
+ {
+ /* something bad is happening */
+ vlib_error_count (vm, node->node_index,
+ AF_XDP_TX_ERROR_SYSCALL_FAILURES, 1);
+ af_xdp_device_error (ad, "tx poll() failed");
+ }
}
- /* something bad is happening */
- vlib_error_count (vm, node->node_index, AF_XDP_TX_ERROR_SENDTO_FAILURES, 1);
- af_xdp_device_error (ad, "sendto() failed");
+ clib_spinlock_unlock_if_init (&txq->syscall_lock);
}
static_always_inline u32
vnet_interface_output_runtime_t *ord = (void *) node->runtime_data;
af_xdp_device_t *ad = pool_elt_at_index (rm->devices, ord->dev_instance);
u32 thread_index = vm->thread_index;
- af_xdp_txq_t *txq = vec_elt_at_index (ad->txqs, thread_index % ad->txq_num);
+ af_xdp_txq_t *txq =
+ vec_elt_at_index (ad->txqs, (thread_index - 1) % ad->txq_num);
u32 *from;
u32 n, n_tx;
int i;
mp->rxq_size = clib_host_to_net_u16 (args.rxq_size);
mp->txq_size = clib_host_to_net_u16 (args.txq_size);
mp->mode = api_af_xdp_mode (args.mode);
+ if (args.flags & AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK)
+ mp->flags |= AF_XDP_API_FLAGS_NO_SYSCALL_LOCK;
snprintf ((char *) mp->prog, sizeof (mp->prog), "%s", args.prog ? : "");
S (mp);
args->mode = AF_XDP_MODE_COPY;
else if (unformat (line_input, "zero-copy"))
args->mode = AF_XDP_MODE_ZERO_COPY;
+ else if (unformat (line_input, "no-syscall-lock"))
+ args->flags |= AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK;
else
{
/* return failure on unknown input */