#include <stdio.h>
#include <net/if.h>
+#include <sys/ioctl.h>
+#include <linux/ethtool.h>
#include <linux/if_link.h>
+#include <linux/sockios.h>
#include <bpf/libbpf.h>
#include <vlib/vlib.h>
#include <vlib/unix/unix.h>
#include <vppinfra/linux/sysfs.h>
#include <vppinfra/unix.h>
#include <vnet/ethernet/ethernet.h>
+#include <vnet/interface/rx_queue_funcs.h>
#include "af_xdp.h"
af_xdp_main_t af_xdp_main;
af_xdp_main_t *axm = &af_xdp_main;
struct xsk_socket **xsk;
struct xsk_umem **umem;
- af_xdp_rxq_t *rxq;
- af_xdp_txq_t *txq;
+ int i;
if (ad->hw_if_index)
{
vnet_hw_interface_set_flags (vnm, ad->hw_if_index, 0);
- vnet_hw_interface_unassign_rx_thread (vnm, ad->hw_if_index, 0);
ethernet_delete_interface (vnm, ad->hw_if_index);
}
- vec_foreach (rxq, ad->rxqs) clib_file_del_by_index (&file_main,
- rxq->file_index);
- vec_foreach (txq, ad->txqs) clib_spinlock_free (&txq->lock);
- vec_foreach (xsk, ad->xsk) xsk_socket__delete (*xsk);
- vec_foreach (umem, ad->umem) xsk_umem__delete (*umem);
+ for (i = 0; i < ad->txq_num; i++)
+ clib_spinlock_free (&vec_elt (ad->txqs, i).lock);
+
+ vec_foreach (xsk, ad->xsk)
+ xsk_socket__delete (*xsk);
+
+ vec_foreach (umem, ad->umem)
+ xsk_umem__delete (*umem);
+
+ for (i = 0; i < ad->rxq_num; i++)
+ clib_file_del_by_index (&file_main, vec_elt (ad->rxqs, i).file_index);
if (ad->bpf_obj)
{
}
static int
-af_xdp_create_queue (vlib_main_t * vm, af_xdp_create_if_args_t * args,
- af_xdp_device_t * ad, int qid, int rxq_num, int txq_num)
+af_xdp_create_queue (vlib_main_t *vm, af_xdp_create_if_args_t *args,
+ af_xdp_device_t *ad, int qid)
{
struct xsk_umem **umem;
struct xsk_socket **xsk;
struct xsk_socket_config sock_config;
struct xdp_options opt;
socklen_t optlen;
+ const int is_rx = qid < ad->rxq_num;
+ const int is_tx = qid < ad->txq_num;
- vec_validate_aligned (ad->umem, qid, CLIB_CACHE_LINE_BYTES);
umem = vec_elt_at_index (ad->umem, qid);
-
- vec_validate_aligned (ad->xsk, qid, CLIB_CACHE_LINE_BYTES);
xsk = vec_elt_at_index (ad->xsk, qid);
-
- vec_validate_aligned (ad->rxqs, qid, CLIB_CACHE_LINE_BYTES);
rxq = vec_elt_at_index (ad->rxqs, qid);
-
- vec_validate_aligned (ad->txqs, qid, CLIB_CACHE_LINE_BYTES);
txq = vec_elt_at_index (ad->txqs, qid);
/*
* fq and cq must always be allocated even if unused
* whereas rx and tx indicates whether we want rxq, txq, or both
*/
- struct xsk_ring_cons *rx = qid < rxq_num ? &rxq->rx : 0;
+ struct xsk_ring_cons *rx = is_rx ? &rxq->rx : 0;
struct xsk_ring_prod *fq = &rxq->fq;
- struct xsk_ring_prod *tx = qid < txq_num ? &txq->tx : 0;
+ struct xsk_ring_prod *tx = is_tx ? &txq->tx : 0;
struct xsk_ring_cons *cq = &txq->cq;
int fd;
umem_config.comp_size = args->txq_size;
umem_config.frame_size =
sizeof (vlib_buffer_t) + vlib_buffer_get_default_data_size (vm);
+ umem_config.frame_headroom = sizeof (vlib_buffer_t);
umem_config.flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG;
if (xsk_umem__create
(umem, uword_to_pointer (vm->buffer_main->buffer_mem_start, void *),
if (opt.flags & XDP_OPTIONS_ZEROCOPY)
ad->flags |= AF_XDP_DEVICE_F_ZEROCOPY;
- rxq->xsk_fd = qid < rxq_num ? fd : -1;
- txq->xsk_fd = qid < txq_num ? fd : -1;
+ rxq->xsk_fd = is_rx ? fd : -1;
+
+ if (is_tx)
+ {
+ txq->xsk_fd = fd;
+ if (is_rx && (ad->flags & AF_XDP_DEVICE_F_SYSCALL_LOCK))
+ {
+ /* This is a shared rx+tx queue and we need to lock before syscalls.
+ * Prior to Linux 5.6 there is a race condition preventing to call
+ * poll() and sendto() concurrently on AF_XDP sockets. This was
+ * fixed with commit 11cc2d21499cabe7e7964389634ed1de3ee91d33
+ * to workaround this issue, we protect the syscalls with a
+ * spinlock. Note that it also prevents to use interrupt mode in
+ * multi workers setup, because in this case the poll() is done in
+ * the framework w/o any possibility to protect it.
+ * See
+ * https://lore.kernel.org/bpf/BYAPR11MB365382C5DB1E5FCC53242609C1549@BYAPR11MB3653.namprd11.prod.outlook.com/
+ */
+ clib_spinlock_init (&rxq->syscall_lock);
+ txq->syscall_lock = rxq->syscall_lock;
+ }
+ }
+ else
+ {
+ txq->xsk_fd = -1;
+ }
return 0;
return numa;
}
+/* Query the NIC's channel (hardware queue) configuration via the
+ * ETHTOOL_GCHANNELS ioctl and report the rx and tx queue counts through
+ * rxq_num/txq_num. Falls back to 1/1 when the control socket cannot be
+ * opened or the driver does not support SIOCETHTOOL/ETHTOOL_GCHANNELS. */
+static void
+af_xdp_get_q_count (const char *ifname, int *rxq_num, int *txq_num)
+{
+  struct ethtool_channels ec = { .cmd = ETHTOOL_GCHANNELS };
+  struct ifreq ifr = { .ifr_data = (void *) &ec };
+  int fd, err;
+
+  /* conservative default: a single rx and a single tx queue */
+  *rxq_num = *txq_num = 1;
+
+  fd = socket (AF_INET, SOCK_DGRAM, 0);
+  if (fd < 0)
+    return;
+
+  snprintf (ifr.ifr_name, sizeof (ifr.ifr_name), "%s", ifname);
+  err = ioctl (fd, SIOCETHTOOL, &ifr);
+
+  close (fd);
+
+  if (err)
+    return;
+
+  /* drivers report either combined rx+tx channels or separate rx/tx
+   * channel counts, so take the max of the two for each direction */
+  *rxq_num = clib_max (ec.combined_count, ec.rx_count);
+  *txq_num = clib_max (ec.combined_count, ec.tx_count);
+}
+
static clib_error_t *
af_xdp_device_rxq_read_ready (clib_file_t * f)
{
-  vnet_main_t *vnm = vnet_get_main ();
-  const af_xdp_main_t *am = &af_xdp_main;
-  const u32 dev_instance = f->private_data >> 16;
-  const u16 qid = f->private_data & 0xffff;
-  const af_xdp_device_t *ad = vec_elt_at_index (am->devices, dev_instance);
-  vnet_device_input_set_interrupt_pending (vnm, ad->hw_if_index, qid);
+  /* private_data now carries the vnet rx queue index directly (set at
+   * clib_file_add time), so mark that queue's interrupt as pending */
+  vnet_hw_if_rx_queue_set_int_pending (vnet_get_main (), f->private_data);
+  return 0;
+}
+
+/* Switch an rx queue between polling and interrupt mode by removing or
+ * adding its xsk fd in the unix file poller.
+ * Interrupt mode is refused while the syscall-lock workaround is active
+ * (AF_XDP_DEVICE_F_SYSCALL_LOCK): the framework's poll() cannot be
+ * protected by the per-queue spinlock.
+ * Returns 0 on success or a clib error. */
+static clib_error_t *
+af_xdp_device_set_rxq_mode (const af_xdp_device_t *ad, af_xdp_rxq_t *rxq,
+			    const af_xdp_rxq_mode_t mode)
+{
+  clib_file_main_t *fm = &file_main;
+  clib_file_update_type_t update;
+  clib_file_t *f;
+
+  /* nothing to do if the queue is already in the requested mode */
+  if (rxq->mode == mode)
+    return 0;
+
+  switch (mode)
+    {
+    case AF_XDP_RXQ_MODE_POLLING:
+      update = UNIX_FILE_UPDATE_DELETE;
+      break;
+    case AF_XDP_RXQ_MODE_INTERRUPT:
+      if (ad->flags & AF_XDP_DEVICE_F_SYSCALL_LOCK)
+	return clib_error_create (
+	  "kernel workaround incompatible with interrupt mode");
+      update = UNIX_FILE_UPDATE_ADD;
+      break;
+    default:
+      ASSERT (0);
+      return clib_error_create ("unknown rxq mode %i", mode);
+    }
+
+  f = clib_file_get (fm, rxq->file_index);
+  fm->file_update (f, update);
+  rxq->mode = mode;
  return 0;
}
args->rxq_size = args->rxq_size ? args->rxq_size : 2 * VLIB_FRAME_SIZE;
args->txq_size = args->txq_size ? args->txq_size : 2 * VLIB_FRAME_SIZE;
- rxq_num = args->rxq_num ? args->rxq_num : 1;
- txq_num = tm->n_vlib_mains;
+ args->rxq_num = args->rxq_num ? args->rxq_num : 1;
if (!args->linux_ifname)
{
goto err0;
}
+ af_xdp_get_q_count (args->linux_ifname, &rxq_num, &txq_num);
+ if (args->rxq_num > rxq_num && AF_XDP_NUM_RX_QUEUES_ALL != args->rxq_num)
+ {
+ args->rv = VNET_API_ERROR_INVALID_VALUE;
+ args->error = clib_error_create ("too many rxq requested (%d > %d)",
+ args->rxq_num, rxq_num);
+ goto err0;
+ }
+ rxq_num = clib_min (rxq_num, args->rxq_num);
+ txq_num = clib_min (txq_num, tm->n_vlib_mains);
+
pool_get_zero (am->devices, ad);
+ if (tm->n_vlib_mains > 1 &&
+ 0 == (args->flags & AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK))
+ ad->flags |= AF_XDP_DEVICE_F_SYSCALL_LOCK;
+
ad->linux_ifname = (char *) format (0, "%s", args->linux_ifname);
vec_validate (ad->linux_ifname, IFNAMSIZ - 1); /* libbpf expects ifname to be at least IFNAMSIZ */
goto err1;
q_num = clib_max (rxq_num, txq_num);
+ ad->rxq_num = rxq_num;
ad->txq_num = txq_num;
+
+ vec_validate_aligned (ad->umem, q_num - 1, CLIB_CACHE_LINE_BYTES);
+ vec_validate_aligned (ad->xsk, q_num - 1, CLIB_CACHE_LINE_BYTES);
+ vec_validate_aligned (ad->rxqs, q_num - 1, CLIB_CACHE_LINE_BYTES);
+ vec_validate_aligned (ad->txqs, q_num - 1, CLIB_CACHE_LINE_BYTES);
+
for (i = 0; i < q_num; i++)
{
- if (af_xdp_create_queue (vm, args, ad, i, rxq_num, txq_num))
+ if (af_xdp_create_queue (vm, args, ad, i))
{
/*
* queue creation failed
vec_set_len (ad->rxqs, i);
vec_set_len (ad->txqs, i);
- if (i < rxq_num && AF_XDP_NUM_RX_QUEUES_ALL != rxq_num)
- goto err1; /* failed creating requested rxq: fatal error, bailing out */
+ ad->rxq_num = clib_min (i, rxq_num);
+ ad->txq_num = clib_min (i, txq_num);
+
+ if (i < rxq_num && AF_XDP_NUM_RX_QUEUES_ALL != args->rxq_num)
+ {
+ ad->rxq_num = ad->txq_num = 0;
+ goto err1; /* failed creating requested rxq: fatal error, bailing
+ out */
+ }
if (i < txq_num)
{
/* we created less txq than threads not an error but initialize lock for shared txq */
- af_xdp_txq_t *txq;
- ad->txq_num = i;
- vec_foreach (txq, ad->txqs) clib_spinlock_init (&txq->lock);
+ for (i = 0; i < ad->txq_num; i++)
+ clib_spinlock_init (&vec_elt (ad->txqs, i).lock);
}
args->rv = 0;
sw = vnet_get_hw_sw_interface (vnm, ad->hw_if_index);
hw = vnet_get_hw_interface (vnm, ad->hw_if_index);
args->sw_if_index = ad->sw_if_index = sw->sw_if_index;
- hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE;
+ hw->caps |= VNET_HW_INTERFACE_CAP_SUPPORTS_INT_MODE;
- vnet_hw_interface_set_input_node (vnm, ad->hw_if_index,
- af_xdp_input_node.index);
+ vnet_hw_if_set_input_node (vnm, ad->hw_if_index, af_xdp_input_node.index);
- for (i = 0; i < vec_len (ad->rxqs); i++)
+ for (i = 0; i < ad->rxq_num; i++)
{
af_xdp_rxq_t *rxq = vec_elt_at_index (ad->rxqs, i);
+ rxq->queue_index = vnet_hw_if_register_rx_queue (
+ vnm, ad->hw_if_index, i, VNET_HW_IF_RXQ_THREAD_ANY);
+ u8 *desc = format (0, "%U rxq %d", format_af_xdp_device_name,
+ ad->dev_instance, i);
clib_file_t f = {
.file_descriptor = rxq->xsk_fd,
- .flags = UNIX_FILE_EVENT_EDGE_TRIGGERED,
- .private_data = (uword) ad->dev_instance << 16 | (uword) i,
+ .private_data = rxq->queue_index,
.read_function = af_xdp_device_rxq_read_ready,
- .description =
- format (0, "%U rxq %d", format_af_xdp_device_name, ad->dev_instance,
- i),
+ .description = desc,
};
rxq->file_index = clib_file_add (&file_main, &f);
- vnet_hw_interface_assign_rx_thread (vnm, ad->hw_if_index, i, ~0);
+ vnet_hw_if_set_rx_queue_file_index (vnm, rxq->queue_index,
+ rxq->file_index);
+ if (af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_POLLING))
+ goto err1;
}
+ vnet_hw_if_update_runtime_data (vnm, ad->hw_if_index);
+
/* buffer template */
vec_validate_aligned (ad->buffer_template, 1, CLIB_CACHE_LINE_BYTES);
ad->buffer_template->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
vnet_hw_interface_set_flags (vnm, ad->hw_if_index,
VNET_HW_INTERFACE_FLAG_LINK_UP);
ad->flags |= AF_XDP_DEVICE_F_ADMIN_UP;
+ af_xdp_device_input_refill (ad);
}
else
{
return 0;
}
+/* vnet device-class callback: change the rx mode of queue qid on the
+ * given hw interface. Default/polling map to AF_XDP polling mode;
+ * interrupt/adaptive map to AF_XDP interrupt mode.
+ * Returns 0 on success or a clib error (e.g. when interrupt mode is
+ * incompatible with the syscall-lock workaround). */
+static clib_error_t *
+af_xdp_interface_rx_mode_change (vnet_main_t *vnm, u32 hw_if_index, u32 qid,
+				 vnet_hw_if_rx_mode mode)
+{
+  af_xdp_main_t *am = &af_xdp_main;
+  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+  af_xdp_device_t *ad = pool_elt_at_index (am->devices, hw->dev_instance);
+  af_xdp_rxq_t *rxq = vec_elt_at_index (ad->rxqs, qid);
+
+  switch (mode)
+    {
+    default: /* fallthrough */
+    case VNET_HW_IF_RX_MODE_UNKNOWN: /* fallthrough */
+    case VNET_HW_IF_NUM_RX_MODES:
+      /* fixed typo: "uknown" -> "unknown" in user-facing error message */
+      return clib_error_create ("unknown rx mode - doing nothing");
+    case VNET_HW_IF_RX_MODE_DEFAULT: /* fallthrough */
+    case VNET_HW_IF_RX_MODE_POLLING:
+      return af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_POLLING);
+    case VNET_HW_IF_RX_MODE_INTERRUPT: /* fallthrough */
+    case VNET_HW_IF_RX_MODE_ADAPTIVE:
+      return af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_INTERRUPT);
+    }
+
+  /* every enum value is handled above */
+  ASSERT (0 && "unreachable");
+  return clib_error_create ("unreachable");
+}
+
static void
af_xdp_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
u32 node_index)
}
/* *INDENT-OFF* */
-VNET_DEVICE_CLASS (af_xdp_device_class) =
-{
+VNET_DEVICE_CLASS (af_xdp_device_class) = {
.name = "AF_XDP interface",
.format_device = format_af_xdp_device,
.format_device_name = format_af_xdp_device_name,
.admin_up_down_function = af_xdp_interface_admin_up_down,
+ .rx_mode_change_function = af_xdp_interface_rx_mode_change,
.rx_redirect_to_node = af_xdp_set_interface_next_node,
.tx_function_n_errors = AF_XDP_TX_N_ERROR,
.tx_function_error_strings = af_xdp_tx_func_error_strings,