From 4a76d6f6da035220917097bc047b08bc58254803 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Beno=C3=AEt=20Ganne?= Date: Fri, 12 Jun 2020 08:47:34 +0200 Subject: [PATCH] af_xdp: AF_XDP input plugin MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Type: feature Change-Id: I85aa4ad6b68c1aa0e51938002dc691a4b11c545c Signed-off-by: Damjan Marion Signed-off-by: Benoît Ganne --- MAINTAINERS | 6 + Makefile | 1 + build/external/Makefile | 3 +- build/external/packages/libbpf.mk | 55 ++++ extras/bpf/Makefile | 17 ++ extras/bpf/af_xdp.bpf.c | 88 +++++++ src/plugins/af_xdp/CMakeLists.txt | 71 +++++ src/plugins/af_xdp/FEATURE.yaml | 8 + src/plugins/af_xdp/af_xdp.api | 91 +++++++ src/plugins/af_xdp/af_xdp.h | 183 +++++++++++++ src/plugins/af_xdp/af_xdp_doc.md | 99 +++++++ src/plugins/af_xdp/api.c | 125 +++++++++ src/plugins/af_xdp/cli.c | 121 +++++++++ src/plugins/af_xdp/device.c | 533 ++++++++++++++++++++++++++++++++++++++ src/plugins/af_xdp/format.c | 89 +++++++ src/plugins/af_xdp/input.c | 367 ++++++++++++++++++++++++++ src/plugins/af_xdp/output.c | 260 +++++++++++++++++++ src/plugins/af_xdp/plugin.c | 35 +++ src/plugins/af_xdp/test_api.c | 155 +++++++++++ src/plugins/af_xdp/unformat.c | 69 +++++ 20 files changed, 2375 insertions(+), 1 deletion(-) create mode 100644 build/external/packages/libbpf.mk create mode 100644 extras/bpf/Makefile create mode 100644 extras/bpf/af_xdp.bpf.c create mode 100644 src/plugins/af_xdp/CMakeLists.txt create mode 100644 src/plugins/af_xdp/FEATURE.yaml create mode 100644 src/plugins/af_xdp/af_xdp.api create mode 100644 src/plugins/af_xdp/af_xdp.h create mode 100644 src/plugins/af_xdp/af_xdp_doc.md create mode 100644 src/plugins/af_xdp/api.c create mode 100644 src/plugins/af_xdp/cli.c create mode 100644 src/plugins/af_xdp/device.c create mode 100644 src/plugins/af_xdp/format.c create mode 100644 src/plugins/af_xdp/input.c create mode 100644 src/plugins/af_xdp/output.c create mode 100644 src/plugins/af_xdp/plugin.c 
create mode 100644 src/plugins/af_xdp/test_api.c create mode 100644 src/plugins/af_xdp/unformat.c diff --git a/MAINTAINERS b/MAINTAINERS index e928186cf65..03cd4791b34 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -698,6 +698,12 @@ I: bash M: Dave Wallace F: extras/bash/ +Plugin - AF_XDP driver +I: af_xdp +M: Benoît Ganne +M: Damjan Marion +F: src/plugins/af_xdp/ + THE REST I: misc C: Contact vpp-dev Mailing List diff --git a/Makefile b/Makefile index fe5e6e950ba..feb23e50cb8 100644 --- a/Makefile +++ b/Makefile @@ -81,6 +81,7 @@ ifeq ($(OS_VERSION_ID),18.04) else ifeq ($(OS_VERSION_ID),20.04) DEB_DEPENDS += python3-virtualenv DEB_DEPENDS += libssl-dev + DEB_DEPENDS += libelf-dev # for libbpf (af_xdp) LIBFFI=libffi7 else ifeq ($(OS_ID)-$(OS_VERSION_ID),debian-8) DEB_DEPENDS += libssl-dev diff --git a/build/external/Makefile b/build/external/Makefile index e5dff3c43e5..b0e3cee4faa 100644 --- a/build/external/Makefile +++ b/build/external/Makefile @@ -39,13 +39,14 @@ include packages/ipsec-mb.mk include packages/quicly.mk include packages/dpdk.mk include packages/rdma-core.mk +include packages/libbpf.mk .PHONY: clean clean: @rm -rf $(B) $(I) .PHONY: install -install: dpdk-install rdma-core-install quicly-install +install: dpdk-install rdma-core-install quicly-install libbpf-install .PHONY: config config: dpdk-config rdma-core-config diff --git a/build/external/packages/libbpf.mk b/build/external/packages/libbpf.mk new file mode 100644 index 00000000000..90ff54b433c --- /dev/null +++ b/build/external/packages/libbpf.mk @@ -0,0 +1,55 @@ +# Copyright (c) 2018 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +LIBBPF_DEBUG?=n + +libbpf_version := 0.1.0 +libbpf_tarball := v$(libbpf_version).tar.gz +libbpf_tarball_md5sum_0.1.0 := 00b991a6e2d28d797a56ab1575ed40e1 +libbpf_tarball_md5sum := $(libbpf_tarball_md5sum_$(libbpf_version)) +libbpf_tarball_strip_dirs := 1 +libbpf_url := https://github.com/libbpf/libbpf/archive/$(libbpf_tarball) + +LIBBPF_CFLAGS:=-g -Werror -Wall -fPIC -fvisibility=hidden +ifeq ($(LIBBPF_DEBUG),y) + LIBBPF_CFLAGS+= -O0 +else + LIBBPF_CFLAGS+= -O2 +endif + +IF_XDP:=$(shell echo "\#include " | $(CC) -E -xc - > /dev/null 2>&1) +IF_XDP:=$(.SHELLSTATUS) + +define libbpf_config_cmds + @true +endef + +define libbpf_build_cmds__ + BUILD_STATIC_ONLY=y OBJDIR='$(libbpf_build_dir)' PREFIX='' DESTDIR='$(libbpf_install_dir)' CFLAGS='$(LIBBPF_CFLAGS)' make -C '$(libbpf_src_dir)/src' $(1) > $(2) +endef + +define libbpf_build_cmds + $(call libbpf_build_cmds__,,$(libbpf_build_log)) +endef + +define libbpf_install_cmds + $(call libbpf_build_cmds__,install,$(libbpf_install_log)) +endef + +ifneq ($(IF_XDP),0) + $(warning "linux/if_xdp.h was not found on this system. libbpf will be skipped.") +libbpf-install: + @true +else + $(eval $(call package,libbpf)) +endif diff --git a/extras/bpf/Makefile b/extras/bpf/Makefile new file mode 100644 index 00000000000..77b06434237 --- /dev/null +++ b/extras/bpf/Makefile @@ -0,0 +1,17 @@ +CC?=clang +# where to find bpf includes? 
+BPF_ROOT?=/usr/include +#BPF_ROOT?=/opt/vpp/external/x86_64/include + +CFLAGS:=-O3 -g -Wextra -Wall -target bpf +# Workaround for Ubuntu/Debian for asm/types.h +CFLAGS+= -I/usr/include/x86_64-linux-gnu +CFLAGS+= -I$(BPF_ROOT) +#CFLAGS+= -DDEBUG + +all: af_xdp.bpf.o + +clean: + $(RM) af_xdp.bpf.o + +.PHONY: all clean diff --git a/extras/bpf/af_xdp.bpf.c b/extras/bpf/af_xdp.bpf.c new file mode 100644 index 00000000000..eddd2b0e509 --- /dev/null +++ b/extras/bpf/af_xdp.bpf.c @@ -0,0 +1,88 @@ +/* + * SPDX-License-Identifier: GPL-2.0 OR Apache-2.0 + * Dual-licensed under GPL version 2.0 or Apache License version 2.0 + * Copyright (c) 2020 Cisco and/or its affiliates. + */ +#include +#include +#include +#include +#include +#include + +/* + * when compiled, debug print can be viewed with eg. + * sudo cat /sys/kernel/debug/tracing/trace_pipe + */ +#ifdef DEBUG +#define s__(n) # n +#define s_(n) s__(n) +#define x_(fmt) __FILE__ ":" s_(__LINE__) ": " fmt "\n" +#define DEBUG_PRINT_(fmt, ...) do { \ + const char fmt__[] = fmt; \ + bpf_trace_printk(fmt__, sizeof(fmt), ## __VA_ARGS__); } while(0) +#define DEBUG_PRINT(fmt, ...) DEBUG_PRINT_ (x_(fmt), ## __VA_ARGS__) +#else /* DEBUG */ +#define DEBUG_PRINT(fmt, ...) 
+#endif /* DEBUG */ + +#define ntohs(x) __constant_ntohs(x) + +SEC("maps") +struct bpf_map_def xsks_map = { + .type = BPF_MAP_TYPE_XSKMAP, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 64, /* max 64 queues per device */ +}; + +SEC("xdp_sock") +int xdp_sock_prog(struct xdp_md *ctx) { + const void *data = (void *)(long)ctx->data; + const void *data_end = (void *)(long)ctx->data_end; + + DEBUG_PRINT("rx %ld bytes packet", (long)data_end - (long)data); + + /* smallest packet we are interesting in is ip-ip */ + if (data + sizeof(struct ethhdr) + 2 * sizeof(struct iphdr) > data_end) { + DEBUG_PRINT("packet too small"); + return XDP_PASS; + } + + const struct ethhdr *eth = data; + if (eth->h_proto != ntohs(ETH_P_IP)) { + DEBUG_PRINT("unsupported eth proto %x", (int)eth->h_proto); + return XDP_PASS; + } + + const struct iphdr *ip = (void *)(eth + 1); + switch (ip->protocol) { + case IPPROTO_UDP: { + const struct udphdr *udp = (void *)(ip + 1); + if (udp->dest != ntohs(4789)) { /* VxLAN dest port */ + DEBUG_PRINT("unsupported udp dst port %x", (int)udp->dest); + return XDP_PASS; + } + } + case IPPROTO_IPIP: + case IPPROTO_ESP: + break; + default: + DEBUG_PRINT("unsupported ip proto %x", (int)ip->protocol); + return XDP_PASS; + } + + int qid = ctx->rx_queue_index; + if (!bpf_map_lookup_elem(&xsks_map, &qid)) + { + DEBUG_PRINT("no socket found"); + return XDP_PASS; + } + + DEBUG_PRINT("going to socket %d", qid); + return bpf_redirect_map(&xsks_map, qid, 0); +} + +/* actually Dual GPLv2/Apache2, but GPLv2 as far as kernel is concerned */ +SEC("license") +char _license[] = "GPL"; diff --git a/src/plugins/af_xdp/CMakeLists.txt b/src/plugins/af_xdp/CMakeLists.txt new file mode 100644 index 00000000000..a56f250646c --- /dev/null +++ b/src/plugins/af_xdp/CMakeLists.txt @@ -0,0 +1,71 @@ +# Copyright (c) 2018 Cisco and/or its affiliates. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +find_path(BPF_INCLUDE_DIR NAMES bpf/xsk.h) +if (NOT BPF_INCLUDE_DIR) + message(WARNING "libbpf headers not found - af_xdp plugin disabled") + return() +endif() + +set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS TRUE) +vpp_plugin_find_library(af_xdp BPF_LIB libbpf.a) +vpp_plugin_find_library(af_xdp BPF_ELF_LIB elf) +vpp_plugin_find_library(af_xdp BPF_Z_LIB z) +if (NOT BPF_LIB OR NOT BPF_ELF_LIB OR NOT BPF_Z_LIB) + message(WARNING "af_xdp plugin - missing libraries - af_xdp plugin disabled") + return() +endif() + +set(CMAKE_REQUIRED_FLAGS "-fPIC") +set(CMAKE_REQUIRED_INCLUDES "${BPF_INCLUDE_DIR}") +set(CMAKE_REQUIRED_LIBRARIES "${BPF_LIB}" "${BPF_ELF_LIB}" "${BPF_Z_LIB}") +CHECK_C_SOURCE_COMPILES(" +#include +int main(void) +{ + return xsk_socket__create (0, 0, 0, 0, 0, 0, 0); +}" BPF_COMPILES_CHECK) +if (NOT BPF_COMPILES_CHECK) + message(WARNING "af_xdp plugins - no working libbpf found - af_xdp plugin disabled") + return() +endif() + +include_directories(${BPF_INCLUDE_DIR}) + +add_vpp_plugin(af_xdp + SOURCES + api.c + cli.c + device.c + format.c + unformat.c + plugin.c + input.c + output.c + + MULTIARCH_SOURCES + input.c + output.c + + API_FILES + af_xdp.api + + API_TEST_SOURCES + unformat.c + test_api.c + + LINK_LIBRARIES + ${BPF_LIB} + ${BPF_ELF_LIB} + ${BPF_Z_LIB} +) diff --git a/src/plugins/af_xdp/FEATURE.yaml b/src/plugins/af_xdp/FEATURE.yaml new file mode 100644 index 00000000000..80d1f2dd90e --- 
/dev/null +++ b/src/plugins/af_xdp/FEATURE.yaml @@ -0,0 +1,8 @@ +--- +name: AF_XDP device driver +maintainer: Benoît Ganne +features: + - AF_XDP driver for Linux kernel 5.4+ +description: "AF_XDP device driver support" +state: experimental +properties: [CLI, STATS, MULTITHREAD, API] diff --git a/src/plugins/af_xdp/af_xdp.api b/src/plugins/af_xdp/af_xdp.api new file mode 100644 index 00000000000..765af6820e3 --- /dev/null +++ b/src/plugins/af_xdp/af_xdp.api @@ -0,0 +1,91 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2019 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +option version = "0.1.0"; +import "vnet/interface_types.api"; + +enum af_xdp_mode +{ + AF_XDP_API_MODE_AUTO = 0, + AF_XDP_API_MODE_COPY = 1, + AF_XDP_API_MODE_ZERO_COPY = 2, +}; + + +/** \brief + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param host_if - Linux netdev interface name + @param name - new af_xdp interface name (optional) + @param rxq_num - number of receive queues (optional) + @param rxq_size - receive queue size (optional) + @param txq_size - transmit queue size (optional) + @param mode - operation mode (optional) + @param prog - eBPF program path (optional) +*/ + +define af_xdp_create +{ + u32 client_index; + u32 context; + + string host_if[64]; + string name[64]; + u16 rxq_num [default=1]; + u16 rxq_size [default=0]; + u16 txq_size [default=0]; + vl_api_af_xdp_mode_t mode [default=0]; + string prog[256]; + option vat_help = " [name ifname] [rx-queue-size size] [tx-queue-size size] [num-rx-queues num] [prog pathname] [zero-copy|no-zero-copy]"; + option status="in_progress"; +}; + +/** \brief + @param context - sender context, to match reply w/ request + @param retval - return value for request + @param sw_if_index - software index for the new af_xdp interface +*/ + +define af_xdp_create_reply +{ + u32 context; + i32 retval; + vl_api_interface_index_t sw_if_index; + option status="in_progress"; +}; + +/** \brief + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param sw_if_index - interface index +*/ + +autoreply define af_xdp_delete +{ + u32 client_index; + u32 context; + + vl_api_interface_index_t sw_if_index; + option vat_help = ""; + option status="in_progress"; +}; + +/* + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/af_xdp/af_xdp.h b/src/plugins/af_xdp/af_xdp.h new file mode 
100644 index 00000000000..fd990ec3f90 --- /dev/null +++ b/src/plugins/af_xdp/af_xdp.h @@ -0,0 +1,183 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#ifndef _AF_XDP_H_ +#define _AF_XDP_H_ + +#include +#include +#include + +#define af_xdp_log(lvl, dev, f, ...) \ + vlib_log(lvl, af_xdp_main.log_class, "%v: " f, (dev)->name, ##__VA_ARGS__) + +#define foreach_af_xdp_device_flags \ + _(0, INITIALIZED, "initialized") \ + _(1, ERROR, "error") \ + _(2, ADMIN_UP, "admin-up") \ + _(4, LINK_UP, "link-up") \ + _(8, ZEROCOPY, "zero-copy") \ + +enum +{ +#define _(a, b, c) AF_XDP_DEVICE_F_##b = (1 << a), + foreach_af_xdp_device_flags +#undef _ +}; + +#define af_xdp_device_error(dev, fmt, ...) 
\ + if (!(dev)->error) \ + { \ + clib_error_t *err_ = clib_error_return_unix (0, fmt, ## __VA_ARGS__); \ + if (!clib_atomic_bool_cmp_and_swap (&(dev)->error, 0, err_)) \ + clib_error_free(err_); \ + } + +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + + /* fields below are accessed in data-plane (hot) */ + + struct xsk_ring_cons rx; + struct xsk_ring_prod fq; + int xsk_fd; + + /* fields below are accessed in control-plane only (cold) */ + + uword file_index; +} af_xdp_rxq_t; + +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + + /* fields below are accessed in data-plane (hot) */ + + clib_spinlock_t lock; + struct xsk_ring_prod tx; + struct xsk_ring_cons cq; + int xsk_fd; +} af_xdp_txq_t; + +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + + /* fields below are accessed in data-plane (hot) */ + + af_xdp_rxq_t *rxqs; + af_xdp_txq_t *txqs; + vlib_buffer_t *buffer_template; + u32 per_interface_next_index; + u32 sw_if_index; + u32 hw_if_index; + u32 flags; + u8 pool; /* buffer pool index */ + u8 txq_num; + + /* fields below are accessed in control-plane only (cold) */ + + char *name; + char *linux_ifname; + u32 dev_instance; + u8 hwaddr[6]; + + struct xsk_umem **umem; + struct xsk_socket **xsk; + + struct bpf_object *bpf_obj; + unsigned linux_ifindex; + + /* error */ + clib_error_t *error; +} af_xdp_device_t; + +typedef struct +{ + af_xdp_device_t *devices; + vlib_log_class_t log_class; + u16 msg_id_base; +} af_xdp_main_t; + +extern af_xdp_main_t af_xdp_main; + +typedef enum +{ + AF_XDP_MODE_AUTO = 0, + AF_XDP_MODE_COPY = 1, + AF_XDP_MODE_ZERO_COPY = 2, +} af_xdp_mode_t; + +typedef struct +{ + char *linux_ifname; + char *name; + char *prog; + af_xdp_mode_t mode; + u32 rxq_size; + u32 txq_size; + u32 rxq_num; + + /* return */ + int rv; + u32 sw_if_index; + clib_error_t *error; +} af_xdp_create_if_args_t; + +void af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args); +void af_xdp_delete_if (vlib_main_t * vm, 
af_xdp_device_t * ad); + +extern vlib_node_registration_t af_xdp_input_node; +extern vnet_device_class_t af_xdp_device_class; + +/* format.c */ +format_function_t format_af_xdp_device; +format_function_t format_af_xdp_device_name; +format_function_t format_af_xdp_input_trace; + +/* unformat.c */ +unformat_function_t unformat_af_xdp_create_if_args; + +typedef struct +{ + u32 next_index; + u32 hw_if_index; +} af_xdp_input_trace_t; + +#define foreach_af_xdp_tx_func_error \ +_(NO_FREE_SLOTS, "no free tx slots") \ +_(SENDTO_REQUIRED, "sendto required") \ +_(SENDTO_FAILURES, "sendto failures") + +typedef enum +{ +#define _(f,s) AF_XDP_TX_ERROR_##f, + foreach_af_xdp_tx_func_error +#undef _ + AF_XDP_TX_N_ERROR, +} af_xdp_tx_func_error_t; + +#endif /* _AF_XDP_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/af_xdp/af_xdp_doc.md b/src/plugins/af_xdp/af_xdp_doc.md new file mode 100644 index 00000000000..6d2dae55055 --- /dev/null +++ b/src/plugins/af_xdp/af_xdp_doc.md @@ -0,0 +1,99 @@ +# AF_XDP Ethernet driver {#af_xdp_doc} + +This driver relies on Linux AF_XDP socket to rx/tx Ethernet packets. + +## Maturity level +Under development: it should work, but has not been thoroughly tested. + +## Features + - copy and zero-copy mode + - multiqueue + - API + - custom eBPF program + - polling, interrupt and adaptive mode + +## Limitations +Because of AF_XDP restrictions, the MTU is limited to below PAGE_SIZE +(4096-bytes on most systems) minus 256-bytes, and they are additional +limitations depending upon specific Linux device drivers. +As a rule of thumb, a MTU of 3000-bytes or less should be safe. + +## Requirements +The Linux kernel interface must be up and have enough queues before +creating the VPP AF_XDP interface, otherwise Linux will deny creating +the AF_XDP socket. 
+The AF_XDP interface will claim NIC RX queue starting from 0, up to the +requested number of RX queues (only 1 by default). It means all packets +destined to NIC RX queue `[0, num_rx_queues[` will be received by the +AF_XDP interface, and only those. Depending on your configuration, there +will usually be several RX queues (typically 1 per core) and packets are +spread across queues by RSS. In order to receive consistent traffic, +you **must** program the NIC dispatching accordingly. The simplest way +to get all the packets is to reconfigure the Linux kernel driver to use +only `num_rx_queues` RX queues (i.e. all NIC queues will be associated +with the AF_XDP socket): +``` +~# ethtool -L <iface> combined <num_rx_queues> +``` +Additionally, the VPP AF_XDP interface will use a MAC address generated at +creation time instead of the Linux kernel interface MAC. As Linux kernel +interfaces are not in promiscuous mode by default (see below) this will +result in a useless configuration where the VPP AF_XDP interface only +receives packets destined to the Linux kernel interface MAC just to drop +them because the destination MAC does not match VPP AF_XDP interface MAC. +If you want to use the Linux interface MAC for the VPP AF_XDP interface, +you can change it afterwards in VPP: +``` +~# vppctl set int mac address <iface> <mac> +``` +Finally, if you wish to receive all packets and not only the packets +destined to the Linux kernel interface MAC you need to set the Linux +kernel interface in promiscuous mode: +``` +~# ip link set dev <iface> promisc on +``` + +## Security considerations +When creating an AF_XDP interface, it will receive all packets arriving +to the NIC RX queue #0. You need to configure the Linux kernel NIC +driver properly to ensure that only intended packets will arrive in +this queue. There is no way to filter the packets after-the-fact using +e.g. netfilter or eBPF. + +## Quickstart +1. 
Setup the Linux kernel interface (enp216s0f0 here) to use 4 queues: +``` +~# ethtool -L enp216s0f0 combined 4 +``` +2. Put the Linux kernel interface up and in promiscuous mode: +``` +~# ip l set dev enp216s0f0 promisc on up +``` +3. Create the AF_XDP interface: +``` +~# vppctl create int af_xdp host-if enp216s0f0 num-rx-queues 4 +``` +4. Use the interface as usual, eg.: +``` +~# vppctl set int ip addr enp216s0f0/0 1.1.1.1/24 +~# vppctl set int st enp216s0f0/0 up +~# vppctl ping 1.1.1.100` +``` + +## Custom eBPF XDP program +This driver relies on libbpf and as such relies on the `xsks_map` eBPF +map. The default behavior is to use the XDP program already attached +to the interface if any, otherwise load the default one. +You can request to load a custom XDP program with the `prog` option when +creating the interface in VPP: +``` +~# vppctl create int af_xdp host-if enp216s0f0 num-rx-queues 4 prog extras/bpf/af_xdp.bpf.o +``` +In that case it will replace any previously attached program. A custom +XDP program example is provided in `extras/bpf/`. + +## Performance consideration +AF_XDP relies on the Linux kernel NIC driver to rx/tx packets. To reach +high-performance (10's MPPS), the Linux kernel NIC driver must support +zero-copy mode and its RX path must run on a dedicated core in the NUMA +where the NIC is physically connected. diff --git a/src/plugins/af_xdp/api.c b/src/plugins/af_xdp/api.c new file mode 100644 index 00000000000..7592aa4ffda --- /dev/null +++ b/src/plugins/af_xdp/api.c @@ -0,0 +1,125 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2020 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include +#include + +#include + +#include +#include + +/* define message IDs */ +#include +#include + +#include + +static af_xdp_mode_t +af_xdp_api_mode (vl_api_af_xdp_mode_t mode) +{ + switch (mode) + { + case AF_XDP_API_MODE_AUTO: + return AF_XDP_MODE_AUTO; + case AF_XDP_API_MODE_COPY: + return AF_XDP_MODE_COPY; + case AF_XDP_API_MODE_ZERO_COPY: + return AF_XDP_MODE_ZERO_COPY; + } + return AF_XDP_MODE_AUTO; +} + +static void +vl_api_af_xdp_create_t_handler (vl_api_af_xdp_create_t * mp) +{ + vlib_main_t *vm = vlib_get_main (); + af_xdp_main_t *rm = &af_xdp_main; + vl_api_af_xdp_create_reply_t *rmp; + af_xdp_create_if_args_t args; + int rv; + + clib_memset (&args, 0, sizeof (af_xdp_create_if_args_t)); + + args.linux_ifname = mp->host_if[0] ? (char *) mp->host_if : 0; + args.name = mp->name[0] ? (char *) mp->name : 0; + args.prog = mp->prog[0] ? 
(char *) mp->prog : 0; + args.mode = af_xdp_api_mode (mp->mode); + args.rxq_size = ntohs (mp->rxq_size); + args.txq_size = ntohs (mp->txq_size); + args.rxq_num = ntohs (mp->rxq_num); + + af_xdp_create_if (vm, &args); + rv = args.rv; + + /* *INDENT-OFF* */ + REPLY_MACRO2 (VL_API_AF_XDP_CREATE_REPLY + rm->msg_id_base, + ({ + rmp->sw_if_index = ntohl (args.sw_if_index); + })); + /* *INDENT-ON* */ +} + +static void +vl_api_af_xdp_delete_t_handler (vl_api_af_xdp_delete_t * mp) +{ + vlib_main_t *vm = vlib_get_main (); + vnet_main_t *vnm = vnet_get_main (); + af_xdp_main_t *rm = &af_xdp_main; + vl_api_af_xdp_delete_reply_t *rmp; + af_xdp_device_t *rd; + vnet_hw_interface_t *hw; + int rv = 0; + + hw = + vnet_get_sup_hw_interface_api_visible_or_null (vnm, + htonl (mp->sw_if_index)); + if (hw == NULL || af_xdp_device_class.index != hw->dev_class_index) + { + rv = VNET_API_ERROR_INVALID_INTERFACE; + goto reply; + } + + rd = pool_elt_at_index (rm->devices, hw->dev_instance); + + af_xdp_delete_if (vm, rd); + +reply: + REPLY_MACRO (VL_API_AF_XDP_DELETE_REPLY + rm->msg_id_base); +} + +/* set tup the API message handling tables */ +#include +static clib_error_t * +af_xdp_plugin_api_hookup (vlib_main_t * vm) +{ + af_xdp_main_t *rm = &af_xdp_main; + + /* ask for a correctly-sized block of API message decode slots */ + rm->msg_id_base = setup_message_id_table (); + return 0; +} + +VLIB_API_INIT_FUNCTION (af_xdp_plugin_api_hookup); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/af_xdp/cli.c b/src/plugins/af_xdp/cli.c new file mode 100644 index 00000000000..5fe7c2ef399 --- /dev/null +++ b/src/plugins/af_xdp/cli.c @@ -0,0 +1,121 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +static clib_error_t * +af_xdp_create_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + af_xdp_create_if_args_t args; + + if (!unformat_user (input, unformat_af_xdp_create_if_args, &args)) + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + + af_xdp_create_if (vm, &args); + + vec_free (args.linux_ifname); + vec_free (args.name); + + return args.error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (af_xdp_create_command, static) = { + .path = "create interface af_xdp", + .short_help = "create interface af_xdp [name ifname] [rx-queue-size size] [tx-queue-size size] [num-rx-queues num] [prog pathname] [zero-copy|no-zero-copy]", + .function = af_xdp_create_command_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +af_xdp_delete_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + u32 sw_if_index = ~0; + vnet_hw_interface_t *hw; + af_xdp_main_t *am = &af_xdp_main; + af_xdp_device_t *ad; + vnet_main_t *vnm = vnet_get_main (); + + /* Get a line of input. 
*/ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "sw_if_index %d", &sw_if_index)) + ; + else if (unformat (line_input, "%U", unformat_vnet_sw_interface, + vnm, &sw_if_index)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + unformat_free (line_input); + + if (sw_if_index == ~0) + return clib_error_return (0, + "please specify interface name or sw_if_index"); + + hw = vnet_get_sup_hw_interface (vnm, sw_if_index); + if (hw == NULL || af_xdp_device_class.index != hw->dev_class_index) + return clib_error_return (0, "not an AVF interface"); + + ad = pool_elt_at_index (am->devices, hw->dev_instance); + + af_xdp_delete_if (vm, ad); + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (af_xdp_delete_command, static) = { + .path = "delete interface af_xdp", + .short_help = "delete interface af_xdp " + "{ | sw_if_index }", + .function = af_xdp_delete_command_fn, +}; +/* *INDENT-ON* */ + +clib_error_t * +af_xdp_cli_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (af_xdp_cli_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/af_xdp/device.c b/src/plugins/af_xdp/device.c new file mode 100644 index 00000000000..9bca41c962c --- /dev/null +++ b/src/plugins/af_xdp/device.c @@ -0,0 +1,533 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "af_xdp.h" + +af_xdp_main_t af_xdp_main; + +typedef struct +{ + u32 prod; + u32 cons; +} gdb_af_xdp_pair_t; + +gdb_af_xdp_pair_t +gdb_af_xdp_get_prod (const struct xsk_ring_prod *prod) +{ + gdb_af_xdp_pair_t pair = { *prod->producer, *prod->consumer }; + return pair; +} + +gdb_af_xdp_pair_t +gdb_af_xdp_get_cons (const struct xsk_ring_cons * cons) +{ + gdb_af_xdp_pair_t pair = { *cons->producer, *cons->consumer }; + return pair; +} + +static clib_error_t * +af_xdp_mac_change (vnet_hw_interface_t * hw, const u8 * old, const u8 * new) +{ + af_xdp_main_t *am = &af_xdp_main; + af_xdp_device_t *ad = vec_elt_at_index (am->devices, hw->dev_instance); + errno_t err = memcpy_s (ad->hwaddr, sizeof (ad->hwaddr), new, 6); + if (err) + return clib_error_return_code (0, -err, CLIB_ERROR_ERRNO_VALID, + "mac change failed"); + return 0; +} + +static u32 +af_xdp_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags) +{ + af_xdp_main_t *am = &af_xdp_main; + af_xdp_device_t *ad = vec_elt_at_index (am->devices, hw->dev_instance); + + switch (flags) + { + case 0: + af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, "set unicast not supported yet"); + return ~0; + case ETHERNET_INTERFACE_FLAG_ACCEPT_ALL: + af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, + "set promiscuous not supported yet"); + return ~0; + case ETHERNET_INTERFACE_FLAG_MTU: + af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, "set mtu not supported yet"); + 
return ~0; + } + + af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, "unknown flag %x requested", flags); + return ~0; +} + +void +af_xdp_delete_if (vlib_main_t * vm, af_xdp_device_t * ad) +{ + vnet_main_t *vnm = vnet_get_main (); + af_xdp_main_t *axm = &af_xdp_main; + struct xsk_socket **xsk; + struct xsk_umem **umem; + af_xdp_rxq_t *rxq; + af_xdp_txq_t *txq; + + if (ad->hw_if_index) + { + vnet_hw_interface_set_flags (vnm, ad->hw_if_index, 0); + vnet_hw_interface_unassign_rx_thread (vnm, ad->hw_if_index, 0); + ethernet_delete_interface (vnm, ad->hw_if_index); + } + + vec_foreach (rxq, ad->rxqs) clib_file_del_by_index (&file_main, + rxq->file_index); + vec_foreach (txq, ad->txqs) clib_spinlock_free (&txq->lock); + vec_foreach (xsk, ad->xsk) xsk_socket__delete (*xsk); + vec_foreach (umem, ad->umem) xsk_umem__delete (*umem); + + if (ad->bpf_obj) + { + bpf_set_link_xdp_fd (ad->linux_ifindex, -1, 0); + bpf_object__unload (ad->bpf_obj); + } + + vec_free (ad->xsk); + vec_free (ad->umem); + vec_free (ad->buffer_template); + vec_free (ad->rxqs); + vec_free (ad->txqs); + clib_error_free (ad->error); + pool_put (axm->devices, ad); +} + +static int +af_xdp_load_program (af_xdp_create_if_args_t * args, af_xdp_device_t * ad) +{ + int fd; + + ad->linux_ifindex = if_nametoindex (ad->linux_ifname); + if (!ad->linux_ifindex) + { + args->rv = VNET_API_ERROR_INVALID_VALUE; + args->error = + clib_error_return_unix (0, "if_nametoindex(%s) failed", + ad->linux_ifname); + goto err0; + } + + if (bpf_prog_load (args->prog, BPF_PROG_TYPE_XDP, &ad->bpf_obj, &fd)) + { + args->rv = VNET_API_ERROR_SYSCALL_ERROR_5; + args->error = + clib_error_return_unix (0, "bpf_prog_load(%s) failed", args->prog); + goto err0; + } + +#ifndef XDP_FLAGS_REPLACE +#define XDP_FLAGS_REPLACE 0 +#endif + if (bpf_set_link_xdp_fd (ad->linux_ifindex, fd, XDP_FLAGS_REPLACE)) + { + args->rv = VNET_API_ERROR_SYSCALL_ERROR_6; + args->error = + clib_error_return_unix (0, "bpf_set_link_xdp_fd(%s) failed", + ad->linux_ifname); + goto err1; 
+ } + + return 0; + +err1: + bpf_object__unload (ad->bpf_obj); + ad->bpf_obj = 0; +err0: + ad->linux_ifindex = ~0; + return -1; +} + +static int +af_xdp_create_queue (vlib_main_t * vm, af_xdp_create_if_args_t * args, + af_xdp_device_t * ad, int qid, int rxq_num, int txq_num) +{ + struct xsk_umem **umem = vec_elt_at_index (ad->umem, qid); + struct xsk_socket **xsk = vec_elt_at_index (ad->xsk, qid); + af_xdp_rxq_t *rxq = vec_elt_at_index (ad->rxqs, qid); + af_xdp_txq_t *txq = vec_elt_at_index (ad->txqs, qid); + struct xsk_umem_config umem_config; + struct xsk_socket_config sock_config; + struct xdp_options opt; + socklen_t optlen; + /* + * fq and cq must always be allocated even if unused + * whereas rx and tx indicates whether we want rxq, txq, or both + */ + struct xsk_ring_cons *rx = qid < rxq_num ? &rxq->rx : 0; + struct xsk_ring_prod *fq = &rxq->fq; + struct xsk_ring_prod *tx = qid < txq_num ? &txq->tx : 0; + struct xsk_ring_cons *cq = &txq->cq; + int fd; + + memset (&umem_config, 0, sizeof (umem_config)); + umem_config.fill_size = args->rxq_size; + umem_config.comp_size = args->txq_size; + umem_config.frame_size = + sizeof (vlib_buffer_t) + vlib_buffer_get_default_data_size (vm); + umem_config.flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG; + if (xsk_umem__create + (umem, uword_to_pointer (vm->buffer_main->buffer_mem_start, void *), + vm->buffer_main->buffer_mem_size, fq, cq, &umem_config)) + { + args->rv = VNET_API_ERROR_SYSCALL_ERROR_1; + args->error = clib_error_return_unix (0, "xsk_umem__create() failed"); + goto err0; + } + + memset (&sock_config, 0, sizeof (sock_config)); + sock_config.rx_size = args->rxq_size; + sock_config.tx_size = args->txq_size; + sock_config.bind_flags = XDP_USE_NEED_WAKEUP; + switch (args->mode) + { + case AF_XDP_MODE_AUTO: + break; + case AF_XDP_MODE_COPY: + sock_config.bind_flags |= XDP_COPY; + break; + case AF_XDP_MODE_ZERO_COPY: + sock_config.bind_flags |= XDP_ZEROCOPY; + break; + } + if (xsk_socket__create + (xsk, ad->linux_ifname, 
qid, *umem, rx, tx, &sock_config)) + { + args->rv = VNET_API_ERROR_SYSCALL_ERROR_2; + args->error = + clib_error_return_unix (0, + "xsk_socket__create() failed (is linux netdev %s up?)", + ad->linux_ifname); + goto err1; + } + + fd = xsk_socket__fd (*xsk); + optlen = sizeof (opt); + if (getsockopt (fd, SOL_XDP, XDP_OPTIONS, &opt, &optlen)) + { + args->rv = VNET_API_ERROR_SYSCALL_ERROR_3; + args->error = + clib_error_return_unix (0, "getsockopt(XDP_OPTIONS) failed"); + goto err2; + } + if (opt.flags & XDP_OPTIONS_ZEROCOPY) + ad->flags |= AF_XDP_DEVICE_F_ZEROCOPY; + + rxq->xsk_fd = qid < rxq_num ? fd : -1; + txq->xsk_fd = qid < txq_num ? fd : -1; + + return 0; + +err2: + xsk_socket__delete (*xsk); +err1: + xsk_umem__delete (*umem); +err0: + *umem = 0; + *xsk = 0; + return -1; +} + +static int +af_xdp_get_numa (const char *ifname) +{ + FILE *fptr; + int numa; + char *s; + + s = (char *) format (0, "/sys/class/net/%s/device/numa_node%c", ifname, 0); + fptr = fopen (s, "rb"); + vec_free (s); + + if (!fptr) + return 0; + + if (fscanf (fptr, "%d\n", &numa) != 1) + numa = 0; + + fclose (fptr); + return numa; +} + +static clib_error_t * +af_xdp_device_rxq_read_ready (clib_file_t * f) +{ + vnet_main_t *vnm = vnet_get_main (); + const af_xdp_main_t *am = &af_xdp_main; + const u32 dev_instance = f->private_data >> 16; + const u16 qid = f->private_data & 0xffff; + const af_xdp_device_t *ad = vec_elt_at_index (am->devices, dev_instance); + vnet_device_input_set_interrupt_pending (vnm, ad->hw_if_index, qid); + return 0; +} + +void +af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args) +{ + vnet_main_t *vnm = vnet_get_main (); + vlib_thread_main_t *tm = vlib_get_thread_main (); + af_xdp_main_t *am = &af_xdp_main; + af_xdp_device_t *ad; + vnet_sw_interface_t *sw; + vnet_hw_interface_t *hw; + int rxq_num, txq_num, q_num; + int i; + + args->rxq_size = args->rxq_size ? args->rxq_size : 2 * VLIB_FRAME_SIZE; + args->txq_size = args->txq_size ? 
args->txq_size : 2 * VLIB_FRAME_SIZE; + rxq_num = args->rxq_num ? args->rxq_num : 1; + txq_num = tm->n_vlib_mains; + + if (!args->linux_ifname) + { + args->rv = VNET_API_ERROR_INVALID_VALUE; + args->error = clib_error_return (0, "missing host interface"); + goto err0; + } + + if (args->rxq_size < VLIB_FRAME_SIZE || args->txq_size < VLIB_FRAME_SIZE || + args->rxq_size > 65535 || args->txq_size > 65535 || + !is_pow2 (args->rxq_size) || !is_pow2 (args->txq_size)) + { + args->rv = VNET_API_ERROR_INVALID_VALUE; + args->error = + clib_error_return (0, + "queue size must be a power of two between %i and 65535", + VLIB_FRAME_SIZE); + goto err0; + } + + pool_get_zero (am->devices, ad); + + ad->linux_ifname = (char *) format (0, "%s", args->linux_ifname); + vec_validate (ad->linux_ifname, IFNAMSIZ - 1); /* libbpf expects ifname to be at least IFNAMSIZ */ + + if (args->prog && af_xdp_load_program (args, ad)) + goto err1; + + q_num = clib_max (rxq_num, txq_num); + vec_validate_aligned (ad->rxqs, q_num - 1, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (ad->txqs, q_num - 1, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (ad->umem, q_num - 1, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (ad->xsk, q_num - 1, CLIB_CACHE_LINE_BYTES); + ad->txq_num = txq_num; + for (i = 0; i < q_num; i++) + { + if (af_xdp_create_queue (vm, args, ad, i, rxq_num, txq_num)) + { + /* + * queue creation failed + * it is only a fatal error if we could not create the number of rx + * queues requested explicitly by the user + * we might create fewer tx queues than workers but this is ok + */ + af_xdp_txq_t *txq; + + /* fixup vectors length */ + vec_set_len (ad->umem, i); + vec_set_len (ad->xsk, i); + vec_set_len (ad->rxqs, i); + vec_set_len (ad->txqs, i); + + if (i < rxq_num) + goto err1; /* failed creating requested rxq: fatal error, bailing out */ + + /* + * we created all rxq but failed some txq: not an error but + * initialize lock for shared txq + */ + ad->txq_num = i; + vec_foreach (txq, ad->txqs) 
clib_spinlock_init (&txq->lock); + args->rv = 0; + clib_error_free (args->error); + break; + } + } + + ad->dev_instance = ad - am->devices; + ad->per_interface_next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; + ad->pool = + vlib_buffer_pool_get_default_for_numa (vm, + af_xdp_get_numa + (ad->linux_ifname)); + if (!args->name) + ad->name = + (char *) format (0, "%s/%d", ad->linux_ifname, ad->dev_instance); + else + ad->name = (char *) format (0, "%s", args->name); + + ethernet_mac_address_generate (ad->hwaddr); + + /* create interface */ + if (ethernet_register_interface (vnm, af_xdp_device_class.index, + ad->dev_instance, ad->hwaddr, + &ad->hw_if_index, af_xdp_flag_change)) + { + args->rv = VNET_API_ERROR_INVALID_INTERFACE; + args->error = + clib_error_return (0, "ethernet_register_interface() failed"); + goto err1; + } + + sw = vnet_get_hw_sw_interface (vnm, ad->hw_if_index); + hw = vnet_get_hw_interface (vnm, ad->hw_if_index); + args->sw_if_index = ad->sw_if_index = sw->sw_if_index; + hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE; + + vnet_hw_interface_set_input_node (vnm, ad->hw_if_index, + af_xdp_input_node.index); + + for (i = 0; i < rxq_num; i++) + { + af_xdp_rxq_t *rxq = vec_elt_at_index (ad->rxqs, i); + clib_file_t f = { + .file_descriptor = rxq->xsk_fd, + .flags = UNIX_FILE_EVENT_EDGE_TRIGGERED, + .private_data = (uword) ad->dev_instance << 16 | (uword) i, + .read_function = af_xdp_device_rxq_read_ready, + .description = + format (0, "%U rxq %d", format_af_xdp_device_name, ad->dev_instance, + i), + }; + rxq->file_index = clib_file_add (&file_main, &f); + vnet_hw_interface_assign_rx_thread (vnm, ad->hw_if_index, i, ~0); + } + + /* buffer template */ + vec_validate_aligned (ad->buffer_template, 1, CLIB_CACHE_LINE_BYTES); + ad->buffer_template->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID; + ad->buffer_template->ref_count = 1; + vnet_buffer (ad->buffer_template)->sw_if_index[VLIB_RX] = ad->sw_if_index; + vnet_buffer 
(ad->buffer_template)->sw_if_index[VLIB_TX] = (u32) ~ 0; + ad->buffer_template->buffer_pool_index = ad->pool; + + return; + +err1: + af_xdp_delete_if (vm, ad); +err0: + vlib_log_err (am->log_class, "%U", format_clib_error, args->error); +} + +static clib_error_t * +af_xdp_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) +{ + vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index); + af_xdp_main_t *am = &af_xdp_main; + af_xdp_device_t *ad = vec_elt_at_index (am->devices, hi->dev_instance); + uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; + + if (ad->flags & AF_XDP_DEVICE_F_ERROR) + return clib_error_return (0, "device is in error state"); + + if (is_up) + { + vnet_hw_interface_set_flags (vnm, ad->hw_if_index, + VNET_HW_INTERFACE_FLAG_LINK_UP); + ad->flags |= AF_XDP_DEVICE_F_ADMIN_UP; + } + else + { + vnet_hw_interface_set_flags (vnm, ad->hw_if_index, 0); + ad->flags &= ~AF_XDP_DEVICE_F_ADMIN_UP; + } + return 0; +} + +static void +af_xdp_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index, + u32 node_index) +{ + af_xdp_main_t *am = &af_xdp_main; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + af_xdp_device_t *ad = pool_elt_at_index (am->devices, hw->dev_instance); + + /* Shut off redirection */ + if (node_index == ~0) + { + ad->per_interface_next_index = node_index; + return; + } + + ad->per_interface_next_index = + vlib_node_add_next (vlib_get_main (), af_xdp_input_node.index, + node_index); +} + +static char *af_xdp_tx_func_error_strings[] = { +#define _(n,s) s, + foreach_af_xdp_tx_func_error +#undef _ +}; + +static void +af_xdp_clear (u32 dev_instance) +{ + af_xdp_main_t *am = &af_xdp_main; + af_xdp_device_t *ad = pool_elt_at_index (am->devices, dev_instance); + clib_error_free (ad->error); +} + +/* *INDENT-OFF* */ +VNET_DEVICE_CLASS (af_xdp_device_class) = +{ + .name = "AF_XDP interface", + .format_device = format_af_xdp_device, + .format_device_name = format_af_xdp_device_name, + 
.admin_up_down_function = af_xdp_interface_admin_up_down, + .rx_redirect_to_node = af_xdp_set_interface_next_node, + .tx_function_n_errors = AF_XDP_TX_N_ERROR, + .tx_function_error_strings = af_xdp_tx_func_error_strings, + .mac_addr_change_function = af_xdp_mac_change, + .clear_counters = af_xdp_clear, +}; +/* *INDENT-ON* */ + +clib_error_t * +af_xdp_init (vlib_main_t * vm) +{ + af_xdp_main_t *am = &af_xdp_main; + + am->log_class = vlib_log_register_class ("af_xdp", 0); + + return 0; +} + +VLIB_INIT_FUNCTION (af_xdp_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/af_xdp/format.c b/src/plugins/af_xdp/format.c new file mode 100644 index 00000000000..acdc70e6a0d --- /dev/null +++ b/src/plugins/af_xdp/format.c @@ -0,0 +1,89 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +#include +#include +#include +#include + +#include + +u8 * +format_af_xdp_device_name (u8 * s, va_list * args) +{ + u32 i = va_arg (*args, u32); + af_xdp_main_t *am = &af_xdp_main; + af_xdp_device_t *ad = vec_elt_at_index (am->devices, i); + + s = format (s, "%v", ad->name); + return s; +} + +u8 * +format_af_xdp_device_flags (u8 * s, va_list * args) +{ + af_xdp_device_t *ad = va_arg (*args, af_xdp_device_t *); +#define _(a, b, c) \ + if (ad->flags & (1 << a)) \ + s = format (s, "%s ", c); + foreach_af_xdp_device_flags +#undef _ + return s; +} + +u8 * +format_af_xdp_device (u8 * s, va_list * args) +{ + u32 i = va_arg (*args, u32); + af_xdp_main_t *am = &af_xdp_main; + af_xdp_device_t *ad = vec_elt_at_index (am->devices, i); + u32 indent = format_get_indent (s); + + s = format (s, "netdev %v\n", ad->linux_ifname); + s = + format (s, "%Uflags: %U", format_white_space, indent, + format_af_xdp_device_flags, ad); + if (ad->error) + s = format (s, "\n%Uerror %U", format_white_space, indent, + format_clib_error, ad->error); + + return s; +} + +u8 * +format_af_xdp_input_trace (u8 * s, va_list * args) +{ + vlib_main_t *vm = va_arg (*args, vlib_main_t *); + vlib_node_t *node = va_arg (*args, vlib_node_t *); + af_xdp_input_trace_t *t = va_arg (*args, af_xdp_input_trace_t *); + vnet_main_t *vnm = vnet_get_main (); + vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, t->hw_if_index); + + s = format (s, "af_xdp: %v (%d) next-node %U", + hi->name, t->hw_if_index, format_vlib_next_node_name, vm, + node->index, t->next_index); + + return s; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/af_xdp/input.c b/src/plugins/af_xdp/input.c new file mode 100644 index 00000000000..c5b3488d438 --- /dev/null +++ b/src/plugins/af_xdp/input.c @@ -0,0 +1,367 @@ +/* + 
*------------------------------------------------------------------ + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include +#include +#include +#include +#include +#include +#include "af_xdp.h" + +#define foreach_af_xdp_input_error \ + _(POLL_REQUIRED, "poll required") \ + _(POLL_FAILURES, "poll failures") + +typedef enum +{ +#define _(f,s) AF_XDP_INPUT_ERROR_##f, + foreach_af_xdp_input_error +#undef _ + AF_XDP_INPUT_N_ERROR, +} af_xdp_input_error_t; + +static __clib_unused char *af_xdp_input_error_strings[] = { +#define _(n,s) s, + foreach_af_xdp_input_error +#undef _ +}; + +static_always_inline void +af_xdp_device_input_trace (vlib_main_t * vm, vlib_node_runtime_t * node, + u32 n_left, const u32 * bi, u32 next_index, + u32 hw_if_index) +{ + u32 n_trace; + + if (PREDICT_TRUE (0 == (n_trace = vlib_get_trace_count (vm, node)))) + return; + + while (n_trace && n_left) + { + vlib_buffer_t *b; + af_xdp_input_trace_t *tr; + b = vlib_get_buffer (vm, bi[0]); + vlib_trace_buffer (vm, node, next_index, b, + /* follow_chain */ 0); + tr = vlib_add_trace (vm, node, b, sizeof (*tr)); + tr->next_index = next_index; + tr->hw_if_index = hw_if_index; + + n_trace--; + n_left--; + bi++; + } + + vlib_set_trace_count (vm, node, n_trace); +} + +static_always_inline void +af_xdp_device_input_refill_db (vlib_main_t * vm, + const vlib_node_runtime_t * 
node, + af_xdp_device_t * ad, af_xdp_rxq_t * rxq, + const u32 n_alloc) +{ + int ret; + + xsk_ring_prod__submit (&rxq->fq, n_alloc); + + if (!xsk_ring_prod__needs_wakeup (&rxq->fq)) + return; + + vlib_error_count (vm, node->node_index, AF_XDP_INPUT_ERROR_POLL_REQUIRED, + 1); + + struct pollfd fd = {.fd = rxq->xsk_fd,.events = POLLIN }; + ret = poll (&fd, 1, 0); + if (PREDICT_TRUE (ret >= 0)) + return; + + /* something bad is happening */ + vlib_error_count (vm, node->node_index, AF_XDP_INPUT_ERROR_POLL_FAILURES, + 1); + af_xdp_device_error (ad, "poll() failed"); +} + +static_always_inline void +af_xdp_device_input_refill (vlib_main_t * vm, + const vlib_node_runtime_t * node, + af_xdp_device_t * ad, af_xdp_rxq_t * rxq, + const int copy) +{ + __u64 *fill; + const u32 size = rxq->fq.size; + const u32 mask = size - 1; + u32 bis[VLIB_FRAME_SIZE], *bi = bis; + u32 n_alloc, n, n_wrap; + u32 idx; + + ASSERT (mask == rxq->fq.mask); + + /* do not enqueue more packets than ring space */ + n_alloc = xsk_prod_nb_free (&rxq->fq, 16); + /* do not bother to allocate if too small */ + if (n_alloc < 16) + return; + + n_alloc = clib_min (n_alloc, ARRAY_LEN (bis)); + n_alloc = vlib_buffer_alloc_from_pool (vm, bis, n_alloc, ad->pool); + n = xsk_ring_prod__reserve (&rxq->fq, n_alloc, &idx); + ASSERT (n == n_alloc); + + fill = xsk_ring_prod__fill_addr (&rxq->fq, idx); + n = clib_min (n_alloc, size - (idx & mask)); + n_wrap = n_alloc - n; + + /* + * Note about headroom: for some reason, there seems to be a discrepancy + * between 0-copy and copy mode: + * - 0-copy: XDP_PACKET_HEADROOM will be added to the user headroom + * - copy: nothing is added to the user headroom + * We favor 0-copy and set headroom to 0. As XDP_PACKET_HEADROOM == + * sizeof(vlib_buffer_t), data will correctly point to vlib_buffer_t->data. + * In copy mode, we have to add sizeof(vlib_buffer_t) to desc offset during + * refill. 
+ */ + STATIC_ASSERT (sizeof (vlib_buffer_t) == XDP_PACKET_HEADROOM, "wrong size"); +#define bi2addr(bi) \ + (((bi) << CLIB_LOG2_CACHE_LINE_BYTES) + (copy ? sizeof(vlib_buffer_t) : 0)) + +wrap_around: + + while (n >= 8) + { +#ifdef CLIB_HAVE_VEC256 + u64x4 b0 = u32x4_extend_to_u64x4 (*(u32x4u *) (bi + 0)); + u64x4 b1 = u32x4_extend_to_u64x4 (*(u32x4u *) (bi + 4)); + *(u64x4u *) (fill + 0) = bi2addr (b0); + *(u64x4u *) (fill + 4) = bi2addr (b1); +#else + fill[0] = bi2addr (bi[0]); + fill[1] = bi2addr (bi[1]); + fill[2] = bi2addr (bi[2]); + fill[3] = bi2addr (bi[3]); + fill[4] = bi2addr (bi[4]); + fill[5] = bi2addr (bi[5]); + fill[6] = bi2addr (bi[6]); + fill[7] = bi2addr (bi[7]); +#endif + fill += 8; + bi += 8; + n -= 8; + } + + while (n >= 1) + { + fill[0] = bi2addr (bi[0]); + fill += 1; + bi += 1; + n -= 1; + } + + if (n_wrap) + { + fill = xsk_ring_prod__fill_addr (&rxq->fq, 0); + n = n_wrap; + n_wrap = 0; + goto wrap_around; + } + + af_xdp_device_input_refill_db (vm, node, ad, rxq, n_alloc); +} + +static_always_inline void +af_xdp_device_input_ethernet (vlib_main_t * vm, vlib_node_runtime_t * node, + const u32 next_index, const u32 sw_if_index, + const u32 hw_if_index) +{ + vlib_next_frame_t *nf; + vlib_frame_t *f; + ethernet_input_frame_t *ef; + + if (PREDICT_FALSE (VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT != next_index)) + return; + + nf = + vlib_node_runtime_get_next_frame (vm, node, + VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT); + f = vlib_get_frame (vm, nf->frame); + f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX; + + ef = vlib_frame_scalar_args (f); + ef->sw_if_index = sw_if_index; + ef->hw_if_index = hw_if_index; +} + +static_always_inline u32 +af_xdp_device_input_bufs (vlib_main_t * vm, const af_xdp_device_t * ad, + af_xdp_rxq_t * rxq, u32 * bis, const u32 n_rx, + vlib_buffer_t * bt, u32 idx, const int copy) +{ + vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs; + u16 lens[VLIB_FRAME_SIZE], *len = lens; + const u32 mask = rxq->rx.mask; + u32 n = n_rx, *bi = bis, 
bytes = 0; + +#define addr2bi(addr) \ + (((addr) - (copy ? sizeof(vlib_buffer_t) : 0)) >> CLIB_LOG2_CACHE_LINE_BYTES) + + while (n >= 1) + { + const struct xdp_desc *desc = xsk_ring_cons__rx_desc (&rxq->rx, idx); + bi[0] = addr2bi (xsk_umem__extract_addr (desc->addr)); + ASSERT (vlib_buffer_is_known (vm, bi[0]) == + VLIB_BUFFER_KNOWN_ALLOCATED); + len[0] = desc->len; + idx = (idx + 1) & mask; + bi += 1; + len += 1; + n -= 1; + } + + vlib_get_buffers (vm, bis, bufs, n_rx); + + n = n_rx; + len = lens; + + while (n >= 8) + { + vlib_prefetch_buffer_header (b[4], LOAD); + vlib_buffer_copy_template (b[0], bt); + bytes += b[0]->current_length = len[0]; + + vlib_prefetch_buffer_header (b[5], LOAD); + vlib_buffer_copy_template (b[1], bt); + bytes += b[1]->current_length = len[1]; + + vlib_prefetch_buffer_header (b[6], LOAD); + vlib_buffer_copy_template (b[2], bt); + bytes += b[2]->current_length = len[2]; + + vlib_prefetch_buffer_header (b[7], LOAD); + vlib_buffer_copy_template (b[3], bt); + bytes += b[3]->current_length = len[3]; + + b += 4; + len += 4; + n -= 4; + } + + while (n >= 1) + { + vlib_buffer_copy_template (b[0], bt); + bytes += b[0]->current_length = len[0]; + b += 1; + len += 1; + n -= 1; + } + + xsk_ring_cons__release (&rxq->rx, n_rx); + return bytes; +} + +static_always_inline uword +af_xdp_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame, af_xdp_device_t * ad, + u16 qid, const int copy) +{ + vnet_main_t *vnm = vnet_get_main (); + af_xdp_rxq_t *rxq = vec_elt_at_index (ad->rxqs, qid); + vlib_buffer_t bt; + u32 next_index, *to_next, n_left_to_next; + u32 n_rx_packets, n_rx_bytes; + u32 idx; + + n_rx_packets = xsk_ring_cons__peek (&rxq->rx, VLIB_FRAME_SIZE, &idx); + + if (PREDICT_FALSE (0 == n_rx_packets)) + goto refill; + + vlib_buffer_copy_template (&bt, ad->buffer_template); + next_index = ad->per_interface_next_index; + if (PREDICT_FALSE (vnet_device_input_have_features (ad->sw_if_index))) + 
vnet_feature_start_device_input_x1 (ad->sw_if_index, &next_index, &bt); + + vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next); + + n_rx_bytes = + af_xdp_device_input_bufs (vm, ad, rxq, to_next, n_rx_packets, &bt, idx, + copy); + af_xdp_device_input_ethernet (vm, node, next_index, ad->sw_if_index, + ad->hw_if_index); + + vlib_put_next_frame (vm, node, next_index, n_left_to_next - n_rx_packets); + + af_xdp_device_input_trace (vm, node, n_rx_packets, to_next, next_index, + ad->hw_if_index); + + vlib_increment_combined_counter + (vnm->interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, vm->thread_index, + ad->hw_if_index, n_rx_packets, n_rx_bytes); + +refill: + af_xdp_device_input_refill (vm, node, ad, rxq, copy); + + return n_rx_packets; +} + +VLIB_NODE_FN (af_xdp_input_node) (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_rx = 0; + af_xdp_main_t *am = &af_xdp_main; + vnet_device_input_runtime_t *rt = (void *) node->runtime_data; + vnet_device_and_queue_t *dq; + + foreach_device_and_queue (dq, rt->devices_and_queues) + { + af_xdp_device_t *ad; + ad = vec_elt_at_index (am->devices, dq->dev_instance); + if ((ad->flags & AF_XDP_DEVICE_F_ADMIN_UP) == 0) + continue; + if (PREDICT_TRUE (ad->flags & AF_XDP_DEVICE_F_ZEROCOPY)) + n_rx += af_xdp_device_input_inline (vm, node, frame, ad, dq->queue_id, + /* copy */ 0); + else + n_rx += af_xdp_device_input_inline (vm, node, frame, ad, dq->queue_id, + /* copy */ 1); + } + return n_rx; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (af_xdp_input_node) = { + .name = "af_xdp-input", + .sibling_of = "device-input", + .format_trace = format_af_xdp_input_trace, + .type = VLIB_NODE_TYPE_INPUT, + .state = VLIB_NODE_STATE_DISABLED, + .n_errors = AF_XDP_INPUT_N_ERROR, + .error_strings = af_xdp_input_error_strings, + .flags = VLIB_NODE_FLAG_TRACE_SUPPORTED, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: 
(c-set-style "gnu") + * End: + */ diff --git a/src/plugins/af_xdp/output.c b/src/plugins/af_xdp/output.c new file mode 100644 index 00000000000..52c34e00d95 --- /dev/null +++ b/src/plugins/af_xdp/output.c @@ -0,0 +1,260 @@ +#include +#include +#include +#include +#include +#include +#include + +#define AF_XDP_TX_RETRIES 5 + +static_always_inline void +af_xdp_device_output_free (vlib_main_t * vm, const vlib_node_runtime_t * node, + af_xdp_txq_t * txq) +{ + const __u64 *compl; + const u32 size = txq->cq.size; + const u32 mask = size - 1; + u32 bis[VLIB_FRAME_SIZE], *bi = bis; + u32 n_wrap, idx; + u32 n = xsk_ring_cons__peek (&txq->cq, ARRAY_LEN (bis), &idx); + const u32 n_free = n; + + /* we rely on casting addr (u64) -> bi (u32) to discard XSK offset below */ + STATIC_ASSERT (BITS (bi[0]) + CLIB_LOG2_CACHE_LINE_BYTES <= + XSK_UNALIGNED_BUF_OFFSET_SHIFT, "wrong size"); + ASSERT (mask == txq->cq.mask); + + if (!n_free) + return; + + compl = xsk_ring_cons__comp_addr (&txq->cq, idx); + n = clib_min (n_free, size - (idx & mask)); + n_wrap = n_free - n; + +wrap_around: + + while (n >= 8) + { +#ifdef CLIB_HAVE_VEC256 + u64x4 b0 = (*(u64x4u *) (compl + 0)) >> CLIB_LOG2_CACHE_LINE_BYTES; + u64x4 b1 = (*(u64x4u *) (compl + 4)) >> CLIB_LOG2_CACHE_LINE_BYTES; + /* permute 256-bit register so lower u32s of each buffer index are + * placed into lower 128-bits */ + const u32x8 mask = { 0, 2, 4, 6, 1, 3, 5, 7 }; + u32x8 b2 = u32x8_permute ((u32x8) b0, mask); + u32x8 b3 = u32x8_permute ((u32x8) b1, mask); + /* extract lower 128-bits and save them to the array of buffer indices */ + *(u32x4u *) (bi + 0) = u32x8_extract_lo (b2); + *(u32x4u *) (bi + 4) = u32x8_extract_lo (b3); +#else + bi[0] = compl[0] >> CLIB_LOG2_CACHE_LINE_BYTES; + bi[1] = compl[1] >> CLIB_LOG2_CACHE_LINE_BYTES; + bi[2] = compl[2] >> CLIB_LOG2_CACHE_LINE_BYTES; + bi[3] = compl[3] >> CLIB_LOG2_CACHE_LINE_BYTES; + bi[4] = compl[4] >> CLIB_LOG2_CACHE_LINE_BYTES; + bi[5] = compl[5] >> CLIB_LOG2_CACHE_LINE_BYTES; + 
bi[6] = compl[6] >> CLIB_LOG2_CACHE_LINE_BYTES; + bi[7] = compl[7] >> CLIB_LOG2_CACHE_LINE_BYTES; +#endif + compl += 8; + bi += 8; + n -= 8; + } + + while (n >= 1) + { + bi[0] = compl[0] >> CLIB_LOG2_CACHE_LINE_BYTES; + ASSERT (vlib_buffer_is_known (vm, bi[0]) == + VLIB_BUFFER_KNOWN_ALLOCATED); + compl += 1; + bi += 1; + n -= 1; + } + + if (n_wrap) + { + compl = xsk_ring_cons__comp_addr (&txq->cq, 0); + n = n_wrap; + n_wrap = 0; + goto wrap_around; + } + + xsk_ring_cons__release (&txq->cq, n_free); + vlib_buffer_free (vm, bis, n_free); +} + +static_always_inline void +af_xdp_device_output_tx_db (vlib_main_t * vm, + const vlib_node_runtime_t * node, + af_xdp_device_t * ad, + af_xdp_txq_t * txq, const u32 n_tx) +{ + int ret; + + xsk_ring_prod__submit (&txq->tx, n_tx); + + if (!xsk_ring_prod__needs_wakeup (&txq->tx)) + return; + + vlib_error_count (vm, node->node_index, AF_XDP_TX_ERROR_SENDTO_REQUIRED, 1); + + ret = sendto (txq->xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0); + if (PREDICT_TRUE (ret >= 0)) + return; + + /* those errors are fine */ + switch (errno) + { + case ENOBUFS: + case EAGAIN: + case EBUSY: + return; + } + + /* something bad is happening */ + vlib_error_count (vm, node->node_index, AF_XDP_TX_ERROR_SENDTO_FAILURES, 1); + af_xdp_device_error (ad, "sendto() failed"); +} + +static_always_inline u32 +af_xdp_device_output_tx_try (vlib_main_t * vm, + const vlib_node_runtime_t * node, + af_xdp_device_t * ad, af_xdp_txq_t * txq, + u32 n_tx, u32 * bi) +{ + vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs; + const uword start = vm->buffer_main->buffer_mem_start; + const u32 size = txq->tx.size; + const u32 mask = size - 1; + struct xdp_desc *desc; + u64 offset, addr; + u32 idx, n, n_wrap; + + ASSERT (mask == txq->cq.mask); + + n_tx = xsk_ring_prod__reserve (&txq->tx, n_tx, &idx); + + /* if ring is full, do nothing */ + if (PREDICT_FALSE (0 == n_tx)) + return 0; + + vlib_get_buffers (vm, bi, bufs, n_tx); + + desc = xsk_ring_prod__tx_desc (&txq->tx, idx); + n = 
clib_min (n_tx, size - (idx & mask)); + n_wrap = n_tx - n; + +wrap_around: + + while (n >= 8) + { + vlib_prefetch_buffer_header (b[4], LOAD); + offset = + (sizeof (vlib_buffer_t) + + b[0]->current_data) << XSK_UNALIGNED_BUF_OFFSET_SHIFT; + addr = pointer_to_uword (b[0]) - start; + desc[0].addr = offset | addr; + desc[0].len = b[0]->current_length; + + vlib_prefetch_buffer_header (b[5], LOAD); + offset = + (sizeof (vlib_buffer_t) + + b[1]->current_data) << XSK_UNALIGNED_BUF_OFFSET_SHIFT; + addr = pointer_to_uword (b[1]) - start; + desc[1].addr = offset | addr; + desc[1].len = b[1]->current_length; + + vlib_prefetch_buffer_header (b[6], LOAD); + offset = + (sizeof (vlib_buffer_t) + + b[2]->current_data) << XSK_UNALIGNED_BUF_OFFSET_SHIFT; + addr = pointer_to_uword (b[2]) - start; + desc[2].addr = offset | addr; + desc[2].len = b[2]->current_length; + + vlib_prefetch_buffer_header (b[7], LOAD); + offset = + (sizeof (vlib_buffer_t) + + b[3]->current_data) << XSK_UNALIGNED_BUF_OFFSET_SHIFT; + addr = pointer_to_uword (b[3]) - start; + desc[3].addr = offset | addr; + desc[3].len = b[3]->current_length; + + desc += 4; + b += 4; + n -= 4; + } + + while (n >= 1) + { + offset = + (sizeof (vlib_buffer_t) + + b[0]->current_data) << XSK_UNALIGNED_BUF_OFFSET_SHIFT; + addr = pointer_to_uword (b[0]) - start; + desc[0].addr = offset | addr; + desc[0].len = b[0]->current_length; + desc += 1; + b += 1; + n -= 1; + } + + if (n_wrap) + { + desc = xsk_ring_prod__tx_desc (&txq->tx, 0); + n = n_wrap; + n_wrap = 0; + goto wrap_around; + } + + return n_tx; +} + +VNET_DEVICE_CLASS_TX_FN (af_xdp_device_class) (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + af_xdp_main_t *rm = &af_xdp_main; + vnet_interface_output_runtime_t *ord = (void *) node->runtime_data; + af_xdp_device_t *ad = pool_elt_at_index (rm->devices, ord->dev_instance); + u32 thread_index = vm->thread_index; + af_xdp_txq_t *txq = vec_elt_at_index (ad->txqs, thread_index % ad->txq_num); + u32 *from; + 
u32 n, n_tx; + int i; + + from = vlib_frame_vector_args (frame); + n_tx = frame->n_vectors; + + clib_spinlock_lock_if_init (&txq->lock); + + for (i = 0, n = 0; i < AF_XDP_TX_RETRIES && n < n_tx; i++) /* retry a bounded number of times, reclaiming completed buffers before each attempt */ + { + u32 n_enq; + af_xdp_device_output_free (vm, node, txq); + n_enq = af_xdp_device_output_tx_try (vm, node, ad, txq, n_tx - n, from); + n += n_enq; + from += n_enq; /* from now points at the first buffer not yet enqueued */ + } + + af_xdp_device_output_tx_db (vm, node, ad, txq, n); + + clib_spinlock_unlock_if_init (&txq->lock); + + if (PREDICT_FALSE (n != n_tx)) + { + vlib_buffer_free (vm, from, n_tx - n); /* from was already advanced past the n enqueued buffers in the loop above: do not offset it again */ + vlib_error_count (vm, node->node_index, + AF_XDP_TX_ERROR_NO_FREE_SLOTS, n_tx - n); + } + + return n; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/af_xdp/plugin.c b/src/plugins/af_xdp/plugin.c new file mode 100644 index 00000000000..444ee553cbf --- /dev/null +++ b/src/plugins/af_xdp/plugin.c @@ -0,0 +1,35 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +#include +#include +#include + +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .description = "AF_XDP Device Plugin", +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/af_xdp/test_api.c b/src/plugins/af_xdp/test_api.c new file mode 100644 index 00000000000..270db4e2973 --- /dev/null +++ b/src/plugins/af_xdp/test_api.c @@ -0,0 +1,155 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2020 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#define __plugin_msg_base af_xdp_test_main.msg_id_base +#include + +/* declare message IDs */ +#include +#include + +typedef struct +{ + /* API message ID base */ + u16 msg_id_base; + vat_main_t *vat_main; +} af_xdp_test_main_t; + +af_xdp_test_main_t af_xdp_test_main; + +static vl_api_af_xdp_mode_t +api_af_xdp_mode (af_xdp_mode_t mode) +{ + switch (mode) + { + case AF_XDP_MODE_AUTO: + return AF_XDP_API_MODE_AUTO; + case AF_XDP_MODE_COPY: + return AF_XDP_API_MODE_COPY; + case AF_XDP_MODE_ZERO_COPY: + return AF_XDP_API_MODE_ZERO_COPY; + } + return ~0; +} + +/* af_xdp create API */ +static int +api_af_xdp_create (vat_main_t * vam) +{ + vl_api_af_xdp_create_t *mp; + af_xdp_create_if_args_t args; + int ret; + + if (!unformat_user (vam->input, unformat_af_xdp_create_if_args, &args)) + { + clib_warning ("unknown input `%U'", format_unformat_error, vam->input); + return -99; + } + + M (AF_XDP_CREATE, mp); + + snprintf ((char *) mp->host_if, sizeof (mp->host_if), "%s", + args.linux_ifname ? : ""); + snprintf ((char *) mp->name, sizeof (mp->name), "%s", args.name ? : ""); + mp->rxq_num = clib_host_to_net_u16 (args.rxq_num); + mp->rxq_size = clib_host_to_net_u16 (args.rxq_size); + mp->txq_size = clib_host_to_net_u16 (args.txq_size); + mp->mode = api_af_xdp_mode (args.mode); + snprintf ((char *) mp->prog, sizeof (mp->prog), "%s", args.prog ? 
: ""); + + S (mp); + W (ret); + + return ret; +} + +/* af_xdp-create reply handler */ +static void +vl_api_af_xdp_create_reply_t_handler (vl_api_af_xdp_create_reply_t * mp) +{ + vat_main_t *vam = af_xdp_test_main.vat_main; + i32 retval = ntohl (mp->retval); + + if (retval == 0) + { + fformat (vam->ofp, "created af_xdp with sw_if_index %d\n", + ntohl (mp->sw_if_index)); + } + + vam->retval = retval; + vam->result_ready = 1; + vam->regenerate_interface_table = 1; +} + +/* af_xdp delete API */ +static int +api_af_xdp_delete (vat_main_t * vam) +{ + unformat_input_t *i = vam->input; + vl_api_af_xdp_delete_t *mp; + u32 sw_if_index = 0; + u8 index_defined = 0; + int ret; + + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "sw_if_index %u", &sw_if_index)) + index_defined = 1; + else + { + clib_warning ("unknown input '%U'", format_unformat_error, i); + return -99; + } + } + + if (!index_defined) + { + errmsg ("missing sw_if_index\n"); + return -99; + } + + M (AF_XDP_DELETE, mp); + + mp->sw_if_index = clib_host_to_net_u32 (sw_if_index); + + S (mp); + W (ret); + + return ret; +} + +#include + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/af_xdp/unformat.c b/src/plugins/af_xdp/unformat.c new file mode 100644 index 00000000000..154d459900e --- /dev/null +++ b/src/plugins/af_xdp/unformat.c @@ -0,0 +1,69 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2020 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include +#include + +uword +unformat_af_xdp_create_if_args (unformat_input_t * input, va_list * vargs) +{ + af_xdp_create_if_args_t *args = va_arg (*vargs, af_xdp_create_if_args_t *); + unformat_input_t _line_input, *line_input = &_line_input; + uword ret = 1; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + clib_memset (args, 0, sizeof (*args)); + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "host-if %s", &args->linux_ifname)) + ; + else if (unformat (line_input, "name %s", &args->name)) + ; + else if (unformat (line_input, "rx-queue-size %u", &args->rxq_size)) + ; + else if (unformat (line_input, "tx-queue-size %u", &args->txq_size)) + ; + else if (unformat (line_input, "num-rx-queues %u", &args->rxq_num)) + ; + else if (unformat (line_input, "prog %s", &args->prog)) + ; + else if (unformat (line_input, "no-zero-copy")) + args->mode = AF_XDP_MODE_COPY; + else if (unformat (line_input, "zero-copy")) + args->mode = AF_XDP_MODE_ZERO_COPY; + else + { + /* return failure on unknown input */ + ret = 0; + break; + } + } + + unformat_free (line_input); + return ret; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ -- 2.16.6