From 108c7313854953ee3b66069b902f9fabb097ed25 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 20 Apr 2016 05:04:20 +0200 Subject: [PATCH] Add native NETMAP driver This is first drop of native NETMAP driver. It is mainly tested with NETMAP pipes but also support for native interfaces should work. New CLI: create netmap [|valeXXX:YYY] [hw-addr ] [pipe] [master|slave] Following example creates NETMAP pipe where VPP acts as master: create netmap name vale00:vpp1 pipe master then NETMAP pkt-gen tool can be used to send traffic: pkt-gen -i vale00:vpp1}0 -f tx Change-Id: Ie0ddaa5facc75285b78467420e8a9f9c8dfc39e5 Signed-off-by: Damjan Marion --- vnet/Makefile.am | 14 + vnet/vnet/devices/dpdk/threads.c | 1 + vnet/vnet/devices/netmap/cli.c | 121 +++++++ vnet/vnet/devices/netmap/device.c | 221 ++++++++++++ vnet/vnet/devices/netmap/net_netmap.h | 650 ++++++++++++++++++++++++++++++++++ vnet/vnet/devices/netmap/netmap.c | 240 +++++++++++++ vnet/vnet/devices/netmap/netmap.h | 131 +++++++ vnet/vnet/devices/netmap/node.c | 284 +++++++++++++++ vnet/vnet/vcgn/cnat_cli_handler.c | 4 + vnet/vnet/vnet.h | 5 - 10 files changed, 1666 insertions(+), 5 deletions(-) create mode 100644 vnet/vnet/devices/netmap/cli.c create mode 100644 vnet/vnet/devices/netmap/device.c create mode 100644 vnet/vnet/devices/netmap/net_netmap.h create mode 100644 vnet/vnet/devices/netmap/netmap.c create mode 100644 vnet/vnet/devices/netmap/netmap.h create mode 100644 vnet/vnet/devices/netmap/node.c diff --git a/vnet/Makefile.am b/vnet/Makefile.am index f7c1f30b520..5fdffcaa13d 100644 --- a/vnet/Makefile.am +++ b/vnet/Makefile.am @@ -683,6 +683,20 @@ libvnet_la_SOURCES += \ nobase_include_HEADERS += \ vnet/devices/af_packet/af_packet.h +######################################## +# NETMAP interface +######################################## + +libvnet_la_SOURCES += \ + vnet/devices/netmap/netmap.c \ + vnet/devices/netmap/device.c \ + vnet/devices/netmap/node.c \ + vnet/devices/netmap/cli.c + 
+nobase_include_HEADERS += \ + vnet/devices/netmap/netmap.h + + ######################################## # Unix kernel related ######################################## diff --git a/vnet/vnet/devices/dpdk/threads.c b/vnet/vnet/devices/dpdk/threads.c index b1f13ef3791..eeb440e2851 100644 --- a/vnet/vnet/devices/dpdk/threads.c +++ b/vnet/vnet/devices/dpdk/threads.c @@ -20,6 +20,7 @@ #include #include +#include #include #include /* enumerate all vlib messages */ diff --git a/vnet/vnet/devices/netmap/cli.c b/vnet/vnet/devices/netmap/cli.c new file mode 100644 index 00000000000..9c8fb783326 --- /dev/null +++ b/vnet/vnet/devices/netmap/cli.c @@ -0,0 +1,121 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include +#include +#include + +#include + +static clib_error_t * +netmap_create_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, * line_input = &_line_input; + u8 * host_if_name = NULL; + u8 hwaddr [6]; + u8 * hw_addr_ptr = 0; + int r; + u8 is_pipe = 0; + u8 is_master = 0; + + /* Get a line of input. */ + if (! 
unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "name %s", &host_if_name)) + ; + else if (unformat (line_input, "hw-addr %U", unformat_ethernet_address, hwaddr)) + hw_addr_ptr = hwaddr; + else if (unformat (line_input, "pipe")) + is_pipe = 1; + else if (unformat (line_input, "master")) + is_master = 1; + else if (unformat (line_input, "slave")) + is_master = 0; + else + return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); + } + unformat_free (line_input); + + if (host_if_name == NULL) + return clib_error_return (0, "missing host interface name"); + + r = netmap_create_if(vm, host_if_name, hw_addr_ptr, is_pipe, is_master); + + if (r == VNET_API_ERROR_SYSCALL_ERROR_1) + return clib_error_return(0, "%s (errno %d)", strerror (errno), errno); + + if (r == VNET_API_ERROR_INVALID_INTERFACE) + return clib_error_return(0, "Invalid interface name"); + + if (r == VNET_API_ERROR_SUBIF_ALREADY_EXISTS) + return clib_error_return(0, "Interface elready exists"); + + return 0; +} + +VLIB_CLI_COMMAND (netmap_create_command, static) = { + .path = "create netmap", + .short_help = "create netmap name [|valeXXX:YYY] " + "[hw-addr ] [pipe] [master|slave]", + .function = netmap_create_command_fn, +}; + +static clib_error_t * +netmap_delete_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, * line_input = &_line_input; + u8 * host_if_name = NULL; + + /* Get a line of input. */ + if (! 
unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "name %s", &host_if_name)) + ; + else + return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); + } + unformat_free (line_input); + + if (host_if_name == NULL) + return clib_error_return (0, "missing host interface name"); + + netmap_delete_if(vm, host_if_name); + + return 0; +} + +VLIB_CLI_COMMAND (netmap_delete_command, static) = { + .path = "delete netmap", + .short_help = "delete netmap name ", + .function = netmap_delete_command_fn, +}; + +clib_error_t * +netmap_cli_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (netmap_cli_init); diff --git a/vnet/vnet/devices/netmap/device.c b/vnet/vnet/devices/netmap/device.c new file mode 100644 index 00000000000..36e17da2a22 --- /dev/null +++ b/vnet/vnet/devices/netmap/device.c @@ -0,0 +1,221 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +#include +#include +#include + +#include + +#define foreach_netmap_tx_func_error \ +_(NO_FREE_SLOTS, "no free tx slots") + +typedef enum { +#define _(f,s) NETMAP_TX_ERROR_##f, + foreach_netmap_tx_func_error +#undef _ + NETMAP_TX_N_ERROR, +} netmap_tx_func_error_t; + +static char * netmap_tx_func_error_strings[] = { +#define _(n,s) s, + foreach_netmap_tx_func_error +#undef _ +}; + + +static u8 * format_netmap_device_name (u8 * s, va_list * args) +{ + u32 i = va_arg (*args, u32); + netmap_main_t * apm = &netmap_main; + netmap_if_t * nif = pool_elt_at_index (apm->interfaces, i); + + s = format (s, "netmap-%s", nif->host_if_name); + return s; +} + +static u8 * format_netmap_device (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + int verbose = va_arg (*args, int); + netmap_main_t * nm = &netmap_main; + netmap_if_t * nif = vec_elt_at_index (nm->interfaces, dev_instance); + uword indent = format_get_indent (s); + + s = format (s, "NETMAP interface"); + if (verbose) + { + s = format (s, "\n%U version %d flags 0x%x" + "\n%U region %u memsize 0x%x offset 0x%x" + "\n%U tx_slots %u rx_slots %u tx_rings %u rx_rings %u", + format_white_space, indent + 2, + nif->req->nr_version, + nif->req->nr_flags, + format_white_space, indent + 2, + nif->mem_region, + nif->req->nr_memsize, + nif->req->nr_offset, + format_white_space, indent + 2, + nif->req->nr_tx_slots, + nif->req->nr_rx_slots, + nif->req->nr_tx_rings, + nif->req->nr_rx_rings); + } + return s; +} + +static u8 * format_netmap_tx_trace (u8 * s, va_list * args) +{ + s = format (s, "Unimplemented..."); + return s; +} + +static uword +netmap_interface_tx (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + netmap_main_t * nm = &netmap_main; + u32 * buffers = vlib_frame_args (frame); + u32 n_left = frame->n_vectors; + vnet_interface_output_runtime_t * rd = (void *) node->runtime_data; + netmap_if_t * nif = 
pool_elt_at_index (nm->interfaces, rd->dev_instance); + int cur_ring; + + cur_ring = nif->first_tx_ring; + + while(n_left && cur_ring <= nif->last_tx_ring) + { + struct netmap_ring * ring = NETMAP_TXRING(nif->nifp, cur_ring); + int n_free_slots = nm_ring_space(ring); + uint cur = ring->cur; + + if (!n_free_slots) + { + cur_ring++; + continue; + } + + while (n_left && n_free_slots) + { + vlib_buffer_t * b0; + u32 bi = buffers[0]; + u32 len; + u32 offset = 0; + buffers++; + + struct netmap_slot * slot = &ring->slot[cur]; + + do + { + b0 = vlib_get_buffer (vm, bi); + len = b0->current_length; + /* memcpy */ + clib_memcpy ((u8 *) NETMAP_BUF(ring, slot->buf_idx) + offset, + vlib_buffer_get_current(b0), len); + offset += len; + } + while ((bi = b0->next_buffer)); + + slot->len = offset; + cur = (cur + 1) % ring->num_slots; + n_free_slots--; + n_left--; + } + CLIB_MEMORY_BARRIER(); + ring->head = ring->cur = cur; + } + + if (n_left < frame->n_vectors) + ioctl(nif->fd, NIOCTXSYNC, NULL); + + if (n_left) + vlib_error_count (vm, node->node_index, NETMAP_TX_ERROR_NO_FREE_SLOTS, + n_left); + + vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors); + return frame->n_vectors; +} + +static void +netmap_set_interface_next_node (vnet_main_t *vnm, u32 hw_if_index, + u32 node_index) +{ + netmap_main_t * apm = &netmap_main; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + netmap_if_t * nif = pool_elt_at_index (apm->interfaces, hw->dev_instance); + + /* Shut off redirection */ + if (node_index == ~0) + { + nif->per_interface_next_index = node_index; + return; + } + + nif->per_interface_next_index = + vlib_node_add_next (vlib_get_main(), netmap_input_node.index, node_index); +} + +static void netmap_clear_hw_interface_counters (u32 instance) +{ + /* Nothing for now */ +} + +static clib_error_t * +netmap_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) +{ + netmap_main_t * apm = &netmap_main; + vnet_hw_interface_t *hw = 
vnet_get_hw_interface (vnm, hw_if_index); + netmap_if_t * nif = pool_elt_at_index (apm->interfaces, hw->dev_instance); + u32 hw_flags; + + nif->is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; + + if (nif->is_admin_up) + hw_flags = VNET_HW_INTERFACE_FLAG_LINK_UP; + else + hw_flags = 0; + + vnet_hw_interface_set_flags(vnm, hw_if_index, hw_flags); + + return 0; +} + +static clib_error_t * +netmap_subif_add_del_function (vnet_main_t * vnm, + u32 hw_if_index, + struct vnet_sw_interface_t * st, + int is_add) +{ + /* Nothing for now */ + return 0; +} + +VNET_DEVICE_CLASS (netmap_device_class) = { + .name = "netmap", + .tx_function = netmap_interface_tx, + .format_device_name = format_netmap_device_name, + .format_device = format_netmap_device, + .format_tx_trace = format_netmap_tx_trace, + .tx_function_n_errors = NETMAP_TX_N_ERROR, + .tx_function_error_strings = netmap_tx_func_error_strings, + .rx_redirect_to_node = netmap_set_interface_next_node, + .clear_counters = netmap_clear_hw_interface_counters, + .admin_up_down_function = netmap_interface_admin_up_down, + .subif_add_del_function = netmap_subif_add_del_function, + .no_flatten_output_chains = 1, +}; diff --git a/vnet/vnet/devices/netmap/net_netmap.h b/vnet/vnet/devices/netmap/net_netmap.h new file mode 100644 index 00000000000..fd4253b7c0c --- /dev/null +++ b/vnet/vnet/devices/netmap/net_netmap.h @@ -0,0 +1,650 @@ +/* + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD: head/sys/net/netmap.h 251139 2013-05-30 14:07:14Z luigi $ + * + * Definitions of constants and the structures used by the netmap + * framework, for the part visible to both kernel and userspace. + * Detailed info on netmap is available with "man netmap" or at + * + * http://info.iet.unipi.it/~luigi/netmap/ + * + * This API is also used to communicate with the VALE software switch + */ + +#ifndef _NET_NETMAP_H_ +#define _NET_NETMAP_H_ + +#define NETMAP_API 11 /* current API version */ + +#define NETMAP_MIN_API 11 /* min and max versions accepted */ +#define NETMAP_MAX_API 15 +/* + * Some fields should be cache-aligned to reduce contention. + * The alignment is architecture and OS dependent, but rather than + * digging into OS headers to find the exact value we use an estimate + * that should cover most architectures. + */ +#define NM_CACHE_ALIGN 128 + +/* + * --- Netmap data structures --- + * + * The userspace data structures used by netmap are shown below. + * They are allocated by the kernel and mmap()ed by userspace threads. + * Pointers are implemented as memory offsets or indexes, + * so that they can be easily dereferenced in kernel and userspace.
+ + KERNEL (opaque, obviously) + + ==================================================================== + | + USERSPACE | struct netmap_ring + +---->+---------------+ + / | head,cur,tail | + struct netmap_if (nifp, 1 per fd) / | buf_ofs | + +---------------+ / | other fields | + | ni_tx_rings | / +===============+ + | ni_rx_rings | / | buf_idx, len | slot[0] + | | / | flags, ptr | + | | / +---------------+ + +===============+ / | buf_idx, len | slot[1] + | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | + | txring_ofs[1] | +---------------+ + (tx+1 entries) (num_slots entries) + | txring_ofs[t] | | buf_idx, len | slot[n-1] + +---------------+ | flags, ptr | + | rxring_ofs[0] | +---------------+ + | rxring_ofs[1] | + (rx+1 entries) + | rxring_ofs[r] | + +---------------+ + + * For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to + * a file descriptor, the mmap()ed region contains a (logically readonly) + * struct netmap_if pointing to struct netmap_ring's. + * + * There is one netmap_ring per physical NIC ring, plus one tx/rx ring + * pair attached to the host stack (this pair is unused for non-NIC ports). + * + * All physical/host stack ports share the same memory region, + * so that zero-copy can be implemented between them. + * VALE switch ports instead have separate memory regions. + * + * The netmap_ring is the userspace-visible replica of the NIC ring. + * Each slot has the index of a buffer (MTU-sized and residing in the + * mmapped region), its length and some flags. An extra 64-bit pointer + * is provided for user-supplied buffers in the tx path. + * + * In user space, the buffer address is computed as + * (char *)ring + buf_ofs + index * NETMAP_BUF_SIZE + * + * Added in NETMAP_API 11: + * + * + NIOCREGIF can request the allocation of extra spare buffers from + * the same memory pool. The desired number of buffers must be in + * nr_arg3. The ioctl may return fewer buffers, depending on memory + * availability. 
nr_arg3 will return the actual value, and, once + * mapped, nifp->ni_bufs_head will be the index of the first buffer. + * + * The buffers are linked to each other using the first uint32_t + * as the index. On close, ni_bufs_head must point to the list of + * buffers to be released. + * + * + NIOCREGIF can request space for extra rings (and buffers) + * allocated in the same memory space. The number of extra rings + * is in nr_arg1, and is advisory. This is a no-op on NICs where + * the size of the memory space is fixed. + * + * + NIOCREGIF can attach to PIPE rings sharing the same memory + * space with a parent device. The ifname indicates the parent device, + * which must already exist. Flags in nr_flags indicate if we want to + * bind the master or slave side, the index (from nr_ringid) + * is just a cookie and does not need to be sequential. + * + * + NIOCREGIF can also attach to 'monitor' rings that replicate + * the content of specific rings, also from the same memory space. + * + * Extra flags in nr_flags support the above functions. + * Application libraries may use the following naming scheme: + * netmap:foo all NIC ring pairs + * netmap:foo^ only host ring pair + * netmap:foo+ all NIC ring + host ring pairs + * netmap:foo-k the k-th NIC ring pair + * netmap:foo{k PIPE ring pair k, master side + * netmap:foo}k PIPE ring pair k, slave side + */ + +/* + * struct netmap_slot is a buffer descriptor + */ +struct netmap_slot { + uint32_t buf_idx; /* buffer index */ + uint16_t len; /* length for this slot */ + uint16_t flags; /* buf changed, etc. */ + uint64_t ptr; /* pointer for indirect buffers */ +}; + +/* + * The following flags control how the slot is used + */ + +#define NS_BUF_CHANGED 0x0001 /* buf_idx changed */ + /* + * must be set whenever buf_idx is changed (as it might be + * necessary to recompute the physical address and mapping) + * + * It is also set by the kernel whenever the buf_idx is + * changed internally (e.g., by pipes). 
Applications may + * use this information to know when they can reuse the + * contents of previously prepared buffers. + */ + +#define NS_REPORT 0x0002 /* ask the hardware to report results */ + /* + * Request notification when slot is used by the hardware. + * Normally transmit completions are handled lazily and + * may be unreported. This flag lets us know when a slot + * has been sent (e.g. to terminate the sender). + */ + +#define NS_FORWARD 0x0004 /* pass packet 'forward' */ + /* + * (Only for physical ports, rx rings with NR_FORWARD set). + * Slot released to the kernel (i.e. before ring->head) with + * this flag set are passed to the peer ring (host/NIC), + * thus restoring the host-NIC connection for these slots. + * This supports efficient traffic monitoring or firewalling. + */ + +#define NS_NO_LEARN 0x0008 /* disable bridge learning */ + /* + * On a VALE switch, do not 'learn' the source port for + * this buffer. + */ + +#define NS_INDIRECT 0x0010 /* userspace buffer */ + /* + * (VALE tx rings only) data is in a userspace buffer, + * whose address is in the 'ptr' field in the slot. + */ + +#define NS_MOREFRAG 0x0020 /* packet has more fragments */ + /* + * (VALE ports only) + * Set on all but the last slot of a multi-segment packet. + * The 'len' field refers to the individual fragment. + */ + +#define NS_PORT_SHIFT 8 +#define NS_PORT_MASK (0xff << NS_PORT_SHIFT) + /* + * The high 8 bits of the flag, if not zero, indicate the + * destination port for the VALE switch, overriding + * the lookup table. + */ + +#define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff) + /* + * (VALE rx rings only) the high 8 bits + * are the number of fragments. + */ + + +/* + * struct netmap_ring + * + * Netmap representation of a TX or RX ring (also known as "queue"). + * This is a queue implemented as a fixed-size circular array. + * At the software level the important fields are: head, cur, tail. + * + * In TX rings: + * + * head first slot available for transmission. 
+ * cur wakeup point. select() and poll() will unblock + * when 'tail' moves past 'cur' + * tail (readonly) first slot reserved to the kernel + * + * [head .. tail-1] can be used for new packets to send; + * 'head' and 'cur' must be incremented as slots are filled + * with new packets to be sent; + * 'cur' can be moved further ahead if we need more space + * for new transmissions. XXX todo (2014-03-12) + * + * In RX rings: + * + * head first valid received packet + * cur wakeup point. select() and poll() will unblock + * when 'tail' moves past 'cur' + * tail (readonly) first slot reserved to the kernel + * + * [head .. tail-1] contain received packets; + * 'head' and 'cur' must be incremented as slots are consumed + * and can be returned to the kernel; + * 'cur' can be moved further ahead if we want to wait for + * new packets without returning the previous ones. + * + * DATA OWNERSHIP/LOCKING: + * The netmap_ring, and all slots and buffers in the range + * [head .. tail-1] are owned by the user program; + * the kernel only accesses them during a netmap system call + * and in the user thread context. + * + * Other slots and buffers are reserved for use by the kernel + */ +struct netmap_ring { + /* + * buf_ofs is meant to be used through macros. + * It contains the offset of the buffer region from this + * descriptor. + */ + const int64_t buf_ofs; + const uint32_t num_slots; /* number of slots in the ring. */ + const uint32_t nr_buf_size; + const uint16_t ringid; + const uint16_t dir; /* 0: tx, 1: rx */ + + uint32_t head; /* (u) first user slot */ + uint32_t cur; /* (u) wakeup point */ + uint32_t tail; /* (k) first kernel slot */ + + uint32_t flags; + + struct timeval ts; /* (k) time of last *sync() */ + + /* opaque room for a mutex or similar object */ +#if !defined(_WIN32) || defined(__CYGWIN__) + uint8_t __attribute__((__aligned__(NM_CACHE_ALIGN))) sem[128]; +#else + uint8_t __declspec(align(NM_CACHE_ALIGN)) sem[128]; +#endif + + /* the slots follow. 
This struct has variable size */ + struct netmap_slot slot[0]; /* array of slots. */ +}; + + +/* + * RING FLAGS + */ +#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */ + /* + * updates the 'ts' field on each netmap syscall. This saves + * a separate gettimeofday(), and is not much worse than + * software timestamps generated in the interrupt handler. + */ + +#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */ + /* + * Enables the NS_FORWARD slot flag for the ring. + */ + + +/* + * Netmap representation of an interface and its queue(s). + * This is initialized by the kernel when binding a file + * descriptor to a port, and should be considered as readonly + * by user programs. The kernel never uses it. + * + * There is one netmap_if for each file descriptor on which we want + * to select/poll. + * select/poll operates on one or all pairs depending on the value of + * nmr_queueid passed on the ioctl. + */ +struct netmap_if { + char ni_name[IFNAMSIZ]; /* name of the interface. */ + const uint32_t ni_version; /* API version, currently unused */ + const uint32_t ni_flags; /* properties */ +#define NI_PRIV_MEM 0x1 /* private memory region */ + + /* + * The number of packet rings available in netmap mode. + * Physical NICs can have different numbers of tx and rx rings. + * Physical NICs also have a 'host' ring pair. + * Additionally, clients can request additional ring pairs to + * be used for internal communication. + */ + const uint32_t ni_tx_rings; /* number of HW tx rings */ + const uint32_t ni_rx_rings; /* number of HW rx rings */ + + uint32_t ni_bufs_head; /* head index for extra bufs */ + uint32_t ni_spare1[5]; + /* + * The following array contains the offset of each netmap ring + * from this structure, in the following order: + * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings; + * NIC rx rings (ni_rx_rings); host rx ring (1); extra rx rings.
+ * + * The area is filled up by the kernel on NIOCREGIF, + * and then only read by userspace code. + */ + const ssize_t ring_ofs[0]; +}; + + +#ifndef NIOCREGIF +/* + * ioctl names and related fields + * + * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, + * whose identity is set in NIOCREGIF through nr_ringid. + * These are non blocking and take no argument. + * + * NIOCGINFO takes a struct ifreq, the interface name is the input, + * the outputs are number of queues and number of descriptors + * for each queue (useful to set number of threads etc.). + * The info returned is only advisory and may change before + * the interface is bound to a file descriptor. + * + * NIOCREGIF takes an interface name within a struct nmreq, + * and activates netmap mode on the interface (if possible). + * + * The argument to NIOCGINFO/NIOCREGIF overlays struct ifreq so we + * can pass it down to other NIC-related ioctls. + * + * The actual argument (struct nmreq) has a number of options to request + * different functions. + * The following are used in NIOCREGIF when nr_cmd == 0: + * + * nr_name (in) + * The name of the port (em0, valeXXX:YYY, etc.) + * limited to IFNAMSIZ for backward compatibility. + * + * nr_version (in/out) + * Must match NETMAP_API as used in the kernel, error otherwise. + * Always returns the desired value on output. + * + * nr_tx_slots, nr_rx_slots, nr_tx_rings, nr_rx_rings (in/out) + * On input, non-zero values may be used to reconfigure the port + * according to the requested values, but this is not guaranteed. + * On output the actual values in use are reported. + * + * nr_ringid (in) + * Indicates how rings should be bound to the file descriptors. + * If nr_flags != 0, then the low bits (in NETMAP_RING_MASK) + * are used to indicate the ring number, and nr_flags specifies + * the actual rings to bind. NETMAP_NO_TX_POLL is unaffected.
+ * + * NOTE: THE FOLLOWING (nr_flags == 0) IS DEPRECATED: + * If nr_flags == 0, NETMAP_HW_RING and NETMAP_SW_RING control + * the binding as follows: + * 0 (default) binds all physical rings + * NETMAP_HW_RING | ring number binds a single ring pair + * NETMAP_SW_RING binds only the host tx/rx rings + * + * NETMAP_NO_TX_POLL can be OR-ed to make select()/poll() push + * packets on tx rings only if POLLOUT is set. + * The default is to push any pending packet. + * + * NETMAP_DO_RX_POLL can be OR-ed to make select()/poll() release + * packets on rx rings also when POLLIN is NOT set. + * The default is to touch the rx ring only with POLLIN. + * Note that this is the opposite of TX because it + * reflects the common usage. + * + * NOTE: NETMAP_PRIV_MEM IS DEPRECATED, use nr_arg2 instead. + * NETMAP_PRIV_MEM is set on return for ports that do not use + * the global memory allocator. + * This information is not significant and applications + * should look at the region id in nr_arg2 + * + * nr_flags is the recommended mode to indicate which rings should + * be bound to a file descriptor. Values are NR_REG_* + * + * nr_arg1 (in) The number of extra rings to be reserved. + * Especially when allocating a VALE port the system only + * allocates the amount of memory needed for the port. + * If more shared memory rings are desired (e.g. for pipes), + * the first invocation for the same basename/allocator + * should specify a suitable number. Memory cannot be + * extended after the first allocation without closing + * all ports on the same region. + * + * nr_arg2 (in/out) The identity of the memory region used. + * On input, 0 means the system decides autonomously, + * other values may try to select a specific region. + * On return the actual value is reported. + * Region '1' is the global allocator, normally shared + * by all interfaces. Other values are private regions. + * If two ports use the same region, zero-copy is possible.
+ * + * nr_arg3 (in/out) number of extra buffers to be allocated. + * + * + * + * nr_cmd (in) if non-zero indicates a special command: + * NETMAP_BDG_ATTACH and nr_name = vale*:ifname + * attaches the NIC to the switch; nr_ringid specifies + * which rings to use. Used by vale-ctl -a ... + * nr_arg1 = NETMAP_BDG_HOST also attaches the host port + * as in vale-ctl -h ... + * + * NETMAP_BDG_DETACH and nr_name = vale*:ifname + * disconnects a previously attached NIC. + * Used by vale-ctl -d ... + * + * NETMAP_BDG_LIST + * list the configuration of VALE switches. + * + * NETMAP_BDG_VNET_HDR + * Set the virtio-net header length used by the client + * of a VALE switch port. + * + * NETMAP_BDG_NEWIF + * create a persistent VALE port with name nr_name. + * Used by vale-ctl -n ... + * + * NETMAP_BDG_DELIF + * delete a persistent VALE port. Used by vale-ctl -d ... + * + * nr_arg1, nr_arg2, nr_arg3 (in/out) command specific + * + * + * + */ + + +/* + * struct nmreq overlays a struct ifreq (just the name) + */ +struct nmreq { + char nr_name[IFNAMSIZ]; + uint32_t nr_version; /* API version */ + uint32_t nr_offset; /* nifp offset in the shared region */ + uint32_t nr_memsize; /* size of the shared region */ + uint32_t nr_tx_slots; /* slots in tx rings */ + uint32_t nr_rx_slots; /* slots in rx rings */ + uint16_t nr_tx_rings; /* number of tx rings */ + uint16_t nr_rx_rings; /* number of rx rings */ + + uint16_t nr_ringid; /* ring(s) we care about */ +#define NETMAP_HW_RING 0x4000 /* single NIC ring pair */ +#define NETMAP_SW_RING 0x2000 /* only host ring pair */ + +#define NETMAP_RING_MASK 0x0fff /* the ring number */ + +#define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */ + +#define NETMAP_DO_RX_POLL 0x8000 /* DO automatic rxsync on poll */ + + uint16_t nr_cmd; +#define NETMAP_BDG_ATTACH 1 /* attach the NIC */ +#define NETMAP_BDG_DETACH 2 /* detach the NIC */ +#define NETMAP_BDG_REGOPS 3 /* register bridge callbacks */ +#define NETMAP_BDG_LIST 4 /* get bridge's info 
*/ +#define NETMAP_BDG_VNET_HDR 5 /* set the port virtio-net-hdr length */ +#define NETMAP_BDG_OFFSET NETMAP_BDG_VNET_HDR /* deprecated alias */ +#define NETMAP_BDG_NEWIF 6 /* create a virtual port */ +#define NETMAP_BDG_DELIF 7 /* destroy a virtual port */ +#define NETMAP_PT_HOST_CREATE 8 /* create ptnetmap kthreads */ +#define NETMAP_PT_HOST_DELETE 9 /* delete ptnetmap kthreads */ +#define NETMAP_BDG_POLLING_ON 10 /* delete polling kthread */ +#define NETMAP_BDG_POLLING_OFF 11 /* delete polling kthread */ +#define NETMAP_VNET_HDR_GET 12 /* get the port virtio-net-hdr length */ + uint16_t nr_arg1; /* reserve extra rings in NIOCREGIF */ +#define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */ + + uint16_t nr_arg2; + uint32_t nr_arg3; /* req. extra buffers in NIOCREGIF */ + uint32_t nr_flags; + /* various modes, extends nr_ringid */ + uint32_t spare2[1]; +}; + +#define NR_REG_MASK 0xf /* values for nr_flags */ +enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */ + NR_REG_ALL_NIC = 1, + NR_REG_SW = 2, + NR_REG_NIC_SW = 3, + NR_REG_ONE_NIC = 4, + NR_REG_PIPE_MASTER = 5, + NR_REG_PIPE_SLAVE = 6, +}; +/* monitor uses the NR_REG to select the rings to monitor */ +#define NR_MONITOR_TX 0x100 +#define NR_MONITOR_RX 0x200 +#define NR_ZCOPY_MON 0x400 +/* request exclusive access to the selected rings */ +#define NR_EXCLUSIVE 0x800 +/* request ptnetmap host support */ +#define NR_PASSTHROUGH_HOST NR_PTNETMAP_HOST /* deprecated */ +#define NR_PTNETMAP_HOST 0x1000 +#define NR_RX_RINGS_ONLY 0x2000 +#define NR_TX_RINGS_ONLY 0x4000 +/* Applications set this flag if they are able to deal with virtio-net headers, + * that is send/receive frames that start with a virtio-net header. + * If not set, NIOCREGIF will fail with netmap ports that require applications + * to use those headers. If the flag is set, the application can use the + * NETMAP_VNET_HDR_GET command to figure out the header length. 
*/ +#define NR_ACCEPT_VNET_HDR 0x8000 + + +/* + * Windows does not have _IOWR(). _IO(), _IOW() and _IOR() are defined + * in ws2def.h but not sure if they are in the form we need. + * XXX so we redefine them + * in a convenient way to use for DeviceIoControl signatures + */ +#ifdef _WIN32 +#undef _IO // ws2def.h +#define _WIN_NM_IOCTL_TYPE 40000 +#define _IO(_c, _n) CTL_CODE(_WIN_NM_IOCTL_TYPE, ((_n) + 0x800) , \ + METHOD_BUFFERED, FILE_ANY_ACCESS ) +#define _IO_direct(_c, _n) CTL_CODE(_WIN_NM_IOCTL_TYPE, ((_n) + 0x800) , \ + METHOD_OUT_DIRECT, FILE_ANY_ACCESS ) + +#define _IOWR(_c, _n, _s) _IO(_c, _n) + +/* We have some internal sysctl in addition to the externally visible ones */ +#define NETMAP_MMAP _IO_direct('i', 160) // note METHOD_OUT_DIRECT +#define NETMAP_POLL _IO('i', 162) + +/* and also two setsockopt for sysctl emulation */ +#define NETMAP_SETSOCKOPT _IO('i', 140) +#define NETMAP_GETSOCKOPT _IO('i', 141) + + +// These link names are for the Netmap Core Driver +#define NETMAP_NT_DEVICE_NAME L"\\Device\\NETMAP" +#define NETMAP_DOS_DEVICE_NAME L"\\DosDevices\\netmap" + +// Definition of a structure used to pass a virtual address within an IOCTL +typedef struct _MEMORY_ENTRY { + PVOID pUsermodeVirtualAddress; +} MEMORY_ENTRY, *PMEMORY_ENTRY; + +typedef struct _POLL_REQUEST_DATA { + int events; + int timeout; + int revents; +} POLL_REQUEST_DATA; + +#endif /* _WIN32 */ + +/* + * FreeBSD uses the size value embedded in the _IOWR to determine + * how much to copy in/out. So we need it to match the actual + * data structure we pass. We put some spares in the structure + * to ease compatibility with other versions + */ +#define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */ +#define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */ +#define NIOCTXSYNC _IO('i', 148) /* sync tx queues */ +#define NIOCRXSYNC _IO('i', 149) /* sync rx queues */ +#define NIOCCONFIG _IOWR('i',150, struct nm_ifreq) /* for ext.
modules */ +#endif /* !NIOCREGIF */ + + +/* + * Helper functions for kernel and userspace + */ + +/* + * Return non-zero when the ring has no slots available, i.e. cur == tail. + */ +static inline int +nm_ring_empty(struct netmap_ring *ring) +{ + return (ring->cur == ring->tail); +} + +/* + * Opaque structure that is passed to an external kernel + * module via ioctl(fd, NIOCCONFIG, req) for a user-owned + * bridge port (at this point ephemeral VALE interface). + */ +#define NM_IFRDATA_LEN 256 +struct nm_ifreq { + char nifr_name[IFNAMSIZ]; + char data[NM_IFRDATA_LEN]; +}; + +/* + * netmap kernel thread configuration + */ +/* bhyve/vmm.ko MSIX parameters for IOCTL */ +struct ptn_vmm_ioctl_msix { + uint64_t msg; + uint64_t addr; +}; + +/* IOCTL parameters */ +struct nm_kth_ioctl { + u_long com; + /* TODO: use union */ + union { + struct ptn_vmm_ioctl_msix msix; + } data; +}; + +/* Configuration of a ptnetmap ring */ +struct ptnet_ring_cfg { + uint64_t ioeventfd; /* eventfd in linux, tsleep() parameter in FreeBSD */ + uint64_t irqfd; /* eventfd in linux, ioctl fd in FreeBSD */ + struct nm_kth_ioctl ioctl; /* ioctl parameter to send irq (only used in bhyve/FreeBSD) */ +}; +#endif /* _NET_NETMAP_H_ */ diff --git a/vnet/vnet/devices/netmap/netmap.c b/vnet/vnet/devices/netmap/netmap.c new file mode 100644 index 00000000000..2f3233a28e5 --- /dev/null +++ b/vnet/vnet/devices/netmap/netmap.c @@ -0,0 +1,240 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static u32 +netmap_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags) +{ + /* nothing for now */ + return 0; +} + +static clib_error_t * netmap_fd_read_ready (unix_file_t * uf) +{ + vlib_main_t * vm = vlib_get_main(); + netmap_main_t * nm = &netmap_main; + u32 idx = uf->private_data; + + nm->pending_input_bitmap = clib_bitmap_set (nm->pending_input_bitmap, idx, 1); + + /* Schedule the rx node */ + vlib_node_set_interrupt_pending (vm, netmap_input_node.index); + + return 0; +} + +static void +close_netmap_if(netmap_main_t * nm, netmap_if_t * nif) +{ + if (nif->unix_file_index != ~0) { + unix_file_del(&unix_main, unix_main.file_pool + nif->unix_file_index); + nif->unix_file_index = ~0; + } + + if (nif->fd > -1) + close(nif->fd); + + if (nif->mem_region) + { + netmap_mem_region_t * reg = &nm->mem_regions[nif->mem_region]; + if (--reg->refcnt == 0) + { + munmap(reg->mem, reg->region_size); + reg->region_size = 0; + } + } + + + mhash_unset(&nm->if_index_by_host_if_name, nif->host_if_name, &nif->if_index); + vec_free(nif->host_if_name); + vec_free(nif->req); + + memset(nif, 0, sizeof(*nif)); + pool_put(nm->interfaces, nif); +} + +int +netmap_create_if(vlib_main_t * vm, u8 * if_name, u8 * hw_addr_set, + u8 is_pipe, u8 is_master) +{ + netmap_main_t * nm = &netmap_main; + int ret = 0; + netmap_if_t * nif = 0; + u8 hw_addr[6]; + clib_error_t * error = 0; + vnet_sw_interface_t * sw; + vnet_main_t *vnm = vnet_get_main(); + uword * p; + struct nmreq * req = 0; + netmap_mem_region_t * reg; + int fd; + + p = mhash_get (&nm->if_index_by_host_if_name, if_name); + if (p) + return VNET_API_ERROR_SUBIF_ALREADY_EXISTS; + + fd = open("/dev/netmap", O_RDWR); + if (fd < 0) + return 
VNET_API_ERROR_SUBIF_ALREADY_EXISTS; + + pool_get (nm->interfaces, nif); + nif->if_index = nif - nm->interfaces; + nif->fd = fd; + nif->unix_file_index = ~0; + + vec_validate(req, 0); + nif->req = req; + req->nr_version = NETMAP_API; + req->nr_flags = NR_REG_ALL_NIC; + + if (is_pipe) + req->nr_flags = is_master ? NR_REG_PIPE_MASTER : NR_REG_PIPE_SLAVE; + else + req->nr_flags = NR_REG_ALL_NIC; + + req->nr_flags |= NR_ACCEPT_VNET_HDR; + snprintf(req->nr_name, IFNAMSIZ, "%s", if_name); + req->nr_name[IFNAMSIZ-1] = 0; + + if (ioctl(nif->fd, NIOCREGIF, req)) + { + ret = VNET_API_ERROR_NOT_CONNECTED; + goto error; + } + + nif->mem_region = req->nr_arg2; + vec_validate (nm->mem_regions, nif->mem_region); + reg = &nm->mem_regions[nif->mem_region]; + if (reg->region_size == 0) + { + reg->mem = mmap(NULL, req->nr_memsize, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + clib_warning("mem %p", reg->mem); + if (reg->mem == MAP_FAILED) + { + ret = VNET_API_ERROR_NOT_CONNECTED; + goto error; + } + reg->region_size = req->nr_memsize; + } + reg->refcnt++; + + nif->nifp = NETMAP_IF(reg->mem, req->nr_offset); + nif->first_rx_ring = 0; + nif->last_rx_ring = 0; + nif->first_tx_ring = 0; + nif->last_tx_ring = 0; + nif->host_if_name = if_name; + nif->per_interface_next_index = ~0; + + { + unix_file_t template = {0}; + template.read_function = netmap_fd_read_ready; + template.file_descriptor = nif->fd; + template.private_data = nif->if_index; + nif->unix_file_index = unix_file_add (&unix_main, &template); + } + + /*use configured or generate random MAC address */ + if (hw_addr_set) + memcpy(hw_addr, hw_addr_set, 6); + else + { + f64 now = vlib_time_now(vm); + u32 rnd; + rnd = (u32) (now * 1e6); + rnd = random_u32 (&rnd); + + memcpy (hw_addr+2, &rnd, sizeof(rnd)); + hw_addr[0] = 2; + hw_addr[1] = 0xfe; + } + + error = ethernet_register_interface(vnm, netmap_device_class.index, + nif->if_index, hw_addr, &nif->hw_if_index, + netmap_eth_flag_change); + + if (error) + { + clib_error_report 
(error); + ret = VNET_API_ERROR_SYSCALL_ERROR_1; + goto error; + } + + sw = vnet_get_hw_sw_interface (vnm, nif->hw_if_index); + nif->sw_if_index = sw->sw_if_index; + + vnet_hw_interface_set_flags (vnm, nif->hw_if_index, + VNET_HW_INTERFACE_FLAG_LINK_UP); + + mhash_set_mem (&nm->if_index_by_host_if_name, if_name, &nif->if_index, 0); + + return 0; + +error: + close_netmap_if(nm, nif); + return ret; +} + +int +netmap_delete_if(vlib_main_t *vm, u8 *host_if_name) +{ + vnet_main_t *vnm = vnet_get_main(); + netmap_main_t *nm = &netmap_main; + netmap_if_t *nif; + uword *p; + + p = mhash_get(&nm->if_index_by_host_if_name, host_if_name); + if (p == NULL) { + clib_warning("Host interface %s does not exist", host_if_name); + return VNET_API_ERROR_SYSCALL_ERROR_1; + } + nif = pool_elt_at_index(nm->interfaces, p[0]); + + /* bring down the interface */ + vnet_hw_interface_set_flags(vnm, nif->hw_if_index, 0); + + ethernet_delete_interface(vnm, nif->hw_if_index); + + close_netmap_if(nm, nif); + return 0; +} + +static clib_error_t * +netmap_init (vlib_main_t * vm) +{ + netmap_main_t * nm = &netmap_main; + + memset (nm, 0, sizeof (netmap_main_t)); + + mhash_init_vec_string (&nm->if_index_by_host_if_name, sizeof (uword)); + + return 0; +} + +VLIB_INIT_FUNCTION (netmap_init); diff --git a/vnet/vnet/devices/netmap/netmap.h b/vnet/vnet/devices/netmap/netmap.h new file mode 100644 index 00000000000..d9a476b6959 --- /dev/null +++ b/vnet/vnet/devices/netmap/netmap.h @@ -0,0 +1,131 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ +/* + * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include + +typedef struct { + CLIB_CACHE_LINE_ALIGN_MARK(cacheline0); + u8 * host_if_name; /* name vector; released with vec_free() when the interface is closed */ + uword if_index; /* index of this entry in netmap_main.interfaces */ + u32 hw_if_index; + u32 sw_if_index; + u32 unix_file_index; /* unix_file_add() handle, ~0 while not registered */ + + u32 per_interface_next_index; /* rx next-node override, ~0 selects the default */ + u8 is_admin_up; + + /* netmap */ + struct nmreq * req; /* request used for NIOCREGIF */ + u16 mem_region; /* index into netmap_main.mem_regions, taken from nr_arg2 */ + int fd; /* descriptor opened on /dev/netmap */ + struct netmap_if * nifp; + u16 first_tx_ring; + u16 last_tx_ring; + u16 first_rx_ring; + u16 last_rx_ring; + +} netmap_if_t; + +typedef struct { + char * mem; /* base address returned by mmap() */ + u32 region_size; /* mapped size in bytes, 0 while not mapped */ + int refcnt; /* number of interfaces using this mapping */ +} netmap_mem_region_t; + +typedef struct { + CLIB_CACHE_LINE_ALIGN_MARK(cacheline0); + netmap_if_t * interfaces; + + /* bitmap of pending rx interfaces */ + uword * pending_input_bitmap; + + /* rx buffer cache */ + u32 * rx_buffers; + + /* hash of host interface names */ + mhash_t if_index_by_host_if_name; + + /* vector of memory regions */ + netmap_mem_region_t * mem_regions; +} netmap_main_t; + +netmap_main_t netmap_main; /* NOTE(review): this defines the object in a header, so every including .c file gets its own tentative definition; an extern declaration plus a single definition would be safer */ +extern vnet_device_class_t netmap_device_class; +extern vlib_node_registration_t netmap_input_node; + +int netmap_create_if(vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set, u8 is_pipe, u8 is_master); +int netmap_delete_if(vlib_main_t * vm, u8 * host_if_name); + + +/* Macros and helper functions from sys/net/netmap_user.h */ + +#define _NETMAP_OFFSET(type, ptr, offset) \ + ((type)(void *)((char *)(ptr) + (offset))) + +#define NETMAP_IF(_base, _ofs) _NETMAP_OFFSET(struct netmap_if *, _base, _ofs) + +#define NETMAP_TXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \ + nifp, (nifp)->ring_ofs[index] ) + +#define NETMAP_RXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \ + nifp, (nifp)->ring_ofs[index + (nifp)->ni_tx_rings + 1] ) + +#define NETMAP_BUF(ring, index) \ + ((char *)(ring) + (ring)->buf_ofs + ((index)*(ring)->nr_buf_size)) + +#define NETMAP_BUF_IDX(ring, buf) \ + ( ((char *)(buf) - ((char *)(ring) + (ring)->buf_ofs) ) / \ + (ring)->nr_buf_size ) +/* Number of slots between cur and tail: tail - cur, corrected by num_slots when the difference is negative. */ +static inline uint32_t +nm_ring_space(struct netmap_ring
*ring) +{ + int ret = ring->tail - ring->cur; + if (ret < 0) + ret += ring->num_slots; + return ret; +} + + diff --git a/vnet/vnet/devices/netmap/node.c b/vnet/vnet/devices/netmap/node.c new file mode 100644 index 00000000000..f866a282cfb --- /dev/null +++ b/vnet/vnet/devices/netmap/node.c @@ -0,0 +1,284 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include +#include +#include + +#include + +#define foreach_netmap_input_error /* empty: this node defines no error counters yet */ + +typedef enum { +#define _(f,s) NETMAP_INPUT_ERROR_##f, + foreach_netmap_input_error +#undef _ + NETMAP_INPUT_N_ERROR, +} netmap_input_error_t; + +static char * netmap_input_error_strings[] = { +#define _(n,s) s, + foreach_netmap_input_error +#undef _ +}; + +enum { + NETMAP_INPUT_NEXT_DROP, + NETMAP_INPUT_NEXT_ETHERNET_INPUT, + NETMAP_INPUT_N_NEXT, +}; + +typedef struct { + u32 next_index; + u32 hw_if_index; + struct netmap_slot slot; /* copy of the netmap slot the packet was read from */ +} netmap_input_trace_t; +/* Trace formatter: prints hw_if_index, the next index and the saved slot's flags, len and buf_idx. */ +static u8 * format_netmap_input_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + netmap_input_trace_t * t = va_arg (*args, netmap_input_trace_t *); + uword indent = format_get_indent (s); + + s = format (s, "netmap: hw_if_index %d next-index %d",
t->hw_if_index, t->next_index); + s = format (s, "\n%Uslot: flags 0x%x len %u buf_idx %u", + format_white_space, indent + 2, + t->slot.flags, t->slot.len, t->slot.buf_idx); + return s; +} +/* Link buffer bi after prev_bi in the chain headed by first_bi and add its length to the head's chained-length total (mirrored into the rte_mbufs when built with DPDK). */ +always_inline void +buffer_add_to_chain(vlib_main_t *vm, u32 bi, u32 first_bi, u32 prev_bi) +{ + vlib_buffer_t * b = vlib_get_buffer (vm, bi); + vlib_buffer_t * first_b = vlib_get_buffer (vm, first_bi); + vlib_buffer_t * prev_b = vlib_get_buffer (vm, prev_bi); + + /* update first buffer */ + first_b->total_length_not_including_first_buffer += b->current_length; + + /* update previous buffer */ + prev_b->next_buffer = bi; + prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT; + + /* update current buffer */ + b->next_buffer = 0; + +#if DPDK > 0 + struct rte_mbuf * mbuf = rte_mbuf_from_vlib_buffer(b); + struct rte_mbuf * first_mbuf = rte_mbuf_from_vlib_buffer(first_b); + struct rte_mbuf * prev_mbuf = rte_mbuf_from_vlib_buffer(prev_b); + first_mbuf->nb_segs++; + prev_mbuf->next = mbuf; + mbuf->data_len = b->current_length; + mbuf->data_off = RTE_PKTMBUF_HEADROOM + b->current_data; + mbuf->next = 0; +#endif +} +/* Drain the rx rings of interface device_idx: copy each slot into one or more vlib buffers, hand the packets to the next node, release the consumed slots and account rx packets/bytes. Stops early when the buffer cache runs dry. Returns the number of packets received. */ +always_inline uword +netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame, u32 device_idx) +{ + u32 next_index = NETMAP_INPUT_NEXT_ETHERNET_INPUT; + uword n_trace = vlib_get_trace_count (vm, node); + netmap_main_t * nm = &netmap_main; + netmap_if_t * nif = pool_elt_at_index(nm->interfaces, device_idx); + u32 n_rx_packets = 0; + u32 n_rx_bytes = 0; + u32 * to_next = 0; + u32 n_free_bufs; + struct netmap_ring * ring; + int cur_ring; + u32 n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + + if (nif->per_interface_next_index != ~0) + next_index = nif->per_interface_next_index; + + n_free_bufs = vec_len (nm->rx_buffers); + if (PREDICT_FALSE(n_free_bufs < VLIB_FRAME_SIZE)) + { + vec_validate(nm->rx_buffers, VLIB_FRAME_SIZE + n_free_bufs - 1); + n_free_bufs += vlib_buffer_alloc(vm,
&nm->rx_buffers[n_free_bufs], VLIB_FRAME_SIZE); + _vec_len (nm->rx_buffers) = n_free_bufs; + } + + cur_ring = nif->first_rx_ring; + while (cur_ring <= nif->last_rx_ring && n_free_bufs) + { + int r = 0; + u32 cur_slot_index; + ring = NETMAP_RXRING(nif->nifp, cur_ring); + r = nm_ring_space(ring); + + if (!r) + { + cur_ring++; + continue; + } + + if (r > n_free_bufs) + r = n_free_bufs; + + cur_slot_index = ring->cur; + while (r && n_free_bufs) /* a slot may need several buffers; stop when the cache is empty and leave the rest in the ring */ + { + u32 n_left_to_next; + u32 next0 = next_index; + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (r && n_left_to_next && n_free_bufs) /* never enqueue a packet for which no buffer could be taken */ + { + vlib_buffer_t * b0, * first_b0 = 0; + u32 offset = 0; + u32 bi0 = 0, first_bi0 = 0, prev_bi0; + u32 next_slot_index = (cur_slot_index + 1) % ring->num_slots; + u32 next2_slot_index = (cur_slot_index + 2) % ring->num_slots; + struct netmap_slot * slot = &ring->slot[cur_slot_index]; + u32 data_len = slot->len; + + /* prefetch 2 slots in advance */ + CLIB_PREFETCH (&ring->slot[next2_slot_index], CLIB_CACHE_LINE_BYTES, LOAD); + /* prefetch start of next packet */ + CLIB_PREFETCH (NETMAP_BUF(ring, ring->slot[next_slot_index].buf_idx), + CLIB_CACHE_LINE_BYTES, LOAD); + + while (data_len && n_free_bufs) + { + /* grab free buffer */ + u32 last_empty_buffer = vec_len (nm->rx_buffers) - 1; + prev_bi0 = bi0; + bi0 = nm->rx_buffers[last_empty_buffer]; + b0 = vlib_get_buffer (vm, bi0); + _vec_len (nm->rx_buffers) = last_empty_buffer; + n_free_bufs--; + + /* copy data */ + u32 bytes_to_copy = data_len > n_buffer_bytes ?
n_buffer_bytes : data_len; + b0->current_data = 0; + clib_memcpy (vlib_buffer_get_current (b0), + (u8 *) NETMAP_BUF(ring, slot->buf_idx) + offset, + bytes_to_copy); + + /* fill buffer header */ + b0->clone_count = 0; + b0->current_length = bytes_to_copy; + + if (offset == 0) + { +#if DPDK > 0 + struct rte_mbuf * mb = rte_mbuf_from_vlib_buffer(b0); + rte_pktmbuf_data_len (mb) = b0->current_length; + rte_pktmbuf_pkt_len (mb) = b0->current_length; +#endif + b0->total_length_not_including_first_buffer = 0; + b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID; + vnet_buffer(b0)->sw_if_index[VLIB_RX] = nif->sw_if_index; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0; + first_bi0 = bi0; + first_b0 = vlib_get_buffer(vm, first_bi0); + } + else + buffer_add_to_chain(vm, bi0, first_bi0, prev_bi0); + + offset += bytes_to_copy; + data_len -= bytes_to_copy; + } + + /* trace */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT(first_b0); + if (PREDICT_FALSE(n_trace > 0)) + { + netmap_input_trace_t *tr; + vlib_trace_buffer (vm, node, next0, first_b0, /* follow_chain */ 0); + vlib_set_trace_count (vm, node, --n_trace); + tr = vlib_add_trace (vm, node, first_b0, sizeof (*tr)); + tr->next_index = next0; + tr->hw_if_index = nif->hw_if_index; + memcpy (&tr->slot, slot, sizeof (struct netmap_slot)); + } + /* enqueue and take next packet */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, first_bi0, next0); + + /* next packet */ + n_rx_packets++; + n_rx_bytes += slot->len; + to_next[0] = first_bi0; + to_next += 1; + n_left_to_next--; + cur_slot_index = next_slot_index; + + r--; + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + ring->head = ring->cur = cur_slot_index; + cur_ring++; + } + + if (n_rx_packets) + ioctl(nif->fd, NIOCRXSYNC, NULL); /* rx slots were released above, so sync the rx queues */ + + vlib_increment_combined_counter + (vnet_get_main()->interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + os_get_cpu_number(), + nif->sw_if_index, /* the sw_if counters are indexed by sw_if_index */ + n_rx_packets, n_rx_bytes); + + return
n_rx_packets; +} +/* Input node function: for every interface flagged in pending_input_bitmap, clear its bit and run netmap_device_input_fn(); returns the total number of packets received. */ +static uword +netmap_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + int i; + u32 n_rx_packets = 0; + + netmap_main_t * nm = &netmap_main; + + clib_bitmap_foreach (i, nm->pending_input_bitmap, + ({ + clib_bitmap_set (nm->pending_input_bitmap, i, 0); + n_rx_packets += netmap_device_input_fn(vm, node, frame, i); + })); + + return n_rx_packets; +} + + +VLIB_REGISTER_NODE (netmap_input_node) = { + .function = netmap_input_fn, + .name = "netmap-input", + .format_trace = format_netmap_input_trace, + .type = VLIB_NODE_TYPE_INPUT, + .state = VLIB_NODE_STATE_INTERRUPT, + .n_errors = NETMAP_INPUT_N_ERROR, + .error_strings = netmap_input_error_strings, + + .n_next_nodes = NETMAP_INPUT_N_NEXT, + .next_nodes = { + [NETMAP_INPUT_NEXT_DROP] = "error-drop", + [NETMAP_INPUT_NEXT_ETHERNET_INPUT] = "ethernet-input", + }, +}; + diff --git a/vnet/vnet/vcgn/cnat_cli_handler.c b/vnet/vnet/vcgn/cnat_cli_handler.c index 46c8493501b..e76d7cc4248 100644 --- a/vnet/vnet/vcgn/cnat_cli_handler.c +++ b/vnet/vnet/vcgn/cnat_cli_handler.c @@ -37,6 +37,10 @@ #include +#if DPDK +#include +#endif + u32 show_debug_level = 0; u32 diff --git a/vnet/vnet/vnet.h b/vnet/vnet/vnet.h index 2378c2420b8..9254da62372 100644 --- a/vnet/vnet/vnet.h +++ b/vnet/vnet/vnet.h @@ -84,9 +84,4 @@ vnet_main_t **vnet_mains; #include #include -#if DPDK > 0 -#include -#include -#endif - #endif /* included_vnet_vnet_h */ -- 2.16.6