From: Neale Ranns Date: Fri, 5 Feb 2021 09:04:35 +0000 (+0000) Subject: linux-cp: Linux Control Plane Netlink Listener X-Git-Tag: v22.10-rc0~451 X-Git-Url: https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commitdiff_plain;h=616447c392311791e630a604a07a2c7e47dbb7d6 linux-cp: Linux Control Plane Netlink Listener Type: feature please see FEATURE.yaml for details. Signed-off-by: Neale Ranns Signed-off-by: Matthew Smith Signed-off-by: Jon Loeliger Signed-off-by: Pim van Pelt Signed-off-by: Neale Ranns Change-Id: I6255fd9953d0b03e6b4fe75b67a6845a7c206f74 Signed-off-by: Pim van Pelt --- diff --git a/Makefile b/Makefile index 50f33bc8922..031bb7b1ebf 100644 --- a/Makefile +++ b/Makefile @@ -71,7 +71,7 @@ DEB_DEPENDS += libffi-dev python3-ply libmbedtls-dev DEB_DEPENDS += cmake ninja-build uuid-dev python3-jsonschema python3-yaml DEB_DEPENDS += python3-venv # ensurepip DEB_DEPENDS += python3-dev # needed for python3 -m pip install psutil -DEB_DEPENDS += libnl-3-dev libnl-route-3-dev +DEB_DEPENDS += libnl-3-dev libnl-route-3-dev libmnl-dev DEB_DEPENDS += enchant # for docs DEB_DEPENDS += python3-virtualenv DEB_DEPENDS += libssl-dev @@ -116,7 +116,7 @@ RPM_DEPENDS += mbedtls-devel RPM_DEPENDS += ccache RPM_DEPENDS += xmlto RPM_DEPENDS += elfutils-libelf-devel -RPM_DEPENDS += libnl3-devel +RPM_DEPENDS += libnl3-devel libmnl-devel ifeq ($(OS_ID),fedora) RPM_DEPENDS += dnf-utils diff --git a/src/plugins/linux-cp/CMakeLists.txt b/src/plugins/linux-cp/CMakeLists.txt index 5053207fff4..a30ece80501 100644 --- a/src/plugins/linux-cp/CMakeLists.txt +++ b/src/plugins/linux-cp/CMakeLists.txt @@ -60,3 +60,12 @@ add_vpp_plugin(linux_cp_unittest LINK_LIBRARIES lcp ) + +add_vpp_plugin(linux_nl + SOURCES + lcp_router.c + lcp_nl.c + + LINK_LIBRARIES + lcp +) diff --git a/src/plugins/linux-cp/FEATURE.yaml b/src/plugins/linux-cp/FEATURE.yaml index 088b0606f58..cf99b7aa5be 100644 --- a/src/plugins/linux-cp/FEATURE.yaml +++ b/src/plugins/linux-cp/FEATURE.yaml @@ -3,10 +3,10 @@ name: Linux Control Plane 
(integration) maintainer: Neale Ranns description: |- - This plugin provides the beginnings of an integration with the - Linux network stack. - The plugin provides the capability to 'mirror' VPP interfaces in - the Linux kernel. This means that for any interface in VPP the user + These plugins provide an integration with the Linux network stack. + + The "linux_cp" plugin provides the capability to 'mirror' VPP interfaces + in the Linux kernel. This means that for any interface in VPP the user can create a corresponding TAP or TUN device in the Linux kernel and have VPP plumb them together. The plumbing mechanics is different in each direction. @@ -17,8 +17,7 @@ description: |- In the TX direction, packets received by VPP an the mirror Tap/Tun are cross-connected to the VPP interfaces. For IP packets, IP output features are applied. - This is the beginnings of integration, because there needs to be - an external agent that will configure (and synchronize) the IP + The "linux_nl" plugin listens to netlink messages and synchronizes the IP configuration of the paired interfaces. state: experimental diff --git a/src/plugins/linux-cp/lcp.rst b/src/plugins/linux-cp/lcp.rst index f19981297a6..6f82a29bfbb 100644 --- a/src/plugins/linux-cp/lcp.rst +++ b/src/plugins/linux-cp/lcp.rst @@ -42,10 +42,7 @@ interfaces. Any configuration that is made on these Linux interfaces, also needs to be applied on the corresponding physical interface in VPP. -This is functionality is not provided in this plugin, but it can be -achieved in various ways, for example by listening to the netlink -messages and applying the config. As a result all e.g. routes -programmed in Linux, will also be present in VPP's FIB. +This functionality is provided by the "linux_nl" plugin. Linux will own the [ARP/ND] neighbor tables (which will be copied via netlink to VPP also). 
This means that Linux will send packets with the diff --git a/src/plugins/linux-cp/lcp_api.c b/src/plugins/linux-cp/lcp_api.c index 01d66478b90..96aabb114a5 100644 --- a/src/plugins/linux-cp/lcp_api.c +++ b/src/plugins/linux-cp/lcp_api.c @@ -175,7 +175,6 @@ vl_api_lcp_default_ns_set_t_handler (vl_api_lcp_default_ns_set_t *mp) static void vl_api_lcp_default_ns_get_t_handler (vl_api_lcp_default_ns_get_t *mp) { - lcp_main_t *lcpm = &lcp_main; vl_api_lcp_default_ns_get_reply_t *rmp; vl_api_registration_t *reg; char *ns; @@ -186,7 +185,7 @@ vl_api_lcp_default_ns_get_t_handler (vl_api_lcp_default_ns_get_t *mp) rmp = vl_msg_api_alloc (sizeof (*rmp)); clib_memset (rmp, 0, sizeof (*rmp)); - rmp->_vl_msg_id = (VL_API_LCP_DEFAULT_NS_GET_REPLY + lcpm->msg_id_base); + rmp->_vl_msg_id = (VL_API_LCP_DEFAULT_NS_GET_REPLY); rmp->context = mp->context; ns = (char *) lcp_get_default_ns (); @@ -226,7 +225,7 @@ vl_api_lcp_itf_pair_replace_end_t_handler ( #include static clib_error_t * -lcp_plugin_api_hookup (vlib_main_t *vm) +lcp_api_init (vlib_main_t *vm) { /* Ask for a correctly-sized block of API message decode slots */ lcp_msg_id_base = setup_message_id_table (); @@ -234,7 +233,7 @@ lcp_plugin_api_hookup (vlib_main_t *vm) return (NULL); } -VLIB_INIT_FUNCTION (lcp_plugin_api_hookup); +VLIB_INIT_FUNCTION (lcp_api_init); #include VLIB_PLUGIN_REGISTER () = { diff --git a/src/plugins/linux-cp/lcp_interface.c b/src/plugins/linux-cp/lcp_interface.c index 3dbcb5987a1..3a6a6852f37 100644 --- a/src/plugins/linux-cp/lcp_interface.c +++ b/src/plugins/linux-cp/lcp_interface.c @@ -1159,7 +1159,7 @@ lcp_itf_pair_link_up_down (vnet_main_t *vnm, u32 hw_if_index, u32 flags) VNET_HW_INTERFACE_LINK_UP_DOWN_FUNCTION (lcp_itf_pair_link_up_down); static clib_error_t * -lcp_itf_pair_init (vlib_main_t *vm) +lcp_interface_init (vlib_main_t *vm) { vlib_punt_hdl_t punt_hdl = vlib_punt_client_register ("linux-cp"); @@ -1178,7 +1178,7 @@ lcp_itf_pair_init (vlib_main_t *vm) return NULL; } -VLIB_INIT_FUNCTION 
(lcp_itf_pair_init) = { +VLIB_INIT_FUNCTION (lcp_interface_init) = { .runs_after = VLIB_INITS ("vnet_interface_init", "tcp_init", "udp_init"), }; diff --git a/src/plugins/linux-cp/lcp_nl.c b/src/plugins/linux-cp/lcp_nl.c new file mode 100644 index 00000000000..8a55f4c5edd --- /dev/null +++ b/src/plugins/linux-cp/lcp_nl.c @@ -0,0 +1,582 @@ +/* + * Copyright (c) 2019 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define _GNU_SOURCE +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#include + +typedef enum nl_event_type_t_ +{ + NL_EVENT_READ, + NL_EVENT_ERR, +} nl_event_type_t; + +typedef struct nl_main +{ + + struct nl_sock *sk_route; + vlib_log_class_t nl_logger; + nl_vft_t *nl_vfts; + struct nl_cache *nl_caches[LCP_NL_N_OBJS]; + nl_msg_info_t *nl_msg_queue; + uword clib_file_index; + + u32 rx_buf_size; + u32 tx_buf_size; + u32 batch_size; + u32 batch_delay_ms; + +} nl_main_t; + +#define NL_RX_BUF_SIZE_DEF (1 << 27) /* 128 MB */ +#define NL_TX_BUF_SIZE_DEF (1 << 18) /* 256 kB */ +#define NL_BATCH_SIZE_DEF (1 << 11) /* 2048 */ +#define NL_BATCH_DELAY_MS_DEF 50 /* 50 ms, max 20 batch/s */ + +static nl_main_t nl_main = { + .rx_buf_size = NL_RX_BUF_SIZE_DEF, + .tx_buf_size = NL_TX_BUF_SIZE_DEF, + .batch_size = NL_BATCH_SIZE_DEF, + .batch_delay_ms = NL_BATCH_DELAY_MS_DEF, +}; + +/* #define 
foreach_nl_nft_proto \ */ +/* _(IP4, "ip", AF_INT) \ */ +/* _(IP6, "ip6", NFPROTO_IPV6) */ + +/* typedef enum nl_nft_proto_t_ */ +/* { */ +/* #define _(a,b,c) NL_NFT_PROTO_##a = c, */ +/* foreach_nl_nft_proto */ +/* #undef _ */ +/* } nl_nft_proto_t; */ + +#define FOREACH_VFT(__func, __arg) \ + { \ + nl_main_t *nm = &nl_main; \ + nl_vft_t *__nv; \ + vec_foreach (__nv, nm->nl_vfts) \ + { \ + if (!__nv->__func.cb) \ + continue; \ + \ + if (!__nv->__func.is_mp_safe) \ + vlib_worker_thread_barrier_sync (vlib_get_main ()); \ + \ + __nv->__func.cb (__arg); \ + \ + if (!__nv->__func.is_mp_safe) \ + vlib_worker_thread_barrier_release (vlib_get_main ()); \ + } \ + } + +#define FOREACH_VFT_CTX(__func, __arg, __ctx) \ + { \ + nl_main_t *nm = &nl_main; \ + nl_vft_t *__nv; \ + vec_foreach (__nv, nm->nl_vfts) \ + { \ + if (!__nv->__func.cb) \ + continue; \ + \ + if (!__nv->__func.is_mp_safe) \ + vlib_worker_thread_barrier_sync (vlib_get_main ()); \ + \ + __nv->__func.cb (__arg, __ctx); \ + \ + if (!__nv->__func.is_mp_safe) \ + vlib_worker_thread_barrier_release (vlib_get_main ()); \ + } \ + } + +void +nl_register_vft (const nl_vft_t *nv) +{ + nl_main_t *nm = &nl_main; + + vec_add1 (nm->nl_vfts, *nv); +} + +#define NL_DBG(...) vlib_log_debug (nl_main.nl_logger, __VA_ARGS__); +#define NL_INFO(...) vlib_log_notice (nl_main.nl_logger, __VA_ARGS__); +#define NL_ERROR(...) 
vlib_log_err (nl_main.nl_logger, __VA_ARGS__); + +static void lcp_nl_open_socket (void); +static void lcp_nl_close_socket (void); + +static void +nl_route_del (struct rtnl_route *rr, void *arg) +{ + FOREACH_VFT (nvl_rt_route_del, rr); +} + +static void +nl_route_add (struct rtnl_route *rr, void *arg) +{ + FOREACH_VFT (nvl_rt_route_add, rr); +} + +static void +nl_neigh_del (struct rtnl_neigh *rn, void *arg) +{ + FOREACH_VFT (nvl_rt_neigh_del, rn); +} + +static void +nl_neigh_add (struct rtnl_neigh *rn, void *arg) +{ + FOREACH_VFT (nvl_rt_neigh_add, rn); +} + +static void +nl_link_addr_del (struct rtnl_addr *rla, void *arg) +{ + FOREACH_VFT (nvl_rt_addr_del, rla); +} + +static void +nl_link_addr_add (struct rtnl_addr *rla, void *arg) +{ + FOREACH_VFT (nvl_rt_addr_add, rla); +} + +static void +nl_link_del (struct rtnl_link *rl, void *arg) +{ + FOREACH_VFT_CTX (nvl_rt_link_del, rl, arg); +} + +static void +nl_link_add (struct rtnl_link *rl, void *arg) +{ + FOREACH_VFT_CTX (nvl_rt_link_add, rl, arg); +} + +static void +nl_route_dispatch (struct nl_object *obj, void *arg) +{ + /* nothing can be done without interface mappings */ + if (!lcp_itf_num_pairs ()) + return; + + switch (nl_object_get_msgtype (obj)) + { + case RTM_NEWROUTE: + nl_route_add ((struct rtnl_route *) obj, arg); + break; + case RTM_DELROUTE: + nl_route_del ((struct rtnl_route *) obj, arg); + break; + case RTM_NEWNEIGH: + nl_neigh_add ((struct rtnl_neigh *) obj, arg); + break; + case RTM_DELNEIGH: + nl_neigh_del ((struct rtnl_neigh *) obj, arg); + break; + case RTM_NEWADDR: + nl_link_addr_add ((struct rtnl_addr *) obj, arg); + break; + case RTM_DELADDR: + nl_link_addr_del ((struct rtnl_addr *) obj, arg); + break; + case RTM_NEWLINK: + nl_link_add ((struct rtnl_link *) obj, arg); + break; + case RTM_DELLINK: + nl_link_del ((struct rtnl_link *) obj, arg); + break; + default: + NL_INFO ("unhandled: %s", nl_object_get_type (obj)); + break; + } +} + +static int +nl_route_process_msgs (void) +{ + nl_main_t *nm 
= &nl_main; + nl_msg_info_t *msg_info; + int err, n_msgs = 0; + + /* process a batch of messages. break if we hit our limit */ + vec_foreach (msg_info, nm->nl_msg_queue) + { + if ((err = nl_msg_parse (msg_info->msg, nl_route_dispatch, msg_info)) < + 0) + NL_ERROR ("Unable to parse object: %s", nl_geterror (err)); + nlmsg_free (msg_info->msg); + if (++n_msgs >= nm->batch_size) + break; + } + + /* remove the messages we processed from the head of the queue */ + if (n_msgs) + vec_delete (nm->nl_msg_queue, n_msgs, 0); + + NL_INFO ("Processed %u messages", n_msgs); + + return n_msgs; +} + +#define DAY_F64 (1.0 * (24 * 60 * 60)) + +static uword +nl_route_process (vlib_main_t *vm, vlib_node_runtime_t *node, + vlib_frame_t *frame) +{ + nl_main_t *nm = &nl_main; + uword event_type; + uword *event_data = 0; + f64 wait_time = DAY_F64; + + while (1) + { + /* If we process a batch of messages and stop because we reached the + * batch size limit, we want to wake up after the batch delay and + * process more. Otherwise we just want to wait for a read event. + */ + vlib_process_wait_for_event_or_clock (vm, wait_time); + event_type = vlib_process_get_events (vm, &event_data); + + switch (event_type) + { + /* process batch of queued messages on timeout or read event signal */ + case ~0: + case NL_EVENT_READ: + nl_route_process_msgs (); + wait_time = (vec_len (nm->nl_msg_queue) != 0) ? 
+ nm->batch_delay_ms * 1e-3 : + DAY_F64; + break; + + /* reopen the socket if there was an error polling/reading it */ + case NL_EVENT_ERR: + lcp_nl_close_socket (); + lcp_nl_open_socket (); + break; + + default: + NL_ERROR ("Unknown event type: %u", (u32) event_type); + } + + vec_reset_length (event_data); + } + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (nl_route_process_node, static) = { + .function = nl_route_process, + .name = "linux-cp-netlink-process", + .type = VLIB_NODE_TYPE_PROCESS, + .process_log2_n_stack_bytes = 17, +}; + +static int +nl_route_cb (struct nl_msg *msg, void *arg) +{ + nl_main_t *nm = &nl_main; + nl_msg_info_t *msg_info = 0; + + /* delay processing - increment ref count and queue for later */ + vec_add2 (nm->nl_msg_queue, msg_info, 1); + + /* store a timestamp for the message */ + msg_info->ts = vlib_time_now (vlib_get_main ()); + msg_info->msg = msg; + nlmsg_get (msg); + + /* notify process node */ + vlib_process_signal_event (vlib_get_main (), nl_route_process_node.index, + NL_EVENT_READ, 0); + + return 0; +} + +int +lcp_nl_drain_messages (void) +{ + int err; + nl_main_t *nm = &nl_main; + + /* Read until there's an error. Unless the error is ENOBUFS, which means + * the kernel couldn't send a message due to socket buffer overflow. + * Continue reading when that happens. + * + * libnl translates both ENOBUFS and ENOMEM to NLE_NOMEM. So we need to + * check return status and errno to make sure we should keep going. 
+ */ + while ((err = nl_recvmsgs_default (nm->sk_route)) > -1 || + (err == -NLE_NOMEM && errno == ENOBUFS)) + ; + + /* If there was an error other then EAGAIN, signal process node */ + if (err != -NLE_AGAIN) + vlib_process_signal_event (vlib_get_main (), nl_route_process_node.index, + NL_EVENT_ERR, 0); + + return err; +} + +void +lcp_nl_pair_add_cb (lcp_itf_pair_t *pair) +{ + lcp_nl_drain_messages (); +} + +static clib_error_t * +nl_route_read_cb (clib_file_t *f) +{ + int err; + err = lcp_nl_drain_messages (); + if (err < 0 && err != -NLE_AGAIN) + NL_ERROR ("Error reading netlink socket (fd %d): %s (%d)", + f->file_descriptor, nl_geterror (err), err); + + return 0; +} + +static clib_error_t * +nl_route_error_cb (clib_file_t *f) +{ + NL_ERROR ("Error polling netlink socket (fd %d)", f->file_descriptor); + + /* notify process node */ + vlib_process_signal_event (vlib_get_main (), nl_route_process_node.index, + NL_EVENT_ERR, 0); + + return clib_error_return (0, "Error polling netlink socket %d", + f->file_descriptor); +} + +struct nl_cache * +lcp_nl_get_cache (lcp_nl_obj_t t) +{ + nl_main_t *nm = &nl_main; + + return nm->nl_caches[t]; +} + +/* Set the RX buffer size to be used on the netlink socket */ +void +lcp_nl_set_buffer_size (u32 buf_size) +{ + nl_main_t *nm = &nl_main; + + nm->rx_buf_size = buf_size; + + if (nm->sk_route) + nl_socket_set_buffer_size (nm->sk_route, nm->rx_buf_size, nm->tx_buf_size); +} + +/* Set the batch size - maximum netlink messages to process at one time */ +void +lcp_nl_set_batch_size (u32 batch_size) +{ + nl_main_t *nm = &nl_main; + + nm->batch_size = batch_size; +} + +/* Set the batch delay - how long to wait in ms between processing batches */ +void +lcp_nl_set_batch_delay (u32 batch_delay_ms) +{ + nl_main_t *nm = &nl_main; + + nm->batch_delay_ms = batch_delay_ms; +} + +static clib_error_t * +lcp_itf_pair_config (vlib_main_t *vm, unformat_input_t *input) +{ + u32 buf_size, batch_size, batch_delay_ms; + + while (unformat_check_input 
(input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "nl-rx-buffer-size %u", &buf_size)) + lcp_nl_set_buffer_size (buf_size); + else if (unformat (input, "nl-batch-size %u", &batch_size)) + lcp_nl_set_batch_size (batch_size); + else if (unformat (input, "nl-batch-delay-ms %u", &batch_delay_ms)) + lcp_nl_set_batch_delay (batch_delay_ms); + else + return clib_error_return (0, "invalid netlink option: %U", + format_unformat_error, input); + } + + return NULL; +} + +VLIB_CONFIG_FUNCTION (lcp_itf_pair_config, "linux-nl"); + +static void +lcp_nl_close_socket (void) +{ + nl_main_t *nm = &nl_main; + + /* delete existing fd from epoll fd set */ + if (nm->clib_file_index != ~0) + { + clib_file_main_t *fm = &file_main; + clib_file_t *f = clib_file_get (fm, nm->clib_file_index); + + if (f) + { + NL_INFO ("Stopping poll of fd %u", f->file_descriptor); + fm->file_update (f, UNIX_FILE_UPDATE_DELETE); + } + else + /* stored index was not a valid file, reset stored index to ~0 */ + nm->clib_file_index = ~0; + } + + /* If we already created a socket, close/free it */ + if (nm->sk_route) + { + NL_INFO ("Closing netlink socket %d", nl_socket_get_fd (nm->sk_route)); + nl_socket_free (nm->sk_route); + nm->sk_route = NULL; + } +} + +static void +lcp_nl_open_socket (void) +{ + nl_main_t *nm = &nl_main; + int dest_ns_fd, curr_ns_fd; + + /* Allocate a new socket for both routes and acls + * Notifications do not use sequence numbers, disable sequence number + * checking. 
+ * Define a callback function, which will be called for each notification + * received + */ + nm->sk_route = nl_socket_alloc (); + nl_socket_disable_seq_check (nm->sk_route); + + dest_ns_fd = lcp_get_default_ns_fd (); + if (dest_ns_fd) + { + curr_ns_fd = open ("/proc/self/ns/net", O_RDONLY); + setns (dest_ns_fd, CLONE_NEWNET); + } + + nl_connect (nm->sk_route, NETLINK_ROUTE); + + if (dest_ns_fd) + { + setns (curr_ns_fd, CLONE_NEWNET); + close (curr_ns_fd); + } + + /* Subscribe to all the 'routing' notifications on the route socket */ + nl_socket_add_memberships (nm->sk_route, RTNLGRP_LINK, RTNLGRP_IPV6_IFADDR, + RTNLGRP_IPV4_IFADDR, RTNLGRP_IPV4_ROUTE, + RTNLGRP_IPV6_ROUTE, RTNLGRP_NEIGH, RTNLGRP_NOTIFY, +#ifdef RTNLGRP_MPLS_ROUTE /* not defined on CentOS/RHEL 7 */ + RTNLGRP_MPLS_ROUTE, +#endif + RTNLGRP_IPV4_RULE, RTNLGRP_IPV6_RULE, 0); + + /* Set socket in nonblocking mode and increase buffer sizes */ + nl_socket_set_nonblocking (nm->sk_route); + nl_socket_set_buffer_size (nm->sk_route, nm->rx_buf_size, nm->tx_buf_size); + + if (nm->clib_file_index == ~0) + { + clib_file_t rt_file = { + .read_function = nl_route_read_cb, + .error_function = nl_route_error_cb, + .file_descriptor = nl_socket_get_fd (nm->sk_route), + .description = format (0, "linux-cp netlink route socket"), + }; + + nm->clib_file_index = clib_file_add (&file_main, &rt_file); + NL_INFO ("Added file %u", nm->clib_file_index); + } + else + /* clib file already created and socket was closed due to error */ + { + clib_file_main_t *fm = &file_main; + clib_file_t *f = clib_file_get (fm, nm->clib_file_index); + + f->file_descriptor = nl_socket_get_fd (nm->sk_route); + fm->file_update (f, UNIX_FILE_UPDATE_ADD); + NL_INFO ("Starting poll of %d", f->file_descriptor); + } + + nl_socket_modify_cb (nm->sk_route, NL_CB_VALID, NL_CB_CUSTOM, nl_route_cb, + NULL); + NL_INFO ("Opened netlink socket %d", nl_socket_get_fd (nm->sk_route)); +} + +#include +clib_error_t * +lcp_nl_init (vlib_main_t *vm) +{ + nl_main_t 
*nm = &nl_main; + lcp_itf_pair_vft_t nl_itf_pair_vft = { + .pair_add_fn = lcp_nl_pair_add_cb, + }; + + nm->clib_file_index = ~0; + nm->nl_logger = vlib_log_register_class ("nl", "nl"); + + lcp_nl_open_socket (); + lcp_itf_pair_register_vft (&nl_itf_pair_vft); + + return (NULL); +} + +VLIB_INIT_FUNCTION (lcp_nl_init) = { + .runs_after = VLIB_INITS ("lcp_interface_init", "tuntap_init", + "ip_neighbor_init"), +}; + +#include +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .description = "linux Control Plane - Netlink listener", + .default_disabled = 1, +}; + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/linux-cp/lcp_nl.h b/src/plugins/linux-cp/lcp_nl.h new file mode 100644 index 00000000000..0016da7bbad --- /dev/null +++ b/src/plugins/linux-cp/lcp_nl.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2019 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include + +typedef void (*nl_rt_link_cb_t) (struct rtnl_link *rl, void *ctx); +typedef void (*nl_rt_addr_cb_t) (struct rtnl_addr *ra); +typedef void (*nl_rt_neigh_cb_t) (struct rtnl_neigh *rr); +typedef void (*nl_rt_route_cb_t) (struct rtnl_route *rn); + +#define NL_RT_COMMON uword is_mp_safe + +typedef struct nl_rt_link_t_ +{ + NL_RT_COMMON; + + nl_rt_link_cb_t cb; +} nl_rt_link_t; + +typedef struct nl_rt_addr_t_ +{ + NL_RT_COMMON; + + nl_rt_addr_cb_t cb; +} nl_rt_addr_t; + +typedef struct nl_rt_neigh_t_ +{ + NL_RT_COMMON; + + nl_rt_neigh_cb_t cb; +} nl_rt_neigh_t; + +typedef struct nl_rt_route_t_ +{ + NL_RT_COMMON; + + nl_rt_route_cb_t cb; +} nl_rt_route_t; + +#undef NL_RT_COMMON + +typedef struct nl_vft_t_ +{ + nl_rt_link_t nvl_rt_link_add; + nl_rt_link_t nvl_rt_link_del; + nl_rt_addr_t nvl_rt_addr_add; + nl_rt_addr_t nvl_rt_addr_del; + nl_rt_neigh_t nvl_rt_neigh_add; + nl_rt_neigh_t nvl_rt_neigh_del; + nl_rt_route_t nvl_rt_route_add; + nl_rt_route_t nvl_rt_route_del; +} nl_vft_t; + +extern void nl_register_vft (const nl_vft_t *nv); + +typedef enum lcp_nl_obj_t_ +{ + LCP_NL_LINK, + LCP_NL_ADDR, + LCP_NL_NEIGH, + LCP_NL_ROUTE, +} lcp_nl_obj_t; + +/* struct type to hold context on the netlink message being processed. + * + * At creation of a pair, a tap/tun is created and configured to match its + * corresponding hardware interface (MAC address, link state, MTU). Netlink + * messages are sent announcing the creation and subsequent configuration. + * We do not need to (and should not) act on those messages since applying + * those same configurations again is unnecessary and can be disruptive. So + * a timestamp for a message is stored and can be compared against the time + * the interface came under linux-cp management in order to figure out + * whether we should apply any configuration. 
+ */ +typedef struct nl_msg_info +{ + struct nl_msg *msg; + f64 ts; +} nl_msg_info_t; + +#define LCP_NL_N_OBJS (LCP_NL_ROUTE + 1) + +extern struct nl_cache *lcp_nl_get_cache (lcp_nl_obj_t t); +extern int lcp_nl_drain_messages (void); +extern void lcp_nl_set_buffer_size (u32 buf_size); +extern void lcp_nl_set_batch_size (u32 batch_size); +extern void lcp_nl_set_batch_delay (u32 batch_delay_ms); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/linux-cp/lcp_router.c b/src/plugins/linux-cp/lcp_router.c new file mode 100644 index 00000000000..598fb13e979 --- /dev/null +++ b/src/plugins/linux-cp/lcp_router.c @@ -0,0 +1,1053 @@ +/* + * Copyright (c) 2019 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +//#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +typedef struct lcp_router_table_t_ +{ + uint32_t nlt_id; + fib_protocol_t nlt_proto; + u32 nlt_fib_index; + u32 nlt_mfib_index; + u32 nlt_refs; +} lcp_router_table_t; + +static uword *lcp_router_table_db[FIB_PROTOCOL_MAX]; +static lcp_router_table_t *lcp_router_table_pool; +static vlib_log_class_t lcp_router_logger; + +const static fib_prefix_t pfx_all1s = { + .fp_addr = { + .ip4 = { + .as_u32 = 0xffffffff, + } + }, + .fp_proto = FIB_PROTOCOL_IP4, + .fp_len = 32, +}; + +static fib_source_t lcp_rt_fib_src; +static fib_source_t lcp_rt_fib_src_dynamic; + +#define LCP_ROUTER_DBG(...) vlib_log_debug (lcp_router_logger, __VA_ARGS__); + +#define LCP_ROUTER_INFO(...) vlib_log_notice (lcp_router_logger, __VA_ARGS__); + +#define LCP_ROUTER_ERROR(...) vlib_log_err (lcp_router_logger, __VA_ARGS__); + +static const mfib_prefix_t ip4_specials[] = { + /* ALL prefixes are in network order */ + { + /* (*,224.0.0.0)/24 - all local subnet */ + .fp_grp_addr = { + .ip4.data_u32 = 0x000000e0, + }, + .fp_len = 24, + .fp_proto = FIB_PROTOCOL_IP4, + }, +}; + +static const mfib_prefix_t ip6_specials[] = { + /* ALL prefixes are in network order */ + { + /* (*,ff00::)/8 - all local subnet */ + .fp_grp_addr = { + .ip6.as_u64[0] = 0x00000000000000ff, + }, + .fp_len = 8, + .fp_proto = FIB_PROTOCOL_IP6, + }, +}; + +/* VIF to PHY DB of managed interfaces */ +static uword *lcp_routing_itf_db; + +static u32 +lcp_router_intf_h2p (u32 host) +{ + lcp_itf_pair_t *lip; + index_t lipi; + uword *p; + + /* + * first check the linux side created interface (i.e. 
vlans, tunnels etc) + */ + p = hash_get (lcp_routing_itf_db, host); + + if (p) + return p[0]; + + /* + * then check the paired phys + */ + lipi = lcp_itf_pair_find_by_vif (host); + + if (INDEX_INVALID == lipi) + return (~0); + + lip = lcp_itf_pair_get (lipi); + + return lip->lip_phy_sw_if_index; +} + +/* + * Check timestamps on netlink message and interface pair to decide whether + * the message should be applied. See the declaration of nl_msg_info_t for + * an explanation on why this is necessary. + * If timestamps are good (message ts is newer than intf pair ts), return 0. + * Else, return -1. + */ +static int +lcp_router_lip_ts_check (nl_msg_info_t *msg_info, lcp_itf_pair_t *lip) +{ + if (msg_info->ts > lip->lip_create_ts) + return 0; + + LCP_ROUTER_INFO ("Early message received for %U", + format_vnet_sw_if_index_name, vnet_get_main (), + lip->lip_phy_sw_if_index); + return -1; +} + +static void +lcp_router_link_del (struct rtnl_link *rl, void *ctx) +{ + index_t lipi; + + if (!lcp_auto_subint ()) + return; + + lipi = lcp_itf_pair_find_by_vif (rtnl_link_get_ifindex (rl)); + + if (INDEX_INVALID != lipi) + { + lcp_itf_pair_t *lip; + + lip = lcp_itf_pair_get (lipi); + + if (lcp_router_lip_ts_check ((nl_msg_info_t *) ctx, lip)) + return; + + LCP_ROUTER_INFO ("delete link: %s - %U", rtnl_link_get_type (rl), + format_vnet_sw_if_index_name, vnet_get_main (), + lip->lip_phy_sw_if_index); + lcp_itf_pair_delete (lip->lip_phy_sw_if_index); + + if (rtnl_link_is_vlan (rl)) + { + LCP_ROUTER_INFO ("delete vlan: %s -> %U", rtnl_link_get_name (rl), + format_vnet_sw_if_index_name, vnet_get_main (), + lip->lip_phy_sw_if_index); + vnet_delete_sub_interface (lip->lip_phy_sw_if_index); + vnet_delete_sub_interface (lip->lip_host_sw_if_index); + } + } + else + LCP_ROUTER_INFO ("ignore link del: %s - %s", rtnl_link_get_type (rl), + rtnl_link_get_name (rl)); +} + +static void +lcp_router_ip4_mroutes_add_del (u32 sw_if_index, u8 is_add) +{ + const fib_route_path_t path = { + .frp_proto = 
DPO_PROTO_IP4, + .frp_addr = zero_addr, + .frp_sw_if_index = sw_if_index, + .frp_fib_index = ~0, + .frp_weight = 1, + .frp_mitf_flags = MFIB_ITF_FLAG_ACCEPT, + }; + u32 mfib_index; + int ii; + + mfib_index = + mfib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP4, sw_if_index); + + for (ii = 0; ii < ARRAY_LEN (ip4_specials); ii++) + { + if (is_add) + { + mfib_table_entry_path_update (mfib_index, &ip4_specials[ii], + MFIB_SOURCE_PLUGIN_LOW, + MFIB_ENTRY_FLAG_NONE, &path); + } + else + { + mfib_table_entry_path_remove (mfib_index, &ip4_specials[ii], + MFIB_SOURCE_PLUGIN_LOW, &path); + } + } +} + +static void +lcp_router_ip6_mroutes_add_del (u32 sw_if_index, u8 is_add) +{ + const fib_route_path_t path = { + .frp_proto = DPO_PROTO_IP6, + .frp_addr = zero_addr, + .frp_sw_if_index = sw_if_index, + .frp_fib_index = ~0, + .frp_weight = 1, + .frp_mitf_flags = MFIB_ITF_FLAG_ACCEPT, + }; + u32 mfib_index; + int ii; + + mfib_index = + mfib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index); + + for (ii = 0; ii < ARRAY_LEN (ip6_specials); ii++) + { + if (is_add) + { + mfib_table_entry_path_update (mfib_index, &ip6_specials[ii], + MFIB_SOURCE_PLUGIN_LOW, + MFIB_ENTRY_FLAG_NONE, &path); + } + else + { + mfib_table_entry_path_remove (mfib_index, &ip6_specials[ii], + MFIB_SOURCE_PLUGIN_LOW, &path); + } + } +} + +static void +lcp_router_link_mtu (struct rtnl_link *rl, u32 sw_if_index) +{ + vnet_main_t *vnm = vnet_get_main (); + u32 mtu; + vnet_sw_interface_t *sw; + vnet_hw_interface_t *hw; + + mtu = rtnl_link_get_mtu (rl); + if (!mtu) + return; + + sw = vnet_get_sw_interface (vnm, sw_if_index); + hw = vnet_get_sup_hw_interface (vnm, sw_if_index); + + /* If HW interface, try to change hw link */ + if ((sw->type == sw->sup_sw_if_index) && + (hw->hw_class_index == ethernet_hw_interface_class.index)) + vnet_hw_interface_set_mtu (vnm, hw->hw_if_index, mtu); + else + vnet_sw_interface_set_mtu (vnm, sw->sw_if_index, mtu); +} + +static void +lcp_router_link_addr (struct 
rtnl_link *rl, lcp_itf_pair_t *lip) +{ + vnet_main_t *vnm = vnet_get_main (); + struct nl_addr *mac_addr; + vnet_sw_interface_t *sw; + vnet_hw_interface_t *hw; + void *mac_addr_bytes; + + mac_addr = rtnl_link_get_addr (rl); + if (!mac_addr || (nl_addr_get_family (mac_addr) != AF_LLC)) + return; + + sw = vnet_get_sw_interface (vnm, lip->lip_phy_sw_if_index); + + /* can only change address on hw interface */ + if (sw->sw_if_index != sw->sup_sw_if_index) + return; + + hw = vnet_get_sup_hw_interface (vnm, lip->lip_phy_sw_if_index); + if (!vec_len (hw->hw_address)) + return; + + mac_addr_bytes = nl_addr_get_binary_addr (mac_addr); + if (clib_memcmp (mac_addr_bytes, hw->hw_address, nl_addr_get_len (mac_addr))) + vnet_hw_interface_change_mac_address (vnm, hw->hw_if_index, + mac_addr_bytes); + + /* mcast adjacencies need to be updated */ + vnet_update_adjacency_for_sw_interface (vnm, lip->lip_phy_sw_if_index, + lip->lip_phy_adjs.adj_index[AF_IP4]); + vnet_update_adjacency_for_sw_interface (vnm, lip->lip_phy_sw_if_index, + lip->lip_phy_adjs.adj_index[AF_IP6]); +} + +static void +lcp_router_link_add (struct rtnl_link *rl, void *ctx) +{ + index_t lipi; + int up; + vnet_main_t *vnm = vnet_get_main (); + + lipi = lcp_itf_pair_find_by_vif (rtnl_link_get_ifindex (rl)); + up = IFF_UP & rtnl_link_get_flags (rl); + + if (INDEX_INVALID != lipi) + { + lcp_itf_pair_t *lip; + + lip = lcp_itf_pair_get (lipi); + if (!vnet_get_sw_interface (vnm, lip->lip_phy_sw_if_index)) + return; + + if (lcp_router_lip_ts_check ((nl_msg_info_t *) ctx, lip)) + return; + + if (up) + { + vnet_sw_interface_admin_up (vnet_get_main (), + lip->lip_phy_sw_if_index); + } + else + { + vnet_sw_interface_admin_down (vnet_get_main (), + lip->lip_phy_sw_if_index); + } + LCP_ROUTER_DBG ("link: %s (%d) -> %U/%U %s", rtnl_link_get_name (rl), + rtnl_link_get_ifindex (rl), format_vnet_sw_if_index_name, + vnm, lip->lip_phy_sw_if_index, + format_vnet_sw_if_index_name, vnm, + lip->lip_host_sw_if_index, (up ? 
"up" : "down")); + + lcp_router_link_mtu (rl, lip->lip_phy_sw_if_index); + lcp_router_link_addr (rl, lip); + } + else if (lcp_auto_subint () && rtnl_link_is_vlan (rl)) + { + /* Find the pair based on the parent VIF */ + lipi = lcp_itf_pair_find_by_vif (rtnl_link_get_link (rl)); + + if (INDEX_INVALID != lipi) + { + u32 sub_phy_sw_if_index, sub_host_sw_if_index; + const lcp_itf_pair_t *lip; + int vlan; + u8 *ns = 0; /* FIXME */ + + lip = lcp_itf_pair_get (lipi); + + vlan = rtnl_link_vlan_get_id (rl); + + /* create the vlan interface on the parent phy */ + if (vnet_create_sub_interface (lip->lip_phy_sw_if_index, vlan, 18, 0, + vlan, &sub_phy_sw_if_index)) + { + LCP_ROUTER_INFO ("failed create phy vlan: %s on %U", + rtnl_link_get_name (rl), + format_vnet_sw_if_index_name, vnet_get_main (), + lip->lip_phy_sw_if_index); + return; + } + /* create the vlan interface on the parent host */ + if (vnet_create_sub_interface (lip->lip_host_sw_if_index, vlan, 18, + 0, vlan, &sub_host_sw_if_index)) + { + LCP_ROUTER_INFO ("failed create vlan: %s on %U", + rtnl_link_get_name (rl), + format_vnet_sw_if_index_name, vnet_get_main (), + lip->lip_host_sw_if_index); + return; + } + + char *if_name; + u8 *if_namev = 0; + + LCP_ROUTER_INFO ( + "create vlan: %s -> (%U, %U) : (%U, %U)", rtnl_link_get_name (rl), + format_vnet_sw_if_index_name, vnet_get_main (), + lip->lip_phy_sw_if_index, format_vnet_sw_if_index_name, + vnet_get_main (), sub_phy_sw_if_index, + format_vnet_sw_if_index_name, vnet_get_main (), + lip->lip_host_sw_if_index, format_vnet_sw_if_index_name, + vnet_get_main (), sub_host_sw_if_index); + + if ((if_name = rtnl_link_get_name (rl)) != NULL) + vec_validate_init_c_string (if_namev, if_name, + strnlen (if_name, IFNAMSIZ)); + lcp_itf_pair_add (sub_host_sw_if_index, sub_phy_sw_if_index, + if_namev, rtnl_link_get_ifindex (rl), + lip->lip_host_type, ns); + if (up) + vnet_sw_interface_admin_up (vnet_get_main (), sub_phy_sw_if_index); + vnet_sw_interface_admin_up (vnet_get_main (), 
sub_host_sw_if_index); + + vec_free (if_namev); + } + else + { + LCP_ROUTER_INFO ("ignore parent-link add: %s - %s", + rtnl_link_get_type (rl), rtnl_link_get_name (rl)); + } + } + else + LCP_ROUTER_INFO ("ignore link add: %s - %s", rtnl_link_get_type (rl), + rtnl_link_get_name (rl)); +} + +static fib_protocol_t +lcp_router_proto_k2f (uint32_t k) +{ + if (AF_INET6 == k) + return (FIB_PROTOCOL_IP6); + return (FIB_PROTOCOL_IP4); +} + +static void +lcp_router_mk_addr (const struct nl_addr *rna, ip_address_t *ia) +{ + fib_protocol_t fproto; + + ip_address_reset (ia); + fproto = lcp_router_proto_k2f (nl_addr_get_family (rna)); + + ip_address_set (ia, nl_addr_get_binary_addr (rna), + FIB_PROTOCOL_IP4 == fproto ? AF_IP4 : AF_IP6); +} + +static fib_protocol_t +lcp_router_mk_addr46 (const struct nl_addr *rna, ip46_address_t *ia) +{ + fib_protocol_t fproto; + + fproto = lcp_router_proto_k2f (nl_addr_get_family (rna)); + ip46_address_reset (ia); + if (FIB_PROTOCOL_IP4 == fproto) + memcpy (&ia->ip4, nl_addr_get_binary_addr (rna), nl_addr_get_len (rna)); + else + memcpy (&ia->ip6, nl_addr_get_binary_addr (rna), nl_addr_get_len (rna)); + + return (fproto); +} + +static void +lcp_router_link_addr_add_del (struct rtnl_addr *rla, int is_del) +{ + u32 sw_if_index; + + sw_if_index = lcp_router_intf_h2p (rtnl_addr_get_ifindex (rla)); + + if (~0 != sw_if_index) + { + ip_address_t nh; + + lcp_router_mk_addr (rtnl_addr_get_local (rla), &nh); + + if (AF_IP4 == ip_addr_version (&nh)) + { + ip4_add_del_interface_address ( + vlib_get_main (), sw_if_index, &ip_addr_v4 (&nh), + rtnl_addr_get_prefixlen (rla), is_del); + lcp_router_ip4_mroutes_add_del (sw_if_index, !is_del); + } + else if (AF_IP6 == ip_addr_version (&nh)) + { + if (ip6_address_is_link_local_unicast (&ip_addr_v6 (&nh))) + if (is_del) + ip6_link_disable (sw_if_index); + else + { + ip6_link_enable (sw_if_index, NULL); + ip6_link_set_local_address (sw_if_index, &ip_addr_v6 (&nh)); + } + else + ip6_add_del_interface_address ( + 
vlib_get_main (), sw_if_index, &ip_addr_v6 (&nh), + rtnl_addr_get_prefixlen (rla), is_del); + lcp_router_ip6_mroutes_add_del (sw_if_index, !is_del); + } + + LCP_ROUTER_DBG ("link-addr: %U %U/%d", format_vnet_sw_if_index_name, + vnet_get_main (), sw_if_index, format_ip_address, &nh, + rtnl_addr_get_prefixlen (rla)); + } +} + +static void +lcp_router_link_addr_del (struct rtnl_addr *la) +{ + lcp_router_link_addr_add_del (la, 1); +} + +static void +lcp_router_link_addr_add (struct rtnl_addr *la) +{ + lcp_router_link_addr_add_del (la, 0); +} + +static void +lcp_router_mk_mac_addr (const struct nl_addr *rna, mac_address_t *mac) +{ + mac_address_from_bytes (mac, nl_addr_get_binary_addr (rna)); +} + +static void +lcp_router_neigh_del (struct rtnl_neigh *rn) +{ + u32 sw_if_index; + + sw_if_index = lcp_router_intf_h2p (rtnl_neigh_get_ifindex (rn)); + + if (~0 != sw_if_index) + { + ip_address_t nh; + int rv; + + lcp_router_mk_addr (rtnl_neigh_get_dst (rn), &nh); + + rv = ip_neighbor_del (&nh, sw_if_index); + + if (rv) + { + LCP_ROUTER_ERROR ( + "Failed to delete neighbor: %U %U", format_ip_address, &nh, + format_vnet_sw_if_index_name, vnet_get_main (), sw_if_index); + } + else + { + LCP_ROUTER_DBG ("neighbor del: %U %U", format_ip_address, &nh, + format_vnet_sw_if_index_name, vnet_get_main (), + sw_if_index); + } + } + else + LCP_ROUTER_INFO ("ignore neighbour del on: %d", + rtnl_neigh_get_ifindex (rn)); +} + +#ifndef NUD_VALID +#define NUD_VALID \ + (NUD_PERMANENT | NUD_NOARP | NUD_REACHABLE | NUD_PROBE | NUD_STALE | \ + NUD_DELAY) +#endif + +static void +lcp_router_neigh_add (struct rtnl_neigh *rn) +{ + u32 sw_if_index; + + sw_if_index = lcp_router_intf_h2p (rtnl_neigh_get_ifindex (rn)); + + if (~0 != sw_if_index) + { + struct nl_addr *ll; + ip_address_t nh; + int state; + + lcp_router_mk_addr (rtnl_neigh_get_dst (rn), &nh); + ll = rtnl_neigh_get_lladdr (rn); + state = rtnl_neigh_get_state (rn); + + if (ll && (state & NUD_VALID)) + { + mac_address_t mac; + 
/*
 * Map a kernel routing-table ID to the table ID used on the VPP side.
 *
 * The kernel's 'main' (254) and 'local' (255) tables are both folded
 * into table 0; every other ID is returned unchanged.
 */
static uint32_t
lcp_router_table_k2f (uint32_t k)
{
  /* same values as RT_TABLE_MAIN / RT_TABLE_LOCAL in <linux/rtnetlink.h> */
  enum
  {
    LCP_ROUTER_KERNEL_TABLE_MAIN = 254,
    LCP_ROUTER_KERNEL_TABLE_LOCAL = 255,
  };

  if (LCP_ROUTER_KERNEL_TABLE_MAIN == k || LCP_ROUTER_KERNEL_TABLE_LOCAL == k)
    return 0;

  return k;
}
fib_route_path_t path = { + .frp_proto = DPO_PROTO_IP4, + .frp_addr = zero_addr, + .frp_sw_if_index = ~0, + .frp_fib_index = ~0, + .frp_weight = 1, + .frp_mitf_flags = MFIB_ITF_FLAG_FORWARD, + .frp_flags = FIB_ROUTE_PATH_LOCAL, + }; + int ii; + + for (ii = 0; ii < ARRAY_LEN (ip4_specials); ii++) + { + mfib_table_entry_path_update ( + nlt->nlt_mfib_index, &ip4_specials[ii], MFIB_SOURCE_PLUGIN_LOW, + MFIB_ENTRY_FLAG_NONE, &path); + } + } + else if (FIB_PROTOCOL_IP6 == fproto) + { + const fib_route_path_t path = { + .frp_proto = DPO_PROTO_IP6, + .frp_addr = zero_addr, + .frp_sw_if_index = ~0, + .frp_fib_index = ~0, + .frp_weight = 1, + .frp_mitf_flags = MFIB_ITF_FLAG_FORWARD, + .frp_flags = FIB_ROUTE_PATH_LOCAL, + }; + int ii; + + for (ii = 0; ii < ARRAY_LEN (ip6_specials); ii++) + { + mfib_table_entry_path_update ( + nlt->nlt_mfib_index, &ip6_specials[ii], MFIB_SOURCE_PLUGIN_LOW, + MFIB_ENTRY_FLAG_NONE, &path); + } + } + } + + nlt->nlt_refs++; + + return (nlt); +} + +static void +lcp_router_table_unlock (lcp_router_table_t *nlt) +{ + nlt->nlt_refs--; + + if (0 == nlt->nlt_refs) + { + if (FIB_PROTOCOL_IP4 == nlt->nlt_proto) + { + /* Set the all 1s address in this table to punt */ + fib_table_entry_special_remove (nlt->nlt_fib_index, &pfx_all1s, + lcp_rt_fib_src); + } + + fib_table_unlock (nlt->nlt_fib_index, nlt->nlt_proto, lcp_rt_fib_src); + + hash_unset (lcp_router_table_db[nlt->nlt_proto], nlt->nlt_id); + pool_put (lcp_router_table_pool, nlt); + } +} + +static void +lcp_router_route_mk_prefix (struct rtnl_route *r, fib_prefix_t *p) +{ + const struct nl_addr *addr = rtnl_route_get_dst (r); + + p->fp_len = nl_addr_get_prefixlen (addr); + p->fp_proto = lcp_router_mk_addr46 (addr, &p->fp_addr); +} + +static void +lcp_router_route_mk_mprefix (struct rtnl_route *r, mfib_prefix_t *p) +{ + const struct nl_addr *addr; + + addr = rtnl_route_get_dst (r); + + p->fp_len = nl_addr_get_prefixlen (addr); + p->fp_proto = lcp_router_mk_addr46 (addr, &p->fp_grp_addr); + + addr = 
rtnl_route_get_src (r); + if (addr) + p->fp_proto = lcp_router_mk_addr46 (addr, &p->fp_src_addr); +} + +typedef struct lcp_router_route_path_parse_t_ +{ + fib_route_path_t *paths; + fib_protocol_t route_proto; + bool is_mcast; + fib_route_path_flags_t type_flags; + u8 preference; +} lcp_router_route_path_parse_t; + +static void +lcp_router_route_path_parse (struct rtnl_nexthop *rnh, void *arg) +{ + lcp_router_route_path_parse_t *ctx = arg; + fib_route_path_t *path; + u32 sw_if_index; + + sw_if_index = lcp_router_intf_h2p (rtnl_route_nh_get_ifindex (rnh)); + + if (~0 != sw_if_index) + { + fib_protocol_t fproto; + struct nl_addr *addr; + + vec_add2 (ctx->paths, path, 1); + + path->frp_flags = FIB_ROUTE_PATH_FLAG_NONE | ctx->type_flags; + path->frp_sw_if_index = sw_if_index; + path->frp_weight = rtnl_route_nh_get_weight (rnh); + path->frp_preference = ctx->preference; + + addr = rtnl_route_nh_get_gateway (rnh); + + if (addr) + fproto = lcp_router_mk_addr46 (rtnl_route_nh_get_gateway (rnh), + &path->frp_addr); + else + fproto = ctx->route_proto; + + path->frp_proto = fib_proto_to_dpo (fproto); + + if (ctx->is_mcast) + path->frp_mitf_flags = MFIB_ITF_FLAG_FORWARD; + + LCP_ROUTER_DBG (" path:[%U]", format_fib_route_path, path); + } +} + +/* + * blackhole, unreachable, prohibit will not have a next hop in an + * RTM_NEWROUTE. Add a path for them. 
+ */ +static void +lcp_router_route_path_add_special (struct rtnl_route *rr, + lcp_router_route_path_parse_t *ctx) +{ + fib_route_path_t *path; + + if (rtnl_route_get_type (rr) < RTN_BLACKHOLE) + return; + + /* if it already has a path, it does not need us to add one */ + if (vec_len (ctx->paths) > 0) + return; + + vec_add2 (ctx->paths, path, 1); + + path->frp_flags = FIB_ROUTE_PATH_FLAG_NONE | ctx->type_flags; + path->frp_sw_if_index = ~0; + path->frp_proto = fib_proto_to_dpo (ctx->route_proto); + path->frp_preference = ctx->preference; + + LCP_ROUTER_DBG (" path:[%U]", format_fib_route_path, path); +} + +/* + * Map of supported route types. Some types are omitted: + * RTN_LOCAL - interface address addition creates these automatically + * RTN_BROADCAST - same as RTN_LOCAL + * RTN_UNSPEC, RTN_ANYCAST, RTN_THROW, RTN_NAT, RTN_XRESOLVE - + * There's not a VPP equivalent for these currently. + */ +static const u8 lcp_router_route_type_valid[__RTN_MAX] = { + [RTN_UNICAST] = 1, [RTN_MULTICAST] = 1, [RTN_BLACKHOLE] = 1, + [RTN_UNREACHABLE] = 1, [RTN_PROHIBIT] = 1, +}; + +/* Map of fib entry flags by route type */ +static const fib_entry_flag_t lcp_router_route_type_feflags[__RTN_MAX] = { + [RTN_LOCAL] = FIB_ENTRY_FLAG_LOCAL | FIB_ENTRY_FLAG_CONNECTED, + [RTN_BROADCAST] = FIB_ENTRY_FLAG_DROP | FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT, + [RTN_BLACKHOLE] = FIB_ENTRY_FLAG_DROP, +}; + +/* Map of fib route path flags by route type */ +static const fib_route_path_flags_t + lcp_router_route_type_frpflags[__RTN_MAX] = { + [RTN_UNREACHABLE] = FIB_ROUTE_PATH_ICMP_UNREACH, + [RTN_PROHIBIT] = FIB_ROUTE_PATH_ICMP_PROHIBIT, + [RTN_BLACKHOLE] = FIB_ROUTE_PATH_DROP, + }; + +static inline fib_source_t +lcp_router_proto_fib_source (u8 rt_proto) +{ + return (rt_proto <= RTPROT_STATIC) ? 
lcp_rt_fib_src : lcp_rt_fib_src_dynamic; +} + +static fib_entry_flag_t +lcp_router_route_mk_entry_flags (uint8_t rtype, int table_id, uint8_t rproto) +{ + fib_entry_flag_t fef = FIB_ENTRY_FLAG_NONE; + + fef |= lcp_router_route_type_feflags[rtype]; + if ((rproto == RTPROT_KERNEL) || PREDICT_FALSE (255 == table_id)) + /* kernel proto is interface prefixes, 255 is linux's 'local' table */ + fef |= FIB_ENTRY_FLAG_ATTACHED | FIB_ENTRY_FLAG_CONNECTED; + + return (fef); +} + +static void +lcp_router_route_del (struct rtnl_route *rr) +{ + fib_entry_flag_t entry_flags; + uint32_t table_id; + fib_prefix_t pfx; + lcp_router_table_t *nlt; + uint8_t rtype, rproto; + + rtype = rtnl_route_get_type (rr); + table_id = rtnl_route_get_table (rr); + rproto = rtnl_route_get_protocol (rr); + + /* skip unsupported route types and local table */ + if (!lcp_router_route_type_valid[rtype] || (table_id == 255)) + return; + + lcp_router_route_mk_prefix (rr, &pfx); + entry_flags = lcp_router_route_mk_entry_flags (rtype, table_id, rproto); + nlt = lcp_router_table_find (lcp_router_table_k2f (table_id), pfx.fp_proto); + + LCP_ROUTER_DBG ("route del: %d:%U %U", rtnl_route_get_table (rr), + format_fib_prefix, &pfx, format_fib_entry_flags, + entry_flags); + + if (NULL == nlt) + return; + + lcp_router_route_path_parse_t np = { + .route_proto = pfx.fp_proto, + .type_flags = lcp_router_route_type_frpflags[rtype], + }; + + rtnl_route_foreach_nexthop (rr, lcp_router_route_path_parse, &np); + lcp_router_route_path_add_special (rr, &np); + + if (0 != vec_len (np.paths)) + { + fib_source_t fib_src; + + fib_src = lcp_router_proto_fib_source (rproto); + + if (pfx.fp_proto == FIB_PROTOCOL_IP6) + fib_table_entry_delete (nlt->nlt_fib_index, &pfx, fib_src); + else + fib_table_entry_path_remove2 (nlt->nlt_fib_index, &pfx, fib_src, + np.paths); + } + + vec_free (np.paths); + + lcp_router_table_unlock (nlt); +} + +static void +lcp_router_route_add (struct rtnl_route *rr) +{ + fib_entry_flag_t entry_flags; + 
uint32_t table_id; + fib_prefix_t pfx; + lcp_router_table_t *nlt; + uint8_t rtype, rproto; + + rtype = rtnl_route_get_type (rr); + table_id = rtnl_route_get_table (rr); + rproto = rtnl_route_get_protocol (rr); + + /* skip unsupported route types and local table */ + if (!lcp_router_route_type_valid[rtype] || (table_id == 255)) + return; + + lcp_router_route_mk_prefix (rr, &pfx); + entry_flags = lcp_router_route_mk_entry_flags (rtype, table_id, rproto); + + /* link local IPv6 */ + if (FIB_PROTOCOL_IP6 == pfx.fp_proto && + (ip6_address_is_multicast (&pfx.fp_addr.ip6) || + ip6_address_is_link_local_unicast (&pfx.fp_addr.ip6))) + { + LCP_ROUTER_DBG ("route skip: %d:%U %U", rtnl_route_get_table (rr), + format_fib_prefix, &pfx, format_fib_entry_flags, + entry_flags); + } + else + { + LCP_ROUTER_DBG ("route add: %d:%U %U", rtnl_route_get_table (rr), + format_fib_prefix, &pfx, format_fib_entry_flags, + entry_flags); + + lcp_router_route_path_parse_t np = { + .route_proto = pfx.fp_proto, + .is_mcast = (rtype == RTN_MULTICAST), + .type_flags = lcp_router_route_type_frpflags[rtype], + .preference = (u8) rtnl_route_get_priority (rr), + }; + + rtnl_route_foreach_nexthop (rr, lcp_router_route_path_parse, &np); + lcp_router_route_path_add_special (rr, &np); + + if (0 != vec_len (np.paths)) + { + nlt = lcp_router_table_add_or_lock (table_id, pfx.fp_proto); + if (rtype == RTN_MULTICAST) + { + /* it's not clear to me how linux expresses the RPF paramters + * so we'll allow from all interfaces and hope for the best */ + mfib_prefix_t mpfx = {}; + + lcp_router_route_mk_mprefix (rr, &mpfx); + + mfib_table_entry_update ( + nlt->nlt_mfib_index, &mpfx, MFIB_SOURCE_PLUGIN_LOW, + MFIB_RPF_ID_NONE, MFIB_ENTRY_FLAG_ACCEPT_ALL_ITF); + + mfib_table_entry_paths_update (nlt->nlt_mfib_index, &mpfx, + MFIB_SOURCE_PLUGIN_LOW, + MFIB_ENTRY_FLAG_NONE, np.paths); + } + else + { + fib_source_t fib_src; + + fib_src = lcp_router_proto_fib_source (rproto); + + if (pfx.fp_proto == FIB_PROTOCOL_IP6) + 
fib_table_entry_path_add2 (nlt->nlt_fib_index, &pfx, fib_src, + entry_flags, np.paths); + else + fib_table_entry_update (nlt->nlt_fib_index, &pfx, fib_src, + entry_flags, np.paths); + } + } + else + LCP_ROUTER_DBG ("no paths for route add: %d:%U %U", + rtnl_route_get_table (rr), format_fib_prefix, &pfx, + format_fib_entry_flags, entry_flags); + vec_free (np.paths); + } +} + +const nl_vft_t lcp_router_vft = { + .nvl_rt_link_add = { .is_mp_safe = 0, .cb = lcp_router_link_add }, + .nvl_rt_link_del = { .is_mp_safe = 0, .cb = lcp_router_link_del }, + .nvl_rt_addr_add = { .is_mp_safe = 0, .cb = lcp_router_link_addr_add }, + .nvl_rt_addr_del = { .is_mp_safe = 0, .cb = lcp_router_link_addr_del }, + .nvl_rt_neigh_add = { .is_mp_safe = 0, .cb = lcp_router_neigh_add }, + .nvl_rt_neigh_del = { .is_mp_safe = 0, .cb = lcp_router_neigh_del }, + .nvl_rt_route_add = { .is_mp_safe = 1, .cb = lcp_router_route_add }, + .nvl_rt_route_del = { .is_mp_safe = 1, .cb = lcp_router_route_del }, +}; + +static clib_error_t * +lcp_router_init (vlib_main_t *vm) +{ + lcp_router_logger = vlib_log_register_class ("linux-cp", "router"); + + nl_register_vft (&lcp_router_vft); + + /* + * allocate 2 route sources. The low priority source will be for + * dynamic routes. If a dynamic route daemon (FRR) tries to remove its + * route, it will use the low priority source to ensure it will not + * remove static routes which were added with the higher priority source. + */ + lcp_rt_fib_src = + fib_source_allocate ("lcp-rt", FIB_SOURCE_PRIORITY_HI, FIB_SOURCE_BH_API); + + lcp_rt_fib_src_dynamic = fib_source_allocate ( + "lcp-rt-dynamic", FIB_SOURCE_PRIORITY_HI + 1, FIB_SOURCE_BH_API); + + return (NULL); +} + +VLIB_INIT_FUNCTION (lcp_router_init) = { + .runs_before = VLIB_INITS ("lcp_nl_init"), +}; + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */