From 650223c0ae8fe781aea5f26c92a6cf9bc2ca59e5 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 14 Nov 2018 16:55:53 +0100 Subject: [PATCH] ethernet-input optimizations Change-Id: I4ec7750ef58363bd8966a16a2baeec6db18b7e9e Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/init.c | 8 +- src/plugins/dpdk/device/node.c | 291 +++++++--------------- src/vnet/ethernet/ethernet.h | 15 ++ src/vnet/ethernet/interface.c | 1 + src/vnet/ethernet/node.c | 539 +++++++++++++++++++++++++++++++++++++++-- src/vnet/pg/input.c | 20 +- 6 files changed, 637 insertions(+), 237 deletions(-) diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index e8718f73ab9..c2e4d220ae2 100644 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -1824,10 +1824,10 @@ dpdk_init (vlib_main_t * vm) vec_add1 (dm->conf->eal_init_args, (u8 *) "vnet"); /* Default vlib_buffer_t flags, DISABLES tcp/udp checksumming... */ - dm->buffer_flags_template = - (VLIB_BUFFER_TOTAL_LENGTH_VALID | VLIB_BUFFER_EXT_HDR_VALID - | VNET_BUFFER_F_L4_CHECKSUM_COMPUTED | - VNET_BUFFER_F_L4_CHECKSUM_CORRECT | VNET_BUFFER_F_L2_HDR_OFFSET_VALID); + dm->buffer_flags_template = (VLIB_BUFFER_TOTAL_LENGTH_VALID | + VLIB_BUFFER_EXT_HDR_VALID | + VNET_BUFFER_F_L4_CHECKSUM_COMPUTED | + VNET_BUFFER_F_L4_CHECKSUM_CORRECT); dm->stat_poll_interval = DPDK_STATS_POLL_INTERVAL; dm->link_state_poll_interval = DPDK_LINK_POLL_INTERVAL; diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index 6a3c1c936fc..a0144f702c6 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -34,46 +34,13 @@ static char *dpdk_error_strings[] = { #undef _ }; -STATIC_ASSERT (VNET_DEVICE_INPUT_NEXT_IP4_INPUT - 1 == - VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT, - "IP4_INPUT must follow IP4_NCS_INPUT"); - -enum -{ - DPDK_RX_F_CKSUM_GOOD = 7, - DPDK_RX_F_CKSUM_BAD = 4, - DPDK_RX_F_FDIR = 2, -}; - -/* currently we are just copying bit positions from DPDK, but that - might 
change in future, in case we start to be interested in something - stored in upper bytes. Currently we store only lower byte for perf reasons */ -STATIC_ASSERT (1 << DPDK_RX_F_CKSUM_GOOD == PKT_RX_IP_CKSUM_GOOD, ""); -STATIC_ASSERT (1 << DPDK_RX_F_CKSUM_BAD == PKT_RX_IP_CKSUM_BAD, ""); -STATIC_ASSERT (1 << DPDK_RX_F_FDIR == PKT_RX_FDIR, ""); -STATIC_ASSERT ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_FDIR) < +/* make sure all flags we need are stored in lower 8 bits */ +STATIC_ASSERT ((PKT_RX_IP_CKSUM_BAD | PKT_RX_FDIR) < 256, "dpdk flags not un lower byte, fix needed"); -always_inline u32 -dpdk_rx_next (vlib_node_runtime_t * node, u16 etype, u8 flags) -{ - if (PREDICT_TRUE (etype == clib_host_to_net_u16 (ETHERNET_TYPE_IP4))) - { - /* keep it branchless */ - u32 is_good = (flags >> DPDK_RX_F_CKSUM_GOOD) & 1; - return VNET_DEVICE_INPUT_NEXT_IP4_INPUT - is_good; - } - else if (PREDICT_TRUE (etype == clib_host_to_net_u16 (ETHERNET_TYPE_IP6))) - return VNET_DEVICE_INPUT_NEXT_IP6_INPUT; - else if (PREDICT_TRUE (etype == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS))) - return VNET_DEVICE_INPUT_NEXT_MPLS_INPUT; - else - return VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; -} - static_always_inline uword dpdk_process_subseq_segs (vlib_main_t * vm, vlib_buffer_t * b, - struct rte_mbuf * mb, vlib_buffer_free_list_t * fl) + struct rte_mbuf *mb, vlib_buffer_free_list_t * fl) { u8 nb_seg = 1; struct rte_mbuf *mb_seg = 0; @@ -140,20 +107,6 @@ dpdk_prefetch_buffer_x4 (struct rte_mbuf *mb[]) CLIB_PREFETCH (b, CLIB_CACHE_LINE_BYTES, LOAD); } -static_always_inline void -dpdk_prefetch_buffer_data_x4 (struct rte_mbuf *mb[]) -{ - vlib_buffer_t *b; - b = vlib_buffer_from_rte_mbuf (mb[0]); - CLIB_PREFETCH (b->data, CLIB_CACHE_LINE_BYTES, LOAD); - b = vlib_buffer_from_rte_mbuf (mb[1]); - CLIB_PREFETCH (b->data, CLIB_CACHE_LINE_BYTES, LOAD); - b = vlib_buffer_from_rte_mbuf (mb[2]); - CLIB_PREFETCH (b->data, CLIB_CACHE_LINE_BYTES, LOAD); - b = vlib_buffer_from_rte_mbuf (mb[3]); - 
CLIB_PREFETCH (b->data, CLIB_CACHE_LINE_BYTES, LOAD); -} - /** \brief Main DPDK input node @node dpdk-input @@ -217,20 +170,16 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd, vlib_buffer_free_list_t *fl; struct rte_mbuf **mb = ptd->mbufs; uword n_bytes = 0; - i16 off; u8 *flags, or_flags = 0; - u16 *next; - fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + if (maybe_multiseg) + fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); mb = ptd->mbufs; flags = ptd->flags; - next = ptd->next; while (n_left >= 8) { - CLIB_PREFETCH (mb + 8, CLIB_CACHE_LINE_BYTES, LOAD); - dpdk_prefetch_buffer_x4 (mb + 4); b[0] = vlib_buffer_from_rte_mbuf (mb[0]); @@ -245,42 +194,17 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd, or_flags |= dpdk_ol_flags_extract (mb, flags, 4); flags += 4; - /* we temporary store relative offset of ethertype into next[x] - so we can prefetch and get it faster later */ - - off = mb[0]->data_off; - next[0] = off + STRUCT_OFFSET_OF (ethernet_header_t, type); - off -= RTE_PKTMBUF_HEADROOM; - vnet_buffer (b[0])->l2_hdr_offset = off; - b[0]->current_data = off; - - off = mb[1]->data_off; - next[1] = off + STRUCT_OFFSET_OF (ethernet_header_t, type); - off -= RTE_PKTMBUF_HEADROOM; - vnet_buffer (b[1])->l2_hdr_offset = off; - b[1]->current_data = off; - - off = mb[2]->data_off; - next[2] = off + STRUCT_OFFSET_OF (ethernet_header_t, type); - off -= RTE_PKTMBUF_HEADROOM; - vnet_buffer (b[2])->l2_hdr_offset = off; - b[2]->current_data = off; - - off = mb[3]->data_off; - next[3] = off + STRUCT_OFFSET_OF (ethernet_header_t, type); - off -= RTE_PKTMBUF_HEADROOM; - vnet_buffer (b[3])->l2_hdr_offset = off; - b[3]->current_data = off; - - b[0]->current_length = mb[0]->data_len; - b[1]->current_length = mb[1]->data_len; - b[2]->current_length = mb[2]->data_len; - b[3]->current_length = mb[3]->data_len; - - n_bytes += mb[0]->data_len; - n_bytes += mb[1]->data_len; - n_bytes += 
mb[2]->data_len; - n_bytes += mb[3]->data_len; + b[0]->current_data = mb[0]->data_off - RTE_PKTMBUF_HEADROOM; + n_bytes += b[0]->current_length = mb[0]->data_len; + + b[1]->current_data = mb[1]->data_off - RTE_PKTMBUF_HEADROOM; + n_bytes += b[1]->current_length = mb[1]->data_len; + + b[2]->current_data = mb[2]->data_off - RTE_PKTMBUF_HEADROOM; + n_bytes += b[2]->current_length = mb[2]->data_len; + + b[3]->current_data = mb[3]->data_off - RTE_PKTMBUF_HEADROOM; + n_bytes += b[3]->current_length = mb[3]->data_len; if (maybe_multiseg) { @@ -298,7 +222,6 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd, /* next */ mb += 4; n_left -= 4; - next += 4; } while (n_left) @@ -308,13 +231,9 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd, or_flags |= dpdk_ol_flags_extract (mb, flags, 1); flags += 1; - off = mb[0]->data_off; - next[0] = off + STRUCT_OFFSET_OF (ethernet_header_t, type); - off -= RTE_PKTMBUF_HEADROOM; - vnet_buffer (b[0])->l2_hdr_offset = off; - b[0]->current_data = off; - b[0]->current_length = mb[0]->data_len; - n_bytes += mb[0]->data_len; + b[0]->current_data = mb[0]->data_off - RTE_PKTMBUF_HEADROOM; + n_bytes += b[0]->current_length = mb[0]->data_len; + if (maybe_multiseg) n_bytes += dpdk_process_subseq_segs (vm, b[0], mb[0], fl); VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]); @@ -322,79 +241,12 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd, /* next */ mb += 1; n_left -= 1; - next += 1; } *or_flagsp = or_flags; return n_bytes; } -static_always_inline void -dpdk_set_next_from_etype (vlib_main_t * vm, vlib_node_runtime_t * node, - dpdk_per_thread_data_t * ptd, uword n_rx_packets) -{ - vlib_buffer_t *b[4]; - i16 adv[4]; - u16 etype[4]; - struct rte_mbuf **mb = ptd->mbufs; - u8 *flags = ptd->flags; - u16 *next = ptd->next; - u32 n_left = n_rx_packets; - - while (n_left >= 12) - { - dpdk_prefetch_buffer_data_x4 (mb + 8); - dpdk_prefetch_buffer_x4 (mb + 8); - - b[0] = vlib_buffer_from_rte_mbuf 
(mb[0]); - b[1] = vlib_buffer_from_rte_mbuf (mb[1]); - b[2] = vlib_buffer_from_rte_mbuf (mb[2]); - b[3] = vlib_buffer_from_rte_mbuf (mb[3]); - etype[0] = *(u16 *) ((u8 *) mb[0] + next[0] + sizeof (vlib_buffer_t)); - etype[1] = *(u16 *) ((u8 *) mb[1] + next[1] + sizeof (vlib_buffer_t)); - etype[2] = *(u16 *) ((u8 *) mb[2] + next[2] + sizeof (vlib_buffer_t)); - etype[3] = *(u16 *) ((u8 *) mb[3] + next[3] + sizeof (vlib_buffer_t)); - next[0] = dpdk_rx_next (node, etype[0], flags[0]); - next[1] = dpdk_rx_next (node, etype[1], flags[1]); - next[2] = dpdk_rx_next (node, etype[2], flags[2]); - next[3] = dpdk_rx_next (node, etype[3], flags[3]); - adv[0] = device_input_next_node_advance[next[0]]; - adv[1] = device_input_next_node_advance[next[1]]; - adv[2] = device_input_next_node_advance[next[2]]; - adv[3] = device_input_next_node_advance[next[3]]; - b[0]->current_data += adv[0]; - b[1]->current_data += adv[1]; - b[2]->current_data += adv[2]; - b[3]->current_data += adv[3]; - b[0]->current_length -= adv[0]; - b[1]->current_length -= adv[1]; - b[2]->current_length -= adv[2]; - b[3]->current_length -= adv[3]; - - /* next */ - next += 4; - mb += 4; - n_left -= 4; - flags += 4; - } - - while (n_left) - { - b[0] = vlib_buffer_from_rte_mbuf (mb[0]); - next[0] = *(u16 *) ((u8 *) mb[0] + next[0] + sizeof (vlib_buffer_t)); - next[0] = dpdk_rx_next (node, next[0], flags[0]); - adv[0] = device_input_next_node_advance[next[0]]; - b[0]->current_data += adv[0]; - b[0]->current_length -= adv[0]; - - /* next */ - next += 1; - mb += 1; - n_left -= 1; - flags += 1; - } -} - static_always_inline void dpdk_process_flow_offload (dpdk_device_t * xd, dpdk_per_thread_data_t * ptd, uword n_rx_packets) @@ -406,7 +258,7 @@ dpdk_process_flow_offload (dpdk_device_t * xd, dpdk_per_thread_data_t * ptd, /* TODO prefetch and quad-loop */ for (n = 0; n < n_rx_packets; n++) { - if ((ptd->flags[n] & (1 << DPDK_RX_F_FDIR)) == 0) + if ((ptd->flags[n] & PKT_RX_FDIR) == 0) continue; fle =
pool_elt_at_index (xd->flow_lookup_entries, @@ -439,10 +291,10 @@ dpdk_device_input (vlib_main_t * vm, dpdk_main_t * dm, dpdk_device_t * xd, u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; struct rte_mbuf **mb; vlib_buffer_t *b0; - int known_next = 0; u16 *next; u8 or_flags; u32 n; + int single_next = 0; dpdk_per_thread_data_t *ptd = vec_elt_at_index (dm->per_thread_data, thread_index); @@ -472,77 +324,96 @@ dpdk_device_input (vlib_main_t * vm, dpdk_main_t * dm, dpdk_device_t * xd, /* as DPDK is allocating empty buffers from mempool provided before interface start for each queue, it is safe to store this in the template */ bt->buffer_pool_index = xd->buffer_pool_for_queue[queue_id]; + vnet_buffer (bt)->feature_arc_index = 0; + bt->current_config_index = 0; /* receive burst of packets from DPDK PMD */ if (PREDICT_FALSE (xd->per_interface_next_index != ~0)) - { - known_next = 1; - next_index = xd->per_interface_next_index; - } + next_index = xd->per_interface_next_index; /* as all packets belong to the same interface feature arc lookup can be don once and result stored in the buffer template */ if (PREDICT_FALSE (vnet_device_input_have_features (xd->sw_if_index))) - { - vnet_feature_start_device_input_x1 (xd->sw_if_index, &next_index, bt); - known_next = 1; - } + vnet_feature_start_device_input_x1 (xd->sw_if_index, &next_index, bt); if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG) n_rx_bytes = dpdk_process_rx_burst (vm, ptd, n_rx_packets, 1, &or_flags); else n_rx_bytes = dpdk_process_rx_burst (vm, ptd, n_rx_packets, 0, &or_flags); - if (PREDICT_FALSE (known_next)) + if (PREDICT_FALSE (or_flags & PKT_RX_FDIR)) { + /* some packets will need to go to different next nodes */ for (n = 0; n < n_rx_packets; n++) ptd->next[n] = next_index; - vnet_buffer (bt)->feature_arc_index = 0; - bt->current_config_index = 0; + /* flow offload - process if rx flow offload enabled and at least one + packet is marked */ + if (PREDICT_FALSE ((xd->flags &
DPDK_DEVICE_FLAG_RX_FLOW_OFFLOAD) && + (or_flags & PKT_RX_FDIR))) + dpdk_process_flow_offload (xd, ptd, n_rx_packets); + + /* enqueue buffers to the next node */ + vlib_get_buffer_indices_with_offset (vm, (void **) ptd->mbufs, + ptd->buffers, n_rx_packets, + sizeof (struct rte_mbuf)); + + vlib_buffer_enqueue_to_next (vm, node, ptd->buffers, ptd->next, + n_rx_packets); } else - dpdk_set_next_from_etype (vm, node, ptd, n_rx_packets); - - /* flow offload - process if rx flow offload enabled and at least one packet - is marked */ - if (PREDICT_FALSE ((xd->flags & DPDK_DEVICE_FLAG_RX_FLOW_OFFLOAD) && - (or_flags & (1 << DPDK_RX_F_FDIR)))) - dpdk_process_flow_offload (xd, ptd, n_rx_packets); - - /* is at least one packet marked as ip4 checksum bad? */ - if (PREDICT_FALSE (or_flags & (1 << DPDK_RX_F_CKSUM_BAD))) - for (n = 0; n < n_rx_packets; n++) - { - if ((ptd->flags[n] & (1 << DPDK_RX_F_CKSUM_BAD)) == 0) - continue; - if (ptd->next[n] != VNET_DEVICE_INPUT_NEXT_IP4_INPUT) - continue; - - b0 = vlib_buffer_from_rte_mbuf (ptd->mbufs[n]); - b0->error = node->errors[DPDK_ERROR_IP_CHECKSUM_ERROR]; - ptd->next[n] = VNET_DEVICE_INPUT_NEXT_DROP; - } - - /* enqueue buffers to the next node */ - vlib_get_buffer_indices_with_offset (vm, (void **) ptd->mbufs, ptd->buffers, - n_rx_packets, - sizeof (struct rte_mbuf)); - - vlib_buffer_enqueue_to_next (vm, node, ptd->buffers, ptd->next, - n_rx_packets); + { + u32 *to_next, n_left_to_next; + + vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next); + vlib_get_buffer_indices_with_offset (vm, (void **) ptd->mbufs, to_next, + n_rx_packets, + sizeof (struct rte_mbuf)); + + if (PREDICT_TRUE (next_index == VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT)) + { + vlib_next_frame_t *nf; + vlib_frame_t *f; + ethernet_input_frame_t *ef; + nf = vlib_node_runtime_get_next_frame (vm, node, next_index); + f = vlib_get_frame (vm, nf->frame_index); + f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX; + + ef = vlib_frame_scalar_args (f); +
ef->sw_if_index = xd->sw_if_index; + ef->hw_if_index = xd->hw_if_index; + + /* if PMD supports ip4 checksum check and there are no packets + marked as ip4 checksum bad we can notify ethernet input so it + can send packets to ip4-input-no-checksum node */ + if (xd->flags & DPDK_DEVICE_FLAG_RX_IP4_CKSUM && + (or_flags & PKT_RX_IP_CKSUM_BAD) == 0) + f->flags |= ETH_INPUT_FRAME_F_IP4_CKSUM_OK; + } + n_left_to_next -= n_rx_packets; + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + single_next = 1; + } /* packet trace if enabled */ if (PREDICT_FALSE ((n_trace = vlib_get_trace_count (vm, node)))) { + if (single_next) + vlib_get_buffer_indices_with_offset (vm, (void **) ptd->mbufs, + ptd->buffers, n_rx_packets, + sizeof (struct rte_mbuf)); + n_left = n_rx_packets; buffers = ptd->buffers; mb = ptd->mbufs; next = ptd->next; + while (n_trace && n_left) { b0 = vlib_get_buffer (vm, buffers[0]); - vlib_trace_buffer (vm, node, next[0], b0, /* follow_chain */ 0); + if (single_next == 0) + next_index = next[0]; + vlib_trace_buffer (vm, node, next_index, b0, /* follow_chain */ 0); dpdk_rx_trace_t *t0 = vlib_add_trace (vm, node, b0, sizeof t0[0]); t0->queue_index = queue_id; diff --git a/src/vnet/ethernet/ethernet.h b/src/vnet/ethernet/ethernet.h index ece7aa58217..7435599c347 100644 --- a/src/vnet/ethernet/ethernet.h +++ b/src/vnet/ethernet/ethernet.h @@ -45,6 +45,20 @@ #include #include +/* ethernet-input frame flags and scalar data */ + +/* all packets in frame share same sw_if_index */ +#define ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX (1 << 0) + +/* all ip4 packets in frame have correct ip4 checksum */ +#define ETH_INPUT_FRAME_F_IP4_CKSUM_OK (1 << 1) + +typedef struct +{ + u32 sw_if_index; + u32 hw_if_index; +} ethernet_input_frame_t; + always_inline u64 ethernet_mac_address_u64 (const u8 * a) { @@ -154,6 +168,7 @@ typedef u32 (ethernet_flag_change_function_t) /* Ethernet interface instance.
*/ typedef struct ethernet_interface { + u32 flags; /* Accept all packets (promiscuous mode). */ #define ETHERNET_INTERFACE_FLAG_ACCEPT_ALL (1 << 0) diff --git a/src/vnet/ethernet/interface.c b/src/vnet/ethernet/interface.c index c66fa8232cf..f1e6785cc6b 100644 --- a/src/vnet/ethernet/interface.c +++ b/src/vnet/ethernet/interface.c @@ -379,6 +379,7 @@ ethernet_set_flags (vnet_main_t * vnm, u32 hw_if_index, u32 flags) ASSERT (hi->hw_class_index == ethernet_hw_interface_class.index); ei = pool_elt_at_index (em->interfaces, hi->hw_instance); + ei->flags = flags; if (ei->flag_change) return ei->flag_change (vnm, hi, flags); return (u32) ~ 0; diff --git a/src/vnet/ethernet/node.c b/src/vnet/ethernet/node.c index 53d5b4eb02d..c39c6d7e0c7 100755 --- a/src/vnet/ethernet/node.c +++ b/src/vnet/ethernet/node.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Cisco and/or its affiliates. + * Copyright (c) 2018 Cisco and/or its affiliates. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at: @@ -49,7 +49,9 @@ #define foreach_ethernet_input_next \ _ (PUNT, "error-punt") \ _ (DROP, "error-drop") \ - _ (LLC, "llc-input") + _ (LLC, "llc-input") \ + _ (IP4_INPUT, "ip4-input") \ + _ (IP4_INPUT_NCS, "ip4-input-no-checksum") typedef enum { @@ -62,6 +64,8 @@ typedef enum typedef struct { u8 packet_data[32]; + u16 frame_flags; + ethernet_input_frame_t frame_data; } ethernet_input_trace_t; static u8 * @@ -70,7 +74,16 @@ format_ethernet_input_trace (u8 * s, va_list * va) CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); ethernet_input_trace_t *t = va_arg (*va, ethernet_input_trace_t *); + u32 indent = format_get_indent (s); + if (t->frame_flags) + { + s = format (s, "frame: flags 0x%x", t->frame_flags); + if (t->frame_flags & ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX) + s = format (s, ", hw-if-index %u, sw-if-index %u", + t->frame_data.hw_if_index, t->frame_data.sw_if_index); + s = format (s, "\n%U", format_white_space, indent); + } s = format (s, "%U", format_ethernet_header, t->packet_data); return s; @@ -289,16 +302,401 @@ determine_next_node (ethernet_main_t * em, } } -static_always_inline uword +typedef enum +{ + ETYPE_ID_UNKNOWN = 0, + ETYPE_ID_IP4, + ETYPE_ID_IP6, + ETYPE_ID_MPLS, + ETYPE_N_IDS, +} etype_id_t; + +static_always_inline void +eth_input_advance_and_flags (vlib_main_t * vm, u32 * from, u32 n_left, + i16 advance, u32 and_flags, u32 or_flags) +{ + vlib_buffer_t *b[8]; + while (n_left >= 8) + { + vlib_get_buffers (vm, from, b, 8); + vlib_buffer_advance (b[0], advance); + vlib_buffer_advance (b[1], advance); + vlib_buffer_advance (b[2], advance); + vlib_buffer_advance (b[3], advance); + vlib_buffer_advance (b[4], advance); + vlib_buffer_advance (b[5], advance); + vlib_buffer_advance (b[6], advance); + vlib_buffer_advance (b[7], advance); + b[0]->flags = (b[0]->flags & and_flags) | or_flags; + b[1]->flags = (b[1]->flags & and_flags) | 
or_flags; + b[2]->flags = (b[2]->flags & and_flags) | or_flags; + b[3]->flags = (b[3]->flags & and_flags) | or_flags; + b[4]->flags = (b[4]->flags & and_flags) | or_flags; + b[5]->flags = (b[5]->flags & and_flags) | or_flags; + b[6]->flags = (b[6]->flags & and_flags) | or_flags; + b[7]->flags = (b[7]->flags & and_flags) | or_flags; + + n_left -= 8; + from += 8; + } + while (n_left) + { + vlib_get_buffers (vm, from, b, 1); + vlib_buffer_advance (b[0], advance); + b[0]->flags = (b[0]->flags & and_flags) | or_flags; + + n_left -= 1; + from += 1; + } +} + +typedef struct +{ + u16 etypes[VLIB_FRAME_SIZE]; + u32 bufs_by_etype[ETYPE_N_IDS][VLIB_FRAME_SIZE]; + u16 n_bufs_by_etype[ETYPE_N_IDS]; +} eth_input_data_t; + +/* following vector code relies on following assumptions */ +STATIC_ASSERT_OFFSET_OF (vlib_buffer_t, current_data, 0); +STATIC_ASSERT_OFFSET_OF (vlib_buffer_t, current_length, 2); +STATIC_ASSERT_OFFSET_OF (vlib_buffer_t, flags, 4); +STATIC_ASSERT (STRUCT_OFFSET_OF (vnet_buffer_opaque_t, l2_hdr_offset) == + STRUCT_OFFSET_OF (vnet_buffer_opaque_t, l3_hdr_offset) - 2, + "l3_hdr_offset must follow l2_hdr_offset"); + +static_always_inline void +eth_input_adv_and_flags_x4 (vlib_buffer_t ** b, i16 adv, u32 flags, int is_l3) +{ +#ifdef CLIB_HAVE_VEC256 + /* to reduce number of small loads/stores we are loading first 64 bits + of each buffer metadata into 256-bit register so we can advance + current_data, current_length and flags. 
+ Observed saving of this code is ~2 clocks per packet */ + u64x4 r, radv; + + /* vector of signed 16 bit integers used in signed vector add operation + to advance current_data and current_length */ + u32x8 flags4 = { 0, flags, 0, flags, 0, flags, 0, flags }; + i16x16 adv4 = { + adv, -adv, 0, 0, adv, -adv, 0, 0, + adv, -adv, 0, 0, adv, -adv, 0, 0 + }; + + /* load 4 x 64 bits */ + r = u64x4_gather (b[0], b[1], b[2], b[3]); + + /* set flags */ + r |= (u64x4) flags4; + + /* advance buffer */ + radv = (u64x4) ((i16x16) r + adv4); + + /* write 4 x 64 bits */ + u64x4_scatter (is_l3 ? radv : r, b[0], b[1], b[2], b[3]); + + /* use old current_data as l2_hdr_offset and new current_data as + l3_hdr_offset */ + r = (u64x4) u16x16_blend (r, radv << 16, 0xaa); + + /* store both l2_hdr_offset and l3_hdr_offset in single store operation */ + u32x8_scatter_one ((u32x8) r, 0, &vnet_buffer (b[0])->l2_hdr_offset); + u32x8_scatter_one ((u32x8) r, 2, &vnet_buffer (b[1])->l2_hdr_offset); + u32x8_scatter_one ((u32x8) r, 4, &vnet_buffer (b[2])->l2_hdr_offset); + u32x8_scatter_one ((u32x8) r, 6, &vnet_buffer (b[3])->l2_hdr_offset); + + ASSERT (b[0]->current_data == (is_l3 ? vnet_buffer (b[0])->l3_hdr_offset : vnet_buffer (b[0])->l2_hdr_offset)); + ASSERT (b[1]->current_data == (is_l3 ? vnet_buffer (b[1])->l3_hdr_offset : vnet_buffer (b[1])->l2_hdr_offset)); + ASSERT (b[2]->current_data == (is_l3 ? vnet_buffer (b[2])->l3_hdr_offset : vnet_buffer (b[2])->l2_hdr_offset)); + ASSERT (b[3]->current_data == (is_l3 ? vnet_buffer (b[3])->l3_hdr_offset : vnet_buffer (b[3])->l2_hdr_offset)); + + ASSERT (vnet_buffer (b[0])->l3_hdr_offset - vnet_buffer (b[0])->l2_hdr_offset == adv); + ASSERT (vnet_buffer (b[1])->l3_hdr_offset - vnet_buffer (b[1])->l2_hdr_offset == adv); + ASSERT (vnet_buffer (b[2])->l3_hdr_offset - vnet_buffer (b[2])->l2_hdr_offset == adv); + ASSERT (vnet_buffer (b[3])->l3_hdr_offset - vnet_buffer (b[3])->l2_hdr_offset == adv); + +#else + vnet_buffer (b[0])->l2_hdr_offset = b[0]->current_data; + vnet_buffer (b[1])->l2_hdr_offset = b[1]->current_data; + vnet_buffer (b[2])->l2_hdr_offset = b[2]->current_data; + vnet_buffer (b[3])->l2_hdr_offset = b[3]->current_data; + vnet_buffer (b[0])->l3_hdr_offset = b[0]->current_data +
adv; + vnet_buffer (b[1])->l3_hdr_offset = b[1]->current_data + adv; + vnet_buffer (b[2])->l3_hdr_offset = b[2]->current_data + adv; + vnet_buffer (b[3])->l3_hdr_offset = b[3]->current_data + adv; + + if (is_l3) + { + vlib_buffer_advance (b[0], adv); + vlib_buffer_advance (b[1], adv); + vlib_buffer_advance (b[2], adv); + vlib_buffer_advance (b[3], adv); + } + + b[0]->flags |= flags; + b[1]->flags |= flags; + b[2]->flags |= flags; + b[3]->flags |= flags; +#endif + + if (!is_l3) + { + vnet_buffer (b[0])->l2.l2_len = adv; + vnet_buffer (b[1])->l2.l2_len = adv; + vnet_buffer (b[2])->l2.l2_len = adv; + vnet_buffer (b[3])->l2.l2_len = adv; + } +} + +static_always_inline void +eth_input_adv_and_flags_x1 (vlib_buffer_t ** b, i16 adv, u32 flags, int is_l3) +{ + vnet_buffer (b[0])->l2_hdr_offset = b[0]->current_data; + vnet_buffer (b[0])->l3_hdr_offset = b[0]->current_data + adv; + + if (is_l3) + vlib_buffer_advance (b[0], adv); + b[0]->flags |= flags; + if (!is_l3) + vnet_buffer (b[0])->l2.l2_len = adv; +} + +static_always_inline void +eth_input_process_frame (vlib_main_t * vm, u32 * from, u16 * etype, + u32 n_left, int is_l3) +{ + vlib_buffer_t *b[16]; + ethernet_header_t *e; + int adv = sizeof (ethernet_header_t); + + u32 flags = VNET_BUFFER_F_L2_HDR_OFFSET_VALID | + VNET_BUFFER_F_L3_HDR_OFFSET_VALID; + + while (n_left >= 16) + { + vlib_buffer_t **ph = b + 12, **pd = b + 8; + vlib_get_buffers (vm, from, b, 4); + vlib_get_buffers (vm, from + 8, b + 8, 8); + + vlib_prefetch_buffer_header (ph[0], LOAD); + vlib_prefetch_buffer_data (pd[0], LOAD); + e = vlib_buffer_get_current (b[0]); + etype[0] = e->type; + + vlib_prefetch_buffer_header (ph[1], LOAD); + vlib_prefetch_buffer_data (pd[1], LOAD); + e = vlib_buffer_get_current (b[1]); + etype[1] = e->type; + + vlib_prefetch_buffer_header (ph[2], LOAD); + vlib_prefetch_buffer_data (pd[2], LOAD); + e = vlib_buffer_get_current (b[2]); + etype[2] = e->type; + + vlib_prefetch_buffer_header (ph[3], LOAD); + vlib_prefetch_buffer_data 
(pd[3], LOAD); + e = vlib_buffer_get_current (b[3]); + etype[3] = e->type; + + eth_input_adv_and_flags_x4 (b, adv, flags, is_l3); + + /* next */ + n_left -= 4; + etype += 4; + from += 4; + } + while (n_left >= 4) + { + vlib_get_buffers (vm, from, b, 4); + + e = vlib_buffer_get_current (b[0]); + etype[0] = e->type; + + e = vlib_buffer_get_current (b[1]); + etype[1] = e->type; + + e = vlib_buffer_get_current (b[2]); + etype[2] = e->type; + + e = vlib_buffer_get_current (b[3]); + etype[3] = e->type; + + eth_input_adv_and_flags_x4 (b, adv, flags, is_l3); + + /* next */ + n_left -= 4; + etype += 4; + from += 4; + } + while (n_left) + { + vlib_get_buffers (vm, from, b, 1); + + e = vlib_buffer_get_current (b[0]); + etype[0] = e->type; + + eth_input_adv_and_flags_x1 (b, adv, flags, is_l3); + + /* next */ + n_left -= 1; + etype += 1; + from += 1; + } +} + +static_always_inline void +eth_input_sort (vlib_main_t * vm, u32 * from, u32 n_packets, + eth_input_data_t * d) +{ + u16 *etype = d->etypes; + i32 n_left = n_packets; + +#if defined (CLIB_HAVE_VEC256) + u16x16 e16; + u16x16 et16_ip4 = u16x16_splat (clib_host_to_net_u16 (ETHERNET_TYPE_IP4)); + u16x16 et16_ip6 = u16x16_splat (clib_host_to_net_u16 (ETHERNET_TYPE_IP6)); + u16x16 et16_mpls = u16x16_splat (clib_host_to_net_u16 (ETHERNET_TYPE_MPLS)); + u16x16 id16_ip4 = u16x16_splat (ETYPE_ID_IP4); + u16x16 id16_ip6 = u16x16_splat (ETYPE_ID_IP6); + u16x16 id16_mpls = u16x16_splat (ETYPE_ID_MPLS); + + while (n_left > 0) + { + u16x16 r = { 0 }; + e16 = u16x16_load_unaligned (etype); + r += (e16 == et16_ip4) & id16_ip4; + r += (e16 == et16_ip6) & id16_ip6; + r += (e16 == et16_mpls) & id16_mpls; + u16x16_store_unaligned (r, etype); + etype += 16; + n_left -= 16; + } +#elif defined (CLIB_HAVE_VEC128) + u16x8 e8; + u16x8 et8_ip4 = u16x8_splat (clib_host_to_net_u16 (ETHERNET_TYPE_IP4)); + u16x8 et8_ip6 = u16x8_splat (clib_host_to_net_u16 (ETHERNET_TYPE_IP6)); + u16x8 et8_mpls = u16x8_splat (clib_host_to_net_u16 (ETHERNET_TYPE_MPLS)); + 
u16x8 id8_ip4 = u16x8_splat (ETYPE_ID_IP4); + u16x8 id8_ip6 = u16x8_splat (ETYPE_ID_IP6); + u16x8 id8_mpls = u16x8_splat (ETYPE_ID_MPLS); + + while (n_left > 0) + { + u16x8 r = { 0 }; + e8 = u16x8_load_unaligned (etype); + r += (e8 == et8_ip4) & id8_ip4; + r += (e8 == et8_ip6) & id8_ip6; + r += (e8 == et8_mpls) & id8_mpls; + u16x8_store_unaligned (r, etype); + etype += 8; + n_left -= 8; + } +#else + while (n_left) + { + if (etype[0] == clib_host_to_net_u16 (ETHERNET_TYPE_IP4)) + etype[0] = ETYPE_ID_IP4; + else if (etype[0] == clib_host_to_net_u16 (ETHERNET_TYPE_IP6)) + etype[0] = ETYPE_ID_IP6; + else if (etype[0] == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS)) + etype[0] = ETYPE_ID_MPLS; + else + etype[0] = ETYPE_ID_UNKNOWN; + + etype += 1; + n_left -= 1; + } +#endif + + etype = d->etypes; + n_left = n_packets; + + clib_memset_u16 (d->n_bufs_by_etype, 0, ETYPE_N_IDS); + while (n_left) + { + u16 x, y; + x = etype[0]; + y = d->n_bufs_by_etype[x]; + +#ifdef CLIB_HAVE_VEC256 + if (n_left >= 16 && u16x16_is_all_equal (u16x16_load_unaligned (etype), + etype[0])) + { + clib_memcpy_fast (&d->bufs_by_etype[x][y], from, 16 * sizeof (u32)); + d->n_bufs_by_etype[x] += 16; + + /* next */ + n_left -= 16; + etype += 16; + from += 16; + continue; + } +#endif +#ifdef CLIB_HAVE_VEC128 + if (n_left >= 8 && u16x8_is_all_equal (u16x8_load_unaligned (etype), + etype[0])) + { + clib_memcpy_fast (&d->bufs_by_etype[x][y], from, 8 * sizeof (u32)); + d->n_bufs_by_etype[x] += 8; + + /* next */ + n_left -= 8; + etype += 8; + from += 8; + continue; + } +#endif + d->bufs_by_etype[x][y] = from[0]; + d->n_bufs_by_etype[x]++; + + /* next */ + n_left -= 1; + etype += 1; + from += 1; + } +} + +static_always_inline void +ethernet_input_trace (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + u32 *from, n_left; + if ((node->flags & VLIB_NODE_FLAG_TRACE) == 0) + return; + + from = vlib_frame_vector_args (from_frame); + n_left = from_frame->n_vectors; + + while (n_left) + { + ethernet_input_trace_t *t0; + vlib_buffer_t *b0 = vlib_get_buffer
(vm, from[0]); + + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + t0 = vlib_add_trace (vm, node, b0, sizeof (ethernet_input_trace_t)); + clib_memcpy_fast (t0->packet_data, b0->data + b0->current_data, + sizeof (t0->packet_data)); + t0->frame_flags = from_frame->flags; + clib_memcpy_fast (&t0->frame_data, + vlib_frame_scalar_args (from_frame), + sizeof (ethernet_input_frame_t)); + } + from += 1; + n_left -= 1; + } +} + +static_always_inline void ethernet_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, - vlib_frame_t * from_frame, + u32 * from, u32 n_packets, ethernet_input_variant_t variant) { vnet_main_t *vnm = vnet_get_main (); ethernet_main_t *em = &ethernet_main; vlib_node_runtime_t *error_node; - u32 n_left_from, next_index, *from, *to_next; + u32 n_left_from, next_index, *to_next; u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; u32 thread_index = vm->thread_index; u32 cached_sw_if_index = ~0; @@ -310,15 +708,7 @@ ethernet_input_inline (vlib_main_t * vm, else error_node = node; - from = vlib_frame_vector_args (from_frame); - n_left_from = from_frame->n_vectors; - - if (node->flags & VLIB_NODE_FLAG_TRACE) - vlib_trace_frame_buffers_only (vm, node, - from, - n_left_from, - sizeof (from[0]), - sizeof (ethernet_input_trace_t)); + n_left_from = n_packets; next_index = node->cached_next_index; stats_sw_if_index = node->runtime_data[0]; @@ -764,32 +1154,136 @@ ethernet_input_inline (vlib_main_t * vm, thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } +} - return from_frame->n_vectors; +static_always_inline void +eth_input_enqueue_untagged (vlib_main_t * vm, vlib_node_runtime_t * node, + eth_input_data_t * d, int ip4_cksum_ok, int is_l3) +{ + ethernet_main_t *em = &ethernet_main; + etype_id_t id; + u32 next_index; + + id = ETYPE_ID_IP4; + if (d->n_bufs_by_etype[id]) + { + if (is_l3) + { + next_index = em->l3_next.input_next_ip4; + if (next_index == ETHERNET_INPUT_NEXT_IP4_INPUT && ip4_cksum_ok) +
next_index = ETHERNET_INPUT_NEXT_IP4_INPUT_NCS; + } + else + next_index = em->l2_next; + + vlib_buffer_enqueue_to_single_next (vm, node, d->bufs_by_etype[id], + next_index, d->n_bufs_by_etype[id]); + } + + id = ETYPE_ID_IP6; + if (d->n_bufs_by_etype[id]) + { + next_index = is_l3 ? em->l3_next.input_next_ip6 : em->l2_next; + vlib_buffer_enqueue_to_single_next (vm, node, d->bufs_by_etype[id], + next_index, d->n_bufs_by_etype[id]); + } + + id = ETYPE_ID_MPLS; + if (d->n_bufs_by_etype[id]) + { + next_index = is_l3 ? em->l3_next.input_next_mpls : em->l2_next; + vlib_buffer_enqueue_to_single_next (vm, node, d->bufs_by_etype[id], + next_index, d->n_bufs_by_etype[id]); + } + + id = ETYPE_ID_UNKNOWN; + if (d->n_bufs_by_etype[id]) + { + /* in case of l3 interfaces, we already advanced buffer so we need to + roll back */ + if (is_l3) + eth_input_advance_and_flags (vm, d->bufs_by_etype[id], + d->n_bufs_by_etype[id], + -(i16) sizeof (ethernet_header_t), + ~VNET_BUFFER_F_L3_HDR_OFFSET_VALID, 0); + ethernet_input_inline (vm, node, d->bufs_by_etype[id], + d->n_bufs_by_etype[id], + ETHERNET_INPUT_VARIANT_ETHERNET); + } } VLIB_NODE_FN (ethernet_input_node) (vlib_main_t * vm, vlib_node_runtime_t * node, - vlib_frame_t * from_frame) + vlib_frame_t * frame) { - return ethernet_input_inline (vm, node, from_frame, - ETHERNET_INPUT_VARIANT_ETHERNET); + vnet_main_t *vnm = vnet_get_main (); + ethernet_main_t *em = &ethernet_main; + u32 *from = vlib_frame_vector_args (frame); + u32 n_packets = frame->n_vectors; + + ethernet_input_trace (vm, node, frame); + + if (frame->flags & ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX) + { + eth_input_data_t data, *d = &data; + ethernet_input_frame_t *ef = vlib_frame_scalar_args (frame); + vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, ef->hw_if_index); + main_intf_t *intf0 = vec_elt_at_index (em->main_intfs, hi->hw_if_index); + subint_config_t *subint0 = &intf0->untagged_subint; + int ip4_cksum_ok = (frame->flags & ETH_INPUT_FRAME_F_IP4_CKSUM_OK) != 0; + + if
(subint0->flags & SUBINT_CONFIG_L2) + { + /* untagged packets are treated as L2 */ + eth_input_process_frame (vm, from, d->etypes, n_packets, 0); + eth_input_sort (vm, from, n_packets, d); + eth_input_enqueue_untagged (vm, node, d, ip4_cksum_ok, 0); + } + else + { + ethernet_interface_t *ei; + ei = pool_elt_at_index (em->interfaces, hi->hw_instance); + + /* currently only slowpath deals with dmac check */ + if (ei->flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL) + goto slowpath; + + /* untagged packets are treated as L3 */ + eth_input_process_frame (vm, from, d->etypes, n_packets, 1); + eth_input_sort (vm, from, n_packets, d); + eth_input_enqueue_untagged (vm, node, d, ip4_cksum_ok, 1); + } + return n_packets; + } + +slowpath: + ethernet_input_inline (vm, node, from, n_packets, + ETHERNET_INPUT_VARIANT_ETHERNET); + return n_packets; } VLIB_NODE_FN (ethernet_input_type_node) (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame) { - return ethernet_input_inline (vm, node, from_frame, - ETHERNET_INPUT_VARIANT_ETHERNET_TYPE); + u32 *from = vlib_frame_vector_args (from_frame); + u32 n_packets = from_frame->n_vectors; + ethernet_input_trace (vm, node, from_frame); + ethernet_input_inline (vm, node, from, n_packets, + ETHERNET_INPUT_VARIANT_ETHERNET_TYPE); + return n_packets; } VLIB_NODE_FN (ethernet_input_not_l2_node) (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame) { - return ethernet_input_inline (vm, node, from_frame, - ETHERNET_INPUT_VARIANT_NOT_L2); + u32 *from = vlib_frame_vector_args (from_frame); + u32 n_packets = from_frame->n_vectors; + ethernet_input_trace (vm, node, from_frame); + ethernet_input_inline (vm, node, from, n_packets, + ETHERNET_INPUT_VARIANT_NOT_L2); + return n_packets; } @@ -1159,6 +1653,7 @@ VLIB_REGISTER_NODE (ethernet_input_node) = { .name = "ethernet-input", /* Takes a vector of packets. 
*/ .vector_size = sizeof (u32), + .scalar_size = sizeof (ethernet_input_frame_t), .n_errors = ETHERNET_N_ERROR, .error_strings = ethernet_error_strings, .n_next_nodes = ETHERNET_INPUT_N_NEXT, diff --git a/src/vnet/pg/input.c b/src/vnet/pg/input.c index 8b46688929f..ee6aad4995e 100644 --- a/src/vnet/pg/input.c +++ b/src/vnet/pg/input.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -1496,7 +1497,24 @@ pg_generate_packets (vlib_node_runtime_t * node, { u32 *head, *start, *end; - vlib_get_next_frame (vm, node, next_index, to_next, n_left); + if (PREDICT_TRUE (next_index == VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT)) + { + vlib_next_frame_t *nf; + vlib_frame_t *f; + ethernet_input_frame_t *ef; + pg_interface_t *pi; + vlib_get_new_next_frame (vm, node, next_index, to_next, n_left); + nf = vlib_node_runtime_get_next_frame (vm, node, next_index); + f = vlib_get_frame (vm, nf->frame_index); + f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX; + + ef = vlib_frame_scalar_args (f); + pi = pool_elt_at_index (pg->interfaces, s->pg_if_index); + ef->sw_if_index = pi->sw_if_index; + ef->hw_if_index = pi->hw_if_index; + } + else + vlib_get_next_frame (vm, node, next_index, to_next, n_left); n_this_frame = n_packets_to_generate; if (n_this_frame > n_left) -- 2.16.6