From 302db471a01b5c21b80e9bc769315cef4da0ed80 Mon Sep 17 00:00:00 2001
From: Vladislav Grishenko
Date: Wed, 24 Jan 2024 16:17:23 +0500
Subject: [PATCH] mpls: fix default mpls lb hash config

In case of multiple paths within a tunnel, the mpls lookup node
computes the lb hash with mpls_compute_flow_hash config value 0, so
only the mpls label and l4 ports get accounted, not the full 5-tuple.
This leads to flow traffic polarization and imbalance over mpls paths.
Use the mpls hash config from the lb instead; usually it will be
MPLS_FLOW_HASH_DEFAULT with 5-tuple plus flow label.

As an optimization, reuse the flow hash from the previous lookup node
if present, as ip_lookup does; previously, mpls lookup always computed
the hash. Test lb distribution for both cases.

Also, use the same flow hash hex format in ip4/ip6 and mpls traces for
easier reading; most of the code changes are due to fixstyle
formatting.

Type: fix
Signed-off-by: Vladislav Grishenko
Change-Id: Ib89e1ab3edec14269866fe825a3e887d6c817b7c
---
 src/vnet/ip/ip4_forward.c   |   8 +-
 src/vnet/ip/ip6_forward.c   |   3 +-
 src/vnet/mpls/mpls_lookup.c | 235 ++++++++++++++++++++++----------------------
 test/test_mpls.py           | 107 ++++++++++++++++++--
 4 files changed, 226 insertions(+), 127 deletions(-)

diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c
index e85c888f669..ff74b52eb18 100644
--- a/src/vnet/ip/ip4_forward.c
+++ b/src/vnet/ip/ip4_forward.c
@@ -1190,9 +1190,11 @@ format_ip4_forward_next_trace (u8 * s, va_list * args)
   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
   u32 indent = format_get_indent (s);
-  s = format (s, "%U%U",
-              format_white_space, indent,
-              format_ip4_header, t->packet_data, sizeof (t->packet_data));
+
+  s = format (s, "%Ufib:%d adj:%d flow:0x%08x", format_white_space, indent,
+              t->fib_index, t->dpo_index, t->flow_hash);
+  s = format (s, "\n%U%U", format_white_space, indent, format_ip4_header,
+              t->packet_data, sizeof (t->packet_data));
   return s;
 }
 #endif
diff --git a/src/vnet/ip/ip6_forward.c b/src/vnet/ip/ip6_forward.c
index 06c473b1495..48fb633fd32 100644
--- a/src/vnet/ip/ip6_forward.c
+++ b/src/vnet/ip/ip6_forward.c
@@ -948,8 +948,7 @@ format_ip6_forward_next_trace (u8 * s, va_list * args)
   ip6_forward_next_trace_t *t = va_arg (*args, ip6_forward_next_trace_t *);
   u32 indent = format_get_indent (s);
 
-  s = format (s, "%Ufib:%d adj:%d flow:%d",
-              format_white_space, indent,
+  s = format (s, "%Ufib:%d adj:%d flow:0x%08x", format_white_space, indent,
               t->fib_index, t->adj_index, t->flow_hash);
   s = format (s, "\n%U%U",
               format_white_space, indent,
diff --git a/src/vnet/mpls/mpls_lookup.c b/src/vnet/mpls/mpls_lookup.c
index db423392c03..a5ac56534a5 100644
--- a/src/vnet/mpls/mpls_lookup.c
+++ b/src/vnet/mpls/mpls_lookup.c
@@ -44,13 +44,13 @@ format_mpls_lookup_trace (u8 * s, va_list * args)
   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
   mpls_lookup_trace_t * t = va_arg (*args, mpls_lookup_trace_t *);
 
-  s = format (s, "MPLS: next [%d], lookup fib index %d, LB index %d hash %x "
-              "label %d eos %d",
-              t->next_index, t->lfib_index, t->lb_index, t->hash,
-              vnet_mpls_uc_get_label(
-                  clib_net_to_host_u32(t->label_net_byte_order)),
-              vnet_mpls_uc_get_s(
-                  clib_net_to_host_u32(t->label_net_byte_order)));
+  s = format (
+    s,
+    "MPLS: next [%d], lookup fib index %d, LB index %d hash 0x%08x "
+    "label %d eos %d",
+    t->next_index, t->lfib_index, t->lb_index, t->hash,
+    vnet_mpls_uc_get_label (clib_net_to_host_u32 (t->label_net_byte_order)),
+    vnet_mpls_uc_get_s (clib_net_to_host_u32 (t->label_net_byte_order)));
   return s;
 }
 
@@ -482,8 +482,8 @@ format_mpls_load_balance_trace (u8 * s, va_list * args)
   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
   mpls_load_balance_trace_t * t = va_arg (*args, mpls_load_balance_trace_t *);
 
-  s = format (s, "MPLS: next [%d], LB index %d hash %d",
-              t->next_index, t->lb_index, t->hash);
+  s = format (s, "MPLS: next [%d], LB index %d hash 0x%08x", t->next_index,
+              t->lb_index, t->hash);
   return s;
 }
 
@@ -553,75 +553,77 @@ VLIB_NODE_FN (mpls_load_balance_node) (vlib_main_t * vm,
            * We don't want to use the same hash value at each level in the recursion
            * graph as that would lead to polarisation
            */
-          hc0 = vnet_buffer (p0)->ip.flow_hash = 0;
-          hc1 = vnet_buffer (p1)->ip.flow_hash = 0;
-
-          if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
-            {
-              if (PREDICT_TRUE (vnet_buffer(p0)->ip.flow_hash))
-                {
-                  hc0 = vnet_buffer(p0)->ip.flow_hash = vnet_buffer(p0)->ip.flow_hash >> 1;
-                }
-              else
-                {
-                  hc0 = vnet_buffer(p0)->ip.flow_hash = mpls_compute_flow_hash(mpls0, hc0);
-                }
-              dpo0 = load_balance_get_fwd_bucket(lb0, (hc0 & lb0->lb_n_buckets_minus_1));
-            }
-          else
-            {
-              dpo0 = load_balance_get_bucket_i (lb0, 0);
-            }
-          if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
-            {
-              if (PREDICT_TRUE (vnet_buffer(p1)->ip.flow_hash))
-                {
-                  hc1 = vnet_buffer(p1)->ip.flow_hash = vnet_buffer(p1)->ip.flow_hash >> 1;
-                }
-              else
-                {
-                  hc1 = vnet_buffer(p1)->ip.flow_hash = mpls_compute_flow_hash(mpls1, hc1);
-                }
-              dpo1 = load_balance_get_fwd_bucket(lb1, (hc1 & lb1->lb_n_buckets_minus_1));
-            }
-          else
-            {
-              dpo1 = load_balance_get_bucket_i (lb1, 0);
-            }
-
-          next0 = dpo0->dpoi_next_node;
-          next1 = dpo1->dpoi_next_node;
-
-          vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
-          vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
-
-          vlib_increment_combined_counter
-              (cm, thread_index, lbi0, 1,
-               vlib_buffer_length_in_chain (vm, p0));
-          vlib_increment_combined_counter
-              (cm, thread_index, lbi1, 1,
-               vlib_buffer_length_in_chain (vm, p1));
-
-          if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED))
-            {
-              mpls_load_balance_trace_t *tr = vlib_add_trace (vm, node,
-                                                              p0, sizeof (*tr));
-              tr->next_index = next0;
-              tr->lb_index = lbi0;
-              tr->hash = hc0;
-            }
-          if (PREDICT_FALSE(p1->flags & VLIB_BUFFER_IS_TRACED))
-            {
-              mpls_load_balance_trace_t *tr = vlib_add_trace (vm, node,
-                                                              p1, sizeof (*tr));
-              tr->next_index = next1;
-              tr->lb_index = lbi1;
-              tr->hash = hc1;
-            }
-
-          vlib_validate_buffer_enqueue_x2 (vm, node, next,
-                                           to_next, n_left_to_next,
-                                           pi0, pi1, next0, next1);
+          hc0 = hc1 = 0;
+
+          if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
+            {
+              if (PREDICT_TRUE (vnet_buffer (p0)->ip.flow_hash))
+                {
+                  hc0 = vnet_buffer (p0)->ip.flow_hash =
+                          vnet_buffer (p0)->ip.flow_hash >> 1;
+                }
+              else
+                {
+                  hc0 = vnet_buffer (p0)->ip.flow_hash =
+                          mpls_compute_flow_hash (mpls0, lb0->lb_hash_config);
+                }
+              dpo0 = load_balance_get_fwd_bucket (
+                lb0, (hc0 & lb0->lb_n_buckets_minus_1));
+            }
+          else
+            {
+              dpo0 = load_balance_get_bucket_i (lb0, 0);
+            }
+          if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
+            {
+              if (PREDICT_TRUE (vnet_buffer (p1)->ip.flow_hash))
+                {
+                  hc1 = vnet_buffer (p1)->ip.flow_hash =
+                          vnet_buffer (p1)->ip.flow_hash >> 1;
+                }
+              else
+                {
+                  hc1 = vnet_buffer (p1)->ip.flow_hash =
+                          mpls_compute_flow_hash (mpls1, lb1->lb_hash_config);
+                }
+              dpo1 = load_balance_get_fwd_bucket (
+                lb1, (hc1 & lb1->lb_n_buckets_minus_1));
+            }
+          else
+            {
+              dpo1 = load_balance_get_bucket_i (lb1, 0);
+            }
+
+          next0 = dpo0->dpoi_next_node;
+          next1 = dpo1->dpoi_next_node;
+
+          vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+          vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
+
+          vlib_increment_combined_counter (
+            cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
+          vlib_increment_combined_counter (
+            cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1));
+
+          if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
+            {
+              mpls_load_balance_trace_t *tr =
+                vlib_add_trace (vm, node, p0, sizeof (*tr));
+              tr->next_index = next0;
+              tr->lb_index = lbi0;
+              tr->hash = hc0;
+            }
+          if (PREDICT_FALSE (p1->flags & VLIB_BUFFER_IS_TRACED))
+            {
+              mpls_load_balance_trace_t *tr =
+                vlib_add_trace (vm, node, p1, sizeof (*tr));
+              tr->next_index = next1;
+              tr->lb_index = lbi1;
+              tr->hash = hc1;
+            }
+
+          vlib_validate_buffer_enqueue_x2 (
+            vm, node, next, to_next, n_left_to_next, pi0, pi1, next0, next1);
         }
 
       while (n_left_from > 0 && n_left_to_next > 0)
@@ -646,44 +648,45 @@ VLIB_NODE_FN (mpls_load_balance_node) (vlib_main_t * vm,
 
           lb0 = load_balance_get(lbi0);
 
-          hc0 = vnet_buffer (p0)->ip.flow_hash = 0;
-          if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
-            {
-              if (PREDICT_TRUE (vnet_buffer(p0)->ip.flow_hash))
-                {
-                  hc0 = vnet_buffer(p0)->ip.flow_hash = vnet_buffer(p0)->ip.flow_hash >> 1;
-                }
-              else
-                {
-                  hc0 = vnet_buffer(p0)->ip.flow_hash = mpls_compute_flow_hash(mpls0, hc0);
-                }
-              dpo0 = load_balance_get_fwd_bucket(lb0, (hc0 & lb0->lb_n_buckets_minus_1));
-            }
-          else
-            {
-              dpo0 = load_balance_get_bucket_i (lb0, 0);
-            }
-
-          next0 = dpo0->dpoi_next_node;
-          vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
-
-          if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED))
-            {
-              mpls_load_balance_trace_t *tr = vlib_add_trace (vm, node,
-                                                              p0, sizeof (*tr));
-              tr->next_index = next0;
-              tr->lb_index = lbi0;
-              tr->hash = hc0;
-            }
-
-          vlib_increment_combined_counter
-              (cm, thread_index, lbi0, 1,
-               vlib_buffer_length_in_chain (vm, p0));
-
-          vlib_validate_buffer_enqueue_x1 (vm, node, next,
-                                           to_next, n_left_to_next,
-                                           pi0, next0);
-        }
+          hc0 = 0;
+          if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
+            {
+              if (PREDICT_TRUE (vnet_buffer (p0)->ip.flow_hash))
+                {
+                  hc0 = vnet_buffer (p0)->ip.flow_hash =
+                          vnet_buffer (p0)->ip.flow_hash >> 1;
+                }
+              else
+                {
+                  hc0 = vnet_buffer (p0)->ip.flow_hash =
+                          mpls_compute_flow_hash (mpls0, lb0->lb_hash_config);
+                }
+              dpo0 = load_balance_get_fwd_bucket (
+                lb0, (hc0 & lb0->lb_n_buckets_minus_1));
+            }
+          else
+            {
+              dpo0 = load_balance_get_bucket_i (lb0, 0);
+            }
+
+          next0 = dpo0->dpoi_next_node;
+          vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+
+          if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
+            {
+              mpls_load_balance_trace_t *tr =
+                vlib_add_trace (vm, node, p0, sizeof (*tr));
+              tr->next_index = next0;
+              tr->lb_index = lbi0;
+              tr->hash = hc0;
+            }
+
+          vlib_increment_combined_counter (
+            cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
+
+          vlib_validate_buffer_enqueue_x1 (vm, node, next, to_next,
+                                           n_left_to_next, pi0, next0);
+        }
 
       vlib_put_next_frame (vm, node, next, n_left_to_next);
     }
diff --git a/test/test_mpls.py b/test/test_mpls.py
index cd44f94d951..9c07251a481 100644
--- a/test/test_mpls.py
+++ b/test/test_mpls.py
@@ -188,16 +188,17 @@ class TestMPLS(VppTestCase):
         return pkts
 
     def create_stream_ip4(
-        self, src_if, dst_ip, ip_ttl=64, ip_dscp=0, payload_size=None
+        self, src_if, dst_ip, ip_ttl=64, ip_dscp=0, payload_size=None, n=257
     ):
         self.reset_packet_infos()
         pkts = []
-        for i in range(0, 257):
+        for i in range(0, n):
+            dst = dst_ip[i % len(dst_ip)] if isinstance(dst_ip, list) else dst_ip
             info = self.create_packet_info(src_if, src_if)
             payload = self.info_to_payload(info)
             p = (
                 Ether(dst=src_if.local_mac, src=src_if.remote_mac)
-                / IP(src=src_if.remote_ip4, dst=dst_ip, ttl=ip_ttl, tos=ip_dscp)
+                / IP(src=src_if.remote_ip4, dst=dst, ttl=ip_ttl, tos=ip_dscp)
                 / UDP(sport=1234, dport=1234)
                 / Raw(payload)
             )
@@ -207,15 +208,16 @@ class TestMPLS(VppTestCase):
             pkts.append(p)
         return pkts
 
-    def create_stream_ip6(self, src_if, dst_ip, ip_ttl=64, ip_dscp=0):
+    def create_stream_ip6(self, src_if, dst_ip, ip_ttl=64, ip_dscp=0, n=257):
         self.reset_packet_infos()
         pkts = []
-        for i in range(0, 257):
+        for i in range(0, n):
+            dst = dst_ip[i % len(dst_ip)] if isinstance(dst_ip, list) else dst_ip
             info = self.create_packet_info(src_if, src_if)
             payload = self.info_to_payload(info)
             p = (
                 Ether(dst=src_if.local_mac, src=src_if.remote_mac)
-                / IPv6(src=src_if.remote_ip6, dst=dst_ip, hlim=ip_ttl, tc=ip_dscp)
+                / IPv6(src=src_if.remote_ip6, dst=dst, hlim=ip_ttl, tc=ip_dscp)
                 / UDP(sport=1234, dport=1234)
                 / Raw(payload)
             )
@@ -1341,6 +1343,99 @@ class TestMPLS(VppTestCase):
             ],
         )
 
+    def test_tunnel_ecmp(self):
+        """MPLS Tunnel Tests - ECMP"""
+
+        #
+        # Create a tunnel with multiple paths and labels
+        #
+        self.pg0.generate_remote_hosts(2)
+        self.pg0.configure_ipv4_neighbors()
+        mpls_tun = VppMPLSTunnelInterface(
+            self,
+            [
+                VppRoutePath(
+                    self.pg0.remote_hosts[0].ip4,
+                    self.pg0.sw_if_index,
+                    labels=[VppMplsLabel(3)],
+                ),
+                VppRoutePath(
+                    self.pg0.remote_hosts[1].ip4,
+                    self.pg0.sw_if_index,
+                    labels=[VppMplsLabel(44)],
+                ),
+            ],
+        )
+        mpls_tun.add_vpp_config()
+        mpls_tun.admin_up()
+
+        self.vapi.cli("clear trace")
+        pkts = self.create_stream_ip4(
+            self.pg0, ["10.0.0.%d" % i for i in range(NUM_PKTS)], n=NUM_PKTS
+        )
+
+        def send_and_expect_mpls_lb(pkts, path_labels, min_ratio):
+            self.pg0.add_stream(pkts)
+
+            self.pg_enable_capture(self.pg_interfaces)
+            self.pg_start()
+
+            rx = self.pg0.get_capture()
+
+            paths = {}
+            for packet in rx:
+                eth = packet[Ether]
+                self.assertEqual(eth.type, 0x8847)
+
+                mpls = packet[MPLS]
+                labels = []
+                while True:
+                    labels.append(mpls.label)
+                    if mpls.s == 1:
+                        break
+                    mpls = mpls[MPLS].payload
+                self.assertIn(labels, path_labels)
+
+                key = "{}-{}".format(eth.dst, "-".join(str(i) for i in labels))
+                paths[key] = paths.get(key, 0) + 1
+
+            #
+            # Check distribution over multiple mpls paths
+            #
+            self.assertEqual(len(paths), len(path_labels))
+            for n in paths.values():
+                self.assertGreaterEqual(n, NUM_PKTS / len(paths) * min_ratio)
+
+        #
+        # Add labelled route through the new tunnel,
+        # traffic should be balanced over all tunnel paths only.
+        #
+        route_10_0_0_0 = VppIpRoute(
+            self,
+            "10.0.0.0",
+            16,
+            [VppRoutePath("0.0.0.0", mpls_tun._sw_if_index, labels=[33])],
+        )
+        route_10_0_0_0.add_vpp_config()
+        send_and_expect_mpls_lb(pkts, [[33], [44, 33]], 0.85)
+
+        #
+        # Add labelled multipath route through the new tunnel,
+        # traffic should be balanced over both paths first and
+        # then over all tunnel paths.
+        #
+        route_10_0_0_0 = VppIpRoute(
+            self,
+            "10.0.0.0",
+            16,
+            [
+                VppRoutePath("0.0.0.1", mpls_tun._sw_if_index, labels=[33]),
+                VppRoutePath("0.0.0.2", mpls_tun._sw_if_index, labels=[34]),
+            ],
+        )
+        route_10_0_0_0.add_vpp_config()
+        send_and_expect_mpls_lb(pkts, [[33], [44, 33], [34], [44, 34]], 0.70)
+
     def test_mpls_tunnel_many(self):
         """MPLS Multiple Tunnels"""
 
-- 
2.16.6
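
The polarization the commit message describes is easy to reproduce outside
VPP. The standalone C sketch below is illustrative only: its mix() function,
field choices, and two-bucket split are assumptions standing in for VPP's
real flow hash, not the mpls_compute_flow_hash implementation. With the
label and L4 ports constant inside one tunnel (the inputs left by config
value 0), every packet hashes to the same bucket; once the varying
addresses are included, as with the 5-tuple MPLS_FLOW_HASH_DEFAULT config,
flows spread across buckets.

#include <stdint.h>
#include <stdio.h>

/* Toy 32-bit mixer standing in for the real flow hash (illustration only). */
static uint32_t
mix (uint32_t h, uint32_t v)
{
  h ^= v;
  h *= 0x9e3779b1u;
  return h ^ (h >> 16);
}

int
main (void)
{
  uint32_t label_ports[2] = { 0, 0 }, five_tuple[2] = { 0, 0 };
  const uint32_t label = 33;                 /* constant inside one tunnel */
  const uint16_t sport = 1234, dport = 1234; /* fixed, as in the test */

  for (uint32_t i = 0; i < 1000; i++)
    {
      uint32_t src = 0x0a000001, dst = 0x0a010000 + i; /* 1000 flows */

      /* config 0: only label + L4 ports -> identical hash, one path wins */
      uint32_t h0 = mix (mix (label, sport), dport);
      label_ports[h0 & 1]++;

      /* 5-tuple config: addresses included -> flows spread over paths */
      uint32_t h1 = mix (mix (mix (mix (label, src), dst), sport), dport);
      five_tuple[h1 & 1]++;
    }

  printf ("label+ports buckets: %u/%u, 5-tuple buckets: %u/%u\n",
          label_ports[0], label_ports[1], five_tuple[0], five_tuple[1]);
  return 0;
}

Compiled with "gcc -O2 demo.c && ./a.out", the first pair lands entirely in
one bucket while the second splits roughly evenly, which is the imbalance
versus distribution the new test_tunnel_ecmp test asserts with its
min_ratio thresholds.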