From ea5b5be4eeb0f4cd80cb466bd6e31cad33c57960 Mon Sep 17 00:00:00 2001 From: Matus Fabian Date: Mon, 3 Sep 2018 05:02:23 -0700 Subject: [PATCH] NAT44: client-IP based session affinity for load-balancing (VPP-1297) Enable client-IP based session affinity per LB NAT rule with specific timeout. Change-Id: I9aade152e330218d21dfda99cc5e984d769ab806 Signed-off-by: Matus Fabian --- src/plugins/nat/CMakeLists.txt | 1 + src/plugins/nat/in2out.c | 20 +-- src/plugins/nat/nat.api | 6 + src/plugins/nat/nat.c | 47 ++++++- src/plugins/nat/nat.h | 20 ++- src/plugins/nat/nat44_cli.c | 15 ++- src/plugins/nat/nat_affinity.c | 269 +++++++++++++++++++++++++++++++++++++++++ src/plugins/nat/nat_affinity.h | 142 ++++++++++++++++++++++ src/plugins/nat/nat_api.c | 3 +- src/plugins/nat/out2in.c | 41 ++++--- test/test_nat.py | 61 ++++++++++ test/vpp_papi_provider.py | 3 + 12 files changed, 588 insertions(+), 40 deletions(-) create mode 100644 src/plugins/nat/nat_affinity.c create mode 100644 src/plugins/nat/nat_affinity.h diff --git a/src/plugins/nat/CMakeLists.txt b/src/plugins/nat/CMakeLists.txt index ef82213412b..20cf0e70a9e 100644 --- a/src/plugins/nat/CMakeLists.txt +++ b/src/plugins/nat/CMakeLists.txt @@ -38,6 +38,7 @@ add_vpp_plugin(nat nat66_cli.c nat66_in2out.c nat66_out2in.c + nat_affinity.c API_FILES nat.api diff --git a/src/plugins/nat/in2out.c b/src/plugins/nat/in2out.c index c900393a3a1..0fe36338397 100755 --- a/src/plugins/nat/in2out.c +++ b/src/plugins/nat/in2out.c @@ -258,7 +258,7 @@ snat_not_translate (snat_main_t * sm, vlib_node_runtime_t *node, &value0)) { /* or is static mappings */ - if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0)) + if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0, 0)) return 0; } else @@ -387,7 +387,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, key1.protocol = key0->protocol; /* First try to match static mapping by local address and port */ - if (snat_static_mapping_match (sm, *key0, &key1, 0, 0, 0, 0)) + if 
(snat_static_mapping_match (sm, *key0, &key1, 0, 0, 0, 0, 0)) { /* Try to create dynamic translation */ if (snat_alloc_outside_address_and_port (sm->addresses, rx_fib_index0, @@ -674,7 +674,7 @@ u32 icmp_match_in2out_fast(snat_main_t *sm, vlib_node_runtime_t *node, } key0.fib_index = rx_fib_index0; - if (snat_static_mapping_match(sm, key0, &sm0, 0, &is_addr_only, 0, 0)) + if (snat_static_mapping_match(sm, key0, &sm0, 0, &is_addr_only, 0, 0, 0)) { if (PREDICT_FALSE(snat_not_translate_fast(sm, node, sw_if_index0, ip0, IP_PROTOCOL_ICMP, rx_fib_index0))) @@ -875,7 +875,7 @@ snat_hairpinning (snat_main_t *sm, kv0.key = key0.as_u64; /* Check if destination is static mappings */ - if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0)) + if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0, 0)) { new_dst_addr0 = sm0.addr.as_u32; new_dst_port0 = sm0.port; @@ -1006,7 +1006,7 @@ snat_icmp_hairpinning (snat_main_t *sm, if (rv) { /* or static mappings */ - if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0)) + if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0, 0)) { new_dst_addr0 = sm0.addr.as_u32; vnet_buffer(b0)->sw_if_index[VLIB_TX] = sm0.fib_index; @@ -2031,7 +2031,7 @@ nat44_reass_hairpinning (snat_main_t *sm, udp0 = ip4_next_header (ip0); /* Check if destination is static mappings */ - if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0)) + if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0, 0)) { new_dst_addr0 = sm0.addr.as_u32; new_dst_port0 = sm0.port; @@ -2535,7 +2535,7 @@ slow_path_ed (snat_main_t *sm, snat_session_t *s; snat_user_t *u; snat_session_key_t key0, key1; - u8 lb = 0, is_sm = 0; + lb_nat_type_t lb = 0, is_sm = 0; u32 address_index = ~0; snat_main_per_thread_data_t *tsm = &sm->per_thread_data[thread_index]; nat_ed_ses_key_t *key = (nat_ed_ses_key_t *) kv->key; @@ -2565,7 +2565,7 @@ slow_path_ed (snat_main_t *sm, key0.fib_index = rx_fib_index; key1.fib_index = sm->outside_fib_index; /* First try to match static mapping 
by local address and port */ - if (snat_static_mapping_match (sm, key0, &key1, 0, 0, 0, &lb)) + if (snat_static_mapping_match (sm, key0, &key1, 0, 0, 0, &lb, 0)) { /* Try to create dynamic translation */ if (snat_alloc_outside_address_and_port (sm->addresses, rx_fib_index, @@ -2691,7 +2691,7 @@ nat44_ed_not_translate (snat_main_t * sm, vlib_node_runtime_t *node, key0.protocol = proto; key0.fib_index = sm->outside_fib_index; /* or is static mappings */ - if (!snat_static_mapping_match(sm, key0, &key1, 1, 0, 0, 0)) + if (!snat_static_mapping_match(sm, key0, &key1, 1, 0, 0, 0, 0)) return 0; } else @@ -5321,7 +5321,7 @@ snat_in2out_fast_static_map_fn (vlib_main_t * vm, key0.port = udp0->src_port; key0.fib_index = rx_fib_index0; - if (snat_static_mapping_match(sm, key0, &sm0, 0, 0, 0, 0)) + if (snat_static_mapping_match(sm, key0, &sm0, 0, 0, 0, 0, 0)) { b0->error = node->errors[SNAT_IN2OUT_ERROR_NO_TRANSLATION]; next0= SNAT_IN2OUT_NEXT_DROP; diff --git a/src/plugins/nat/nat.api b/src/plugins/nat/nat.api index 8e37567a9e8..f1c95b2d65b 100644 --- a/src/plugins/nat/nat.api +++ b/src/plugins/nat/nat.api @@ -668,6 +668,8 @@ typeonly manual_endian define nat44_lb_addr_port { local address of internal host @param out2in_only - if 1 rule match only out2in direction @param tag - opaque string tag + @param affinity - if 0 disabled, otherwise client IP affinity sticky time + in seconds @param local_num - number of local network nodes @param locals - local network nodes */ @@ -682,6 +684,7 @@ autoreply manual_endian define nat44_add_del_lb_static_mapping { u8 self_twice_nat; u8 out2in_only; u8 tag[64]; + u32 affinity; u8 local_num; vl_api_nat44_lb_addr_port_t locals[local_num]; }; @@ -707,6 +710,8 @@ define nat44_lb_static_mapping_dump { local address of internal host @param out2in_only - if 1 rule match only out2in direction @param tag - opaque string tag + @param affinity - if 0 disabled, otherwise client IP affinity sticky time + in seconds @param local_num - number of local 
network nodes @param locals - local network nodes */ @@ -719,6 +724,7 @@ manual_endian define nat44_lb_static_mapping_details { u8 self_twice_nat; u8 out2in_only; u8 tag[64]; + u32 affinity; u8 local_num; vl_api_nat44_lb_addr_port_t locals[local_num]; }; diff --git a/src/plugins/nat/nat.c b/src/plugins/nat/nat.c index 364d5f5ce94..0ce1a60c976 100755 --- a/src/plugins/nat/nat.c +++ b/src/plugins/nat/nat.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -211,6 +212,9 @@ nat_free_session_data (snat_main_t * sm, snat_session_t * s, u32 thread_index) /* session lookup tables */ if (is_ed_session (s)) { + if (is_affinity_sessions (s)) + nat_affinity_unlock (s->ext_host_addr, s->out2in.addr, + s->in2out.protocol, s->out2in.port); ed_key.l_addr = s->out2in.addr; ed_key.r_addr = s->ext_host_addr; ed_key.fib_index = s->out2in.fib_index; @@ -230,7 +234,6 @@ nat_free_session_data (snat_main_t * sm, snat_session_t * s, u32 thread_index) ed_kv.key[1] = ed_key.as_u64[1]; if (clib_bihash_add_del_16_8 (&tsm->out2in_ed, &ed_kv, 0)) nat_log_warn ("out2in_ed key del failed"); - ed_key.l_addr = s->in2out.addr; ed_key.fib_index = s->in2out.fib_index; if (!snat_is_unk_proto_session (s)) @@ -1259,7 +1262,7 @@ int nat44_add_del_lb_static_mapping (ip4_address_t e_addr, u16 e_port, snat_protocol_t proto, nat44_lb_addr_port_t *locals, u8 is_add, twice_nat_type_t twice_nat, u8 out2in_only, - u8 *tag) + u8 *tag, u32 affinity) { snat_main_t * sm = &snat_main; snat_static_mapping_t *m; @@ -1343,6 +1346,13 @@ int nat44_add_del_lb_static_mapping (ip4_address_t e_addr, u16 e_port, m->proto = proto; m->twice_nat = twice_nat; m->out2in_only = out2in_only; + m->affinity = affinity; + + if (affinity) + m->affinity_per_service_list_head_index = + nat_affinity_get_per_service_list_head_index(); + else + m->affinity_per_service_list_head_index = ~0; m_key.addr = m->external_addr; m_key.port = m->external_port; @@ -1499,6 +1509,8 @@ int nat44_add_del_lb_static_mapping 
(ip4_address_t e_addr, u16 e_port, } } } + if (m->affinity) + nat_affinity_flush_service (m->affinity_per_service_list_head_index); vec_free(m->locals); vec_free(m->tag); vec_free(m->workers); @@ -2173,13 +2185,15 @@ int snat_static_mapping_match (snat_main_t * sm, u8 by_external, u8 *is_addr_only, twice_nat_type_t *twice_nat, - u8 *lb) + lb_nat_type_t *lb, + ip4_address_t * ext_host_addr) { clib_bihash_kv_8_8_t kv, value; snat_static_mapping_t *m; snat_session_key_t m_key; clib_bihash_8_8_t *mapping_hash = &sm->static_mapping_by_local; u32 rand, lo = 0, hi, mid; + u8 backend_index; m_key.fib_index = match.fib_index; if (by_external) @@ -2210,6 +2224,19 @@ int snat_static_mapping_match (snat_main_t * sm, { if (vec_len (m->locals)) { + if (PREDICT_FALSE(lb != 0)) + *lb = m->affinity ? AFFINITY_LB_NAT : LB_NAT; + if (m->affinity) + { + if (nat_affinity_find_and_lock (ext_host_addr[0], match.addr, + match.protocol, match.port, &backend_index)) + goto get_local; + + mapping->addr = m->locals[backend_index].addr; + mapping->port = clib_host_to_net_u16 (m->locals[backend_index].port); + mapping->fib_index = m->locals[backend_index].fib_index; + goto end; + } get_local: hi = vec_len (m->locals) - 1; rand = 1 + (random_u32 (&sm->random_seed) % m->locals[hi].prefix); @@ -2231,9 +2258,18 @@ get_local: mapping->addr = m->locals[lo].addr; mapping->port = clib_host_to_net_u16 (m->locals[lo].port); mapping->fib_index = m->locals[lo].fib_index; + if (m->affinity) + { + if (nat_affinity_create_and_lock (ext_host_addr[0], match.addr, + match.protocol, match.port, lo, m->affinity, + m->affinity_per_service_list_head_index)) + nat_log_info ("create affinity record failed"); + } } else { + if (PREDICT_FALSE(lb != 0)) + *lb = NO_LB_NAT; mapping->fib_index = m->fib_index; mapping->addr = m->local_addr; /* Address only mapping doesn't change port */ @@ -2251,15 +2287,13 @@ get_local: mapping->fib_index = sm->outside_fib_index; } +end: if (PREDICT_FALSE(is_addr_only != 0)) *is_addr_only = 
m->addr_only; if (PREDICT_FALSE(twice_nat != 0)) *twice_nat = m->twice_nat; - if (PREDICT_FALSE(lb != 0)) - *lb = vec_len (m->locals) > 0; - return 0; } @@ -2904,6 +2938,7 @@ snat_config (vlib_main_t * vm, unformat_input_t * input) sm->out2in_node_index = nat44_ed_out2in_node.index; sm->icmp_match_in2out_cb = icmp_match_in2out_ed; sm->icmp_match_out2in_cb = icmp_match_out2in_ed; + nat_affinity_init (vm); } else { diff --git a/src/plugins/nat/nat.h b/src/plugins/nat/nat.h index 76f57542d16..660fb4c2965 100644 --- a/src/plugins/nat/nat.h +++ b/src/plugins/nat/nat.h @@ -142,6 +142,7 @@ typedef enum { #define SNAT_SESSION_FLAG_TWICE_NAT 8 #define SNAT_SESSION_FLAG_ENDPOINT_DEPENDENT 16 #define SNAT_SESSION_FLAG_FWD_BYPASS 32 +#define SNAT_SESSION_FLAG_AFFINITY 64 #define NAT_INTERFACE_FLAG_IS_INSIDE 1 #define NAT_INTERFACE_FLAG_IS_OUTSIDE 2 @@ -241,6 +242,12 @@ typedef enum { TWICE_NAT_SELF, } twice_nat_type_t; +typedef enum { + NO_LB_NAT, + LB_NAT, + AFFINITY_LB_NAT, +} lb_nat_type_t; + typedef struct { ip4_address_t local_addr; ip4_address_t external_addr; @@ -252,9 +259,11 @@ typedef struct { u32 vrf_id; u32 fib_index; snat_protocol_t proto; + u32 affinity; u32 *workers; u8 *tag; nat44_lb_addr_port_t *locals; + u32 affinity_per_service_list_head_index; } snat_static_mapping_t; typedef struct { @@ -472,7 +481,8 @@ int snat_static_mapping_match (snat_main_t * sm, u8 by_external, u8 *is_addr_only, twice_nat_type_t *twice_nat, - u8 *lb); + lb_nat_type_t *lb, + ip4_address_t * ext_host_addr); void snat_add_del_addr_to_fib (ip4_address_t * addr, u8 p_len, @@ -526,6 +536,12 @@ typedef struct { */ #define is_ed_session(s) (s->flags & SNAT_SESSION_FLAG_ENDPOINT_DEPENDENT) +/** \brief Check if NAT session has affinity record. 
+ @param s NAT session + @return 1 if NAT session has affinity record +*/ +#define is_affinity_sessions(s) (s->flags & SNAT_SESSION_FLAG_AFFINITY) + #define nat_interface_is_inside(i) i->flags & NAT_INTERFACE_FLAG_IS_INSIDE #define nat_interface_is_outside(i) i->flags & NAT_INTERFACE_FLAG_IS_OUTSIDE @@ -619,7 +635,7 @@ int nat44_add_del_lb_static_mapping (ip4_address_t e_addr, u16 e_port, snat_protocol_t proto, nat44_lb_addr_port_t *locals, u8 is_add, twice_nat_type_t twice_nat, u8 out2in_only, - u8 *tag); + u8 *tag, u32 affinity); int nat44_del_session (snat_main_t *sm, ip4_address_t *addr, u16 port, snat_protocol_t proto, u32 vrf_id, int is_in); int nat44_del_ed_session (snat_main_t *sm, ip4_address_t *addr, u16 port, diff --git a/src/plugins/nat/nat44_cli.c b/src/plugins/nat/nat44_cli.c index e51f6d68511..3847502ca7e 100644 --- a/src/plugins/nat/nat44_cli.c +++ b/src/plugins/nat/nat44_cli.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #define UNSUPPORTED_IN_DET_MODE_STR \ @@ -165,6 +166,7 @@ nat44_show_hash_commnad_fn (vlib_main_t * vm, unformat_input_t * input, { snat_main_t *sm = &snat_main; snat_main_per_thread_data_t *tsm; + nat_affinity_main_t *nam = &nat_affinity_main; int i; int verbose = 0; @@ -198,6 +200,9 @@ nat44_show_hash_commnad_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_output (vm, "%U", format_bihash_8_8, &tsm->user_hash, verbose); } + if (sm->endpoint_dependent) + vlib_cli_output (vm, "%U", format_bihash_16_8, &nam->affinity_hash, + verbose); return 0; } @@ -741,7 +746,7 @@ add_lb_static_mapping_command_fn (vlib_main_t * vm, snat_main_t *sm = &snat_main; clib_error_t *error = 0; ip4_address_t l_addr, e_addr; - u32 l_port = 0, e_port = 0, vrf_id = 0, probability = 0; + u32 l_port = 0, e_port = 0, vrf_id = 0, probability = 0, affinity = 0; int is_add = 1; int rv; snat_protocol_t proto; @@ -793,6 +798,8 @@ add_lb_static_mapping_command_fn (vlib_main_t * vm, out2in_only = 1; else if (unformat (line_input, "del")) 
is_add = 0; + else if (unformat (line_input, "affinity %u", &affinity)) + ; else { error = clib_error_return (0, "unknown input: '%U'", @@ -814,7 +821,8 @@ add_lb_static_mapping_command_fn (vlib_main_t * vm, } rv = nat44_add_del_lb_static_mapping (e_addr, (u16) e_port, proto, locals, - is_add, twice_nat, out2in_only, 0); + is_add, twice_nat, out2in_only, 0, + affinity); switch (rv) { @@ -1788,7 +1796,8 @@ VLIB_CLI_COMMAND (add_lb_static_mapping_command, static) = { .short_help = "nat44 add load-balancing static mapping protocol tcp|udp " "external : local : [vrf ] " - "probability [twice-nat|self-twice-nat] [out2in-only] [del]", + "probability [twice-nat|self-twice-nat] [out2in-only] " + "[affinity ] [del]", }; /*? diff --git a/src/plugins/nat/nat_affinity.c b/src/plugins/nat/nat_affinity.c new file mode 100644 index 00000000000..28c25aecda5 --- /dev/null +++ b/src/plugins/nat/nat_affinity.c @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/** + * @file + * @brief NAT plugin client-IP based session affinity for load-balancing + */ + +#include <nat/nat_affinity.h> +#include <nat/nat.h> + +nat_affinity_main_t nat_affinity_main; + +#define AFFINITY_HASH_BUCKETS 65536 +#define AFFINITY_HASH_MEMORY (2 << 25) + +u8 * +format_affinity_kvp (u8 * s, va_list * args) +{ + clib_bihash_kv_16_8_t *v = va_arg (*args, clib_bihash_kv_16_8_t *); + nat_affinity_key_t k; + + k.as_u64[0] = v->key[0]; + k.as_u64[1] = v->key[1]; + + s = format (s, "client %U backend %U:%d proto %U index %llu", + format_ip4_address, &k.client_addr, + format_ip4_address, &k.service_addr, + clib_net_to_host_u16 (k.service_port), + format_snat_protocol, k.proto, v->value); + + return s; +} + +clib_error_t * +nat_affinity_init (vlib_main_t * vm) +{ + nat_affinity_main_t *nam = &nat_affinity_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); + clib_error_t *error = 0; + + if (tm->n_vlib_mains > 1) + clib_spinlock_init (&nam->affinity_lock); + + clib_bihash_init_16_8 (&nam->affinity_hash, "nat-affinity", + AFFINITY_HASH_BUCKETS, AFFINITY_HASH_MEMORY); + clib_bihash_set_kvp_format_fn_16_8 (&nam->affinity_hash, + format_affinity_kvp); + + nam->vlib_main = vm; + + return error; +} + +static_always_inline void +make_affinity_kv (clib_bihash_kv_16_8_t * kv, ip4_address_t client_addr, + ip4_address_t service_addr, u8 proto, u16 service_port) +{ + nat_affinity_key_t *key = (nat_affinity_key_t *) kv->key; + + key->client_addr = client_addr; + key->service_addr = service_addr; + key->proto = proto; + key->service_port = service_port; + + kv->value = ~0ULL; +} + +u32 +nat_affinity_get_per_service_list_head_index (void) +{ + nat_affinity_main_t *nam = &nat_affinity_main; + dlist_elt_t *head_elt; + + clib_spinlock_lock_if_init (&nam->affinity_lock); + + pool_get (nam->list_pool, head_elt); + clib_dlist_init (nam->list_pool, head_elt - nam->list_pool); + + clib_spinlock_unlock_if_init (&nam->affinity_lock); + + return head_elt - nam->list_pool; +} + +void +nat_affinity_flush_service (u32
affinity_per_service_list_head_index) +{ + nat_affinity_main_t *nam = &nat_affinity_main; + u32 elt_index; + dlist_elt_t *elt; + nat_affinity_t *a; + clib_bihash_kv_16_8_t kv; + + clib_spinlock_lock_if_init (&nam->affinity_lock); + + while ((elt_index = + clib_dlist_remove_head (nam->list_pool, + affinity_per_service_list_head_index)) != + ~0) + { + elt = pool_elt_at_index (nam->list_pool, elt_index); + a = pool_elt_at_index (nam->affinity_pool, elt->value); + kv.key[0] = a->key.as_u64[0]; + kv.key[1] = a->key.as_u64[1]; + pool_put_index (nam->affinity_pool, elt->value); + if (clib_bihash_add_del_16_8 (&nam->affinity_hash, &kv, 0)) + nat_log_warn ("affinity key del failed"); + pool_put_index (nam->list_pool, elt_index); + } + pool_put_index (nam->list_pool, affinity_per_service_list_head_index); + + clib_spinlock_unlock_if_init (&nam->affinity_lock); +} + +int +nat_affinity_find_and_lock (ip4_address_t client_addr, + ip4_address_t service_addr, u8 proto, + u16 service_port, u8 * backend_index) +{ + nat_affinity_main_t *nam = &nat_affinity_main; + clib_bihash_kv_16_8_t kv, value; + nat_affinity_t *a; + int rv = 0; + + make_affinity_kv (&kv, client_addr, service_addr, proto, service_port); + clib_spinlock_lock_if_init (&nam->affinity_lock); + if (clib_bihash_search_16_8 (&nam->affinity_hash, &kv, &value)) + { + rv = 1; + goto unlock; + } + + a = pool_elt_at_index (nam->affinity_pool, value.value); + /* if already expired delete */ + if (a->ref_cnt == 0) + { + if (a->expire < vlib_time_now (nam->vlib_main)) + { + clib_dlist_remove (nam->list_pool, a->per_service_index); + pool_put_index (nam->list_pool, a->per_service_index); + pool_put_index (nam->affinity_pool, value.value); + if (clib_bihash_add_del_16_8 (&nam->affinity_hash, &kv, 0)) + nat_log_warn ("affinity key del failed"); + rv = 1; + goto unlock; + } + } + a->ref_cnt++; + *backend_index = a->backend_index; + +unlock: + clib_spinlock_unlock_if_init (&nam->affinity_lock); + return rv; +} + +static int 
+affinity_is_expired_cb (clib_bihash_kv_16_8_t * kv, void *arg) +{ + nat_affinity_main_t *nam = &nat_affinity_main; + nat_affinity_t *a; + + a = pool_elt_at_index (nam->affinity_pool, kv->value); + if (a->ref_cnt == 0) + { + if (a->expire < vlib_time_now (nam->vlib_main)) + { + clib_dlist_remove (nam->list_pool, a->per_service_index); + pool_put_index (nam->list_pool, a->per_service_index); + pool_put_index (nam->affinity_pool, kv->value); + if (clib_bihash_add_del_16_8 (&nam->affinity_hash, kv, 0)) + nat_log_warn ("affinity key del failed"); + return 1; + } + } + + return 0; +} + +int +nat_affinity_create_and_lock (ip4_address_t client_addr, + ip4_address_t service_addr, u8 proto, + u16 service_port, u8 backend_index, + u32 sticky_time, + u32 affinity_per_service_list_head_index) +{ + nat_affinity_main_t *nam = &nat_affinity_main; + clib_bihash_kv_16_8_t kv, value; + nat_affinity_t *a; + dlist_elt_t *list_elt; + int rv = 0; + + make_affinity_kv (&kv, client_addr, service_addr, proto, service_port); + clib_spinlock_lock_if_init (&nam->affinity_lock); + if (!clib_bihash_search_16_8 (&nam->affinity_hash, &kv, &value)) + { + rv = 1; + nat_log_notice ("affinity key already exist"); + goto unlock; + } + + pool_get (nam->affinity_pool, a); + kv.value = a - nam->affinity_pool; + rv = + clib_bihash_add_or_overwrite_stale_16_8 (&nam->affinity_hash, &kv, + affinity_is_expired_cb, NULL); + if (rv) + { + nat_log_notice ("affinity key add failed"); + pool_put (nam->affinity_pool, a); + goto unlock; + } + + pool_get (nam->list_pool, list_elt); + clib_dlist_init (nam->list_pool, list_elt - nam->list_pool); + list_elt->value = a - nam->affinity_pool; + a->per_service_index = list_elt - nam->list_pool; + a->backend_index = backend_index; + a->ref_cnt = 1; + a->sticky_time = sticky_time; + a->key.as_u64[0] = kv.key[0]; + a->key.as_u64[1] = kv.key[1]; + clib_dlist_addtail (nam->list_pool, affinity_per_service_list_head_index, + a->per_service_index); + +unlock: + 
clib_spinlock_unlock_if_init (&nam->affinity_lock); + return rv; +} + +void +nat_affinity_unlock (ip4_address_t client_addr, ip4_address_t service_addr, + u8 proto, u16 service_port) +{ + nat_affinity_main_t *nam = &nat_affinity_main; + clib_bihash_kv_16_8_t kv, value; + nat_affinity_t *a; + + make_affinity_kv (&kv, client_addr, service_addr, proto, service_port); + clib_spinlock_lock_if_init (&nam->affinity_lock); + if (clib_bihash_search_16_8 (&nam->affinity_hash, &kv, &value)) + goto unlock; + + a = pool_elt_at_index (nam->affinity_pool, value.value); + a->ref_cnt--; + if (a->ref_cnt == 0) + a->expire = (u64) a->sticky_time + vlib_time_now (nam->vlib_main); + +unlock: + clib_spinlock_unlock_if_init (&nam->affinity_lock); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/nat/nat_affinity.h b/src/plugins/nat/nat_affinity.h new file mode 100644 index 00000000000..358e682eb49 --- /dev/null +++ b/src/plugins/nat/nat_affinity.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/** + * @file + * @brief NAT plugin client-IP based session affinity for load-balancing + */ + +#ifndef __included_nat_affinity_h__ +#define __included_nat_affinity_h__ + +#include <vnet/ip/ip.h> +#include <vppinfra/bihash_16_8.h> +#include <vppinfra/dlist.h> + +typedef struct +{ + union + { + struct + { + ip4_address_t service_addr; + ip4_address_t client_addr; + /* align by making this 4 octets even though it's a 1 octet field */ + u32 proto; + /* align by making this 4 octets even though it's a 2 octet field */ + u32 service_port; + }; + u64 as_u64[2]; + }; +} nat_affinity_key_t; + +/* *INDENT-OFF* */ +typedef CLIB_PACKED(struct +{ + nat_affinity_key_t key; + u32 sticky_time; + u32 ref_cnt; + u32 per_service_index; + u8 backend_index; + f64 expire; +}) nat_affinity_t; +/* *INDENT-ON* */ + +typedef struct +{ + nat_affinity_t *affinity_pool; + clib_bihash_16_8_t affinity_hash; + clib_spinlock_t affinity_lock; + dlist_elt_t *list_pool; + vlib_main_t *vlib_main; +} nat_affinity_main_t; + +extern nat_affinity_main_t nat_affinity_main; + +/** + * @brief Get new affinity per service list head index. + * + * @returns new affinity per service list head index. + */ +u32 nat_affinity_get_per_service_list_head_index (void); + +/** + * @brief Flush all service affinity data. + * + * @param affinity_per_service_list_head_index Per service list head index. + */ +void nat_affinity_flush_service (u32 affinity_per_service_list_head_index); + +/** + * @brief Initialize NAT client-IP based affinity. + * + * @param vm vlib main. + * + * @return error code. + */ +clib_error_t *nat_affinity_init (vlib_main_t * vm); + +/** + * @brief Find service backend index for client-IP and take a reference + * counting lock. + * + * @param client_addr Client IP address. + * @param service_addr Service IP address. + * @param proto IP protocol number. + * @param service_port Service L4 port number. + * @param backend_index Service backend index for client-IP if found. + * + * @return 0 on success, non-zero value otherwise.
+ */ +int nat_affinity_find_and_lock (ip4_address_t client_addr, + ip4_address_t service_addr, u8 proto, + u16 service_port, u8 * backend_index); + +/** + * @brief Create affinity record and take reference counting lock. + * @param client_addr Client IP address. + * @param service_addr Service IP address. + * @param proto IP protocol number. + * @param service_port Service L4 port number. + * @param backend_index Service backend index for client-IP. + * @param sticky_time Affinity sticky time in seconds. + * @param affinity_per_service_list_head_index Per service list head index. + * + * @return 0 on success, non-zero value otherwise. + */ +int nat_affinity_create_and_lock (ip4_address_t client_addr, + ip4_address_t service_addr, u8 proto, + u16 service_port, u8 backend_index, + u32 sticky_time, + u32 affinity_per_service_list_head_index); +/** + * @brief Release a reference counting lock for affinity. + * + * @param client_addr Client IP address. + * @param service_addr Service IP address. + * @param proto IP protocol number.
+ */ +void nat_affinity_unlock (ip4_address_t client_addr, + ip4_address_t service_addr, u8 proto, + u16 service_port); + +#endif /* __included_nat_affinity_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/nat/nat_api.c b/src/plugins/nat/nat_api.c index 8055259616f..17009c93561 100644 --- a/src/plugins/nat/nat_api.c +++ b/src/plugins/nat/nat_api.c @@ -1463,7 +1463,8 @@ static void nat44_add_del_lb_static_mapping (e_addr, clib_net_to_host_u16 (mp->external_port), proto, locals, mp->is_add, twice_nat, - mp->out2in_only, tag); + mp->out2in_only, tag, + clib_net_to_host_u32 (mp->affinity)); vec_free (locals); vec_free (tag); diff --git a/src/plugins/nat/out2in.c b/src/plugins/nat/out2in.c index 46a8a1ed078..5029300dcdb 100755 --- a/src/plugins/nat/out2in.c +++ b/src/plugins/nat/out2in.c @@ -367,7 +367,7 @@ u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, { /* Try to match static mapping by external address and port, destination address and port in packet */ - if (snat_static_mapping_match(sm, key0, &sm0, 1, &is_addr_only, 0, 0)) + if (snat_static_mapping_match(sm, key0, &sm0, 1, &is_addr_only, 0, 0, 0)) { if (!sm->forwarding_enabled) { @@ -475,7 +475,7 @@ u32 icmp_match_out2in_fast(snat_main_t *sm, vlib_node_runtime_t *node, } key0.fib_index = rx_fib_index0; - if (snat_static_mapping_match(sm, key0, &sm0, 1, &is_addr_only, 0, 0)) + if (snat_static_mapping_match(sm, key0, &sm0, 1, &is_addr_only, 0, 0, 0)) { /* Don't NAT packet aimed at the intfc address */ if (is_interface_addr(sm, node, sw_if_index0, ip0->dst_address.as_u32)) @@ -821,7 +821,7 @@ snat_out2in_node_fn (vlib_main_t * vm, { /* Try to match static mapping by external address and port, destination address and port in packet */ - if (snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0)) + if (snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0, 0)) { /* * Send DHCP packets to the ipv4 
stack, or we won't @@ -972,7 +972,7 @@ snat_out2in_node_fn (vlib_main_t * vm, { /* Try to match static mapping by external address and port, destination address and port in packet */ - if (snat_static_mapping_match(sm, key1, &sm1, 1, 0, 0, 0)) + if (snat_static_mapping_match(sm, key1, &sm1, 1, 0, 0, 0, 0)) { /* * Send DHCP packets to the ipv4 stack, or we won't @@ -1159,7 +1159,7 @@ snat_out2in_node_fn (vlib_main_t * vm, { /* Try to match static mapping by external address and port, destination address and port in packet */ - if (snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0)) + if (snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0, 0)) { /* * Send DHCP packets to the ipv4 stack, or we won't @@ -1384,7 +1384,7 @@ nat44_out2in_reass_node_fn (vlib_main_t * vm, { /* Try to match static mapping by external address and port, destination address and port in packet */ - if (snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0)) + if (snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0, 0)) { /* * Send DHCP packets to the ipv4 stack, or we won't @@ -1719,7 +1719,7 @@ create_session_for_static_mapping_ed (snat_main_t * sm, vlib_node_runtime_t * node, u32 thread_index, twice_nat_type_t twice_nat, - u8 is_lb, + lb_nat_type_t lb_nat, f64 now) { snat_session_t *s; @@ -1760,8 +1760,10 @@ create_session_for_static_mapping_ed (snat_main_t * sm, s->ext_host_addr.as_u32 = ip->src_address.as_u32; s->ext_host_port = e_key.protocol == SNAT_PROTOCOL_ICMP ? 
0 : udp->src_port; s->flags |= SNAT_SESSION_FLAG_STATIC_MAPPING; - if (is_lb) + if (lb_nat) s->flags |= SNAT_SESSION_FLAG_LOAD_BALANCING; + if (lb_nat == AFFINITY_LB_NAT) + s->flags |= SNAT_SESSION_FLAG_AFFINITY; s->flags |= SNAT_SESSION_FLAG_ENDPOINT_DEPENDENT; s->outside_address_index = ~0; s->out2in = e_key; @@ -2005,7 +2007,7 @@ icmp_match_out2in_ed (snat_main_t * sm, vlib_node_runtime_t * node, e_key.port = key.l_port; e_key.protocol = ip_proto_to_snat_proto (key.proto); e_key.fib_index = rx_fib_index; - if (snat_static_mapping_match(sm, e_key, &l_key, 1, &is_addr_only, 0, 0)) + if (snat_static_mapping_match(sm, e_key, &l_key, 1, &is_addr_only, 0, 0, 0)) { if (!sm->forwarding_enabled) { @@ -2221,7 +2223,7 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm, clib_bihash_kv_16_8_t kv0, value0, kv1, value1; ip_csum_t sum0, sum1; snat_session_key_t e_key0, l_key0, e_key1, l_key1; - u8 is_lb0, is_lb1; + lb_nat_type_t lb_nat0, lb_nat1; twice_nat_type_t twice_nat0, twice_nat1; /* Prefetch next iteration. 
*/ @@ -2324,7 +2326,7 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm, e_key0.protocol = proto0; e_key0.fib_index = rx_fib_index0; if (snat_static_mapping_match(sm, e_key0, &l_key0, 1, 0, - &twice_nat0, &is_lb0)) + &twice_nat0, &lb_nat0, &ip0->src_address)) { /* * Send DHCP packets to the ipv4 stack, or we won't @@ -2362,7 +2364,8 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm, s0 = create_session_for_static_mapping_ed(sm, b0, l_key0, e_key0, node, thread_index, - twice_nat0, is_lb0, + twice_nat0, + lb_nat0, now); if (!s0) @@ -2526,7 +2529,7 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm, e_key1.protocol = proto1; e_key1.fib_index = rx_fib_index1; if (snat_static_mapping_match(sm, e_key1, &l_key1, 1, 0, - &twice_nat1, &is_lb1)) + &twice_nat1, &lb_nat1, &ip1->src_address)) { /* * Send DHCP packets to the ipv4 stack, or we won't @@ -2564,7 +2567,8 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm, s1 = create_session_for_static_mapping_ed(sm, b1, l_key1, e_key1, node, thread_index, - twice_nat1, is_lb1, + twice_nat1, + lb_nat1, now); if (!s1) @@ -2673,7 +2677,7 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm, clib_bihash_kv_16_8_t kv0, value0; ip_csum_t sum0; snat_session_key_t e_key0, l_key0; - u8 is_lb0; + lb_nat_type_t lb_nat0; twice_nat_type_t twice_nat0; /* speculatively enqueue b0 to the current next frame */ @@ -2760,7 +2764,7 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm, e_key0.protocol = proto0; e_key0.fib_index = rx_fib_index0; if (snat_static_mapping_match(sm, e_key0, &l_key0, 1, 0, - &twice_nat0, &is_lb0)) + &twice_nat0, &lb_nat0, &ip0->src_address)) { /* * Send DHCP packets to the ipv4 stack, or we won't @@ -2798,7 +2802,8 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm, s0 = create_session_for_static_mapping_ed(sm, b0, l_key0, e_key0, node, thread_index, - twice_nat0, is_lb0, + twice_nat0, + lb_nat0, now); if (!s0) @@ -3874,7 +3879,7 @@ snat_out2in_fast_node_fn (vlib_main_t * vm, key0.port = udp0->dst_port; key0.fib_index 
= rx_fib_index0; - if (snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0)) + if (snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0, 0)) { b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; goto trace00; diff --git a/test/test_nat.py b/test/test_nat.py index 79d26224ad0..73e414ad1ae 100644 --- a/test/test_nat.py +++ b/test/test_nat.py @@ -3741,6 +3741,67 @@ class TestNAT44EndpointDependent(MethodHolder): self.logger.error(ppp("Unexpected or invalid packet:", p)) raise + def test_lb_affinity(self): + """ NAT44 local service load balancing affinity """ + external_addr_n = socket.inet_pton(socket.AF_INET, self.nat_addr) + external_port = 80 + local_port = 8080 + server1 = self.pg0.remote_hosts[0] + server2 = self.pg0.remote_hosts[1] + + locals = [{'addr': server1.ip4n, + 'port': local_port, + 'probability': 50, + 'vrf_id': 0}, + {'addr': server2.ip4n, + 'port': local_port, + 'probability': 50, + 'vrf_id': 0}] + + self.nat44_add_address(self.nat_addr) + self.vapi.nat44_add_del_lb_static_mapping(external_addr_n, + external_port, + IP_PROTOS.tcp, + affinity=10800, + local_num=len(locals), + locals=locals) + self.vapi.nat44_interface_add_del_feature(self.pg0.sw_if_index) + self.vapi.nat44_interface_add_del_feature(self.pg1.sw_if_index, + is_inside=0) + + p = (Ether(dst=self.pg1.local_mac, src=self.pg1.remote_mac) / + IP(src=self.pg1.remote_ip4, dst=self.nat_addr) / + TCP(sport=1025, dport=external_port)) + self.pg1.add_stream(p) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + capture = self.pg0.get_capture(1) + backend = capture[0][IP].dst + + sessions = self.vapi.nat44_user_session_dump( + socket.inet_pton(socket.AF_INET, backend), 0) + self.assertEqual(len(sessions), 1) + self.assertTrue(sessions[0].ext_host_valid) + self.vapi.nat44_del_session( + sessions[0].inside_ip_address, + sessions[0].inside_port, + sessions[0].protocol, + ext_host_address=sessions[0].ext_host_address, + ext_host_port=sessions[0].ext_host_port) + + pkts = [] + for 
port in range(1030, 1100): + p = (Ether(dst=self.pg1.local_mac, src=self.pg1.remote_mac) / + IP(src=self.pg1.remote_ip4, dst=self.nat_addr) / + TCP(sport=port, dport=external_port)) + pkts.append(p) + self.pg1.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + capture = self.pg0.get_capture(len(pkts)) + for p in capture: + self.assertEqual(p[IP].dst, backend) + def test_unknown_proto(self): """ NAT44 translate packet with unknown protocol """ self.nat44_add_address(self.nat_addr) diff --git a/test/vpp_papi_provider.py b/test/vpp_papi_provider.py index e3d8459b0b1..e0d55c16cee 100644 --- a/test/vpp_papi_provider.py +++ b/test/vpp_papi_provider.py @@ -1594,6 +1594,7 @@ class VppPapiProvider(object): self_twice_nat=0, out2in_only=0, tag='', + affinity=0, local_num=0, locals=[], is_add=1): @@ -1601,6 +1602,7 @@ class VppPapiProvider(object): :param twice_nat: 1 if translate external host address and port :param tag: Opaque string tag + :param affinity: if 0 disabled, otherwise client IP affinity timeout :param is_add - 1 if add, 0 if delete """ return self.api( @@ -1613,6 +1615,7 @@ class VppPapiProvider(object): 'self_twice_nat': self_twice_nat, 'out2in_only': out2in_only, 'tag': tag, + 'affinity': affinity, 'local_num': local_num, 'locals': locals}) -- 2.16.6