NAT44: client-IP based session affinity for load-balancing (VPP-1297) 21/14621/2
author Matus Fabian <matfabia@cisco.com>
Mon, 3 Sep 2018 12:02:23 +0000 (05:02 -0700)
committer Damjan Marion <dmarion@me.com>
Mon, 3 Sep 2018 14:48:54 +0000 (14:48 +0000)
Enable client-IP based session affinity per LB NAT rule with specific timeout.

Change-Id: I9aade152e330218d21dfda99cc5e984d769ab806
Signed-off-by: Matus Fabian <matfabia@cisco.com>
12 files changed:
src/plugins/nat/CMakeLists.txt
src/plugins/nat/in2out.c
src/plugins/nat/nat.api
src/plugins/nat/nat.c
src/plugins/nat/nat.h
src/plugins/nat/nat44_cli.c
src/plugins/nat/nat_affinity.c [new file with mode: 0644]
src/plugins/nat/nat_affinity.h [new file with mode: 0644]
src/plugins/nat/nat_api.c
src/plugins/nat/out2in.c
test/test_nat.py
test/vpp_papi_provider.py

index ef82213..20cf0e7 100644 (file)
@@ -38,6 +38,7 @@ add_vpp_plugin(nat
   nat66_cli.c
   nat66_in2out.c
   nat66_out2in.c
+  nat_affinity.c
 
   API_FILES
   nat.api
index c900393..0fe3633 100755 (executable)
@@ -258,7 +258,7 @@ snat_not_translate (snat_main_t * sm, vlib_node_runtime_t *node,
                               &value0))
     {
       /* or is static mappings */
-      if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0))
+      if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0, 0))
         return 0;
     }
   else
@@ -387,7 +387,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0,
   key1.protocol = key0->protocol;
 
   /* First try to match static mapping by local address and port */
-  if (snat_static_mapping_match (sm, *key0, &key1, 0, 0, 0, 0))
+  if (snat_static_mapping_match (sm, *key0, &key1, 0, 0, 0, 0, 0))
     {
       /* Try to create dynamic translation */
       if (snat_alloc_outside_address_and_port (sm->addresses, rx_fib_index0,
@@ -674,7 +674,7 @@ u32 icmp_match_in2out_fast(snat_main_t *sm, vlib_node_runtime_t *node,
     }
   key0.fib_index = rx_fib_index0;
 
-  if (snat_static_mapping_match(sm, key0, &sm0, 0, &is_addr_only, 0, 0))
+  if (snat_static_mapping_match(sm, key0, &sm0, 0, &is_addr_only, 0, 0, 0))
     {
       if (PREDICT_FALSE(snat_not_translate_fast(sm, node, sw_if_index0, ip0,
           IP_PROTOCOL_ICMP, rx_fib_index0)))
@@ -875,7 +875,7 @@ snat_hairpinning (snat_main_t *sm,
   kv0.key = key0.as_u64;
 
   /* Check if destination is static mappings */
-  if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0))
+  if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0, 0))
     {
       new_dst_addr0 = sm0.addr.as_u32;
       new_dst_port0 = sm0.port;
@@ -1006,7 +1006,7 @@ snat_icmp_hairpinning (snat_main_t *sm,
       if (rv)
         {
           /* or static mappings */
-          if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0))
+          if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0, 0))
             {
               new_dst_addr0 = sm0.addr.as_u32;
               vnet_buffer(b0)->sw_if_index[VLIB_TX] = sm0.fib_index;
@@ -2031,7 +2031,7 @@ nat44_reass_hairpinning (snat_main_t *sm,
   udp0 = ip4_next_header (ip0);
 
   /* Check if destination is static mappings */
-  if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0))
+  if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0, 0))
     {
       new_dst_addr0 = sm0.addr.as_u32;
       new_dst_port0 = sm0.port;
@@ -2535,7 +2535,7 @@ slow_path_ed (snat_main_t *sm,
   snat_session_t *s;
   snat_user_t *u;
   snat_session_key_t key0, key1;
-  u8 lb = 0, is_sm = 0;
+  lb_nat_type_t lb = 0, is_sm = 0;
   u32 address_index = ~0;
   snat_main_per_thread_data_t *tsm = &sm->per_thread_data[thread_index];
   nat_ed_ses_key_t *key = (nat_ed_ses_key_t *) kv->key;
@@ -2565,7 +2565,7 @@ slow_path_ed (snat_main_t *sm,
   key0.fib_index = rx_fib_index;
   key1.fib_index = sm->outside_fib_index;
   /* First try to match static mapping by local address and port */
-  if (snat_static_mapping_match (sm, key0, &key1, 0, 0, 0, &lb))
+  if (snat_static_mapping_match (sm, key0, &key1, 0, 0, 0, &lb, 0))
     {
       /* Try to create dynamic translation */
       if (snat_alloc_outside_address_and_port (sm->addresses, rx_fib_index,
@@ -2691,7 +2691,7 @@ nat44_ed_not_translate (snat_main_t * sm, vlib_node_runtime_t *node,
       key0.protocol = proto;
       key0.fib_index = sm->outside_fib_index;
       /* or is static mappings */
-      if (!snat_static_mapping_match(sm, key0, &key1, 1, 0, 0, 0))
+      if (!snat_static_mapping_match(sm, key0, &key1, 1, 0, 0, 0, 0))
         return 0;
     }
   else
@@ -5321,7 +5321,7 @@ snat_in2out_fast_static_map_fn (vlib_main_t * vm,
           key0.port = udp0->src_port;
           key0.fib_index = rx_fib_index0;
 
-          if (snat_static_mapping_match(sm, key0, &sm0, 0, 0, 0, 0))
+          if (snat_static_mapping_match(sm, key0, &sm0, 0, 0, 0, 0, 0))
             {
               b0->error = node->errors[SNAT_IN2OUT_ERROR_NO_TRANSLATION];
               next0= SNAT_IN2OUT_NEXT_DROP;
index 8e37567..f1c95b2 100644 (file)
@@ -668,6 +668,8 @@ typeonly manual_endian define nat44_lb_addr_port {
                             local address of internal host
     @param out2in_only - if 1 rule match only out2in direction
     @param tag - opaque string tag
+    @param affinity - if 0 disabled, otherwise client IP affinity sticky time
+                      in seconds
     @param local_num - number of local network nodes
     @param locals - local network nodes
 */
@@ -682,6 +684,7 @@ autoreply manual_endian define nat44_add_del_lb_static_mapping {
   u8 self_twice_nat;
   u8 out2in_only;
   u8 tag[64];
+  u32 affinity;
   u8 local_num;
   vl_api_nat44_lb_addr_port_t locals[local_num];
 };
@@ -707,6 +710,8 @@ define nat44_lb_static_mapping_dump {
                             local address of internal host
     @param out2in_only - if 1 rule match only out2in direction
     @param tag - opaque string tag
+    @param affinity - if 0 disabled, otherwise client IP affinity sticky time
+                      in seconds
     @param local_num - number of local network nodes
     @param locals - local network nodes
 */
@@ -719,6 +724,7 @@ manual_endian define nat44_lb_static_mapping_details {
   u8 self_twice_nat;
   u8 out2in_only;
   u8 tag[64];
+  u32 affinity;
   u8 local_num;
   vl_api_nat44_lb_addr_port_t locals[local_num];
 };
index 364d5f5..0ce1a60 100755 (executable)
@@ -28,6 +28,7 @@
 #include <nat/dslite.h>
 #include <nat/nat_reass.h>
 #include <nat/nat_inlines.h>
+#include <nat/nat_affinity.h>
 #include <vnet/fib/fib_table.h>
 #include <vnet/fib/ip4_fib.h>
 
@@ -211,6 +212,9 @@ nat_free_session_data (snat_main_t * sm, snat_session_t * s, u32 thread_index)
   /* session lookup tables */
   if (is_ed_session (s))
     {
+      if (is_affinity_sessions (s))
+        nat_affinity_unlock (s->ext_host_addr, s->out2in.addr,
+                             s->in2out.protocol, s->out2in.port);
       ed_key.l_addr = s->out2in.addr;
       ed_key.r_addr = s->ext_host_addr;
       ed_key.fib_index = s->out2in.fib_index;
@@ -230,7 +234,6 @@ nat_free_session_data (snat_main_t * sm, snat_session_t * s, u32 thread_index)
       ed_kv.key[1] = ed_key.as_u64[1];
       if (clib_bihash_add_del_16_8 (&tsm->out2in_ed, &ed_kv, 0))
         nat_log_warn ("out2in_ed key del failed");
-
       ed_key.l_addr = s->in2out.addr;
       ed_key.fib_index = s->in2out.fib_index;
       if (!snat_is_unk_proto_session (s))
@@ -1259,7 +1262,7 @@ int nat44_add_del_lb_static_mapping (ip4_address_t e_addr, u16 e_port,
                                      snat_protocol_t proto,
                                      nat44_lb_addr_port_t *locals, u8 is_add,
                                      twice_nat_type_t twice_nat, u8 out2in_only,
-                                     u8 *tag)
+                                     u8 *tag, u32 affinity)
 {
   snat_main_t * sm = &snat_main;
   snat_static_mapping_t *m;
@@ -1343,6 +1346,13 @@ int nat44_add_del_lb_static_mapping (ip4_address_t e_addr, u16 e_port,
       m->proto = proto;
       m->twice_nat = twice_nat;
       m->out2in_only = out2in_only;
+      m->affinity = affinity;
+
+      if (affinity)
+        m->affinity_per_service_list_head_index =
+          nat_affinity_get_per_service_list_head_index();
+      else
+        m->affinity_per_service_list_head_index = ~0;
 
       m_key.addr = m->external_addr;
       m_key.port = m->external_port;
@@ -1499,6 +1509,8 @@ int nat44_add_del_lb_static_mapping (ip4_address_t e_addr, u16 e_port,
                 }
             }
         }
+      if (m->affinity)
+        nat_affinity_flush_service (m->affinity_per_service_list_head_index);
       vec_free(m->locals);
       vec_free(m->tag);
       vec_free(m->workers);
@@ -2173,13 +2185,15 @@ int snat_static_mapping_match (snat_main_t * sm,
                                u8 by_external,
                                u8 *is_addr_only,
                                twice_nat_type_t *twice_nat,
-                               u8 *lb)
+                               lb_nat_type_t *lb,
+                               ip4_address_t * ext_host_addr)
 {
   clib_bihash_kv_8_8_t kv, value;
   snat_static_mapping_t *m;
   snat_session_key_t m_key;
   clib_bihash_8_8_t *mapping_hash = &sm->static_mapping_by_local;
   u32 rand, lo = 0, hi, mid;
+  u8 backend_index;
 
   m_key.fib_index = match.fib_index;
   if (by_external)
@@ -2210,6 +2224,19 @@ int snat_static_mapping_match (snat_main_t * sm,
     {
       if (vec_len (m->locals))
         {
+          if (PREDICT_FALSE(lb != 0))
+            *lb = m->affinity ? AFFINITY_LB_NAT : LB_NAT;
+          if (m->affinity)
+            {
+              if (nat_affinity_find_and_lock (ext_host_addr[0], match.addr,
+                  match.protocol, match.port, &backend_index))
+                goto get_local;
+
+              mapping->addr = m->locals[backend_index].addr;
+              mapping->port = clib_host_to_net_u16 (m->locals[backend_index].port);
+              mapping->fib_index = m->locals[backend_index].fib_index;
+              goto end;
+            }
 get_local:
           hi = vec_len (m->locals) - 1;
           rand = 1 + (random_u32 (&sm->random_seed) % m->locals[hi].prefix);
@@ -2231,9 +2258,18 @@ get_local:
           mapping->addr = m->locals[lo].addr;
           mapping->port = clib_host_to_net_u16 (m->locals[lo].port);
           mapping->fib_index = m->locals[lo].fib_index;
+          if (m->affinity)
+            {
+              if (nat_affinity_create_and_lock (ext_host_addr[0], match.addr,
+                  match.protocol, match.port, lo, m->affinity,
+                  m->affinity_per_service_list_head_index))
+                nat_log_info ("create affinity record failed");
+            }
         }
       else
         {
+          if (PREDICT_FALSE(lb != 0))
+            *lb = NO_LB_NAT;
           mapping->fib_index = m->fib_index;
           mapping->addr = m->local_addr;
           /* Address only mapping doesn't change port */
@@ -2251,15 +2287,13 @@ get_local:
       mapping->fib_index = sm->outside_fib_index;
     }
 
+end:
   if (PREDICT_FALSE(is_addr_only != 0))
     *is_addr_only = m->addr_only;
 
   if (PREDICT_FALSE(twice_nat != 0))
     *twice_nat = m->twice_nat;
 
-  if (PREDICT_FALSE(lb != 0))
-    *lb = vec_len (m->locals) > 0;
-
   return 0;
 }
 
@@ -2904,6 +2938,7 @@ snat_config (vlib_main_t * vm, unformat_input_t * input)
           sm->out2in_node_index = nat44_ed_out2in_node.index;
           sm->icmp_match_in2out_cb = icmp_match_in2out_ed;
           sm->icmp_match_out2in_cb = icmp_match_out2in_ed;
+          nat_affinity_init (vm);
         }
       else
         {
index 76f5754..660fb4c 100644 (file)
@@ -142,6 +142,7 @@ typedef enum {
 #define SNAT_SESSION_FLAG_TWICE_NAT            8
 #define SNAT_SESSION_FLAG_ENDPOINT_DEPENDENT   16
 #define SNAT_SESSION_FLAG_FWD_BYPASS           32
+#define SNAT_SESSION_FLAG_AFFINITY             64
 
 #define NAT_INTERFACE_FLAG_IS_INSIDE 1
 #define NAT_INTERFACE_FLAG_IS_OUTSIDE 2
@@ -241,6 +242,12 @@ typedef enum {
   TWICE_NAT_SELF,
 } twice_nat_type_t;
 
+/* Load-balancing kind of a matched static mapping, as reported by
+   snat_static_mapping_match() through its lb argument. */
+typedef enum {
+  /* mapping has no load-balanced local nodes */
+  NO_LB_NAT,
+  /* load-balancing mapping without client-IP affinity */
+  LB_NAT,
+  /* load-balancing mapping with client-IP affinity enabled */
+  AFFINITY_LB_NAT,
+} lb_nat_type_t;
+
 typedef struct {
   ip4_address_t local_addr;
   ip4_address_t external_addr;
@@ -252,9 +259,11 @@ typedef struct {
   u32 vrf_id;
   u32 fib_index;
   snat_protocol_t proto;
+  u32 affinity;
   u32 *workers;
   u8 *tag;
   nat44_lb_addr_port_t *locals;
+  u32 affinity_per_service_list_head_index;
 } snat_static_mapping_t;
 
 typedef struct {
@@ -472,7 +481,8 @@ int snat_static_mapping_match (snat_main_t * sm,
                                u8 by_external,
                                u8 *is_addr_only,
                                twice_nat_type_t *twice_nat,
-                               u8 *lb);
+                               lb_nat_type_t *lb,
+                               ip4_address_t * ext_host_addr);
 
 void snat_add_del_addr_to_fib (ip4_address_t * addr,
                                u8 p_len,
@@ -526,6 +536,12 @@ typedef struct {
 */
 #define is_ed_session(s) (s->flags & SNAT_SESSION_FLAG_ENDPOINT_DEPENDENT)
 
+/** \brief Check if NAT session has affinity record.
+    @param s NAT session
+    @return 1 if NAT session has affinity record
+*/
+#define is_affinity_sessions(s) (s->flags & SNAT_SESSION_FLAG_AFFINITY)
+
 #define nat_interface_is_inside(i) i->flags & NAT_INTERFACE_FLAG_IS_INSIDE
 #define nat_interface_is_outside(i) i->flags & NAT_INTERFACE_FLAG_IS_OUTSIDE
 
@@ -619,7 +635,7 @@ int nat44_add_del_lb_static_mapping (ip4_address_t e_addr, u16 e_port,
                                      snat_protocol_t proto,
                                      nat44_lb_addr_port_t *locals, u8 is_add,
                                      twice_nat_type_t twice_nat, u8 out2in_only,
-                                     u8 *tag);
+                                     u8 *tag, u32 affinity);
 int nat44_del_session (snat_main_t *sm, ip4_address_t *addr, u16 port,
                        snat_protocol_t proto, u32 vrf_id, int is_in);
 int nat44_del_ed_session (snat_main_t *sm, ip4_address_t *addr, u16 port,
index e51f6d6..3847502 100644 (file)
@@ -22,6 +22,7 @@
 #include <nat/nat_det.h>
 #include <nat/nat64.h>
 #include <nat/nat_inlines.h>
+#include <nat/nat_affinity.h>
 #include <vnet/fib/fib_table.h>
 
 #define UNSUPPORTED_IN_DET_MODE_STR \
@@ -165,6 +166,7 @@ nat44_show_hash_commnad_fn (vlib_main_t * vm, unformat_input_t * input,
 {
   snat_main_t *sm = &snat_main;
   snat_main_per_thread_data_t *tsm;
+  nat_affinity_main_t *nam = &nat_affinity_main;
   int i;
   int verbose = 0;
 
@@ -198,6 +200,9 @@ nat44_show_hash_commnad_fn (vlib_main_t * vm, unformat_input_t * input,
     vlib_cli_output (vm, "%U", format_bihash_8_8, &tsm->user_hash, verbose);
   }
 
+  if (sm->endpoint_dependent)
+    vlib_cli_output (vm, "%U", format_bihash_16_8, &nam->affinity_hash,
+                    verbose);
   return 0;
 }
 
@@ -741,7 +746,7 @@ add_lb_static_mapping_command_fn (vlib_main_t * vm,
   snat_main_t *sm = &snat_main;
   clib_error_t *error = 0;
   ip4_address_t l_addr, e_addr;
-  u32 l_port = 0, e_port = 0, vrf_id = 0, probability = 0;
+  u32 l_port = 0, e_port = 0, vrf_id = 0, probability = 0, affinity = 0;
   int is_add = 1;
   int rv;
   snat_protocol_t proto;
@@ -793,6 +798,8 @@ add_lb_static_mapping_command_fn (vlib_main_t * vm,
        out2in_only = 1;
       else if (unformat (line_input, "del"))
        is_add = 0;
+      else if (unformat (line_input, "affinity %u", &affinity))
+       ;
       else
        {
          error = clib_error_return (0, "unknown input: '%U'",
@@ -814,7 +821,8 @@ add_lb_static_mapping_command_fn (vlib_main_t * vm,
     }
 
   rv = nat44_add_del_lb_static_mapping (e_addr, (u16) e_port, proto, locals,
-                                       is_add, twice_nat, out2in_only, 0);
+                                       is_add, twice_nat, out2in_only, 0,
+                                       affinity);
 
   switch (rv)
     {
@@ -1788,7 +1796,8 @@ VLIB_CLI_COMMAND (add_lb_static_mapping_command, static) = {
   .short_help =
     "nat44 add load-balancing static mapping protocol tcp|udp "
     "external <addr>:<port> local <addr>:<port> [vrf <table-id>] "
-    "probability <n> [twice-nat|self-twice-nat] [out2in-only] [del]",
+    "probability <n> [twice-nat|self-twice-nat] [out2in-only] "
+    "[affinity <timeout-seconds>] [del]",
 };
 
 /*?
diff --git a/src/plugins/nat/nat_affinity.c b/src/plugins/nat/nat_affinity.c
new file mode 100644 (file)
index 0000000..28c25ae
--- /dev/null
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @file
+ * @brief NAT plugin client-IP based session affinity for load-balancing
+ */
+
+#include <nat/nat_affinity.h>
+#include <nat/nat.h>
+
+nat_affinity_main_t nat_affinity_main;
+
+#define AFFINITY_HASH_BUCKETS 65536
+#define AFFINITY_HASH_MEMORY (2 << 25)
+
+/**
+ * @brief Format one affinity hash key/value pair.
+ *
+ * Registered as the kvp format function of the affinity hash in
+ * nat_affinity_init().
+ *
+ * @param s    Vector the formatted text is appended to.
+ * @param args Variadic list holding one clib_bihash_kv_16_8_t pointer.
+ *
+ * @return s with the formatted text appended.
+ */
+u8 *
+format_affinity_kvp (u8 * s, va_list * args)
+{
+  clib_bihash_kv_16_8_t *v = va_arg (*args, clib_bihash_kv_16_8_t *);
+  nat_affinity_key_t k;
+
+  k.as_u64[0] = v->key[0];
+  k.as_u64[1] = v->key[1];
+
+  /* Every conversion needs its own argument: the trailing "%llu" prints the
+     stored value, which is the affinity pool index of the record. */
+  s = format (s, "client %U backend %U:%d proto %U index %llu",
+             format_ip4_address, &k.client_addr,
+             format_ip4_address, &k.service_addr,
+             clib_net_to_host_u16 (k.service_port),
+             format_snat_protocol, k.proto, v->value);
+
+  return s;
+}
+
+/**
+ * Set up the global client-IP affinity state: the spinlock (only when more
+ * than one vlib main exists), the affinity bihash and its kvp formatter.
+ *
+ * @param vm vlib main, stored for the vlib_time_now() calls in this file.
+ *
+ * @return always 0 (no error).
+ */
+clib_error_t *
+nat_affinity_init (vlib_main_t * vm)
+{
+  nat_affinity_main_t *am = &nat_affinity_main;
+
+  am->vlib_main = vm;
+
+  /* With a single vlib main the lock stays uninitialized; every lock and
+     unlock call in this file uses the "if_init" variants. */
+  if (vlib_get_thread_main ()->n_vlib_mains > 1)
+    clib_spinlock_init (&am->affinity_lock);
+
+  clib_bihash_init_16_8 (&am->affinity_hash, "nat-affinity",
+                        AFFINITY_HASH_BUCKETS, AFFINITY_HASH_MEMORY);
+  clib_bihash_set_kvp_format_fn_16_8 (&am->affinity_hash,
+                                     format_affinity_kvp);
+
+  return 0;
+}
+
+/**
+ * Fill a 16_8 bihash kv with the affinity lookup key built from client
+ * address, service address, protocol and service port. The value is set to
+ * all ones as a placeholder.
+ */
+static_always_inline void
+make_affinity_kv (clib_bihash_kv_16_8_t * kv, ip4_address_t client_addr,
+                 ip4_address_t service_addr, u8 proto, u16 service_port)
+{
+  nat_affinity_key_t k;
+
+  /* proto and service_port are stored in 32 bit wide key fields */
+  k.service_addr = service_addr;
+  k.client_addr = client_addr;
+  k.proto = proto;
+  k.service_port = service_port;
+
+  kv->key[0] = k.as_u64[0];
+  kv->key[1] = k.as_u64[1];
+  kv->value = ~0ULL;
+}
+
+/**
+ * @brief Allocate and initialize a new per service list head.
+ *
+ * @return index of the new list head element in the list pool.
+ */
+u32
+nat_affinity_get_per_service_list_head_index (void)
+{
+  nat_affinity_main_t *nam = &nat_affinity_main;
+  dlist_elt_t *head_elt;
+  u32 head_index;
+
+  clib_spinlock_lock_if_init (&nam->affinity_lock);
+
+  pool_get (nam->list_pool, head_elt);
+  /* Convert the element pointer to an index while the lock is still held:
+     nat_affinity_create_and_lock() also allocates from list_pool under this
+     lock, so neither the pointer nor the pool base may be read again after
+     unlocking. */
+  head_index = head_elt - nam->list_pool;
+  clib_dlist_init (nam->list_pool, head_index);
+
+  clib_spinlock_unlock_if_init (&nam->affinity_lock);
+
+  return head_index;
+}
+
+/* Drop every affinity record linked to one per service list and then free
+   the list head itself. nat44_add_del_lb_static_mapping() calls this for
+   mappings with affinity just before it frees the mapping's vectors. */
+void
+nat_affinity_flush_service (u32 affinity_per_service_list_head_index)
+{
+  nat_affinity_main_t *nam = &nat_affinity_main;
+  u32 elt_index;
+  dlist_elt_t *elt;
+  nat_affinity_t *a;
+  clib_bihash_kv_16_8_t kv;
+
+  clib_spinlock_lock_if_init (&nam->affinity_lock);
+
+  /* Pop list elements until clib_dlist_remove_head() returns ~0 */
+  while ((elt_index =
+         clib_dlist_remove_head (nam->list_pool,
+                                 affinity_per_service_list_head_index)) !=
+        ~0)
+    {
+      elt = pool_elt_at_index (nam->list_pool, elt_index);
+      /* elt->value is the affinity pool index of the record */
+      a = pool_elt_at_index (nam->affinity_pool, elt->value);
+      /* copy the key out before the record is returned to the pool */
+      kv.key[0] = a->key.as_u64[0];
+      kv.key[1] = a->key.as_u64[1];
+      pool_put_index (nam->affinity_pool, elt->value);
+      if (clib_bihash_add_del_16_8 (&nam->affinity_hash, &kv, 0))
+       nat_log_warn ("affinity key del failed");
+      pool_put_index (nam->list_pool, elt_index);
+    }
+  /* the list is empty now, release its head element as well */
+  pool_put_index (nam->list_pool, affinity_per_service_list_head_index);
+
+  clib_spinlock_unlock_if_init (&nam->affinity_lock);
+}
+
+/**
+ * Look up the affinity record of a client for a service and, when a usable
+ * one exists, take a reference on it and report its backend index.
+ *
+ * A record without references whose expire time has passed is deleted and
+ * reported as not found.
+ *
+ * @return 0 when a record was found and locked, 1 otherwise.
+ */
+int
+nat_affinity_find_and_lock (ip4_address_t client_addr,
+                           ip4_address_t service_addr, u8 proto,
+                           u16 service_port, u8 * backend_index)
+{
+  nat_affinity_main_t *am = &nat_affinity_main;
+  clib_bihash_kv_16_8_t lookup, found;
+  nat_affinity_t *rec;
+  int rv = 1;
+
+  make_affinity_kv (&lookup, client_addr, service_addr, proto, service_port);
+  clib_spinlock_lock_if_init (&am->affinity_lock);
+
+  if (!clib_bihash_search_16_8 (&am->affinity_hash, &lookup, &found))
+    {
+      rec = pool_elt_at_index (am->affinity_pool, found.value);
+      if (rec->ref_cnt == 0
+         && rec->expire < vlib_time_now (am->vlib_main))
+       {
+         /* unreferenced and already expired: delete the record */
+         clib_dlist_remove (am->list_pool, rec->per_service_index);
+         pool_put_index (am->list_pool, rec->per_service_index);
+         pool_put_index (am->affinity_pool, found.value);
+         if (clib_bihash_add_del_16_8 (&am->affinity_hash, &lookup, 0))
+           nat_log_warn ("affinity key del failed");
+       }
+      else
+       {
+         rec->ref_cnt++;
+         *backend_index = rec->backend_index;
+         rv = 0;
+       }
+    }
+
+  clib_spinlock_unlock_if_init (&am->affinity_lock);
+  return rv;
+}
+
+/**
+ * Stale-entry callback passed to clib_bihash_add_or_overwrite_stale_16_8().
+ *
+ * An entry is stale when its affinity record has no references and its
+ * expire time lies in the past. Such a record is unlinked from its per
+ * service list, both pool elements are freed and the key is deleted.
+ *
+ * @param kv  hash entry to inspect; kv->value is the affinity pool index.
+ * @param arg unused.
+ *
+ * @return 1 if the entry was stale and has been removed, 0 otherwise.
+ */
+static int
+affinity_is_expired_cb (clib_bihash_kv_16_8_t * kv, void *arg)
+{
+  nat_affinity_main_t *am = &nat_affinity_main;
+  nat_affinity_t *rec = pool_elt_at_index (am->affinity_pool, kv->value);
+
+  /* still referenced by at least one session */
+  if (rec->ref_cnt != 0)
+    return 0;
+
+  /* unreferenced, but the sticky time has not run out yet */
+  if (!(rec->expire < vlib_time_now (am->vlib_main)))
+    return 0;
+
+  clib_dlist_remove (am->list_pool, rec->per_service_index);
+  pool_put_index (am->list_pool, rec->per_service_index);
+  pool_put_index (am->affinity_pool, kv->value);
+  if (clib_bihash_add_del_16_8 (&am->affinity_hash, kv, 0))
+    nat_log_warn ("affinity key del failed");
+
+  return 1;
+}
+
+/* Create the affinity record of a client for a service with an initial
+   reference count of one and link it into the service's list.
+   Returns 0 on success; 1 if the key already exists, or the bihash return
+   value if inserting the key failed. */
+int
+nat_affinity_create_and_lock (ip4_address_t client_addr,
+                             ip4_address_t service_addr, u8 proto,
+                             u16 service_port, u8 backend_index,
+                             u32 sticky_time,
+                             u32 affinity_per_service_list_head_index)
+{
+  nat_affinity_main_t *nam = &nat_affinity_main;
+  clib_bihash_kv_16_8_t kv, value;
+  nat_affinity_t *a;
+  dlist_elt_t *list_elt;
+  int rv = 0;
+
+  make_affinity_kv (&kv, client_addr, service_addr, proto, service_port);
+  clib_spinlock_lock_if_init (&nam->affinity_lock);
+  /* refuse to create a second record for the same key */
+  if (!clib_bihash_search_16_8 (&nam->affinity_hash, &kv, &value))
+    {
+      rv = 1;
+      nat_log_notice ("affinity key already exist");
+      goto unlock;
+    }
+
+  /* the hash value is the affinity pool index of the new record */
+  pool_get (nam->affinity_pool, a);
+  kv.value = a - nam->affinity_pool;
+  rv =
+    clib_bihash_add_or_overwrite_stale_16_8 (&nam->affinity_hash, &kv,
+                                            affinity_is_expired_cb, NULL);
+  if (rv)
+    {
+      /* insert failed: give the just allocated record back */
+      nat_log_notice ("affinity key add failed");
+      pool_put (nam->affinity_pool, a);
+      goto unlock;
+    }
+
+  /* list element and record point at each other by pool index */
+  pool_get (nam->list_pool, list_elt);
+  clib_dlist_init (nam->list_pool, list_elt - nam->list_pool);
+  list_elt->value = a - nam->affinity_pool;
+  a->per_service_index = list_elt - nam->list_pool;
+  a->backend_index = backend_index;
+  /* the caller holds the first reference */
+  a->ref_cnt = 1;
+  a->sticky_time = sticky_time;
+  a->key.as_u64[0] = kv.key[0];
+  a->key.as_u64[1] = kv.key[1];
+  clib_dlist_addtail (nam->list_pool, affinity_per_service_list_head_index,
+                     a->per_service_index);
+
+unlock:
+  clib_spinlock_unlock_if_init (&nam->affinity_lock);
+  return rv;
+}
+
+/**
+ * @brief Release one reference on the affinity record of a client.
+ *
+ * When the last reference is dropped, the record's expire time is set to
+ * the current vlib time plus its sticky time. Nothing happens if no record
+ * exists for the key or if the record is already unreferenced.
+ *
+ * @param client_addr Client IP address.
+ * @param service_addr Service IP address.
+ * @param proto IP protocol number.
+ * @param service_port Service L4 port number.
+ */
+void
+nat_affinity_unlock (ip4_address_t client_addr, ip4_address_t service_addr,
+                    u8 proto, u16 service_port)
+{
+  nat_affinity_main_t *nam = &nat_affinity_main;
+  clib_bihash_kv_16_8_t kv, value;
+  nat_affinity_t *a;
+
+  make_affinity_kv (&kv, client_addr, service_addr, proto, service_port);
+  clib_spinlock_lock_if_init (&nam->affinity_lock);
+  if (clib_bihash_search_16_8 (&nam->affinity_hash, &kv, &value))
+    goto unlock;
+
+  a = pool_elt_at_index (nam->affinity_pool, value.value);
+  /* Do not let the unsigned counter wrap around. A session can carry the
+     affinity flag without owning a reference (its create attempt may have
+     failed), and a wrapped counter would keep the record from ever
+     expiring. */
+  if (a->ref_cnt == 0)
+    goto unlock;
+
+  a->ref_cnt--;
+  if (a->ref_cnt == 0)
+    a->expire = (u64) a->sticky_time + vlib_time_now (nam->vlib_main);
+
+unlock:
+  clib_spinlock_unlock_if_init (&nam->affinity_lock);
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/nat/nat_affinity.h b/src/plugins/nat/nat_affinity.h
new file mode 100644 (file)
index 0000000..358e682
--- /dev/null
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @file
+ * @brief NAT plugin client-IP based session affinity for load-balancing
+ */
+
+#ifndef __included_nat_affinity_h__
+#define __included_nat_affinity_h__
+
+#include <vnet/ip/ip.h>
+#include <vppinfra/bihash_16_8.h>
+#include <vppinfra/dlist.h>
+
+/* Key of the affinity hash. The named fields overlay two u64 words, which
+   is how the key is copied to and from a clib_bihash_kv_16_8_t. */
+typedef struct
+{
+  union
+  {
+    struct
+    {
+      ip4_address_t service_addr;
+      ip4_address_t client_addr;
+      /* align by making this 4 octets even though its a 1 octet field */
+      u32 proto;
+      /* align by making this 4 octets even though its a 2 octets field */
+      u32 service_port;
+    };
+    /* whole key as two 64 bit words */
+    u64 as_u64[2];
+  };
+} nat_affinity_key_t;
+
+/* *INDENT-OFF* */
+/* One client-IP affinity record, element of the affinity pool */
+typedef CLIB_PACKED(struct
+{
+  /* hash key of this record, used to delete the hash entry on flush */
+  nat_affinity_key_t key;
+  /* affinity sticky time in seconds */
+  u32 sticky_time;
+  /* outstanding references: find/create take one, unlock releases one */
+  u32 ref_cnt;
+  /* list pool index of this record's element in the per service list */
+  u32 per_service_index;
+  /* service backend index remembered for the client */
+  u8 backend_index;
+  /* vlib time after which an unreferenced record counts as expired */
+  f64 expire;
+}) nat_affinity_t;
+/* *INDENT-ON* */
+
+/* Global state of the client-IP affinity feature */
+typedef struct
+{
+  /* pool of affinity records */
+  nat_affinity_t *affinity_pool;
+  /* key -> affinity pool index */
+  clib_bihash_16_8_t affinity_hash;
+  /* taken around every pool, list and hash access in nat_affinity.c;
+     only initialized when more than one vlib main exists */
+  clib_spinlock_t affinity_lock;
+  /* pool of list elements: per service list heads and record elements */
+  dlist_elt_t *list_pool;
+  /* vlib main used for vlib_time_now() */
+  vlib_main_t *vlib_main;
+} nat_affinity_main_t;
+
+extern nat_affinity_main_t nat_affinity_main;
+
+/**
+ * @brief Get new affinity per service list head index.
+ *
+ * @returns new affinity per service list head index.
+ */
+u32 nat_affinity_get_per_service_list_head_index (void);
+
+/**
+ * @brief Flush all service affinity data.
+ *
+ * @param affinity_per_service_list_head_index Per service list head index.
+ */
+void nat_affinity_flush_service (u32 affinity_per_service_list_head_index);
+
+/**
+ * @brief Initialize NAT client-IP based affinity.
+ *
+ * @param vm vlib main.
+ *
+ * @return error code.
+ */
+clib_error_t *nat_affinity_init (vlib_main_t * vm);
+
+/**
+ * @brief Find service backend index for client-IP and take a reference
+ *  counting lock.
+ *
+ * @param client_addr Client IP address.
+ * @param service_addr Service IP address.
+ * @param proto IP protocol number.
+ * @param service_port Service L4 port number.
+ * @param backend_index Service backend index for client-IP if found.
+ *
+ * @return 0 on success, non-zero value otherwise.
+ */
+int nat_affinity_find_and_lock (ip4_address_t client_addr,
+                               ip4_address_t service_addr, u8 proto,
+                               u16 service_port, u8 * backend_index);
+
+/**
+ * @brief Create affinity record and take reference counting lock.
+ * @param client_addr Client IP address.
+ * @param service_addr Service IP address.
+ * @param proto IP protocol number.
+ * @param service_port Service L4 port number.
+ * @param backend_index Service backend index for client-IP.
+ * @param sticky_time Affinity sticky time in seconds.
+ * @param affinity_per_service_list_head_index Per service list head index.
+ *
+ * @return 0 on success, non-zero value otherwise.
+ */
+int nat_affinity_create_and_lock (ip4_address_t client_addr,
+                                 ip4_address_t service_addr, u8 proto,
+                                 u16 service_port, u8 backend_index,
+                                 u32 sticky_time,
+                                 u32 affinity_per_service_list_head_index);
+/**
+ * @brief Release a reference counting lock for affinity.
+ *
+ * @param client_addr Client IP address.
+ * @param service_addr Service IP address.
+ * @param proto IP protocol number.
+ * @param service_port Service L4 port number.
+ */
+void nat_affinity_unlock (ip4_address_t client_addr,
+                         ip4_address_t service_addr, u8 proto,
+                         u16 service_port);
+
+#endif /* __included_nat_affinity_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
index 8055259..17009c9 100644 (file)
@@ -1463,7 +1463,8 @@ static void
     nat44_add_del_lb_static_mapping (e_addr,
                                     clib_net_to_host_u16 (mp->external_port),
                                     proto, locals, mp->is_add, twice_nat,
-                                    mp->out2in_only, tag);
+                                    mp->out2in_only, tag,
+                                    clib_net_to_host_u32 (mp->affinity));
 
   vec_free (locals);
   vec_free (tag);
index 46a8a1e..5029300 100755 (executable)
@@ -367,7 +367,7 @@ u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node,
     {
       /* Try to match static mapping by external address and port,
          destination address and port in packet */
-      if (snat_static_mapping_match(sm, key0, &sm0, 1, &is_addr_only, 0, 0))
+      if (snat_static_mapping_match(sm, key0, &sm0, 1, &is_addr_only, 0, 0, 0))
         {
           if (!sm->forwarding_enabled)
             {
@@ -475,7 +475,7 @@ u32 icmp_match_out2in_fast(snat_main_t *sm, vlib_node_runtime_t *node,
     }
   key0.fib_index = rx_fib_index0;
 
-  if (snat_static_mapping_match(sm, key0, &sm0, 1, &is_addr_only, 0, 0))
+  if (snat_static_mapping_match(sm, key0, &sm0, 1, &is_addr_only, 0, 0, 0))
     {
       /* Don't NAT packet aimed at the intfc address */
       if (is_interface_addr(sm, node, sw_if_index0, ip0->dst_address.as_u32))
@@ -821,7 +821,7 @@ snat_out2in_node_fn (vlib_main_t * vm,
             {
               /* Try to match static mapping by external address and port,
                  destination address and port in packet */
-              if (snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0))
+              if (snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0, 0))
                 {
                   /*
                    * Send DHCP packets to the ipv4 stack, or we won't
@@ -972,7 +972,7 @@ snat_out2in_node_fn (vlib_main_t * vm,
             {
               /* Try to match static mapping by external address and port,
                  destination address and port in packet */
-              if (snat_static_mapping_match(sm, key1, &sm1, 1, 0, 0, 0))
+              if (snat_static_mapping_match(sm, key1, &sm1, 1, 0, 0, 0, 0))
                 {
                   /*
                    * Send DHCP packets to the ipv4 stack, or we won't
@@ -1159,7 +1159,7 @@ snat_out2in_node_fn (vlib_main_t * vm,
             {
               /* Try to match static mapping by external address and port,
                  destination address and port in packet */
-              if (snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0))
+              if (snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0, 0))
                 {
                   /*
                    * Send DHCP packets to the ipv4 stack, or we won't
@@ -1384,7 +1384,7 @@ nat44_out2in_reass_node_fn (vlib_main_t * vm,
                 {
                   /* Try to match static mapping by external address and port,
                      destination address and port in packet */
-                  if (snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0))
+                  if (snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0, 0))
                     {
                       /*
                        * Send DHCP packets to the ipv4 stack, or we won't
@@ -1719,7 +1719,7 @@ create_session_for_static_mapping_ed (snat_main_t * sm,
                                       vlib_node_runtime_t * node,
                                       u32 thread_index,
                                       twice_nat_type_t twice_nat,
-                                      u8 is_lb,
+                                      lb_nat_type_t lb_nat,
                                       f64 now)
 {
   snat_session_t *s;
@@ -1760,8 +1760,10 @@ create_session_for_static_mapping_ed (snat_main_t * sm,
   s->ext_host_addr.as_u32 = ip->src_address.as_u32;
   s->ext_host_port = e_key.protocol == SNAT_PROTOCOL_ICMP ? 0 : udp->src_port;
   s->flags |= SNAT_SESSION_FLAG_STATIC_MAPPING;
-  if (is_lb)
+  if (lb_nat)
     s->flags |= SNAT_SESSION_FLAG_LOAD_BALANCING;
+  if (lb_nat == AFFINITY_LB_NAT)
+    s->flags |= SNAT_SESSION_FLAG_AFFINITY;
   s->flags |= SNAT_SESSION_FLAG_ENDPOINT_DEPENDENT;
   s->outside_address_index = ~0;
   s->out2in = e_key;
@@ -2005,7 +2007,7 @@ icmp_match_out2in_ed (snat_main_t * sm, vlib_node_runtime_t * node,
       e_key.port = key.l_port;
       e_key.protocol = ip_proto_to_snat_proto (key.proto);
       e_key.fib_index = rx_fib_index;
-      if (snat_static_mapping_match(sm, e_key, &l_key, 1, &is_addr_only, 0, 0))
+      if (snat_static_mapping_match(sm, e_key, &l_key, 1, &is_addr_only, 0, 0, 0))
         {
           if (!sm->forwarding_enabled)
             {
@@ -2221,7 +2223,7 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm,
           clib_bihash_kv_16_8_t kv0, value0, kv1, value1;
           ip_csum_t sum0, sum1;
           snat_session_key_t e_key0, l_key0, e_key1, l_key1;
-          u8 is_lb0, is_lb1;
+          lb_nat_type_t lb_nat0, lb_nat1;
           twice_nat_type_t twice_nat0, twice_nat1;
 
          /* Prefetch next iteration. */
@@ -2324,7 +2326,7 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm,
                   e_key0.protocol = proto0;
                   e_key0.fib_index = rx_fib_index0;
                   if (snat_static_mapping_match(sm, e_key0, &l_key0, 1, 0,
-                      &twice_nat0, &is_lb0))
+                      &twice_nat0, &lb_nat0, &ip0->src_address))
                     {
                       /*
                        * Send DHCP packets to the ipv4 stack, or we won't
@@ -2362,7 +2364,8 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm,
                   s0 = create_session_for_static_mapping_ed(sm, b0, l_key0,
                                                             e_key0, node,
                                                             thread_index,
-                                                            twice_nat0, is_lb0,
+                                                            twice_nat0,
+                                                            lb_nat0,
                                                             now);
 
                   if (!s0)
@@ -2526,7 +2529,7 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm,
                   e_key1.protocol = proto1;
                   e_key1.fib_index = rx_fib_index1;
                   if (snat_static_mapping_match(sm, e_key1, &l_key1, 1, 0,
-                      &twice_nat1, &is_lb1))
+                      &twice_nat1, &lb_nat1, &ip1->src_address))
                     {
                       /*
                        * Send DHCP packets to the ipv4 stack, or we won't
@@ -2564,7 +2567,8 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm,
                   s1 = create_session_for_static_mapping_ed(sm, b1, l_key1,
                                                             e_key1, node,
                                                             thread_index,
-                                                            twice_nat1, is_lb1,
+                                                            twice_nat1,
+                                                            lb_nat1,
                                                             now);
 
                   if (!s1)
@@ -2673,7 +2677,7 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm,
           clib_bihash_kv_16_8_t kv0, value0;
           ip_csum_t sum0;
           snat_session_key_t e_key0, l_key0;
-          u8 is_lb0;
+          lb_nat_type_t lb_nat0;
           twice_nat_type_t twice_nat0;
 
           /* speculatively enqueue b0 to the current next frame */
@@ -2760,7 +2764,7 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm,
                   e_key0.protocol = proto0;
                   e_key0.fib_index = rx_fib_index0;
                   if (snat_static_mapping_match(sm, e_key0, &l_key0, 1, 0,
-                      &twice_nat0, &is_lb0))
+                      &twice_nat0, &lb_nat0, &ip0->src_address))
                     {
                       /*
                        * Send DHCP packets to the ipv4 stack, or we won't
@@ -2798,7 +2802,8 @@ nat44_ed_out2in_node_fn_inline (vlib_main_t * vm,
                   s0 = create_session_for_static_mapping_ed(sm, b0, l_key0,
                                                             e_key0, node,
                                                             thread_index,
-                                                            twice_nat0, is_lb0,
+                                                            twice_nat0,
+                                                            lb_nat0,
                                                             now);
 
                   if (!s0)
@@ -3874,7 +3879,7 @@ snat_out2in_fast_node_fn (vlib_main_t * vm,
           key0.port = udp0->dst_port;
           key0.fib_index = rx_fib_index0;
 
-          if (snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0))
+          if (snat_static_mapping_match(sm, key0, &sm0, 1, 0, 0, 0, 0))
             {
               b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION];
               goto trace00;
index 79d2622..73e414a 100644 (file)
@@ -3741,6 +3741,67 @@ class TestNAT44EndpointDependent(MethodHolder):
             self.logger.error(ppp("Unexpected or invalid packet:", p))
             raise
 
+    def test_lb_affinity(self):
+        """ NAT44 local service load balancing affinity """
+        external_addr_n = socket.inet_pton(socket.AF_INET, self.nat_addr)
+        external_port = 80
+        local_port = 8080
+        server1 = self.pg0.remote_hosts[0]
+        server2 = self.pg0.remote_hosts[1]
+
+        locals = [{'addr': server1.ip4n,
+                   'port': local_port,
+                   'probability': 50,
+                   'vrf_id': 0},
+                  {'addr': server2.ip4n,
+                   'port': local_port,
+                   'probability': 50,
+                   'vrf_id': 0}]
+
+        self.nat44_add_address(self.nat_addr)
+        self.vapi.nat44_add_del_lb_static_mapping(external_addr_n,
+                                                  external_port,
+                                                  IP_PROTOS.tcp,
+                                                  affinity=10800,
+                                                  local_num=len(locals),
+                                                  locals=locals)
+        self.vapi.nat44_interface_add_del_feature(self.pg0.sw_if_index)
+        self.vapi.nat44_interface_add_del_feature(self.pg1.sw_if_index,
+                                                  is_inside=0)
+
+        p = (Ether(dst=self.pg1.local_mac, src=self.pg1.remote_mac) /
+             IP(src=self.pg1.remote_ip4, dst=self.nat_addr) /
+             TCP(sport=1025, dport=external_port))
+        self.pg1.add_stream(p)
+        self.pg_enable_capture(self.pg_interfaces)
+        self.pg_start()
+        capture = self.pg0.get_capture(1)
+        backend = capture[0][IP].dst
+
+        sessions = self.vapi.nat44_user_session_dump(
+            socket.inet_pton(socket.AF_INET, backend), 0)
+        self.assertEqual(len(sessions), 1)
+        self.assertTrue(sessions[0].ext_host_valid)
+        self.vapi.nat44_del_session(
+            sessions[0].inside_ip_address,
+            sessions[0].inside_port,
+            sessions[0].protocol,
+            ext_host_address=sessions[0].ext_host_address,
+            ext_host_port=sessions[0].ext_host_port)
+
+        pkts = []
+        for port in range(1030, 1100):
+            p = (Ether(dst=self.pg1.local_mac, src=self.pg1.remote_mac) /
+                 IP(src=self.pg1.remote_ip4, dst=self.nat_addr) /
+                 TCP(sport=port, dport=external_port))
+            pkts.append(p)
+        self.pg1.add_stream(pkts)
+        self.pg_enable_capture(self.pg_interfaces)
+        self.pg_start()
+        capture = self.pg0.get_capture(len(pkts))
+        for p in capture:
+            self.assertEqual(p[IP].dst, backend)
+
     def test_unknown_proto(self):
         """ NAT44 translate packet with unknown protocol """
         self.nat44_add_address(self.nat_addr)
index e3d8459..e0d55c1 100644 (file)
@@ -1594,6 +1594,7 @@ class VppPapiProvider(object):
             self_twice_nat=0,
             out2in_only=0,
             tag='',
+            affinity=0,
             local_num=0,
             locals=[],
             is_add=1):
@@ -1601,6 +1602,7 @@ class VppPapiProvider(object):
 
         :param twice_nat: 1 if translate external host address and port
         :param tag: Opaque string tag
+        :param affinity: if 0 disabled, otherwise client IP affinity timeout
         :param is_add - 1 if add, 0 if delete
         """
         return self.api(
@@ -1613,6 +1615,7 @@ class VppPapiProvider(object):
              'self_twice_nat': self_twice_nat,
              'out2in_only': out2in_only,
              'tag': tag,
+             'affinity': affinity,
              'local_num': local_num,
              'locals': locals})