nat: ICMP handling fixes
[vpp.git] / src / plugins / nat / nat44-ed / nat44_ed.c
index 007296e..1a996cd 100644 (file)
@@ -36,6 +36,8 @@
 #include <nat/nat44-ed/nat44_ed_affinity.h>
 #include <nat/nat44-ed/nat44_ed_inlines.h>
 
+#include <vpp/stats/stat_segment.h>
+
 snat_main_t snat_main;
 
 static_always_inline void nat_validate_interface_counters (snat_main_t *sm,
@@ -147,12 +149,14 @@ VNET_FEATURE_INIT (ip4_snat_out2in_fast, static) = {
 VNET_FEATURE_INIT (ip4_snat_in2out_output, static) = {
   .arc_name = "ip4-output",
   .node_name = "nat44-in2out-output",
-  .runs_after = VNET_FEATURES ("acl-plugin-out-ip4-fa","ip4-sv-reassembly-output-feature"),
+  .runs_after = VNET_FEATURES ("ip4-sv-reassembly-output-feature"),
+  .runs_before = VNET_FEATURES ("acl-plugin-out-ip4-fa"),
 };
 VNET_FEATURE_INIT (ip4_snat_in2out_output_worker_handoff, static) = {
   .arc_name = "ip4-output",
   .node_name = "nat44-in2out-output-worker-handoff",
-  .runs_after = VNET_FEATURES ("acl-plugin-out-ip4-fa","ip4-sv-reassembly-output-feature"),
+  .runs_after = VNET_FEATURES ("ip4-sv-reassembly-output-feature"),
+  .runs_before = VNET_FEATURES ("acl-plugin-out-ip4-fa"),
 };
 VNET_FEATURE_INIT (nat_pre_in2out_output, static) = {
   .arc_name = "ip4-output",
@@ -176,13 +180,6 @@ static void nat44_ed_db_init (u32 translations, u32 translation_buckets);
 
 static void nat44_ed_db_free ();
 
-static u32
-nat44_ed_get_worker_out2in_cb (vlib_buffer_t * b, ip4_header_t * ip,
-                              u32 rx_fib_index, u8 is_output);
-
-static u32 nat44_ed_get_worker_in2out_cb (vlib_buffer_t *b, ip4_header_t *ip,
-                                         u32 rx_fib_index, u8 is_output);
-
 u32 nat_calc_bihash_buckets (u32 n_elts);
 
 u8 *
@@ -723,8 +720,8 @@ snat_add_static_mapping (ip4_address_t l_addr, ip4_address_t e_addr,
          ip4_header_t ip = {
            .src_address = m->local_addr,
          };
-         vec_add1 (m->workers,
-                   sm->worker_in2out_cb (0, &ip, m->fib_index, 0));
+         vec_add1 (m->workers, nat44_ed_get_in2out_worker_index (
+                                 0, &ip, m->fib_index, 0));
          tsm = vec_elt_at_index (sm->per_thread_data, m->workers[0]);
        }
       else
@@ -971,7 +968,8 @@ nat44_add_del_lb_static_mapping (ip4_address_t e_addr, u16 e_port,
                .src_address = locals[i].addr,
              };
              bitmap = clib_bitmap_set (
-               bitmap, sm->worker_in2out_cb (0, &ip, m->fib_index, 0), 1);
+               bitmap,
+               nat44_ed_get_in2out_worker_index (0, &ip, m->fib_index, 0), 1);
            }
        }
 
@@ -1051,7 +1049,7 @@ nat44_add_del_lb_static_mapping (ip4_address_t e_addr, u16 e_port,
              };
              tsm = vec_elt_at_index (
                sm->per_thread_data,
-               sm->worker_in2out_cb (0, &ip, m->fib_index, 0));
+               nat44_ed_get_in2out_worker_index (0, &ip, m->fib_index, 0));
            }
          else
            tsm = vec_elt_at_index (sm->per_thread_data, sm->num_workers);
@@ -1164,9 +1162,9 @@ nat44_lb_static_mapping_add_del_local (ip4_address_t e_addr, u16 e_port,
          ip4_header_t ip = {
            .src_address = local->addr,
          };
-         tsm =
-           vec_elt_at_index (sm->per_thread_data,
-                             sm->worker_in2out_cb (0, &ip, m->fib_index, 0));
+         tsm = vec_elt_at_index (
+           sm->per_thread_data,
+           nat44_ed_get_in2out_worker_index (0, &ip, m->fib_index, 0));
        }
       else
        tsm = vec_elt_at_index (sm->per_thread_data, sm->num_workers);
@@ -1197,7 +1195,8 @@ nat44_lb_static_mapping_add_del_local (ip4_address_t e_addr, u16 e_port,
         ip4_header_t ip;
        ip.src_address.as_u32 = local->addr.as_u32,
        bitmap = clib_bitmap_set (
-         bitmap, sm->worker_in2out_cb (0, &ip, local->fib_index, 0), 1);
+         bitmap,
+         nat44_ed_get_in2out_worker_index (0, &ip, local->fib_index, 0), 1);
       }
   }
 
@@ -2028,6 +2027,8 @@ nat_init (vlib_main_t * vm)
 
   nat_init_simple_counter (sm->total_sessions, "total-sessions",
                           "/nat44-ed/total-sessions");
+  sm->max_cfg_sessions_gauge = stat_segment_new_entry (
+    (u8 *) "/nat44-ed/max-cfg-sessions", STAT_DIR_TYPE_SCALAR_INDEX);
 
 #define _(x)                                                                  \
   nat_init_simple_counter (sm->counters.fastpath.in2out.x, #x,                \
@@ -2126,6 +2127,8 @@ nat44_plugin_enable (nat44_config_t c)
     c.sessions = 63 * 1024;
 
   sm->max_translations_per_thread = c.sessions;
+  stat_segment_set_state_counter (sm->max_cfg_sessions_gauge,
+                                 sm->max_translations_per_thread);
   sm->translation_buckets = nat_calc_bihash_buckets (c.sessions);
 
   // ED only feature
@@ -2140,9 +2143,6 @@ nat44_plugin_enable (nat44_config_t c)
   sm->outside_fib_index = fib_table_find_or_create_and_lock (
     FIB_PROTOCOL_IP4, c.outside_vrf, sm->fib_src_hi);
 
-  sm->worker_in2out_cb = nat44_ed_get_worker_in2out_cb;
-  sm->worker_out2in_cb = nat44_ed_get_worker_out2in_cb;
-
   nat44_ed_db_init (sm->max_translations_per_thread, sm->translation_buckets);
 
   nat_affinity_enable ();
@@ -2423,8 +2423,8 @@ snat_static_mapping_match (vlib_main_t *vm, snat_main_t *sm,
                  .src_address = local->addr,
                };
 
-               if (sm->worker_in2out_cb (0, &ip, m->fib_index, 0) ==
-                   thread_index)
+               if (nat44_ed_get_in2out_worker_index (0, &ip, m->fib_index,
+                                                     0) == thread_index)
                  {
                    vec_add1 (tmp, i);
                  }
@@ -2499,9 +2499,9 @@ end:
   return 0;
 }
 
-static u32
-nat44_ed_get_worker_in2out_cb (vlib_buffer_t *b, ip4_header_t *ip,
-                              u32 rx_fib_index, u8 is_output)
+u32
+nat44_ed_get_in2out_worker_index (vlib_buffer_t *b, ip4_header_t *ip,
+                                 u32 rx_fib_index, u8 is_output)
 {
   snat_main_t *sm = &snat_main;
   u32 next_worker_index = sm->first_worker_index;
@@ -2602,48 +2602,22 @@ out:
   return next_worker_index;
 }
 
-static u32
-nat44_ed_get_worker_out2in_cb (vlib_buffer_t * b, ip4_header_t * ip,
-                              u32 rx_fib_index, u8 is_output)
+u32
+nat44_ed_get_out2in_worker_index (vlib_buffer_t *b, ip4_header_t *ip,
+                                 u32 rx_fib_index, u8 is_output)
 {
   snat_main_t *sm = &snat_main;
   clib_bihash_kv_8_8_t kv, value;
   clib_bihash_kv_16_8_t kv16, value16;
-  snat_main_per_thread_data_t *tsm;
 
   u32 proto, next_worker_index = 0;
-  udp_header_t *udp;
   u16 port;
   snat_static_mapping_t *m;
   u32 hash;
 
   proto = ip_proto_to_nat_proto (ip->protocol);
 
-  if (PREDICT_TRUE (proto == NAT_PROTOCOL_UDP || proto == NAT_PROTOCOL_TCP))
-    {
-      udp = ip4_next_header (ip);
-
-      init_ed_k (&kv16, ip->dst_address, vnet_buffer (b)->ip.reass.l4_dst_port,
-                ip->src_address, vnet_buffer (b)->ip.reass.l4_src_port,
-                rx_fib_index, ip->protocol);
-
-      if (PREDICT_TRUE (
-           !clib_bihash_search_16_8 (&sm->flow_hash, &kv16, &value16)))
-       {
-         tsm =
-           vec_elt_at_index (sm->per_thread_data,
-                             ed_value_get_thread_index (&value16));
-         vnet_buffer2 (b)->nat.cached_session_index =
-           ed_value_get_session_index (&value16);
-         next_worker_index = sm->first_worker_index + tsm->thread_index;
-         nat_elog_debug_handoff (
-           sm, "HANDOFF OUT2IN (session)", next_worker_index, rx_fib_index,
-           clib_net_to_host_u32 (ip->src_address.as_u32),
-           clib_net_to_host_u32 (ip->dst_address.as_u32));
-         return next_worker_index;
-       }
-    }
-  else if (proto == NAT_PROTOCOL_ICMP)
+  if (PREDICT_FALSE (proto == NAT_PROTOCOL_ICMP))
     {
       ip4_address_t lookup_saddr, lookup_daddr;
       u16 lookup_sport, lookup_dport;
@@ -2657,10 +2631,7 @@ nat44_ed_get_worker_out2in_cb (vlib_buffer_t * b, ip4_header_t * ip,
          if (PREDICT_TRUE (
                !clib_bihash_search_16_8 (&sm->flow_hash, &kv16, &value16)))
            {
-             tsm =
-               vec_elt_at_index (sm->per_thread_data,
-                                 ed_value_get_thread_index (&value16));
-             next_worker_index = sm->first_worker_index + tsm->thread_index;
+             next_worker_index = ed_value_get_thread_index (&value16);
              nat_elog_debug_handoff (
                sm, "HANDOFF OUT2IN (session)", next_worker_index,
                rx_fib_index, clib_net_to_host_u32 (ip->src_address.as_u32),
@@ -2670,6 +2641,23 @@ nat44_ed_get_worker_out2in_cb (vlib_buffer_t * b, ip4_header_t * ip,
        }
     }
 
+  init_ed_k (&kv16, ip->src_address, vnet_buffer (b)->ip.reass.l4_src_port,
+            ip->dst_address, vnet_buffer (b)->ip.reass.l4_dst_port,
+            rx_fib_index, ip->protocol);
+
+  if (PREDICT_TRUE (
+       !clib_bihash_search_16_8 (&sm->flow_hash, &kv16, &value16)))
+    {
+      vnet_buffer2 (b)->nat.cached_session_index =
+       ed_value_get_session_index (&value16);
+      next_worker_index = ed_value_get_thread_index (&value16);
+      nat_elog_debug_handoff (sm, "HANDOFF OUT2IN (session)",
+                             next_worker_index, rx_fib_index,
+                             clib_net_to_host_u32 (ip->src_address.as_u32),
+                             clib_net_to_host_u32 (ip->dst_address.as_u32));
+      return next_worker_index;
+    }
+
   /* first try static mappings without port */
   if (PREDICT_FALSE (pool_elts (sm->static_mappings)))
     {
@@ -2691,11 +2679,11 @@ nat44_ed_get_worker_out2in_cb (vlib_buffer_t * b, ip4_header_t * ip,
       goto done;
     }
 
-  udp = ip4_next_header (ip);
   port = vnet_buffer (b)->ip.reass.l4_dst_port;
 
   if (PREDICT_FALSE (ip->protocol == IP_PROTOCOL_ICMP))
     {
+      udp_header_t *udp = ip4_next_header (ip);
       icmp46_header_t *icmp = (icmp46_header_t *) udp;
       icmp_echo_header_t *echo = (icmp_echo_header_t *) (icmp + 1);
       if (!icmp_type_is_error_message
@@ -2806,6 +2794,9 @@ nat44_update_session_limit (u32 session_limit, u32 vrf_id)
     return 1;
   sm->max_translations_per_thread = nat44_get_max_session_limit ();
 
+  stat_segment_set_state_counter (sm->max_cfg_sessions_gauge,
+                                 sm->max_translations_per_thread);
+
   sm->translation_buckets =
     nat_calc_bihash_buckets (sm->max_translations_per_thread);
 
@@ -3162,8 +3153,9 @@ nat44_del_ed_session (snat_main_t * sm, ip4_address_t * addr, u16 port,
 
   ip.dst_address.as_u32 = ip.src_address.as_u32 = addr->as_u32;
   if (sm->num_workers > 1)
-    tsm = vec_elt_at_index (sm->per_thread_data,
-                           sm->worker_in2out_cb (0, &ip, fib_index, 0));
+    tsm = vec_elt_at_index (
+      sm->per_thread_data,
+      nat44_ed_get_in2out_worker_index (0, &ip, fib_index, 0));
   else
     tsm = vec_elt_at_index (sm->per_thread_data, sm->num_workers);
 
@@ -3284,7 +3276,7 @@ static_always_inline int nat_6t_flow_icmp_translate (snat_main_t *sm,
 static_always_inline void
 nat_6t_flow_ip4_translate (snat_main_t *sm, vlib_buffer_t *b, ip4_header_t *ip,
                           nat_6t_flow_t *f, nat_protocol_t proto,
-                          int is_icmp_inner_ip4)
+                          int is_icmp_inner_ip4, int skip_saddr_rewrite)
 {
   udp_header_t *udp = ip4_next_header (ip);
   tcp_header_t *tcp = (tcp_header_t *) udp;
@@ -3327,7 +3319,10 @@ nat_6t_flow_ip4_translate (snat_main_t *sm, vlib_buffer_t *b, ip4_header_t *ip,
     {
       if (!is_icmp_inner_ip4)
        { // regular case
-         ip->src_address = f->rewrite.saddr;
+         if (!skip_saddr_rewrite)
+           {
+             ip->src_address = f->rewrite.saddr;
+           }
          ip->dst_address = f->rewrite.daddr;
        }
       else
@@ -3337,10 +3332,19 @@ nat_6t_flow_ip4_translate (snat_main_t *sm, vlib_buffer_t *b, ip4_header_t *ip,
        }
     }
 
-  ip_csum_t ip_sum = ip->checksum;
-  ip_sum = ip_csum_sub_even (ip_sum, f->l3_csum_delta);
-  ip->checksum = ip_csum_fold (ip_sum);
-  ASSERT (ip->checksum == ip4_header_checksum (ip));
+  if (skip_saddr_rewrite)
+    {
+      ip->checksum = ip4_header_checksum (ip);
+    }
+  else
+    {
+      ip_csum_t ip_sum = ip->checksum;
+      ip_sum = ip_csum_sub_even (ip_sum, f->l3_csum_delta);
+      ip->checksum = ip_csum_fold (ip_sum);
+    }
+  if (0xffff == ip->checksum)
+    ip->checksum = 0;
+  ASSERT (ip4_header_checksum_is_valid (ip));
 }
 
 static_always_inline int
@@ -3384,16 +3388,53 @@ nat_6t_flow_icmp_translate (snat_main_t *sm, vlib_buffer_t *b,
          nat_protocol_t inner_proto =
            ip_proto_to_nat_proto (inner_ip->protocol);
 
-         ip_csum_t icmp_sum = icmp->checksum;
+         ip_csum_t old_icmp_sum = icmp->checksum;
+         ip_csum_t old_inner_ip_sum = inner_ip->checksum;
+         ip_csum_t old_udp_sum;
+         ip_csum_t old_tcp_sum;
+         ip_csum_t new_icmp_sum;
+         udp_header_t *udp;
+         tcp_header_t *tcp;
 
          switch (inner_proto)
            {
            case NAT_PROTOCOL_UDP:
+             udp = (udp_header_t *) (inner_ip + 1);
+             old_udp_sum = udp->checksum;
+             nat_6t_flow_ip4_translate (sm, b, inner_ip, f, inner_proto,
+                                        1 /* is_icmp_inner_ip4 */,
+                                        0 /* skip_saddr_rewrite */);
+             new_icmp_sum = ip_csum_sub_even (old_icmp_sum, f->l3_csum_delta);
+             new_icmp_sum = ip_csum_sub_even (new_icmp_sum, f->l4_csum_delta);
+             new_icmp_sum =
+               ip_csum_update (new_icmp_sum, old_inner_ip_sum,
+                               inner_ip->checksum, ip4_header_t, checksum);
+             new_icmp_sum =
+               ip_csum_update (new_icmp_sum, old_udp_sum, udp->checksum,
+                               udp_header_t, checksum);
+             new_icmp_sum = ip_csum_fold (new_icmp_sum);
+             if (0xffff == new_icmp_sum)
+               new_icmp_sum = 0;
+             icmp->checksum = new_icmp_sum;
+             break;
            case NAT_PROTOCOL_TCP:
+             tcp = (tcp_header_t *) (inner_ip + 1);
+             old_tcp_sum = tcp->checksum;
              nat_6t_flow_ip4_translate (sm, b, inner_ip, f, inner_proto,
-                                        1 /* is_icmp_inner_ip4 */);
-             icmp_sum = ip_csum_sub_even (icmp_sum, f->l3_csum_delta);
-             icmp->checksum = ip_csum_fold (icmp_sum);
+                                        1 /* is_icmp_inner_ip4 */,
+                                        0 /* skip_saddr_rewrite */);
+             new_icmp_sum = ip_csum_sub_even (old_icmp_sum, f->l3_csum_delta);
+             new_icmp_sum = ip_csum_sub_even (new_icmp_sum, f->l4_csum_delta);
+             new_icmp_sum =
+               ip_csum_update (new_icmp_sum, old_inner_ip_sum,
+                               inner_ip->checksum, ip4_header_t, checksum);
+             new_icmp_sum =
+               ip_csum_update (new_icmp_sum, old_tcp_sum, tcp->checksum,
+                               tcp_header_t, checksum);
+             new_icmp_sum = ip_csum_fold (new_icmp_sum);
+             if (0xffff == new_icmp_sum)
+               new_icmp_sum = 0;
+             icmp->checksum = new_icmp_sum;
              break;
            case NAT_PROTOCOL_ICMP:
              if (f->ops & NAT_FLOW_OP_ICMP_ID_REWRITE)
@@ -3423,29 +3464,63 @@ nat_6t_flow_icmp_translate (snat_main_t *sm, vlib_buffer_t *b,
            }
        }
     }
+
   return NAT_ED_TRNSL_ERR_SUCCESS;
 }
 
-nat_translation_error_e
+static_always_inline nat_translation_error_e
 nat_6t_flow_buf_translate (snat_main_t *sm, vlib_buffer_t *b, ip4_header_t *ip,
                           nat_6t_flow_t *f, nat_protocol_t proto,
-                          int is_output_feature)
+                          int is_output_feature, int is_i2o)
 {
   if (!is_output_feature && f->ops & NAT_FLOW_OP_TXFIB_REWRITE)
     {
       vnet_buffer (b)->sw_if_index[VLIB_TX] = f->rewrite.fib_index;
     }
 
-  nat_6t_flow_ip4_translate (sm, b, ip, f, proto, 0 /* is_icmp_inner_ip4 */);
-
   if (NAT_PROTOCOL_ICMP == proto)
     {
+      if (ip->src_address.as_u32 != f->rewrite.saddr.as_u32)
+       {
+         // src differs from the flow's rewrite src: the packet was sent by
+         // a router, not the destination; on the o2i path keep that src
+         nat_6t_flow_ip4_translate (sm, b, ip, f, proto,
+                                    0 /* is_icmp_inner_ip4 */,
+                                    !is_i2o /* skip_saddr_rewrite */);
+       }
+      else
+       {
+         nat_6t_flow_ip4_translate (sm, b, ip, f, proto,
+                                    0 /* is_icmp_inner_ip4 */,
+                                    0 /* skip_saddr_rewrite */);
+       }
       return nat_6t_flow_icmp_translate (sm, b, ip, f);
     }
 
+  nat_6t_flow_ip4_translate (sm, b, ip, f, proto, 0 /* is_icmp_inner_ip4 */,
+                            0 /* skip_saddr_rewrite */);
+
   return NAT_ED_TRNSL_ERR_SUCCESS;
 }
 
+nat_translation_error_e
+nat_6t_flow_buf_translate_i2o (snat_main_t *sm, vlib_buffer_t *b,
+                              ip4_header_t *ip, nat_6t_flow_t *f,
+                              nat_protocol_t proto, int is_output_feature)
+{
+  return nat_6t_flow_buf_translate (sm, b, ip, f, proto, is_output_feature,
+                                   1 /* is_i2o */);
+}
+
+nat_translation_error_e
+nat_6t_flow_buf_translate_o2i (snat_main_t *sm, vlib_buffer_t *b,
+                              ip4_header_t *ip, nat_6t_flow_t *f,
+                              nat_protocol_t proto, int is_output_feature)
+{
+  return nat_6t_flow_buf_translate (sm, b, ip, f, proto, is_output_feature,
+                                   0 /* is_i2o */);
+}
+
 u8 *
 format_nat_6t (u8 *s, va_list *args)
 {