nat: ICMP handling fixes
[vpp.git] / src / plugins / nat / nat44-ed / nat44_ed.c
index 6a0b962..1a996cd 100644 (file)
@@ -36,6 +36,8 @@
 #include <nat/nat44-ed/nat44_ed_affinity.h>
 #include <nat/nat44-ed/nat44_ed_inlines.h>
 
+#include <vpp/stats/stat_segment.h>
+
 snat_main_t snat_main;
 
 static_always_inline void nat_validate_interface_counters (snat_main_t *sm,
@@ -147,12 +149,14 @@ VNET_FEATURE_INIT (ip4_snat_out2in_fast, static) = {
 VNET_FEATURE_INIT (ip4_snat_in2out_output, static) = {
   .arc_name = "ip4-output",
   .node_name = "nat44-in2out-output",
-  .runs_after = VNET_FEATURES ("acl-plugin-out-ip4-fa","ip4-sv-reassembly-output-feature"),
+  .runs_after = VNET_FEATURES ("ip4-sv-reassembly-output-feature"),
+  .runs_before = VNET_FEATURES ("acl-plugin-out-ip4-fa"), /* this NAT node is ordered ahead of the ACL feature on the arc */
 };
 VNET_FEATURE_INIT (ip4_snat_in2out_output_worker_handoff, static) = {
   .arc_name = "ip4-output",
   .node_name = "nat44-in2out-output-worker-handoff",
-  .runs_after = VNET_FEATURES ("acl-plugin-out-ip4-fa","ip4-sv-reassembly-output-feature"),
+  .runs_after = VNET_FEATURES ("ip4-sv-reassembly-output-feature"),
+  .runs_before = VNET_FEATURES ("acl-plugin-out-ip4-fa"), /* same ordering as nat44-in2out-output: ahead of the ACL feature */
 };
 VNET_FEATURE_INIT (nat_pre_in2out_output, static) = {
   .arc_name = "ip4-output",
@@ -2023,6 +2027,8 @@ nat_init (vlib_main_t * vm)
 
   nat_init_simple_counter (sm->total_sessions, "total-sessions",
                           "/nat44-ed/total-sessions");
+  sm->max_cfg_sessions_gauge = stat_segment_new_entry (
+    (u8 *) "/nat44-ed/max-cfg-sessions", STAT_DIR_TYPE_SCALAR_INDEX);
 
 #define _(x)                                                                  \
   nat_init_simple_counter (sm->counters.fastpath.in2out.x, #x,                \
@@ -2121,6 +2127,8 @@ nat44_plugin_enable (nat44_config_t c)
     c.sessions = 63 * 1024;
 
   sm->max_translations_per_thread = c.sessions;
+  stat_segment_set_state_counter (sm->max_cfg_sessions_gauge,
+                                 sm->max_translations_per_thread);
   sm->translation_buckets = nat_calc_bihash_buckets (c.sessions);
 
   // ED only feature
@@ -2601,7 +2609,6 @@ nat44_ed_get_out2in_worker_index (vlib_buffer_t *b, ip4_header_t *ip,
   snat_main_t *sm = &snat_main;
   clib_bihash_kv_8_8_t kv, value;
   clib_bihash_kv_16_8_t kv16, value16;
-  snat_main_per_thread_data_t *tsm;
 
   u32 proto, next_worker_index = 0;
   u16 port;
@@ -2610,29 +2617,7 @@ nat44_ed_get_out2in_worker_index (vlib_buffer_t *b, ip4_header_t *ip,
 
   proto = ip_proto_to_nat_proto (ip->protocol);
 
-  if (PREDICT_TRUE (proto == NAT_PROTOCOL_UDP || proto == NAT_PROTOCOL_TCP))
-    {
-      init_ed_k (&kv16, ip->dst_address, vnet_buffer (b)->ip.reass.l4_dst_port,
-                ip->src_address, vnet_buffer (b)->ip.reass.l4_src_port,
-                rx_fib_index, ip->protocol);
-
-      if (PREDICT_TRUE (
-           !clib_bihash_search_16_8 (&sm->flow_hash, &kv16, &value16)))
-       {
-         tsm =
-           vec_elt_at_index (sm->per_thread_data,
-                             ed_value_get_thread_index (&value16));
-         vnet_buffer2 (b)->nat.cached_session_index =
-           ed_value_get_session_index (&value16);
-         next_worker_index = sm->first_worker_index + tsm->thread_index;
-         nat_elog_debug_handoff (
-           sm, "HANDOFF OUT2IN (session)", next_worker_index, rx_fib_index,
-           clib_net_to_host_u32 (ip->src_address.as_u32),
-           clib_net_to_host_u32 (ip->dst_address.as_u32));
-         return next_worker_index;
-       }
-    }
-  else if (proto == NAT_PROTOCOL_ICMP)
+  if (PREDICT_FALSE (proto == NAT_PROTOCOL_ICMP))
     {
       ip4_address_t lookup_saddr, lookup_daddr;
       u16 lookup_sport, lookup_dport;
@@ -2646,10 +2631,7 @@ nat44_ed_get_out2in_worker_index (vlib_buffer_t *b, ip4_header_t *ip,
          if (PREDICT_TRUE (
                !clib_bihash_search_16_8 (&sm->flow_hash, &kv16, &value16)))
            {
-             tsm =
-               vec_elt_at_index (sm->per_thread_data,
-                                 ed_value_get_thread_index (&value16));
-             next_worker_index = sm->first_worker_index + tsm->thread_index;
+             next_worker_index = ed_value_get_thread_index (&value16);
              nat_elog_debug_handoff (
                sm, "HANDOFF OUT2IN (session)", next_worker_index,
                rx_fib_index, clib_net_to_host_u32 (ip->src_address.as_u32),
@@ -2659,6 +2641,23 @@ nat44_ed_get_out2in_worker_index (vlib_buffer_t *b, ip4_header_t *ip,
        }
     }
 
+  init_ed_k (&kv16, ip->src_address, vnet_buffer (b)->ip.reass.l4_src_port,
+            ip->dst_address, vnet_buffer (b)->ip.reass.l4_dst_port,
+            rx_fib_index, ip->protocol);
+
+  if (PREDICT_TRUE (
+       !clib_bihash_search_16_8 (&sm->flow_hash, &kv16, &value16)))
+    {
+      vnet_buffer2 (b)->nat.cached_session_index =
+       ed_value_get_session_index (&value16);
+      next_worker_index = ed_value_get_thread_index (&value16);
+      nat_elog_debug_handoff (sm, "HANDOFF OUT2IN (session)",
+                             next_worker_index, rx_fib_index,
+                             clib_net_to_host_u32 (ip->src_address.as_u32),
+                             clib_net_to_host_u32 (ip->dst_address.as_u32));
+      return next_worker_index;
+    }
+
   /* first try static mappings without port */
   if (PREDICT_FALSE (pool_elts (sm->static_mappings)))
     {
@@ -2795,6 +2794,9 @@ nat44_update_session_limit (u32 session_limit, u32 vrf_id)
     return 1;
   sm->max_translations_per_thread = nat44_get_max_session_limit ();
 
+  stat_segment_set_state_counter (sm->max_cfg_sessions_gauge,
+                                 sm->max_translations_per_thread);
+
   sm->translation_buckets =
     nat_calc_bihash_buckets (sm->max_translations_per_thread);
 
@@ -3274,7 +3276,7 @@ static_always_inline int nat_6t_flow_icmp_translate (snat_main_t *sm,
 static_always_inline void
 nat_6t_flow_ip4_translate (snat_main_t *sm, vlib_buffer_t *b, ip4_header_t *ip,
                           nat_6t_flow_t *f, nat_protocol_t proto,
-                          int is_icmp_inner_ip4)
+                          int is_icmp_inner_ip4, int skip_saddr_rewrite)
 {
   udp_header_t *udp = ip4_next_header (ip);
   tcp_header_t *tcp = (tcp_header_t *) udp;
@@ -3317,7 +3319,10 @@ nat_6t_flow_ip4_translate (snat_main_t *sm, vlib_buffer_t *b, ip4_header_t *ip,
     {
       if (!is_icmp_inner_ip4)
        { // regular case
-         ip->src_address = f->rewrite.saddr;
+         if (!skip_saddr_rewrite)
+           {
+             ip->src_address = f->rewrite.saddr;
+           }
          ip->dst_address = f->rewrite.daddr;
        }
       else
@@ -3327,9 +3332,16 @@ nat_6t_flow_ip4_translate (snat_main_t *sm, vlib_buffer_t *b, ip4_header_t *ip,
        }
     }
 
-  ip_csum_t ip_sum = ip->checksum;
-  ip_sum = ip_csum_sub_even (ip_sum, f->l3_csum_delta);
-  ip->checksum = ip_csum_fold (ip_sum);
+  if (skip_saddr_rewrite)
+    {
+      ip->checksum = ip4_header_checksum (ip);
+    }
+  else
+    {
+      ip_csum_t ip_sum = ip->checksum;
+      ip_sum = ip_csum_sub_even (ip_sum, f->l3_csum_delta);
+      ip->checksum = ip_csum_fold (ip_sum);
+    }
   if (0xffff == ip->checksum)
     ip->checksum = 0;
   ASSERT (ip4_header_checksum_is_valid (ip));
@@ -3376,16 +3388,53 @@ nat_6t_flow_icmp_translate (snat_main_t *sm, vlib_buffer_t *b,
          nat_protocol_t inner_proto =
            ip_proto_to_nat_proto (inner_ip->protocol);
 
-         ip_csum_t icmp_sum = icmp->checksum;
+         ip_csum_t old_icmp_sum = icmp->checksum;
+         ip_csum_t old_inner_ip_sum = inner_ip->checksum;
+         ip_csum_t old_udp_sum;
+         ip_csum_t old_tcp_sum;
+         ip_csum_t new_icmp_sum;
+         udp_header_t *udp;
+         tcp_header_t *tcp;
 
          switch (inner_proto)
            {
            case NAT_PROTOCOL_UDP:
+             udp = (udp_header_t *) (inner_ip + 1);
+             old_udp_sum = udp->checksum;
+             nat_6t_flow_ip4_translate (sm, b, inner_ip, f, inner_proto,
+                                        1 /* is_icmp_inner_ip4 */,
+                                        0 /* skip_saddr_rewrite */);
+             new_icmp_sum = ip_csum_sub_even (old_icmp_sum, f->l3_csum_delta);
+             new_icmp_sum = ip_csum_sub_even (new_icmp_sum, f->l4_csum_delta);
+             new_icmp_sum =
+               ip_csum_update (new_icmp_sum, old_inner_ip_sum,
+                               inner_ip->checksum, ip4_header_t, checksum);
+             new_icmp_sum =
+               ip_csum_update (new_icmp_sum, old_udp_sum, udp->checksum,
+                               udp_header_t, checksum);
+             new_icmp_sum = ip_csum_fold (new_icmp_sum);
+             if (0xffff == new_icmp_sum)
+               new_icmp_sum = 0;
+             icmp->checksum = new_icmp_sum;
+             break;
            case NAT_PROTOCOL_TCP:
+             tcp = (tcp_header_t *) (inner_ip + 1);
+             old_tcp_sum = tcp->checksum;
              nat_6t_flow_ip4_translate (sm, b, inner_ip, f, inner_proto,
-                                        1 /* is_icmp_inner_ip4 */);
-             icmp_sum = ip_csum_sub_even (icmp_sum, f->l3_csum_delta);
-             icmp->checksum = ip_csum_fold (icmp_sum);
+                                        1 /* is_icmp_inner_ip4 */,
+                                        0 /* skip_saddr_rewrite */);
+             new_icmp_sum = ip_csum_sub_even (old_icmp_sum, f->l3_csum_delta);
+             new_icmp_sum = ip_csum_sub_even (new_icmp_sum, f->l4_csum_delta);
+             new_icmp_sum =
+               ip_csum_update (new_icmp_sum, old_inner_ip_sum,
+                               inner_ip->checksum, ip4_header_t, checksum);
+             new_icmp_sum =
+               ip_csum_update (new_icmp_sum, old_tcp_sum, tcp->checksum,
+                               tcp_header_t, checksum);
+             new_icmp_sum = ip_csum_fold (new_icmp_sum);
+             if (0xffff == new_icmp_sum)
+               new_icmp_sum = 0;
+             icmp->checksum = new_icmp_sum;
              break;
            case NAT_PROTOCOL_ICMP:
              if (f->ops & NAT_FLOW_OP_ICMP_ID_REWRITE)
@@ -3415,29 +3464,63 @@ nat_6t_flow_icmp_translate (snat_main_t *sm, vlib_buffer_t *b,
            }
        }
     }
+
   return NAT_ED_TRNSL_ERR_SUCCESS;
 }
 
-nat_translation_error_e
+static_always_inline nat_translation_error_e
 nat_6t_flow_buf_translate (snat_main_t *sm, vlib_buffer_t *b, ip4_header_t *ip,
                           nat_6t_flow_t *f, nat_protocol_t proto,
-                          int is_output_feature)
+                          int is_output_feature, int is_i2o)
 {
   if (!is_output_feature && f->ops & NAT_FLOW_OP_TXFIB_REWRITE)
     {
       vnet_buffer (b)->sw_if_index[VLIB_TX] = f->rewrite.fib_index;
     }
 
-  nat_6t_flow_ip4_translate (sm, b, ip, f, proto, 0 /* is_icmp_inner_ip4 */);
-
   if (NAT_PROTOCOL_ICMP == proto)
     {
+      if (ip->src_address.as_u32 != f->rewrite.saddr.as_u32)
+       {
+         // source != flow's rewrite source: the ICMP packet came back from
+         // a router, not the destination, so the o2i path keeps its source
+         nat_6t_flow_ip4_translate (sm, b, ip, f, proto,
+                                    0 /* is_icmp_inner_ip4 */,
+                                    !is_i2o /* skip_saddr_rewrite: o2i only */);
+       }
+      else
+       {
+         nat_6t_flow_ip4_translate (sm, b, ip, f, proto,
+                                    0 /* is_icmp_inner_ip4 */,
+                                    0 /* skip_saddr_rewrite: source matches */);
+       }
       return nat_6t_flow_icmp_translate (sm, b, ip, f);
     }
 
+  nat_6t_flow_ip4_translate (sm, b, ip, f, proto, 0 /* is_icmp_inner_ip4 */,
+                            0 /* skip_saddr_rewrite: ICMP-only option */);
+
   return NAT_ED_TRNSL_ERR_SUCCESS;
 }
 
+nat_translation_error_e
+nat_6t_flow_buf_translate_i2o (snat_main_t *sm, vlib_buffer_t *b,
+                              ip4_header_t *ip, nat_6t_flow_t *f,
+                              nat_protocol_t proto, int is_output_feature)
+{
+  return nat_6t_flow_buf_translate (sm, b, ip, f, proto, is_output_feature,
+                                   1 /* is_i2o: saddr rewrite never skipped */);
+}
+
+nat_translation_error_e
+nat_6t_flow_buf_translate_o2i (snat_main_t *sm, vlib_buffer_t *b,
+                              ip4_header_t *ip, nat_6t_flow_t *f,
+                              nat_protocol_t proto, int is_output_feature)
+{
+  return nat_6t_flow_buf_translate (sm, b, ip, f, proto, is_output_feature,
+                                   0 /* is_i2o: ICMP saddr rewrite may be skipped */);
+}
+
 u8 *
 format_nat_6t (u8 *s, va_list *args)
 {