From efcd1e9e1d7dda4e4ea3db5750925cd8f6894f4d Mon Sep 17 00:00:00 2001 From: Matus Fabian Date: Tue, 15 Aug 2017 06:59:19 -0700 Subject: [PATCH] SNAT: IP fragmentation (VPP-890) Translation of fragmented packets. Change-Id: I9b1f2e9433ce273638080f32c2d3bff39c49899d Signed-off-by: Matus Fabian --- src/plugins/nat.am | 1 + src/plugins/nat/in2out.c | 417 ++++++++++++++++++++++- src/plugins/nat/nat.api | 81 +++++ src/plugins/nat/nat.c | 8 +- src/plugins/nat/nat.h | 32 +- src/plugins/nat/nat64_db.c | 46 +++ src/plugins/nat/nat64_db.h | 21 ++ src/plugins/nat/nat64_in2out.c | 505 +++++++++++++++++++++++++++- src/plugins/nat/nat64_out2in.c | 409 ++++++++++++++++++++++- src/plugins/nat/nat_api.c | 144 ++++++++ src/plugins/nat/nat_reass.c | 739 +++++++++++++++++++++++++++++++++++++++++ src/plugins/nat/nat_reass.h | 293 ++++++++++++++++ src/plugins/nat/out2in.c | 336 ++++++++++++++++++- test/test_nat.py | 414 ++++++++++++++++++++++- test/vpp_papi_provider.py | 37 +++ 15 files changed, 3448 insertions(+), 35 deletions(-) create mode 100644 src/plugins/nat/nat_reass.c create mode 100644 src/plugins/nat/nat_reass.h diff --git a/src/plugins/nat.am b/src/plugins/nat.am index add82f081c9..b6c369fe324 100644 --- a/src/plugins/nat.am +++ b/src/plugins/nat.am @@ -22,6 +22,7 @@ nat_plugin_la_SOURCES = nat/nat.c \ nat/nat_plugin.api.h \ nat/nat_ipfix_logging.c \ nat/nat_det.c \ + nat/nat_reass.c \ nat/nat64.c \ nat/nat64_cli.c \ nat/nat64_in2out.c \ diff --git a/src/plugins/nat/in2out.c b/src/plugins/nat/in2out.c index b0593900a93..e4dbe917ce8 100755 --- a/src/plugins/nat/in2out.c +++ b/src/plugins/nat/in2out.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -83,6 +84,25 @@ static u8 * format_snat_in2out_worker_handoff_trace (u8 * s, va_list * args) return s; } +typedef struct { + u32 sw_if_index; + u32 next_index; + u8 cached; +} nat44_in2out_reass_trace_t; + +static u8 * format_nat44_in2out_reass_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + nat44_in2out_reass_trace_t * t = va_arg (*args, nat44_in2out_reass_trace_t *); + + s = format (s, "NAT44_IN2OUT_REASS: sw_if_index %d, next index %d, status %s", + t->sw_if_index, t->next_index, + t->cached ? 
"cached" : "translated"); + + return s; +} + vlib_node_registration_t snat_in2out_node; vlib_node_registration_t snat_in2out_slowpath_node; vlib_node_registration_t snat_in2out_fast_node; @@ -94,6 +114,7 @@ vlib_node_registration_t snat_in2out_output_worker_handoff_node; vlib_node_registration_t snat_hairpin_dst_node; vlib_node_registration_t snat_hairpin_src_node; vlib_node_registration_t nat44_hairpinning_node; +vlib_node_registration_t nat44_in2out_reass_node; #define foreach_snat_in2out_error \ @@ -103,7 +124,10 @@ _(OUT_OF_PORTS, "Out of ports") \ _(BAD_OUTSIDE_FIB, "Outside VRF ID not found") \ _(BAD_ICMP_TYPE, "unsupported ICMP type") \ _(NO_TRANSLATION, "No translation") \ -_(MAX_SESSIONS_EXCEEDED, "Maximum sessions exceeded") +_(MAX_SESSIONS_EXCEEDED, "Maximum sessions exceeded") \ +_(DROP_FRAGMENT, "Drop fragment") \ +_(MAX_REASS, "Maximum reassemblies exceeded") \ +_(MAX_FRAG, "Maximum fragments per reassembly exceeded") typedef enum { #define _(sym,str) SNAT_IN2OUT_ERROR_##sym, @@ -123,6 +147,7 @@ typedef enum { SNAT_IN2OUT_NEXT_DROP, SNAT_IN2OUT_NEXT_ICMP_ERROR, SNAT_IN2OUT_NEXT_SLOW_PATH, + SNAT_IN2OUT_NEXT_REASS, SNAT_IN2OUT_N_NEXT, } snat_in2out_next_t; @@ -243,6 +268,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, u32 address_index = ~0; u32 outside_fib_index; uword * p; + udp_header_t * udp0 = ip4_next_header (ip0); if (PREDICT_FALSE (maximum_sessions_exceeded(sm, thread_index))) { @@ -443,6 +469,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, s->out2in.protocol = key0->protocol; s->out2in.fib_index = outside_fib_index; s->ext_host_addr.as_u32 = ip0->dst_address.as_u32; + s->ext_host_port = udp0->dst_port; *sessionp = s; /* Add to translation hashes */ @@ -1645,6 +1672,12 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, next0 = SNAT_IN2OUT_NEXT_SLOW_PATH; goto trace00; } + + if (ip4_is_fragment (ip0)) + { + next0 = SNAT_IN2OUT_NEXT_REASS; + goto trace00; + } } key0.addr = ip0->src_address; @@ -1819,6 +1852,12 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, next1 = SNAT_IN2OUT_NEXT_SLOW_PATH; goto trace01; } + + if (ip4_is_fragment (ip1)) + { + next0 = SNAT_IN2OUT_NEXT_REASS; + goto trace01; + } } b1->flags |= VNET_BUFFER_F_IS_NATED; @@ -2029,6 +2068,12 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, next0 = SNAT_IN2OUT_NEXT_SLOW_PATH; goto trace0; } + + if (ip4_is_fragment (ip0)) + { + next0 = SNAT_IN2OUT_NEXT_REASS; + goto trace0; + } } key0.addr = ip0->src_address; @@ -2194,6 +2239,7 @@ VLIB_REGISTER_NODE (snat_in2out_node) = { [SNAT_IN2OUT_NEXT_LOOKUP] = "ip4-lookup", [SNAT_IN2OUT_NEXT_SLOW_PATH] = "nat44-in2out-slowpath", [SNAT_IN2OUT_NEXT_ICMP_ERROR] = "ip4-icmp-error", + [SNAT_IN2OUT_NEXT_REASS] = "nat44-in2out-reass", }, }; @@ -2227,6 +2273,7 @@ VLIB_REGISTER_NODE (snat_in2out_output_node) = { [SNAT_IN2OUT_NEXT_LOOKUP] = "interface-output", [SNAT_IN2OUT_NEXT_SLOW_PATH] = "nat44-in2out-output-slowpath", [SNAT_IN2OUT_NEXT_ICMP_ERROR] = "ip4-icmp-error", + [SNAT_IN2OUT_NEXT_REASS] = "nat44-in2out-reass", }, }; @@ -2261,6 +2308,7 @@ VLIB_REGISTER_NODE (snat_in2out_slowpath_node) = { [SNAT_IN2OUT_NEXT_LOOKUP] = "ip4-lookup", [SNAT_IN2OUT_NEXT_SLOW_PATH] = "nat44-in2out-slowpath", [SNAT_IN2OUT_NEXT_ICMP_ERROR] = "ip4-icmp-error", + [SNAT_IN2OUT_NEXT_REASS] = "nat44-in2out-reass", }, }; @@ -2295,6 +2343,7 @@ VLIB_REGISTER_NODE (snat_in2out_output_slowpath_node) = { [SNAT_IN2OUT_NEXT_LOOKUP] = "interface-output", [SNAT_IN2OUT_NEXT_SLOW_PATH] = "nat44-in2out-output-slowpath", [SNAT_IN2OUT_NEXT_ICMP_ERROR] = "ip4-icmp-error", + 
[SNAT_IN2OUT_NEXT_REASS] = "nat44-in2out-reass", }, }; @@ -2392,6 +2441,371 @@ VLIB_REGISTER_NODE (nat44_hairpinning_node) = { VLIB_NODE_FUNCTION_MULTIARCH (nat44_hairpinning_node, nat44_hairpinning_fn); +static inline void +nat44_reass_hairpinning (snat_main_t *sm, + vlib_buffer_t * b0, + ip4_header_t * ip0, + u16 sport, + u16 dport, + u32 proto0) +{ + snat_session_key_t key0, sm0; + snat_session_t * s0; + clib_bihash_kv_8_8_t kv0, value0; + ip_csum_t sum0; + u32 new_dst_addr0 = 0, old_dst_addr0, ti = 0, si; + u16 new_dst_port0, old_dst_port0; + udp_header_t * udp0; + tcp_header_t * tcp0; + + key0.addr = ip0->dst_address; + key0.port = dport; + key0.protocol = proto0; + key0.fib_index = sm->outside_fib_index; + kv0.key = key0.as_u64; + + udp0 = ip4_next_header (ip0); + + /* Check if destination is static mappings */ + if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0)) + { + new_dst_addr0 = sm0.addr.as_u32; + new_dst_port0 = sm0.port; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = sm0.fib_index; + } + /* or active sessions */ + else + { + if (sm->num_workers > 1) + ti = (clib_net_to_host_u16 (udp0->dst_port) - 1024) / sm->port_per_thread; + else + ti = sm->num_workers; + + if (!clib_bihash_search_8_8 (&sm->per_thread_data[ti].out2in, &kv0, &value0)) + { + si = value0.value; + s0 = pool_elt_at_index (sm->per_thread_data[ti].sessions, si); + new_dst_addr0 = s0->in2out.addr.as_u32; + new_dst_port0 = s0->in2out.port; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = s0->in2out.fib_index; + } + } + + /* Destination is behind the same NAT, use internal address and port */ + if (new_dst_addr0) + { + old_dst_addr0 = ip0->dst_address.as_u32; + ip0->dst_address.as_u32 = new_dst_addr0; + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_dst_addr0, new_dst_addr0, + ip4_header_t, dst_address); + ip0->checksum = ip_csum_fold (sum0); + + old_dst_port0 = dport; + if (PREDICT_TRUE(new_dst_port0 != old_dst_port0 && + ip4_is_first_fragment (ip0))) + { + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + tcp0 = ip4_next_header (ip0); + tcp0->dst = new_dst_port0; + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_dst_addr0, new_dst_addr0, + ip4_header_t, dst_address); + sum0 = ip_csum_update (sum0, old_dst_port0, new_dst_port0, + ip4_header_t /* cheat */, length); + tcp0->checksum = ip_csum_fold(sum0); + } + else + { + udp0->dst_port = new_dst_port0; + udp0->checksum = 0; + } + } + else + { + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + tcp0 = ip4_next_header (ip0); + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_dst_addr0, new_dst_addr0, + ip4_header_t, dst_address); + tcp0->checksum = ip_csum_fold(sum0); + } + } + } +} + +static uword +nat44_in2out_reass_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_left_from, *from, *to_next; + snat_in2out_next_t next_index; + u32 pkts_processed = 0; + snat_main_t *sm = &snat_main; + f64 now = vlib_time_now (vm); + u32 thread_index = vlib_get_thread_index (); + snat_main_per_thread_data_t *per_thread_data = + &sm->per_thread_data[thread_index]; + u32 *fragments_to_drop = 0; + u32 *fragments_to_loopback = 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0, sw_if_index0, proto0, rx_fib_index0, new_addr0, old_addr0; + vlib_buffer_t *b0; + u32 next0; 
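+      /* Virtual reassembly keeps no full packet buffer: the first fragment
+       * (the only one carrying the L4 header) creates or finds the NAT
+       * session and stores its index in the reassembly context; non-first
+       * fragments that arrive before it are cached on the reassembly and
+       * re-injected later, which is what cached0 below marks. */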
+ u8 cached0 = 0; + ip4_header_t *ip0; + nat_reass_ip4_t *reass0; + udp_header_t * udp0; + tcp_header_t * tcp0; + snat_session_key_t key0; + clib_bihash_kv_8_8_t kv0, value0; + snat_session_t * s0 = 0; + u16 old_port0, new_port0; + ip_csum_t sum0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + next0 = SNAT_IN2OUT_NEXT_LOOKUP; + + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + rx_fib_index0 = fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP4, + sw_if_index0); + + if (PREDICT_FALSE (nat_reass_is_drop_frag(0))) + { + next0 = SNAT_IN2OUT_NEXT_DROP; + b0->error = node->errors[SNAT_IN2OUT_ERROR_DROP_FRAGMENT]; + goto trace0; + } + + ip0 = (ip4_header_t *) vlib_buffer_get_current (b0); + udp0 = ip4_next_header (ip0); + tcp0 = (tcp_header_t *) udp0; + proto0 = ip_proto_to_snat_proto (ip0->protocol); + + reass0 = nat_ip4_reass_find_or_create (ip0->src_address, + ip0->dst_address, + ip0->fragment_id, + ip0->protocol, + 1, + &fragments_to_drop); + + if (PREDICT_FALSE (!reass0)) + { + next0 = SNAT_IN2OUT_NEXT_DROP; + b0->error = node->errors[SNAT_IN2OUT_ERROR_MAX_REASS]; + goto trace0; + } + + if (PREDICT_FALSE (ip4_is_first_fragment (ip0))) + { + key0.addr = ip0->src_address; + key0.port = udp0->src_port; + key0.protocol = proto0; + key0.fib_index = rx_fib_index0; + kv0.key = key0.as_u64; + + if (clib_bihash_search_8_8 (&per_thread_data->in2out, &kv0, &value0)) + { + if (PREDICT_FALSE(snat_not_translate(sm, node, sw_if_index0, + ip0, proto0, rx_fib_index0, thread_index))) + goto trace0; + + next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0, + &s0, node, next0, thread_index); + + if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP)) + goto trace0; + + reass0->sess_index = s0 - per_thread_data->sessions; + } + else + { + s0 = pool_elt_at_index (per_thread_data->sessions, + value0.value); + reass0->sess_index = value0.value; + } + nat_ip4_reass_get_frags (reass0, &fragments_to_loopback); + } + else + { + if (PREDICT_FALSE (reass0->sess_index == (u32) ~0)) + { + if (nat_ip4_reass_add_fragment (reass0, bi0)) + { + b0->error = node->errors[SNAT_IN2OUT_ERROR_MAX_FRAG]; + next0 = SNAT_IN2OUT_NEXT_DROP; + goto trace0; + } + cached0 = 1; + goto trace0; + } + s0 = pool_elt_at_index (per_thread_data->sessions, + reass0->sess_index); + } + + old_addr0 = ip0->src_address.as_u32; + ip0->src_address = s0->out2in.addr; + new_addr0 = ip0->src_address.as_u32; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = s0->out2in.fib_index; + + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + src_address /* changed member */); + ip0->checksum = ip_csum_fold (sum0); + + if (PREDICT_FALSE (ip4_is_first_fragment (ip0))) + { + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + old_port0 = tcp0->src_port; + tcp0->src_port = s0->out2in.port; + new_port0 = tcp0->src_port; + + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + dst_address /* changed member */); + sum0 = ip_csum_update (sum0, old_port0, new_port0, + ip4_header_t /* cheat */, + length /* changed member */); + tcp0->checksum = ip_csum_fold(sum0); + } + else + { + old_port0 = udp0->src_port; + udp0->src_port = s0->out2in.port; + udp0->checksum = 0; + } + } + + /* Hairpinning */ + nat44_reass_hairpinning (sm, b0, ip0, s0->out2in.port, + s0->ext_host_port, proto0); + + /* Accounting */ + s0->last_heard = now; + 
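+          /* Counters and per-user LRU upkeep run only for fragments
+           * translated in this pass; cached or dropped fragments jump to
+           * trace0 and skip this block. */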
s0->total_pkts++; + s0->total_bytes += vlib_buffer_length_in_chain (vm, b0); + /* Per-user LRU list maintenance for dynamic translation */ + if (!snat_is_session_static (s0)) + { + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, + s0->per_user_index); + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, + s0->per_user_list_head_index, + s0->per_user_index); + } + + trace0: + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + nat44_in2out_reass_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->cached = cached0; + t->sw_if_index = sw_if_index0; + t->next_index = next0; + } + + if (cached0) + { + n_left_to_next++; + to_next--; + } + else + { + pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + if (n_left_from == 0 && vec_len (fragments_to_loopback)) + { + from = vlib_frame_vector_args (frame); + u32 len = vec_len (fragments_to_loopback); + if (len <= VLIB_FRAME_SIZE) + { + clib_memcpy (from, fragments_to_loopback, sizeof (u32) * len); + n_left_from = len; + vec_reset_length (fragments_to_loopback); + } + else + { + clib_memcpy (from, + fragments_to_loopback + (len - VLIB_FRAME_SIZE), + sizeof (u32) * VLIB_FRAME_SIZE); + n_left_from = VLIB_FRAME_SIZE; + _vec_len (fragments_to_loopback) = len - VLIB_FRAME_SIZE; + } + } + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, nat44_in2out_reass_node.index, + SNAT_IN2OUT_ERROR_IN2OUT_PACKETS, + pkts_processed); + + nat_send_all_to_node (vm, fragments_to_drop, node, + &node->errors[SNAT_IN2OUT_ERROR_DROP_FRAGMENT], + SNAT_IN2OUT_NEXT_DROP); + + vec_free (fragments_to_drop); + vec_free (fragments_to_loopback); + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (nat44_in2out_reass_node) = { + .function = nat44_in2out_reass_node_fn, + .name = "nat44-in2out-reass", + .vector_size = sizeof (u32), + .format_trace = format_nat44_in2out_reass_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(snat_in2out_error_strings), + .error_strings = snat_in2out_error_strings, + + .n_next_nodes = SNAT_IN2OUT_N_NEXT, + .next_nodes = { + [SNAT_IN2OUT_NEXT_DROP] = "error-drop", + [SNAT_IN2OUT_NEXT_LOOKUP] = "ip4-lookup", + [SNAT_IN2OUT_NEXT_SLOW_PATH] = "nat44-in2out-slowpath", + [SNAT_IN2OUT_NEXT_ICMP_ERROR] = "ip4-icmp-error", + [SNAT_IN2OUT_NEXT_REASS] = "nat44-in2out-reass", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (nat44_in2out_reass_node, + nat44_in2out_reass_node_fn); + /**************************/ /*** deterministic mode ***/ /**************************/ @@ -3771,6 +4185,7 @@ VLIB_REGISTER_NODE (snat_in2out_fast_node) = { [SNAT_IN2OUT_NEXT_LOOKUP] = "ip4-lookup", [SNAT_IN2OUT_NEXT_SLOW_PATH] = "nat44-in2out-slowpath", [SNAT_IN2OUT_NEXT_ICMP_ERROR] = "ip4-icmp-error", + [SNAT_IN2OUT_NEXT_REASS] = "nat44-in2out-reass", }, }; diff --git a/src/plugins/nat/nat.api b/src/plugins/nat/nat.api index 187de25c8c1..d8fdf7283cf 100644 --- a/src/plugins/nat/nat.api +++ b/src/plugins/nat/nat.api @@ -760,6 +760,87 @@ autoreply define nat_ipfix_enable_disable { u8 enable; }; +/** \brief Set NAT virtual fragmentation reassembly + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param timeout - reassembly timeout + @param max_reass - maximum number of concurrent 
reassemblies + @param max_frag - maximum number of fragmets per reassembly + @param drop_frag - if 0 translate fragments, otherwise drop fragments + @param is_ip6 - 1 if IPv6, 0 if IPv4 +*/ +autoreply define nat_set_reass { + u32 client_index; + u32 context; + u32 timeout; + u16 max_reass; + u8 max_frag; + u8 drop_frag; + u8 is_ip6; +}; + +/** \brief Get NAT virtual fragmentation reassembly configuration + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ +define nat_get_reass { + u32 client_index; + u32 context; +}; + +/** \brief Get NAT virtual fragmentation reassembly configuration reply + @param context - sender context, to match reply w/ request + @param retval - return code + @param ip4_timeout - reassembly timeout + @param ip4_max_reass - maximum number of concurrent reassemblies + @param ip4_max_frag - maximum number of fragmets per reassembly + @param ip4_drop_frag - if 0 translate fragments, otherwise drop fragments + @param ip6_timeout - reassembly timeout + @param ip6_max_reass - maximum number of concurrent reassemblies + @param ip6_max_frag - maximum number of fragmets per reassembly + @param ip6_drop_frag - if 0 translate fragments, otherwise drop fragments +*/ +define nat_get_reass_reply { + u32 context; + i32 retval; + u32 ip4_timeout; + u16 ip4_max_reass; + u8 ip4_max_frag; + u8 ip4_drop_frag; + u32 ip6_timeout; + u16 ip6_max_reass; + u8 ip6_max_frag; + u8 ip6_drop_frag; +}; + +/** \brief Dump NAT virtual fragmentation reassemblies + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ +define nat_reass_dump { + u32 client_index; + u32 context; +}; + +/** \brief NAT virtual fragmentation reassemblies response + @param context - sender context, to match reply w/ request + @param is_ip4 - 1 if address type is IPv4 + @param src_addr - source IP address + @param dst_addr - destination IP address + @param frag_id - fragment ID + @param proto - protocol + @param frag_n - number of cached fragments +*/ +define nat_reass_details { + u32 context; + u8 is_ip4; + u8 src_addr[16]; + u8 dst_addr[16]; + u32 frag_id; + u8 proto; + u8 frag_n; +}; + /* * NAT44 APIs */ diff --git a/src/plugins/nat/nat.c b/src/plugins/nat/nat.c index cd5a6eb8fa8..7e651e5e9e5 100644 --- a/src/plugins/nat/nat.c +++ b/src/plugins/nat/nat.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -1447,11 +1448,15 @@ static clib_error_t * snat_init (vlib_main_t * vm) /* Init IPFIX logging */ snat_ipfix_logging_init(vm); + /* Init NAT64 */ error = nat64_init(vm); + if (error) + return error; dslite_init(vm); - return error; + /* Init virtual fragmenentation reassembly */ + return nat_reass_init(vm); } VLIB_INIT_FUNCTION (snat_init); @@ -2889,6 +2894,7 @@ show_snat_command_fn (vlib_main_t * vm, } } } + return 0; } diff --git a/src/plugins/nat/nat.h b/src/plugins/nat/nat.h index b72e075df35..5bd0a119f38 100644 --- a/src/plugins/nat/nat.h +++ b/src/plugins/nat/nat.h @@ -154,9 +154,9 @@ typedef CLIB_PACKED(struct { /* Outside address */ u32 outside_address_index; /* 64-67 */ - /* External host address */ + /* External host address and port */ ip4_address_t ext_host_addr; /* 68-71 */ - + u16 ext_host_port; /* 72-73 */ }) snat_session_t; @@ -563,4 +563,30 @@ maximum_sessions_exceeded (snat_main_t *sm, u32 thread_index) return 0; } -#endif /* __included_nat_h__ */ +static_always_inline void +nat_send_all_to_node(vlib_main_t *vm, u32 *bi_vector, + 
vlib_node_runtime_t *node, vlib_error_t *error, u32 next) +{ + u32 n_left_from, *from, next_index, *to_next, n_left_to_next; + + from = bi_vector; + n_left_from = vec_len(bi_vector); + next_index = node->cached_next_index; + while (n_left_from > 0) { + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + while (n_left_from > 0 && n_left_to_next > 0) { + u32 bi0 = to_next[0] = from[0]; + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + vlib_buffer_t *p0 = vlib_get_buffer(vm, bi0); + p0->error = *error; + vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, + n_left_to_next, bi0, next); + } + vlib_put_next_frame(vm, node, next_index, n_left_to_next); + } +} + +#endif /* __included_snat_h__ */ diff --git a/src/plugins/nat/nat64_db.c b/src/plugins/nat/nat64_db.c index da73ceee2d1..008a137da10 100644 --- a/src/plugins/nat/nat64_db.c +++ b/src/plugins/nat/nat64_db.c @@ -529,6 +529,52 @@ nat64_db_st_entry_find (nat64_db_t * db, ip46_address_t * l_addr, return ste; } +u32 +nat64_db_st_entry_get_index (nat64_db_t * db, nat64_db_st_entry_t * ste) +{ + nat64_db_st_entry_t *st; + + switch (ip_proto_to_snat_proto (ste->proto)) + { +/* *INDENT-OFF* */ +#define _(N, i, n, s) \ + case SNAT_PROTOCOL_##N: \ + st = db->st._##n##_st; \ + break; + foreach_snat_protocol +#undef _ +/* *INDENT-ON* */ + default: + st = db->st._unk_proto_st; + return (u32) ~ 0; + } + + return ste - st; +} + +nat64_db_st_entry_t * +nat64_db_st_entry_by_index (nat64_db_t * db, u8 proto, u32 ste_index) +{ + nat64_db_st_entry_t *st; + + switch (ip_proto_to_snat_proto (proto)) + { +/* *INDENT-OFF* */ +#define _(N, i, n, s) \ + case SNAT_PROTOCOL_##N: \ + st = db->st._##n##_st; \ + break; + foreach_snat_protocol +#undef _ +/* *INDENT-ON* */ + default: + st = db->st._unk_proto_st; + break; + } + + return pool_elt_at_index (st, ste_index); +} + void nad64_db_st_free_expired (nat64_db_t * db, u32 now) { diff --git a/src/plugins/nat/nat64_db.h b/src/plugins/nat/nat64_db.h index 394ca875bbb..94d9a8bdebf 100644 --- a/src/plugins/nat/nat64_db.h +++ b/src/plugins/nat/nat64_db.h @@ -296,6 +296,27 @@ void nad64_db_st_free_expired (nat64_db_t * db, u32 now); */ void nat64_db_free_out_addr (nat64_db_t * db, ip4_address_t * out_addr); +/* + * @brief Get ST entry index. + * + * @param db NAT64 DB. + * @param ste ST entry. + * + * @return ST entry index on success, ~0 otherwise. + */ +u32 nat64_db_st_entry_get_index (nat64_db_t * db, nat64_db_st_entry_t * ste); + +/** + * @brief Get ST entry by index and protocol. + * + * @param db NAT64 DB. + * @param proto L4 protocol. + * @param bibe_index ST entry index. + * + * @return BIB entry if found. 
+ */ +nat64_db_st_entry_t *nat64_db_st_entry_by_index (nat64_db_t * db, + u8 proto, u32 ste_index); #endif /* __included_nat64_db_h__ */ /* diff --git a/src/plugins/nat/nat64_in2out.c b/src/plugins/nat/nat64_in2out.c index f78baff4e12..4f94575ebcc 100644 --- a/src/plugins/nat/nat64_in2out.c +++ b/src/plugins/nat/nat64_in2out.c @@ -18,6 +18,7 @@ */ #include +#include #include #include @@ -45,14 +46,42 @@ format_nat64_in2out_trace (u8 * s, va_list * args) return s; } +typedef struct +{ + u32 sw_if_index; + u32 next_index; + u8 cached; +} nat64_in2out_reass_trace_t; + +static u8 * +format_nat64_in2out_reass_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + nat64_in2out_reass_trace_t *t = + va_arg (*args, nat64_in2out_reass_trace_t *); + + s = + format (s, "NAT64-in2out-reass: sw_if_index %d, next index %d, status %s", + t->sw_if_index, t->next_index, + t->cached ? "cached" : "translated"); + + return s; +} + vlib_node_registration_t nat64_in2out_node; vlib_node_registration_t nat64_in2out_slowpath_node; +vlib_node_registration_t nat64_in2out_reass_node; + +#define foreach_nat64_in2out_error \ +_(UNSUPPORTED_PROTOCOL, "unsupported protocol") \ +_(IN2OUT_PACKETS, "good in2out packets processed") \ +_(NO_TRANSLATION, "no translation") \ +_(UNKNOWN, "unknown") \ +_(DROP_FRAGMENT, "Drop fragment") \ +_(MAX_REASS, "Maximum reassemblies exceeded") \ +_(MAX_FRAG, "Maximum fragments per reassembly exceeded") -#define foreach_nat64_in2out_error \ -_(UNSUPPORTED_PROTOCOL, "unsupported protocol") \ -_(IN2OUT_PACKETS, "good in2out packets processed") \ -_(NO_TRANSLATION, "no translation") \ -_(UNKNOWN, "unknown") typedef enum { @@ -74,6 +103,7 @@ typedef enum NAT64_IN2OUT_NEXT_IP6_LOOKUP, NAT64_IN2OUT_NEXT_DROP, NAT64_IN2OUT_NEXT_SLOWPATH, + NAT64_IN2OUT_NEXT_REASS, NAT64_IN2OUT_N_NEXT, } nat64_in2out_next_t; @@ -936,13 +966,6 @@ nat64_in2out_node_fn_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } proto0 = ip_proto_to_snat_proto (l4_protocol0); - if (frag_offset0 != 0) - { - next0 = NAT64_IN2OUT_NEXT_DROP; - b0->error = - node->errors[NAT64_IN2OUT_ERROR_UNSUPPORTED_PROTOCOL]; - goto trace0; - } if (is_slow_path) { @@ -979,6 +1002,13 @@ nat64_in2out_node_fn_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } } + if (PREDICT_FALSE + (ip60->protocol == IP_PROTOCOL_IPV6_FRAGMENTATION)) + { + next0 = NAT64_IN2OUT_NEXT_REASS; + goto trace0; + } + if (proto0 == SNAT_PROTOCOL_ICMP) { if (is_hairpinning (&ip60->dst_address)) @@ -1073,6 +1103,7 @@ VLIB_REGISTER_NODE (nat64_in2out_node) = { [NAT64_IN2OUT_NEXT_IP4_LOOKUP] = "ip4-lookup", [NAT64_IN2OUT_NEXT_IP6_LOOKUP] = "ip6-lookup", [NAT64_IN2OUT_NEXT_SLOWPATH] = "nat64-in2out-slowpath", + [NAT64_IN2OUT_NEXT_REASS] = "nat64-in2out-reass", }, }; /* *INDENT-ON* */ @@ -1102,6 +1133,7 @@ VLIB_REGISTER_NODE (nat64_in2out_slowpath_node) = { [NAT64_IN2OUT_NEXT_IP4_LOOKUP] = "ip4-lookup", [NAT64_IN2OUT_NEXT_IP6_LOOKUP] = "ip6-lookup", [NAT64_IN2OUT_NEXT_SLOWPATH] = "nat64-in2out-slowpath", + [NAT64_IN2OUT_NEXT_REASS] = "nat64-in2out-reass", }, }; /* *INDENT-ON* */ @@ -1109,6 +1141,455 @@ VLIB_REGISTER_NODE (nat64_in2out_slowpath_node) = { VLIB_NODE_FUNCTION_MULTIARCH (nat64_in2out_slowpath_node, nat64_in2out_slowpath_node_fn); +typedef struct nat64_in2out_frag_set_ctx_t_ +{ + vlib_main_t *vm; + u32 sess_index; + u16 l4_offset; + u8 proto; + u8 first_frag; +} nat64_in2out_frag_set_ctx_t; + +static int +nat64_in2out_frag_set_cb (ip6_header_t * 
ip6, ip4_header_t * ip4, void *arg) +{ + nat64_main_t *nm = &nat64_main; + nat64_in2out_frag_set_ctx_t *ctx = arg; + nat64_db_st_entry_t *ste; + nat64_db_bib_entry_t *bibe; + udp_header_t *udp; + + ste = nat64_db_st_entry_by_index (&nm->db, ctx->proto, ctx->sess_index); + if (!ste) + return -1; + + bibe = nat64_db_bib_entry_by_index (&nm->db, ctx->proto, ste->bibe_index); + if (!bibe) + return -1; + + nat64_session_reset_timeout (ste, ctx->vm); + + if (ctx->first_frag) + { + udp = (udp_header_t *) u8_ptr_add (ip6, ctx->l4_offset); + + if (ctx->proto == IP_PROTOCOL_TCP) + { + u16 *checksum; + ip_csum_t csum; + tcp_header_t *tcp = (tcp_header_t *) udp; + + checksum = &tcp->checksum; + csum = ip_csum_sub_even (*checksum, tcp->src_port); + csum = ip_csum_sub_even (csum, ip6->src_address.as_u64[0]); + csum = ip_csum_sub_even (csum, ip6->src_address.as_u64[1]); + csum = ip_csum_sub_even (csum, ip6->dst_address.as_u64[0]); + csum = ip_csum_sub_even (csum, ip6->dst_address.as_u64[1]); + csum = ip_csum_add_even (csum, bibe->out_port); + csum = ip_csum_add_even (csum, bibe->out_addr.as_u32); + csum = ip_csum_add_even (csum, ste->out_r_addr.as_u32); + *checksum = ip_csum_fold (csum); + } + + udp->src_port = bibe->out_port; + } + + ip4->src_address.as_u32 = bibe->out_addr.as_u32; + ip4->dst_address.as_u32 = ste->out_r_addr.as_u32; + + return 0; +} + +static int +nat64_in2out_frag_hairpinning (vlib_buffer_t * b, ip6_header_t * ip6, + nat64_in2out_frag_set_ctx_t * ctx) +{ + nat64_main_t *nm = &nat64_main; + nat64_db_st_entry_t *ste; + nat64_db_bib_entry_t *bibe; + udp_header_t *udp = (udp_header_t *) u8_ptr_add (ip6, ctx->l4_offset); + tcp_header_t *tcp = (tcp_header_t *) udp; + u16 sport = udp->src_port; + u16 dport = udp->dst_port; + u16 *checksum; + ip_csum_t csum; + ip46_address_t saddr, daddr; + + if (ctx->first_frag) + { + if (ctx->proto == IP_PROTOCOL_UDP) + checksum = &udp->checksum; + else + checksum = &tcp->checksum; + + csum = ip_csum_sub_even (*checksum, ip6->src_address.as_u64[0]); + csum = ip_csum_sub_even (csum, ip6->src_address.as_u64[1]); + csum = ip_csum_sub_even (csum, ip6->dst_address.as_u64[0]); + csum = ip_csum_sub_even (csum, ip6->dst_address.as_u64[1]); + csum = ip_csum_sub_even (csum, sport); + csum = ip_csum_sub_even (csum, dport); + } + + ste = nat64_db_st_entry_by_index (&nm->db, ctx->proto, ctx->sess_index); + if (!ste) + return -1; + + bibe = nat64_db_bib_entry_by_index (&nm->db, ctx->proto, ste->bibe_index); + if (!bibe) + return -1; + + nat64_session_reset_timeout (ste, ctx->vm); + + sport = bibe->out_port; + dport = ste->r_port; + + nat64_compose_ip6 (&ip6->src_address, &bibe->out_addr, bibe->fib_index); + + memset (&saddr, 0, sizeof (saddr)); + memset (&daddr, 0, sizeof (daddr)); + saddr.ip4.as_u32 = bibe->out_addr.as_u32; + daddr.ip4.as_u32 = ste->out_r_addr.as_u32; + + ste = + nat64_db_st_entry_find (&nm->db, &daddr, &saddr, dport, sport, ctx->proto, + 0, 0); + + if (ste) + { + bibe = + nat64_db_bib_entry_by_index (&nm->db, ctx->proto, ste->bibe_index); + if (!bibe) + return -1; + } + else + { + bibe = + nat64_db_bib_entry_find (&nm->db, &daddr, dport, ctx->proto, 0, 0); + + if (!bibe) + return -1; + + ste = + nat64_db_st_entry_create (&nm->db, bibe, &ip6->src_address, + &saddr.ip4, sport); + } + + ip6->dst_address.as_u64[0] = bibe->in_addr.as_u64[0]; + ip6->dst_address.as_u64[1] = bibe->in_addr.as_u64[1]; + + if (ctx->first_frag) + { + udp->dst_port = bibe->in_port; + udp->src_port = sport; + csum = ip_csum_add_even (csum, ip6->src_address.as_u64[0]); + csum = 
ip_csum_add_even (csum, ip6->src_address.as_u64[1]); + csum = ip_csum_add_even (csum, ip6->dst_address.as_u64[0]); + csum = ip_csum_add_even (csum, ip6->dst_address.as_u64[1]); + csum = ip_csum_add_even (csum, udp->src_port); + csum = ip_csum_add_even (csum, udp->dst_port); + *checksum = ip_csum_fold (csum); + } + + return 0; +} + +static uword +nat64_in2out_reass_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + u32 n_left_from, *from, *to_next; + nat64_in2out_next_t next_index; + u32 pkts_processed = 0; + u32 *fragments_to_drop = 0; + u32 *fragments_to_loopback = 0; + nat64_main_t *nm = &nat64_main; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0; + u8 cached0 = 0; + ip6_header_t *ip60; + u16 l4_offset0, frag_offset0; + u8 l4_protocol0; + nat_reass_ip6_t *reass0; + ip6_frag_hdr_t *frag0; + nat64_db_bib_entry_t *bibe0; + nat64_db_st_entry_t *ste0; + udp_header_t *udp0; + snat_protocol_t proto0; + u32 sw_if_index0, fib_index0; + ip46_address_t saddr0, daddr0; + nat64_in2out_frag_set_ctx_t ctx0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + next0 = NAT64_IN2OUT_NEXT_IP4_LOOKUP; + + sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + fib_index0 = + fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, + sw_if_index0); + + if (PREDICT_FALSE (nat_reass_is_drop_frag (1))) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = node->errors[NAT64_IN2OUT_ERROR_DROP_FRAGMENT]; + goto trace0; + } + + ip60 = (ip6_header_t *) vlib_buffer_get_current (b0); + + if (PREDICT_FALSE + (ip6_parse + (ip60, b0->current_length, &l4_protocol0, &l4_offset0, + &frag_offset0))) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = node->errors[NAT64_IN2OUT_ERROR_UNKNOWN]; + goto trace0; + } + + if (PREDICT_FALSE + (!(l4_protocol0 == IP_PROTOCOL_TCP + || l4_protocol0 == IP_PROTOCOL_UDP))) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = node->errors[NAT64_IN2OUT_ERROR_DROP_FRAGMENT]; + goto trace0; + } + + udp0 = (udp_header_t *) u8_ptr_add (ip60, l4_offset0); + frag0 = (ip6_frag_hdr_t *) u8_ptr_add (ip60, frag_offset0); + proto0 = ip_proto_to_snat_proto (l4_protocol0); + + reass0 = nat_ip6_reass_find_or_create (ip60->src_address, + ip60->dst_address, + frag0->identification, + l4_protocol0, + 1, &fragments_to_drop); + + if (PREDICT_FALSE (!reass0)) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = node->errors[NAT64_IN2OUT_ERROR_MAX_REASS]; + goto trace0; + } + + if (PREDICT_TRUE (ip6_frag_hdr_offset (frag0))) + { + ctx0.first_frag = 0; + if (PREDICT_FALSE (reass0->sess_index == (u32) ~ 0)) + { + if (nat_ip6_reass_add_fragment (reass0, bi0)) + { + b0->error = node->errors[NAT64_IN2OUT_ERROR_MAX_FRAG]; + next0 = NAT64_IN2OUT_NEXT_DROP; + goto trace0; + } + cached0 = 1; + goto trace0; + } + } + else + { + ctx0.first_frag = 1; + + saddr0.as_u64[0] = ip60->src_address.as_u64[0]; + saddr0.as_u64[1] = ip60->src_address.as_u64[1]; + daddr0.as_u64[0] = ip60->dst_address.as_u64[0]; + daddr0.as_u64[1] = ip60->dst_address.as_u64[1]; + + ste0 = + nat64_db_st_entry_find (&nm->db, &saddr0, &daddr0, + udp0->src_port, udp0->dst_port, + 
l4_protocol0, fib_index0, 1); + if (!ste0) + { + bibe0 = + nat64_db_bib_entry_find (&nm->db, &saddr0, udp0->src_port, + l4_protocol0, fib_index0, 1); + if (!bibe0) + { + u16 out_port0; + ip4_address_t out_addr0; + if (nat64_alloc_out_addr_and_port + (fib_index0, proto0, &out_addr0, &out_port0)) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = + node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION]; + goto trace0; + } + + bibe0 = + nat64_db_bib_entry_create (&nm->db, + &ip60->src_address, + &out_addr0, udp0->src_port, + clib_host_to_net_u16 + (out_port0), fib_index0, + l4_protocol0, 0); + if (!bibe0) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = + node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION]; + goto trace0; + } + } + nat64_extract_ip4 (&ip60->dst_address, &daddr0.ip4, + fib_index0); + ste0 = + nat64_db_st_entry_create (&nm->db, bibe0, + &ip60->dst_address, &daddr0.ip4, + udp0->dst_port); + if (!ste0) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = + node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION]; + goto trace0; + } + } + reass0->sess_index = + nat64_db_st_entry_get_index (&nm->db, ste0); + + nat_ip6_reass_get_frags (reass0, &fragments_to_loopback); + } + + ctx0.sess_index = reass0->sess_index; + ctx0.proto = l4_protocol0; + ctx0.vm = vm; + ctx0.l4_offset = l4_offset0; + + if (PREDICT_FALSE (is_hairpinning (&ip60->dst_address))) + { + next0 = NAT64_IN2OUT_NEXT_IP6_LOOKUP; + if (nat64_in2out_frag_hairpinning (b0, ip60, &ctx0)) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION]; + } + goto trace0; + } + else + { + if (ip6_to_ip4_fragmented (b0, nat64_in2out_frag_set_cb, &ctx0)) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = node->errors[NAT64_IN2OUT_ERROR_UNKNOWN]; + goto trace0; + } + } + + trace0: + if (PREDICT_FALSE + ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + nat64_in2out_reass_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->cached = cached0; + t->sw_if_index = sw_if_index0; + t->next_index = next0; + } + + if (cached0) + { + n_left_to_next++; + to_next--; + } + else + { + pkts_processed += next0 != NAT64_IN2OUT_NEXT_DROP; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + if (n_left_from == 0 && vec_len (fragments_to_loopback)) + { + from = vlib_frame_vector_args (frame); + u32 len = vec_len (fragments_to_loopback); + if (len <= VLIB_FRAME_SIZE) + { + clib_memcpy (from, fragments_to_loopback, + sizeof (u32) * len); + n_left_from = len; + vec_reset_length (fragments_to_loopback); + } + else + { + clib_memcpy (from, + fragments_to_loopback + (len - + VLIB_FRAME_SIZE), + sizeof (u32) * VLIB_FRAME_SIZE); + n_left_from = VLIB_FRAME_SIZE; + _vec_len (fragments_to_loopback) = len - VLIB_FRAME_SIZE; + } + } + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, nat64_in2out_reass_node.index, + NAT64_IN2OUT_ERROR_IN2OUT_PACKETS, + pkts_processed); + + nat_send_all_to_node (vm, fragments_to_drop, node, + &node->errors[NAT64_IN2OUT_ERROR_DROP_FRAGMENT], + NAT64_IN2OUT_NEXT_DROP); + + vec_free (fragments_to_drop); + vec_free (fragments_to_loopback); + return frame->n_vectors; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (nat64_in2out_reass_node) = { + .function = nat64_in2out_reass_node_fn, + .name = "nat64-in2out-reass", + .vector_size = sizeof (u32), + .format_trace = 
format_nat64_in2out_reass_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (nat64_in2out_error_strings), + .error_strings = nat64_in2out_error_strings, + .n_next_nodes = NAT64_IN2OUT_N_NEXT, + /* edit / add dispositions here */ + .next_nodes = { + [NAT64_IN2OUT_NEXT_DROP] = "error-drop", + [NAT64_IN2OUT_NEXT_IP4_LOOKUP] = "ip4-lookup", + [NAT64_IN2OUT_NEXT_IP6_LOOKUP] = "ip6-lookup", + [NAT64_IN2OUT_NEXT_SLOWPATH] = "nat64-in2out-slowpath", + [NAT64_IN2OUT_NEXT_REASS] = "nat64-in2out-reass", + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (nat64_in2out_reass_node, + nat64_in2out_reass_node_fn); + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/plugins/nat/nat64_out2in.c b/src/plugins/nat/nat64_out2in.c index 61e88a7f304..eb5ecb4588d 100644 --- a/src/plugins/nat/nat64_out2in.c +++ b/src/plugins/nat/nat64_out2in.c @@ -18,6 +18,7 @@ */ #include +#include #include #include @@ -41,13 +42,41 @@ format_nat64_out2in_trace (u8 * s, va_list * args) return s; } +typedef struct +{ + u32 sw_if_index; + u32 next_index; + u8 cached; +} nat64_out2in_reass_trace_t; + +static u8 * +format_nat64_out2in_reass_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + nat64_out2in_reass_trace_t *t = + va_arg (*args, nat64_out2in_reass_trace_t *); + + s = + format (s, "NAT64-out2in-reass: sw_if_index %d, next index %d, status %s", + t->sw_if_index, t->next_index, + t->cached ? "cached" : "translated"); + + return s; +} + vlib_node_registration_t nat64_out2in_node; +vlib_node_registration_t nat64_out2in_reass_node; + +#define foreach_nat64_out2in_error \ +_(UNSUPPORTED_PROTOCOL, "Unsupported protocol") \ +_(OUT2IN_PACKETS, "Good out2in packets processed") \ +_(NO_TRANSLATION, "No translation") \ +_(UNKNOWN, "unknown") \ +_(DROP_FRAGMENT, "Drop fragment") \ +_(MAX_REASS, "Maximum reassemblies exceeded") \ +_(MAX_FRAG, "Maximum fragments per reassembly exceeded") -#define foreach_nat64_out2in_error \ -_(UNSUPPORTED_PROTOCOL, "Unsupported protocol") \ -_(OUT2IN_PACKETS, "Good out2in packets processed") \ -_(NO_TRANSLATION, "No translation") \ -_(UNKNOWN, "unknown") typedef enum { @@ -67,6 +96,7 @@ typedef enum { NAT64_OUT2IN_NEXT_LOOKUP, NAT64_OUT2IN_NEXT_DROP, + NAT64_OUT2IN_NEXT_REASS, NAT64_OUT2IN_N_NEXT, } nat64_out2in_next_t; @@ -412,20 +442,27 @@ nat64_out2in_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, proto0 = ip_proto_to_snat_proto (ip40->protocol); - if (proto0 == SNAT_PROTOCOL_ICMP) + if (PREDICT_FALSE (proto0 == ~0)) { - if (icmp_to_icmp6 - (b0, nat64_out2in_icmp_set_cb, &ctx0, - nat64_out2in_inner_icmp_set_cb, &ctx0)) + if (ip4_to_ip6 (b0, nat64_out2in_unk_proto_set_cb, &ctx0)) { next0 = NAT64_OUT2IN_NEXT_DROP; b0->error = node->errors[NAT64_OUT2IN_ERROR_NO_TRANSLATION]; - goto trace0; } + goto trace0; } - else if (proto0 == SNAT_PROTOCOL_TCP || proto0 == SNAT_PROTOCOL_UDP) + + if (PREDICT_FALSE (ip4_is_fragment (ip40))) { - if (ip4_to_ip6_tcp_udp (b0, nat64_out2in_tcp_udp_set_cb, &ctx0)) + next0 = NAT64_OUT2IN_NEXT_REASS; + goto trace0; + } + + if (proto0 == SNAT_PROTOCOL_ICMP) + { + if (icmp_to_icmp6 + (b0, nat64_out2in_icmp_set_cb, &ctx0, + nat64_out2in_inner_icmp_set_cb, &ctx0)) { next0 = NAT64_OUT2IN_NEXT_DROP; b0->error = node->errors[NAT64_OUT2IN_ERROR_NO_TRANSLATION]; @@ -434,7 +471,7 @@ nat64_out2in_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, } else { - if (ip4_to_ip6 (b0, nat64_out2in_unk_proto_set_cb, &ctx0)) + if 
(ip4_to_ip6_tcp_udp (b0, nat64_out2in_tcp_udp_set_cb, &ctx0)) { next0 = NAT64_OUT2IN_NEXT_DROP; b0->error = node->errors[NAT64_OUT2IN_ERROR_NO_TRANSLATION]; @@ -474,17 +511,361 @@ VLIB_REGISTER_NODE (nat64_out2in_node) = { .format_trace = format_nat64_out2in_trace, .type = VLIB_NODE_TYPE_INTERNAL, .n_errors = ARRAY_LEN (nat64_out2in_error_strings), - .error_strings = nat64_out2in_error_strings,.n_next_nodes = 2, + .error_strings = nat64_out2in_error_strings, + .n_next_nodes = NAT64_OUT2IN_N_NEXT, /* edit / add dispositions here */ .next_nodes = { [NAT64_OUT2IN_NEXT_DROP] = "error-drop", [NAT64_OUT2IN_NEXT_LOOKUP] = "ip6-lookup", + [NAT64_OUT2IN_NEXT_REASS] = "nat64-out2in-reass", }, }; /* *INDENT-ON* */ VLIB_NODE_FUNCTION_MULTIARCH (nat64_out2in_node, nat64_out2in_node_fn); +typedef struct nat64_out2in_frag_set_ctx_t_ +{ + vlib_main_t *vm; + vlib_buffer_t *b; + u32 sess_index; + u8 proto; + u8 first_frag; +} nat64_out2in_frag_set_ctx_t; + +static int +nat64_out2in_frag_set_cb (ip4_header_t * ip4, ip6_header_t * ip6, void *arg) +{ + nat64_main_t *nm = &nat64_main; + nat64_out2in_frag_set_ctx_t *ctx = arg; + nat64_db_st_entry_t *ste; + nat64_db_bib_entry_t *bibe; + udp_header_t *udp = ip4_next_header (ip4); + ip_csum_t csum; + u16 *checksum; + + ste = nat64_db_st_entry_by_index (&nm->db, ctx->proto, ctx->sess_index); + if (!ste) + return -1; + + bibe = nat64_db_bib_entry_by_index (&nm->db, ctx->proto, ste->bibe_index); + if (!bibe) + return -1; + + nat64_session_reset_timeout (ste, ctx->vm); + + if (ctx->first_frag) + { + udp->dst_port = bibe->in_port; + + if (ip4->protocol == IP_PROTOCOL_UDP) + { + checksum = &udp->checksum; + + if (!checksum) + { + u16 udp_len = + clib_host_to_net_u16 (ip4->length) - sizeof (*ip4); + csum = ip_incremental_checksum (0, udp, udp_len); + csum = + ip_csum_with_carry (csum, clib_host_to_net_u16 (udp_len)); + csum = + ip_csum_with_carry (csum, + clib_host_to_net_u16 (IP_PROTOCOL_UDP)); + csum = ip_csum_with_carry (csum, ste->in_r_addr.as_u64[0]); + csum = ip_csum_with_carry (csum, ste->in_r_addr.as_u64[1]); + csum = ip_csum_with_carry (csum, bibe->in_addr.as_u64[0]); + csum = ip_csum_with_carry (csum, bibe->in_addr.as_u64[1]); + *checksum = ~ip_csum_fold (csum); + } + else + { + csum = ip_csum_sub_even (*checksum, bibe->out_addr.as_u32); + csum = ip_csum_sub_even (csum, ste->out_r_addr.as_u32); + csum = ip_csum_sub_even (csum, bibe->out_port); + csum = ip_csum_add_even (csum, ste->in_r_addr.as_u64[0]); + csum = ip_csum_add_even (csum, ste->in_r_addr.as_u64[1]); + csum = ip_csum_add_even (csum, bibe->in_addr.as_u64[0]); + csum = ip_csum_add_even (csum, bibe->in_addr.as_u64[1]); + csum = ip_csum_add_even (csum, bibe->in_port); + *checksum = ip_csum_fold (csum); + } + } + else + { + tcp_header_t *tcp = ip4_next_header (ip4); + checksum = &tcp->checksum; + csum = ip_csum_sub_even (*checksum, bibe->out_addr.as_u32); + csum = ip_csum_sub_even (csum, ste->out_r_addr.as_u32); + csum = ip_csum_sub_even (csum, bibe->out_port); + csum = ip_csum_add_even (csum, ste->in_r_addr.as_u64[0]); + csum = ip_csum_add_even (csum, ste->in_r_addr.as_u64[1]); + csum = ip_csum_add_even (csum, bibe->in_addr.as_u64[0]); + csum = ip_csum_add_even (csum, bibe->in_addr.as_u64[1]); + csum = ip_csum_add_even (csum, bibe->in_port); + *checksum = ip_csum_fold (csum); + } + + } + + ip6->src_address.as_u64[0] = ste->in_r_addr.as_u64[0]; + ip6->src_address.as_u64[1] = ste->in_r_addr.as_u64[1]; + + ip6->dst_address.as_u64[0] = bibe->in_addr.as_u64[0]; + ip6->dst_address.as_u64[1] = 
bibe->in_addr.as_u64[1]; + + vnet_buffer (ctx->b)->sw_if_index[VLIB_TX] = bibe->fib_index; + + return 0; +} + +static uword +nat64_out2in_reass_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_left_from, *from, *to_next; + nat64_out2in_next_t next_index; + u32 pkts_processed = 0; + u32 *fragments_to_drop = 0; + u32 *fragments_to_loopback = 0; + nat64_main_t *nm = &nat64_main; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0; + ip4_header_t *ip40; + u8 cached0 = 0; + u32 sw_if_index0, fib_index0; + udp_header_t *udp0; + nat_reass_ip4_t *reass0; + ip46_address_t saddr0, daddr0; + nat64_db_st_entry_t *ste0; + nat64_db_bib_entry_t *bibe0; + ip6_address_t ip6_saddr0; + nat64_out2in_frag_set_ctx_t ctx0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + next0 = NAT64_OUT2IN_NEXT_LOOKUP; + + sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + fib_index0 = + fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP4, + sw_if_index0); + + if (PREDICT_FALSE (nat_reass_is_drop_frag (1))) + { + next0 = NAT64_OUT2IN_NEXT_DROP; + b0->error = node->errors[NAT64_OUT2IN_ERROR_DROP_FRAGMENT]; + goto trace0; + } + + ip40 = vlib_buffer_get_current (b0); + + if (PREDICT_FALSE (!(ip40->protocol == IP_PROTOCOL_TCP + || ip40->protocol == IP_PROTOCOL_UDP))) + { + next0 = NAT64_OUT2IN_NEXT_DROP; + b0->error = node->errors[NAT64_OUT2IN_ERROR_DROP_FRAGMENT]; + goto trace0; + } + + udp0 = ip4_next_header (ip40); + + reass0 = nat_ip4_reass_find_or_create (ip40->src_address, + ip40->dst_address, + ip40->fragment_id, + ip40->protocol, + 1, &fragments_to_drop); + + if (PREDICT_FALSE (!reass0)) + { + next0 = NAT64_OUT2IN_NEXT_DROP; + b0->error = node->errors[NAT64_OUT2IN_ERROR_MAX_REASS]; + goto trace0; + } + + if (PREDICT_FALSE (ip4_is_first_fragment (ip40))) + { + ctx0.first_frag = 1; + + memset (&saddr0, 0, sizeof (saddr0)); + saddr0.ip4.as_u32 = ip40->src_address.as_u32; + memset (&daddr0, 0, sizeof (daddr0)); + daddr0.ip4.as_u32 = ip40->dst_address.as_u32; + + ste0 = + nat64_db_st_entry_find (&nm->db, &daddr0, &saddr0, + udp0->dst_port, udp0->src_port, + ip40->protocol, fib_index0, 0); + if (!ste0) + { + bibe0 = + nat64_db_bib_entry_find (&nm->db, &daddr0, udp0->dst_port, + ip40->protocol, fib_index0, 0); + if (!bibe0) + { + next0 = NAT64_OUT2IN_NEXT_DROP; + b0->error = + node->errors[NAT64_OUT2IN_ERROR_NO_TRANSLATION]; + goto trace0; + } + + nat64_compose_ip6 (&ip6_saddr0, &ip40->src_address, + bibe0->fib_index); + ste0 = + nat64_db_st_entry_create (&nm->db, bibe0, &ip6_saddr0, + &saddr0.ip4, udp0->src_port); + + if (!ste0) + { + next0 = NAT64_OUT2IN_NEXT_DROP; + b0->error = + node->errors[NAT64_OUT2IN_ERROR_NO_TRANSLATION]; + goto trace0; + } + } + reass0->sess_index = + nat64_db_st_entry_get_index (&nm->db, ste0); + + nat_ip4_reass_get_frags (reass0, &fragments_to_loopback); + } + else + { + ctx0.first_frag = 0; + + if (PREDICT_FALSE (reass0->sess_index == (u32) ~ 0)) + { + if (nat_ip4_reass_add_fragment (reass0, bi0)) + { + b0->error = node->errors[NAT64_OUT2IN_ERROR_MAX_FRAG]; + next0 = NAT64_OUT2IN_NEXT_DROP; + goto trace0; + } + 
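+              /* Fragment arrived before the first fragment of its group:
+               * it has been queued on the reassembly above and is only
+               * marked as cached here, so it is not enqueued to a next
+               * node in this pass. */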
cached0 = 1; + goto trace0; + } + } + + ctx0.sess_index = reass0->sess_index; + ctx0.proto = ip40->protocol; + ctx0.vm = vm; + ctx0.b = b0; + + if (ip4_to_ip6_fragmented (b0, nat64_out2in_frag_set_cb, &ctx0)) + { + next0 = NAT64_OUT2IN_NEXT_DROP; + b0->error = node->errors[NAT64_OUT2IN_ERROR_UNKNOWN]; + goto trace0; + } + + trace0: + if (PREDICT_FALSE + ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + nat64_out2in_reass_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->cached = cached0; + t->sw_if_index = sw_if_index0; + t->next_index = next0; + } + + if (cached0) + { + n_left_to_next++; + to_next--; + } + else + { + pkts_processed += next0 != NAT64_OUT2IN_NEXT_DROP; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + if (n_left_from == 0 && vec_len (fragments_to_loopback)) + { + from = vlib_frame_vector_args (frame); + u32 len = vec_len (fragments_to_loopback); + if (len <= VLIB_FRAME_SIZE) + { + clib_memcpy (from, fragments_to_loopback, + sizeof (u32) * len); + n_left_from = len; + vec_reset_length (fragments_to_loopback); + } + else + { + clib_memcpy (from, + fragments_to_loopback + (len - + VLIB_FRAME_SIZE), + sizeof (u32) * VLIB_FRAME_SIZE); + n_left_from = VLIB_FRAME_SIZE; + _vec_len (fragments_to_loopback) = len - VLIB_FRAME_SIZE; + } + } + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, nat64_out2in_reass_node.index, + NAT64_OUT2IN_ERROR_OUT2IN_PACKETS, + pkts_processed); + + nat_send_all_to_node (vm, fragments_to_drop, node, + &node->errors[NAT64_OUT2IN_ERROR_DROP_FRAGMENT], + NAT64_OUT2IN_NEXT_DROP); + + vec_free (fragments_to_drop); + vec_free (fragments_to_loopback); + return frame->n_vectors; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (nat64_out2in_reass_node) = { + .function = nat64_out2in_reass_node_fn, + .name = "nat64-out2in-reass", + .vector_size = sizeof (u32), + .format_trace = format_nat64_out2in_reass_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (nat64_out2in_error_strings), + .error_strings = nat64_out2in_error_strings, + .n_next_nodes = NAT64_OUT2IN_N_NEXT, + /* edit / add dispositions here */ + .next_nodes = { + [NAT64_OUT2IN_NEXT_DROP] = "error-drop", + [NAT64_OUT2IN_NEXT_LOOKUP] = "ip6-lookup", + [NAT64_OUT2IN_NEXT_REASS] = "nat64-out2in-reass", + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (nat64_out2in_reass_node, + nat64_out2in_reass_node_fn); + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/plugins/nat/nat_api.c b/src/plugins/nat/nat_api.c index 0ffa2f0e772..548a9e03bc4 100644 --- a/src/plugins/nat/nat_api.c +++ b/src/plugins/nat/nat_api.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -1496,6 +1497,146 @@ vl_api_nat_ipfix_enable_disable_t_print (vl_api_nat_ipfix_enable_disable_t * FINISH; } +static void +vl_api_nat_set_reass_t_handler (vl_api_nat_set_reass_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_nat_set_reass_reply_t *rmp; + int rv = 0; + + rv = + nat_reass_set (ntohl (mp->timeout), ntohs (mp->max_reass), mp->max_frag, + mp->drop_frag, mp->is_ip6); + + REPLY_MACRO (VL_API_NAT_SET_REASS_REPLY); +} + +static void * +vl_api_nat_set_reass_t_print (vl_api_nat_set_reass_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat_set_reass "); + s = format (s, "timeout %d max_reass %d max_frag %d drop_frag %d is_ip6 %d", + 
clib_host_to_net_u32 (mp->timeout), + clib_host_to_net_u16 (mp->max_reass), + mp->max_frag, mp->drop_frag, mp->is_ip6); + + FINISH; +} + +static void +vl_api_nat_get_reass_t_handler (vl_api_nat_get_reass_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_nat_get_reass_reply_t *rmp; + int rv = 0; + + /* *INDENT-OFF* */ + REPLY_MACRO2 (VL_API_NAT_GET_REASS_REPLY, + ({ + rmp->ip4_timeout = htonl (nat_reass_get_timeout(0)); + rmp->ip4_max_reass = htons (nat_reass_get_max_reass(0)); + rmp->ip4_max_frag = nat_reass_get_max_frag(0); + rmp->ip4_drop_frag = nat_reass_is_drop_frag(0); + rmp->ip6_timeout = htonl (nat_reass_get_timeout(1)); + rmp->ip6_max_reass = htons (nat_reass_get_max_reass(1)); + rmp->ip6_max_frag = nat_reass_get_max_frag(1); + rmp->ip6_drop_frag = nat_reass_is_drop_frag(1); + })) + /* *INDENT-ON* */ +} + +static void * +vl_api_nat_get_reass_t_print (vl_api_nat_get_reass_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat_get_reass"); + + FINISH; +} + +typedef struct nat_api_walk_ctx_t_ +{ + unix_shared_memory_queue_t *q; + u32 context; +} nat_api_walk_ctx_t; + +static int +nat_ip4_reass_walk_api (nat_reass_ip4_t * reass, void *arg) +{ + vl_api_nat_reass_details_t *rmp; + snat_main_t *sm = &snat_main; + nat_api_walk_ctx_t *ctx = arg; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_NAT_REASS_DETAILS + sm->msg_id_base); + rmp->context = ctx->context; + clib_memcpy (rmp->src_addr, &(reass->key.src), 4); + clib_memcpy (rmp->dst_addr, &(reass->key.dst), 4); + rmp->proto = reass->key.proto; + rmp->frag_id = ntohl (reass->key.frag_id); + rmp->frag_n = reass->frag_n; + rmp->is_ip4 = 1; + + vl_msg_api_send_shmem (ctx->q, (u8 *) & rmp); + + return 0; +} + +static int +nat_ip6_reass_walk_api (nat_reass_ip6_t * reass, void *arg) +{ + vl_api_nat_reass_details_t *rmp; + snat_main_t *sm = &snat_main; + nat_api_walk_ctx_t *ctx = arg; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_NAT_REASS_DETAILS + sm->msg_id_base); + rmp->context = ctx->context; + clib_memcpy (rmp->src_addr, &(reass->key.src), 16); + clib_memcpy (rmp->dst_addr, &(reass->key.dst), 16); + rmp->proto = reass->key.proto; + rmp->frag_id = ntohl (reass->key.frag_id); + rmp->frag_n = reass->frag_n; + rmp->is_ip4 = 0; + + vl_msg_api_send_shmem (ctx->q, (u8 *) & rmp); + + return 0; +} + +static void +vl_api_nat_reass_dump_t_handler (vl_api_nat_reass_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + nat_api_walk_ctx_t ctx = { + .q = q, + .context = mp->context, + }; + + nat_ip4_reass_walk (nat_ip4_reass_walk_api, &ctx); + nat_ip6_reass_walk (nat_ip6_reass_walk_api, &ctx); +} + +static void * +vl_api_nat_reass_dump_t_print (vl_api_nat_reass_dump_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat_reass_dump"); + + FINISH; +} + /*************/ /*** NAT44 ***/ /*************/ @@ -3406,6 +3547,9 @@ _(NAT_SHOW_CONFIG, nat_show_config) \ _(NAT_SET_WORKERS, nat_set_workers) \ _(NAT_WORKER_DUMP, nat_worker_dump) \ _(NAT_IPFIX_ENABLE_DISABLE, nat_ipfix_enable_disable) \ +_(NAT_SET_REASS, nat_set_reass) \ +_(NAT_GET_REASS, nat_get_reass) \ +_(NAT_REASS_DUMP, nat_reass_dump) \ _(NAT44_ADD_DEL_ADDRESS_RANGE, nat44_add_del_address_range) \ _(NAT44_INTERFACE_ADD_DEL_FEATURE, nat44_interface_add_del_feature) \ _(NAT44_ADD_DEL_STATIC_MAPPING, nat44_add_del_static_mapping) \ diff --git 
a/src/plugins/nat/nat_reass.c b/src/plugins/nat/nat_reass.c new file mode 100644 index 00000000000..239bc70d836 --- /dev/null +++ b/src/plugins/nat/nat_reass.c @@ -0,0 +1,739 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @file + * @brief NAT plugin virtual fragmentation reassembly + */ + +#include +#include + +nat_reass_main_t nat_reass_main; + +static u32 +nat_reass_get_nbuckets (u8 is_ip6) +{ + nat_reass_main_t *srm = &nat_reass_main; + u32 nbuckets; + u8 i; + + if (is_ip6) + nbuckets = (u32) (srm->ip6_max_reass / NAT_REASS_HT_LOAD_FACTOR); + else + nbuckets = (u32) (srm->ip4_max_reass / NAT_REASS_HT_LOAD_FACTOR); + + for (i = 0; i < 31; i++) + if ((1 << i) >= nbuckets) + break; + nbuckets = 1 << i; + + return nbuckets; +} + +static_always_inline void +nat_ip4_reass_get_frags_inline (nat_reass_ip4_t * reass, u32 ** bi) +{ + nat_reass_main_t *srm = &nat_reass_main; + u32 elt_index; + dlist_elt_t *elt; + + while ((elt_index = + clib_dlist_remove_head (srm->ip4_frags_list_pool, + reass->frags_per_reass_list_head_index)) != + ~0) + { + elt = pool_elt_at_index (srm->ip4_frags_list_pool, elt_index); + vec_add1 (*bi, elt->value); + reass->frag_n--; + pool_put_index (srm->ip4_frags_list_pool, elt_index); + } +} + +static_always_inline void +nat_ip6_reass_get_frags_inline (nat_reass_ip6_t * reass, u32 ** bi) +{ + nat_reass_main_t *srm = &nat_reass_main; + u32 elt_index; + dlist_elt_t *elt; + + while ((elt_index = + clib_dlist_remove_head (srm->ip6_frags_list_pool, + reass->frags_per_reass_list_head_index)) != + ~0) + { + elt = pool_elt_at_index (srm->ip6_frags_list_pool, elt_index); + vec_add1 (*bi, elt->value); + reass->frag_n--; + pool_put_index (srm->ip6_frags_list_pool, elt_index); + } +} + +int +nat_reass_set (u32 timeout, u16 max_reass, u8 max_frag, u8 drop_frag, + u8 is_ip6) +{ + nat_reass_main_t *srm = &nat_reass_main; + u32 nbuckets; + + if (is_ip6) + { + if (srm->ip6_max_reass != max_reass) + { + clib_spinlock_lock_if_init (&srm->ip6_reass_lock); + + srm->ip6_max_reass = max_reass; + pool_free (srm->ip6_reass_pool); + pool_alloc (srm->ip6_reass_pool, srm->ip4_max_reass); + nbuckets = nat_reass_get_nbuckets (0); + clib_bihash_free_48_8 (&srm->ip6_reass_hash); + clib_bihash_init_48_8 (&srm->ip6_reass_hash, "nat-ip6-reass", + nbuckets, nbuckets * 1024); + + clib_spinlock_unlock_if_init (&srm->ip6_reass_lock); + } + srm->ip6_timeout = timeout; + srm->ip6_max_frag = max_frag; + srm->ip6_drop_frag = drop_frag; + } + else + { + if (srm->ip4_max_reass != max_reass) + { + clib_spinlock_lock_if_init (&srm->ip4_reass_lock); + + srm->ip4_max_reass = max_reass; + pool_free (srm->ip4_reass_pool); + pool_alloc (srm->ip4_reass_pool, srm->ip4_max_reass); + nbuckets = nat_reass_get_nbuckets (0); + clib_bihash_free_16_8 (&srm->ip4_reass_hash); + clib_bihash_init_16_8 (&srm->ip4_reass_hash, "nat-ip4-reass", + nbuckets, nbuckets * 1024); + clib_spinlock_unlock_if_init (&srm->ip4_reass_lock); + } + srm->ip4_timeout = timeout; + 
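+      /* timeout, max_frag and drop_frag take effect immediately; only a
+       * change of max_reass rebuilds the reassembly pool and hash above.
+       * Illustrative call (example values): nat_reass_set (10, 1024, 5, 0, 0)
+       * sets a 10 s timeout, 1024 concurrent IPv4 reassemblies, at most 5
+       * fragments each, and translates (rather than drops) fragments. */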
srm->ip4_max_frag = max_frag; + srm->ip4_drop_frag = drop_frag; + } + + return 0; +} + +u32 +nat_reass_get_timeout (u8 is_ip6) +{ + nat_reass_main_t *srm = &nat_reass_main; + + if (is_ip6) + return srm->ip6_timeout; + + return srm->ip4_timeout; +} + +u16 +nat_reass_get_max_reass (u8 is_ip6) +{ + nat_reass_main_t *srm = &nat_reass_main; + + if (is_ip6) + return srm->ip6_max_reass; + + return srm->ip4_max_reass; +} + +u8 +nat_reass_get_max_frag (u8 is_ip6) +{ + nat_reass_main_t *srm = &nat_reass_main; + + if (is_ip6) + return srm->ip6_max_frag; + + return srm->ip4_max_frag; +} + +u8 +nat_reass_is_drop_frag (u8 is_ip6) +{ + nat_reass_main_t *srm = &nat_reass_main; + + if (is_ip6) + return srm->ip6_drop_frag; + + return srm->ip4_drop_frag; +} + +static_always_inline nat_reass_ip4_t * +nat_ip4_reass_lookup (nat_reass_ip4_key_t * k, f64 now) +{ + nat_reass_main_t *srm = &nat_reass_main; + clib_bihash_kv_16_8_t kv, value; + nat_reass_ip4_t *reass; + + kv.key[0] = k->as_u64[0]; + kv.key[1] = k->as_u64[1]; + + if (clib_bihash_search_16_8 (&srm->ip4_reass_hash, &kv, &value)) + return 0; + + reass = pool_elt_at_index (srm->ip4_reass_pool, value.value); + if (now < reass->last_heard + (f64) srm->ip4_timeout) + return reass; + + return 0; +} + +nat_reass_ip4_t * +nat_ip4_reass_find_or_create (ip4_address_t src, ip4_address_t dst, + u16 frag_id, u8 proto, u8 reset_timeout, + u32 ** bi_to_drop) +{ + nat_reass_main_t *srm = &nat_reass_main; + nat_reass_ip4_t *reass = 0; + nat_reass_ip4_key_t k; + f64 now = vlib_time_now (srm->vlib_main); + dlist_elt_t *oldest_elt, *elt; + dlist_elt_t *per_reass_list_head_elt; + u32 oldest_index, elt_index; + clib_bihash_kv_16_8_t kv; + + k.src.as_u32 = src.as_u32; + k.dst.as_u32 = dst.as_u32; + k.frag_id = frag_id; + k.proto = proto; + + clib_spinlock_lock_if_init (&srm->ip4_reass_lock); + + reass = nat_ip4_reass_lookup (&k, now); + if (reass) + { + if (reset_timeout) + { + reass->last_heard = now; + clib_dlist_remove (srm->ip4_reass_lru_list_pool, + reass->lru_list_index); + clib_dlist_addtail (srm->ip4_reass_lru_list_pool, + srm->ip4_reass_head_index, + reass->lru_list_index); + } + goto unlock; + } + + if (srm->ip4_reass_n >= srm->ip4_max_reass) + { + oldest_index = + clib_dlist_remove_head (srm->ip4_reass_lru_list_pool, + srm->ip4_reass_head_index); + ASSERT (oldest_index != ~0); + oldest_elt = + pool_elt_at_index (srm->ip4_reass_lru_list_pool, oldest_index); + reass = pool_elt_at_index (srm->ip4_reass_pool, oldest_elt->value); + if (now < reass->last_heard + (f64) srm->ip4_timeout) + { + clib_dlist_addhead (srm->ip4_reass_lru_list_pool, + srm->ip4_reass_head_index, oldest_index); + clib_warning ("no free resassembly slot"); + reass = 0; + goto unlock; + } + + clib_dlist_addtail (srm->ip4_reass_lru_list_pool, + srm->ip4_reass_head_index, oldest_index); + + kv.key[0] = k.as_u64[0]; + kv.key[1] = k.as_u64[1]; + if (clib_bihash_add_del_16_8 (&srm->ip4_reass_hash, &kv, 0)) + { + reass = 0; + goto unlock; + } + + nat_ip4_reass_get_frags_inline (reass, bi_to_drop); + } + else + { + pool_get (srm->ip4_reass_pool, reass); + pool_get (srm->ip4_reass_lru_list_pool, elt); + reass->lru_list_index = elt_index = elt - srm->ip4_reass_lru_list_pool; + clib_dlist_init (srm->ip4_reass_lru_list_pool, elt_index); + elt->value = reass - srm->ip4_reass_pool; + clib_dlist_addtail (srm->ip4_reass_lru_list_pool, + srm->ip4_reass_head_index, elt_index); + pool_get (srm->ip4_frags_list_pool, per_reass_list_head_elt); + reass->frags_per_reass_list_head_index = + per_reass_list_head_elt - 
srm->ip4_frags_list_pool; + clib_dlist_init (srm->ip4_frags_list_pool, + reass->frags_per_reass_list_head_index); + srm->ip4_reass_n++; + } + + reass->key.as_u64[0] = kv.key[0] = k.as_u64[0]; + reass->key.as_u64[1] = kv.key[1] = k.as_u64[1]; + kv.value = reass - srm->ip4_reass_pool; + reass->sess_index = (u32) ~ 0; + reass->last_heard = now; + + if (clib_bihash_add_del_16_8 (&srm->ip4_reass_hash, &kv, 1)) + { + reass = 0; + goto unlock; + } + +unlock: + clib_spinlock_unlock_if_init (&srm->ip4_reass_lock); + return reass; +} + +int +nat_ip4_reass_add_fragment (nat_reass_ip4_t * reass, u32 bi) +{ + nat_reass_main_t *srm = &nat_reass_main; + dlist_elt_t *elt; + u32 elt_index; + + if (reass->frag_n >= srm->ip4_max_frag) + return -1; + + clib_spinlock_lock_if_init (&srm->ip4_reass_lock); + + pool_get (srm->ip4_frags_list_pool, elt); + elt_index = elt - srm->ip4_frags_list_pool; + clib_dlist_init (srm->ip4_frags_list_pool, elt_index); + elt->value = bi; + clib_dlist_addtail (srm->ip4_frags_list_pool, + reass->frags_per_reass_list_head_index, elt_index); + reass->frag_n++; + + clib_spinlock_unlock_if_init (&srm->ip4_reass_lock); + + return 0; +} + +void +nat_ip4_reass_get_frags (nat_reass_ip4_t * reass, u32 ** bi) +{ + nat_reass_main_t *srm = &nat_reass_main; + + clib_spinlock_lock_if_init (&srm->ip4_reass_lock); + + nat_ip4_reass_get_frags_inline (reass, bi); + + clib_spinlock_unlock_if_init (&srm->ip4_reass_lock); +} + +void +nat_ip4_reass_walk (nat_ip4_reass_walk_fn_t fn, void *ctx) +{ + nat_reass_ip4_t *reass; + nat_reass_main_t *srm = &nat_reass_main; + f64 now = vlib_time_now (srm->vlib_main); + + /* *INDENT-OFF* */ + pool_foreach (reass, srm->ip4_reass_pool, + ({ + if (now < reass->last_heard + (f64) srm->ip4_timeout) + { + if (fn (reass, ctx)) + return; + } + })); + /* *INDENT-ON* */ +} + +static_always_inline nat_reass_ip6_t * +nat_ip6_reass_lookup (nat_reass_ip6_key_t * k, f64 now) +{ + nat_reass_main_t *srm = &nat_reass_main; + clib_bihash_kv_48_8_t kv, value; + nat_reass_ip6_t *reass; + + k->unused = 0; + kv.key[0] = k->as_u64[0]; + kv.key[1] = k->as_u64[1]; + kv.key[2] = k->as_u64[2]; + kv.key[3] = k->as_u64[3]; + kv.key[4] = k->as_u64[4]; + kv.key[5] = k->as_u64[5]; + + if (clib_bihash_search_48_8 (&srm->ip6_reass_hash, &kv, &value)) + return 0; + + reass = pool_elt_at_index (srm->ip6_reass_pool, value.value); + if (now < reass->last_heard + (f64) srm->ip6_timeout) + return reass; + + return 0; +} + +nat_reass_ip6_t * +nat_ip6_reass_find_or_create (ip6_address_t src, ip6_address_t dst, + u32 frag_id, u8 proto, u8 reset_timeout, + u32 ** bi_to_drop) +{ + nat_reass_main_t *srm = &nat_reass_main; + nat_reass_ip6_t *reass = 0; + nat_reass_ip6_key_t k; + f64 now = vlib_time_now (srm->vlib_main); + dlist_elt_t *oldest_elt, *elt; + dlist_elt_t *per_reass_list_head_elt; + u32 oldest_index, elt_index; + clib_bihash_kv_48_8_t kv; + + k.src.as_u64[0] = src.as_u64[0]; + k.src.as_u64[1] = src.as_u64[1]; + k.dst.as_u64[0] = dst.as_u64[0]; + k.dst.as_u64[1] = dst.as_u64[1]; + k.frag_id = frag_id; + k.proto = proto; + k.unused = 0; + + clib_spinlock_lock_if_init (&srm->ip6_reass_lock); + + reass = nat_ip6_reass_lookup (&k, now); + if (reass) + { + if (reset_timeout) + { + reass->last_heard = now; + clib_dlist_remove (srm->ip6_reass_lru_list_pool, + reass->lru_list_index); + clib_dlist_addtail (srm->ip6_reass_lru_list_pool, + srm->ip6_reass_head_index, + reass->lru_list_index); + } + goto unlock; + } + + if (srm->ip6_reass_n >= srm->ip6_max_reass) + { + oldest_index = + clib_dlist_remove_head 
(srm->ip6_reass_lru_list_pool, + srm->ip6_reass_head_index); + ASSERT (oldest_index != ~0); + oldest_elt = + pool_elt_at_index (srm->ip6_reass_lru_list_pool, oldest_index); + reass = pool_elt_at_index (srm->ip6_reass_pool, oldest_elt->value); + if (now < reass->last_heard + (f64) srm->ip6_timeout) + { + clib_dlist_addhead (srm->ip6_reass_lru_list_pool, + srm->ip6_reass_head_index, oldest_index); + clib_warning ("no free reassembly slot"); + reass = 0; + goto unlock; + } + + clib_dlist_addtail (srm->ip6_reass_lru_list_pool, + srm->ip6_reass_head_index, oldest_index); + + kv.key[0] = k.as_u64[0]; + kv.key[1] = k.as_u64[1]; + kv.key[2] = k.as_u64[2]; + kv.key[3] = k.as_u64[3]; + kv.key[4] = k.as_u64[4]; + kv.key[5] = k.as_u64[5]; + if (clib_bihash_add_del_48_8 (&srm->ip6_reass_hash, &kv, 0)) + { + reass = 0; + goto unlock; + } + + nat_ip6_reass_get_frags_inline (reass, bi_to_drop); + } + else + { + pool_get (srm->ip6_reass_pool, reass); + pool_get (srm->ip6_reass_lru_list_pool, elt); + reass->lru_list_index = elt_index = elt - srm->ip6_reass_lru_list_pool; + clib_dlist_init (srm->ip6_reass_lru_list_pool, elt_index); + elt->value = reass - srm->ip6_reass_pool; + clib_dlist_addtail (srm->ip6_reass_lru_list_pool, + srm->ip6_reass_head_index, elt_index); + pool_get (srm->ip6_frags_list_pool, per_reass_list_head_elt); + reass->frags_per_reass_list_head_index = + per_reass_list_head_elt - srm->ip6_frags_list_pool; + clib_dlist_init (srm->ip6_frags_list_pool, + reass->frags_per_reass_list_head_index); + srm->ip6_reass_n++; + } + + reass->key.as_u64[0] = kv.key[0] = k.as_u64[0]; + reass->key.as_u64[1] = kv.key[1] = k.as_u64[1]; + reass->key.as_u64[2] = kv.key[2] = k.as_u64[2]; + reass->key.as_u64[3] = kv.key[3] = k.as_u64[3]; + reass->key.as_u64[4] = kv.key[4] = k.as_u64[4]; + reass->key.as_u64[5] = kv.key[5] = k.as_u64[5]; + kv.value = reass - srm->ip6_reass_pool; + reass->sess_index = (u32) ~ 0; + reass->last_heard = now; + + if (clib_bihash_add_del_48_8 (&srm->ip6_reass_hash, &kv, 1)) + { + reass = 0; + goto unlock; + } + +unlock: + clib_spinlock_unlock_if_init (&srm->ip6_reass_lock); + return reass; +} + +int +nat_ip6_reass_add_fragment (nat_reass_ip6_t * reass, u32 bi) +{ + nat_reass_main_t *srm = &nat_reass_main; + dlist_elt_t *elt; + u32 elt_index; + + if (reass->frag_n >= srm->ip6_max_frag) + return -1; + + clib_spinlock_lock_if_init (&srm->ip6_reass_lock); + + pool_get (srm->ip6_frags_list_pool, elt); + elt_index = elt - srm->ip6_frags_list_pool; + clib_dlist_init (srm->ip6_frags_list_pool, elt_index); + elt->value = bi; + clib_dlist_addtail (srm->ip6_frags_list_pool, + reass->frags_per_reass_list_head_index, elt_index); + reass->frag_n++; + + clib_spinlock_unlock_if_init (&srm->ip6_reass_lock); + + return 0; +} + +void +nat_ip6_reass_get_frags (nat_reass_ip6_t * reass, u32 ** bi) +{ + nat_reass_main_t *srm = &nat_reass_main; + + clib_spinlock_lock_if_init (&srm->ip6_reass_lock); + + nat_ip6_reass_get_frags_inline (reass, bi); + + clib_spinlock_unlock_if_init (&srm->ip6_reass_lock); +} + +void +nat_ip6_reass_walk (nat_ip6_reass_walk_fn_t fn, void *ctx) +{ + nat_reass_ip6_t *reass; + nat_reass_main_t *srm = &nat_reass_main; + f64 now = vlib_time_now (srm->vlib_main); + + /* *INDENT-OFF* */ + pool_foreach (reass, srm->ip6_reass_pool, + ({ + if (now < reass->last_heard + (f64) srm->ip6_timeout) + { + if (fn (reass, ctx)) + return; + } + })); + /* *INDENT-ON* */ +} + +clib_error_t * +nat_reass_init (vlib_main_t * vm) +{ + nat_reass_main_t *srm = &nat_reass_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); +
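/* both address families start with the defaults from nat_reass.h; reassembly spinlocks are created only when worker threads are configured */ +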
clib_error_t *error = 0; + dlist_elt_t *head; + u32 nbuckets, head_index; + + srm->vlib_main = vm; + srm->vnet_main = vnet_get_main (); + + /* IPv4 */ + srm->ip4_timeout = NAT_REASS_TIMEOUT_DEFAULT; + srm->ip4_max_reass = NAT_MAX_REASS_DEAFULT; + srm->ip4_max_frag = NAT_MAX_FRAG_DEFAULT; + srm->ip4_drop_frag = 0; + srm->ip4_reass_n = 0; + + if (tm->n_vlib_mains > 1) + clib_spinlock_init (&srm->ip4_reass_lock); + + pool_alloc (srm->ip4_reass_pool, srm->ip4_max_reass); + + nbuckets = nat_reass_get_nbuckets (0); + clib_bihash_init_16_8 (&srm->ip4_reass_hash, "nat-ip4-reass", nbuckets, + nbuckets * 1024); + + pool_get (srm->ip4_reass_lru_list_pool, head); + srm->ip4_reass_head_index = head_index = + head - srm->ip4_reass_lru_list_pool; + clib_dlist_init (srm->ip4_reass_lru_list_pool, head_index); + + /* IPv6 */ + srm->ip6_timeout = NAT_REASS_TIMEOUT_DEFAULT; + srm->ip6_max_reass = NAT_MAX_REASS_DEAFULT; + srm->ip6_max_frag = NAT_MAX_FRAG_DEFAULT; + srm->ip6_drop_frag = 0; + srm->ip6_reass_n = 0; + + if (tm->n_vlib_mains > 1) + clib_spinlock_init (&srm->ip6_reass_lock); + + pool_alloc (srm->ip6_reass_pool, srm->ip6_max_reass); + + nbuckets = nat_reass_get_nbuckets (1); + clib_bihash_init_48_8 (&srm->ip6_reass_hash, "nat-ip6-reass", nbuckets, + nbuckets * 1024); + + pool_get (srm->ip6_reass_lru_list_pool, head); + srm->ip6_reass_head_index = head_index = + head - srm->ip6_reass_lru_list_pool; + clib_dlist_init (srm->ip6_reass_lru_list_pool, head_index); + + return error; +} + +static clib_error_t * +nat_reass_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + clib_error_t *error = 0; + unformat_input_t _line_input, *line_input = &_line_input; + u32 timeout = 0, max_reass = 0, max_frag = 0; + u8 drop_frag = (u8) ~ 0, is_ip6 = 0; + int rv; + + /* Get a line of input. 
*/ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "max-reassemblies %u", &max_reass)) + ; + else if (unformat (line_input, "max-fragments %u", &max_frag)) + ; + else if (unformat (line_input, "timeout %u", &timeout)) + ; + else if (unformat (line_input, "enable")) + drop_frag = 0; + else if (unformat (line_input, "disable")) + drop_frag = 1; + else if (unformat (line_input, "ip4")) + is_ip6 = 0; + else if (unformat (line_input, "ip6")) + is_ip6 = 1; + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (!timeout) + timeout = nat_reass_get_timeout (is_ip6); + if (!max_reass) + max_reass = nat_reass_get_max_reass (is_ip6); + if (!max_frag) + max_frag = nat_reass_get_max_frag (is_ip6); + if (drop_frag == (u8) ~ 0) + drop_frag = nat_reass_is_drop_frag (is_ip6); + + rv = + nat_reass_set (timeout, (u16) max_reass, (u8) max_frag, drop_frag, + is_ip6); + if (rv) + { + error = clib_error_return (0, "nat_set_reass return %d", rv); + goto done; + } + +done: + unformat_free (line_input); + + return error; +} + +static int +nat_ip4_reass_walk_cli (nat_reass_ip4_t * reass, void *ctx) +{ + vlib_main_t *vm = ctx; + + vlib_cli_output (vm, " src %U dst %U proto %u id 0x%04x cached %u", + format_ip4_address, &reass->key.src, + format_ip4_address, &reass->key.dst, + reass->key.proto, + clib_net_to_host_u16 (reass->key.frag_id), reass->frag_n); + + return 0; +} + +static int +nat_ip6_reass_walk_cli (nat_reass_ip6_t * reass, void *ctx) +{ + vlib_main_t *vm = ctx; + + vlib_cli_output (vm, " src %U dst %U proto %u id 0x%08x cached %u", + format_ip6_address, &reass->key.src, + format_ip6_address, &reass->key.dst, + reass->key.proto, + clib_net_to_host_u32 (reass->key.frag_id), reass->frag_n); + + return 0; +} + +static clib_error_t * +show_nat_reass_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_cli_output (vm, "NAT IPv4 virtual fragmentation reassembly is %s", + nat_reass_is_drop_frag (0) ? "DISABLED" : "ENABLED"); + vlib_cli_output (vm, " max-reasssemblies %u", nat_reass_get_max_reass (0)); + vlib_cli_output (vm, " max-fragments %u", nat_reass_get_max_frag (0)); + vlib_cli_output (vm, " timeout %usec", nat_reass_get_timeout (0)); + vlib_cli_output (vm, " reassemblies:"); + nat_ip4_reass_walk (nat_ip4_reass_walk_cli, vm); + + vlib_cli_output (vm, "NAT IPv6 virtual fragmentation reassembly is %s", + nat_reass_is_drop_frag (1) ? 
"DISABLED" : "ENABLED"); + vlib_cli_output (vm, " max-reasssemblies %u", nat_reass_get_max_reass (1)); + vlib_cli_output (vm, " max-fragments %u", nat_reass_get_max_frag (1)); + vlib_cli_output (vm, " timeout %usec", nat_reass_get_timeout (1)); + vlib_cli_output (vm, " reassemblies:"); + nat_ip6_reass_walk (nat_ip6_reass_walk_cli, vm); + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (nat_reass_command, static) = +{ + .path = "nat virtual-reassembly", + .short_help = "nat virtual-reassembly ip4|ip6 [max-reassemblies ] " + "[max-fragments ] [timeout ] [enable|disable]", + .function = nat_reass_command_fn, +}; + +VLIB_CLI_COMMAND (show_nat_reass_command, static) = +{ + .path = "show nat virtual-reassembly", + .short_help = "show nat virtual-reassembly", + .function = show_nat_reass_command_fn, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/nat/nat_reass.h b/src/plugins/nat/nat_reass.h new file mode 100644 index 00000000000..ae14a9604aa --- /dev/null +++ b/src/plugins/nat/nat_reass.h @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @file + * @brief NAT plugin virtual fragmentation reassembly + */ +#ifndef __included_nat_reass_h__ +#define __included_nat_reass_h__ + +#include +#include +#include +#include +#include + +#define NAT_REASS_TIMEOUT_DEFAULT 2 +#define NAT_MAX_REASS_DEAFULT 1024 +#define NAT_MAX_FRAG_DEFAULT 5 +#define NAT_REASS_HT_LOAD_FACTOR (0.75) + +typedef struct +{ + union + { + struct + { + ip4_address_t src; + ip4_address_t dst; + /* align by making this 4 octets even though its a 2 octets field */ + u32 frag_id; + /* align by making this 4 octets even though its a 1 octet field */ + u32 proto; + }; + u64 as_u64[2]; + }; +} nat_reass_ip4_key_t; + +/* *INDENT-OFF* */ +typedef CLIB_PACKED(struct +{ + nat_reass_ip4_key_t key; + u32 lru_list_index; + u32 sess_index; + f64 last_heard; + u32 frags_per_reass_list_head_index; + u8 frag_n; +}) nat_reass_ip4_t; +/* *INDENT-ON* */ + +typedef struct +{ + union + { + struct + { + ip6_address_t src; + ip6_address_t dst; + u32 frag_id; + /* align by making this 4 octets even though its a 1 octet field */ + u32 proto; + u64 unused; + }; + u64 as_u64[6]; + }; +} nat_reass_ip6_key_t; + +/* *INDENT-OFF* */ +typedef CLIB_PACKED(struct +{ + nat_reass_ip6_key_t key; + u32 lru_list_index; + u32 sess_index; + f64 last_heard; + u32 frags_per_reass_list_head_index; + u8 frag_n; +}) nat_reass_ip6_t; +/* *INDENT-ON* */ + +typedef struct +{ + /* IPv4 config */ + u32 ip4_timeout; + u16 ip4_max_reass; + u8 ip4_max_frag; + u8 ip4_drop_frag; + + /* IPv6 config */ + u32 ip6_timeout; + u16 ip6_max_reass; + u8 ip6_max_frag; + u8 ip6_drop_frag; + + /* IPv4 runtime */ + nat_reass_ip4_t *ip4_reass_pool; + clib_bihash_16_8_t ip4_reass_hash; + dlist_elt_t *ip4_reass_lru_list_pool; + dlist_elt_t *ip4_frags_list_pool; + u32 ip4_reass_head_index; + 
u16 ip4_reass_n; + clib_spinlock_t ip4_reass_lock; + + /* IPv6 runtime */ + nat_reass_ip6_t *ip6_reass_pool; + clib_bihash_48_8_t ip6_reass_hash; + dlist_elt_t *ip6_reass_lru_list_pool; + dlist_elt_t *ip6_frags_list_pool; + u32 ip6_reass_head_index; + u16 ip6_reass_n; + clib_spinlock_t ip6_reass_lock; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; +} nat_reass_main_t; + +/** + * @brief Set NAT virtual fragmentation reassembly configuration. + * + * @param timeout Reassembly timeout. + * @param max_reass Maximum number of concurrent reassemblies. + * @param max_frag Maximum number of fragmets per reassembly + * @param drop_frag If zero translate fragments, otherwise drop fragments. + * @param is_ip6 1 if IPv6, 0 if IPv4. + * + * @returns 0 on success, non-zero value otherwise. + */ +int nat_reass_set (u32 timeout, u16 max_reass, u8 max_frag, u8 drop_frag, + u8 is_ip6); + +/** + * @brief Get reassembly timeout. + * + * @param is_ip6 1 if IPv6, 0 if IPv4. + * + * @returns reassembly timeout. + */ +u32 nat_reass_get_timeout (u8 is_ip6); + +/** + * @brief Get maximum number of concurrent reassemblies. + * + * @param is_ip6 1 if IPv6, 0 if IPv4. + * + * @returns maximum number of concurrent reassemblies. + */ +u16 nat_reass_get_max_reass (u8 is_ip6); + +/** + * @brief Get maximum number of fragmets per reassembly. + * + * @param is_ip6 1 if IPv6, 0 if IPv4. + * + * @returns maximum number of fragmets per reassembly. + */ +u8 nat_reass_get_max_frag (u8 is_ip6); + +/** + * @brief Get status of virtual fragmentation reassembly. + * + * @param is_ip6 1 if IPv6, 0 if IPv4. + * + * @returns zero if translate fragments, non-zero value if drop fragments. + */ +u8 nat_reass_is_drop_frag (u8 is_ip6); + +/** + * @brief Initialize NAT virtual fragmentation reassembly. + * + * @param vm vlib main. + * + * @return error code. + */ +clib_error_t *nat_reass_init (vlib_main_t * vm); + +/** + * @brief Find or create reassembly. + * + * @param src Source IPv4 address. + * @param dst Destination IPv4 address. + * @param frag_id Fragment ID. + * @param proto L4 protocol. + * @param reset_timeout If non-zero value reset timeout. + * @param bi_to_drop Fragments to drop. + * + * @returns Reassembly data or 0 on failure. + */ +nat_reass_ip4_t *nat_ip4_reass_find_or_create (ip4_address_t src, + ip4_address_t dst, + u16 frag_id, u8 proto, + u8 reset_timeout, + u32 ** bi_to_drop); +/** + * @brief Cache fragment. + * + * @param reass Reassembly data. + * @param bi Buffer index. + * + * @returns 0 on success, non-zero value otherwise. + */ +int nat_ip4_reass_add_fragment (nat_reass_ip4_t * reass, u32 bi); + +/** + * @brief Get cached fragments. + * + * @param reass Reassembly data. + * @param bi Vector of buffer indexes. + */ +void nat_ip4_reass_get_frags (nat_reass_ip4_t * reass, u32 ** bi); + +/** + * @breif Call back function when walking IPv4 reassemblies, non-zero return + * value stop walk. + */ +typedef int (*nat_ip4_reass_walk_fn_t) (nat_reass_ip4_t * reass, void *ctx); + +/** + * @brief Walk IPv4 reassemblies. + * + * @param fn The function to invoke on each entry visited. + * @param ctx A context passed in the visit function. + */ +void nat_ip4_reass_walk (nat_ip4_reass_walk_fn_t fn, void *ctx); + +/** + * @brief Find or create reassembly. + * + * @param src Source IPv6 address. + * @param dst Destination IPv6 address. + * @param frag_id Fragment ID. + * @param proto L4 protocol. + * @param reset_timeout If non-zero value reset timeout. + * @param bi_to_drop Fragments to drop. 
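+ * @note If all reassembly slots are busy, the least recently used entry is recycled once it has expired and its cached fragments are returned in bi_to_drop; if the oldest entry has not expired yet, the call fails.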
+ * + * @returns Reassembly data or 0 on failure. + */ +nat_reass_ip6_t *nat_ip6_reass_find_or_create (ip6_address_t src, + ip6_address_t dst, + u32 frag_id, u8 proto, + u8 reset_timeout, + u32 ** bi_to_drop); +/** + * @brief Cache fragment. + * + * @param reass Reassembly data. + * @param bi Buffer index. + * + * @returns 0 on success, non-zero value otherwise. + */ +int nat_ip6_reass_add_fragment (nat_reass_ip6_t * reass, u32 bi); + +/** + * @brief Get cached fragments. + * + * @param reass Reassembly data. + * @param bi Vector of buffer indexes. + */ +void nat_ip6_reass_get_frags (nat_reass_ip6_t * reass, u32 ** bi); + +/** + * @breif Call back function when walking IPv6 reassemblies, non-zero return + * value stop walk. + */ +typedef int (*nat_ip6_reass_walk_fn_t) (nat_reass_ip6_t * reass, void *ctx); + +/** + * @brief Walk IPv6 reassemblies. + * + * @param fn The function to invoke on each entry visited. + * @param ctx A context passed in the visit function. + */ +void nat_ip6_reass_walk (nat_ip6_reass_walk_fn_t fn, void *ctx); + +#endif /* __included_nat_reass_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/nat/out2in.c b/src/plugins/nat/out2in.c index f250136b86e..489afadb4a2 100755 --- a/src/plugins/nat/out2in.c +++ b/src/plugins/nat/out2in.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -78,17 +79,40 @@ static u8 * format_snat_out2in_worker_handoff_trace (u8 * s, va_list * args) return s; } +typedef struct { + u32 sw_if_index; + u32 next_index; + u8 cached; +} nat44_out2in_reass_trace_t; + +static u8 * format_nat44_out2in_reass_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + nat44_out2in_reass_trace_t * t = va_arg (*args, nat44_out2in_reass_trace_t *); + + s = format (s, "NAT44_OUT2IN_REASS: sw_if_index %d, next index %d, status %s", + t->sw_if_index, t->next_index, + t->cached ? 
"cached" : "translated"); + + return s; +} + vlib_node_registration_t snat_out2in_node; vlib_node_registration_t snat_out2in_fast_node; vlib_node_registration_t snat_out2in_worker_handoff_node; vlib_node_registration_t snat_det_out2in_node; +vlib_node_registration_t nat44_out2in_reass_node; #define foreach_snat_out2in_error \ _(UNSUPPORTED_PROTOCOL, "Unsupported protocol") \ _(OUT2IN_PACKETS, "Good out2in packets processed") \ _(BAD_ICMP_TYPE, "unsupported ICMP type") \ _(NO_TRANSLATION, "No translation") \ -_(MAX_SESSIONS_EXCEEDED, "Maximum sessions exceeded") +_(MAX_SESSIONS_EXCEEDED, "Maximum sessions exceeded") \ +_(DROP_FRAGMENT, "Drop fragment") \ +_(MAX_REASS, "Maximum reassemblies exceeded") \ +_(MAX_FRAG, "Maximum fragments per reassembly exceeded") typedef enum { #define _(sym,str) SNAT_OUT2IN_ERROR_##sym, @@ -107,6 +131,7 @@ typedef enum { SNAT_OUT2IN_NEXT_DROP, SNAT_OUT2IN_NEXT_LOOKUP, SNAT_OUT2IN_NEXT_ICMP_ERROR, + SNAT_OUT2IN_NEXT_REASS, SNAT_OUT2IN_N_NEXT, } snat_out2in_next_t; @@ -139,6 +164,7 @@ create_session_for_static_mapping (snat_main_t *sm, dlist_elt_t * per_user_translation_list_elt; dlist_elt_t * per_user_list_head_elt; ip4_header_t *ip0; + udp_header_t *udp0; if (PREDICT_FALSE (maximum_sessions_exceeded(sm, thread_index))) { @@ -147,6 +173,7 @@ create_session_for_static_mapping (snat_main_t *sm, } ip0 = vlib_buffer_get_current (b0); + udp0 = ip4_next_header (ip0); user_key.addr = in2out.addr; user_key.fib_index = in2out.fib_index; @@ -188,7 +215,8 @@ create_session_for_static_mapping (snat_main_t *sm, s->outside_address_index = ~0; s->flags |= SNAT_SESSION_FLAG_STATIC_MAPPING; - s->ext_host_addr.as_u32 = ip0->dst_address.as_u32; + s->ext_host_addr.as_u32 = ip0->src_address.as_u32; + s->ext_host_port = udp0->src_port; u->nstaticsessions++; /* Create list elts */ @@ -1033,6 +1061,12 @@ snat_out2in_node_fn (vlib_main_t * vm, goto trace0; } + if (PREDICT_FALSE (ip4_is_fragment (ip0))) + { + next0 = SNAT_OUT2IN_NEXT_REASS; + goto trace0; + } + key0.addr = ip0->dst_address; key0.port = udp0->dst_port; key0.protocol = proto0; @@ -1188,6 +1222,12 @@ snat_out2in_node_fn (vlib_main_t * vm, goto trace1; } + if (PREDICT_FALSE (ip4_is_fragment (ip1))) + { + next1 = SNAT_OUT2IN_NEXT_REASS; + goto trace1; + } + key1.addr = ip1->dst_address; key1.port = udp1->dst_port; key1.protocol = proto1; @@ -1379,6 +1419,12 @@ snat_out2in_node_fn (vlib_main_t * vm, goto trace00; } + if (PREDICT_FALSE (ip4_is_fragment (ip0))) + { + next0 = SNAT_OUT2IN_NEXT_REASS; + goto trace00; + } + key0.addr = ip0->dst_address; key0.port = udp0->dst_port; key0.protocol = proto0; @@ -1530,10 +1576,294 @@ VLIB_REGISTER_NODE (snat_out2in_node) = { [SNAT_OUT2IN_NEXT_DROP] = "error-drop", [SNAT_OUT2IN_NEXT_LOOKUP] = "ip4-lookup", [SNAT_OUT2IN_NEXT_ICMP_ERROR] = "ip4-icmp-error", + [SNAT_OUT2IN_NEXT_REASS] = "nat44-out2in-reass", }, }; VLIB_NODE_FUNCTION_MULTIARCH (snat_out2in_node, snat_out2in_node_fn); +static uword +nat44_out2in_reass_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_left_from, *from, *to_next; + snat_out2in_next_t next_index; + u32 pkts_processed = 0; + snat_main_t *sm = &snat_main; + f64 now = vlib_time_now (vm); + u32 thread_index = vlib_get_thread_index (); + snat_main_per_thread_data_t *per_thread_data = + &sm->per_thread_data[thread_index]; + u32 *fragments_to_drop = 0; + u32 *fragments_to_loopback = 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) 
+ { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0, sw_if_index0, proto0, rx_fib_index0, new_addr0, old_addr0; + vlib_buffer_t *b0; + u32 next0; + u8 cached0 = 0; + ip4_header_t *ip0; + nat_reass_ip4_t *reass0; + udp_header_t * udp0; + tcp_header_t * tcp0; + snat_session_key_t key0, sm0; + clib_bihash_kv_8_8_t kv0, value0; + snat_session_t * s0 = 0; + u16 old_port0, new_port0; + ip_csum_t sum0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + next0 = SNAT_OUT2IN_NEXT_LOOKUP; + + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + rx_fib_index0 = fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP4, + sw_if_index0); + + if (PREDICT_FALSE (nat_reass_is_drop_frag(0))) + { + next0 = SNAT_OUT2IN_NEXT_DROP; + b0->error = node->errors[SNAT_OUT2IN_ERROR_DROP_FRAGMENT]; + goto trace0; + } + + ip0 = (ip4_header_t *) vlib_buffer_get_current (b0); + udp0 = ip4_next_header (ip0); + tcp0 = (tcp_header_t *) udp0; + proto0 = ip_proto_to_snat_proto (ip0->protocol); + + reass0 = nat_ip4_reass_find_or_create (ip0->src_address, + ip0->dst_address, + ip0->fragment_id, + ip0->protocol, + 1, + &fragments_to_drop); + + if (PREDICT_FALSE (!reass0)) + { + next0 = SNAT_OUT2IN_NEXT_DROP; + b0->error = node->errors[SNAT_OUT2IN_ERROR_MAX_REASS]; + goto trace0; + } + + if (PREDICT_FALSE (ip4_is_first_fragment (ip0))) + { + key0.addr = ip0->dst_address; + key0.port = udp0->dst_port; + key0.protocol = proto0; + key0.fib_index = rx_fib_index0; + kv0.key = key0.as_u64; + + if (clib_bihash_search_8_8 (&per_thread_data->out2in, &kv0, &value0)) + { + /* Try to match static mapping by external address and port, + destination address and port in packet */ + if (snat_static_mapping_match(sm, key0, &sm0, 1, 0)) + { + b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; + /* + * Send DHCP packets to the ipv4 stack, or we won't + * be able to use dhcp client on the outside interface + */ + if (proto0 != SNAT_PROTOCOL_UDP + || (udp0->dst_port + != clib_host_to_net_u16(UDP_DST_PORT_dhcp_to_client))) + + next0 = SNAT_OUT2IN_NEXT_DROP; + goto trace0; + } + + /* Create session initiated by host from external network */ + s0 = create_session_for_static_mapping(sm, b0, sm0, key0, node, + thread_index); + if (!s0) + { + b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; + next0 = SNAT_OUT2IN_NEXT_DROP; + goto trace0; + } + reass0->sess_index = s0 - per_thread_data->sessions; + } + else + { + s0 = pool_elt_at_index (per_thread_data->sessions, + value0.value); + reass0->sess_index = value0.value; + } + nat_ip4_reass_get_frags (reass0, &fragments_to_loopback); + } + else + { + if (PREDICT_FALSE (reass0->sess_index == (u32) ~0)) + { + if (nat_ip4_reass_add_fragment (reass0, bi0)) + { + b0->error = node->errors[SNAT_OUT2IN_ERROR_MAX_FRAG]; + next0 = SNAT_OUT2IN_NEXT_DROP; + goto trace0; + } + cached0 = 1; + goto trace0; + } + s0 = pool_elt_at_index (per_thread_data->sessions, + reass0->sess_index); + } + + old_addr0 = ip0->dst_address.as_u32; + ip0->dst_address = s0->in2out.addr; + new_addr0 = ip0->dst_address.as_u32; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = s0->in2out.fib_index; + + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + dst_address /* changed member */); + ip0->checksum = ip_csum_fold (sum0); + + if 
(PREDICT_FALSE (ip4_is_first_fragment (ip0))) + { + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + old_port0 = tcp0->dst_port; + tcp0->dst_port = s0->in2out.port; + new_port0 = tcp0->dst_port; + + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + dst_address /* changed member */); + + sum0 = ip_csum_update (sum0, old_port0, new_port0, + ip4_header_t /* cheat */, + length /* changed member */); + tcp0->checksum = ip_csum_fold(sum0); + } + else + { + old_port0 = udp0->dst_port; + udp0->dst_port = s0->in2out.port; + udp0->checksum = 0; + } + } + + /* Accounting */ + s0->last_heard = now; + s0->total_pkts++; + s0->total_bytes += vlib_buffer_length_in_chain (vm, b0); + /* Per-user LRU list maintenance for dynamic translation */ + if (!snat_is_session_static (s0)) + { + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, + s0->per_user_index); + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, + s0->per_user_list_head_index, + s0->per_user_index); + } + + trace0: + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + nat44_out2in_reass_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->cached = cached0; + t->sw_if_index = sw_if_index0; + t->next_index = next0; + } + + if (cached0) + { + n_left_to_next++; + to_next--; + } + else + { + pkts_processed += next0 != SNAT_OUT2IN_NEXT_DROP; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + if (n_left_from == 0 && vec_len (fragments_to_loopback)) + { + from = vlib_frame_vector_args (frame); + u32 len = vec_len (fragments_to_loopback); + if (len <= VLIB_FRAME_SIZE) + { + clib_memcpy (from, fragments_to_loopback, sizeof (u32) * len); + n_left_from = len; + vec_reset_length (fragments_to_loopback); + } + else + { + clib_memcpy (from, + fragments_to_loopback + (len - VLIB_FRAME_SIZE), + sizeof (u32) * VLIB_FRAME_SIZE); + n_left_from = VLIB_FRAME_SIZE; + _vec_len (fragments_to_loopback) = len - VLIB_FRAME_SIZE; + } + } + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, nat44_out2in_reass_node.index, + SNAT_OUT2IN_ERROR_OUT2IN_PACKETS, + pkts_processed); + + nat_send_all_to_node (vm, fragments_to_drop, node, + &node->errors[SNAT_OUT2IN_ERROR_DROP_FRAGMENT], + SNAT_OUT2IN_NEXT_DROP); + + vec_free (fragments_to_drop); + vec_free (fragments_to_loopback); + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (nat44_out2in_reass_node) = { + .function = nat44_out2in_reass_node_fn, + .name = "nat44-out2in-reass", + .vector_size = sizeof (u32), + .format_trace = format_nat44_out2in_reass_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(snat_out2in_error_strings), + .error_strings = snat_out2in_error_strings, + + .n_next_nodes = SNAT_OUT2IN_N_NEXT, + + /* edit / add dispositions here */ + .next_nodes = { + [SNAT_OUT2IN_NEXT_DROP] = "error-drop", + [SNAT_OUT2IN_NEXT_LOOKUP] = "ip4-lookup", + [SNAT_OUT2IN_NEXT_ICMP_ERROR] = "ip4-icmp-error", + [SNAT_OUT2IN_NEXT_REASS] = "nat44-out2in-reass", + }, +}; +VLIB_NODE_FUNCTION_MULTIARCH (nat44_out2in_reass_node, + nat44_out2in_reass_node_fn); + /**************************/ /*** deterministic mode ***/ /**************************/ @@ -2017,6 +2347,7 @@ VLIB_REGISTER_NODE (snat_det_out2in_node) = { [SNAT_OUT2IN_NEXT_DROP] = "error-drop", [SNAT_OUT2IN_NEXT_LOOKUP] = "ip4-lookup", 
[SNAT_OUT2IN_NEXT_ICMP_ERROR] = "ip4-icmp-error", + [SNAT_OUT2IN_NEXT_REASS] = "nat44-out2in-reass", }, }; VLIB_NODE_FUNCTION_MULTIARCH (snat_det_out2in_node, snat_det_out2in_node_fn); @@ -2509,6 +2840,7 @@ VLIB_REGISTER_NODE (snat_out2in_fast_node) = { [SNAT_OUT2IN_NEXT_LOOKUP] = "ip4-lookup", [SNAT_OUT2IN_NEXT_DROP] = "error-drop", [SNAT_OUT2IN_NEXT_ICMP_ERROR] = "ip4-icmp-error", + [SNAT_OUT2IN_NEXT_REASS] = "nat44-out2in-reass", }, }; VLIB_NODE_FUNCTION_MULTIARCH (snat_out2in_fast_node, snat_out2in_fast_node_fn); diff --git a/test/test_nat.py b/test/test_nat.py index e420baffd1d..3c002bb8eca 100644 --- a/test/test_nat.py +++ b/test/test_nat.py @@ -3,16 +3,19 @@ import socket import unittest import struct +import StringIO +import random from framework import VppTestCase, VppTestRunner, running_extended_tests from vpp_ip_route import VppIpRoute, VppRoutePath, DpoProto from scapy.layers.inet import IP, TCP, UDP, ICMP from scapy.layers.inet import IPerror, TCPerror, UDPerror, ICMPerror from scapy.layers.inet6 import IPv6, ICMPv6EchoRequest, ICMPv6EchoReply -from scapy.layers.inet6 import ICMPv6DestUnreach, IPerror6 +from scapy.layers.inet6 import ICMPv6DestUnreach, IPerror6, IPv6ExtHdrFragment from scapy.layers.l2 import Ether, ARP, GRE from scapy.data import IP_PROTOS -from scapy.packet import bind_layers +from scapy.packet import bind_layers, Raw +from scapy.all import fragment6 from util import ppp from ipfix import IPFIX, Set, Template, Data, IPFIXDecoder from time import sleep @@ -464,6 +467,121 @@ class MethodHolder(VppTestCase): "(inside network):", packet)) raise + def create_stream_frag(self, src_if, dst, sport, dport, data): + """ + Create fragmented packet stream + + :param src_if: Source interface + :param dst: Destination IPv4 address + :param sport: Source TCP port + :param dport: Destination TCP port + :param data: Payload data + :returns: Fragmets + """ + id = random.randint(0, 65535) + p = (IP(src=src_if.remote_ip4, dst=dst) / + TCP(sport=sport, dport=dport) / + Raw(data)) + p = p.__class__(str(p)) + chksum = p['TCP'].chksum + pkts = [] + p = (Ether(src=src_if.remote_mac, dst=src_if.local_mac) / + IP(src=src_if.remote_ip4, dst=dst, flags="MF", frag=0, id=id) / + TCP(sport=sport, dport=dport, chksum=chksum) / + Raw(data[0:4])) + pkts.append(p) + p = (Ether(src=src_if.remote_mac, dst=src_if.local_mac) / + IP(src=src_if.remote_ip4, dst=dst, flags="MF", frag=3, id=id, + proto=IP_PROTOS.tcp) / + Raw(data[4:20])) + pkts.append(p) + p = (Ether(src=src_if.remote_mac, dst=src_if.local_mac) / + IP(src=src_if.remote_ip4, dst=dst, frag=5, proto=IP_PROTOS.tcp, + id=id) / + Raw(data[20:])) + pkts.append(p) + return pkts + + def create_stream_frag_ip6(self, src_if, dst, sport, dport, data, + pref=None, plen=0, frag_size=128): + """ + Create fragmented packet stream + + :param src_if: Source interface + :param dst: Destination IPv4 address + :param sport: Source TCP port + :param dport: Destination TCP port + :param data: Payload data + :param pref: NAT64 prefix + :param plen: NAT64 prefix length + :param fragsize: size of fragments + :returns: Fragmets + """ + if pref is None: + dst_ip6 = ''.join(['64:ff9b::', dst]) + else: + dst_ip6 = self.compose_ip6(dst, pref, plen) + + p = (Ether(dst=src_if.local_mac, src=src_if.remote_mac) / + IPv6(src=src_if.remote_ip6, dst=dst_ip6) / + IPv6ExtHdrFragment(id=random.randint(0, 65535)) / + TCP(sport=sport, dport=dport) / + Raw(data)) + + return fragment6(p, frag_size) + + def reass_frags_and_verify(self, frags, src, dst): + """ + Reassemble and 
verify fragmented packet + + :param frags: Captured fragments + :param src: Source IPv4 address to verify + :param dst: Destination IPv4 address to verify + + :returns: Reassembled IPv4 packet + """ + buffer = StringIO.StringIO() + for p in frags: + self.assertEqual(p[IP].src, src) + self.assertEqual(p[IP].dst, dst) + self.check_ip_checksum(p) + buffer.seek(p[IP].frag * 8) + buffer.write(p[IP].payload) + ip = frags[0].getlayer(IP) + ip = IP(src=frags[0][IP].src, dst=frags[0][IP].dst, + proto=frags[0][IP].proto) + if ip.proto == IP_PROTOS.tcp: + p = (ip / TCP(buffer.getvalue())) + self.check_tcp_checksum(p) + elif ip.proto == IP_PROTOS.udp: + p = (ip / UDP(buffer.getvalue())) + return p + + def reass_frags_and_verify_ip6(self, frags, src, dst): + """ + Reassemble and verify fragmented packet + + :param frags: Captured fragments + :param src: Source IPv6 address to verify + :param dst: Destination IPv6 address to verify + + :returns: Reassembled IPv6 packet + """ + buffer = StringIO.StringIO() + for p in frags: + self.assertEqual(p[IPv6].src, src) + self.assertEqual(p[IPv6].dst, dst) + buffer.seek(p[IPv6ExtHdrFragment].offset * 8) + buffer.write(p[IPv6ExtHdrFragment].payload) + ip = IPv6(src=frags[0][IPv6].src, dst=frags[0][IPv6].dst, + nh=frags[0][IPv6ExtHdrFragment].nh) + if ip.nh == IP_PROTOS.tcp: + p = (ip / TCP(buffer.getvalue())) + self.check_tcp_checksum(p) + elif ip.nh == IP_PROTOS.udp: + p = (ip / UDP(buffer.getvalue())) + return p + def verify_ipfix_nat44_ses(self, data): """ Verify IPFIX NAT44 session create/delete event @@ -586,6 +704,8 @@ class TestNAT44(MethodHolder): cls.pg4._remote_ip4 = cls.pg9._remote_hosts[0]._ip4 = "10.0.0.2" cls.pg9.resolve_arp() + random.seed() + except Exception: super(TestNAT44, cls).tearDownClass() raise @@ -671,6 +791,9 @@ class TestNAT44(MethodHolder): addr.ip_address, is_add=0) + self.vapi.nat_set_reass() + self.vapi.nat_set_reass(is_ip6=1) + def nat44_add_static_mapping(self, local_ip, external_ip='0.0.0.0', local_port=0, external_port=0, vrf_id=0, is_add=1, external_sw_if_index=0xFFFFFFFF, @@ -2480,10 +2603,164 @@ class TestNAT44(MethodHolder): sessions = self.vapi.nat44_user_session_dump(self.pg0.remote_ip4n, 0) self.assertEqual(nsessions - len(sessions), 2) + def test_set_get_reass(self): + """ NAT44 set/get virtual fragmentation reassembly """ + reas_cfg1 = self.vapi.nat_get_reass() + + self.vapi.nat_set_reass(timeout=reas_cfg1.ip4_timeout + 5, + max_reass=reas_cfg1.ip4_max_reass * 2, + max_frag=reas_cfg1.ip4_max_frag * 2) + + reas_cfg2 = self.vapi.nat_get_reass() + + self.assertEqual(reas_cfg1.ip4_timeout + 5, reas_cfg2.ip4_timeout) + self.assertEqual(reas_cfg1.ip4_max_reass * 2, reas_cfg2.ip4_max_reass) + self.assertEqual(reas_cfg1.ip4_max_frag * 2, reas_cfg2.ip4_max_frag) + + self.vapi.nat_set_reass(drop_frag=1) + self.assertTrue(self.vapi.nat_get_reass().ip4_drop_frag) + + def test_frag_in_order(self): + """ NAT44 translate fragments arriving in order """ + self.nat44_add_address(self.nat_addr) + self.vapi.nat44_interface_add_del_feature(self.pg0.sw_if_index) + self.vapi.nat44_interface_add_del_feature(self.pg1.sw_if_index, + is_inside=0) + + data = "A" * 4 + "B" * 16 + "C" * 3 + self.tcp_port_in = random.randint(1025, 65535) + + reass = self.vapi.nat_reass_dump() + reass_n_start = len(reass) + + # in2out + pkts = self.create_stream_frag(self.pg0, + self.pg1.remote_ip4, + self.tcp_port_in, + 20, + data) + self.pg0.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + frags = self.pg1.get_capture(len(pkts)) + p 
= self.reass_frags_and_verify(frags, + self.nat_addr, + self.pg1.remote_ip4) + self.assertEqual(p[TCP].dport, 20) + self.assertNotEqual(p[TCP].sport, self.tcp_port_in) + self.tcp_port_out = p[TCP].sport + self.assertEqual(data, p[Raw].load) + + # out2in + pkts = self.create_stream_frag(self.pg1, + self.nat_addr, + 20, + self.tcp_port_out, + data) + self.pg1.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + frags = self.pg0.get_capture(len(pkts)) + p = self.reass_frags_and_verify(frags, + self.pg1.remote_ip4, + self.pg0.remote_ip4) + self.assertEqual(p[TCP].sport, 20) + self.assertEqual(p[TCP].dport, self.tcp_port_in) + self.assertEqual(data, p[Raw].load) + + reass = self.vapi.nat_reass_dump() + reass_n_end = len(reass) + + self.assertEqual(reass_n_end - reass_n_start, 2) + + def test_reass_hairpinning(self): + """ NAT44 fragments hairpinning """ + host = self.pg0.remote_hosts[0] + server = self.pg0.remote_hosts[1] + host_in_port = random.randint(1025, 65535) + host_out_port = 0 + server_in_port = random.randint(1025, 65535) + server_out_port = random.randint(1025, 65535) + data = "A" * 4 + "B" * 16 + "C" * 3 + + self.nat44_add_address(self.nat_addr) + self.vapi.nat44_interface_add_del_feature(self.pg0.sw_if_index) + self.vapi.nat44_interface_add_del_feature(self.pg1.sw_if_index, + is_inside=0) + # add static mapping for server + self.nat44_add_static_mapping(server.ip4, self.nat_addr, + server_in_port, server_out_port, + proto=IP_PROTOS.tcp) + + # send packet from host to server + pkts = self.create_stream_frag(self.pg0, + self.nat_addr, + host_in_port, + server_out_port, + data) + self.pg0.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + frags = self.pg0.get_capture(len(pkts)) + p = self.reass_frags_and_verify(frags, + self.nat_addr, + server.ip4) + self.assertNotEqual(p[TCP].sport, host_in_port) + self.assertEqual(p[TCP].dport, server_in_port) + self.assertEqual(data, p[Raw].load) + + def test_frag_out_of_order(self): + """ NAT44 translate fragments arriving out of order """ + self.nat44_add_address(self.nat_addr) + self.vapi.nat44_interface_add_del_feature(self.pg0.sw_if_index) + self.vapi.nat44_interface_add_del_feature(self.pg1.sw_if_index, + is_inside=0) + + data = "A" * 4 + "B" * 16 + "C" * 3 + random.randint(1025, 65535) + + # in2out + pkts = self.create_stream_frag(self.pg0, + self.pg1.remote_ip4, + self.tcp_port_in, + 20, + data) + pkts.reverse() + self.pg0.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + frags = self.pg1.get_capture(len(pkts)) + p = self.reass_frags_and_verify(frags, + self.nat_addr, + self.pg1.remote_ip4) + self.assertEqual(p[TCP].dport, 20) + self.assertNotEqual(p[TCP].sport, self.tcp_port_in) + self.tcp_port_out = p[TCP].sport + self.assertEqual(data, p[Raw].load) + + # out2in + pkts = self.create_stream_frag(self.pg1, + self.nat_addr, + 20, + self.tcp_port_out, + data) + pkts.reverse() + self.pg1.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + frags = self.pg0.get_capture(len(pkts)) + p = self.reass_frags_and_verify(frags, + self.pg1.remote_ip4, + self.pg0.remote_ip4) + self.assertEqual(p[TCP].sport, 20) + self.assertEqual(p[TCP].dport, self.tcp_port_in) + self.assertEqual(data, p[Raw].load) + def tearDown(self): super(TestNAT44, self).tearDown() if not self.vpp_dead: self.logger.info(self.vapi.cli("show nat44 verbose")) + self.logger.info(self.vapi.cli("show nat virtual-reassembly")) self.clear_nat44() @@ -3928,6 +4205,138 @@ 
class TestNAT64(MethodHolder): self.logger.error(ppp("Unexpected or invalid packet:", p)) raise + def test_frag_in_order(self): + """ NAT64 translate fragments arriving in order """ + self.tcp_port_in = random.randint(1025, 65535) + + self.vapi.nat64_add_del_pool_addr_range(self.nat_addr_n, + self.nat_addr_n) + self.vapi.nat64_add_del_interface(self.pg0.sw_if_index) + self.vapi.nat64_add_del_interface(self.pg1.sw_if_index, is_inside=0) + + reass = self.vapi.nat_reass_dump() + reass_n_start = len(reass) + + # in2out + data = 'a' * 200 + pkts = self.create_stream_frag_ip6(self.pg0, self.pg1.remote_ip4, + self.tcp_port_in, 20, data) + self.pg0.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + frags = self.pg1.get_capture(len(pkts)) + p = self.reass_frags_and_verify(frags, + self.nat_addr, + self.pg1.remote_ip4) + self.assertEqual(p[TCP].dport, 20) + self.assertNotEqual(p[TCP].sport, self.tcp_port_in) + self.tcp_port_out = p[TCP].sport + self.assertEqual(data, p[Raw].load) + + # out2in + data = "A" * 4 + "b" * 16 + "C" * 3 + pkts = self.create_stream_frag(self.pg1, + self.nat_addr, + 20, + self.tcp_port_out, + data) + self.pg1.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + frags = self.pg0.get_capture(len(pkts)) + src = self.compose_ip6(self.pg1.remote_ip4, '64:ff9b::', 96) + p = self.reass_frags_and_verify_ip6(frags, src, self.pg0.remote_ip6) + self.assertEqual(p[TCP].sport, 20) + self.assertEqual(p[TCP].dport, self.tcp_port_in) + self.assertEqual(data, p[Raw].load) + + reass = self.vapi.nat_reass_dump() + reass_n_end = len(reass) + + self.assertEqual(reass_n_end - reass_n_start, 2) + + def test_reass_hairpinning(self): + """ NAT64 fragments hairpinning """ + data = 'a' * 200 + client = self.pg0.remote_hosts[0] + server = self.pg0.remote_hosts[1] + server_in_port = random.randint(1025, 65535) + server_out_port = random.randint(1025, 65535) + client_in_port = random.randint(1025, 65535) + ip = IPv6(src=''.join(['64:ff9b::', self.nat_addr])) + nat_addr_ip6 = ip.src + + self.vapi.nat64_add_del_pool_addr_range(self.nat_addr_n, + self.nat_addr_n) + self.vapi.nat64_add_del_interface(self.pg0.sw_if_index) + self.vapi.nat64_add_del_interface(self.pg1.sw_if_index, is_inside=0) + + # add static BIB entry for server + self.vapi.nat64_add_del_static_bib(server.ip6n, + self.nat_addr_n, + server_in_port, + server_out_port, + IP_PROTOS.tcp) + + # send packet from host to server + pkts = self.create_stream_frag_ip6(self.pg0, + self.nat_addr, + client_in_port, + server_out_port, + data) + self.pg0.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + frags = self.pg0.get_capture(len(pkts)) + p = self.reass_frags_and_verify_ip6(frags, nat_addr_ip6, server.ip6) + self.assertNotEqual(p[TCP].sport, client_in_port) + self.assertEqual(p[TCP].dport, server_in_port) + self.assertEqual(data, p[Raw].load) + + def test_frag_out_of_order(self): + """ NAT64 translate fragments arriving out of order """ + self.tcp_port_in = random.randint(1025, 65535) + + self.vapi.nat64_add_del_pool_addr_range(self.nat_addr_n, + self.nat_addr_n) + self.vapi.nat64_add_del_interface(self.pg0.sw_if_index) + self.vapi.nat64_add_del_interface(self.pg1.sw_if_index, is_inside=0) + + # in2out + data = 'a' * 200 + pkts = self.create_stream_frag_ip6(self.pg0, self.pg1.remote_ip4, + self.tcp_port_in, 20, data) + pkts.reverse() + self.pg0.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + frags = 
self.pg1.get_capture(len(pkts)) + p = self.reass_frags_and_verify(frags, + self.nat_addr, + self.pg1.remote_ip4) + self.assertEqual(p[TCP].dport, 20) + self.assertNotEqual(p[TCP].sport, self.tcp_port_in) + self.tcp_port_out = p[TCP].sport + self.assertEqual(data, p[Raw].load) + + # out2in + data = "A" * 4 + "B" * 16 + "C" * 3 + pkts = self.create_stream_frag(self.pg1, + self.nat_addr, + 20, + self.tcp_port_out, + data) + pkts.reverse() + self.pg1.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + frags = self.pg0.get_capture(len(pkts)) + src = self.compose_ip6(self.pg1.remote_ip4, '64:ff9b::', 96) + p = self.reass_frags_and_verify_ip6(frags, src, self.pg0.remote_ip6) + self.assertEqual(p[TCP].sport, 20) + self.assertEqual(p[TCP].dport, self.tcp_port_in) + self.assertEqual(data, p[Raw].load) + def nat64_get_ses_num(self): """ Return number of active NAT64 sessions. @@ -4006,6 +4415,7 @@ class TestNAT64(MethodHolder): self.logger.info(self.vapi.cli("show nat64 prefix")) self.logger.info(self.vapi.cli("show nat64 bib all")) self.logger.info(self.vapi.cli("show nat64 session table all")) + self.logger.info(self.vapi.cli("show nat virtual-reassembly")) self.clear_nat64() diff --git a/test/vpp_papi_provider.py b/test/vpp_papi_provider.py index 31d7ac48d1f..63f938376ae 100644 --- a/test/vpp_papi_provider.py +++ b/test/vpp_papi_provider.py @@ -1408,6 +1408,43 @@ class VppPapiProvider(object): 'vrf_id': vrf_id, 'is_in': is_in}) + def nat_set_reass( + self, + timeout=2, + max_reass=1024, + max_frag=5, + drop_frag=0, + is_ip6=0): + """Set NAT virtual fragmentation reassembly + + :param timeout: reassembly timeout (Default 2sec) + :param max_reass: maximum concurrent reassemblies (Default 1024) + :param max_frag: maximum fragmets per reassembly (Default 5) + :param drop_frag: if 0 translate fragments, otherwise drop fragments + :param is_ip6: 1 if IPv6, 0 if IPv4 + """ + return self.api( + self.papi.nat_set_reass, + {'timeout': timeout, + 'max_reass': max_reass, + 'max_frag': max_frag, + 'drop_frag': drop_frag, + 'is_ip6': is_ip6}) + + def nat_get_reass(self): + """Get NAT virtual fragmentation reassembly configuration + + :return: NAT virtual fragmentation reassembly configuration + """ + return self.api(self.papi.nat_get_reass, {}) + + def nat_reass_dump(self): + """Dump NAT virtual fragmentation reassemblies + + :return: Dictionary of NAT virtual fragmentation reassemblies + """ + return self.api(self.papi.nat_reass_dump, {}) + def nat_det_add_del_map( self, in_addr, -- 2.16.6
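For reference, a minimal sketch (not part of the patch) of how a test could drive the new reassembly API through the VppPapiProvider methods added above; the enclosing VppTestCase fixture, the self.vapi handle and the chosen max_frag value are assumptions for illustration:

    # illustrative only: query, tweak and dump NAT virtual reassembly state
    cfg = self.vapi.nat_get_reass()
    self.vapi.nat_set_reass(timeout=cfg.ip4_timeout,
                            max_reass=cfg.ip4_max_reass,
                            max_frag=10)
    for r in self.vapi.nat_reass_dump():
        self.logger.info("reass proto %u cached fragments %u" %
                         (r.proto, r.frag_n))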