X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp.c;h=5a215b658a0e8b44cc587637a39d1e8c0c4d4241;hb=26dd6de91b4d36ac04154c7eb6339684db6684a0;hp=39d683c78e7e6f1ea781a70f543868d235b1a082;hpb=4759683744d079f868a6bcca8084eba8466620d7;p=vpp.git diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 39d683c78e7..5a215b658a0 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -25,6 +25,7 @@ #include #include #include +#include tcp_main_t tcp_main; @@ -69,6 +70,46 @@ tcp_add_del_adjacency (tcp_connection_t * tc, u8 is_add) sizeof (args)); } +static void +tcp_cc_init (tcp_connection_t * tc) +{ + tc->cc_algo = tcp_cc_algo_get (tcp_main.cc_algo); + tc->cc_algo->init (tc); +} + +static void +tcp_cc_cleanup (tcp_connection_t * tc) +{ + if (tc->cc_algo->cleanup) + tc->cc_algo->cleanup (tc); +} + +void +tcp_cc_algo_register (tcp_cc_algorithm_type_e type, + const tcp_cc_algorithm_t * vft) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vec_validate (tm->cc_algos, type); + + tm->cc_algos[type] = *vft; + hash_set_mem (tm->cc_algo_by_name, vft->name, type); +} + +tcp_cc_algorithm_t * +tcp_cc_algo_get (tcp_cc_algorithm_type_e type) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + return &tm->cc_algos[type]; +} + +tcp_cc_algorithm_type_e +tcp_cc_algo_new_type (const tcp_cc_algorithm_t * vft) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tcp_cc_algo_register (++tm->cc_last_type, vft); + return tm->cc_last_type; +} + static u32 tcp_connection_bind (u32 session_index, transport_endpoint_t * lcl) { @@ -200,6 +241,8 @@ tcp_connection_cleanup (tcp_connection_t * tc) { tcp_main_t *tm = &tcp_main; + TCP_EVT_DBG (TCP_EVT_DELETE, tc); + /* Cleanup local endpoint if this was an active connect */ transport_endpoint_cleanup (TRANSPORT_PROTO_TCP, &tc->c_lcl_ip, tc->c_lcl_port); @@ -223,9 +266,13 @@ tcp_connection_cleanup (tcp_connection_t * tc) if (!tc->c_is_ip4 && ip6_address_is_link_local_unicast (&tc->c_rmt_ip6)) tcp_add_del_adjacency (tc, 0); + tcp_cc_cleanup (tc); vec_free (tc->snd_sacks); vec_free (tc->snd_sacks_fl); + if (tc->flags & TCP_CONN_RATE_SAMPLE) + tcp_bt_cleanup (tc); + /* Poison the entry */ if (CLIB_DEBUG > 0) clib_memset (tc, 0xFA, sizeof (*tc)); @@ -243,7 +290,6 @@ tcp_connection_cleanup (tcp_connection_t * tc) void tcp_connection_del (tcp_connection_t * tc) { - TCP_EVT_DBG (TCP_EVT_DELETE, tc); session_transport_delete_notify (&tc->connection); tcp_connection_cleanup (tc); } @@ -265,9 +311,14 @@ void tcp_connection_free (tcp_connection_t * tc) { tcp_main_t *tm = &tcp_main; + if (CLIB_DEBUG) + { + u8 thread_index = tc->c_thread_index; + clib_memset (tc, 0xFA, sizeof (*tc)); + pool_put (tm->connections[thread_index], tc); + return; + } pool_put (tm->connections[tc->c_thread_index], tc); - if (CLIB_DEBUG > 0) - clib_memset (tc, 0xFA, sizeof (*tc)); } /** Notify session that connection has been reset. @@ -296,6 +347,7 @@ tcp_connection_reset (tcp_connection_t * tc) tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); session_transport_reset_notify (&tc->connection); tcp_connection_set_state (tc, TCP_STATE_CLOSED); + session_transport_closed_notify (&tc->connection); break; case TCP_STATE_CLOSE_WAIT: case TCP_STATE_FIN_WAIT_1: @@ -306,8 +358,8 @@ tcp_connection_reset (tcp_connection_t * tc) tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); /* Make sure we mark the session as closed. In some states we may * be still trying to send data */ - session_transport_closed_notify (&tc->connection); tcp_connection_set_state (tc, TCP_STATE_CLOSED); + session_transport_closed_notify (&tc->connection); break; case TCP_STATE_CLOSED: case TCP_STATE_TIME_WAIT: @@ -350,6 +402,16 @@ tcp_connection_close (tcp_connection_t * tc) tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_FINWAIT1_TIME); break; case TCP_STATE_ESTABLISHED: + /* If closing with unread data, reset the connection */ + if (transport_max_rx_dequeue (&tc->connection)) + { + tcp_send_reset (tc); + tcp_connection_timers_reset (tc); + tcp_connection_set_state (tc, TCP_STATE_CLOSED); + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); + session_transport_closed_notify (&tc->connection); + break; + } if (!transport_max_tx_dequeue (&tc->connection)) tcp_send_fin (tc); else @@ -530,30 +592,6 @@ tcp_connection_fib_attach (tcp_connection_t * tc) } #endif /* 0 */ -static void -tcp_cc_init (tcp_connection_t * tc) -{ - tc->cc_algo = tcp_cc_algo_get (tcp_main.cc_algo); - tc->cc_algo->init (tc); -} - -void -tcp_cc_algo_register (tcp_cc_algorithm_type_e type, - const tcp_cc_algorithm_t * vft) -{ - tcp_main_t *tm = vnet_get_tcp_main (); - vec_validate (tm->cc_algos, type); - - tm->cc_algos[type] = *vft; -} - -tcp_cc_algorithm_t * -tcp_cc_algo_get (tcp_cc_algorithm_type_e type) -{ - tcp_main_t *tm = vnet_get_tcp_main (); - return &tm->cc_algos[type]; -} - /** * Generate random iss as per rfc6528 */ @@ -629,6 +667,9 @@ tcp_connection_init_vars (tcp_connection_t * tc) if (transport_connection_is_tx_paced (&tc->connection) || tcp_main.tx_pacing) tcp_enable_pacing (tc); + + if (tc->flags & TCP_CONN_RATE_SAMPLE) + tcp_bt_init (tc); } static int @@ -841,9 +882,10 @@ format_tcp_vars (u8 * s, va_list * args) tcp_flight_size (tc), tcp_available_output_snd_space (tc), tcp_rcv_wnd_available (tc)); s = format (s, " tsval_recent %u\n", tc->tsval_recent); - s = format (s, " tsecr %u tsecr_last_ack %u tsval_recent_age %u\n", + s = format (s, " tsecr %u tsecr_last_ack %u tsval_recent_age %u", tc->rcv_opts.tsecr, tc->tsecr_last_ack, tcp_time_now () - tc->tsval_recent_age); + s = format (s, " snd_mss %u\n", tc->snd_mss); s = format (s, " rto %u rto_boff %u srtt %u us %.3f rttvar %u rtt_ts %.4f", tc->rto, tc->rto_boff, tc->srtt, tc->mrtt_us * 1000, tc->rttvar, tc->rtt_ts); @@ -924,6 +966,7 @@ static u8 * format_tcp_listener_session (u8 * s, va_list * args) { u32 tci = va_arg (*args, u32); + u32 __clib_unused thread_index = va_arg (*args, u32); u32 verbose = va_arg (*args, u32); tcp_connection_t *tc = tcp_listener_get (tci); s = format (s, "%-50U", format_tcp_connection_id, tc); @@ -936,6 +979,7 @@ static u8 * format_tcp_half_open_session (u8 * s, va_list * args) { u32 tci = va_arg (*args, u32); + u32 __clib_unused thread_index = va_arg (*args, u32); tcp_connection_t *tc = tcp_half_open_connection_get (tci); return format (s, "%U", format_tcp_connection_id, tc); } @@ -1152,8 +1196,6 @@ tcp_update_time (f64 now, u8 thread_index) tcp_set_time_now (wrk); tw_timer_expire_timers_16t_2w_512sl (&wrk->timer_wheel, now); - tcp_do_fastretransmits (wrk); - tcp_send_acks (wrk); tcp_flush_frames_to_output (wrk); } @@ -1184,11 +1226,14 @@ const static transport_proto_vft_t tcp_proto = { .update_time = tcp_update_time, .tx_fifo_offset = tcp_session_tx_fifo_offset, .flush_data = tcp_session_flush_data, + .custom_tx = tcp_session_custom_tx, .format_connection = format_tcp_session, .format_listener = format_tcp_listener_session, .format_half_open = format_tcp_half_open_session, - .tx_type = TRANSPORT_TX_PEEK, - .service_type = TRANSPORT_SERVICE_VC, + .transport_options = { + .tx_type = TRANSPORT_TX_PEEK, + .service_type = TRANSPORT_SERVICE_VC, + }, }; /* *INDENT-ON* */ @@ -1243,10 +1288,10 @@ tcp_timer_establish_handler (u32 conn_index) ASSERT (tc->state == TCP_STATE_SYN_RCVD); tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID; tcp_connection_set_state (tc, TCP_STATE_CLOSED); - /* Start cleanup. App wasn't notified yet so use delete notify as - * opposed to delete to cleanup session layer state. */ tcp_connection_timers_reset (tc); - session_transport_delete_notify (&tc->connection); + /* Start cleanup. Do NOT delete the session until we do the connection + * cleanup. Otherwise, we end up with a dangling session index in the + * tcp connection. */ tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); } @@ -1271,7 +1316,7 @@ tcp_timer_establish_ao_handler (u32 conn_index) static void tcp_timer_waitclose_handler (u32 conn_index) { - u32 thread_index = vlib_get_thread_index (), rto; + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; tc = tcp_connection_get (conn_index, thread_index); @@ -1307,17 +1352,14 @@ tcp_timer_waitclose_handler (u32 conn_index) break; case TCP_STATE_FIN_WAIT_1: tcp_connection_timers_reset (tc); + session_transport_closed_notify (&tc->connection); if (tc->flags & TCP_CONN_FINPNDG) { - /* If FIN pending send it before closing and wait as long as - * the rto timeout would wait. Notify session layer that transport - * is closed. We haven't sent everything but we did try. */ - tcp_cong_recovery_off (tc); - tcp_send_fin (tc); - rto = clib_max ((tc->rto >> tc->rto_boff) * TCP_TO_TIMER_TICK, 1); - tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, - clib_min (rto, TCP_2MSL_TIME)); - session_transport_closed_notify (&tc->connection); + /* If FIN pending, we haven't sent everything, but we did try. + * Notify session layer that transport is closed. */ + tcp_connection_set_state (tc, TCP_STATE_CLOSED); + tcp_send_reset (tc); + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); } else { @@ -1433,17 +1475,9 @@ tcp_main_enable (vlib_main_t * vm) for (thread = 0; thread < num_threads; thread++) { - vec_validate (tm->wrk_ctx[thread].pending_fast_rxt, 255); - vec_validate (tm->wrk_ctx[thread].ongoing_fast_rxt, 255); - vec_validate (tm->wrk_ctx[thread].postponed_fast_rxt, 255); vec_validate (tm->wrk_ctx[thread].pending_deq_acked, 255); - vec_validate (tm->wrk_ctx[thread].pending_acks, 255); vec_validate (tm->wrk_ctx[thread].pending_disconnects, 255); - vec_reset_length (tm->wrk_ctx[thread].pending_fast_rxt); - vec_reset_length (tm->wrk_ctx[thread].ongoing_fast_rxt); - vec_reset_length (tm->wrk_ctx[thread].postponed_fast_rxt); vec_reset_length (tm->wrk_ctx[thread].pending_deq_acked); - vec_reset_length (tm->wrk_ctx[thread].pending_acks); vec_reset_length (tm->wrk_ctx[thread].pending_disconnects); tm->wrk_ctx[thread].vm = vlib_mains[thread]; @@ -1476,7 +1510,7 @@ tcp_main_enable (vlib_main_t * vm) tcp_initialize_iss_seed (tm); tm->bytes_per_buffer = vlib_buffer_get_default_data_size (vm); - + tm->cc_last_type = TCP_CC_LAST; return error; } @@ -1532,8 +1566,11 @@ tcp_init (vlib_main_t * vm) FIB_PROTOCOL_IP6, tcp6_output_node.index); tcp_api_reference (); + tm->cc_algo_by_name = hash_create_string (0, sizeof (uword)); tm->tx_pacing = 1; tm->cc_algo = TCP_CC_NEWRENO; + tm->default_mtu = 1460; + tm->initial_cwnd_multiplier = 0; return 0; } @@ -1543,15 +1580,20 @@ uword unformat_tcp_cc_algo (unformat_input_t * input, va_list * va) { uword *result = va_arg (*va, uword *); + tcp_main_t *tm = &tcp_main; + char *cc_algo_name; + u8 found = 0; + uword *p; - if (unformat (input, "newreno")) - *result = TCP_CC_NEWRENO; - else if (unformat (input, "cubic")) - *result = TCP_CC_CUBIC; - else - return 0; + if (unformat (input, "%s", &cc_algo_name) + && ((p = hash_get_mem (tm->cc_algo_by_name, cc_algo_name)))) + { + *result = *p; + found = 1; + } - return 1; + vec_free (cc_algo_name); + return found; } uword @@ -1596,6 +1638,11 @@ tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) else if (unformat (input, "max-rx-fifo %U", unformat_memory_size, &tm->max_rx_fifo)) ; + else if (unformat (input, "mtu %d", &tm->default_mtu)) + ; + else if (unformat (input, "initial-cwnd-multiplier %d", + &tm->initial_cwnd_multiplier)) + ; else if (unformat (input, "no-tx-pacing")) tm->tx_pacing = 0; else if (unformat (input, "cc-algo %U", unformat_tcp_cc_algo, @@ -1631,14 +1678,10 @@ tcp_configure_v4_source_address_range (vlib_main_t * vm, vnet_main_t *vnm = vnet_get_main (); u32 start_host_byte_order, end_host_byte_order; fib_prefix_t prefix; - vnet_sw_interface_t *si; fib_node_index_t fei; u32 fib_index = 0; u32 sw_if_index; int rv; - int vnet_proxy_arp_add_del (ip4_address_t * lo_addr, - ip4_address_t * hi_addr, u32 fib_index, - int is_del); clib_memset (&prefix, 0, sizeof (prefix)); @@ -1667,13 +1710,14 @@ tcp_configure_v4_source_address_range (vlib_main_t * vm, sw_if_index = fib_entry_get_resolving_interface (fei); - /* Enable proxy arp on the interface */ - si = vnet_get_sw_interface (vnm, sw_if_index); - si->flags |= VNET_SW_INTERFACE_FLAG_PROXY_ARP; - /* Configure proxy arp across the range */ rv = vnet_proxy_arp_add_del (start, end, fib_index, 0 /* is_del */ ); + if (rv) + return rv; + + rv = vnet_proxy_arp_enable_disable (vnm, sw_if_index, 1); + if (rv) return rv;