X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp.c;h=e32b5c417aef52d2472c0cf61068ea5221d84290;hb=b7b929931a07fbb27b43d5cd105f366c3e29807e;hp=15ac7d37edc38c76b1edffe9c4a6ed3c638c7a20;hpb=ca1c8f3e782dc68a51aa2792771d9b4aac696ddd;p=vpp.git diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 15ac7d37edc..e32b5c417ae 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -77,7 +77,7 @@ tcp_connection_bind (u32 session_index, transport_endpoint_t * lcl) void *iface_ip; pool_get (tm->listener_pool, listener); - memset (listener, 0, sizeof (*listener)); + clib_memset (listener, 0, sizeof (*listener)); listener->c_c_index = listener - tm->listener_pool; listener->c_lcl_port = lcl->port; @@ -103,7 +103,7 @@ tcp_connection_bind (u32 session_index, transport_endpoint_t * lcl) return listener->c_c_index; } -u32 +static u32 tcp_session_bind (u32 session_index, transport_endpoint_t * tep) { return tcp_connection_bind (session_index, tep); @@ -121,19 +121,19 @@ tcp_connection_unbind (u32 listener_index) /* Poison the entry */ if (CLIB_DEBUG > 0) - memset (tc, 0xFA, sizeof (*tc)); + clib_memset (tc, 0xFA, sizeof (*tc)); pool_put_index (tm->listener_pool, listener_index); } -u32 +static u32 tcp_session_unbind (u32 listener_index) { tcp_connection_unbind (listener_index); return 0; } -transport_connection_t * +static transport_connection_t * tcp_session_get_listener (u32 listener_index) { tcp_main_t *tm = vnet_get_tcp_main (); @@ -146,14 +146,14 @@ tcp_session_get_listener (u32 listener_index) * Cleanup half-open connection * */ -void +static void tcp_half_open_connection_del (tcp_connection_t * tc) { tcp_main_t *tm = vnet_get_tcp_main (); clib_spinlock_lock_if_init (&tm->half_open_lock); pool_put_index (tm->half_open_connections, tc->c_c_index); if (CLIB_DEBUG) - memset (tc, 0xFA, sizeof (*tc)); + clib_memset (tc, 0xFA, sizeof (*tc)); clib_spinlock_unlock_if_init (&tm->half_open_lock); } @@ -178,14 +178,14 @@ tcp_half_open_connection_cleanup (tcp_connection_t * tc) return 0; } -tcp_connection_t * +static tcp_connection_t * tcp_half_open_connection_new (void) { tcp_main_t *tm = vnet_get_tcp_main (); tcp_connection_t *tc = 0; ASSERT (vlib_get_thread_index () == 0); pool_get (tm->half_open_connections, tc); - memset (tc, 0, sizeof (*tc)); + clib_memset (tc, 0, sizeof (*tc)); tc->c_c_index = tc - tm->half_open_connections; return tc; } @@ -210,7 +210,8 @@ tcp_connection_cleanup (tcp_connection_t * tc) /* Try to remove the half-open connection. If this is not the owning * thread, tc won't be removed. Retransmit or establish timers will * eventually expire and call again cleanup on the right thread. */ - tcp_half_open_connection_cleanup (tc); + if (tcp_half_open_connection_cleanup (tc)) + tc->flags |= TCP_CONN_HALF_OPEN_DONE; } else { @@ -224,7 +225,7 @@ tcp_connection_cleanup (tcp_connection_t * tc) /* Poison the entry */ if (CLIB_DEBUG > 0) - memset (tc, 0xFA, sizeof (*tc)); + clib_memset (tc, 0xFA, sizeof (*tc)); pool_put (tm->connections[thread_index], tc); } } @@ -251,7 +252,7 @@ tcp_connection_new (u8 thread_index) tcp_connection_t *tc; pool_get (tm->connections[thread_index], tc); - memset (tc, 0, sizeof (*tc)); + clib_memset (tc, 0, sizeof (*tc)); tc->c_c_index = tc - tm->connections[thread_index]; tc->c_thread_index = thread_index; return tc; @@ -277,19 +278,19 @@ tcp_connection_reset (tcp_connection_t * tc) tcp_connection_cleanup (tc); break; case TCP_STATE_ESTABLISHED: + tcp_connection_timers_reset (tc); + /* Set the cleanup timer, in case the session layer/app don't + * cleanly close the connection */ + tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); stream_session_reset_notify (&tc->connection); - /* fall through */ + break; case TCP_STATE_CLOSE_WAIT: case TCP_STATE_FIN_WAIT_1: case TCP_STATE_FIN_WAIT_2: case TCP_STATE_CLOSING: tc->state = TCP_STATE_CLOSED; TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc); - - /* Make sure all timers are cleared */ tcp_connection_timers_reset (tc); - - /* Wait for cleanup from session layer but not forever */ tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); break; case TCP_STATE_CLOSED: @@ -322,21 +323,28 @@ tcp_connection_close (tcp_connection_t * tc) tc->state = TCP_STATE_CLOSED; break; case TCP_STATE_SYN_RCVD: + tcp_connection_timers_reset (tc); tcp_send_fin (tc); tc->state = TCP_STATE_FIN_WAIT_1; + tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); break; case TCP_STATE_ESTABLISHED: - if (!stream_session_tx_fifo_max_dequeue (&tc->connection)) + if (!session_tx_fifo_max_dequeue (&tc->connection)) tcp_send_fin (tc); else tc->flags |= TCP_CONN_FINPNDG; tc->state = TCP_STATE_FIN_WAIT_1; break; case TCP_STATE_CLOSE_WAIT: - tcp_send_fin (tc); - tcp_connection_timers_reset (tc); - tc->state = TCP_STATE_LAST_ACK; - tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + if (!session_tx_fifo_max_dequeue (&tc->connection)) + { + tcp_send_fin (tc); + tcp_connection_timers_reset (tc); + tc->state = TCP_STATE_LAST_ACK; + tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + } + else + tc->flags |= TCP_CONN_FINPNDG; break; case TCP_STATE_FIN_WAIT_1: tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); @@ -353,7 +361,7 @@ tcp_connection_close (tcp_connection_t * tc) tcp_connection_del (tc); } -void +static void tcp_session_close (u32 conn_index, u32 thread_index) { tcp_connection_t *tc; @@ -361,17 +369,15 @@ tcp_session_close (u32 conn_index, u32 thread_index) tcp_connection_close (tc); } -void +static void tcp_session_cleanup (u32 conn_index, u32 thread_index) { tcp_connection_t *tc; tc = tcp_connection_get (conn_index, thread_index); tcp_connection_timers_reset (tc); - - /* Wait for the session tx events to clear */ tc->state = TCP_STATE_CLOSED; TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc); - tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); + tcp_connection_cleanup (tc); } /** @@ -429,7 +435,7 @@ tcp_connection_select_lb_bucket (tcp_connection_t * tc, const dpo_id_t * dpo, if (tc->c_is_ip4) { ip4_tcp_hdr_t hdr; - memset (&hdr, 0, sizeof (hdr)); + clib_memset (&hdr, 0, sizeof (hdr)); hdr.ip.protocol = IP_PROTOCOL_TCP; hdr.ip.address_pair.src.as_u32 = tc->c_lcl_ip.ip4.as_u32; hdr.ip.address_pair.dst.as_u32 = tc->c_rmt_ip.ip4.as_u32; @@ -440,7 +446,7 @@ tcp_connection_select_lb_bucket (tcp_connection_t * tc, const dpo_id_t * dpo, else { ip6_tcp_hdr_t hdr; - memset (&hdr, 0, sizeof (hdr)); + clib_memset (&hdr, 0, sizeof (hdr)); hdr.ip.protocol = IP_PROTOCOL_TCP; clib_memcpy (&hdr.ip.src_address, &tc->c_lcl_ip.ip6, sizeof (ip6_address_t)); @@ -501,6 +507,31 @@ tcp_connection_fib_attach (tcp_connection_t * tc) } #endif /* 0 */ +static void +tcp_cc_init (tcp_connection_t * tc) +{ + tc->cc_algo = tcp_cc_algo_get (TCP_CC_NEWRENO); + tc->cc_algo->init (tc); +} + +void +tcp_cc_algo_register (tcp_cc_algorithm_type_e type, + const tcp_cc_algorithm_t * vft) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vec_validate (tm->cc_algos, type); + + tm->cc_algos[type] = *vft; +} + +tcp_cc_algorithm_t * +tcp_cc_algo_get (tcp_cc_algorithm_type_e type) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + return &tm->cc_algos[type]; +} + + /** * Initialize connection send variables. */ @@ -574,7 +605,7 @@ tcp_alloc_custom_local_endpoint (tcp_main_t * tm, ip46_address_t * lcl_addr, return 0; } -int +static int tcp_connection_open (transport_endpoint_t * rmt) { tcp_main_t *tm = vnet_get_tcp_main (); @@ -621,7 +652,7 @@ tcp_connection_open (transport_endpoint_t * rmt) return tc->c_c_index; } -int +static int tcp_session_open (transport_endpoint_t * tep) { return tcp_connection_open (tep); @@ -657,7 +688,7 @@ const char *tcp_connection_flags_str[] = { #undef _ }; -u8 * +static u8 * format_tcp_connection_flags (u8 * s, va_list * args) { tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); @@ -682,7 +713,7 @@ const char *tcp_conn_timers[] = { #undef _ }; -u8 * +static u8 * format_tcp_timers (u8 * s, va_list * args) { tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); @@ -707,7 +738,7 @@ format_tcp_timers (u8 * s, va_list * args) return s; } -u8 * +static u8 * format_tcp_congestion_status (u8 * s, va_list * args) { tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); @@ -720,7 +751,13 @@ format_tcp_congestion_status (u8 * s, va_list * args) return s; } -u8 * +static i32 +tcp_rcv_wnd_available (tcp_connection_t * tc) +{ + return (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); +} + +static u8 * format_tcp_vars (u8 * s, va_list * args) { tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); @@ -731,8 +768,9 @@ format_tcp_vars (u8 * s, va_list * args) tc->snd_una_max - tc->iss); s = format (s, " rcv_nxt %u rcv_las %u\n", tc->rcv_nxt - tc->irs, tc->rcv_las - tc->irs); - s = format (s, " snd_wnd %u rcv_wnd %u snd_wl1 %u snd_wl2 %u\n", - tc->snd_wnd, tc->rcv_wnd, tc->snd_wl1 - tc->irs, + s = format (s, " snd_wnd %u rcv_wnd %u rcv_wscale %u ", + tc->snd_wnd, tc->rcv_wnd, tc->rcv_wscale); + s = format (s, "snd_wl1 %u snd_wl2 %u\n", tc->snd_wl1 - tc->irs, tc->snd_wl2 - tc->iss); s = format (s, " flight size %u out space %u cc space %u rcv_wnd_av %u\n", tcp_flight_size (tc), tcp_available_output_snd_space (tc), @@ -760,7 +798,7 @@ format_tcp_vars (u8 * s, va_list * args) return s; } -u8 * +static u8 * format_tcp_connection_id (u8 * s, va_list * args) { tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); @@ -803,7 +841,7 @@ format_tcp_connection (u8 * s, va_list * args) return s; } -u8 * +static u8 * format_tcp_session (u8 * s, va_list * args) { u32 tci = va_arg (*args, u32); @@ -819,7 +857,7 @@ format_tcp_session (u8 * s, va_list * args) return s; } -u8 * +static u8 * format_tcp_listener_session (u8 * s, va_list * args) { u32 tci = va_arg (*args, u32); @@ -827,7 +865,7 @@ format_tcp_listener_session (u8 * s, va_list * args) return format (s, "%U", format_tcp_connection_id, tc); } -u8 * +static u8 * format_tcp_half_open_session (u8 * s, va_list * args) { u32 tci = va_arg (*args, u32); @@ -883,7 +921,7 @@ format_tcp_rcv_sacks (u8 * s, va_list * args) return s; } -u8 * +static u8 * format_tcp_sack_hole (u8 * s, va_list * args) { sack_scoreboard_hole_t *hole = va_arg (*args, sack_scoreboard_hole_t *); @@ -904,9 +942,11 @@ format_tcp_scoreboard (u8 * s, va_list * args) s = format (s, "sacked_bytes %u last_sacked_bytes %u lost_bytes %u\n", sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes); s = format (s, " last_bytes_delivered %u high_sacked %u snd_una_adv %u\n", - sb->last_bytes_delivered, sb->high_sacked, sb->snd_una_adv); + sb->last_bytes_delivered, sb->high_sacked - tc->iss, + sb->snd_una_adv); s = format (s, " cur_rxt_hole %u high_rxt %u rescue_rxt %u", - sb->cur_rxt_hole, sb->high_rxt, sb->rescue_rxt); + sb->cur_rxt_hole, sb->high_rxt - tc->iss, + sb->rescue_rxt - tc->iss); hole = scoreboard_first_hole (sb); if (hole) @@ -921,14 +961,14 @@ format_tcp_scoreboard (u8 * s, va_list * args) return s; } -transport_connection_t * +static transport_connection_t * tcp_session_get_transport (u32 conn_index, u32 thread_index) { tcp_connection_t *tc = tcp_connection_get (conn_index, thread_index); return &tc->connection; } -transport_connection_t * +static transport_connection_t * tcp_half_open_session_get_transport (u32 conn_index) { tcp_connection_t *tc = tcp_half_open_connection_get (conn_index); @@ -942,7 +982,7 @@ tcp_half_open_session_get_transport (u32 conn_index) * the tcp options to be used in the next burst and subtracts their * length from the connection's snd_mss. */ -u16 +static u16 tcp_session_send_mss (transport_connection_t * trans_conn) { tcp_connection_t *tc = (tcp_connection_t *) trans_conn; @@ -950,7 +990,7 @@ tcp_session_send_mss (transport_connection_t * trans_conn) /* Ensure snd_mss does accurately reflect the amount of data we can push * in a segment. This also makes sure that options are updated according to * the current state of the connection. */ - tcp_update_snd_mss (tc); + tcp_update_burst_snd_vars (tc); return tc->snd_mss; } @@ -982,8 +1022,8 @@ tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) * @param tc tcp connection * @return number of bytes session is allowed to write */ -u32 -tcp_snd_space (tcp_connection_t * tc) +static inline u32 +tcp_snd_space_inline (tcp_connection_t * tc) { int snd_space, snt_limited; @@ -1032,20 +1072,20 @@ tcp_snd_space (tcp_connection_t * tc) } u32 -tcp_session_send_space (transport_connection_t * trans_conn) +tcp_snd_space (tcp_connection_t * tc) { - tcp_connection_t *tc = (tcp_connection_t *) trans_conn; - return clib_min (tcp_snd_space (tc), - tc->snd_wnd - (tc->snd_nxt - tc->snd_una)); + return tcp_snd_space_inline (tc); } -i32 -tcp_rcv_wnd_available (tcp_connection_t * tc) +static u32 +tcp_session_send_space (transport_connection_t * trans_conn) { - return (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); + tcp_connection_t *tc = (tcp_connection_t *) trans_conn; + return clib_min (tcp_snd_space_inline (tc), + tc->snd_wnd - (tc->snd_nxt - tc->snd_una)); } -u32 +static u32 tcp_session_tx_fifo_offset (transport_connection_t * trans_conn) { tcp_connection_t *tc = (tcp_connection_t *) trans_conn; @@ -1056,21 +1096,29 @@ tcp_session_tx_fifo_offset (transport_connection_t * trans_conn) return (tc->snd_nxt - tc->snd_una); } -void +static void tcp_update_time (f64 now, u8 thread_index) { tcp_set_time_now (thread_index); - tw_timer_expire_timers_16t_2w_512sl (&tcp_main.timer_wheels[thread_index], + tw_timer_expire_timers_16t_2w_512sl (&tcp_main. + wrk_ctx[thread_index].timer_wheel, now); tcp_flush_frames_to_output (thread_index); } +static u32 +tcp_session_push_header (transport_connection_t * tconn, vlib_buffer_t * b) +{ + tcp_connection_t *tc = (tcp_connection_t *) tconn; + return tcp_push_header (tc, b); +} + /* *INDENT-OFF* */ const static transport_proto_vft_t tcp_proto = { .enable = vnet_tcp_enable_disable, .bind = tcp_session_bind, .unbind = tcp_session_unbind, - .push_header = tcp_push_header, + .push_header = tcp_session_push_header, .get_connection = tcp_session_get_transport, .get_listener = tcp_session_get_listener, .get_half_open = tcp_half_open_session_get_transport, @@ -1089,7 +1137,7 @@ const static transport_proto_vft_t tcp_proto = { }; /* *INDENT-ON* */ -void +static void tcp_timer_keep_handler (u32 conn_index) { u32 thread_index = vlib_get_thread_index (); @@ -1101,7 +1149,7 @@ tcp_timer_keep_handler (u32 conn_index) tcp_connection_close (tc); } -void +static void tcp_timer_establish_handler (u32 conn_index) { tcp_connection_t *tc; @@ -1129,7 +1177,7 @@ tcp_timer_establish_handler (u32 conn_index) tcp_connection_cleanup (tc); } -void +static void tcp_timer_waitclose_handler (u32 conn_index) { u32 thread_index = vlib_get_thread_index (); @@ -1149,6 +1197,10 @@ tcp_timer_waitclose_handler (u32 conn_index) clib_warning ("FIN was sent and still in CLOSE WAIT. Weird!"); } + /* Make sure we don't try to send unsent data */ + tcp_connection_timers_reset (tc); + tcp_cong_recovery_off (tc); + tc->snd_una_max = tc->snd_nxt = tc->snd_una; tcp_send_fin (tc); tc->state = TCP_STATE_LAST_ACK; @@ -1194,13 +1246,13 @@ tcp_expired_timers_dispatch (u32 * expired_timers) } } -void +static void tcp_initialize_timer_wheels (tcp_main_t * tm) { tw_timer_wheel_16t_2w_512sl_t *tw; /* *INDENT-OFF* */ foreach_vlib_main (({ - tw = &tm->timer_wheels[ii]; + tw = &tm->wrk_ctx[ii].timer_wheel; tw_timer_wheel_init_16t_2w_512sl (tw, tcp_expired_timers_dispatch, 100e-3 /* timer period 100ms */ , ~0); tw->last_run_time = vlib_time_now (this_vlib_main); @@ -1208,7 +1260,7 @@ tcp_initialize_timer_wheels (tcp_main_t * tm) /* *INDENT-ON* */ } -clib_error_t * +static clib_error_t * tcp_main_enable (vlib_main_t * vm) { tcp_main_t *tm = vnet_get_tcp_main (); @@ -1269,13 +1321,6 @@ tcp_main_enable (vlib_main_t * vm) pool_init_fixed (tm->half_open_connections, tm->preallocated_half_open_connections); - /* Initialize per worker thread tx buffers (used for control messages) */ - vec_validate (tm->tx_buffers, num_threads - 1); - - /* Initialize timer wheels */ - vec_validate (tm->timer_wheels, num_threads - 1); - tcp_initialize_timer_wheels (tm); - /* Initialize clocks per tick for TCP timestamp. Used to compute * monotonically increasing timestamps. */ tm->tstamp_ticks_per_clock = vm->clib_time.seconds_per_clock @@ -1286,15 +1331,12 @@ tcp_main_enable (vlib_main_t * vm) clib_spinlock_init (&tm->half_open_lock); } - vec_validate (tm->tx_frames[0], num_threads - 1); - vec_validate (tm->tx_frames[1], num_threads - 1); - vec_validate (tm->ip_lookup_tx_frames[0], num_threads - 1); - vec_validate (tm->ip_lookup_tx_frames[1], num_threads - 1); + vec_validate (tm->wrk_ctx, num_threads - 1); + tcp_initialize_timer_wheels (tm); tm->bytes_per_buffer = vlib_buffer_free_list_buffer_size (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); - vec_validate (tm->time_now, num_threads - 1); return error; } @@ -1326,7 +1368,7 @@ tcp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add) tm->punt_unknown6 = is_add; } -clib_error_t * +static clib_error_t * tcp_init (vlib_main_t * vm) { tcp_main_t *tm = vnet_get_tcp_main (); @@ -1362,9 +1404,8 @@ tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { - if (unformat - (input, "preallocated-connections %d", - &tm->preallocated_connections)) + if (unformat (input, "preallocated-connections %d", + &tm->preallocated_connections)) ; else if (unformat (input, "preallocated-half-open-connections %d", &tm->preallocated_half_open_connections)) @@ -1372,6 +1413,9 @@ tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) else if (unformat (input, "buffer-fail-fraction %f", &tm->buffer_fail_fraction)) ; + else if (unformat (input, "max-rx-fifo %U", unformat_memory_size, + &tm->max_rx_fifo)) + ; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); @@ -1409,7 +1453,7 @@ tcp_configure_v4_source_address_range (vlib_main_t * vm, ip4_address_t * hi_addr, u32 fib_index, int is_del); - memset (&prefix, 0, sizeof (prefix)); + clib_memset (&prefix, 0, sizeof (prefix)); fib_index = fib_table_find (FIB_PROTOCOL_IP4, table_id); @@ -1494,7 +1538,7 @@ tcp_configure_v6_source_address_range (vlib_main_t * vm, fib_node_index_t fei; u32 sw_if_index; - memset (&prefix, 0, sizeof (prefix)); + clib_memset (&prefix, 0, sizeof (prefix)); fib_index = fib_table_find (FIB_PROTOCOL_IP6, table_id); @@ -1719,7 +1763,7 @@ tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose) if (!tc) return s; - memset (dummy_tc, 0, sizeof (*dummy_tc)); + clib_memset (dummy_tc, 0, sizeof (*dummy_tc)); tcp_connection_timers_init (dummy_tc); scoreboard_init (&dummy_tc->sack_sb); dummy_tc->rcv_opts.flags |= TCP_OPTS_FLAG_SACK;