X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp.c;h=b6c348288ece3811fe644de32759b312c4850b2e;hb=586afd762bfa149f5ca167bd5fd5a0cd59ce94fe;hp=0f9b7097b42d2f37eca7b424f64461186ed90b62;hpb=68b0fb0c620c7451ef1a6380c43c39de6614db51;p=vpp.git diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 0f9b7097b42..b6c348288ec 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -21,7 +21,7 @@ tcp_main_t tcp_main; static u32 -tcp_connection_bind (vlib_main_t * vm, u32 session_index, ip46_address_t * ip, +tcp_connection_bind (u32 session_index, ip46_address_t * ip, u16 port_host_byte_order, u8 is_ip4) { tcp_main_t *tm = &tcp_main; @@ -43,42 +43,41 @@ tcp_connection_bind (vlib_main_t * vm, u32 session_index, ip46_address_t * ip, listener->state = TCP_STATE_LISTEN; listener->c_is_ip4 = 1; + tcp_connection_timers_init (listener); + + TCP_EVT_DBG (TCP_EVT_BIND, listener); + return listener->c_c_index; } u32 -tcp_session_bind_ip4 (vlib_main_t * vm, u32 session_index, - ip46_address_t * ip, u16 port_host_byte_order) +tcp_session_bind_ip4 (u32 session_index, ip46_address_t * ip, + u16 port_host_byte_order) { - return tcp_connection_bind (vm, session_index, ip, port_host_byte_order, 1); + return tcp_connection_bind (session_index, ip, port_host_byte_order, 1); } u32 -tcp_session_bind_ip6 (vlib_main_t * vm, u32 session_index, - ip46_address_t * ip, u16 port_host_byte_order) +tcp_session_bind_ip6 (u32 session_index, ip46_address_t * ip, + u16 port_host_byte_order) { - return tcp_connection_bind (vm, session_index, ip, port_host_byte_order, 0); + return tcp_connection_bind (session_index, ip, port_host_byte_order, 0); } static void -tcp_session_unbind (u32 listener_index) +tcp_connection_unbind (u32 listener_index) { tcp_main_t *tm = vnet_get_tcp_main (); + TCP_EVT_DBG (TCP_EVT_UNBIND, + pool_elt_at_index (tm->listener_pool, listener_index)); pool_put_index (tm->listener_pool, listener_index); } u32 -tcp_session_unbind_ip4 (vlib_main_t * vm, u32 listener_index) -{ - tcp_session_unbind (listener_index); - return 0; -} - -u32 -tcp_session_unbind_ip6 (vlib_main_t * vm, u32 listener_index) +tcp_session_unbind (u32 listener_index) { - tcp_session_unbind (listener_index); + tcp_connection_unbind (listener_index); return 0; } @@ -135,10 +134,25 @@ tcp_connection_cleanup (tcp_connection_t * tc) void tcp_connection_del (tcp_connection_t * tc) { + TCP_EVT_DBG (TCP_EVT_DELETE, tc); stream_session_delete_notify (&tc->connection); tcp_connection_cleanup (tc); } +/** Notify session that connection has been reset. + * + * Switch state to closed and wait for session to call cleanup. + */ +void +tcp_connection_reset (tcp_connection_t * tc) +{ + if (tc->state == TCP_STATE_CLOSED) + return; + + tc->state = TCP_STATE_CLOSED; + stream_session_reset_notify (&tc->connection); +} + /** * Begin connection closing procedure. * @@ -149,10 +163,14 @@ tcp_connection_del (tcp_connection_t * tc) * calls cleanup. * 2) TIME_WAIT (active close) whereby after 2MSL the 2MSL timer triggers * and cleanup is called. + * + * N.B. Half-close connections are not supported */ void tcp_connection_close (tcp_connection_t * tc) { + TCP_EVT_DBG (TCP_EVT_CLOSE, tc); + /* Send FIN if needed */ if (tc->state == TCP_STATE_ESTABLISHED || tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_CLOSE_WAIT) @@ -166,9 +184,9 @@ tcp_connection_close (tcp_connection_t * tc) else if (tc->state == TCP_STATE_CLOSE_WAIT) tc->state = TCP_STATE_LAST_ACK; - /* Half-close connections are not supported XXX */ - - if (tc->state == TCP_STATE_CLOSED) + /* If in CLOSED and WAITCLOSE timer is not set, delete connection now */ + if (tc->timers[TCP_TIMER_WAITCLOSE] == TCP_TIMER_HANDLE_INVALID + && tc->state == TCP_STATE_CLOSED) tcp_connection_del (tc); } @@ -185,7 +203,10 @@ tcp_session_cleanup (u32 conn_index, u32 thread_index) { tcp_connection_t *tc; tc = tcp_connection_get (conn_index, thread_index); - tcp_connection_cleanup (tc); + + /* Wait for the session tx events to clear */ + tc->state = TCP_STATE_CLOSED; + tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); } void * @@ -217,6 +238,7 @@ ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4) return 0; } +#define PORT_MASK ((1 << 16)- 1) /** * Allocate local port and add if successful add entry to local endpoint * table to mark the pair as used. @@ -224,10 +246,10 @@ ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4) u16 tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) { - u8 unique = 0; transport_endpoint_t *tep; u32 time_now, tei; - u16 min = 1024, max = 65535, tries; /* XXX configurable ? */ + u16 min = 1024, max = 65535; /* XXX configurable ? */ + int tries; tries = max - min; time_now = tcp_time_now (); @@ -235,37 +257,34 @@ tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) /* Start at random point or max */ pool_get (tm->local_endpoints, tep); clib_memcpy (&tep->ip, ip, sizeof (*ip)); - tep->port = random_u32 (&time_now) << 16; - tep->port = tep->port < min ? max : tep->port; /* Search for first free slot */ - while (tries) + for (; tries >= 0; tries--) { + u16 port = 0; + + /* Find a port in the specified range */ + while (1) + { + port = random_u32 (&time_now) & PORT_MASK; + if (PREDICT_TRUE (port >= min && port < max)) + break; + } + + tep->port = port; + + /* Look it up */ tei = transport_endpoint_lookup (&tm->local_endpoints_table, &tep->ip, tep->port); + /* If not found, we're done */ if (tei == TRANSPORT_ENDPOINT_INVALID_INDEX) { - unique = 1; - break; + transport_endpoint_table_add (&tm->local_endpoints_table, tep, + tep - tm->local_endpoints); + return tep->port; } - - tep->port--; - - if (tep->port < min) - tep->port = max; - - tries--; - } - - if (unique) - { - transport_endpoint_table_add (&tm->local_endpoints_table, tep, - tep - tm->local_endpoints); - - return tep->port; } - - /* Failed */ + /* No free ports */ pool_put (tm->local_endpoints, tep); return -1; } @@ -309,7 +328,7 @@ tcp_connection_init_vars (tcp_connection_t * tc) { tcp_connection_timers_init (tc); tcp_set_snd_mss (tc); - tc->sack_sb.head = TCP_INVALID_SACK_HOLE_INDEX; + scoreboard_init (&tc->sack_sb); tcp_cc_init (tc); } @@ -360,7 +379,10 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) /* Allocate source port */ lcl_port = tcp_allocate_local_port (tm, &lcl_addr); if (lcl_port < 1) - return -1; + { + clib_warning ("Failed to allocate src port"); + return -1; + } /* * Create connection and send SYN @@ -383,6 +405,8 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) tc->state = TCP_STATE_SYN_SENT; + TCP_EVT_DBG (TCP_EVT_OPEN, tc); + return tc->c_c_index; } @@ -398,82 +422,119 @@ tcp_session_open_ip6 (ip46_address_t * addr, u16 port) return tcp_connection_open (addr, port, 0); } +const char *tcp_dbg_evt_str[] = { +#define _(sym, str) str, + foreach_tcp_dbg_evt +#undef _ +}; + +const char *tcp_fsm_states[] = { +#define _(sym, str) str, + foreach_tcp_fsm_state +#undef _ +}; + u8 * -format_tcp_session_ip4 (u8 * s, va_list * args) +format_tcp_state (u8 * s, va_list * args) { - u32 tci = va_arg (*args, u32); - u32 thread_index = va_arg (*args, u32); - tcp_connection_t *tc; + tcp_state_t *state = va_arg (*args, tcp_state_t *); - tc = tcp_connection_get (tci, thread_index); + if (*state < TCP_N_STATES) + s = format (s, "%s", tcp_fsm_states[*state]); + else + s = format (s, "UNKNOWN"); - s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip4_address, - &tc->c_lcl_ip4, clib_net_to_host_u16 (tc->c_lcl_port), - format_ip4_address, &tc->c_rmt_ip4, - clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +const char *tcp_conn_timers[] = { +#define _(sym, str) str, + foreach_tcp_timer +#undef _ +}; + +u8 * +format_tcp_timers (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + int i, last = 0; + + for (i = 0; i < TCP_N_TIMERS; i++) + if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID) + last = i; + + s = format (s, "["); + for (i = 0; i < last; i++) + { + if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID) + s = format (s, "%s,", tcp_conn_timers[i]); + } + + if (last > 0) + s = format (s, "%s]", tcp_conn_timers[i]); + else + s = format (s, "]"); return s; } u8 * -format_tcp_session_ip6 (u8 * s, va_list * args) +format_tcp_connection (u8 * s, va_list * args) { - u32 tci = va_arg (*args, u32); - u32 thread_index = va_arg (*args, u32); - tcp_connection_t *tc = tcp_connection_get (tci, thread_index); - s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip6_address, - &tc->c_lcl_ip6, clib_net_to_host_u16 (tc->c_lcl_port), - format_ip6_address, &tc->c_rmt_ip6, - clib_net_to_host_u16 (tc->c_rmt_port)); + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + + if (tc->c_is_ip4) + { + s = format (s, "[#%d][%s] %U:%d->%U:%d", tc->c_thread_index, "T", + format_ip4_address, &tc->c_lcl_ip4, + clib_net_to_host_u16 (tc->c_lcl_port), format_ip4_address, + &tc->c_rmt_ip4, clib_net_to_host_u16 (tc->c_rmt_port)); + } + else + { + s = format (s, "[#%d][%s] %U:%d->%U:%d", tc->c_thread_index, "T", + format_ip6_address, &tc->c_lcl_ip6, + clib_net_to_host_u16 (tc->c_lcl_port), format_ip6_address, + &tc->c_rmt_ip6, clib_net_to_host_u16 (tc->c_rmt_port)); + } + return s; } u8 * -format_tcp_listener_session_ip4 (u8 * s, va_list * args) +format_tcp_connection_verbose (u8 * s, va_list * args) { - u32 tci = va_arg (*args, u32); - tcp_connection_t *tc = tcp_listener_get (tci); - s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip4_address, - &tc->c_lcl_ip4, clib_net_to_host_u16 (tc->c_lcl_port), - format_ip4_address, &tc->c_rmt_ip4, - clib_net_to_host_u16 (tc->c_rmt_port)); + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + s = format (s, "%U %U %U", format_tcp_connection, tc, format_tcp_state, + &tc->state, format_tcp_timers, tc); return s; } u8 * -format_tcp_listener_session_ip6 (u8 * s, va_list * args) +format_tcp_session (u8 * s, va_list * args) { u32 tci = va_arg (*args, u32); - tcp_connection_t *tc = tcp_listener_get (tci); - s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip6_address, - &tc->c_lcl_ip6, clib_net_to_host_u16 (tc->c_lcl_port), - format_ip6_address, &tc->c_rmt_ip6, - clib_net_to_host_u16 (tc->c_rmt_port)); - return s; + u32 thread_index = va_arg (*args, u32); + tcp_connection_t *tc; + + tc = tcp_connection_get (tci, thread_index); + return format (s, "%U", format_tcp_connection, tc); } u8 * -format_tcp_half_open_session_ip4 (u8 * s, va_list * args) +format_tcp_listener_session (u8 * s, va_list * args) { u32 tci = va_arg (*args, u32); - tcp_connection_t *tc = tcp_half_open_connection_get (tci); - s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip4_address, - &tc->c_lcl_ip4, clib_net_to_host_u16 (tc->c_lcl_port), - format_ip4_address, &tc->c_rmt_ip4, - clib_net_to_host_u16 (tc->c_rmt_port)); - return s; + tcp_connection_t *tc = tcp_listener_get (tci); + return format (s, "%U", format_tcp_connection, tc); } u8 * -format_tcp_half_open_session_ip6 (u8 * s, va_list * args) +format_tcp_half_open_session (u8 * s, va_list * args) { u32 tci = va_arg (*args, u32); tcp_connection_t *tc = tcp_half_open_connection_get (tci); - s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip6_address, - &tc->c_lcl_ip6, clib_net_to_host_u16 (tc->c_lcl_port), - format_ip6_address, &tc->c_rmt_ip6, - clib_net_to_host_u16 (tc->c_rmt_port)); - return s; + return format (s, "%U", format_tcp_connection, tc); } transport_connection_t * @@ -497,24 +558,57 @@ tcp_session_send_mss (transport_connection_t * trans_conn) return tc->snd_mss; } +/** + * Compute tx window session is allowed to fill. + */ u32 tcp_session_send_space (transport_connection_t * trans_conn) { + u32 snd_space; tcp_connection_t *tc = (tcp_connection_t *) trans_conn; - return tcp_available_snd_space (tc); + + /* If we haven't gotten dupacks or if we did and have gotten sacked bytes + * then we can still send */ + if (PREDICT_TRUE (tcp_in_fastrecovery (tc) == 0 + && (tc->rcv_dupacks == 0 + || tc->sack_sb.last_sacked_bytes))) + { + snd_space = tcp_available_snd_space (tc); + + /* If we can't write at least a segment, don't try at all */ + if (snd_space < tc->snd_mss) + return 0; + + /* round down to mss multiple */ + return snd_space - (snd_space % tc->snd_mss); + } + + /* If in fast recovery, send 1 SMSS if wnd allows */ + if (tcp_in_fastrecovery (tc) && tcp_available_snd_space (tc) + && tcp_fastrecovery_sent_1_smss (tc)) + { + tcp_fastrecovery_1_smss_on (tc); + return tc->snd_mss; + } + + return 0; } u32 -tcp_session_rx_fifo_offset (transport_connection_t * trans_conn) +tcp_session_tx_fifo_offset (transport_connection_t * trans_conn) { tcp_connection_t *tc = (tcp_connection_t *) trans_conn; - return (tc->snd_una_max - tc->snd_una); + + ASSERT (seq_geq (tc->snd_nxt, tc->snd_una)); + + /* This still works if fast retransmit is on */ + return (tc->snd_nxt - tc->snd_una); } /* *INDENT-OFF* */ const static transport_proto_vft_t tcp4_proto = { .bind = tcp_session_bind_ip4, - .unbind = tcp_session_unbind_ip4, + .unbind = tcp_session_unbind, .push_header = tcp_push_header, .get_connection = tcp_session_get_transport, .get_listener = tcp_session_get_listener, @@ -524,15 +618,15 @@ const static transport_proto_vft_t tcp4_proto = { .cleanup = tcp_session_cleanup, .send_mss = tcp_session_send_mss, .send_space = tcp_session_send_space, - .rx_fifo_offset = tcp_session_rx_fifo_offset, - .format_connection = format_tcp_session_ip4, - .format_listener = format_tcp_listener_session_ip4, - .format_half_open = format_tcp_half_open_session_ip4 + .tx_fifo_offset = tcp_session_tx_fifo_offset, + .format_connection = format_tcp_session, + .format_listener = format_tcp_listener_session, + .format_half_open = format_tcp_half_open_session, }; const static transport_proto_vft_t tcp6_proto = { .bind = tcp_session_bind_ip6, - .unbind = tcp_session_unbind_ip6, + .unbind = tcp_session_unbind, .push_header = tcp_push_header, .get_connection = tcp_session_get_transport, .get_listener = tcp_session_get_listener, @@ -542,20 +636,20 @@ const static transport_proto_vft_t tcp6_proto = { .cleanup = tcp_session_cleanup, .send_mss = tcp_session_send_mss, .send_space = tcp_session_send_space, - .rx_fifo_offset = tcp_session_rx_fifo_offset, - .format_connection = format_tcp_session_ip6, - .format_listener = format_tcp_listener_session_ip6, - .format_half_open = format_tcp_half_open_session_ip6 + .tx_fifo_offset = tcp_session_tx_fifo_offset, + .format_connection = format_tcp_session, + .format_listener = format_tcp_listener_session, + .format_half_open = format_tcp_half_open_session, }; /* *INDENT-ON* */ void tcp_timer_keep_handler (u32 conn_index) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; - tc = tcp_connection_get (conn_index, cpu_index); + tc = tcp_connection_get (conn_index, thread_index); tc->timers[TCP_TIMER_KEEP] = TCP_TIMER_HANDLE_INVALID; tcp_connection_close (tc); @@ -579,13 +673,32 @@ tcp_timer_establish_handler (u32 conn_index) } void -tcp_timer_2msl_handler (u32 conn_index) +tcp_timer_waitclose_handler (u32 conn_index) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; - tc = tcp_connection_get (conn_index, cpu_index); - tc->timers[TCP_TIMER_2MSL] = TCP_TIMER_HANDLE_INVALID; + tc = tcp_connection_get (conn_index, thread_index); + tc->timers[TCP_TIMER_WAITCLOSE] = TCP_TIMER_HANDLE_INVALID; + + /* Session didn't come back with a close(). Send FIN either way + * and switch to LAST_ACK. */ + if (tc->state == TCP_STATE_CLOSE_WAIT) + { + if (tc->flags & TCP_CONN_FINSNT) + { + clib_warning ("FIN was sent and still in CLOSE WAIT. Weird!"); + } + + tcp_send_fin (tc); + tc->state = TCP_STATE_LAST_ACK; + + /* Make sure we don't wait in LAST ACK forever */ + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + + /* Don't delete the connection yet */ + return; + } tcp_connection_del (tc); } @@ -595,9 +708,9 @@ static timer_expiration_handler *timer_expiration_handlers[TCP_N_TIMERS] = { tcp_timer_retransmit_handler, tcp_timer_delack_handler, - 0, + tcp_timer_persist_handler, tcp_timer_keep_handler, - tcp_timer_2msl_handler, + tcp_timer_waitclose_handler, tcp_timer_retransmit_syn_handler, tcp_timer_establish_handler }; @@ -615,6 +728,8 @@ tcp_expired_timers_dispatch (u32 * expired_timers) connection_index = expired_timers[i] & 0x0FFFFFFF; timer_id = expired_timers[i] >> 28; + TCP_EVT_DBG (TCP_EVT_TIMER_POP, connection_index, timer_id); + /* Handle expiration */ (*timer_expiration_handlers[timer_id]) (connection_index); } @@ -633,18 +748,15 @@ tcp_initialize_timer_wheels (tcp_main_t * tm) } clib_error_t * -tcp_init (vlib_main_t * vm) +tcp_main_enable (vlib_main_t * vm) { - ip_main_t *im = &ip_main; - ip_protocol_info_t *pi; tcp_main_t *tm = vnet_get_tcp_main (); + ip_protocol_info_t *pi; + ip_main_t *im = &ip_main; vlib_thread_main_t *vtm = vlib_get_thread_main (); clib_error_t *error = 0; u32 num_threads; - tm->vlib_main = vm; - tm->vnet_main = vnet_get_main (); - if ((error = vlib_call_init_function (vm, ip_main_init))) return error; if ((error = vlib_call_init_function (vm, ip4_lookup_init))) @@ -683,7 +795,7 @@ tcp_init (vlib_main_t * vm) vec_validate (tm->timer_wheels, num_threads - 1); tcp_initialize_timer_wheels (tm); - vec_validate (tm->delack_connections, num_threads - 1); +// vec_validate (tm->delack_connections, num_threads - 1); /* Initialize clocks per tick for TCP timestamp. Used to compute * monotonically increasing timestamps. */ @@ -697,6 +809,36 @@ tcp_init (vlib_main_t * vm) return error; } +clib_error_t * +vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en) +{ + if (is_en) + { + if (tcp_main.is_enabled) + return 0; + + return tcp_main_enable (vm); + } + else + { + tcp_main.is_enabled = 0; + } + + return 0; +} + +clib_error_t * +tcp_init (vlib_main_t * vm) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + + tm->vlib_main = vm; + tm->vnet_main = vnet_get_main (); + tm->is_enabled = 0; + + return 0; +} + VLIB_INIT_FUNCTION (tcp_init); /*