X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp.c;h=09c47d989ac260a1b46a02ee83df7c1f5d4d673b;hb=f65074e4df47d05238e051615dbaf5d2bcbaddf2;hp=f703d634b5418d2678c821217cc7a7acd2c170a3;hpb=4af830cd7b4a11bb84840183f7eebd6fb43497a6;p=vpp.git diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index f703d634b54..09c47d989ac 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Cisco and/or its affiliates. + * Copyright (c) 2016-2019 Cisco and/or its affiliates. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: @@ -172,7 +172,7 @@ tcp_half_open_connection_cleanup (tcp_connection_t * tc) /* Make sure this is the owning thread */ if (tc->c_thread_index != vlib_get_thread_index ()) return 1; - tcp_timer_reset (tc, TCP_TIMER_ESTABLISH); + tcp_timer_reset (tc, TCP_TIMER_ESTABLISH_AO); tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT_SYN); tcp_half_open_connection_del (tc); return 0; @@ -223,6 +223,9 @@ tcp_connection_cleanup (tcp_connection_t * tc) if (!tc->c_is_ip4 && ip6_address_is_link_local_unicast (&tc->c_rmt_ip6)) tcp_add_del_adjacency (tc, 0); + vec_free (tc->snd_sacks); + vec_free (tc->snd_sacks_fl); + /* Poison the entry */ if (CLIB_DEBUG > 0) clib_memset (tc, 0xFA, sizeof (*tc)); @@ -241,12 +244,12 @@ void tcp_connection_del (tcp_connection_t * tc) { TCP_EVT_DBG (TCP_EVT_DELETE, tc); - stream_session_delete_notify (&tc->connection); + session_transport_delete_notify (&tc->connection); tcp_connection_cleanup (tc); } tcp_connection_t * -tcp_connection_new (u8 thread_index) +tcp_connection_alloc (u8 thread_index) { tcp_main_t *tm = vnet_get_tcp_main (); tcp_connection_t *tc; @@ -258,6 +261,15 @@ tcp_connection_new (u8 thread_index) return tc; } +void +tcp_connection_free (tcp_connection_t * tc) +{ + tcp_main_t *tm = &tcp_main; + pool_put (tm->connections[tc->c_thread_index], tc); + if (CLIB_DEBUG > 0) + clib_memset (tc, 0xFA, sizeof (*tc)); +} + /** Notify session that connection has been reset. * * Switch state to closed and wait for session to call cleanup. @@ -270,7 +282,7 @@ tcp_connection_reset (tcp_connection_t * tc) { case TCP_STATE_SYN_RCVD: /* Cleanup everything. App wasn't notified yet */ - stream_session_delete_notify (&tc->connection); + session_transport_delete_notify (&tc->connection); tcp_connection_cleanup (tc); break; case TCP_STATE_SYN_SENT: @@ -281,22 +293,28 @@ tcp_connection_reset (tcp_connection_t * tc) tcp_connection_timers_reset (tc); /* Set the cleanup timer, in case the session layer/app don't * cleanly close the connection */ - tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); - stream_session_reset_notify (&tc->connection); + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); + session_transport_reset_notify (&tc->connection); + tcp_connection_set_state (tc, TCP_STATE_CLOSED); break; case TCP_STATE_CLOSE_WAIT: case TCP_STATE_FIN_WAIT_1: case TCP_STATE_FIN_WAIT_2: case TCP_STATE_CLOSING: - tc->state = TCP_STATE_CLOSED; - TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc); + case TCP_STATE_LAST_ACK: tcp_connection_timers_reset (tc); - tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); + /* Make sure we mark the session as closed. In some states we may + * be still trying to send data */ + session_transport_closed_notify (&tc->connection); + tcp_connection_set_state (tc, TCP_STATE_CLOSED); break; case TCP_STATE_CLOSED: - return; + case TCP_STATE_TIME_WAIT: + break; + default: + TCP_DBG ("reset state: %u", tc->state); } - tc->state = TCP_STATE_CLOSED; } /** @@ -321,27 +339,42 @@ tcp_connection_close (tcp_connection_t * tc) switch (tc->state) { case TCP_STATE_SYN_SENT: - tc->state = TCP_STATE_CLOSED; + /* Try to cleanup. If not on the right thread, mark as half-open done. + * Connection will be cleaned up when establish timer pops */ + tcp_connection_cleanup (tc); break; case TCP_STATE_SYN_RCVD: tcp_connection_timers_reset (tc); tcp_send_fin (tc); - tc->state = TCP_STATE_FIN_WAIT_1; - tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); + tcp_connection_set_state (tc, TCP_STATE_FIN_WAIT_1); + tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_FINWAIT1_TIME); break; case TCP_STATE_ESTABLISHED: - if (!session_tx_fifo_max_dequeue (&tc->connection)) + /* If closing with unread data, reset the connection */ + if (transport_max_rx_dequeue (&tc->connection)) + { + tcp_send_reset (tc); + tcp_connection_timers_reset (tc); + tcp_connection_set_state (tc, TCP_STATE_CLOSED); + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); + break; + } + if (!transport_max_tx_dequeue (&tc->connection)) tcp_send_fin (tc); else tc->flags |= TCP_CONN_FINPNDG; - tc->state = TCP_STATE_FIN_WAIT_1; + tcp_connection_set_state (tc, TCP_STATE_FIN_WAIT_1); + /* Set a timer in case the peer stops responding. Otherwise the + * connection will be stuck here forever. */ + ASSERT (tc->timers[TCP_TIMER_WAITCLOSE] == TCP_TIMER_HANDLE_INVALID); + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_FINWAIT1_TIME); break; case TCP_STATE_CLOSE_WAIT: - if (!session_tx_fifo_max_dequeue (&tc->connection)) + if (!transport_max_tx_dequeue (&tc->connection)) { tcp_send_fin (tc); tcp_connection_timers_reset (tc); - tc->state = TCP_STATE_LAST_ACK; + tcp_connection_set_state (tc, TCP_STATE_LAST_ACK); tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); } else @@ -352,17 +385,14 @@ tcp_connection_close (tcp_connection_t * tc) break; case TCP_STATE_CLOSED: tcp_connection_timers_reset (tc); + /* Delete connection but instead of doing it now wait until next + * dispatch cycle to give the session layer a chance to clear + * unhandled events */ + tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); break; default: TCP_DBG ("state: %u", tc->state); } - - TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc); - - /* If in CLOSED and WAITCLOSE timer is not set, delete connection now */ - if (!tcp_timer_is_active (tc, TCP_TIMER_WAITCLOSE) - && tc->state == TCP_STATE_CLOSED) - tcp_connection_del (tc); } static void @@ -378,9 +408,7 @@ tcp_session_cleanup (u32 conn_index, u32 thread_index) { tcp_connection_t *tc; tc = tcp_connection_get (conn_index, thread_index); - tcp_connection_timers_reset (tc); - tc->state = TCP_STATE_CLOSED; - TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc); + tcp_connection_set_state (tc, TCP_STATE_CLOSED); tcp_connection_cleanup (tc); } @@ -526,6 +554,7 @@ tcp_cc_algo_register (tcp_cc_algorithm_type_e type, vec_validate (tm->cc_algos, type); tm->cc_algos[type] = *vft; + hash_set_mem (tm->cc_algo_by_name, vft->name, type); } tcp_cc_algorithm_t * @@ -535,6 +564,26 @@ tcp_cc_algo_get (tcp_cc_algorithm_type_e type) return &tm->cc_algos[type]; } +/** + * Generate random iss as per rfc6528 + */ +static u32 +tcp_generate_random_iss (tcp_connection_t * tc) +{ + tcp_main_t *tm = &tcp_main; + u64 tmp; + + if (tc->c_is_ip4) + tmp = (u64) tc->c_lcl_ip.ip4.as_u32 << 32 | (u64) tc->c_rmt_ip.ip4.as_u32; + else + tmp = tc->c_lcl_ip.ip6.as_u64[0] ^ tc->c_lcl_ip.ip6.as_u64[1] + ^ tc->c_rmt_ip.ip6.as_u64[0] ^ tc->c_rmt_ip.ip6.as_u64[1]; + + tmp ^= tm->iss_seed.first | ((u64) tc->c_lcl_port << 16 | tc->c_rmt_port); + tmp ^= tm->iss_seed.second; + tmp = clib_xxhash (tmp) + clib_cpu_time_now (); + return ((tmp >> 32) ^ (tmp & 0xffffffff)); +} /** * Initialize connection send variables. @@ -542,8 +591,6 @@ tcp_cc_algo_get (tcp_cc_algorithm_type_e type) void tcp_init_snd_vars (tcp_connection_t * tc) { - u32 time_now; - /* * We use the time to randomize iss and for setting up the initial * timestamp. Make sure it's updated otherwise syn and ack in the @@ -551,9 +598,8 @@ tcp_init_snd_vars (tcp_connection_t * tc) * direction for us. */ tcp_set_time_now (tcp_get_worker (vlib_get_thread_index ())); - time_now = tcp_time_now (); - tc->iss = random_u32 (&time_now); + tc->iss = tcp_generate_random_iss (tc); tc->snd_una = tc->iss; tc->snd_nxt = tc->iss + 1; tc->snd_una_max = tc->snd_nxt; @@ -775,15 +821,14 @@ format_tcp_congestion (u8 * s, va_list * args) u32 indent = format_get_indent (s); s = format (s, "%U ", format_tcp_congestion_status, tc); - s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n", - tc->cwnd, tc->ssthresh, tc->snd_rxt_bytes, tc->bytes_acked); - s = format (s, "%Ucc space %u prev_ssthresh %u snd_congestion %u" - " dupack %u\n", format_white_space, indent, - tcp_available_cc_snd_space (tc), tc->prev_ssthresh, - tc->snd_congestion - tc->iss, tc->rcv_dupacks); - s = format (s, "%Utsecr %u tsecr_last_ack %u limited_transmit %u\n", - format_white_space, indent, tc->rcv_opts.tsecr, - tc->tsecr_last_ack, tc->limited_transmit - tc->iss); + s = format (s, "algo %s cwnd %u ssthresh %u bytes_acked %u\n", + tc->cc_algo->name, tc->cwnd, tc->ssthresh, tc->bytes_acked); + s = format (s, "%Ucc space %u prev_cwnd %u prev_ssthresh %u rtx_bytes %u\n", + format_white_space, indent, tcp_available_cc_snd_space (tc), + tc->prev_cwnd, tc->prev_ssthresh, tc->snd_rxt_bytes); + s = format (s, "%Usnd_congestion %u dupack %u limited_transmit %u\n", + format_white_space, indent, tc->snd_congestion - tc->iss, + tc->rcv_dupacks, tc->limited_transmit - tc->iss); return s; } @@ -802,12 +847,15 @@ format_tcp_vars (u8 * s, va_list * args) tc->snd_wnd, tc->rcv_wnd, tc->rcv_wscale); s = format (s, "snd_wl1 %u snd_wl2 %u\n", tc->snd_wl1 - tc->irs, tc->snd_wl2 - tc->iss); - s = format (s, " flight size %u out space %u rcv_wnd_av %u\n", + s = format (s, " flight size %u out space %u rcv_wnd_av %u", tcp_flight_size (tc), tcp_available_output_snd_space (tc), tcp_rcv_wnd_available (tc)); - s = format (s, " tsval_recent %u tsval_recent_age %u\n", tc->tsval_recent, + s = format (s, " tsval_recent %u\n", tc->tsval_recent); + s = format (s, " tsecr %u tsecr_last_ack %u tsval_recent_age %u", + tc->rcv_opts.tsecr, tc->tsecr_last_ack, tcp_time_now () - tc->tsval_recent_age); - s = format (s, " rto %u rto_boff %u srtt %u us %.3f rttvar %u rtt_ts %x", + s = format (s, " snd_mss %u\n", tc->snd_mss); + s = format (s, " rto %u rto_boff %u srtt %u us %.3f rttvar %u rtt_ts %.4f", tc->rto, tc->rto_boff, tc->srtt, tc->mrtt_us * 1000, tc->rttvar, tc->rtt_ts); s = format (s, " rtt_seq %u\n", tc->rtt_seq - tc->iss); @@ -887,8 +935,12 @@ static u8 * format_tcp_listener_session (u8 * s, va_list * args) { u32 tci = va_arg (*args, u32); + u32 verbose = va_arg (*args, u32); tcp_connection_t *tc = tcp_listener_get (tci); - return format (s, "%U", format_tcp_connection_id, tc); + s = format (s, "%-50U", format_tcp_connection_id, tc); + if (verbose) + s = format (s, "%-15U", format_tcp_state, tc->state); + return s; } static u8 * @@ -1116,29 +1168,33 @@ tcp_update_time (f64 now, u8 thread_index) tcp_flush_frames_to_output (wrk); } -static u32 -tcp_session_push_header (transport_connection_t * tconn, vlib_buffer_t * b) +static void +tcp_session_flush_data (transport_connection_t * tconn) { tcp_connection_t *tc = (tcp_connection_t *) tconn; - return tcp_push_header (tc, b); + if (tc->flags & TCP_CONN_PSH_PENDING) + return; + tc->flags |= TCP_CONN_PSH_PENDING; + tc->psh_seq = tc->snd_una + transport_max_tx_dequeue (tconn) - 1; } /* *INDENT-OFF* */ const static transport_proto_vft_t tcp_proto = { .enable = vnet_tcp_enable_disable, - .bind = tcp_session_bind, - .unbind = tcp_session_unbind, + .start_listen = tcp_session_bind, + .stop_listen = tcp_session_unbind, .push_header = tcp_session_push_header, .get_connection = tcp_session_get_transport, .get_listener = tcp_session_get_listener, .get_half_open = tcp_half_open_session_get_transport, - .open = tcp_session_open, + .connect = tcp_session_open, .close = tcp_session_close, .cleanup = tcp_session_cleanup, .send_mss = tcp_session_send_mss, .send_space = tcp_session_send_space, .update_time = tcp_update_time, .tx_fifo_offset = tcp_session_tx_fifo_offset, + .flush_data = tcp_session_flush_data, .format_connection = format_tcp_session, .format_listener = format_tcp_listener_session, .format_half_open = format_tcp_half_open_session, @@ -1191,33 +1247,42 @@ tcp_timer_establish_handler (u32 conn_index) { tcp_connection_t *tc; - tc = tcp_half_open_connection_get (conn_index); - if (tc) - { - ASSERT (tc->state == TCP_STATE_SYN_SENT); - session_stream_connect_notify (&tc->connection, 1 /* fail */ ); - TCP_DBG ("establish pop: %U", format_tcp_connection, tc, 2); - } - else - { - tc = tcp_connection_get (conn_index, vlib_get_thread_index ()); - /* note: the connection may have already disappeared */ - if (PREDICT_FALSE (tc == 0)) - return; - TCP_DBG ("establish pop: %U", format_tcp_connection, tc, 2); - ASSERT (tc->state == TCP_STATE_SYN_RCVD); - /* Start cleanup. App wasn't notified yet so use delete notify as - * opposed to delete to cleanup session layer state. */ - stream_session_delete_notify (&tc->connection); - } + tc = tcp_connection_get (conn_index, vlib_get_thread_index ()); + /* note: the connection may have already disappeared */ + if (PREDICT_FALSE (tc == 0)) + return; + ASSERT (tc->state == TCP_STATE_SYN_RCVD); tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID; + tcp_connection_set_state (tc, TCP_STATE_CLOSED); + /* Start cleanup. App wasn't notified yet so use delete notify as + * opposed to delete to cleanup session layer state. */ + tcp_connection_timers_reset (tc); + session_transport_delete_notify (&tc->connection); + tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); +} + +static void +tcp_timer_establish_ao_handler (u32 conn_index) +{ + tcp_connection_t *tc; + + tc = tcp_half_open_connection_get (conn_index); + if (!tc) + return; + + ASSERT (tc->state == TCP_STATE_SYN_SENT); + /* Notify app if we haven't tried to clean this up already */ + if (!(tc->flags & TCP_CONN_HALF_OPEN_DONE)) + session_stream_connect_notify (&tc->connection, 1 /* fail */ ); + + tc->timers[TCP_TIMER_ESTABLISH_AO] = TCP_TIMER_HANDLE_INVALID; tcp_connection_cleanup (tc); } static void tcp_timer_waitclose_handler (u32 conn_index) { - u32 thread_index = vlib_get_thread_index (); + u32 thread_index = vlib_get_thread_index (), rto; tcp_connection_t *tc; tc = tcp_connection_get (conn_index, thread_index); @@ -1225,30 +1290,65 @@ tcp_timer_waitclose_handler (u32 conn_index) return; tc->timers[TCP_TIMER_WAITCLOSE] = TCP_TIMER_HANDLE_INVALID; - /* Session didn't come back with a close(). Send FIN either way - * and switch to LAST_ACK. */ - if (tc->state == TCP_STATE_CLOSE_WAIT) + switch (tc->state) { - if (tc->flags & TCP_CONN_FINSNT) + case TCP_STATE_CLOSE_WAIT: + tcp_connection_timers_reset (tc); + session_transport_closed_notify (&tc->connection); + + if (!(tc->flags & TCP_CONN_FINPNDG)) { - clib_warning ("FIN was sent and still in CLOSE WAIT. Weird!"); + tcp_connection_set_state (tc, TCP_STATE_CLOSED); + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); + break; } - /* Make sure we don't try to send unsent data */ - tcp_connection_timers_reset (tc); + /* Session didn't come back with a close. Send FIN either way + * and switch to LAST_ACK. */ tcp_cong_recovery_off (tc); - tc->snd_una_max = tc->snd_nxt = tc->snd_una; + /* Make sure we don't try to send unsent data */ + tc->snd_nxt = tc->snd_una; tcp_send_fin (tc); - tc->state = TCP_STATE_LAST_ACK; + tcp_connection_set_state (tc, TCP_STATE_LAST_ACK); /* Make sure we don't wait in LAST ACK forever */ tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); /* Don't delete the connection yet */ - return; + break; + case TCP_STATE_FIN_WAIT_1: + tcp_connection_timers_reset (tc); + if (tc->flags & TCP_CONN_FINPNDG) + { + /* If FIN pending send it before closing and wait as long as + * the rto timeout would wait. Notify session layer that transport + * is closed. We haven't sent everything but we did try. */ + tcp_cong_recovery_off (tc); + tcp_send_fin (tc); + rto = clib_max ((tc->rto >> tc->rto_boff) * TCP_TO_TIMER_TICK, 1); + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, + clib_min (rto, TCP_2MSL_TIME)); + session_transport_closed_notify (&tc->connection); + } + else + { + /* We've sent the fin but no progress. Close the connection and + * to make sure everything is flushed, setup a cleanup timer */ + tcp_connection_set_state (tc, TCP_STATE_CLOSED); + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); + } + break; + case TCP_STATE_LAST_ACK: + case TCP_STATE_CLOSING: + tcp_connection_timers_reset (tc); + tcp_connection_set_state (tc, TCP_STATE_CLOSED); + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); + session_transport_closed_notify (&tc->connection); + break; + default: + tcp_connection_del (tc); + break; } - - tcp_connection_del (tc); } /* *INDENT-OFF* */ @@ -1260,7 +1360,8 @@ static timer_expiration_handler *timer_expiration_handlers[TCP_N_TIMERS] = tcp_timer_keep_handler, tcp_timer_waitclose_handler, tcp_timer_retransmit_syn_handler, - tcp_timer_establish_handler + tcp_timer_establish_handler, + tcp_timer_establish_ao_handler, }; /* *INDENT-ON* */ @@ -1297,6 +1398,16 @@ tcp_initialize_timer_wheels (tcp_main_t * tm) /* *INDENT-ON* */ } +static void +tcp_initialize_iss_seed (tcp_main_t * tm) +{ + u32 default_seed = random_default_seed (); + u64 time_now = clib_cpu_time_now (); + + tm->iss_seed.first = (u64) random_u32 (&default_seed) << 32; + tm->iss_seed.second = random_u64 (&time_now); +} + static clib_error_t * tcp_main_enable (vlib_main_t * vm) { @@ -1373,9 +1484,9 @@ tcp_main_enable (vlib_main_t * vm) } tcp_initialize_timer_wheels (tm); + tcp_initialize_iss_seed (tm); - tm->bytes_per_buffer = vlib_buffer_free_list_buffer_size - (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + tm->bytes_per_buffer = vlib_buffer_get_default_data_size (vm); return error; } @@ -1432,8 +1543,10 @@ tcp_init (vlib_main_t * vm) FIB_PROTOCOL_IP6, tcp6_output_node.index); tcp_api_reference (); + tm->cc_algo_by_name = hash_create_string (0, sizeof (uword)); tm->tx_pacing = 1; tm->cc_algo = TCP_CC_NEWRENO; + tm->default_mtu = 1460; return 0; } @@ -1443,15 +1556,20 @@ uword unformat_tcp_cc_algo (unformat_input_t * input, va_list * va) { uword *result = va_arg (*va, uword *); + tcp_main_t *tm = &tcp_main; + char *cc_algo_name; + u8 found = 0; + uword *p; - if (unformat (input, "newreno")) - *result = TCP_CC_NEWRENO; - else if (unformat (input, "cubic")) - *result = TCP_CC_CUBIC; - else - return 0; + if (unformat (input, "%s", &cc_algo_name) + && ((p = hash_get_mem (tm->cc_algo_by_name, cc_algo_name)))) + { + *result = *p; + found = 1; + } - return 1; + vec_free (cc_algo_name); + return found; } uword @@ -1496,6 +1614,8 @@ tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) else if (unformat (input, "max-rx-fifo %U", unformat_memory_size, &tm->max_rx_fifo)) ; + else if (unformat (input, "mtu %d", &tm->default_mtu)) + ; else if (unformat (input, "no-tx-pacing")) tm->tx_pacing = 0; else if (unformat (input, "cc-algo %U", unformat_tcp_cc_algo,