From 222e1f4160a5828bb2b5bf62716cd76664f6100b Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Sat, 16 Feb 2019 20:47:32 -0800 Subject: [PATCH] tcp: harden for high scale scenarios - Better handle buffer starvation scenarios - Handle case when both peers enter recovery due to packet loss. - Fix passive open establish cleanup Change-Id: I2f28baa2ff0383bb8f5f6d2452b49aa38ce69bce Signed-off-by: Florin Coras --- src/vnet/session-apps/echo_client.c | 10 ++++----- src/vnet/session/application.c | 1 + src/vnet/session/session.c | 7 ++++++ src/vnet/session/session_types.h | 1 + src/vnet/tcp/tcp.c | 8 ++++--- src/vnet/tcp/tcp.h | 2 +- src/vnet/tcp/tcp_error.def | 1 + src/vnet/tcp/tcp_input.c | 44 +++++++++++++++++++++++++++---------- src/vnet/tcp/tcp_output.c | 43 +++++++++++++++++++++++++----------- 9 files changed, 82 insertions(+), 35 deletions(-) diff --git a/src/vnet/session-apps/echo_client.c b/src/vnet/session-apps/echo_client.c index 8b7788fa266..cb93e81054c 100644 --- a/src/vnet/session-apps/echo_client.c +++ b/src/vnet/session-apps/echo_client.c @@ -605,13 +605,11 @@ echo_clients_connect (vlib_main_t * vm, u32 n_clients) return clib_error_return (0, "connect returned: %d", rv); /* Crude pacing for call setups */ - if ((i % 4) == 0) - vlib_process_suspend (vm, 10e-6); + if ((i % 16) == 0) + vlib_process_suspend (vm, 100e-6); ASSERT (i + 1 >= ecm->ready_connections); - while (i + 1 - ecm->ready_connections > 1000) - { - vlib_process_suspend (vm, 100e-6); - } + while (i + 1 - ecm->ready_connections > 128) + vlib_process_suspend (vm, 1e-3); } return 0; } diff --git a/src/vnet/session/application.c b/src/vnet/session/application.c index e6292157dfa..ee69cf88c88 100644 --- a/src/vnet/session/application.c +++ b/src/vnet/session/application.c @@ -73,6 +73,7 @@ application_local_listen_session_alloc (application_t * app) ll->session_index = ll - app->local_listen_sessions; ll->session_type = session_type_from_proto_and_ip (TRANSPORT_PROTO_NONE, 0); ll->app_index = app->app_index; + ll->session_state = SESSION_STATE_LISTENING; return ll; } diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index a94a0c40882..58e085ee29b 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -862,6 +862,12 @@ session_transport_delete_notify (transport_connection_t * tc) switch (s->session_state) { + case SESSION_STATE_CREATED: + /* Session was created but accept notification was not yet sent to the + * app. Cleanup everything. */ + session_lookup_del_session (s); + session_free_w_fifos (s); + break; case SESSION_STATE_ACCEPTING: case SESSION_STATE_TRANSPORT_CLOSING: /* If transport finishes or times out before we get a reply @@ -963,6 +969,7 @@ session_stream_accept (transport_connection_t * tc, u32 listener_index, s->app_wrk_index = app_wrk->wrk_index; s->listener_index = listener_index; + s->session_state = SESSION_STATE_CREATED; /* Shoulder-tap the server */ if (notify) diff --git a/src/vnet/session/session_types.h b/src/vnet/session/session_types.h index 846af25c777..e10dceafa16 100644 --- a/src/vnet/session/session_types.h +++ b/src/vnet/session/session_types.h @@ -113,6 +113,7 @@ typedef u64 session_handle_t; */ typedef enum { + SESSION_STATE_CREATED, SESSION_STATE_LISTENING, SESSION_STATE_CONNECTING, SESSION_STATE_ACCEPTING, diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 81f209b5d7c..c51224447fc 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Cisco and/or its affiliates. + * Copyright (c) 2016-2019 Cisco and/or its affiliates. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: @@ -1244,11 +1244,13 @@ tcp_timer_establish_handler (u32 conn_index) if (PREDICT_FALSE (tc == 0)) return; ASSERT (tc->state == TCP_STATE_SYN_RCVD); + tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID; + tcp_connection_set_state (tc, TCP_STATE_CLOSED); /* Start cleanup. App wasn't notified yet so use delete notify as * opposed to delete to cleanup session layer state. */ + tcp_connection_timers_reset (tc); session_transport_delete_notify (&tc->connection); - tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID; - tcp_connection_cleanup (tc); + tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); } static void diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 68750ce373f..d1fbf156ed5 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Cisco and/or its affiliates. + * Copyright (c) 2016-2019 Cisco and/or its affiliates. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: diff --git a/src/vnet/tcp/tcp_error.def b/src/vnet/tcp/tcp_error.def index 7bed10f1840..7dbe952e104 100644 --- a/src/vnet/tcp/tcp_error.def +++ b/src/vnet/tcp/tcp_error.def @@ -28,6 +28,7 @@ tcp_error (SEGMENT_INVALID, "Invalid segments") tcp_error (SYNS_RCVD, "SYNs received") tcp_error (SPURIOUS_SYN, "Spurious SYNs received") tcp_error (SYN_ACKS_RCVD, "SYN-ACKs received") +tcp_error (SPURIOUS_SYN_ACK, "Spurious SYN-ACKs received") tcp_error (MSG_QUEUE_FULL, "Events not sent for lack of msg queue space") tcp_error (CREATE_SESSION_FAIL, "Sessions couldn't be allocated") tcp_error (ACK_OK, "Pure ACKs received") diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 6514dca4b4a..3d8ae890372 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -350,20 +350,34 @@ tcp_segment_validate (tcp_worker_ctx_t * wrk, tcp_connection_t * tc0, goto error; } - *error0 = TCP_ERROR_RCV_WND; /* If our window is 0 and the packet is in sequence, let it pass * through for ack processing. It should be dropped later. */ - if (!(tc0->rcv_wnd == 0 - && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number)) + if (tc0->rcv_wnd == 0 + && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number) + goto check_reset; + + /* If we entered recovery and peer did so as well, there's a chance that + * dup acks won't be acceptable on either end because seq_end may be less + * than rcv_las. This can happen if acks are lost in both directions. */ + if (tcp_in_recovery (tc0) + && seq_geq (vnet_buffer (b0)->tcp.seq_number, + tc0->rcv_las - tc0->rcv_wnd) + && seq_leq (vnet_buffer (b0)->tcp.seq_end, + tc0->rcv_nxt + tc0->rcv_wnd)) + goto check_reset; + + *error0 = TCP_ERROR_RCV_WND; + + /* If not RST, send dup ack */ + if (!tcp_rst (th0)) { - /* If not RST, send dup ack */ - if (!tcp_rst (th0)) - { - tcp_program_dupack (wrk, tc0); - TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp); - } - goto error; + tcp_program_dupack (wrk, tc0); + TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp); } + goto error; + + check_reset: + ; } /* 2nd: check the RST bit */ @@ -507,6 +521,7 @@ tcp_estimate_initial_rtt (tcp_connection_t * tc) if (tc->rtt_ts) { tc->mrtt_us = tcp_time_now_us (thread_index) - tc->rtt_ts; + tc->mrtt_us = clib_max (tc->mrtt_us, 0.0001); mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1); tc->rtt_ts = 0; } @@ -514,6 +529,9 @@ tcp_estimate_initial_rtt (tcp_connection_t * tc) { mrtt = tcp_time_now_w_thread (thread_index) - tc->rcv_opts.tsecr; mrtt = clib_max (mrtt, 1); + /* Due to retransmits we don't know the initial mrtt */ + if (tc->rto_boff && mrtt > 1 * THZ) + mrtt = 1 * THZ; tc->mrtt_us = (f64) mrtt *TCP_TICK; } @@ -2360,6 +2378,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Make sure the connection actually exists */ ASSERT (tcp_lookup_connection (tc0->c_fib_index, b0, my_thread_index, is_ip4)); + error0 = TCP_ERROR_SPURIOUS_SYN_ACK; goto drop; } @@ -2441,7 +2460,6 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* No SYN flag. Drop. */ if (!tcp_syn (tcp0)) { - clib_warning ("not synack"); error0 = TCP_ERROR_SEGMENT_INVALID; goto drop; } @@ -2449,7 +2467,6 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Parse options */ if (tcp_options_parse (tcp0, &tc0->rcv_opts, 1)) { - clib_warning ("options parse fail"); error0 = TCP_ERROR_OPTIONS; goto drop; } @@ -3676,6 +3693,9 @@ do { \ _(SYN_SENT, TCP_FLAG_RST, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE); _(SYN_SENT, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE); + _(SYN_SENT, TCP_FLAG_FIN, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE); + _(SYN_SENT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, + TCP_ERROR_NONE); /* ACK for for established connection -> tcp-established. */ _(ESTABLISHED, TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); /* FIN for for established connection -> tcp-established. */ diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 9f38851b026..725ffec0852 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Cisco and/or its affiliates. + * Copyright (c) 2016-2019 Cisco and/or its affiliates. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: @@ -587,11 +587,6 @@ tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) tcp_options_write ((u8 *) (th + 1), snd_opts); vnet_buffer (b)->tcp.connection_index = tc->c_c_index; - - /* Init retransmit timer. Use update instead of set because of - * retransmissions */ - tcp_retransmit_timer_force_update (tc); - TCP_EVT_DBG (TCP_EVT_SYNACK_SENT, tc); } always_inline void @@ -951,7 +946,10 @@ tcp_send_syn (tcp_connection_t * tc) tc->rto * TCP_TO_TIMER_TICK); if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1))) - return; + { + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN, 1); + return; + } b = vlib_get_buffer (vm, bi); tcp_init_buffer (vm, b); @@ -975,14 +973,20 @@ tcp_send_synack (tcp_connection_t * tc) vlib_buffer_t *b; u32 bi; + tcp_retransmit_timer_force_update (tc); + if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1))) - return; + { + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1); + return; + } tc->rtt_ts = tcp_time_now_us (tc->c_thread_index); b = vlib_get_buffer (vm, bi); tcp_init_buffer (vm, b); tcp_make_synack (tc, b); tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); + TCP_EVT_DBG (TCP_EVT_SYNACK_SENT, tc); } /** @@ -1050,6 +1054,9 @@ tcp_send_fin (tcp_connection_t * tc) tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1); if (fin_snt) tc->snd_nxt = tc->snd_una_max; + else + /* Make sure retransmit retries a fin not data */ + tc->flags |= TCP_CONN_FINSNT; return; } @@ -1195,7 +1202,10 @@ tcp_send_ack (tcp_connection_t * tc) u32 bi; if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1))) - return; + { + tcp_update_rcv_wnd (tc); + return; + } b = vlib_get_buffer (vm, bi); tcp_init_buffer (vm, b); tcp_make_ack (tc, b); @@ -1458,6 +1468,8 @@ tcp_rxt_timeout_cc (tcp_connection_t * tc) scoreboard_clear (&tc->sack_sb); tcp_cc_fastrecovery_exit (tc); } + else + tc->rcv_dupacks = 0; /* Start again from the beginning */ tc->cc_algo->congestion (tc); @@ -1504,7 +1516,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2); /* Lost FIN, retransmit and return */ - if (tcp_is_lost_fin (tc)) + if (tc->flags & TCP_CONN_FINSNT) { tcp_send_fin (tc); tc->rto_boff += 1; @@ -1553,7 +1565,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) n_bytes = tcp_prepare_retransmit_segment (wrk, tc, 0, tc->snd_mss, &b); if (!n_bytes) { - tcp_retransmit_timer_force_update (tc); + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1); return; } @@ -1591,7 +1603,10 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tc->rto * TCP_TO_TIMER_TICK); if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1))) - return; + { + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN, 1); + return; + } b = vlib_get_buffer (vm, bi); tcp_init_buffer (vm, b); @@ -1614,9 +1629,11 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); tc->rtt_ts = 0; + tcp_retransmit_timer_force_update (tc); + if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1))) { - tcp_retransmit_timer_force_update (tc); + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1); return; } -- 2.16.6