From efefc6b4b219e2897e48def83352b4df52bc03a0 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Wed, 7 Nov 2018 12:49:19 -0800 Subject: [PATCH] tcp: pacer and mrtt estimation improvements - update pacer once per burst - better estimate initial rtt - compute smoothed average for higher precision rtt estimate Change-Id: I06d41a98784cdf861bedfbee2e7d0afc0d0154ef Signed-off-by: Florin Coras --- src/vnet/session/session_node.c | 4 +-- src/vnet/tcp/tcp.c | 7 +++-- src/vnet/tcp/tcp.h | 1 - src/vnet/tcp/tcp_debug.h | 9 +++--- src/vnet/tcp/tcp_input.c | 62 +++++++++++++++++++++++++++++++++-------- src/vnet/tcp/tcp_output.c | 2 +- 6 files changed, 63 insertions(+), 22 deletions(-) diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c index 64c873cc758..22d8d3c45b0 100644 --- a/src/vnet/session/session_node.c +++ b/src/vnet/session/session_node.c @@ -99,7 +99,7 @@ session_mq_reset_reply_handler (void *data) s = session_get_if_valid (index, thread_index); if (!s) { - clib_warning ("Invalid session!"); + SESSION_DBG ("Invalid session!"); return; } app_wrk = app_worker_get (s->app_wrk_index); @@ -751,7 +751,7 @@ static void session_update_dispatch_period (session_manager_worker_t * wrk, f64 now, u32 thread_index) { - if (wrk->last_tx_packets > 1) + if (wrk->last_tx_packets) { f64 sample = now - wrk->last_vlib_time; wrk->dispatch_period = (wrk->dispatch_period + sample) * 0.5; diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index ea350dddc69..d759cf0d0cd 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -802,9 +802,10 @@ format_tcp_vars (u8 * s, va_list * args) tcp_rcv_wnd_available (tc)); s = format (s, " tsval_recent %u tsval_recent_age %u\n", tc->tsval_recent, tcp_time_now () - tc->tsval_recent_age); - s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %2.5f ", - tc->rto, tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts); - s = format (s, "rtt_seq %u\n", tc->rtt_seq - tc->iss); + s = format (s, " rto %u rto_boff %u srtt %u us %.3f rttvar %u rtt_ts %x", + tc->rto, tc->rto_boff, tc->srtt, tc->mrtt_us * 1000, tc->rttvar, + tc->rtt_ts); + s = format (s, " rtt_seq %u\n", tc->rtt_seq - tc->iss); s = format (s, " cong: %U", format_tcp_congestion, tc); if (tc->state >= TCP_STATE_ESTABLISHED) diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 5a3a96570d2..843b90d987e 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -753,7 +753,6 @@ tcp_cc_rcv_ack (tcp_connection_t * tc) { tc->cc_algo->rcv_ack (tc); tc->tsecr_last_ack = tc->rcv_opts.tsecr; - tcp_connection_tx_pacer_update (tc); } always_inline void diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index cd4a6f04d6e..d125ee84612 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -806,13 +806,14 @@ if (TCP_DEBUG_CC > 1) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "rcv_stat: rto %u srtt %u rttvar %u ", \ - .format_args = "i4i4i4", \ + .format = "rcv_stat: rto %u srtt %u mrtt-us %u rttvar %u", \ + .format_args = "i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 3); \ + DECLARE_ETD(_tc, _e, 4); \ ed->data[0] = _tc->rto; \ ed->data[1] = _tc->srtt; \ - ed->data[2] = _tc->rttvar; \ + ed->data[2] = (u32) (_tc->mrtt_us * 1e6); \ + ed->data[3] = _tc->rttvar; \ } #define TCP_EVT_CC_RTO_STAT_HANDLER(_tc, ...) \ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 9c303eb01a5..0f1ab1ab3b0 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -455,8 +455,11 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq)) { - tc->mrtt_us = tcp_time_now_us (tc->c_thread_index) - tc->rtt_ts; - mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1); + f64 sample = tcp_time_now_us (tc->c_thread_index) - tc->rtt_ts; + tc->mrtt_us = tc->mrtt_us + (sample - tc->mrtt_us) * 0.125; + mrtt = clib_max ((u32) (sample * THZ), 1); + /* Allow measuring of a new RTT */ + tc->rtt_ts = 0; } /* As per RFC7323 TSecr can be used for RTTM only if the segment advances * snd_una, i.e., the left side of the send window: @@ -475,9 +478,6 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) done: - /* Allow measuring of a new RTT */ - tc->rtt_ts = 0; - /* If we got here something must've been ACKed so make sure boff is 0, * even if mrtt is not valid since we update the rto lower */ tc->rto_boff = 0; @@ -486,6 +486,29 @@ done: return 0; } +static void +tcp_estimate_initial_rtt (tcp_connection_t * tc) +{ + u8 thread_index = vlib_num_workers ()? 1 : 0; + int mrtt; + + if (tc->rtt_ts) + { + tc->mrtt_us = tcp_time_now_us (thread_index) - tc->rtt_ts; + mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1); + tc->rtt_ts = 0; + } + else + { + mrtt = tcp_time_now_w_thread (thread_index) - tc->rcv_opts.tsecr; + tc->mrtt_us = (f64) mrtt *TCP_TICK; + + } + + if (mrtt > 0 && mrtt < TCP_RTT_MAX) + tcp_estimate_rtt (tc, mrtt); +} + /** * Dequeue bytes for connections that have received acks in last burst */ @@ -506,6 +529,9 @@ tcp_handle_postponed_dequeues (tcp_worker_ctx_t * wrk) tc = tcp_connection_get (pending_deq_acked[i], thread_index); tc->flags &= ~TCP_CONN_DEQ_PENDING; + if (PREDICT_FALSE (!tc->burst_acked)) + continue; + /* Dequeue the newly ACKed bytes */ stream_session_dequeue_drop (&tc->connection, tc->burst_acked); tc->burst_acked = 0; @@ -514,6 +540,11 @@ tcp_handle_postponed_dequeues (tcp_worker_ctx_t * wrk) /* If everything has been acked, stop retransmit timer * otherwise update. */ tcp_retransmit_timer_update (tc); + + /* If not congested, update pacer based on our new + * cwnd estimate */ + if (!tcp_in_fastrecovery (tc)) + tcp_connection_tx_pacer_update (tc); } _vec_len (wrk->pending_deq_acked) = 0; } @@ -1084,6 +1115,7 @@ tcp_cc_recovery_exit (tcp_connection_t * tc) tcp_update_rto (tc); tc->snd_rxt_ts = 0; tc->snd_nxt = tc->snd_una_max; + tc->rtt_ts = 0; tcp_recovery_off (tc); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); } @@ -1096,6 +1128,7 @@ tcp_cc_fastrecovery_exit (tcp_connection_t * tc) tc->rcv_dupacks = 0; tc->snd_nxt = tc->snd_una_max; tc->snd_rxt_bytes = 0; + tc->rtt_ts = 0; tcp_fastrecovery_off (tc); tcp_fastrecovery_1_smss_off (tc); @@ -1381,6 +1414,10 @@ partial_ack: * Legitimate ACK. 1) See if we can exit recovery */ + /* Update the pacing rate. For the first partial ack we move from + * the artificially constrained rate to the one after congestion */ + tcp_connection_tx_pacer_update (tc); + if (seq_geq (tc->snd_una, tc->snd_congestion)) { tcp_retransmit_timer_update (tc); @@ -1403,10 +1440,6 @@ partial_ack: * Legitimate ACK. 2) If PARTIAL ACK try to retransmit */ - /* Update the pacing rate. For the first partial ack we move from - * the artificially constrained rate to the one after congestion */ - tcp_connection_tx_pacer_update (tc); - /* XXX limit this only to first partial ack? */ tcp_retransmit_timer_force_update (tc); @@ -2427,7 +2460,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } /* Update rtt with the syn-ack sample */ - tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number); + tcp_estimate_initial_rtt (new_tc0); TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, new_tc0); error0 = TCP_ERROR_SYN_ACKS_RCVD; } @@ -2636,7 +2669,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } /* Update rtt and rto */ - tcp_update_rtt (tc0, vnet_buffer (b0)->tcp.ack_number); + tcp_estimate_initial_rtt (tc0); /* Switch state to ESTABLISHED */ tc0->state = TCP_STATE_ESTABLISHED; @@ -2687,6 +2720,12 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * wait for peer's FIN but not indefinitely. */ tcp_connection_timers_reset (tc0); tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + + /* Don't try to deq the FIN acked */ + if (tc0->burst_acked > 1) + stream_session_dequeue_drop (&tc0->connection, + tc0->burst_acked - 1); + tc0->burst_acked = 0; } break; case TCP_STATE_FIN_WAIT_2: @@ -2695,6 +2734,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * acknowledged ("ok") but do not delete the TCB. */ if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0)) goto drop; + tc0->burst_acked = 0; break; case TCP_STATE_CLOSE_WAIT: /* Do the same processing as for the ESTABLISHED state. */ diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 089f85a0ea0..192e820e648 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1000,7 +1000,7 @@ tcp_send_syn (tcp_connection_t * tc) tcp_make_syn (tc, b); /* Measure RTT with this */ - tc->rtt_ts = tcp_time_now (); + tc->rtt_ts = tcp_time_now_us (vlib_num_workers ()? 1 : 0); tc->rtt_seq = tc->snd_nxt; tc->rto_boff = 0; -- 2.16.6