From: Florin Coras Date: Thu, 1 Nov 2018 06:09:22 +0000 (-0700) Subject: tcp: fast retransmit pacing X-Git-Tag: v19.04-rc0~479 X-Git-Url: https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commitdiff_plain;h=e55a6d7a97044c2f4fd0231242e062924d75c7b6 tcp: fast retransmit pacing Force pacing for fast retransmit to avoid bursts of retransmitted packets. Change-Id: I2ff42c328899b36322c4de557b1f7d853dba8fe2 Signed-off-by: Florin Coras --- diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c index 6ef461a5f34..34131723665 100644 --- a/src/vnet/session/session_node.c +++ b/src/vnet/session/session_node.c @@ -570,7 +570,8 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, ctx->tc = session_tx_get_transport (ctx, peek_data); ctx->snd_mss = ctx->transport_vft->send_mss (ctx->tc); ctx->snd_space = - transport_connection_max_tx_burst (ctx->tc, vm->clib_time.last_cpu_time); + transport_connection_snd_space (ctx->tc, vm->clib_time.last_cpu_time, + ctx->snd_mss); if (ctx->snd_space == 0 || ctx->snd_mss == 0) { vec_add1 (wrk->pending_event_vector, *e); diff --git a/src/vnet/session/transport.c b/src/vnet/session/transport.c index fefedcae8f2..0dd9ccd3213 100644 --- a/src/vnet/session/transport.c +++ b/src/vnet/session/transport.c @@ -497,9 +497,14 @@ static inline u32 spacer_max_burst (spacer_t * pacer, u64 norm_time_now) { u64 n_periods = norm_time_now - pacer->last_update; + u64 inc; + + if (n_periods > 0 && (inc = n_periods * pacer->tokens_per_period) > 10) + { + pacer->last_update = norm_time_now; + pacer->bucket += inc; + } - pacer->last_update = norm_time_now; - pacer->bucket += n_periods * pacer->tokens_per_period; return clib_min (pacer->bucket, pacer->max_burst_size); } @@ -525,17 +530,22 @@ spacer_set_pace_rate (spacer_t * pacer, u64 rate_bytes_per_sec) void transport_connection_tx_pacer_init (transport_connection_t * tc, - u32 rate_bytes_per_sec, u32 burst_bytes) + u32 rate_bytes_per_sec, + u32 initial_bucket) { vlib_main_t *vm = vlib_get_main (); u64 time_now = vm->clib_time.last_cpu_time; spacer_t *pacer = &tc->pacer; + f64 dispatch_period; + u32 burst_size; tc->flags |= TRANSPORT_CONNECTION_F_IS_TX_PACED; - spacer_update_max_burst_size (&tc->pacer, burst_bytes); + dispatch_period = transport_dispatch_period (tc->thread_index); + burst_size = rate_bytes_per_sec * dispatch_period; + spacer_update_max_burst_size (&tc->pacer, burst_size); spacer_set_pace_rate (&tc->pacer, rate_bytes_per_sec); pacer->last_update = time_now >> SPACER_CPU_TICKS_PER_PERIOD_SHIFT; - pacer->bucket = burst_bytes; + pacer->bucket = initial_bucket; } void @@ -550,17 +560,24 @@ transport_connection_tx_pacer_update (transport_connection_t * tc, } u32 -transport_connection_max_tx_burst (transport_connection_t * tc, u64 time_now) +transport_connection_tx_pacer_burst (transport_connection_t * tc, + u64 time_now) +{ + time_now >>= SPACER_CPU_TICKS_PER_PERIOD_SHIFT; + return spacer_max_burst (&tc->pacer, time_now); +} + +u32 +transport_connection_snd_space (transport_connection_t * tc, u64 time_now, + u16 mss) { u32 snd_space, max_paced_burst; - u32 mss; snd_space = tp_vfts[tc->proto].send_space (tc); if (transport_connection_is_tx_paced (tc)) { time_now >>= SPACER_CPU_TICKS_PER_PERIOD_SHIFT; max_paced_burst = spacer_max_burst (&tc->pacer, time_now); - mss = tp_vfts[tc->proto].send_mss (tc); max_paced_burst = (max_paced_burst < mss) ? 0 : max_paced_burst; snd_space = clib_min (snd_space, max_paced_burst); snd_space = snd_space - snd_space % mss; @@ -576,6 +593,13 @@ transport_connection_update_tx_stats (transport_connection_t * tc, u32 bytes) spacer_update_bucket (&tc->pacer, bytes); } +void +transport_connection_tx_pacer_update_bytes (transport_connection_t * tc, + u32 bytes) +{ + spacer_update_bucket (&tc->pacer, bytes); +} + void transport_init_tx_pacers_period (void) { diff --git a/src/vnet/session/transport_interface.h b/src/vnet/session/transport_interface.h index ce3bb7fab7c..b7aa4b7fa3c 100644 --- a/src/vnet/session/transport_interface.h +++ b/src/vnet/session/transport_interface.h @@ -111,7 +111,7 @@ void transport_enable_disable (vlib_main_t * vm, u8 is_en); */ void transport_connection_tx_pacer_init (transport_connection_t * tc, u32 rate_bytes_per_sec, - u32 burst_bytes); + u32 initial_bucket); /** * Update tx pacer pacing rate @@ -127,9 +127,13 @@ void transport_connection_tx_pacer_update (transport_connection_t * tc, * * @param tc transport connection * @param time_now current cpu time as returned by @ref clib_cpu_time_now + * @param mss transport's mss */ -u32 transport_connection_max_tx_burst (transport_connection_t * tc, - u64 time_now); +u32 transport_connection_snd_space (transport_connection_t * tc, + u64 time_now, u16 mss); + +u32 transport_connection_tx_pacer_burst (transport_connection_t * tc, + u64 time_now); /** * Initialize period for tx pacers @@ -163,6 +167,10 @@ u8 *format_transport_pacer (u8 * s, va_list * args); void transport_connection_update_tx_stats (transport_connection_t * tc, u32 bytes); +void +transport_connection_tx_pacer_update_bytes (transport_connection_t * tc, + u32 bytes); + #endif /* SRC_VNET_SESSION_TRANSPORT_INTERFACE_H_ */ /* diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 8c0c5da07a1..bdf9c7af608 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -558,10 +558,11 @@ tcp_init_snd_vars (tcp_connection_t * tc) void tcp_enable_pacing (tcp_connection_t * tc) { - u32 max_burst, byte_rate; - max_burst = 16 * tc->snd_mss; + u32 initial_bucket, byte_rate; + initial_bucket = 16 * tc->snd_mss; byte_rate = 2 << 16; - transport_connection_tx_pacer_init (&tc->connection, byte_rate, max_burst); + transport_connection_tx_pacer_init (&tc->connection, byte_rate, + initial_bucket); tc->mrtt_us = (u32) ~ 0; } @@ -1318,6 +1319,7 @@ tcp_main_enable (vlib_main_t * vm) num_threads = 1 /* main thread */ + vtm->n_threads; vec_validate (tm->connections, num_threads - 1); + vec_validate (tm->wrk_ctx, num_threads - 1); /* * Preallocate connections. Assume that thread 0 won't @@ -1339,6 +1341,13 @@ tcp_main_enable (vlib_main_t * vm) if (preallocated_connections_per_thread) pool_init_fixed (tm->connections[thread], preallocated_connections_per_thread); + vec_validate (tm->wrk_ctx[thread].pending_fast_rxt, 0); + vec_validate (tm->wrk_ctx[thread].ongoing_fast_rxt, 0); + vec_validate (tm->wrk_ctx[thread].postponed_fast_rxt, 0); + vec_reset_length (tm->wrk_ctx[thread].pending_fast_rxt); + vec_reset_length (tm->wrk_ctx[thread].ongoing_fast_rxt); + vec_reset_length (tm->wrk_ctx[thread].postponed_fast_rxt); + tm->wrk_ctx[thread].vm = vlib_mains[thread]; } /* @@ -1358,7 +1367,6 @@ tcp_main_enable (vlib_main_t * vm) clib_spinlock_init (&tm->half_open_lock); } - vec_validate (tm->wrk_ctx, num_threads - 1); tcp_initialize_timer_wheels (tm); tm->bytes_per_buffer = vlib_buffer_free_list_buffer_size diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 4ba3d5e9d87..f7424c3202e 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -160,7 +160,7 @@ enum }; #define TCP_SCOREBOARD_TRACE (0) -#define TCP_MAX_SACK_BLOCKS 32 /**< Max number of SACK blocks stored */ +#define TCP_MAX_SACK_BLOCKS 256 /**< Max number of SACK blocks stored */ #define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0) typedef struct _scoreboard_trace_elt @@ -390,6 +390,9 @@ typedef struct tcp_worker_ctx_ needing fast rxt */ u32 *ongoing_fast_rxt; /**< vector of connections now doing fast rxt */ + u32 *postponed_fast_rxt; /**< vector of connections + that will do fast rxt */ + vlib_main_t *vm; /**< pointer to vm */ CLIB_CACHE_LINE_ALIGN_MARK (cacheline1); u8 cached_opts[40]; /**< cached 'on the wire' @@ -722,8 +725,8 @@ always_inline void tcp_cc_rcv_ack (tcp_connection_t * tc) { tc->cc_algo->rcv_ack (tc); - tcp_update_pacer (tc); tc->tsecr_last_ack = tc->rcv_opts.tsecr; + tcp_update_pacer (tc); } always_inline void diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index 8f626b1afeb..cd4a6f04d6e 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -627,10 +627,8 @@ if (_av > 0) \ #if TCP_DEBUG_CC -#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) \ +#define TCP_EVT_CC_EVT_PRINT(_tc, _sub_evt) \ { \ - if (_tc->snd_una != _tc->iss) \ - TCP_EVT_CC_STAT_PRINT (_tc); \ ELOG_TYPE_DECLARE (_e) = \ { \ .format = "cc: %s snd_space %u snd_una %u out %u flight %u", \ @@ -638,8 +636,8 @@ if (_av > 0) \ .n_enum_strings = 7, \ .enum_strings = { \ "fast-rxt", \ - "rxt-timeout", \ "first-rxt", \ + "rxt-timeout", \ "recovered", \ "congestion", \ "undo", \ @@ -653,8 +651,18 @@ if (_av > 0) \ ed->data[3] = tcp_bytes_out(_tc); \ ed->data[4] = tcp_flight_size (_tc); \ } + +#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) \ +{ \ + if (_tc->snd_una != _tc->iss) \ + TCP_EVT_CC_STAT_PRINT (_tc); \ + if ((_sub_evt <= 1 && TCP_DEBUG_CC > 1) \ + || (_sub_evt > 1 && TCP_DEBUG_CC > 0)) \ + TCP_EVT_CC_EVT_PRINT (_tc, _sub_evt); \ +} #else -#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) +#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) \ + #endif #if TCP_DEBUG_CC > 1 diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index ac0e996567e..154b9ac8363 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1199,67 +1199,60 @@ void tcp_do_fastretransmits (u32 thread_index) { tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[thread_index]; - u32 max_burst_size, burst_size, n_segs = 0; + u32 max_burst_size, burst_size, n_segs = 0, n_segs_now; + u32 *ongoing_fast_rxt, burst_bytes, sent_bytes; tcp_connection_t *tc; + u64 last_cpu_time; int i; - if (vec_len (wrk->pending_fast_rxt) == 0) + if (vec_len (wrk->pending_fast_rxt) == 0 + && vec_len (wrk->postponed_fast_rxt) == 0) return; - vec_append (wrk->ongoing_fast_rxt, wrk->pending_fast_rxt); - vec_reset_length (wrk->pending_fast_rxt); + last_cpu_time = wrk->vm->clib_time.last_cpu_time; + ongoing_fast_rxt = wrk->ongoing_fast_rxt; + vec_append (ongoing_fast_rxt, wrk->postponed_fast_rxt); + vec_append (ongoing_fast_rxt, wrk->pending_fast_rxt); + + _vec_len (wrk->postponed_fast_rxt) = 0; + _vec_len (wrk->pending_fast_rxt) = 0; max_burst_size = VLIB_FRAME_SIZE / vec_len (wrk->ongoing_fast_rxt); max_burst_size = clib_max (max_burst_size, 1); - for (i = 0; i < vec_len (wrk->ongoing_fast_rxt); i++) + for (i = 0; i < vec_len (ongoing_fast_rxt); i++) { - tc = tcp_connection_get (wrk->ongoing_fast_rxt[i], thread_index); + if (n_segs >= VLIB_FRAME_SIZE) + { + vec_add1 (wrk->postponed_fast_rxt, ongoing_fast_rxt[i]); + continue; + } + + tc = tcp_connection_get (ongoing_fast_rxt[i], thread_index); tc->flags &= ~TCP_CONN_FRXT_PENDING; if (!tcp_in_fastrecovery (tc)) continue; - /* TODO tx pacer instead of this */ - if (n_segs >= VLIB_FRAME_SIZE) + burst_size = clib_min (max_burst_size, VLIB_FRAME_SIZE - n_segs); + burst_bytes = transport_connection_tx_pacer_burst (&tc->connection, + last_cpu_time); + burst_size = clib_min (burst_size, burst_bytes / tc->snd_mss); + if (!burst_size) { tcp_program_fastretransmit (tc); continue; } - burst_size = clib_min (max_burst_size, VLIB_FRAME_SIZE - n_segs); + n_segs_now = tcp_fast_retransmit (tc, burst_size); + sent_bytes = clib_min (n_segs_now * tc->snd_mss, burst_bytes); + transport_connection_tx_pacer_update_bytes (&tc->connection, + sent_bytes); - if (tc->cwnd > tc->ssthresh + 3 * tc->snd_mss) - { - /* The first segment MUST be retransmitted */ - if (tcp_retransmit_first_unacked (tc)) - { - tcp_program_fastretransmit (tc); - continue; - } - - /* Post retransmit update cwnd to ssthresh and account for the - * three segments that have left the network and should've been - * buffered at the receiver XXX */ - tc->cwnd = tc->ssthresh + 3 * tc->snd_mss; - - /* If cwnd allows, send more data */ - if (tcp_opts_sack_permitted (&tc->rcv_opts)) - { - scoreboard_init_high_rxt (&tc->sack_sb, - tc->snd_una + tc->snd_mss); - tc->sack_sb.rescue_rxt = tc->snd_una - 1; - n_segs += tcp_fast_retransmit_sack (tc, burst_size); - } - else - { - n_segs += tcp_fast_retransmit_no_sack (tc, burst_size); - } - } - else - n_segs += tcp_fast_retransmit (tc, burst_size); + n_segs += n_segs_now; } - vec_reset_length (wrk->ongoing_fast_rxt); + _vec_len (ongoing_fast_rxt) = 0; + wrk->ongoing_fast_rxt = ongoing_fast_rxt; } /** @@ -1298,6 +1291,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) } else if (tcp_should_fastrecover (tc)) { + u32 byte_rate; ASSERT (!tcp_in_fastrecovery (tc)); /* Heuristic to catch potential late dupacks @@ -1313,8 +1307,21 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); if (tcp_opts_sack_permitted (&tc->rcv_opts)) - tc->sack_sb.high_rxt = tc->snd_una; + { + tc->cwnd = tc->ssthresh; + scoreboard_init_high_rxt (&tc->sack_sb, tc->snd_una); + tc->sack_sb.rescue_rxt = tc->snd_una - 1; + } + else + { + /* Post retransmit update cwnd to ssthresh and account for the + * three segments that have left the network and should've been + * buffered at the receiver XXX */ + tc->cwnd = tc->ssthresh + 3 * tc->snd_mss; + } + byte_rate = (0.3 * tc->cwnd) / ((f64) TCP_TICK * tc->srtt); + transport_connection_tx_pacer_init (&tc->connection, byte_rate, 0); tcp_program_fastretransmit (tc); return; } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index f14a61263d4..81579ef96a2 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1452,10 +1452,10 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; } - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); - if (tc->state >= TCP_STATE_ESTABLISHED) { + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2); + /* Lost FIN, retransmit and return */ if (tcp_is_lost_fin (tc)) { @@ -1536,6 +1536,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) return; } + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2); + /* Try without increasing RTO a number of times. If this fails, * start growing RTO exponentially */ tc->rto_boff += 1; @@ -1562,6 +1564,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Retransmit SYN-ACK */ else if (tc->state == TCP_STATE_SYN_RCVD) { + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2); + tc->rto_boff += 1; if (tc->rto_boff > TCP_RTO_SYN_RETRIES) tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); @@ -1693,7 +1697,7 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc) old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b); if (!n_bytes)