From cc4d6d022fb0d4b4f0ea9f63e9c6b1c0e8d95cca Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Wed, 29 Jul 2020 23:03:39 -0700 Subject: [PATCH] tcp: track reorder with sacks Type: feature Change-Id: I041bff2e8d589c171661de286fa1503531dff891 Signed-off-by: Florin Coras --- src/plugins/unittest/tcp_test.c | 6 +++- src/vnet/tcp/tcp.c | 16 +++++++---- src/vnet/tcp/tcp_cli.c | 3 +- src/vnet/tcp/tcp_input.c | 13 ++------- src/vnet/tcp/tcp_output.c | 1 + src/vnet/tcp/tcp_sack.c | 62 ++++++++++++++++++++++++++++------------- src/vnet/tcp/tcp_types.h | 4 ++- 7 files changed, 66 insertions(+), 39 deletions(-) diff --git a/src/plugins/unittest/tcp_test.c b/src/plugins/unittest/tcp_test.c index b98d360ca51..3db04050136 100644 --- a/src/plugins/unittest/tcp_test.c +++ b/src/plugins/unittest/tcp_test.c @@ -505,7 +505,9 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) sb->last_sacked_bytes); TCP_TEST ((sb->last_bytes_delivered == 0), "last bytes delivered %d", sb->last_bytes_delivered); + /* Hole should be split in 2 lost holes that add up to 300 */ TCP_TEST ((sb->lost_bytes == 300), "lost bytes %u", sb->lost_bytes); + TCP_TEST ((sb->reorder == 7), "reorder %u", sb->reorder); /* * Ack [100 300] in two steps @@ -590,7 +592,9 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) sb->last_sacked_bytes); TCP_TEST ((sb->last_bytes_delivered == 0), "last bytes delivered %d", sb->last_bytes_delivered); - TCP_TEST ((sb->lost_bytes == 200), "lost bytes %u", sb->lost_bytes); + /* No bytes lost because of reorder */ + TCP_TEST ((sb->lost_bytes == 0), "lost bytes %u", sb->lost_bytes); + TCP_TEST ((sb->reorder == 7), "reorder %u", sb->reorder); TCP_TEST ((!sb->is_reneging), "is not reneging"); /* diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 938a863238f..1563be4eb86 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -925,17 +925,21 @@ tcp_snd_space_inline (tcp_connection_t * tc) * to force the peer to send enough dupacks to start retransmitting as * per Limited Transmit (RFC3042) */ - if (PREDICT_FALSE (tc->rcv_dupacks != 0 || tc->sack_sb.sacked_bytes)) + if (PREDICT_FALSE (tc->rcv_dupacks || tc->sack_sb.sacked_bytes)) { - if (tc->limited_transmit != tc->snd_nxt - && (seq_lt (tc->limited_transmit, tc->snd_nxt - 2 * tc->snd_mss) - || seq_gt (tc->limited_transmit, tc->snd_nxt))) + int snt_limited, n_pkts; + + n_pkts = tcp_opts_sack_permitted (&tc->rcv_opts) + ? tc->sack_sb.reorder - 1 : 2; + + if ((seq_lt (tc->limited_transmit, tc->snd_nxt - n_pkts * tc->snd_mss) + || seq_gt (tc->limited_transmit, tc->snd_nxt))) tc->limited_transmit = tc->snd_nxt; ASSERT (seq_leq (tc->limited_transmit, tc->snd_nxt)); - int snt_limited = tc->snd_nxt - tc->limited_transmit; - snd_space = clib_max ((int) 2 * tc->snd_mss - snt_limited, 0); + snt_limited = tc->snd_nxt - tc->limited_transmit; + snd_space = clib_max (n_pkts * tc->snd_mss - snt_limited, 0); } return tcp_round_snd_space (tc, snd_space); } diff --git a/src/vnet/tcp/tcp_cli.c b/src/vnet/tcp/tcp_cli.c index 6ad2425b53d..edd4d2d1bc1 100644 --- a/src/vnet/tcp/tcp_cli.c +++ b/src/vnet/tcp/tcp_cli.c @@ -341,9 +341,10 @@ format_tcp_scoreboard (u8 * s, va_list * args) " rxt_sacked %u\n", sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes, sb->last_lost_bytes, sb->rxt_sacked); - s = format (s, "%Ulast_delivered %u high_sacked %u is_reneging %u\n", + s = format (s, "%Ulast_delivered %u high_sacked %u is_reneging %u", format_white_space, indent, sb->last_bytes_delivered, sb->high_sacked - tc->iss, sb->is_reneging); + s = format (s, " reorder %u\n", sb->reorder); s = format (s, "%Ucur_rxt_hole %u high_rxt %u rescue_rxt %u", format_white_space, indent, sb->cur_rxt_hole, sb->high_rxt - tc->iss, sb->rescue_rxt - tc->iss); diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 5fa7bf23c72..de9d89181fa 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -689,7 +689,7 @@ tcp_cc_init_congestion (tcp_connection_t * tc) * three segments that have left the network and should've been * buffered at the receiver XXX */ if (!tcp_opts_sack_permitted (&tc->rcv_opts)) - tc->cwnd += 3 * tc->snd_mss; + tc->cwnd += TCP_DUPACK_THRESHOLD * tc->snd_mss; tc->fr_occurences += 1; TCP_EVT (TCP_EVT_CC_EVT, tc, 4); @@ -720,14 +720,6 @@ tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) return (tcp_cc_is_spurious_timeout_rxt (tc)); } -static inline u8 -tcp_should_fastrecover_sack (tcp_connection_t * tc) -{ - return (tc->sack_sb.lost_bytes - || ((TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss - < tc->sack_sb.sacked_bytes)); -} - static inline u8 tcp_should_fastrecover (tcp_connection_t * tc, u8 has_sack) { @@ -752,8 +744,7 @@ tcp_should_fastrecover (tcp_connection_t * tc, u8 has_sack) return 0; } } - return ((tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) - || tcp_should_fastrecover_sack (tc)); + return tc->sack_sb.lost_bytes || tc->rcv_dupacks >= tc->sack_sb.reorder; } static int diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index e9bff5b954a..294b18c0294 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1290,6 +1290,7 @@ tcp_cc_init_rxt_timeout (tcp_connection_t * tc) tc->rtt_ts = 0; tc->cwnd_acc_bytes = 0; tc->tr_occurences += 1; + tc->sack_sb.reorder = TCP_DUPACK_THRESHOLD; tcp_recovery_on (tc); } diff --git a/src/vnet/tcp/tcp_sack.c b/src/vnet/tcp/tcp_sack.c index b9e1166d779..8f51b517361 100644 --- a/src/vnet/tcp/tcp_sack.c +++ b/src/vnet/tcp/tcp_sack.c @@ -88,10 +88,23 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, } always_inline void -scoreboard_update_sacked_rxt (sack_scoreboard_t * sb, u32 start, u32 end, - u8 has_rxt) +scoreboard_update_sacked (sack_scoreboard_t * sb, u32 start, u32 end, + u8 has_rxt, u16 snd_mss) { - if (!has_rxt || seq_geq (start, sb->high_rxt)) + if (!has_rxt) + { + /* Sequence was not retransmitted but it was sacked. Estimate reorder + * only if not in congestion recovery */ + if (seq_lt (start, sb->high_sacked)) + { + u32 reord = (sb->high_sacked - start + snd_mss - 1) / snd_mss; + reord = clib_min (reord, TCP_MAX_SACK_REORDER); + sb->reorder = clib_max (sb->reorder, reord); + } + return; + } + + if (seq_geq (start, sb->high_rxt)) return; sb->rxt_sacked += @@ -125,8 +138,14 @@ scoreboard_update_bytes (sack_scoreboard_t * sb, u32 ack, u32 snd_mss) blks = 1; } - while (sacked < (TCP_DUPACK_THRESHOLD - 1) * snd_mss - && blks < TCP_DUPACK_THRESHOLD) + /* As per RFC 6675 a sequence number is lost if: + * DupThresh discontiguous SACKed sequences have arrived above + * 'SeqNum' or more than (DupThresh - 1) * SMSS bytes with sequence + * numbers greater than 'SeqNum' have been SACKed. + * To avoid spurious retransmits, use reordering estimate instead of + * DupThresh to detect loss. + */ + while (sacked <= (sb->reorder - 1) * snd_mss && blks < sb->reorder) { if (right->is_lost) sb->lost_bytes += scoreboard_hole_bytes (right); @@ -251,6 +270,7 @@ scoreboard_init (sack_scoreboard_t * sb) sb->head = TCP_INVALID_SACK_HOLE_INDEX; sb->tail = TCP_INVALID_SACK_HOLE_INDEX; sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + sb->reorder = TCP_DUPACK_THRESHOLD; } void @@ -270,6 +290,7 @@ scoreboard_clear (sack_scoreboard_t * sb) sb->last_lost_bytes = 0; sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; sb->is_reneging = 0; + sb->reorder = TCP_DUPACK_THRESHOLD; } void @@ -307,7 +328,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) sack_scoreboard_hole_t *hole, *next_hole; sack_scoreboard_t *sb = &tc->sack_sb; sack_block_t *blk, *rcv_sacks; - u32 blk_index = 0, i, j; + u32 blk_index = 0, i, j, high_sacked; u8 has_rxt; sb->last_sacked_bytes = 0; @@ -391,8 +412,9 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, tc->snd_una, tc->snd_nxt); sb->tail = scoreboard_hole_index (sb, hole); + sb->high_sacked = tc->snd_una; } - sb->high_sacked = rcv_sacks[vec_len (rcv_sacks) - 1].end; + high_sacked = rcv_sacks[vec_len (rcv_sacks) - 1].end; } else { @@ -412,11 +434,10 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) tc->snd_nxt); } } - /* Keep track of max byte sacked for when the last hole * is acked */ - sb->high_sacked = seq_max (rcv_sacks[vec_len (rcv_sacks) - 1].end, - sb->high_sacked); + high_sacked = seq_max (rcv_sacks[vec_len (rcv_sacks) - 1].end, + sb->high_sacked); } /* Walk the holes with the SACK blocks */ @@ -442,7 +463,8 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) /* If covered by ack, compute delivered bytes */ if (blk->end == ack) { - u32 sacked = next_hole ? next_hole->start : sb->high_sacked; + u32 sacked = next_hole ? next_hole->start : + seq_max (sb->high_sacked, hole->end); if (PREDICT_FALSE (seq_lt (ack, sacked))) { sb->last_bytes_delivered += ack - hole->end; @@ -454,8 +476,8 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) sb->is_reneging = 0; } } - scoreboard_update_sacked_rxt (sb, hole->start, hole->end, - has_rxt); + scoreboard_update_sacked (sb, hole->start, hole->end, + has_rxt, tc->snd_mss); scoreboard_remove_hole (sb, hole); hole = next_hole; } @@ -464,8 +486,8 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { if (seq_gt (blk->end, hole->start)) { - scoreboard_update_sacked_rxt (sb, hole->start, blk->end, - has_rxt); + scoreboard_update_sacked (sb, hole->start, blk->end, + has_rxt, tc->snd_mss); hole->start = blk->end; } blk_index++; @@ -482,23 +504,25 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) /* Pool might've moved */ hole = scoreboard_get_hole (sb, hole_index); hole->end = blk->start; + next_hole->is_lost = hole->is_lost; - scoreboard_update_sacked_rxt (sb, blk->start, blk->end, - has_rxt); + scoreboard_update_sacked (sb, blk->start, blk->end, + has_rxt, tc->snd_mss); blk_index++; ASSERT (hole->next == scoreboard_hole_index (sb, next_hole)); } else if (seq_lt (blk->start, hole->end)) { - scoreboard_update_sacked_rxt (sb, blk->start, hole->end, - has_rxt); + scoreboard_update_sacked (sb, blk->start, hole->end, + has_rxt, tc->snd_mss); hole->end = blk->start; } hole = scoreboard_next_hole (sb, hole); } } + sb->high_sacked = high_sacked; scoreboard_update_bytes (sb, ack, tc->snd_mss); ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc)); diff --git a/src/vnet/tcp/tcp_types.h b/src/vnet/tcp/tcp_types.h index d7bcac5e3bc..027f0e63300 100644 --- a/src/vnet/tcp/tcp_types.h +++ b/src/vnet/tcp/tcp_types.h @@ -151,6 +151,7 @@ typedef enum tcp_connection_flag_ #define TCP_SCOREBOARD_TRACE (0) #define TCP_MAX_SACK_BLOCKS 255 /**< Max number of SACK blocks stored */ #define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0) +#define TCP_MAX_SACK_REORDER 300 typedef struct _scoreboard_trace_elt { @@ -185,7 +186,8 @@ typedef struct _sack_scoreboard u32 lost_bytes; /**< Bytes lost as per RFC6675 */ u32 last_lost_bytes; /**< Number of bytes last lost */ u32 cur_rxt_hole; /**< Retransmitting from this hole */ - u8 is_reneging; + u32 reorder; /**< Estimate of segment reordering */ + u8 is_reneging; /**< Flag set if peer is reneging*/ #if TCP_SCOREBOARD_TRACE scoreboard_trace_elt_t *trace; -- 2.16.6