From 36ee9f1ca37daf277c2cd8d33bf16eabc15773e5 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Fri, 2 Nov 2018 12:52:10 -0700 Subject: [PATCH] tcp: send unsent data in fast recovery Allows sending of unsent data in fast recovery and consolidates logic in tcp, instead of splitting it between tcp fast retransmit and tcp output path called by the session layer. Change-Id: I9b12cdf2aa2ac50b9f25e46856fed037163501fe Signed-off-by: Florin Coras --- src/vnet/session/session.c | 2 +- src/vnet/session/session_node.c | 4 +- src/vnet/tcp/tcp.c | 51 ++++----- src/vnet/tcp/tcp_input.c | 29 +++-- src/vnet/tcp/tcp_output.c | 231 +++++++++++++++++++++++++++------------- 5 files changed, 196 insertions(+), 121 deletions(-) diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index f6894868a3a..b944f5a2104 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -1081,7 +1081,7 @@ stream_session_disconnect (stream_session_t * s) * held, just append a new event to pending disconnects vector. */ if (vlib_thread_is_main_w_barrier () || thread_index == s->thread_index) { - wrk = session_manager_get_worker (thread_index); + wrk = session_manager_get_worker (s->thread_index); vec_add2 (wrk->pending_disconnects, evt, 1); clib_memset (evt, 0, sizeof (*evt)); evt->session_handle = session_handle (s); diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c index 5ed681d03c7..edc518ee872 100644 --- a/src/vnet/session/session_node.c +++ b/src/vnet/session/session_node.c @@ -789,7 +789,7 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, /* Make sure postponed events are handled first */ fifo_events = wrk->free_event_vector; vec_append (fifo_events, wrk->postponed_event_vector); - _vec_len (wrk->pending_disconnects) = 0; + _vec_len (wrk->postponed_event_vector) = 0; /* Try to dequeue what is available. Don't wait for lock. * XXX: we may need priorities here */ @@ -810,8 +810,8 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vec_append (fifo_events, wrk->pending_event_vector); vec_append (fifo_events, wrk->pending_disconnects); - _vec_len (wrk->postponed_event_vector) = 0; _vec_len (wrk->pending_event_vector) = 0; + _vec_len (wrk->pending_disconnects) = 0; n_events = vec_len (fifo_events); if (PREDICT_FALSE (!n_events)) diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 1fb95b3ad3a..f8e74a88fcf 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -972,13 +972,13 @@ format_tcp_scoreboard (u8 * s, va_list * args) hole = scoreboard_first_hole (sb); if (hole) - s = format (s, "\n%Uhead %u tail %u %u holes:\n", format_white_space, - indent, sb->head, sb->tail, pool_elts (sb->holes)); + s = format (s, "\n%Uhead %u tail %u %u holes:\n%U", format_white_space, + indent, sb->head, sb->tail, pool_elts (sb->holes), + format_white_space, indent); while (hole) { - s = format (s, "%U%U", format_white_space, indent, format_tcp_sack_hole, - hole, tc); + s = format (s, "%U", format_tcp_sack_hole, hole, tc); hole = scoreboard_next_hole (sb, hole); } @@ -1051,38 +1051,25 @@ tcp_snd_space_inline (tcp_connection_t * tc) { int snd_space, snt_limited; - if (PREDICT_TRUE (!tcp_in_fastrecovery (tc))) - { - snd_space = tcp_available_output_snd_space (tc); - - /* If we haven't gotten dupacks or if we did and have gotten sacked - * bytes then we can still send as per Limited Transmit (RFC3042) */ - if (PREDICT_FALSE (tc->rcv_dupacks != 0 - && (tcp_opts_sack_permitted (tc) - && tc->sack_sb.last_sacked_bytes == 0))) - { - if (tc->rcv_dupacks == 1 && tc->limited_transmit != tc->snd_nxt) - tc->limited_transmit = tc->snd_nxt; - ASSERT (seq_leq (tc->limited_transmit, tc->snd_nxt)); + if (PREDICT_FALSE (tcp_in_fastrecovery (tc))) + return 0; - snt_limited = tc->snd_nxt - tc->limited_transmit; - snd_space = clib_max (2 * tc->snd_mss - snt_limited, 0); - } - return tcp_round_snd_space (tc, snd_space); - } + snd_space = tcp_available_output_snd_space (tc); - /* RFC 5681: When previously unsent data is available and the new value of - * cwnd and the receiver's advertised window allow, a TCP SHOULD send 1*SMSS - * bytes of previously unsent data. */ - if (tcp_in_fastrecovery (tc) && !tcp_fastrecovery_sent_1_smss (tc)) + /* If we haven't gotten dupacks or if we did and have gotten sacked + * bytes then we can still send as per Limited Transmit (RFC3042) */ + if (PREDICT_FALSE (tc->rcv_dupacks != 0 + && (tcp_opts_sack_permitted (tc) + && tc->sack_sb.last_sacked_bytes == 0))) { - if (tcp_available_cc_snd_space (tc) < tc->snd_mss) - return 0; - tcp_fastrecovery_1_smss_on (tc); - return tc->snd_mss; - } + if (tc->rcv_dupacks == 1 && tc->limited_transmit != tc->snd_nxt) + tc->limited_transmit = tc->snd_nxt; + ASSERT (seq_leq (tc->limited_transmit, tc->snd_nxt)); - return 0; + snt_limited = tc->snd_nxt - tc->limited_transmit; + snd_space = clib_max (2 * tc->snd_mss - snt_limited, 0); + } + return tcp_round_snd_space (tc, snd_space); } u32 diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 21e5f3cdaba..0b79a6699d7 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -719,8 +719,7 @@ scoreboard_update_bytes (tcp_connection_t * tc, sack_scoreboard_t * sb) sack_scoreboard_hole_t * scoreboard_next_rxt_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * start, - u8 have_sent_1_smss, - u8 * can_rescue, u8 * snd_limited) + u8 have_unsent, u8 * can_rescue, u8 * snd_limited) { sack_scoreboard_hole_t *hole = 0; @@ -742,11 +741,11 @@ scoreboard_next_rxt_hole (sack_scoreboard_t * sb, } else { - /* Rule (2): output takes care of transmitting new data */ - if (!have_sent_1_smss) + /* Rule (2): available unsent data */ + if (have_unsent) { - hole = 0; sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + return 0; } /* Rule (3): if hole not lost */ else if (seq_lt (hole->start, sb->high_sacked)) @@ -772,16 +771,17 @@ scoreboard_next_rxt_hole (sack_scoreboard_t * sb, } static void -scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 seq) +scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 snd_una) { sack_scoreboard_hole_t *hole; hole = scoreboard_first_hole (sb); if (hole) { - seq = seq_gt (seq, hole->start) ? seq : hole->start; + snd_una = seq_gt (snd_una, hole->start) ? snd_una : hole->start; sb->cur_rxt_hole = sb->head; } - sb->high_rxt = seq; + sb->high_rxt = snd_una; + sb->rescue_rxt = snd_una - 1; } void @@ -1306,7 +1306,6 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) { tc->cwnd = tc->ssthresh; scoreboard_init_high_rxt (&tc->sack_sb, tc->snd_una); - tc->sack_sb.rescue_rxt = tc->snd_una - 1; } else { @@ -1316,6 +1315,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) tc->cwnd = tc->ssthresh + 3 * tc->snd_mss; } + /* Constrain rate until we get a partial ack */ pacer_wnd = clib_max (0.1 * tc->cwnd, 2 * tc->snd_mss); tcp_connection_tx_pacer_reset (tc, pacer_wnd, 0 /* start bucket */ ); @@ -1387,6 +1387,10 @@ partial_ack: * Legitimate ACK. 2) If PARTIAL ACK try to retransmit */ + /* Update the pacing rate. For the first partial ack we move from + * the artificially constrained rate to the one after congestion */ + tcp_connection_tx_pacer_update (tc); + /* XXX limit this only to first partial ack? */ tcp_retransmit_timer_force_update (tc); @@ -1422,10 +1426,14 @@ partial_ack: { /* Apparently all retransmitted holes have been acked */ tc->snd_rxt_bytes = 0; + tc->sack_sb.high_rxt = tc->snd_una; } } else { + tcp_fastrecovery_first_on (tc); + /* Reuse last bytes delivered to track total bytes acked */ + tc->sack_sb.last_bytes_delivered += tc->bytes_acked; if (tc->snd_rxt_bytes > tc->bytes_acked) tc->snd_rxt_bytes -= tc->bytes_acked; else @@ -1473,7 +1481,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, { tcp_make_ack (tc, b); *next = tcp_next_output (tc->c_is_ip4); - *error = TCP_ERROR_ACK_INVALID; + *error = TCP_ERROR_ACK_FUTURE; TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 0, vnet_buffer (b)->tcp.ack_number); return -1; @@ -1483,7 +1491,6 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, vnet_buffer (b)->tcp.ack_number); tc->snd_nxt = vnet_buffer (b)->tcp.ack_number; - *error = TCP_ERROR_ACK_FUTURE; } /* If old ACK, probably it's an old dupack */ diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 78f7c3f8294..b15cf9b362b 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1240,49 +1240,27 @@ tcp_timer_delack_handler (u32 index) } /** - * Build a retransmit segment + * Allocate a new buffer and build a new tcp segment * - * @return the number of bytes in the segment or 0 if there's nothing to - * retransmit + * @param wrk tcp worker + * @param tc connection for which the segment will be allocated + * @param offset offset of the first byte in the tx fifo + * @param max_deq_byte segment size + * @param[out] b pointer to buffer allocated + * + * @return the number of bytes in the segment or 0 if buffer cannot be + * allocated or no data available */ -static u32 -tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk, - tcp_connection_t * tc, u32 offset, - u32 max_deq_bytes, vlib_buffer_t ** b) +static int +tcp_prepare_segment (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, + u32 offset, u32 max_deq_bytes, vlib_buffer_t ** b) { u32 bytes_per_buffer = vnet_get_tcp_main ()->bytes_per_buffer; + u32 bi, seg_size; vlib_main_t *vm = wrk->vm; int n_bytes = 0; - u32 start, bi, available_bytes, seg_size; u8 *data; - ASSERT (tc->state >= TCP_STATE_ESTABLISHED); - ASSERT (max_deq_bytes != 0); - - /* - * Make sure we can retransmit something - */ - available_bytes = session_tx_fifo_max_dequeue (&tc->connection); - ASSERT (available_bytes >= offset); - available_bytes -= offset; - if (!available_bytes) - return 0; - max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes); - max_deq_bytes = clib_min (available_bytes, max_deq_bytes); - - /* Start is beyond snd_congestion */ - start = tc->snd_una + offset; - if (seq_geq (start, tc->snd_congestion)) - goto done; - - /* Don't overshoot snd_congestion */ - if (seq_gt (start + max_deq_bytes, tc->snd_congestion)) - { - max_deq_bytes = tc->snd_congestion - start; - if (max_deq_bytes == 0) - goto done; - } - seg_size = max_deq_bytes + MAX_HDRS_LEN; /* @@ -1374,6 +1352,55 @@ tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk, ASSERT (n_bytes > 0); ASSERT (((*b)->current_data + (*b)->current_length) <= bytes_per_buffer); + return n_bytes; +} + +/** + * Build a retransmit segment + * + * @return the number of bytes in the segment or 0 if there's nothing to + * retransmit + */ +static u32 +tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk, + tcp_connection_t * tc, u32 offset, + u32 max_deq_bytes, vlib_buffer_t ** b) +{ + u32 start, available_bytes; + int n_bytes = 0; + + ASSERT (tc->state >= TCP_STATE_ESTABLISHED); + ASSERT (max_deq_bytes != 0); + + /* + * Make sure we can retransmit something + */ + available_bytes = session_tx_fifo_max_dequeue (&tc->connection); + ASSERT (available_bytes >= offset); + available_bytes -= offset; + if (!available_bytes) + return 0; + + max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes); + max_deq_bytes = clib_min (available_bytes, max_deq_bytes); + + /* Start is beyond snd_congestion */ + start = tc->snd_una + offset; + if (seq_geq (start, tc->snd_congestion)) + goto done; + + /* Don't overshoot snd_congestion */ + if (seq_gt (start + max_deq_bytes, tc->snd_congestion)) + { + max_deq_bytes = tc->snd_congestion - start; + if (max_deq_bytes == 0) + goto done; + } + + n_bytes = tcp_prepare_segment (wrk, tc, offset, max_deq_bytes, b); + if (!n_bytes) + return 0; + if (tcp_in_fastrecovery (tc)) tc->snd_rxt_bytes += n_bytes; @@ -1696,6 +1723,36 @@ tcp_retransmit_first_unacked (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) return 0; } +static int +tcp_fast_retransmit_unsent (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, + u32 burst_size) +{ + u32 offset, n_segs = 0, n_written, bi; + vlib_main_t *vm = wrk->vm; + vlib_buffer_t *b = 0; + + tc->snd_nxt = tc->snd_una_max; + offset = tc->snd_una_max - tc->snd_una; + while (n_segs < burst_size) + { + n_written = tcp_prepare_segment (wrk, tc, offset, tc->snd_mss, &b); + if (!n_written) + goto done; + + bi = vlib_get_buffer_index (vm, b); + tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); + offset += n_written; + n_segs += 1; + } + +done: + return n_segs; +} + +#define scoreboard_rescue_rxt_valid(_sb, _tc) \ + (seq_geq (_sb->rescue_rxt, _tc->snd_una) \ + && seq_leq (_sb->rescue_rxt, _tc->snd_congestion)) + /** * Do fast retransmit with SACKs */ @@ -1703,55 +1760,54 @@ int tcp_fast_retransmit_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, u32 burst_size) { + u32 n_written = 0, offset, max_bytes, n_segs = 0, n_segs_now; + sack_scoreboard_hole_t *hole; vlib_main_t *vm = wrk->vm; - u32 n_written = 0, offset, max_bytes, n_segs = 0; vlib_buffer_t *b = 0; - sack_scoreboard_hole_t *hole; sack_scoreboard_t *sb; u32 bi, old_snd_nxt; int snd_space; + u32 max_deq; u8 snd_limited = 0, can_rescue = 0; ASSERT (tcp_in_fastrecovery (tc)); - old_snd_nxt = tc->snd_nxt; - sb = &tc->sack_sb; snd_space = tcp_available_cc_snd_space (tc); - hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); - if (snd_space < tc->snd_mss) { tcp_program_fastretransmit (wrk, tc); - goto done; + return 0; } TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); + old_snd_nxt = tc->snd_nxt; + sb = &tc->sack_sb; + hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); + + max_deq = session_tx_fifo_max_dequeue (&tc->connection); + max_deq -= tc->snd_una_max - tc->snd_una; + while (snd_space > 0 && n_segs < burst_size) { - hole = scoreboard_next_rxt_hole (sb, hole, - tcp_fastrecovery_sent_1_smss (tc), - &can_rescue, &snd_limited); + hole = scoreboard_next_rxt_hole (sb, hole, max_deq, &can_rescue, + &snd_limited); if (!hole) { - if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una) - || seq_gt (sb->rescue_rxt, - tc->snd_congestion))) + if (max_deq) { - if (tcp_fastrecovery_first (tc)) - break; - - /* We tend to lose the first segment. Try re-resending - * it but only once and after we've tried everything */ - hole = scoreboard_first_hole (sb); - if (hole && hole->start == tc->snd_una) - { - tcp_retransmit_first_unacked (wrk, tc); - tcp_fastrecovery_first_on (tc); - n_segs += 1; - } - break; + snd_space = clib_min (max_deq, snd_space); + burst_size = clib_min (burst_size - n_segs, + snd_space / tc->snd_mss); + n_segs_now = tcp_fast_retransmit_unsent (wrk, tc, burst_size); + if (max_deq > n_segs_now * tc->snd_mss) + tcp_program_fastretransmit (wrk, tc); + n_segs += n_segs_now; + goto done; } + if (!can_rescue || scoreboard_rescue_rxt_valid (sb, tc)) + break; + /* If rescue rxt undefined or less than snd_una then one segment of * up to SMSS octets that MUST include the highest outstanding * unSACKed sequence number SHOULD be returned, and RescueRxt set to @@ -1778,19 +1834,21 @@ tcp_fast_retransmit_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, max_bytes = snd_limited ? clib_min (max_bytes, tc->snd_mss) : max_bytes; if (max_bytes == 0) break; + offset = sb->high_rxt - tc->snd_una; tc->snd_nxt = sb->high_rxt; n_written = tcp_prepare_retransmit_segment (wrk, tc, offset, max_bytes, &b); + ASSERT (n_written <= snd_space); /* Nothing left to retransmit */ if (n_written == 0) break; bi = vlib_get_buffer_index (vm, b); - sb->high_rxt += n_written; tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); - ASSERT (n_written <= snd_space); + + sb->high_rxt += n_written; snd_space -= n_written; n_segs += 1; } @@ -1811,24 +1869,26 @@ int tcp_fast_retransmit_no_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, u32 burst_size) { - u32 n_written = 0, offset = 0, bi, old_snd_nxt; + u32 n_written = 0, offset = 0, bi, old_snd_nxt, max_deq, n_segs_now; vlib_main_t *vm = wrk->vm; int snd_space, n_segs = 0; vlib_buffer_t *b; ASSERT (tcp_in_fastrecovery (tc)); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); - - /* Start resending from first un-acked segment */ old_snd_nxt = tc->snd_nxt; - tc->snd_nxt = tc->snd_una; - snd_space = tcp_available_cc_snd_space (tc); + if (!tcp_fastrecovery_first (tc)) + goto send_unsent; + + /* RFC 6582: [If a partial ack], retransmit the first unacknowledged + * segment. */ + snd_space = tc->sack_sb.last_bytes_delivered; + tc->snd_nxt = tc->snd_una; while (snd_space > 0 && n_segs < burst_size) { - offset += n_written; - n_written = tcp_prepare_retransmit_segment (wrk, tc, offset, snd_space, - &b); + n_written = tcp_prepare_retransmit_segment (wrk, tc, offset, + tc->snd_mss, &b); /* Nothing left to retransmit */ if (n_written == 0) @@ -1837,16 +1897,37 @@ tcp_fast_retransmit_no_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, bi = vlib_get_buffer_index (vm, b); tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); snd_space -= n_written; + offset += n_written; n_segs += 1; } - /* More data to resend */ - if (seq_lt (tc->snd_nxt, tc->snd_congestion)) - tcp_program_fastretransmit (wrk, tc); + if (n_segs == burst_size) + goto done; + +send_unsent: - /* Restore snd_nxt. If window allows, send 1 SMSS of new data */ + /* RFC 6582: Send a new segment if permitted by the new value of cwnd. */ + snd_space = tcp_available_cc_snd_space (tc); + if (snd_space < tc->snd_mss) + goto done; + + max_deq = session_tx_fifo_max_dequeue (&tc->connection); + max_deq -= tc->snd_una_max - tc->snd_una; + if (max_deq) + { + snd_space = clib_min (max_deq, snd_space); + burst_size = clib_min (burst_size - n_segs, snd_space / tc->snd_mss); + n_segs_now = tcp_fast_retransmit_unsent (wrk, tc, burst_size); + if (max_deq > n_segs_now * tc->snd_mss) + tcp_program_fastretransmit (wrk, tc); + n_segs += n_segs_now; + } + + /* Restore snd_nxt */ tc->snd_nxt = old_snd_nxt; +done: + tcp_fastrecovery_first_off (tc); return n_segs; } -- 2.16.6