X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp_output.c;h=089f85a0ea0f6ce1e78f4d89fc9998b203fad831;hb=7ac053b27fee8f9e437cf7b61357943356381061;hp=c135a311fb5d01ac4130b74fd6f9cecab4b078b6;hpb=c44a558164a466a74a4c10d4e7d7dd1b9a4b01dd;p=vpp.git diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index c135a311fb5..089f85a0ea0 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -559,7 +559,6 @@ tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) tcp_reuse_buffer (vm, b); tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK); TCP_EVT_DBG (TCP_EVT_ACK_SENT, tc); - vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; tc->rcv_las = tc->rcv_nxt; } @@ -631,7 +630,6 @@ tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) tcp_options_write ((u8 *) (th + 1), snd_opts); vnet_buffer (b)->tcp.connection_index = tc->c_c_index; - vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; /* Init retransmit timer. Use update instead of set because of * retransmissions */ @@ -862,7 +860,7 @@ tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) { flags = TCP_FLAG_RST; seq = pkt_th->ack_number; - ack = (tc && tc->state >= TCP_STATE_SYN_RCVD) ? tc->rcv_nxt : 0; + ack = (tc->state >= TCP_STATE_SYN_RCVD) ? tc->rcv_nxt : 0; } else { @@ -1011,6 +1009,23 @@ tcp_send_syn (tcp_connection_t * tc) TCP_EVT_DBG (TCP_EVT_SYN_SENT, tc); } +void +tcp_send_synack (tcp_connection_t * tc) +{ + tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index); + vlib_main_t *vm = wrk->vm; + vlib_buffer_t *b; + u32 bi; + + /* Get buffer */ + if (PREDICT_FALSE (tcp_get_free_buffer_index (wrk, &bi))) + return; + + b = vlib_get_buffer (vm, bi); + tcp_make_synack (tc, b); + tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); +} + /** * Flush tx frame populated by retransmits and timer pops */ @@ -1223,6 +1238,56 @@ tcp_send_ack (tcp_connection_t * tc) tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); } +void +tcp_program_ack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) +{ + if (!(tc->flags & TCP_CONN_SNDACK)) + { + vec_add1 (wrk->pending_acks, tc->c_c_index); + tc->flags |= TCP_CONN_SNDACK; + } +} + +void +tcp_program_dupack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) +{ + if (!(tc->flags & TCP_CONN_SNDACK)) + { + vec_add1 (wrk->pending_acks, tc->c_c_index); + tc->flags |= TCP_CONN_SNDACK; + } + if (tc->pending_dupacks < 255) + tc->pending_dupacks += 1; +} + +void +tcp_send_acks (tcp_worker_ctx_t * wrk) +{ + u32 thread_index, *pending_acks; + tcp_connection_t *tc; + int i, j, n_acks; + + if (!vec_len (wrk->pending_acks)) + return; + + thread_index = wrk->vm->thread_index; + pending_acks = wrk->pending_acks; + for (i = 0; i < vec_len (pending_acks); i++) + { + tc = tcp_connection_get (pending_acks[i], thread_index); + tc->flags &= ~TCP_CONN_SNDACK; + n_acks = clib_max (1, tc->pending_dupacks); + /* If we're supposed to send dupacks but have no ooo data + * send only one ack */ + if (tc->pending_dupacks && !vec_len (tc->snd_sacks)) + n_acks = 1; + for (j = 0; j < n_acks; j++) + tcp_send_ack (tc); + tc->pending_dupacks = 0; + } + _vec_len (wrk->pending_acks) = 0; +} + /** * Delayed ack timer handler * @@ -1240,49 +1305,27 @@ tcp_timer_delack_handler (u32 index) } /** - * Build a retransmit segment + * Allocate a new buffer and build a new tcp segment * - * @return the number of bytes in the segment or 0 if there's nothing to - * retransmit + * @param wrk tcp worker + * @param tc connection for which the segment will be allocated + * @param offset offset of the first byte in the tx fifo + * @param max_deq_byte segment size + * @param[out] b pointer to buffer allocated + * + * @return the number of bytes in the segment or 0 if buffer cannot be + * allocated or no data available */ -static u32 -tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk, - tcp_connection_t * tc, u32 offset, - u32 max_deq_bytes, vlib_buffer_t ** b) +static int +tcp_prepare_segment (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, + u32 offset, u32 max_deq_bytes, vlib_buffer_t ** b) { u32 bytes_per_buffer = vnet_get_tcp_main ()->bytes_per_buffer; + u32 bi, seg_size; vlib_main_t *vm = wrk->vm; int n_bytes = 0; - u32 start, bi, available_bytes, seg_size; u8 *data; - ASSERT (tc->state >= TCP_STATE_ESTABLISHED); - ASSERT (max_deq_bytes != 0); - - /* - * Make sure we can retransmit something - */ - available_bytes = session_tx_fifo_max_dequeue (&tc->connection); - ASSERT (available_bytes >= offset); - available_bytes -= offset; - if (!available_bytes) - return 0; - max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes); - max_deq_bytes = clib_min (available_bytes, max_deq_bytes); - - /* Start is beyond snd_congestion */ - start = tc->snd_una + offset; - if (seq_geq (start, tc->snd_congestion)) - goto done; - - /* Don't overshoot snd_congestion */ - if (seq_gt (start + max_deq_bytes, tc->snd_congestion)) - { - max_deq_bytes = tc->snd_congestion - start; - if (max_deq_bytes == 0) - goto done; - } - seg_size = max_deq_bytes + MAX_HDRS_LEN; /* @@ -1330,7 +1373,7 @@ tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk, } } - tcp_get_free_buffer_index (wrk, &bi); + (void) tcp_get_free_buffer_index (wrk, &bi); ASSERT (bi != (u32) ~ 0); *b = vlib_get_buffer (vm, bi); data = tcp_init_buffer (vm, *b); @@ -1374,6 +1417,55 @@ tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk, ASSERT (n_bytes > 0); ASSERT (((*b)->current_data + (*b)->current_length) <= bytes_per_buffer); + return n_bytes; +} + +/** + * Build a retransmit segment + * + * @return the number of bytes in the segment or 0 if there's nothing to + * retransmit + */ +static u32 +tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk, + tcp_connection_t * tc, u32 offset, + u32 max_deq_bytes, vlib_buffer_t ** b) +{ + u32 start, available_bytes; + int n_bytes = 0; + + ASSERT (tc->state >= TCP_STATE_ESTABLISHED); + ASSERT (max_deq_bytes != 0); + + /* + * Make sure we can retransmit something + */ + available_bytes = session_tx_fifo_max_dequeue (&tc->connection); + ASSERT (available_bytes >= offset); + available_bytes -= offset; + if (!available_bytes) + return 0; + + max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes); + max_deq_bytes = clib_min (available_bytes, max_deq_bytes); + + /* Start is beyond snd_congestion */ + start = tc->snd_una + offset; + if (seq_geq (start, tc->snd_congestion)) + goto done; + + /* Don't overshoot snd_congestion */ + if (seq_gt (start + max_deq_bytes, tc->snd_congestion)) + { + max_deq_bytes = tc->snd_congestion - start; + if (max_deq_bytes == 0) + goto done; + } + + n_bytes = tcp_prepare_segment (wrk, tc, offset, max_deq_bytes, b); + if (!n_bytes) + return 0; + if (tcp_in_fastrecovery (tc)) tc->snd_rxt_bytes += n_bytes; @@ -1696,6 +1788,36 @@ tcp_retransmit_first_unacked (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) return 0; } +static int +tcp_fast_retransmit_unsent (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, + u32 burst_size) +{ + u32 offset, n_segs = 0, n_written, bi; + vlib_main_t *vm = wrk->vm; + vlib_buffer_t *b = 0; + + tc->snd_nxt = tc->snd_una_max; + offset = tc->snd_una_max - tc->snd_una; + while (n_segs < burst_size) + { + n_written = tcp_prepare_segment (wrk, tc, offset, tc->snd_mss, &b); + if (!n_written) + goto done; + + bi = vlib_get_buffer_index (vm, b); + tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); + offset += n_written; + n_segs += 1; + } + +done: + return n_segs; +} + +#define scoreboard_rescue_rxt_valid(_sb, _tc) \ + (seq_geq (_sb->rescue_rxt, _tc->snd_una) \ + && seq_leq (_sb->rescue_rxt, _tc->snd_congestion)) + /** * Do fast retransmit with SACKs */ @@ -1703,55 +1825,54 @@ int tcp_fast_retransmit_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, u32 burst_size) { + u32 n_written = 0, offset, max_bytes, n_segs = 0, n_segs_now; + sack_scoreboard_hole_t *hole; vlib_main_t *vm = wrk->vm; - u32 n_written = 0, offset, max_bytes, n_segs = 0; vlib_buffer_t *b = 0; - sack_scoreboard_hole_t *hole; sack_scoreboard_t *sb; u32 bi, old_snd_nxt; int snd_space; + u32 max_deq; u8 snd_limited = 0, can_rescue = 0; ASSERT (tcp_in_fastrecovery (tc)); - old_snd_nxt = tc->snd_nxt; - sb = &tc->sack_sb; snd_space = tcp_available_cc_snd_space (tc); - hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); - if (snd_space < tc->snd_mss) { tcp_program_fastretransmit (wrk, tc); - goto done; + return 0; } TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); + old_snd_nxt = tc->snd_nxt; + sb = &tc->sack_sb; + hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); + + max_deq = session_tx_fifo_max_dequeue (&tc->connection); + max_deq -= tc->snd_una_max - tc->snd_una; + while (snd_space > 0 && n_segs < burst_size) { - hole = scoreboard_next_rxt_hole (sb, hole, - tcp_fastrecovery_sent_1_smss (tc), - &can_rescue, &snd_limited); + hole = scoreboard_next_rxt_hole (sb, hole, max_deq, &can_rescue, + &snd_limited); if (!hole) { - if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una) - || seq_gt (sb->rescue_rxt, - tc->snd_congestion))) + if (max_deq) { - if (tcp_fastrecovery_first (tc)) - break; - - /* We tend to lose the first segment. Try re-resending - * it but only once and after we've tried everything */ - hole = scoreboard_first_hole (sb); - if (hole && hole->start == tc->snd_una) - { - tcp_retransmit_first_unacked (wrk, tc); - tcp_fastrecovery_first_on (tc); - n_segs += 1; - } - break; + snd_space = clib_min (max_deq, snd_space); + burst_size = clib_min (burst_size - n_segs, + snd_space / tc->snd_mss); + n_segs_now = tcp_fast_retransmit_unsent (wrk, tc, burst_size); + if (max_deq > n_segs_now * tc->snd_mss) + tcp_program_fastretransmit (wrk, tc); + n_segs += n_segs_now; + goto done; } + if (!can_rescue || scoreboard_rescue_rxt_valid (sb, tc)) + break; + /* If rescue rxt undefined or less than snd_una then one segment of * up to SMSS octets that MUST include the highest outstanding * unSACKed sequence number SHOULD be returned, and RescueRxt set to @@ -1778,19 +1899,21 @@ tcp_fast_retransmit_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, max_bytes = snd_limited ? clib_min (max_bytes, tc->snd_mss) : max_bytes; if (max_bytes == 0) break; + offset = sb->high_rxt - tc->snd_una; tc->snd_nxt = sb->high_rxt; n_written = tcp_prepare_retransmit_segment (wrk, tc, offset, max_bytes, &b); + ASSERT (n_written <= snd_space); /* Nothing left to retransmit */ if (n_written == 0) break; bi = vlib_get_buffer_index (vm, b); - sb->high_rxt += n_written; tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); - ASSERT (n_written <= snd_space); + + sb->high_rxt += n_written; snd_space -= n_written; n_segs += 1; } @@ -1811,24 +1934,26 @@ int tcp_fast_retransmit_no_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, u32 burst_size) { - u32 n_written = 0, offset = 0, bi, old_snd_nxt; + u32 n_written = 0, offset = 0, bi, old_snd_nxt, max_deq, n_segs_now; vlib_main_t *vm = wrk->vm; int snd_space, n_segs = 0; vlib_buffer_t *b; ASSERT (tcp_in_fastrecovery (tc)); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); - - /* Start resending from first un-acked segment */ old_snd_nxt = tc->snd_nxt; - tc->snd_nxt = tc->snd_una; - snd_space = tcp_available_cc_snd_space (tc); + if (!tcp_fastrecovery_first (tc)) + goto send_unsent; + + /* RFC 6582: [If a partial ack], retransmit the first unacknowledged + * segment. */ + snd_space = tc->sack_sb.last_bytes_delivered; + tc->snd_nxt = tc->snd_una; while (snd_space > 0 && n_segs < burst_size) { - offset += n_written; - n_written = tcp_prepare_retransmit_segment (wrk, tc, offset, snd_space, - &b); + n_written = tcp_prepare_retransmit_segment (wrk, tc, offset, + tc->snd_mss, &b); /* Nothing left to retransmit */ if (n_written == 0) @@ -1837,16 +1962,37 @@ tcp_fast_retransmit_no_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, bi = vlib_get_buffer_index (vm, b); tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); snd_space -= n_written; + offset += n_written; n_segs += 1; } - /* More data to resend */ - if (seq_lt (tc->snd_nxt, tc->snd_congestion)) - tcp_program_fastretransmit (wrk, tc); + if (n_segs == burst_size) + goto done; + +send_unsent: + + /* RFC 6582: Send a new segment if permitted by the new value of cwnd. */ + snd_space = tcp_available_cc_snd_space (tc); + if (snd_space < tc->snd_mss || tc->snd_mss == 0) + goto done; + + max_deq = session_tx_fifo_max_dequeue (&tc->connection); + max_deq -= tc->snd_una_max - tc->snd_una; + if (max_deq) + { + snd_space = clib_min (max_deq, snd_space); + burst_size = clib_min (burst_size - n_segs, snd_space / tc->snd_mss); + n_segs_now = tcp_fast_retransmit_unsent (wrk, tc, burst_size); + if (max_deq > n_segs_now * tc->snd_mss) + tcp_program_fastretransmit (wrk, tc); + n_segs += n_segs_now; + } - /* Restore snd_nxt. If window allows, send 1 SMSS of new data */ + /* Restore snd_nxt */ tc->snd_nxt = old_snd_nxt; +done: + tcp_fastrecovery_first_off (tc); return n_segs; } @@ -1863,13 +2009,6 @@ tcp_fast_retransmit (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, return tcp_fast_retransmit_no_sack (wrk, tc, burst_size); } -static u32 -tcp_session_has_ooo_data (tcp_connection_t * tc) -{ - stream_session_t *s = session_get (tc->c_s_index, tc->c_thread_index); - return svm_fifo_has_ooo_data (s->server_rx_fifo); -} - static void tcp_output_handle_link_local (tcp_connection_t * tc0, vlib_buffer_t * b0, u16 * next0, u32 * error0) @@ -1974,25 +2113,6 @@ tcp_output_handle_packet (tcp_connection_t * tc0, vlib_buffer_t * b0, tcp_output_handle_link_local (tc0, b0, next0, error0); } - /* Filter out DUPACKs if there are no OOO segments left */ - if (PREDICT_FALSE (vnet_buffer (b0)->tcp.flags & TCP_BUF_FLAG_DUPACK)) - { - /* N.B. Should not filter burst of dupacks. Two issues: - * 1) dupacks open cwnd on remote peer when congested - * 2) acks leaving should have the latest rcv_wnd since the - * burst may have eaten up all of it, so only the old ones - * could be filtered. - */ - if (!tcp_session_has_ooo_data (tc0)) - { - *error0 = TCP_ERROR_FILTERED_DUPACKS; - *next0 = TCP_OUTPUT_NEXT_DROP; - return; - } - } - - /* Stop DELACK timer and fix flags */ - tc0->flags &= ~(TCP_CONN_SNDACK); if (!TCP_ALWAYS_ACK) tcp_timer_reset (tc0, TCP_TIMER_DELACK); }