X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp_input.c;h=21e5f3cdabaeb1d9ba4256473368acf86633870e;hb=776f3d85ebd4edfb70b0f748890f1efd98d8474c;hp=0c13bbec626e60bfb6c6d93ca246585fda2cdfc8;hpb=8b20bf5ef72a85ed70d7457f33c096f1eef51d0a;p=vpp.git diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 0c13bbec626..21e5f3cdaba 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -123,7 +123,7 @@ tcp_segment_in_rcv_wnd (tcp_connection_t * tc, u32 seq, u32 end_seq) * @param to TCP options data structure to be populated * @return -1 if parsing failed */ -int +static int tcp_options_parse (tcp_header_t * th, tcp_options_t * to) { const u8 *data; @@ -135,7 +135,8 @@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to) data = (const u8 *) (th + 1); /* Zero out all flags but those set in SYN */ - to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE); + to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE + | TCP_OPTS_FLAG_SACK); for (; opts_len > 0; opts_len -= opt_len, data += opt_len) { @@ -257,7 +258,7 @@ tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end) { ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval)); tc->tsval_recent = tc->rcv_opts.tsval; - tc->tsval_recent_age = tcp_time_now (); + tc->tsval_recent_age = tcp_time_now_w_thread (tc->c_thread_index); } } @@ -307,7 +308,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, /* If it just so happens that a segment updates tsval_recent for a * segment over 24 days old, invalidate tsval_recent. */ if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE, - tcp_time_now ())) + tcp_time_now_w_thread (tc0->c_thread_index))) { /* Age isn't reset until we get a valid tsval (bsd inspired) */ tc0->tsval_recent = 0; @@ -453,18 +454,24 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) /* Karn's rule, part 1. Don't use retransmitted segments to estimate * RTT because they're ambiguous. */ if (tcp_in_cong_recovery (tc) || tc->sack_sb.sacked_bytes) - goto done; + { + if (tcp_in_recovery (tc)) + return 0; + goto done; + } if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq)) { - mrtt = tcp_time_now () - tc->rtt_ts; + tc->mrtt_us = tcp_time_now_us (tc->c_thread_index) - tc->rtt_ts; + mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1); } /* As per RFC7323 TSecr can be used for RTTM only if the segment advances * snd_una, i.e., the left side of the send window: * seq_lt (tc->snd_una, ack). This is a condition for calling update_rtt */ else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr) { - mrtt = tcp_time_now () - tc->rcv_opts.tsecr; + u32 now = tcp_time_now_w_thread (tc->c_thread_index); + mrtt = clib_max (now - tc->rcv_opts.tsecr, 1); } /* Ignore dubious measurements */ @@ -479,7 +486,7 @@ done: tc->rtt_ts = 0; /* If we got here something must've been ACKed so make sure boff is 0, - * even if mrrt is not valid since we update the rto lower */ + * even if mrtt is not valid since we update the rto lower */ tc->rto_boff = 0; tcp_update_rto (tc); @@ -534,7 +541,60 @@ tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b, return ((*is_dack || tcp_in_cong_recovery (tc)) && !tcp_is_lost_fin (tc)); } -void +static u32 +scoreboard_hole_index (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + ASSERT (!pool_is_free_index (sb->holes, hole - sb->holes)); + return hole - sb->holes; +} + +static u32 +scoreboard_hole_bytes (sack_scoreboard_hole_t * hole) +{ + return hole->end - hole->start; +} + +sack_scoreboard_hole_t * +scoreboard_get_hole (sack_scoreboard_t * sb, u32 index) +{ + if (index != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, index); + return 0; +} + +sack_scoreboard_hole_t * +scoreboard_next_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + if (hole->next != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, hole->next); + return 0; +} + +sack_scoreboard_hole_t * +scoreboard_prev_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, hole->prev); + return 0; +} + +sack_scoreboard_hole_t * +scoreboard_first_hole (sack_scoreboard_t * sb) +{ + if (sb->head != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, sb->head); + return 0; +} + +sack_scoreboard_hole_t * +scoreboard_last_hole (sack_scoreboard_t * sb) +{ + if (sb->tail != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, sb->tail); + return 0; +} + +static void scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) { sack_scoreboard_hole_t *next, *prev; @@ -564,12 +624,12 @@ scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) /* Poison the entry */ if (CLIB_DEBUG > 0) - memset (hole, 0xfe, sizeof (*hole)); + clib_memset (hole, 0xfe, sizeof (*hole)); pool_put (sb->holes, hole); } -sack_scoreboard_hole_t * +static sack_scoreboard_hole_t * scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, u32 start, u32 end) { @@ -577,7 +637,7 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, u32 hole_index; pool_get (sb->holes, hole); - memset (hole, 0, sizeof (*hole)); + clib_memset (hole, 0, sizeof (*hole)); hole->start = start; hole->end = end; @@ -606,42 +666,48 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, return hole; } -void +static void scoreboard_update_bytes (tcp_connection_t * tc, sack_scoreboard_t * sb) { - sack_scoreboard_hole_t *hole, *prev; + sack_scoreboard_hole_t *left, *right; u32 bytes = 0, blks = 0; sb->lost_bytes = 0; sb->sacked_bytes = 0; - hole = scoreboard_last_hole (sb); - if (!hole) + left = scoreboard_last_hole (sb); + if (!left) return; - if (seq_gt (sb->high_sacked, hole->end)) + if (seq_gt (sb->high_sacked, left->end)) { - bytes = sb->high_sacked - hole->end; + bytes = sb->high_sacked - left->end; blks = 1; } - while ((prev = scoreboard_prev_hole (sb, hole)) - && (bytes < (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss - && blks < TCP_DUPACK_THRESHOLD)) + while ((right = left) + && bytes < (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss + && blks < TCP_DUPACK_THRESHOLD + /* left not updated if above conditions fail */ + && (left = scoreboard_prev_hole (sb, right))) { - bytes += hole->start - prev->end; + bytes += right->start - left->end; blks++; - hole = prev; } - while (hole) + /* left is first lost */ + if (left) { - sb->lost_bytes += scoreboard_hole_bytes (hole); - hole->is_lost = 1; - prev = hole; - hole = scoreboard_prev_hole (sb, hole); - if (hole) - bytes += prev->start - hole->end; + do + { + sb->lost_bytes += scoreboard_hole_bytes (right); + left->is_lost = 1; + left = scoreboard_prev_hole (sb, right); + if (left) + bytes += right->start - left->end; + } + while ((right = left)); } + sb->sacked_bytes = bytes; } @@ -685,7 +751,7 @@ scoreboard_next_rxt_hole (sack_scoreboard_t * sb, /* Rule (3): if hole not lost */ else if (seq_lt (hole->start, sb->high_sacked)) { - *snd_limited = 1; + *snd_limited = 0; sb->cur_rxt_hole = scoreboard_hole_index (sb, hole); } /* Rule (4): if hole beyond high_sacked */ @@ -705,7 +771,7 @@ scoreboard_next_rxt_hole (sack_scoreboard_t * sb, return hole; } -void +static void scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 seq) { sack_scoreboard_hole_t *hole; @@ -718,18 +784,47 @@ scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 seq) sb->high_rxt = seq; } +void +scoreboard_init (sack_scoreboard_t * sb) +{ + sb->head = TCP_INVALID_SACK_HOLE_INDEX; + sb->tail = TCP_INVALID_SACK_HOLE_INDEX; + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; +} + +void +scoreboard_clear (sack_scoreboard_t * sb) +{ + sack_scoreboard_hole_t *hole; + while ((hole = scoreboard_first_hole (sb))) + { + scoreboard_remove_hole (sb, hole); + } + ASSERT (sb->head == sb->tail && sb->head == TCP_INVALID_SACK_HOLE_INDEX); + ASSERT (pool_elts (sb->holes) == 0); + sb->sacked_bytes = 0; + sb->last_sacked_bytes = 0; + sb->last_bytes_delivered = 0; + sb->snd_una_adv = 0; + sb->high_sacked = 0; + sb->high_rxt = 0; + sb->lost_bytes = 0; + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; +} + /** * Test that scoreboard is sane after recovery * * Returns 1 if scoreboard is empty or if first hole beyond * snd_una. */ -u8 +static u8 tcp_scoreboard_is_sane_post_recovery (tcp_connection_t * tc) { sack_scoreboard_hole_t *hole; hole = scoreboard_first_hole (&tc->sack_sb); - return (!hole || seq_geq (hole->start, tc->snd_una)); + return (!hole || (seq_geq (hole->start, tc->snd_una) + && seq_lt (hole->end, tc->snd_una_max))); } void @@ -742,14 +837,15 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) int i, j; sb->last_sacked_bytes = 0; - sb->snd_una_adv = 0; - old_sacked_bytes = sb->sacked_bytes; sb->last_bytes_delivered = 0; + sb->snd_una_adv = 0; if (!tcp_opts_sack (&tc->rcv_opts) && sb->head == TCP_INVALID_SACK_HOLE_INDEX) return; + old_sacked_bytes = sb->sacked_bytes; + /* Remove invalid blocks */ blk = tc->rcv_opts.sacks; while (blk < vec_end (tc->rcv_opts.sacks)) @@ -887,14 +983,22 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) } } + if (pool_elts (sb->holes) == 1) + { + hole = scoreboard_first_hole (sb); + if (hole->start == ack + sb->snd_una_adv + && hole->end == tc->snd_una_max) + scoreboard_remove_hole (sb, hole); + } + scoreboard_update_bytes (tc, sb); sb->last_sacked_bytes = sb->sacked_bytes - (old_sacked_bytes - sb->last_bytes_delivered); ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc)); - ASSERT (sb->sacked_bytes == 0 + ASSERT (sb->sacked_bytes == 0 || tcp_in_recovery (tc) || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack)); ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max - - seq_max (tc->snd_una, ack)); + - seq_max (tc->snd_una, ack) || tcp_in_recovery (tc)); ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc) || sb->holes[sb->head].start == ack + sb->snd_una_adv); TCP_EVT_DBG (TCP_EVT_CC_SCOREBOARD, tc); @@ -950,6 +1054,9 @@ tcp_cc_init_congestion (tcp_connection_t * tc) tcp_fastrecovery_on (tc); tc->snd_congestion = tc->snd_una_max; tc->cwnd_acc_bytes = 0; + tc->snd_rxt_bytes = 0; + tc->prev_ssthresh = tc->ssthresh; + tc->prev_cwnd = tc->cwnd; tc->cc_algo->congestion (tc); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4); } @@ -972,8 +1079,12 @@ tcp_cc_fastrecovery_exit (tcp_connection_t * tc) tc->snd_rxt_bytes = 0; tc->rcv_dupacks = 0; tc->snd_nxt = tc->snd_una_max; + tc->snd_rxt_bytes = 0; + tcp_fastrecovery_off (tc); tcp_fastrecovery_1_smss_off (tc); + tcp_fastrecovery_first_off (tc); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); } @@ -986,13 +1097,14 @@ tcp_cc_congestion_undo (tcp_connection_t * tc) tc->rcv_dupacks = 0; if (tcp_in_recovery (tc)) tcp_cc_recovery_exit (tc); + else if (tcp_in_fastrecovery (tc)) + tcp_cc_fastrecovery_exit (tc); ASSERT (tc->rto_boff == 0); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 5); - /* TODO extend for fastrecovery */ } -static u8 -tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) +static inline u8 +tcp_cc_is_spurious_timeout_rxt (tcp_connection_t * tc) { return (tcp_in_recovery (tc) && tc->rto_boff == 1 && tc->snd_rxt_ts @@ -1000,7 +1112,21 @@ tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts)); } -int +static inline u8 +tcp_cc_is_spurious_fast_rxt (tcp_connection_t * tc) +{ + return (tcp_in_fastrecovery (tc) + && tc->cwnd > tc->ssthresh + 3 * tc->snd_mss); +} + +static u8 +tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) +{ + return (tcp_cc_is_spurious_timeout_rxt (tc) + || tcp_cc_is_spurious_fast_rxt (tc)); +} + +static int tcp_cc_recover (tcp_connection_t * tc) { ASSERT (tcp_in_cong_recovery (tc)); @@ -1027,8 +1153,7 @@ tcp_cc_update (tcp_connection_t * tc, vlib_buffer_t * b) ASSERT (!tcp_in_cong_recovery (tc) || tcp_is_lost_fin (tc)); /* Congestion avoidance */ - tc->cc_algo->rcv_ack (tc); - tc->tsecr_last_ack = tc->rcv_opts.tsecr; + tcp_cc_rcv_ack (tc); /* If a cumulative ack, make sure dupacks is 0 */ tc->rcv_dupacks = 0; @@ -1056,6 +1181,75 @@ tcp_should_fastrecover (tcp_connection_t * tc) || tcp_should_fastrecover_sack (tc)); } +void +tcp_program_fastretransmit (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) +{ + if (!(tc->flags & TCP_CONN_FRXT_PENDING)) + { + vec_add1 (wrk->pending_fast_rxt, tc->c_c_index); + tc->flags |= TCP_CONN_FRXT_PENDING; + } +} + +void +tcp_do_fastretransmits (tcp_worker_ctx_t * wrk) +{ + u32 *ongoing_fast_rxt, burst_bytes, sent_bytes, thread_index; + u32 max_burst_size, burst_size, n_segs = 0, n_segs_now; + tcp_connection_t *tc; + u64 last_cpu_time; + int i; + + if (vec_len (wrk->pending_fast_rxt) == 0 + && vec_len (wrk->postponed_fast_rxt) == 0) + return; + + thread_index = wrk->vm->thread_index; + last_cpu_time = wrk->vm->clib_time.last_cpu_time; + ongoing_fast_rxt = wrk->ongoing_fast_rxt; + vec_append (ongoing_fast_rxt, wrk->postponed_fast_rxt); + vec_append (ongoing_fast_rxt, wrk->pending_fast_rxt); + + _vec_len (wrk->postponed_fast_rxt) = 0; + _vec_len (wrk->pending_fast_rxt) = 0; + + max_burst_size = VLIB_FRAME_SIZE / vec_len (ongoing_fast_rxt); + max_burst_size = clib_max (max_burst_size, 1); + + for (i = 0; i < vec_len (ongoing_fast_rxt); i++) + { + if (n_segs >= VLIB_FRAME_SIZE) + { + vec_add1 (wrk->postponed_fast_rxt, ongoing_fast_rxt[i]); + continue; + } + + tc = tcp_connection_get (ongoing_fast_rxt[i], thread_index); + tc->flags &= ~TCP_CONN_FRXT_PENDING; + + if (!tcp_in_fastrecovery (tc)) + continue; + + burst_size = clib_min (max_burst_size, VLIB_FRAME_SIZE - n_segs); + burst_bytes = transport_connection_tx_pacer_burst (&tc->connection, + last_cpu_time); + burst_size = clib_min (burst_size, burst_bytes / tc->snd_mss); + if (!burst_size) + { + tcp_program_fastretransmit (wrk, tc); + continue; + } + + n_segs_now = tcp_fast_retransmit (wrk, tc, burst_size); + sent_bytes = clib_min (n_segs_now * tc->snd_mss, burst_bytes); + transport_connection_tx_pacer_update_bytes (&tc->connection, + sent_bytes); + n_segs += n_segs_now; + } + _vec_len (ongoing_fast_rxt) = 0; + wrk->ongoing_fast_rxt = ongoing_fast_rxt; +} + /** * One function to rule them all ... and in the darkness bind them */ @@ -1068,7 +1262,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) { if (tc->bytes_acked) goto partial_ack; - tcp_fast_retransmit (tc); + tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index), tc); return; } /* @@ -1092,22 +1286,14 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) } else if (tcp_should_fastrecover (tc)) { + u32 pacer_wnd; + ASSERT (!tcp_in_fastrecovery (tc)); - /* If of of the two conditions lower hold, reset dupacks because - * we're probably after timeout (RFC6582 heuristics). - * If Cumulative ack does not cover more than congestion threshold, - * and: - * 1) The following doesn't hold: The congestion window is greater - * than SMSS bytes and the difference between highest_ack - * and prev_highest_ack is at most 4*SMSS bytes - * 2) Echoed timestamp in the last non-dup ack does not equal the - * stored timestamp - */ - if (seq_leq (tc->snd_una, tc->snd_congestion) - && ((!(tc->cwnd > tc->snd_mss - && tc->bytes_acked <= 4 * tc->snd_mss)) - || (tc->rcv_opts.tsecr != tc->tsecr_last_ack))) + /* Heuristic to catch potential late dupacks + * after fast retransmit exits */ + if (is_dack && tc->snd_una == tc->snd_congestion + && timestamp_leq (tc->rcv_opts.tsecr, tc->tsecr_last_ack)) { tc->rcv_dupacks = 0; return; @@ -1116,26 +1302,25 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) tcp_cc_init_congestion (tc); tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); - /* The first segment MUST be retransmitted */ - tcp_retransmit_first_unacked (tc); - - /* Post retransmit update cwnd to ssthresh and account for the - * three segments that have left the network and should've been - * buffered at the receiver XXX */ - tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss; - ASSERT (tc->cwnd >= tc->snd_mss); - - /* If cwnd allows, send more data */ if (tcp_opts_sack_permitted (&tc->rcv_opts)) { - scoreboard_init_high_rxt (&tc->sack_sb, - tc->snd_una + tc->snd_mss); - tcp_fast_retransmit_sack (tc); + tc->cwnd = tc->ssthresh; + scoreboard_init_high_rxt (&tc->sack_sb, tc->snd_una); + tc->sack_sb.rescue_rxt = tc->snd_una - 1; } else { - tcp_fast_retransmit_no_sack (tc); + /* Post retransmit update cwnd to ssthresh and account for the + * three segments that have left the network and should've been + * buffered at the receiver XXX */ + tc->cwnd = tc->ssthresh + 3 * tc->snd_mss; } + + pacer_wnd = clib_max (0.1 * tc->cwnd, 2 * tc->snd_mss); + tcp_connection_tx_pacer_reset (tc, pacer_wnd, + 0 /* start bucket */ ); + tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index), + tc); return; } else if (!tc->bytes_acked @@ -1147,6 +1332,28 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) else goto partial_ack; } + /* Don't allow entry in fast recovery if still in recovery, for now */ + else if (0 && is_dack && tcp_in_recovery (tc)) + { + /* If of of the two conditions lower hold, reset dupacks because + * we're probably after timeout (RFC6582 heuristics). + * If Cumulative ack does not cover more than congestion threshold, + * and: + * 1) The following doesn't hold: The congestion window is greater + * than SMSS bytes and the difference between highest_ack + * and prev_highest_ack is at most 4*SMSS bytes + * 2) Echoed timestamp in the last non-dup ack does not equal the + * stored timestamp + */ + if (seq_leq (tc->snd_una, tc->snd_congestion) + && ((!(tc->cwnd > tc->snd_mss + && tc->bytes_acked <= 4 * tc->snd_mss)) + || (tc->rcv_opts.tsecr != tc->tsecr_last_ack))) + { + tc->rcv_dupacks = 0; + return; + } + } if (!tc->bytes_acked) return; @@ -1157,11 +1364,11 @@ partial_ack: /* * Legitimate ACK. 1) See if we can exit recovery */ - /* XXX limit this only to first partial ack? */ - tcp_retransmit_timer_update (tc); if (seq_geq (tc->snd_una, tc->snd_congestion)) { + tcp_retransmit_timer_update (tc); + /* If spurious return, we've already updated everything */ if (tcp_cc_recover (tc)) { @@ -1172,8 +1379,7 @@ partial_ack: tc->snd_nxt = tc->snd_una_max; /* Treat as congestion avoidance ack */ - tc->cc_algo->rcv_ack (tc); - tc->tsecr_last_ack = tc->rcv_opts.tsecr; + tcp_cc_rcv_ack (tc); return; } @@ -1181,6 +1387,9 @@ partial_ack: * Legitimate ACK. 2) If PARTIAL ACK try to retransmit */ + /* XXX limit this only to first partial ack? */ + tcp_retransmit_timer_force_update (tc); + /* RFC6675: If the incoming ACK is a cumulative acknowledgment, * reset dupacks to 0. Also needed if in congestion recovery */ tc->rcv_dupacks = 0; @@ -1188,30 +1397,39 @@ partial_ack: /* Post RTO timeout don't try anything fancy */ if (tcp_in_recovery (tc)) { - tc->cc_algo->rcv_ack (tc); - tc->tsecr_last_ack = tc->rcv_opts.tsecr; + tcp_cc_rcv_ack (tc); + transport_add_tx_event (&tc->connection); return; } /* Remove retransmitted bytes that have been delivered */ - ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv - >= tc->sack_sb.last_bytes_delivered - || (tc->flags & TCP_CONN_FINSNT)); - - if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { + ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv + >= tc->sack_sb.last_bytes_delivered + || (tc->flags & TCP_CONN_FINSNT)); + /* If we have sacks and we haven't gotten an ack beyond high_rxt, * remove sacked bytes delivered */ - rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv - - tc->sack_sb.last_bytes_delivered; - ASSERT (tc->snd_rxt_bytes >= rxt_delivered); - tc->snd_rxt_bytes -= rxt_delivered; + if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt)) + { + rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv + - tc->sack_sb.last_bytes_delivered; + ASSERT (tc->snd_rxt_bytes >= rxt_delivered); + tc->snd_rxt_bytes -= rxt_delivered; + } + else + { + /* Apparently all retransmitted holes have been acked */ + tc->snd_rxt_bytes = 0; + } } else { - /* Either all retransmitted holes have been acked, or we're - * "in the blind" and retransmitting segment by segment */ - tc->snd_rxt_bytes = 0; + if (tc->snd_rxt_bytes > tc->bytes_acked) + tc->snd_rxt_bytes -= tc->bytes_acked; + else + tc->snd_rxt_bytes = 0; } tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); @@ -1219,14 +1437,7 @@ partial_ack: /* * Since this was a partial ack, try to retransmit some more data */ - tcp_fast_retransmit (tc); -} - -void -tcp_cc_init (tcp_connection_t * tc) -{ - tc->cc_algo = tcp_cc_algo_get (TCP_CC_NEWRENO); - tc->cc_algo->init (tc); + tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index), tc); } /** @@ -1247,10 +1458,11 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, /* When we entered recovery, we reset snd_nxt to snd_una. Seems peer * still has the data so accept the ack */ if (tcp_in_recovery (tc) - && seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_congestion) - && seq_geq (vnet_buffer (b)->tcp.ack_number, tc->snd_una)) + && seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_congestion)) { - tc->snd_una_max = tc->snd_nxt = vnet_buffer (b)->tcp.ack_number; + tc->snd_nxt = vnet_buffer (b)->tcp.ack_number; + if (seq_gt (tc->snd_nxt, tc->snd_una_max)) + tc->snd_una_max = tc->snd_nxt; goto process_ack; } @@ -1317,7 +1529,9 @@ process_ack: if (!tcp_in_cong_recovery (tc)) return 0; *error = TCP_ERROR_ACK_DUP; - return vnet_buffer (b)->tcp.data_len ? 0 : -1; + if (vnet_buffer (b)->tcp.data_len || tcp_is_fin (th)) + return 0; + return -1; } /* @@ -1415,7 +1629,7 @@ tcp_sack_list_bytes (tcp_connection_t * tc) } /** Enqueue data for delivery to application */ -always_inline int +static int tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, u16 data_len) { @@ -1470,7 +1684,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, } /** Enqueue out-of-order data */ -always_inline int +static int tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, u16 data_len) { @@ -1648,7 +1862,7 @@ typedef struct tcp_connection_t tcp_connection; } tcp_rx_trace_t; -u8 * +static u8 * format_tcp_rx_trace (u8 * s, va_list * args) { CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); @@ -1664,7 +1878,7 @@ format_tcp_rx_trace (u8 * s, va_list * args) return s; } -u8 * +static u8 * format_tcp_rx_trace_short (u8 * s, va_list * args) { CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); @@ -1672,14 +1886,14 @@ format_tcp_rx_trace_short (u8 * s, va_list * args) tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *); s = format (s, "%d -> %d (%U)", - clib_net_to_host_u16 (t->tcp_header.src_port), - clib_net_to_host_u16 (t->tcp_header.dst_port), format_tcp_state, + clib_net_to_host_u16 (t->tcp_header.dst_port), + clib_net_to_host_u16 (t->tcp_header.src_port), format_tcp_state, t->tcp_connection.state); return s; } -void +static void tcp_set_rx_trace_data (tcp_rx_trace_t * t0, tcp_connection_t * tc0, tcp_header_t * th0, vlib_buffer_t * b0, u8 is_ip4) { @@ -1694,6 +1908,40 @@ tcp_set_rx_trace_data (tcp_rx_trace_t * t0, tcp_connection_t * tc0, clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); } +static void +tcp_established_trace_frame (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame, u8 is_ip4) +{ + u32 *from, n_left; + + n_left = frame->n_vectors; + from = vlib_frame_vector_args (frame); + + while (n_left >= 1) + { + tcp_connection_t *tc0; + tcp_rx_trace_t *t0; + tcp_header_t *th0; + vlib_buffer_t *b0; + u32 bi0; + + bi0 = from[0]; + b0 = vlib_get_buffer (vm, bi0); + + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, + vm->thread_index); + th0 = tcp_buffer_hdr (b0); + tcp_set_rx_trace_data (t0, tc0, th0, b0, is_ip4); + } + + from += 1; + n_left -= 1; + } +} + always_inline void tcp_node_inc_counter_i (vlib_main_t * vm, u32 tcp4_node, u32 tcp6_node, u8 is_ip4, u32 evt, u32 val) @@ -1734,15 +1982,18 @@ tcp_node_inc_counter_i (vlib_main_t * vm, u32 tcp4_node, u32 tcp6_node, always_inline uword tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, - vlib_frame_t * from_frame, int is_ip4) + vlib_frame_t * frame, int is_ip4) { - u32 my_thread_index = vm->thread_index, errors = 0; + u32 thread_index = vm->thread_index, errors = 0; u32 n_left_from, next_index, *from, *to_next; u16 err_counters[TCP_N_ERROR] = { 0 }; u8 is_fin = 0; - from = vlib_frame_vector_args (from_frame); - n_left_from = from_frame->n_vectors; + if (node->flags & VLIB_NODE_FLAG_TRACE) + tcp_established_trace_frame (vm, node, frame, is_ip4); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; next_index = node->cached_next_index; while (n_left_from > 0) @@ -1775,7 +2026,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, b0 = vlib_get_buffer (vm, bi0); tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, - my_thread_index); + thread_index); if (PREDICT_FALSE (tc0 == 0)) { @@ -1838,13 +2089,6 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, done: b0->error = node->errors[error0]; - if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) - { - tcp_rx_trace_t *t0 = vlib_add_trace (vm, node, b0, - sizeof (*t0)); - tcp_set_rx_trace_data (t0, tc0, th0, b0, is_ip4); - } - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, bi0, next0); } @@ -1853,11 +2097,12 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } errors = session_manager_flush_enqueue_events (TRANSPORT_PROTO_TCP, - my_thread_index); + thread_index); err_counters[TCP_ERROR_EVENT_FIFO_FULL] = errors; tcp_store_err_counters (established, err_counters); - tcp_flush_frame_to_output (vm, my_thread_index, is_ip4); - return from_frame->n_vectors; + tcp_flush_frame_to_output (tcp_get_worker (thread_index), is_ip4); + + return frame->n_vectors; } static uword @@ -2167,8 +2412,8 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (tcp_opts_wscale (&new_tc0->rcv_opts)) new_tc0->snd_wscale = new_tc0->rcv_opts.wscale; - /* RFC1323: SYN and SYN-ACK wnd not scaled */ - new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window); + new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window) + << new_tc0->snd_wscale; new_tc0->snd_wl1 = seq0; new_tc0->snd_wl2 = ack0; @@ -2557,9 +2802,12 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tc0->state = TCP_STATE_CLOSED; TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); - /* Delete the connection/session since the pipes should be - * clear by now */ - tcp_connection_del (tc0); + + /* Don't free the connection from the data path since + * we can't ensure that we have no packets already enqueued + * to output. Rely instead on the waitclose timer */ + tcp_connection_timers_reset (tc0); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, 1); goto drop; @@ -2620,10 +2868,10 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, case TCP_STATE_SYN_RCVD: /* Send FIN-ACK notify app and enter CLOSE-WAIT */ tcp_connection_timers_reset (tc0); - tcp_retransmit_timer_set (tc0); tcp_make_fin (tc0, b0); tc0->snd_nxt += 1; tc0->snd_una_max = tc0->snd_nxt; + tcp_retransmit_timer_set (tc0); next0 = tcp_next_output (tc0->c_is_ip4); stream_session_disconnect_notify (&tc0->connection); tc0->state = TCP_STATE_CLOSE_WAIT; @@ -3007,22 +3255,23 @@ typedef enum _tcp_input_next static void tcp_input_trace_frame (vlib_main_t * vm, vlib_node_runtime_t * node, - vlib_buffer_t ** bufs, u32 n_bufs, u8 is_ip4) + vlib_buffer_t ** bs, u32 n_bufs, u8 is_ip4) { tcp_connection_t *tc; tcp_header_t *tcp; tcp_rx_trace_t *t; - u32 n_trace; int i; - n_trace = vlib_get_trace_count (vm, node); - for (i = 0; i < clib_min (n_trace, n_bufs); i++) + for (i = 0; i < n_bufs; i++) { - t = vlib_add_trace (vm, node, bufs[i], sizeof (*t)); - tc = tcp_connection_get (vnet_buffer (bufs[i])->tcp.connection_index, - vm->thread_index); - tcp = vlib_buffer_get_current (bufs[i]); - tcp_set_rx_trace_data (t, tc, tcp, bufs[i], is_ip4); + if (bs[i]->flags & VLIB_BUFFER_IS_TRACED) + { + t = vlib_add_trace (vm, node, bs[i], sizeof (*t)); + tc = tcp_connection_get (vnet_buffer (bs[i])->tcp.connection_index, + vm->thread_index); + tcp = vlib_buffer_get_current (bs[i]); + tcp_set_rx_trace_data (t, tc, tcp, bs[i], is_ip4); + } } } @@ -3142,7 +3391,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b; u16 nexts[VLIB_FRAME_SIZE], *next; - tcp_set_time_now (thread_index); + tcp_set_time_now (tcp_get_worker (thread_index)); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -3325,7 +3574,11 @@ do { \ /* ACK for for a SYN-ACK -> tcp-rcv-process. */ _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(SYN_RCVD, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); _(SYN_RCVD, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); /* SYN-ACK for a SYN */ _(SYN_SENT, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE); @@ -3366,6 +3619,7 @@ do { \ _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(LAST_ACK, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(TIME_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); @@ -3378,7 +3632,7 @@ do { \ #undef _ } -clib_error_t * +static clib_error_t * tcp_input_init (vlib_main_t * vm) { clib_error_t *error = 0;