data = (const u8 *) (th + 1);
/* Zero out all flags but those set in SYN */
- to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE);
+ to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE
+ | TCP_OPTS_FLAG_SACK);
for (; opts_len > 0; opts_len -= opt_len, data += opt_len)
{
{
ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval));
tc->tsval_recent = tc->rcv_opts.tsval;
- tc->tsval_recent_age = tcp_time_now ();
+ tc->tsval_recent_age = tcp_time_now_w_thread (tc->c_thread_index);
}
}
/* If a segment fails PAWS but tsval_recent is over 24 days old,
 * tsval_recent is stale: invalidate it. */
if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE,
- tcp_time_now ()))
+ tcp_time_now_w_thread (tc0->c_thread_index)))
{
/* Age isn't reset until we get a valid tsval (BSD-inspired) */
tc0->tsval_recent = 0;
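The PAWS check above relies on wrap-safe timestamp comparison. A minimal
sketch of the semantics assumed for the timestamp helpers (the stack
defines these as macros; this is illustration only):

/* Serial-number arithmetic: t1 < t2 iff (t1 - t2) wraps negative */
static inline u8
timestamp_lt_sketch (u32 t1, u32 t2)
{
  return ((i32) (t1 - t2) < 0);
}

Assuming 1 ms ticks (THZ = 1000), 24 days is 24 * 86400 * 1000, roughly
2.07e9 ticks, just under the 2^31 point at which the signed comparison
above would change sign, which is why TCP_PAWS_IDLE sits at 24 days.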
if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq))
{
- mrtt = tcp_time_now () - tc->rtt_ts;
+ tc->mrtt_us = tcp_time_now_us (tc->c_thread_index) - tc->rtt_ts;
+ mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1);
}
/* As per RFC7323 TSecr can be used for RTTM only if the segment advances
* snd_una, i.e., the left side of the send window:
* seq_lt (tc->snd_una, ack). This is a condition for calling update_rtt */
else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr)
{
- mrtt = tcp_time_now () - tc->rcv_opts.tsecr;
+ u32 now = tcp_time_now_w_thread (tc->c_thread_index);
+ mrtt = clib_max (now - tc->rcv_opts.tsecr, 1);
}
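The mrtt samples computed above feed the usual RFC 6298 smoothing. A
minimal sketch of that update, assuming srtt/rttvar connection fields and
the standard 1/8 and 1/4 gains (the stack's own estimator may differ in
fixed-point detail):

/* srtt += (mrtt - srtt)/8; rttvar += (|mrtt - srtt| - rttvar)/4 */
static void
rtt_update_sketch (tcp_connection_t * tc, u32 mrtt)
{
  int err;
  if (tc->srtt == 0)
    {
      /* First sample seeds the estimator */
      tc->srtt = mrtt;
      tc->rttvar = mrtt >> 1;
      return;
    }
  err = mrtt - tc->srtt;
  tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1);
  tc->rttvar = clib_max ((int) tc->rttvar
			 + ((clib_abs (err) - (int) tc->rttvar) >> 2), 1);
}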
/* Ignore dubious measurements */
/* Poison the entry */
if (CLIB_DEBUG > 0)
- memset (hole, 0xfe, sizeof (*hole));
+ clib_memset (hole, 0xfe, sizeof (*hole));
pool_put (sb->holes, hole);
}
u32 hole_index;
pool_get (sb->holes, hole);
- memset (hole, 0, sizeof (*hole));
+ clib_memset (hole, 0, sizeof (*hole));
hole->start = start;
hole->end = end;
static void
scoreboard_update_bytes (tcp_connection_t * tc, sack_scoreboard_t * sb)
{
- sack_scoreboard_hole_t *hole, *prev;
+ sack_scoreboard_hole_t *left, *right;
u32 bytes = 0, blks = 0;
sb->lost_bytes = 0;
sb->sacked_bytes = 0;
- hole = scoreboard_last_hole (sb);
- if (!hole)
+ left = scoreboard_last_hole (sb);
+ if (!left)
return;
- if (seq_gt (sb->high_sacked, hole->end))
+ if (seq_gt (sb->high_sacked, left->end))
{
- bytes = sb->high_sacked - hole->end;
+ bytes = sb->high_sacked - left->end;
blks = 1;
}
- while ((prev = scoreboard_prev_hole (sb, hole))
- && (bytes < (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss
- && blks < TCP_DUPACK_THRESHOLD))
+ while ((right = left)
+ && bytes < (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss
+ && blks < TCP_DUPACK_THRESHOLD
+ /* left not updated if above conditions fail */
+ && (left = scoreboard_prev_hole (sb, right)))
{
- bytes += hole->start - prev->end;
+ bytes += right->start - left->end;
blks++;
- hole = prev;
}
- while (hole)
+ /* left is first lost */
+ if (left)
{
- sb->lost_bytes += scoreboard_hole_bytes (hole);
- hole->is_lost = 1;
- prev = hole;
- hole = scoreboard_prev_hole (sb, hole);
- if (hole)
- bytes += prev->start - hole->end;
+ do
+ {
+ sb->lost_bytes += scoreboard_hole_bytes (right);
+ left->is_lost = 1;
+ left = scoreboard_prev_hole (sb, right);
+ if (left)
+ bytes += right->start - left->end;
+ }
+ while ((right = left));
}
+
sb->sacked_bytes = bytes;
}
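A worked example of the accounting above, with assumed numbers
(snd_mss = 1460, TCP_DUPACK_THRESHOLD = 3, high_sacked = 11680):

/* Holes (un-SACKed gaps), oldest on the left:
 *   A [1460, 2920)    B [5840, 7300)    C [8760, 10220)
 * Walking right to left: bytes = 11680 - 10220 = 1460, blks = 1;
 * then bytes += 8760 - 7300 -> 2920, blks = 2. The loop guard
 * (bytes < (3 - 1) * 1460) now fails with left = B, i.e. B is the
 * first hole with enough SACKed data above it, per the RFC 6675
 * IsLost() criterion. The do-while then marks B and A lost
 * (lost_bytes = 2920) and finishes the gap sum, giving
 * sacked_bytes = 2920 + 1460 + 1460 = 5840. */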
/* Rule (3): if hole not lost */
else if (seq_lt (hole->start, sb->high_sacked))
{
- *snd_limited = 1;
+ *snd_limited = 0;
sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
}
/* Rule (4): if hole beyond high_sacked */
{
sack_scoreboard_hole_t *hole;
hole = scoreboard_first_hole (&tc->sack_sb);
- return (!hole || seq_geq (hole->start, tc->snd_una));
+ return (!hole || (seq_geq (hole->start, tc->snd_una)
+ && seq_lt (hole->end, tc->snd_una_max)));
}
void
}
}
+ if (pool_elts (sb->holes) == 1)
+ {
+ hole = scoreboard_first_hole (sb);
+ if (hole->start == ack + sb->snd_una_adv
+ && hole->end == tc->snd_una_max)
+ scoreboard_remove_hole (sb, hole);
+ }
+
scoreboard_update_bytes (tc, sb);
sb->last_sacked_bytes = sb->sacked_bytes
- (old_sacked_bytes - sb->last_bytes_delivered);
ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc));
- ASSERT (sb->sacked_bytes == 0
+ ASSERT (sb->sacked_bytes == 0 || tcp_in_recovery (tc)
|| sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack));
ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max
- - seq_max (tc->snd_una, ack));
+ - seq_max (tc->snd_una, ack) || tcp_in_recovery (tc));
ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc)
|| sb->holes[sb->head].start == ack + sb->snd_una_adv);
TCP_EVT_DBG (TCP_EVT_CC_SCOREBOARD, tc);
tcp_fastrecovery_on (tc);
tc->snd_congestion = tc->snd_una_max;
tc->cwnd_acc_bytes = 0;
+ tc->snd_rxt_bytes = 0;
+ tc->prev_ssthresh = tc->ssthresh;
+ tc->prev_cwnd = tc->cwnd;
tc->cc_algo->congestion (tc);
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4);
}
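prev_ssthresh and prev_cwnd snapshot the pre-loss window so that the
congestion-undo path further down can restore it if the retransmit is
later classified as spurious. Worked numbers, assuming a cc algorithm
that halves the window on congestion:

/* Before loss: cwnd = 29200, ssthresh = 65535. On congestion entry,
 * prev_cwnd = 29200 and prev_ssthresh = 65535 are saved, then the cc
 * algorithm sets ssthresh = 14600 and deflates cwnd. A later spurious
 * classification simply restores cwnd and ssthresh from the saved
 * values instead of re-probing with slow start. */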
tc->snd_rxt_bytes = 0;
tc->rcv_dupacks = 0;
tc->snd_nxt = tc->snd_una_max;
+
tcp_fastrecovery_off (tc);
tcp_fastrecovery_1_smss_off (tc);
+ tcp_fastrecovery_first_off (tc);
+
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
}
tc->rcv_dupacks = 0;
if (tcp_in_recovery (tc))
tcp_cc_recovery_exit (tc);
+ else if (tcp_in_fastrecovery (tc))
+ tcp_cc_fastrecovery_exit (tc);
ASSERT (tc->rto_boff == 0);
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 5);
- /* TODO extend for fastrecovery */
}
-static u8
-tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
+static inline u8
+tcp_cc_is_spurious_timeout_rxt (tcp_connection_t * tc)
{
return (tcp_in_recovery (tc) && tc->rto_boff == 1
&& tc->snd_rxt_ts
&& timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts));
}
+static inline u8
+tcp_cc_is_spurious_fast_rxt (tcp_connection_t * tc)
+{
+ return (tcp_in_fastrecovery (tc)
+ && tc->cwnd > tc->ssthresh + 3 * tc->snd_mss);
+}
+
+static u8
+tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
+{
+ return (tcp_cc_is_spurious_timeout_rxt (tc)
+ || tcp_cc_is_spurious_fast_rxt (tc));
+}
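Both checks are heuristics. The timeout variant is Eifel-style: if the
first ack after a single RTO (rto_boff == 1) echoes a timestamp older
than the retransmit's own send time, it must acknowledge the original
transmission. A worked example with assumed tick values:

/* Original segment sent carrying tsval = 5000; the RTO fires and the
 * segment is retransmitted at snd_rxt_ts = 5300. An ack then arrives
 * echoing tsecr = 5000. timestamp_lt (5000, 5300) holds, so the ack
 * covers the original transmission: the timeout was spurious and the
 * saved cwnd/ssthresh can be restored. The fast-retransmit variant
 * instead flags recovery as spurious when cwnd is still inflated past
 * ssthresh + 3 * snd_mss. */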
+
static int
tcp_cc_recover (tcp_connection_t * tc)
{
ASSERT (!tcp_in_cong_recovery (tc) || tcp_is_lost_fin (tc));
/* Congestion avoidance */
- tc->cc_algo->rcv_ack (tc);
- tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+ tcp_cc_rcv_ack (tc);
/* If a cumulative ack, make sure dupacks is 0 */
tc->rcv_dupacks = 0;
|| tcp_should_fastrecover_sack (tc));
}
+void
+tcp_program_fastretransmit (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
+{
+ if (!(tc->flags & TCP_CONN_FRXT_PENDING))
+ {
+ vec_add1 (wrk->pending_fast_rxt, tc->c_c_index);
+ tc->flags |= TCP_CONN_FRXT_PENDING;
+ }
+}
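The TCP_CONN_FRXT_PENDING flag keeps a connection from being queued more
than once per dispatch. The intended pattern, sketched under the
assumption that the established node drains the list once per frame:

/* per ack, possibly many times per frame (deduplicated by the flag): */
tcp_program_fastretransmit (wrk, tc);
/* once, after all acks in the frame have updated the scoreboard: */
tcp_do_fastretransmits (wrk);

Deferring the actual retransmit until the whole frame is processed means
a burst of dupacks yields one coherent retransmit burst per connection
rather than one send per ack.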
+
+void
+tcp_do_fastretransmits (tcp_worker_ctx_t * wrk)
+{
+ u32 *ongoing_fast_rxt, burst_bytes, sent_bytes, thread_index;
+ u32 max_burst_size, burst_size, n_segs = 0, n_segs_now;
+ tcp_connection_t *tc;
+ u64 last_cpu_time;
+ int i;
+
+ if (vec_len (wrk->pending_fast_rxt) == 0
+ && vec_len (wrk->postponed_fast_rxt) == 0)
+ return;
+
+ thread_index = wrk->vm->thread_index;
+ last_cpu_time = wrk->vm->clib_time.last_cpu_time;
+ ongoing_fast_rxt = wrk->ongoing_fast_rxt;
+ vec_append (ongoing_fast_rxt, wrk->postponed_fast_rxt);
+ vec_append (ongoing_fast_rxt, wrk->pending_fast_rxt);
+
+ _vec_len (wrk->postponed_fast_rxt) = 0;
+ _vec_len (wrk->pending_fast_rxt) = 0;
+
+ /* Use the local vector: wrk->ongoing_fast_rxt may be stale (and its
+ * stored length zeroed) after the appends above */
+ max_burst_size = VLIB_FRAME_SIZE / vec_len (ongoing_fast_rxt);
+ max_burst_size = clib_max (max_burst_size, 1);
+
+ for (i = 0; i < vec_len (ongoing_fast_rxt); i++)
+ {
+ if (n_segs >= VLIB_FRAME_SIZE)
+ {
+ vec_add1 (wrk->postponed_fast_rxt, ongoing_fast_rxt[i]);
+ continue;
+ }
+
+ tc = tcp_connection_get (ongoing_fast_rxt[i], thread_index);
+ /* Connection may have been removed since it was programmed */
+ if (!tc)
+ continue;
+ tc->flags &= ~TCP_CONN_FRXT_PENDING;
+
+ if (!tcp_in_fastrecovery (tc))
+ continue;
+
+ burst_size = clib_min (max_burst_size, VLIB_FRAME_SIZE - n_segs);
+ burst_bytes = transport_connection_tx_pacer_burst (&tc->connection,
+ last_cpu_time);
+ burst_size = clib_min (burst_size, burst_bytes / tc->snd_mss);
+ if (!burst_size)
+ {
+ tcp_program_fastretransmit (wrk, tc);
+ continue;
+ }
+
+ n_segs_now = tcp_fast_retransmit (wrk, tc, burst_size);
+ sent_bytes = clib_min (n_segs_now * tc->snd_mss, burst_bytes);
+ transport_connection_tx_pacer_update_bytes (&tc->connection,
+ sent_bytes);
+ n_segs += n_segs_now;
+ }
+ _vec_len (ongoing_fast_rxt) = 0;
+ wrk->ongoing_fast_rxt = ongoing_fast_rxt;
+}
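The burst sizing above splits one frame's worth of buffers evenly across
all connections with pending retransmits, then lets the pacer clamp each
share. A worked example with assumed values (VLIB_FRAME_SIZE = 256):

/* 4 connections pending: max_burst_size = 256 / 4 = 64 segments.
 * For one of them, snd_mss = 1460 and the pacer grants
 * burst_bytes = 30000, i.e. 30000 / 1460 = 20 segments, so
 * burst_size = clib_min (64, 20) = 20. A connection whose pacer
 * grants less than one mss is re-programmed and retried on a later
 * dispatch, as is anything beyond the frame's 256-segment budget. */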
+
/**
* One function to rule them all ... and in the darkness bind them
*/
{
if (tc->bytes_acked)
goto partial_ack;
- tcp_fast_retransmit (tc);
+ tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index), tc);
return;
}
/*
}
else if (tcp_should_fastrecover (tc))
{
+ u32 pacer_wnd;
+
ASSERT (!tcp_in_fastrecovery (tc));
- /* If of of the two conditions lower hold, reset dupacks because
- * we're probably after timeout (RFC6582 heuristics).
- * If Cumulative ack does not cover more than congestion threshold,
- * and:
- * 1) The following doesn't hold: The congestion window is greater
- * than SMSS bytes and the difference between highest_ack
- * and prev_highest_ack is at most 4*SMSS bytes
- * 2) Echoed timestamp in the last non-dup ack does not equal the
- * stored timestamp
- */
- if (seq_leq (tc->snd_una, tc->snd_congestion)
- && ((!(tc->cwnd > tc->snd_mss
- && tc->bytes_acked <= 4 * tc->snd_mss))
- || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
+ /* Heuristic to catch potential late dupacks
+ * after fast retransmit exits */
+ if (is_dack && tc->snd_una == tc->snd_congestion
+ && timestamp_leq (tc->rcv_opts.tsecr, tc->tsecr_last_ack))
{
tc->rcv_dupacks = 0;
return;
tcp_cc_init_congestion (tc);
tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
- /* The first segment MUST be retransmitted */
- tcp_retransmit_first_unacked (tc);
-
- /* Post retransmit update cwnd to ssthresh and account for the
- * three segments that have left the network and should've been
- * buffered at the receiver XXX */
- tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss;
- ASSERT (tc->cwnd >= tc->snd_mss);
-
- /* If cwnd allows, send more data */
if (tcp_opts_sack_permitted (&tc->rcv_opts))
{
- scoreboard_init_high_rxt (&tc->sack_sb,
- tc->snd_una + tc->snd_mss);
- tcp_fast_retransmit_sack (tc);
+ tc->cwnd = tc->ssthresh;
+ scoreboard_init_high_rxt (&tc->sack_sb, tc->snd_una);
+ tc->sack_sb.rescue_rxt = tc->snd_una - 1;
}
else
{
- tcp_fast_retransmit_no_sack (tc);
+ /* Post retransmit, update cwnd to ssthresh and account for the
+ * three segments that have left the network and should've been
+ * buffered at the receiver */
+ tc->cwnd = tc->ssthresh + 3 * tc->snd_mss;
}
+
+ pacer_wnd = clib_max (0.1 * tc->cwnd, 2 * tc->snd_mss);
+ tcp_connection_tx_pacer_reset (tc, pacer_wnd,
+ 0 /* start bucket */ );
+ tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index),
+ tc);
return;
}
else if (!tc->bytes_acked
else
goto partial_ack;
}
+ /* For now, don't allow entry into fast recovery while still in recovery */
+ else if (0 && is_dack && tcp_in_recovery (tc))
+ {
+ /* Reset dupacks because we're probably after a timeout (RFC6582
+ * heuristics) if the cumulative ack does not cover more than the
+ * congestion threshold and at least one of the following holds:
+ * 1) it is NOT the case that the congestion window is greater
+ * than SMSS bytes and the difference between highest_ack
+ * and prev_highest_ack is at most 4*SMSS bytes;
+ * 2) the echoed timestamp in the last non-dup ack does not equal
+ * the stored timestamp. */
+ if (seq_leq (tc->snd_una, tc->snd_congestion)
+ && ((!(tc->cwnd > tc->snd_mss
+ && tc->bytes_acked <= 4 * tc->snd_mss))
+ || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
+ {
+ tc->rcv_dupacks = 0;
+ return;
+ }
+ }
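To make the fast-recovery window arithmetic above concrete, worked
numbers assuming snd_mss = 1460, a pre-loss cwnd of 29200 and a cc
algorithm that set ssthresh = 14600 on congestion:

/* SACK path: cwnd = ssthresh = 14600; retransmits are driven by the
 * scoreboard starting from high_rxt = snd_una.
 * NewReno path: cwnd = 14600 + 3 * 1460 = 18980, crediting the three
 * segments that triggered the dupacks and have left the network.
 * Either way the pacer restarts from
 * clib_max (0.1 * cwnd, 2 * snd_mss) = 2920 bytes, so the retransmit
 * burst ramps up instead of being released at once. */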
if (!tc->bytes_acked)
return;
/*
* Legitimate ACK. 1) See if we can exit recovery
*/
- /* XXX limit this only to first partial ack? */
- if (seq_lt (tc->snd_una, tc->snd_congestion))
- tcp_retransmit_timer_force_update (tc);
- else
- tcp_retransmit_timer_update (tc);
if (seq_geq (tc->snd_una, tc->snd_congestion))
{
+ tcp_retransmit_timer_update (tc);
+
/* If spurious return, we've already updated everything */
if (tcp_cc_recover (tc))
{
tc->snd_nxt = tc->snd_una_max;
/* Treat as congestion avoidance ack */
- tc->cc_algo->rcv_ack (tc);
- tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+ tcp_cc_rcv_ack (tc);
return;
}
* Legitimate ACK. 2) If PARTIAL ACK try to retransmit
*/
+ /* XXX limit this only to first partial ack? */
+ tcp_retransmit_timer_force_update (tc);
+
/* RFC6675: If the incoming ACK is a cumulative acknowledgment,
* reset dupacks to 0. Also needed if in congestion recovery */
tc->rcv_dupacks = 0;
/* Post RTO timeout don't try anything fancy */
if (tcp_in_recovery (tc))
{
- tc->cc_algo->rcv_ack (tc);
- tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+ tcp_cc_rcv_ack (tc);
transport_add_tx_event (&tc->connection);
return;
}
/* Remove retransmitted bytes that have been delivered */
- ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv
- >= tc->sack_sb.last_bytes_delivered
- || (tc->flags & TCP_CONN_FINSNT));
-
- if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt))
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
{
+ ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv
+ >= tc->sack_sb.last_bytes_delivered
+ || (tc->flags & TCP_CONN_FINSNT));
+
/* If we have sacks and we haven't gotten an ack beyond high_rxt,
* remove sacked bytes delivered */
- rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv
- - tc->sack_sb.last_bytes_delivered;
- ASSERT (tc->snd_rxt_bytes >= rxt_delivered);
- tc->snd_rxt_bytes -= rxt_delivered;
+ if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt))
+ {
+ rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv
+ - tc->sack_sb.last_bytes_delivered;
+ ASSERT (tc->snd_rxt_bytes >= rxt_delivered);
+ tc->snd_rxt_bytes -= rxt_delivered;
+ }
+ else
+ {
+ /* Apparently all retransmitted holes have been acked */
+ tc->snd_rxt_bytes = 0;
+ }
}
else
{
- /* Either all retransmitted holes have been acked, or we're
- * "in the blind" and retransmitting segment by segment */
- tc->snd_rxt_bytes = 0;
+ if (tc->snd_rxt_bytes > tc->bytes_acked)
+ tc->snd_rxt_bytes -= tc->bytes_acked;
+ else
+ tc->snd_rxt_bytes = 0;
}
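A short worked example of the retransmit-byte bookkeeping above, with
assumed numbers:

/* SACK case: the ack advances snd_una by bytes_acked = 4380 while
 * snd_una stays below high_rxt, and the scoreboard reports
 * last_bytes_delivered = 1460 of those bytes as already delivered via
 * SACK. With snd_una_adv = 0, rxt_delivered = 4380 - 1460 = 2920, so
 * snd_rxt_bytes shrinks by two segments' worth. In the no-SACK case
 * only the cumulative ack is available, so snd_rxt_bytes is reduced
 * by bytes_acked, saturating at zero. */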
tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK);
/*
* Since this was a partial ack, try to retransmit some more data
*/
- tcp_fast_retransmit (tc);
+ tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index), tc);
}
/**
if (!tcp_in_cong_recovery (tc))
return 0;
*error = TCP_ERROR_ACK_DUP;
- return vnet_buffer (b)->tcp.data_len ? 0 : -1;
+ if (vnet_buffer (b)->tcp.data_len || tcp_is_fin (th))
+ return 0;
+ return -1;
}
/*
thread_index);
err_counters[TCP_ERROR_EVENT_FIFO_FULL] = errors;
tcp_store_err_counters (established, err_counters);
- tcp_flush_frame_to_output (vm, thread_index, is_ip4);
+ tcp_flush_frame_to_output (tcp_get_worker (thread_index), is_ip4);
return frame->n_vectors;
}
if (tcp_opts_wscale (&new_tc0->rcv_opts))
new_tc0->snd_wscale = new_tc0->rcv_opts.wscale;
- /* RFC1323: SYN and SYN-ACK wnd not scaled */
- new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window);
+ new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
+ << new_tc0->snd_wscale;
new_tc0->snd_wl1 = seq0;
new_tc0->snd_wl2 = ack0;
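For scale, the shift above turns the 16-bit window field into the full
advertised window; a worked example with an assumed wscale of 7:

/* window = 65535, snd_wscale = 7 => snd_wnd = 65535 << 7 = 8388480
 * bytes (~8 MB); with wscale 0 the shift is a no-op. */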
tc0->state = TCP_STATE_CLOSED;
TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
- /* Delete the connection/session since the pipes should be
- * clear by now */
- tcp_connection_del (tc0);
+
+ /* Don't free the connection from the data path since
+ * we can't ensure that we have no packets already enqueued
+ * to output. Rely instead on the waitclose timer */
+ tcp_connection_timers_reset (tc0);
+ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, 1);
goto drop;
case TCP_STATE_SYN_RCVD:
/* Send FIN-ACK notify app and enter CLOSE-WAIT */
tcp_connection_timers_reset (tc0);
- tcp_retransmit_timer_set (tc0);
tcp_make_fin (tc0, b0);
tc0->snd_nxt += 1;
tc0->snd_una_max = tc0->snd_nxt;
+ tcp_retransmit_timer_set (tc0);
next0 = tcp_next_output (tc0->c_is_ip4);
stream_session_disconnect_notify (&tc0->connection);
tc0->state = TCP_STATE_CLOSE_WAIT;
vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
u16 nexts[VLIB_FRAME_SIZE], *next;
- tcp_set_time_now (thread_index);
+ tcp_set_time_now (tcp_get_worker (thread_index));
from = vlib_frame_vector_args (frame);
n_left_from = frame->n_vectors;
/* ACK for a SYN-ACK -> tcp-rcv-process. */
_(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(SYN_RCVD, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
+ TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
+ TCP_ERROR_NONE);
/* SYN-ACK for a SYN */
_(SYN_SENT, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT,
TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(LAST_ACK, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(TIME_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
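These entries extend the per-state dispatch table consulted by the input
node. A sketch of how each _() line is consumed, assuming an
initialization macro of roughly this shape:

/* #define _(sym, flags, next, err)                                \
 *   do {                                                          \
 *     tm->dispatch_table[TCP_STATE_##sym][flags].next = (next);   \
 *     tm->dispatch_table[TCP_STATE_##sym][flags].error = (err);   \
 *   } while (0);
 * so e.g. a RST+ACK arriving in SYN-RCVD now routes to
 * tcp-rcv-process instead of falling through to the default error. */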