/* Zero out all flags but those set in SYN */
to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE
- | TCP_OPTS_FLAG_TSTAMP | TCP_OPTION_MSS);
+ | TCP_OPTS_FLAG_TSTAMP | TCP_OPTS_FLAG_MSS);
for (; opts_len > 0; opts_len -= opt_len, data += opt_len)
{
if (tcp_in_cong_recovery (tc))
{
/* Accept rtt estimates for samples that have not been retransmitted */
- if ((tc->flags & TCP_CONN_RATE_SAMPLE) && !(rs->flags & TCP_BTS_IS_RXT))
+ if ((tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
+ && !(rs->flags & TCP_BTS_IS_RXT))
{
mrtt = rs->rtt_time * THZ;
goto estimate_rtt;
}
- if (tcp_in_recovery (tc))
- return 0;
goto done;
}
tcp_update_rto (tc);
}
+/**
+ * Check if a connection in congestion recovery is short on send space
+ *
+ * @param tc	tcp connection
+ * @return	1 if in fast recovery and tcp_fastrecovery_prr_snd_space()
+ *		reports less than snd_mss, or if in recovery and
+ *		tcp_available_output_snd_space() reports less than snd_mss.
+ *		0 otherwise
+ */
+always_inline u8
+tcp_recovery_no_snd_space (tcp_connection_t * tc)
+{
+  if (tcp_in_fastrecovery (tc)
+      && tcp_fastrecovery_prr_snd_space (tc) < tc->snd_mss)
+    return 1;
+
+  if (tcp_in_recovery (tc)
+      && tcp_available_output_snd_space (tc) < tc->snd_mss)
+    return 1;
+
+  return 0;
+}
+
/**
* Dequeue bytes for connections that have received acks in last burst
*/
tc = tcp_connection_get (pending_deq_acked[i], thread_index);
tc->flags &= ~TCP_CONN_DEQ_PENDING;
- if (PREDICT_FALSE (!tc->burst_acked))
- continue;
+ if (tc->burst_acked)
+ {
+ /* Dequeue the newly ACKed bytes */
+ session_tx_fifo_dequeue_drop (&tc->connection, tc->burst_acked);
+ tc->burst_acked = 0;
+ tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
+
+ if (PREDICT_FALSE (tc->flags & TCP_CONN_PSH_PENDING))
+ {
+ if (seq_leq (tc->psh_seq, tc->snd_una))
+ tc->flags &= ~TCP_CONN_PSH_PENDING;
+ }
- /* Dequeue the newly ACKed bytes */
- session_tx_fifo_dequeue_drop (&tc->connection, tc->burst_acked);
- tc->burst_acked = 0;
- tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
+ /* If everything has been acked, stop retransmit timer
+ * otherwise update. */
+ tcp_retransmit_timer_update (tc);
- if (PREDICT_FALSE (tc->flags & TCP_CONN_PSH_PENDING))
- {
- if (seq_leq (tc->psh_seq, tc->snd_una))
- tc->flags &= ~TCP_CONN_PSH_PENDING;
+ /* Update pacer based on our new cwnd estimate */
+ tcp_connection_tx_pacer_update (tc);
}
- /* If everything has been acked, stop retransmit timer
- * otherwise update. */
- tcp_retransmit_timer_update (tc);
-
- /* If not congested, update pacer based on our new
- * cwnd estimate */
- if (!tcp_in_fastrecovery (tc))
- tcp_connection_tx_pacer_update (tc);
+ /* Reset the pacer if we've been idle, i.e., no data sent or if
+ * we're in recovery and snd space constrained */
+ if (tc->data_segs_out == tc->prev_dsegs_out
+ || tcp_recovery_no_snd_space (tc))
+ transport_connection_tx_pacer_reset_bucket (&tc->connection,
+ wrk->vm->clib_time.
+ last_cpu_time);
+ tc->prev_dsegs_out = tc->data_segs_out;
}
_vec_len (wrk->pending_deq_acked) = 0;
}
tc->burst_acked += tc->bytes_acked;
}
-/**
- * Check if duplicate ack as per RFC5681 Sec. 2
- */
-static u8
-tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd,
- u32 prev_snd_una)
-{
- return ((vnet_buffer (b)->tcp.ack_number == prev_snd_una)
- && seq_gt (tc->snd_nxt, tc->snd_una)
- && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number)
- && (prev_snd_wnd == tc->snd_wnd));
-}
-
-/**
- * Checks if ack is a congestion control event.
- */
-static u8
-tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b,
- u32 prev_snd_wnd, u32 prev_snd_una, u8 * is_dack)
-{
- /* Check if ack is duplicate. Per RFC 6675, ACKs that SACK new data are
- * defined to be 'duplicate' */
- *is_dack = tc->sack_sb.last_sacked_bytes
- || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una);
-
- return ((*is_dack || tcp_in_cong_recovery (tc)) && !tcp_is_lost_fin (tc));
-}
-
#ifndef CLIB_MARCH_VARIANT
static u32
scoreboard_hole_index (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
return hole;
}
-#endif /* CLIB_MARCH_VARIANT */
-#ifndef CLIB_MARCH_VARIANT
+/**
+ * Account for newly sacked bytes that lie below high_rxt
+ *
+ * Adds to sb->rxt_sacked the portion of the range between start and end
+ * that is below sb->high_rxt. Nothing is done if has_rxt is 0 or if the
+ * range starts at or beyond high_rxt.
+ */
+always_inline void
+scoreboard_update_sacked_rxt (sack_scoreboard_t * sb, u32 start, u32 end,
+			      u8 has_rxt)
+{
+  if (has_rxt && !seq_geq (start, sb->high_rxt))
+    {
+      /* Clamp the accounted range at high_rxt */
+      if (seq_lt (end, sb->high_rxt))
+	sb->rxt_sacked += end - start;
+      else
+	sb->rxt_sacked += sb->high_rxt - start;
+    }
+}
always_inline void
scoreboard_update_bytes (sack_scoreboard_t * sb, u32 ack, u32 snd_mss)
while (right)
{
sb->lost_bytes += scoreboard_hole_bytes (right);
- sb->last_lost_bytes += right->is_lost ? 0 : right->end - right->start;
+ sb->last_lost_bytes += right->is_lost ? 0 : (right->end - right->start);
right->is_lost = 1;
left = scoreboard_prev_hole (sb, right);
if (!left)
/* Rule (3): if hole not lost */
else if (seq_lt (hole->start, sb->high_sacked))
{
+ /* And we didn't already retransmit it */
+ if (seq_leq (hole->end, sb->high_rxt))
+ {
+ sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+ return 0;
+ }
*snd_limited = 0;
sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
}
return hole;
}
-#endif /* CLIB_MARCH_VARIANT */
-static void
-scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 snd_una)
+void
+scoreboard_init_rxt (sack_scoreboard_t * sb, u32 snd_una)
{
sack_scoreboard_hole_t *hole;
hole = scoreboard_first_hole (sb);
sb->rescue_rxt = snd_una - 1;
}
-#ifndef CLIB_MARCH_VARIANT
void
scoreboard_init (sack_scoreboard_t * sb)
{
sb->sacked_bytes = 0;
sb->last_sacked_bytes = 0;
sb->last_bytes_delivered = 0;
- sb->high_sacked = 0;
- sb->high_rxt = 0;
sb->lost_bytes = 0;
sb->last_lost_bytes = 0;
sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
sb->is_reneging = 0;
}
+
+/**
+ * Reset the scoreboard to a single lost hole (sack reneging handling)
+ *
+ * Logs a warning, clears the scoreboard and then inserts one hole
+ * spanning start to end which is marked as lost and becomes the tail.
+ * high_sacked is set to start and the rxt state is re-initialized
+ * from start.
+ */
+void
+scoreboard_clear_reneging (sack_scoreboard_t * sb, u32 start, u32 end)
+{
+  sack_scoreboard_hole_t *hole;
+
+  clib_warning ("sack reneging");
+
+  /* Drop all previous state and track the range as one hole */
+  scoreboard_clear (sb);
+  hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, start, end);
+  sb->tail = scoreboard_hole_index (sb, hole);
+  hole->is_lost = 1;
+
+  sb->high_sacked = start;
+  scoreboard_init_rxt (sb, start);
+}
+
#endif /* CLIB_MARCH_VARIANT */
/**
sack_scoreboard_t *sb = &tc->sack_sb;
sack_block_t *blk, *rcv_sacks;
u32 blk_index = 0, i, j;
+ u8 has_rxt;
sb->last_sacked_bytes = 0;
sb->last_bytes_delivered = 0;
+ sb->rxt_sacked = 0;
if (!tcp_opts_sack (&tc->rcv_opts)
&& sb->head == TCP_INVALID_SACK_HOLE_INDEX)
return;
+ has_rxt = tcp_in_cong_recovery (tc);
+
/* Remove invalid blocks */
blk = tc->rcv_opts.sacks;
while (blk < vec_end (tc->rcv_opts.sacks))
sb->is_reneging = 0;
}
}
+ scoreboard_update_sacked_rxt (sb, hole->start, hole->end,
+ has_rxt);
scoreboard_remove_hole (sb, hole);
hole = next_hole;
}
{
if (seq_gt (blk->end, hole->start))
{
+ scoreboard_update_sacked_rxt (sb, hole->start, blk->end,
+ has_rxt);
hole->start = blk->end;
}
blk_index++;
/* Pool might've moved */
hole = scoreboard_get_hole (sb, hole_index);
hole->end = blk->start;
+
+ scoreboard_update_sacked_rxt (sb, blk->start, blk->end,
+ has_rxt);
+
blk_index++;
ASSERT (hole->next == scoreboard_hole_index (sb, next_hole));
}
else if (seq_lt (blk->start, hole->end))
{
+ scoreboard_update_sacked_rxt (sb, blk->start, hole->end,
+ has_rxt);
hole->end = blk->start;
}
hole = scoreboard_next_hole (sb, hole);
ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc)
|| sb->is_reneging || sb->holes[sb->head].start == ack);
ASSERT (sb->last_lost_bytes <= sb->lost_bytes);
+ ASSERT ((ack - tc->snd_una) + sb->last_sacked_bytes
+ - sb->last_bytes_delivered >= sb->rxt_sacked);
+ ASSERT ((ack - tc->snd_una) >= tc->sack_sb.last_bytes_delivered
+ || (tc->flags & TCP_CONN_FINSNT));
TCP_EVT (TCP_EVT_CC_SCOREBOARD, tc);
}
}
}
-#ifndef CLIB_MARCH_VARIANT
/**
* Init loss recovery/fast recovery.
*
* Triggered by dup acks as opposed to timer timeout. Note that cwnd is
* updated in @ref tcp_cc_handle_event after fast retransmit
*/
-void
+static void
tcp_cc_init_congestion (tcp_connection_t * tc)
{
tcp_fastrecovery_on (tc);
tc->snd_congestion = tc->snd_nxt;
tc->cwnd_acc_bytes = 0;
tc->snd_rxt_bytes = 0;
+ tc->rxt_delivered = 0;
+ tc->prr_delivered = 0;
+ tc->prr_start = tc->snd_una;
tc->prev_ssthresh = tc->ssthresh;
tc->prev_cwnd = tc->cwnd;
- tc->cc_algo->congestion (tc);
- tc->fr_occurences += 1;
- TCP_EVT (TCP_EVT_CC_EVT, tc, 4);
-}
-#endif /* CLIB_MARCH_VARIANT */
-
-static void
-tcp_cc_recovery_exit (tcp_connection_t * tc)
-{
- tc->rto_boff = 0;
- tcp_update_rto (tc);
- tc->snd_rxt_ts = 0;
- tc->rtt_ts = 0;
- tcp_recovery_off (tc);
- TCP_EVT (TCP_EVT_CC_EVT, tc, 3);
-}
-#ifndef CLIB_MARCH_VARIANT
-void
-tcp_cc_fastrecovery_clear (tcp_connection_t * tc)
-{
- tc->snd_rxt_bytes = 0;
- tc->rcv_dupacks = 0;
- tc->rtt_ts = 0;
+ tc->snd_rxt_ts = tcp_tstamp (tc);
+ tcp_cc_congestion (tc);
- tcp_fastrecovery_off (tc);
- tcp_fastrecovery_first_off (tc);
- tc->flags &= ~TCP_CONN_FRXT_PENDING;
+ /* When SACK is not permitted, inflate cwnd by three segments to
+ * account for the segments that have left the network and should've
+ * been buffered at the receiver. Any cwnd/ssthresh reduction is
+ * presumably done by tcp_cc_congestion above - TODO confirm XXX */
+ if (!tcp_opts_sack_permitted (&tc->rcv_opts))
+ tc->cwnd += 3 * tc->snd_mss;
- TCP_EVT (TCP_EVT_CC_EVT, tc, 3);
+ tc->fr_occurences += 1;
+ TCP_EVT (TCP_EVT_CC_EVT, tc, 4);
}
-#endif /* CLIB_MARCH_VARIANT */
static void
tcp_cc_congestion_undo (tcp_connection_t * tc)
{
tc->cwnd = tc->prev_cwnd;
tc->ssthresh = tc->prev_ssthresh;
- tc->rcv_dupacks = 0;
- if (tcp_in_recovery (tc))
- {
- tcp_cc_recovery_exit (tc);
- tc->snd_nxt = seq_max (tc->snd_nxt, tc->snd_congestion);
- }
- else if (tcp_in_fastrecovery (tc))
- {
- tcp_cc_fastrecovery_clear (tc);
- }
tcp_cc_undo_recovery (tc);
ASSERT (tc->rto_boff == 0);
TCP_EVT (TCP_EVT_CC_EVT, tc, 5);
}
static inline u8
-tcp_cc_is_spurious_fast_rxt (tcp_connection_t * tc)
+tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
{
- return (tcp_in_fastrecovery (tc)
- && tc->cwnd > tc->ssthresh + 3 * tc->snd_mss);
+ return (tcp_cc_is_spurious_timeout_rxt (tc));
}
-static u8
-tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
+/**
+ * SACK scoreboard based check for entering fast recovery
+ *
+ * @return 1 if the scoreboard reports lost bytes or if more than
+ *	   (TCP_DUPACK_THRESHOLD - 1) * snd_mss bytes are sacked,
+ *	   0 otherwise
+ */
+static inline u8
+tcp_should_fastrecover_sack (tcp_connection_t * tc)
{
- return (tcp_cc_is_spurious_timeout_rxt (tc)
- || tcp_cc_is_spurious_fast_rxt (tc));
+  if (tc->sack_sb.lost_bytes)
+    return 1;
+
+  return tc->sack_sb.sacked_bytes > (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss;
+}
+
+/**
+ * Decide whether fast recovery should be entered
+ *
+ * @param tc		tcp connection
+ * @param has_sack	non-zero if SACK is permitted for the connection
+ * @return		1 if the dupack threshold was hit or the SACK based
+ *			check passes, 0 otherwise. If the non-SACK heuristic
+ *			rejects the ack, rcv_dupacks is reset as well
+ */
+static inline u8
+tcp_should_fastrecover (tcp_connection_t * tc, u8 has_sack)
+{
+  /* Without SACK, reset dupacks if we're probably after a timeout
+   * (RFC6582 heuristics). That is the case if the cumulative ack does
+   * not cover more than snd_congestion and one of the following holds:
+   * 1) It is not true that the congestion window is greater than SMSS
+   *    bytes and that the difference between highest_ack and
+   *    prev_highest_ack is at most 4*SMSS bytes
+   * 2) Echoed timestamp in the last non-dup ack does not equal the
+   *    stored timestamp
+   */
+  if (!has_sack && seq_leq (tc->snd_una, tc->snd_congestion)
+      && (tc->cwnd <= tc->snd_mss || tc->bytes_acked > 4 * tc->snd_mss
+	  || tc->rcv_opts.tsecr != tc->tsecr_last_ack))
+    {
+      tc->rcv_dupacks = 0;
+      return 0;
+    }
+
+  if (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
+    return 1;
+
+  return tcp_should_fastrecover_sack (tc) ? 1 : 0;
}
static int
tcp_cc_recover (tcp_connection_t * tc)
{
sack_scoreboard_hole_t *hole;
+ u8 is_spurious = 0; /* returned to caller: 1 if congestion was undone */
ASSERT (tcp_in_cong_recovery (tc));
- hole = scoreboard_first_hole (&tc->sack_sb);
- if (hole && hole->start == tc->snd_una && hole->end == tc->snd_nxt)
- scoreboard_clear (&tc->sack_sb);
-
if (tcp_cc_is_spurious_retransmit (tc))
{
tcp_cc_congestion_undo (tc);
- return 1;
+ is_spurious = 1;
}
- if (tcp_in_recovery (tc))
- tcp_cc_recovery_exit (tc);
- else if (tcp_in_fastrecovery (tc))
+ tc->rcv_dupacks = 0;
+ tc->prr_delivered = 0;
+ tc->rxt_delivered = 0;
+ tc->snd_rxt_bytes = 0;
+ tc->snd_rxt_ts = 0;
+ tc->rtt_ts = 0;
+ tc->flags &= ~TCP_CONN_RXT_PENDING;
+
+ tcp_connection_tx_pacer_reset (tc, tc->cwnd, 0 /* start bucket */ );
+
+ /* Previous recovery left us congested, i.e., the scoreboard still
+ * reports sacked bytes. Continue sending as part of the current
+ * recovery event with an updated snd_congestion: restart the rxt
+ * timestamp and prr_start, re-init the scoreboard rxt state and
+ * program a retransmit. The recovery flags are not cleared on this
+ * path since we return before they are turned off */
+ if (tc->sack_sb.sacked_bytes)
{
- tcp_cc_recovered (tc);
- tcp_cc_fastrecovery_clear (tc);
+ tc->snd_congestion = tc->snd_nxt;
+ tc->snd_rxt_ts = tcp_tstamp (tc);
+ tc->prr_start = tc->snd_una;
+ scoreboard_init_rxt (&tc->sack_sb, tc->snd_una);
+ tcp_program_retransmit (tc);
+ return is_spurious;
}
+ hole = scoreboard_first_hole (&tc->sack_sb);
+ if (hole && hole->start == tc->snd_una && hole->end == tc->snd_nxt)
+ scoreboard_clear (&tc->sack_sb);
+
+ if (!tcp_in_recovery (tc) && !is_spurious)
+ tcp_cc_recovered (tc);
+
+ tcp_fastrecovery_off (tc);
+ tcp_fastrecovery_first_off (tc);
+ tcp_recovery_off (tc);
+ TCP_EVT (TCP_EVT_CC_EVT, tc, 3);
+
ASSERT (tc->rto_boff == 0);
ASSERT (!tcp_in_cong_recovery (tc));
ASSERT (tcp_scoreboard_is_sane_post_recovery (tc));
- return 0;
+ return is_spurious;
}
static void
tc->snd_congestion = tc->snd_una - 1;
}
-static u8
-tcp_should_fastrecover_sack (tcp_connection_t * tc)
-{
- return (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss < tc->sack_sb.sacked_bytes;
-}
-
-static u8
-tcp_should_fastrecover (tcp_connection_t * tc)
-{
- return (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD
- || tcp_should_fastrecover_sack (tc));
-}
-
/**
* One function to rule them all ... and in the darkness bind them
*/
tcp_cc_handle_event (tcp_connection_t * tc, tcp_rate_sample_t * rs,
u32 is_dack)
{
- u32 rxt_delivered;
+ u8 has_sack = tcp_opts_sack_permitted (&tc->rcv_opts);
- if (tcp_in_fastrecovery (tc) && tcp_opts_sack_permitted (&tc->rcv_opts))
- {
- if (tc->bytes_acked)
- goto partial_ack;
- tcp_program_fastretransmit (tc);
- return;
- }
/*
- * Duplicate ACK. Check if we should enter fast recovery, or if already in
- * it account for the bytes that left the network.
+ * If not in recovery, figure out if we should enter
*/
- else if (is_dack && !tcp_in_recovery (tc))
+ if (!tcp_in_cong_recovery (tc))
{
- TCP_EVT (TCP_EVT_DUPACK_RCVD, tc, 1);
- ASSERT (tc->snd_una != tc->snd_nxt || tc->sack_sb.last_sacked_bytes);
+ ASSERT (is_dack);
tc->rcv_dupacks++;
+ TCP_EVT (TCP_EVT_DUPACK_RCVD, tc, 1);
+ tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs);
- /* Pure duplicate ack. If some data got acked, it's handled lower */
- if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked)
+ if (tcp_should_fastrecover (tc, has_sack))
{
- ASSERT (tcp_in_fastrecovery (tc));
- tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs);
- return;
- }
- else if (tcp_should_fastrecover (tc))
- {
- u32 pacer_wnd;
-
- ASSERT (!tcp_in_fastrecovery (tc));
-
- /* Heuristic to catch potential late dupacks
- * after fast retransmit exits */
- if (is_dack && tc->snd_una == tc->snd_congestion
- && timestamp_leq (tc->rcv_opts.tsecr, tc->tsecr_last_ack))
- {
- tc->rcv_dupacks = 0;
- return;
- }
-
tcp_cc_init_congestion (tc);
- if (tcp_opts_sack_permitted (&tc->rcv_opts))
- scoreboard_init_high_rxt (&tc->sack_sb, tc->snd_una);
+ if (has_sack)
+ scoreboard_init_rxt (&tc->sack_sb, tc->snd_una);
- /* Constrain rate until we get a partial ack */
- pacer_wnd = clib_max (0.1 * tc->cwnd, 2 * tc->snd_mss);
- tcp_connection_tx_pacer_reset (tc, pacer_wnd,
- 0 /* start bucket */ );
- tcp_program_fastretransmit (tc);
- return;
- }
- else if (!tc->bytes_acked
- || (tc->bytes_acked && !tcp_in_cong_recovery (tc)))
- {
- tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs);
- return;
- }
- else
- goto partial_ack;
- }
- /* Don't allow entry in fast recovery if still in recovery, for now */
- else if (0 && is_dack && tcp_in_recovery (tc))
- {
- /* If of of the two conditions lower hold, reset dupacks because
- * we're probably after timeout (RFC6582 heuristics).
- * If Cumulative ack does not cover more than congestion threshold,
- * and:
- * 1) The following doesn't hold: The congestion window is greater
- * than SMSS bytes and the difference between highest_ack
- * and prev_highest_ack is at most 4*SMSS bytes
- * 2) Echoed timestamp in the last non-dup ack does not equal the
- * stored timestamp
- */
- if (seq_leq (tc->snd_una, tc->snd_congestion)
- && ((!(tc->cwnd > tc->snd_mss
- && tc->bytes_acked <= 4 * tc->snd_mss))
- || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
- {
- tc->rcv_dupacks = 0;
- return;
+ tcp_connection_tx_pacer_reset (tc, tc->cwnd, 0 /* start bucket */ );
+ tcp_program_retransmit (tc);
}
- }
- if (!tc->bytes_acked)
- return;
-
-partial_ack:
- TCP_EVT (TCP_EVT_CC_PACK, tc);
+ return;
+ }
/*
- * Legitimate ACK. 1) See if we can exit recovery
+ * Already in recovery. See if we can exit and stop retransmitting
*/
- /* Update the pacing rate. For the first partial ack we move from
- * the artificially constrained rate to the one after congestion */
- tcp_connection_tx_pacer_update (tc);
-
if (seq_geq (tc->snd_una, tc->snd_congestion))
{
- tcp_retransmit_timer_update (tc);
-
/* If spurious return, we've already updated everything */
if (tcp_cc_recover (tc))
{
}
/*
- * Legitimate ACK. 2) If PARTIAL ACK try to retransmit
+ * Process (re)transmit feedback. Output path uses this to decide how much
+ * more data to release into the network
*/
-
- /* XXX limit this only to first partial ack? */
- tcp_retransmit_timer_update (tc);
-
- /* RFC6675: If the incoming ACK is a cumulative acknowledgment,
- * reset dupacks to 0. Also needed if in congestion recovery */
- tc->rcv_dupacks = 0;
-
- /* Post RTO timeout don't try anything fancy */
- if (tcp_in_recovery (tc))
+ if (has_sack)
{
- tcp_cc_rcv_ack (tc, rs);
- transport_add_tx_event (&tc->connection);
- return;
- }
+ tc->rxt_delivered += tc->sack_sb.rxt_sacked;
+ tc->prr_delivered += tc->bytes_acked + tc->sack_sb.last_sacked_bytes
+ - tc->sack_sb.last_bytes_delivered;
- /* Remove retransmitted bytes that have been delivered */
- if (tcp_opts_sack_permitted (&tc->rcv_opts))
+ tcp_program_retransmit (tc);
+ }
+ else
{
- ASSERT (tc->bytes_acked >= tc->sack_sb.last_bytes_delivered
- || (tc->flags & TCP_CONN_FINSNT));
-
- /* If we have sacks and we haven't gotten an ack beyond high_rxt,
- * remove sacked bytes delivered */
- if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt))
+ if (is_dack)
{
- rxt_delivered = tc->bytes_acked - tc->sack_sb.last_bytes_delivered;
- ASSERT (tc->snd_rxt_bytes >= rxt_delivered);
- tc->snd_rxt_bytes -= rxt_delivered;
+ tc->rcv_dupacks += 1;
+ TCP_EVT (TCP_EVT_DUPACK_RCVD, tc, 1);
}
+ tc->rxt_delivered = clib_max (tc->rxt_delivered + tc->bytes_acked,
+ tc->snd_rxt_bytes);
+ if (is_dack)
+ tc->prr_delivered += 1;
else
- {
- /* Apparently all retransmitted holes have been acked */
- tc->snd_rxt_bytes = 0;
- tc->sack_sb.high_rxt = tc->snd_una;
- }
+ tc->prr_delivered += tc->bytes_acked - tc->snd_mss * tc->rcv_dupacks;
+
+ /* If partial ack, assume that the first un-acked segment was lost */
+ if (tc->bytes_acked || tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
+ tcp_fastrecovery_first_on (tc);
+
+ tcp_program_retransmit (tc);
}
- else
+
+ /*
+ * Notify cc of the event
+ */
+
+ if (!tc->bytes_acked)
{
- tcp_fastrecovery_first_on (tc);
- if (tc->snd_rxt_bytes > tc->bytes_acked)
- tc->snd_rxt_bytes -= tc->bytes_acked;
- else
- tc->snd_rxt_bytes = 0;
+ tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs);
+ return;
}
- tcp_cc_rcv_cong_ack (tc, TCP_CC_PARTIALACK, rs);
+ /* RFC6675: If the incoming ACK is a cumulative acknowledgment,
+ * reset dupacks to 0. Also needed if in congestion recovery */
+ tc->rcv_dupacks = 0;
- /*
- * Since this was a partial ack, try to retransmit some more data
- */
- tcp_program_fastretransmit (tc);
+ if (tcp_in_recovery (tc))
+ tcp_cc_rcv_ack (tc, rs);
+ else
+ tcp_cc_rcv_cong_ack (tc, TCP_CC_PARTIALACK, rs);
+}
+
+/**
+ * Check if an ack is a duplicate as per RFC5681 Sec. 2
+ *
+ * @param tc		tcp connection
+ * @param b		buffer that carries the ack
+ * @param prev_snd_wnd	send window before this ack was processed
+ * @param prev_snd_una	snd_una before this ack was processed
+ * @return		1 if all the conditions below hold, 0 otherwise
+ */
+always_inline u8
+tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd,
+		   u32 prev_snd_una)
+{
+  /* Ack number must match the previous snd_una */
+  if (vnet_buffer (b)->tcp.ack_number != prev_snd_una)
+    return 0;
+
+  /* There must be outstanding data */
+  if (!seq_gt (tc->snd_nxt, tc->snd_una))
+    return 0;
+
+  /* Segment must not occupy sequence space */
+  if (vnet_buffer (b)->tcp.seq_end != vnet_buffer (b)->tcp.seq_number)
+    return 0;
+
+  /* Send window must be unchanged */
+  return prev_snd_wnd == tc->snd_wnd;
+}
+
+/**
+ * Check if an ack is a congestion control event
+ *
+ * @param is_dack	set to 1 if the ack sacked new bytes or is a
+ *			duplicate ack, to 0 otherwise
+ * @return		0 if the fin is lost or the scoreboard is reneging.
+ *			Otherwise, 1 if the ack is a duplicate or the
+ *			connection is in congestion recovery
+ */
+static u8
+tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b,
+		     u32 prev_snd_wnd, u32 prev_snd_una, u8 * is_dack)
+{
+  /* Per RFC 6675, ACKs that SACK new data are defined to be 'duplicate'
+   * as well, so the plain dupack check is needed only if nothing new
+   * was sacked */
+  if (tc->sack_sb.last_sacked_bytes)
+    *is_dack = 1;
+  else
+    *is_dack = tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una) ? 1 : 0;
+
+  /* If reneging, wait for timer based retransmits */
+  if (PREDICT_FALSE (tcp_is_lost_fin (tc) || tc->sack_sb.is_reneging))
+    return 0;
+
+  if (*is_dack)
+    return 1;
+
+  return tcp_in_cong_recovery (tc) ? 1 : 0;
}
/**
tc->snd_una = vnet_buffer (b)->tcp.ack_number;
tcp_validate_txf_size (tc, tc->bytes_acked);
- if (tc->flags & TCP_CONN_RATE_SAMPLE)
+ if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
tcp_bt_sample_delivery_rate (tc, &rs);
+ tcp_program_dequeue (wrk, tc);
+
if (tc->bytes_acked)
- {
- tcp_program_dequeue (wrk, tc);
- tcp_update_rtt (tc, &rs, vnet_buffer (b)->tcp.ack_number);
- }
+ tcp_update_rtt (tc, &rs, vnet_buffer (b)->tcp.ack_number);
TCP_EVT (TCP_EVT_ACK_RCVD, tc);
/* Account for the FIN and send ack */
tc->rcv_nxt += 1;
+ tc->flags |= TCP_CONN_FINRCVD;
tcp_program_ack (tc);
/* Enter CLOSE-WAIT and notify session. To avoid lingering
* in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
sw_if_idx = dpo->dpoi_index;
hw_if = vnet_get_sup_hw_interface (vnm, sw_if_idx);
- tc->is_tso =
- ((hw_if->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO) == 0) ? 0 : 1;
+ if (hw_if->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO)
+ tc->cfg_flags |= TCP_CFG_F_TSO;
}
-
always_inline uword
tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_frame_t * from_frame, int is_ip4)
goto drop;
}
- tcp_check_tx_offload (new_tc0, is_ip4);
+ if (!(new_tc0->cfg_flags & TCP_CFG_F_NO_TSO))
+ tcp_check_tx_offload (new_tc0, is_ip4);
/* Read data, if any */
if (PREDICT_FALSE (vnet_buffer (b0)->tcp.data_len))
}
else
{
- tcp_program_ack (new_tc0);
+ /* Send ack now instead of programming it because connection was
+ * just established and it's not optional. */
+ tcp_send_ack (new_tc0);
}
drop:
if (CLIB_DEBUG)
{
- tcp_connection_t *tmp;
- tmp = tcp_lookup_connection (tc0->c_fib_index, b0, thread_index,
- is_ip4);
- if (tmp->state != tc0->state)
+ if (!(tc0->connection.flags & TRANSPORT_CONNECTION_F_NO_LOOKUP))
{
- if (tc0->state != TCP_STATE_CLOSED)
- clib_warning ("state changed");
- goto drop;
+ tcp_connection_t *tmp;
+ tmp = tcp_lookup_connection (tc0->c_fib_index, b0, thread_index,
+ is_ip4);
+ if (tmp->state != tc0->state)
+ {
+ if (tc0->state != TCP_STATE_CLOSED)
+ clib_warning ("state changed");
+ goto drop;
+ }
}
}
/* Update rtt and rto */
tcp_estimate_initial_rtt (tc0);
+ tcp_connection_tx_pacer_update (tc0);
/* Switch state to ESTABLISHED */
tc0->state = TCP_STATE_ESTABLISHED;
TCP_EVT (TCP_EVT_STATE_CHANGE, tc0);
- tcp_check_tx_offload (tc0, is_ip4);
+ if (!(tc0->cfg_flags & TCP_CFG_F_NO_TSO))
+ tcp_check_tx_offload (tc0, is_ip4);
/* Initialize session variables */
tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;