{
ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval));
tc->tsval_recent = tc->rcv_opts.tsval;
- tc->tsval_recent_age = tcp_time_now ();
+ tc->tsval_recent_age = tcp_time_now_w_thread (tc->c_thread_index);
}
}
/* If tsval_recent was last updated more than 24 days ago
* (TCP_PAWS_IDLE), invalidate it. */
if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE,
- tcp_time_now ()))
+ tcp_time_now_w_thread (tc0->c_thread_index)))
{
/* Age isn't reset until we get a valid tsval (BSD inspired) */
tc0->tsval_recent = 0;
* seq_lt (tc->snd_una, ack). This is a condition for calling update_rtt */
else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr)
{
- mrtt = clib_max (tcp_time_now () - tc->rcv_opts.tsecr, 1);
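+ /* RTT sample is the time elapsed since the echoed timestamp was
+ * sent; floor at 1 tick to avoid a zero measurement */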
+ u32 now = tcp_time_now_w_thread (tc->c_thread_index);
+ mrtt = clib_max (now - tc->rcv_opts.tsecr, 1);
}
/* Ignore dubious measurements */
}
/**
- * Dequeue bytes that have been acked and while at it update RTT estimates.
+ * Dequeue bytes for connections that have received acks in the last burst
*/
static void
-tcp_dequeue_acked (tcp_connection_t * tc, u32 ack)
+tcp_handle_postponed_dequeues (tcp_worker_ctx_t * wrk)
{
- /* Dequeue the newly ACKed add SACKed bytes */
- stream_session_dequeue_drop (&tc->connection,
- tc->bytes_acked + tc->sack_sb.snd_una_adv);
+ u32 thread_index = wrk->vm->thread_index;
+ u32 *pending_deq_acked;
+ tcp_connection_t *tc;
+ int i;
+
+ if (!vec_len (wrk->pending_deq_acked))
+ return;
+
+ pending_deq_acked = wrk->pending_deq_acked;
+ for (i = 0; i < vec_len (pending_deq_acked); i++)
+ {
+ tc = tcp_connection_get (pending_deq_acked[i], thread_index);
+ tc->flags &= ~TCP_CONN_DEQ_PENDING;
- tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
+ /* Dequeue the newly ACKed bytes */
+ stream_session_dequeue_drop (&tc->connection, tc->burst_acked);
+ tc->burst_acked = 0;
+ tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
- /* Update rtt and rto */
- tcp_update_rtt (tc, ack);
+ /* If everything has been acked, stop the retransmit timer,
+ * otherwise update it. */
+ tcp_retransmit_timer_update (tc);
+ }
+ _vec_len (wrk->pending_deq_acked) = 0;
+}
- /* If everything has been acked, stop retransmit timer
- * otherwise update. */
- tcp_retransmit_timer_update (tc);
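+/**
+ * Flag connection for dequeue at the end of the burst and accumulate the
+ * newly acked and sacked bytes
+ */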
+static void
+tcp_program_dequeue (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
+{
+ if (!(tc->flags & TCP_CONN_DEQ_PENDING))
+ {
+ vec_add1 (wrk->pending_deq_acked, tc->c_c_index);
+ tc->flags |= TCP_CONN_DEQ_PENDING;
+ }
+ tc->burst_acked += tc->bytes_acked + tc->sack_sb.snd_una_adv;
}
/**
sack_scoreboard_hole_t *
scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
sack_scoreboard_hole_t * start,
- u8 have_sent_1_smss,
- u8 * can_rescue, u8 * snd_limited)
+ u8 have_unsent, u8 * can_rescue, u8 * snd_limited)
{
sack_scoreboard_hole_t *hole = 0;
}
else
{
- /* Rule (2): output takes care of transmitting new data */
- if (!have_sent_1_smss)
+ /* Rule (2): available unsent data; the output path transmits it */
+ if (have_unsent)
{
- hole = 0;
sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+ return 0;
}
/* Rule (3): if hole not lost */
else if (seq_lt (hole->start, sb->high_sacked))
}
static void
-scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 seq)
+scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 snd_una)
{
sack_scoreboard_hole_t *hole;
hole = scoreboard_first_hole (sb);
if (hole)
{
- seq = seq_gt (seq, hole->start) ? seq : hole->start;
+ snd_una = seq_gt (snd_una, hole->start) ? snd_una : hole->start;
sb->cur_rxt_hole = sb->head;
}
- sb->high_rxt = seq;
+ sb->high_rxt = snd_una;
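+ /* Keep rescue_rxt below snd_una so a rescue retransmit is
+ * allowed for this window (RFC 6675) */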
+ sb->rescue_rxt = snd_una - 1;
}
void
tc->snd_wl2 = ack;
TCP_EVT_DBG (TCP_EVT_SND_WND, tc);
- if (tc->snd_wnd < tc->snd_mss)
+ if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss))
{
/* Set persist timer if not set and we just got 0 wnd */
if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST)
else
{
tcp_persist_timer_reset (tc);
- if (!tcp_in_recovery (tc) && tc->rto_boff > 0)
+ if (PREDICT_FALSE (!tcp_in_recovery (tc) && tc->rto_boff > 0))
{
tc->rto_boff = 0;
tcp_update_rto (tc);
tcp_fastrecovery_1_smss_off (tc);
tcp_fastrecovery_first_off (tc);
- /* Update pacer because our cwnd changed. Also makes sure
- * that we recompute the max burst size */
- tcp_update_pacer (tc);
-
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
}
}
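+/**
+ * Queue connection for fast retransmit on its worker, at most once
+ */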
void
-tcp_program_fastretransmit (tcp_connection_t * tc)
+tcp_program_fastretransmit (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
{
- tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[tc->c_thread_index];
if (!(tc->flags & TCP_CONN_FRXT_PENDING))
{
vec_add1 (wrk->pending_fast_rxt, tc->c_c_index);
}
void
-tcp_do_fastretransmits (u32 thread_index)
+tcp_do_fastretransmits (tcp_worker_ctx_t * wrk)
{
- tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[thread_index];
+ u32 *ongoing_fast_rxt, burst_bytes, sent_bytes, thread_index;
u32 max_burst_size, burst_size, n_segs = 0, n_segs_now;
- u32 *ongoing_fast_rxt, burst_bytes, sent_bytes;
tcp_connection_t *tc;
u64 last_cpu_time;
int i;
&& vec_len (wrk->postponed_fast_rxt) == 0)
return;
+ thread_index = wrk->vm->thread_index;
last_cpu_time = wrk->vm->clib_time.last_cpu_time;
ongoing_fast_rxt = wrk->ongoing_fast_rxt;
vec_append (ongoing_fast_rxt, wrk->postponed_fast_rxt);
_vec_len (wrk->postponed_fast_rxt) = 0;
_vec_len (wrk->pending_fast_rxt) = 0;
- max_burst_size = VLIB_FRAME_SIZE / vec_len (wrk->ongoing_fast_rxt);
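+ /* Spread one frame's worth of segments across all pending connections */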
+ max_burst_size = VLIB_FRAME_SIZE / vec_len (ongoing_fast_rxt);
max_burst_size = clib_max (max_burst_size, 1);
for (i = 0; i < vec_len (ongoing_fast_rxt); i++)
burst_size = clib_min (burst_size, burst_bytes / tc->snd_mss);
if (!burst_size)
{
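+ /* Pacer allows no segments right now; retry on the next dispatch */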
- tcp_program_fastretransmit (tc);
+ tcp_program_fastretransmit (wrk, tc);
continue;
}
- n_segs_now = tcp_fast_retransmit (tc, burst_size);
+ n_segs_now = tcp_fast_retransmit (wrk, tc, burst_size);
sent_bytes = clib_min (n_segs_now * tc->snd_mss, burst_bytes);
transport_connection_tx_pacer_update_bytes (&tc->connection,
sent_bytes);
-
n_segs += n_segs_now;
}
_vec_len (ongoing_fast_rxt) = 0;
{
if (tc->bytes_acked)
goto partial_ack;
- tcp_program_fastretransmit (tc);
+ tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index), tc);
return;
}
/*
}
else if (tcp_should_fastrecover (tc))
{
- u32 byte_rate;
+ u32 pacer_wnd;
+
ASSERT (!tcp_in_fastrecovery (tc));
/* Heuristic to catch potential late dupacks
{
tc->cwnd = tc->ssthresh;
scoreboard_init_high_rxt (&tc->sack_sb, tc->snd_una);
- tc->sack_sb.rescue_rxt = tc->snd_una - 1;
}
else
{
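+ /* Inflate cwnd for the 3 dupacks that triggered fast recovery
+ * (RFC 5681) */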
tc->cwnd = tc->ssthresh + 3 * tc->snd_mss;
}
- byte_rate = (0.3 * tc->cwnd) / ((f64) TCP_TICK * tc->srtt);
- transport_connection_tx_pacer_init (&tc->connection, byte_rate, 0);
- tcp_program_fastretransmit (tc);
+ /* Constrain the pacing rate until we get a partial ack:
+ * 10% of cwnd, but no less than 2 segments */
+ pacer_wnd = clib_max (0.1 * tc->cwnd, 2 * tc->snd_mss);
+ tcp_connection_tx_pacer_reset (tc, pacer_wnd,
+ 0 /* start bucket */ );
+ tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index),
+ tc);
return;
}
else if (!tc->bytes_acked
* Legitimate ACK. 2) If PARTIAL ACK try to retransmit
*/
+ /* Update the pacing rate. On the first partial ack we move from the
+ * artificially constrained rate to one based on the updated cwnd */
+ tcp_connection_tx_pacer_update (tc);
+
/* XXX limit this only to first partial ack? */
tcp_retransmit_timer_force_update (tc);
{
/* Apparently all retransmitted holes have been acked */
tc->snd_rxt_bytes = 0;
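+ /* Reset high_rxt so retransmissions can restart from snd_una */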
+ tc->sack_sb.high_rxt = tc->snd_una;
}
}
else
{
+ tcp_fastrecovery_first_on (tc);
+ /* Reuse last_bytes_delivered to track the total bytes acked */
+ tc->sack_sb.last_bytes_delivered += tc->bytes_acked;
if (tc->snd_rxt_bytes > tc->bytes_acked)
tc->snd_rxt_bytes -= tc->bytes_acked;
else
/*
* Since this was a partial ack, try to retransmit some more data
*/
- tcp_program_fastretransmit (tc);
+ tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index), tc);
}
/**
* Process incoming ACK
*/
static int
-tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
+tcp_rcv_ack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b,
tcp_header_t * th, u32 * next, u32 * error)
{
u32 prev_snd_wnd, prev_snd_una;
{
tcp_make_ack (tc, b);
*next = tcp_next_output (tc->c_is_ip4);
- *error = TCP_ERROR_ACK_INVALID;
+ *error = TCP_ERROR_ACK_FUTURE;
TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 0,
vnet_buffer (b)->tcp.ack_number);
return -1;
vnet_buffer (b)->tcp.ack_number);
tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
- *error = TCP_ERROR_ACK_FUTURE;
}
/* If old ACK, probably it's an old dupack */
tcp_validate_txf_size (tc, tc->bytes_acked);
if (tc->bytes_acked)
- tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number);
+ {
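+ /* Postpone the fifo dequeue until the end of the burst */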
+ tcp_program_dequeue (wrk, tc);
+ tcp_update_rtt (tc, vnet_buffer (b)->tcp.ack_number);
+ }
TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
vlib_frame_t * frame, int is_ip4)
{
u32 thread_index = vm->thread_index, errors = 0;
+ tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
u32 n_left_from, next_index, *from, *to_next;
u16 err_counters[TCP_N_ERROR] = { 0 };
u8 is_fin = 0;
}
/* 5: check the ACK field */
- if (PREDICT_FALSE (tcp_rcv_ack (tc0, b0, th0, &next0, &error0)))
+ if (PREDICT_FALSE (tcp_rcv_ack (wrk, tc0, b0, th0, &next0,
+ &error0)))
{
tcp_maybe_inc_err_counter (err_counters, error0);
goto done;
thread_index);
err_counters[TCP_ERROR_EVENT_FIFO_FULL] = errors;
tcp_store_err_counters (established, err_counters);
- tcp_flush_frame_to_output (vm, thread_index, is_ip4);
+ tcp_handle_postponed_dequeues (wrk);
+ tcp_flush_frame_to_output (wrk, is_ip4);
return frame->n_vectors;
}
{
u32 n_left_from, next_index, *from, *to_next, n_fins = 0;
u32 my_thread_index = vm->thread_index, errors = 0;
+ tcp_worker_ctx_t *wrk = tcp_get_worker (my_thread_index);
from = vlib_frame_vector_args (from_frame);
n_left_from = from_frame->n_vectors;
case TCP_STATE_ESTABLISHED:
/* We can get packets in established state here because they
* were enqueued before state change */
- if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
{
tcp_maybe_inc_counter (rcv_process, error0, 1);
goto drop;
/* In addition to the processing for the ESTABLISHED state, if
* our FIN is now acknowledged then enter FIN-WAIT-2 and
* continue processing in that state. */
- if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
{
tcp_maybe_inc_counter (rcv_process, error0, 1);
goto drop;
/* In addition to the processing for the ESTABLISHED state, if
* the retransmission queue is empty, the user's CLOSE can be
* acknowledged ("ok") but do not delete the TCB. */
- if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
{
tcp_maybe_inc_counter (rcv_process, error0, 1);
goto drop;
break;
case TCP_STATE_CLOSE_WAIT:
/* Do the same processing as for the ESTABLISHED state. */
- if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
{
tcp_maybe_inc_counter (rcv_process, error0, 1);
goto drop;
/* In addition to the processing for the ESTABLISHED state, if
* the ACK acknowledges our FIN then enter the TIME-WAIT state,
* otherwise ignore the segment. */
- if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
{
tcp_maybe_inc_counter (rcv_process, error0, 1);
goto drop;
* retransmission of the remote FIN. Acknowledge it, and restart
* the 2 MSL timeout. */
- if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
{
tcp_maybe_inc_counter (rcv_process, error0, 1);
goto drop;
my_thread_index);
tcp_inc_counter (rcv_process, TCP_ERROR_EVENT_FIFO_FULL, errors);
tcp_inc_counter (rcv_process, TCP_ERROR_FIN_RCVD, n_fins);
+ tcp_handle_postponed_dequeues (wrk);
return from_frame->n_vectors;
}
vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
u16 nexts[VLIB_FRAME_SIZE], *next;
- tcp_set_time_now (thread_index);
+ tcp_set_time_now (tcp_get_worker (thread_index));
from = vlib_frame_vector_args (frame);
n_left_from = frame->n_vectors;