{
TCP_OUTPUT_NEXT_DROP,
TCP_OUTPUT_NEXT_IP_LOOKUP,
+ TCP_OUTPUT_NEXT_IP_REWRITE,
+ TCP_OUTPUT_NEXT_IP_ARP,
TCP_OUTPUT_N_NEXT
} tcp_output_next_t;
#define foreach_tcp4_output_next \
_ (DROP, "error-drop") \
- _ (IP_LOOKUP, "ip4-lookup")
+ _ (IP_LOOKUP, "ip4-lookup") \
+ _ (IP_REWRITE, "ip4-rewrite") \
+ _ (IP_ARP, "ip4-arp")
#define foreach_tcp6_output_next \
_ (DROP, "error-drop") \
- _ (IP_LOOKUP, "ip6-lookup")
+ _ (IP_LOOKUP, "ip6-lookup") \
+ _ (IP_REWRITE, "ip6-rewrite") \
+ _ (IP_ARP, "ip6-discover-neighbor")
static char *tcp_error_strings[] = {
#define tcp_error(n,s) s,
return clib_min (tc->rcv_wnd, TCP_WND_MAX);
}
-/**
- * Compute and return window to advertise, scaled as per RFC1323
- */
-u32
-tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state)
-{
- if (state < TCP_STATE_ESTABLISHED)
- return tcp_initial_window_to_advertise (tc);
-
- tcp_update_rcv_wnd (tc);
-
- if (tc->rcv_wnd == 0)
- {
- tc->flags |= TCP_CONN_SENT_RCV_WND0;
- }
- else
- {
- tc->flags &= ~TCP_CONN_SENT_RCV_WND0;
- }
-
- return tc->rcv_wnd >> tc->rcv_wscale;
-}
-
-void
+static void
tcp_update_rcv_wnd (tcp_connection_t * tc)
{
i32 observed_wnd;
/*
* Figure out how much space we have available
*/
- available_space = stream_session_max_rx_enqueue (&tc->connection);
- max_fifo = stream_session_rx_fifo_size (&tc->connection);
+ available_space = transport_max_rx_enqueue (&tc->connection);
+ max_fifo = transport_rx_fifo_size (&tc->connection);
ASSERT (tc->rcv_opts.mss < max_fifo);
if (available_space < tc->rcv_opts.mss && available_space < max_fifo >> 3)
tc->rcv_wnd = clib_min (wnd, TCP_WND_MAX << tc->rcv_wscale);
}
+/**
+ * Compute and return window to advertise, scaled as per RFC1323
+ *
+ * For states below TCP_STATE_ESTABLISHED the result comes straight from
+ * tcp_initial_window_to_advertise(). Otherwise tc->rcv_wnd is refreshed with
+ * tcp_update_rcv_wnd() and returned right-shifted by tc->rcv_wscale.
+ *
+ * Side effect: TCP_CONN_SENT_RCV_WND0 is set in tc->flags when the refreshed
+ * window is zero and cleared when it is not.
+ *
+ * @param tc connection whose receive window is being advertised
+ * @param state state used to choose between the initial and refreshed window
+ * @return window value after scaling
+ */
+static u32
+tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state)
+{
+ if (state < TCP_STATE_ESTABLISHED)
+ return tcp_initial_window_to_advertise (tc);
+
+ tcp_update_rcv_wnd (tc); /* recompute tc->rcv_wnd before reading it */
+
+ if (tc->rcv_wnd == 0)
+ {
+ tc->flags |= TCP_CONN_SENT_RCV_WND0; /* flag that a zero window is being advertised */
+ }
+ else
+ {
+ tc->flags &= ~TCP_CONN_SENT_RCV_WND0;
+ }
+
+ return tc->rcv_wnd >> tc->rcv_wscale;
+}
+
/**
* Write TCP options to segment.
*/
-u32
+static u32
tcp_options_write (u8 * data, tcp_options_t * opts)
{
u32 opts_len = 0;
return opts_len;
}
-always_inline int
+static int
tcp_make_syn_options (tcp_options_t * opts, u8 wnd_scale)
{
u8 len = 0;
return len;
}
-always_inline int
+static int
tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts)
{
u8 len = 0;
return len;
}
-always_inline int
+static int
tcp_make_established_options (tcp_connection_t * tc, tcp_options_t * opts)
{
u8 len = 0;
{
case TCP_STATE_ESTABLISHED:
case TCP_STATE_FIN_WAIT_1:
+ case TCP_STATE_CLOSED:
+ case TCP_STATE_CLOSE_WAIT:
return tcp_make_established_options (tc, opts);
case TCP_STATE_SYN_RCVD:
return tcp_make_synack_options (tc, opts);
}
/**
- * Update snd_mss to reflect the effective segment size that we can send
- * by taking into account all TCP options, including SACKs
+ * Update burst send vars
+ *
+ * - Updates snd_mss to reflect the effective segment size that we can send
+ * by taking into account all TCP options, including SACKs.
+ * - Cache 'on the wire' options for reuse
+ * - Updates receive window which can be reused for a burst.
+ *
+ * This should *only* be called when doing bursts
+ *
+ * The serialized options are written to the cached_opts area of the worker
+ * context selected by tc->c_thread_index, so the cache is per thread and
+ * not per connection. The burst branch of tcp_push_hdr_i copies
+ * tc->snd_opts_len bytes back out of that same area.
+ *
+ * @param tc connection for which the burst variables are refreshed
*/
void
-tcp_update_snd_mss (tcp_connection_t * tc)
+tcp_update_burst_snd_vars (tcp_connection_t * tc)
{
+ tcp_main_t *tm = &tcp_main;
+
/* Compute options to be used for connection. These may be reused when
* sending data or to compute the effective mss (snd_mss) */
- tc->snd_opts_len =
- tcp_make_options (tc, &tc->snd_opts, TCP_STATE_ESTABLISHED);
+ tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts,
+ TCP_STATE_ESTABLISHED);
/* XXX check if MTU has been updated */
tc->snd_mss = clib_min (tc->mss, tc->rcv_opts.mss) - tc->snd_opts_len;
ASSERT (tc->snd_mss > 0);
+
+ tcp_options_write (tm->wrk_ctx[tc->c_thread_index].cached_opts,
+ &tc->snd_opts); /* the length returned by tcp_options_write is not checked here */
+
+ tcp_update_rcv_wnd (tc); /* refresh tc->rcv_wnd once; the burst path reads it without updating */
}
void
tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP;
}
-always_inline int
-tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers)
+static int /* Top up a worker's tx buffer cache.
+ *
+ * Grows ctx->tx_buffers so it can hold 'wanted' indices and asks
+ * vlib_buffer_alloc() for the (wanted - *n_bufs) entries that are missing.
+ * *n_bufs is the number of indices currently cached; it is updated in
+ * place and the vector length is set to match.
+ *
+ * Returns the number of buffers obtained by this call, so 0 means nothing
+ * could be allocated. The removed version returned 0 on success and -1 on
+ * failure instead.
+ */
+tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u16 * n_bufs,
+ u32 wanted)
{
+ tcp_worker_ctx_t *ctx = &tm->wrk_ctx[thread_index]; /* per-thread context that owns the cache */
vlib_main_t *vm = vlib_get_main ();
- u32 current_length = vec_len (tm->tx_buffers[thread_index]);
- u32 n_allocated;
-
- vec_validate (tm->tx_buffers[thread_index],
- current_length + n_free_buffers - 1);
- n_allocated =
- vlib_buffer_alloc (vm, &tm->tx_buffers[thread_index][current_length],
- n_free_buffers);
- _vec_len (tm->tx_buffers[thread_index]) = current_length + n_allocated;
- /* buffer shortage, report failure */
- if (vec_len (tm->tx_buffers[thread_index]) == 0)
- {
- clib_warning ("out of buffers");
- return -1;
- }
- return 0;
+ u32 n_alloc;
+
+ ASSERT (wanted > *n_bufs); /* callers must ask for more than is already cached */
+ vec_validate_aligned (ctx->tx_buffers, wanted - 1, CLIB_CACHE_LINE_BYTES); /* make index wanted - 1 valid */
+ n_alloc = vlib_buffer_alloc (vm, &ctx->tx_buffers[*n_bufs],
+ wanted - *n_bufs); /* new indices are written right after the cached ones */
+ *n_bufs += n_alloc;
+ _vec_len (ctx->tx_buffers) = *n_bufs; /* length reflects only slots that hold a buffer index */
+ return n_alloc;
}
always_inline int
tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx)
{
- u32 *my_tx_buffers;
u32 thread_index = vlib_get_thread_index ();
+ tcp_worker_ctx_t *ctx = &tm->wrk_ctx[thread_index]; /* cache belonging to the calling thread */
+ u16 n_bufs = vec_len (ctx->tx_buffers); /* NOTE(review): vec_len result is narrowed to u16 - confirm the cache never exceeds 65535 entries */
TCP_DBG_BUFFER_ALLOC_MAYBE_FAIL (thread_index);
- if (PREDICT_FALSE (vec_len (tm->tx_buffers[thread_index]) == 0))
+ if (PREDICT_FALSE (!n_bufs)) /* cache empty: try to refill up to VLIB_FRAME_SIZE entries */
{
- if (tcp_alloc_tx_buffers (tm, thread_index, VLIB_FRAME_SIZE))
- return -1;
+ if (!tcp_alloc_tx_buffers (tm, thread_index, &n_bufs, VLIB_FRAME_SIZE)) /* 0 means no buffer was obtained */
+ {
+ *bidx = ~0; /* hand back an all-ones index together with the -1 result */
+ return -1;
+ }
}
- my_tx_buffers = tm->tx_buffers[thread_index];
- *bidx = my_tx_buffers[vec_len (my_tx_buffers) - 1];
- _vec_len (my_tx_buffers) -= 1;
+ *bidx = ctx->tx_buffers[--n_bufs]; /* take the last cached index */
+ _vec_len (ctx->tx_buffers) = n_bufs; /* and shrink the vector by one */
return 0;
}
-always_inline void
-tcp_return_buffer (tcp_main_t * tm)
-{
- _vec_len (tm->tx_buffers[vlib_get_thread_index ()]) += 1;
-}
-
-always_inline void *
+static void *
tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b)
{
if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
return vlib_buffer_make_headroom (b, MAX_HDRS_LEN);
}
-always_inline void *
+static void *
tcp_init_buffer (vlib_main_t * vm, vlib_buffer_t * b)
{
ASSERT ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
b->flags &= VLIB_BUFFER_NON_DEFAULT_FREELIST;
b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
b->total_length_not_including_first_buffer = 0;
+ b->current_data = 0;
vnet_buffer (b)->tcp.flags = 0;
VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b);
/* Leave enough space for headers */
/**
* Prepare ACK
*/
-void
+static void
tcp_make_ack_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_state_t state,
u8 flags)
{
next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index;
tcp_trajectory_add_start (b, 1);
- f = tm->ip_lookup_tx_frames[!is_ip4][thread_index];
+ f = tm->wrk_ctx[thread_index].ip_lookup_tx_frames[!is_ip4];
if (!f)
{
f = vlib_get_frame_to_node (vm, next_index);
ASSERT (f);
- tm->ip_lookup_tx_frames[!is_ip4][thread_index] = f;
+ tm->wrk_ctx[thread_index].ip_lookup_tx_frames[!is_ip4] = f;
}
to_next = vlib_frame_vector_args (f);
if (flush || f->n_vectors == VLIB_FRAME_SIZE)
{
vlib_put_frame_to_node (vm, next_index, f);
- tm->ip_lookup_tx_frames[!is_ip4][thread_index] = 0;
+ tm->wrk_ctx[thread_index].ip_lookup_tx_frames[!is_ip4] = 0;
}
}
-always_inline void
+static void
tcp_enqueue_to_ip_lookup_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
u8 is_ip4, u32 fib_index)
{
tcp_enqueue_to_ip_lookup_i (vm, b, bi, is_ip4, fib_index, 1);
}
-always_inline void
+static void /* Queue a buffer towards ip lookup, passing 0 as the last argument of tcp_enqueue_to_ip_lookup_i. */
tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
u8 is_ip4, u32 fib_index)
{
tcp_enqueue_to_ip_lookup_i (vm, b, bi, is_ip4, fib_index, 0);
+ if (vm->thread_index == 0 && vlib_num_workers ()) /* only on thread 0 and only when worker threads exist */
+ session_flush_frames_main_thread (vm); /* NOTE(review): presumably flushes the pending frame right away - confirm in the session layer */
}
always_inline void
tcp_trajectory_add_start (b, 2);
/* Get frame to v4/6 output node */
- f = tm->tx_frames[!is_ip4][thread_index];
+ f = tm->wrk_ctx[thread_index].tx_frames[!is_ip4];
if (!f)
{
f = vlib_get_frame_to_node (vm, next_index);
ASSERT (f);
- tm->tx_frames[!is_ip4][thread_index] = f;
+ tm->wrk_ctx[thread_index].tx_frames[!is_ip4] = f;
}
to_next = vlib_frame_vector_args (f);
to_next[f->n_vectors] = bi;
if (flush || f->n_vectors == VLIB_FRAME_SIZE)
{
vlib_put_frame_to_node (vm, next_index, f);
- tm->tx_frames[!is_ip4][thread_index] = 0;
+ tm->wrk_ctx[thread_index].tx_frames[!is_ip4] = 0;
}
}
-always_inline void
+static void
tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4)
{
tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 0);
}
-always_inline void
+static void
tcp_enqueue_to_output_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
u8 is_ip4)
{
tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 1);
}
-int
+static int
tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0,
tcp_state_t state, u8 thread_index, u8 is_ip4)
{
TCP_EVT_DBG (TCP_EVT_RST_SENT, tc);
}
-void
+static void
tcp_push_ip_hdr (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b)
{
tcp_header_t *th = vlib_buffer_get_current (b);
void
tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4)
{
- if (tcp_main.tx_frames[!is_ip4][thread_index])
+ if (tcp_main.wrk_ctx[thread_index].tx_frames[!is_ip4])
{
u32 next_index;
next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index;
vlib_put_frame_to_node (vm, next_index,
- tcp_main.tx_frames[!is_ip4][thread_index]);
- tcp_main.tx_frames[!is_ip4][thread_index] = 0;
+ tcp_main.
+ wrk_ctx[thread_index].tx_frames[!is_ip4]);
+ tcp_main.wrk_ctx[thread_index].tx_frames[!is_ip4] = 0;
}
}
/**
* Flush ip lookup tx frames populated by timer pops
*/
-always_inline void
+static void
tcp_flush_frame_to_ip_lookup (vlib_main_t * vm, u8 thread_index, u8 is_ip4)
{
- if (tcp_main.ip_lookup_tx_frames[!is_ip4][thread_index])
+ if (tcp_main.wrk_ctx[thread_index].ip_lookup_tx_frames[!is_ip4])
{
u32 next_index;
next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index;
vlib_put_frame_to_node (vm, next_index,
- tcp_main.ip_lookup_tx_frames[!is_ip4]
- [thread_index]);
- tcp_main.ip_lookup_tx_frames[!is_ip4][thread_index] = 0;
+ tcp_main.
+ wrk_ctx[thread_index].ip_lookup_tx_frames
+ [!is_ip4]);
+ tcp_main.wrk_ctx[thread_index].ip_lookup_tx_frames[!is_ip4] = 0;
}
}
if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
return;
b = vlib_get_buffer (vm, bi);
+ tcp_init_buffer (vm, b);
fin_snt = tc->flags & TCP_CONN_FINSNT;
if (fin_snt)
tc->snd_nxt = tc->snd_una;
switch (next_state)
{
case TCP_STATE_ESTABLISHED:
+ case TCP_STATE_CLOSE_WAIT:
return TCP_FLAG_ACK;
case TCP_STATE_SYN_RCVD:
return TCP_FLAG_SYN | TCP_FLAG_ACK;
/**
* Push TCP header and update connection variables
*/
-static void
+always_inline void
tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b,
- tcp_state_t next_state, u8 compute_opts)
+ tcp_state_t next_state, u8 compute_opts, u8 maybe_burst)
{
u32 advertise_wnd, data_len;
- u8 tcp_hdr_opts_len, opts_write_len, flags;
+ u8 tcp_hdr_opts_len, flags;
+ tcp_main_t *tm = &tcp_main;
tcp_header_t *th;
- data_len = b->current_length + b->total_length_not_including_first_buffer;
- ASSERT (!b->total_length_not_including_first_buffer
- || (b->flags & VLIB_BUFFER_NEXT_PRESENT));
+ data_len = b->current_length;
+ if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT))
+ data_len += b->total_length_not_including_first_buffer;
+
vnet_buffer (b)->tcp.flags = 0;
+ vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
if (compute_opts)
tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t);
- advertise_wnd = tcp_window_to_advertise (tc, next_state);
+
+ if (maybe_burst)
+ advertise_wnd = tc->rcv_wnd >> tc->rcv_wscale;
+ else
+ advertise_wnd = tcp_window_to_advertise (tc, next_state);
+
flags = tcp_make_state_flags (tc, next_state);
- /* Push header and options */
th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt,
tc->rcv_nxt, tcp_hdr_opts_len, flags,
advertise_wnd);
- opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts);
- ASSERT (opts_write_len == tc->snd_opts_len);
- vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
+ if (maybe_burst)
+ {
+ clib_memcpy ((u8 *) (th + 1),
+ tm->wrk_ctx[tc->c_thread_index].cached_opts,
+ tc->snd_opts_len);
+ }
+ else
+ {
+ u8 len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts);
+ ASSERT (len == tc->snd_opts_len);
+ }
/*
* Update connection variables
tc->snd_nxt += data_len;
tc->rcv_las = tc->rcv_nxt;
- /* TODO this is updated in output as well ... */
- if (seq_gt (tc->snd_nxt, tc->snd_una_max))
+ TCP_EVT_DBG (TCP_EVT_PKTIZE, tc);
+}
+
+u32 /* Push a TCP header onto b using the burst path of tcp_push_hdr_i, then update send state. Always returns 0. */
+tcp_push_header (tcp_connection_t * tc, vlib_buffer_t * b)
+{
+ tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, /* compute opts */ 0,
+ /* burst */ 1);
+ tc->snd_una_max = tc->snd_nxt; /* set unconditionally; the removed code only advanced it when snd_nxt was ahead */
+ ASSERT (seq_leq (tc->snd_una_max, tc->snd_una + tc->snd_wnd
+ + tcp_fastrecovery_sent_1_smss (tc) * tc->snd_mss)); /* send window plus at most one extra snd_mss */
+ tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
+ /* If not tracking an ACK, start tracking */
+ if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc))
+ {
+ tc->rtt_ts = tcp_time_now ();
+ tc->rtt_seq = tc->snd_nxt;
+ }
+ if (PREDICT_FALSE (!tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT))) /* arm the retransmit timer if it is not already running */
{
- tc->snd_una_max = tc->snd_nxt;
- tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
+ tcp_retransmit_timer_set (tc);
+ tc->rto_boff = 0; /* backoff counter restarts together with the timer */
}
-
- TCP_EVT_DBG (TCP_EVT_PKTIZE, tc);
+ tcp_trajectory_add_start (b, 3);
+ return 0;
}
void
if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
return;
b = vlib_get_buffer (vm, bi);
+ tcp_init_buffer (vm, b);
/* Fill in the ACK */
tcp_make_ack (tc, b);
* @return the number of bytes in the segment or 0 if there's nothing to
* retransmit
*/
-u32
+static u32
tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset,
u32 max_deq_bytes, vlib_buffer_t ** b)
{
/*
* Make sure we can retransmit something
*/
- available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection);
+ available_bytes = session_tx_fifo_max_dequeue (&tc->connection);
ASSERT (available_bytes >= offset);
available_bytes -= offset;
if (!available_bytes)
* Allocate and fill in buffer(s)
*/
- if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
- return 0;
- *b = vlib_get_buffer (vm, bi);
- data = tcp_init_buffer (vm, *b);
-
/* Easy case, buffer size greater than mss */
if (PREDICT_TRUE (seg_size <= tm->bytes_per_buffer))
{
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return 0;
+ *b = vlib_get_buffer (vm, bi);
+ data = tcp_init_buffer (vm, *b);
n_bytes = stream_session_peek_bytes (&tc->connection, data, offset,
max_deq_bytes);
ASSERT (n_bytes == max_deq_bytes);
b[0]->current_length = n_bytes;
- tcp_push_hdr_i (tc, *b, tc->state, 0);
+ tcp_push_hdr_i (tc, *b, tc->state, /* compute opts */ 0, /* burst */ 0);
+ if (seq_gt (tc->snd_nxt, tc->snd_una_max))
+ tc->snd_una_max = tc->snd_nxt;
}
/* Split mss into multiple buffers */
else
vlib_buffer_t *chain_b, *prev_b;
int i;
- n_bufs_per_seg = ceil ((double) seg_size / tm->bytes_per_buffer);
-
/* Make sure we have enough buffers */
- available_bufs = vec_len (tm->tx_buffers[thread_index]);
+ n_bufs_per_seg = ceil ((double) seg_size / tm->bytes_per_buffer);
+ available_bufs = vec_len (tm->wrk_ctx[thread_index].tx_buffers);
if (n_bufs_per_seg > available_bufs)
{
- if (tcp_alloc_tx_buffers (tm, thread_index,
- VLIB_FRAME_SIZE - available_bufs))
+ tcp_alloc_tx_buffers (tm, thread_index, &available_bufs,
+ VLIB_FRAME_SIZE);
+
+ if (n_bufs_per_seg > available_bufs)
{
- tcp_return_buffer (tm);
*b = 0;
return 0;
}
}
+ tcp_get_free_buffer_index (tm, &bi);
+ ASSERT (bi != (u32) ~ 0);
+ *b = vlib_get_buffer (vm, bi);
+ data = tcp_init_buffer (vm, *b);
n_bytes = stream_session_peek_bytes (&tc->connection, data, offset,
tm->bytes_per_buffer -
MAX_HDRS_LEN);
ASSERT (n_peeked == len_to_deq);
n_bytes += n_peeked;
chain_b->current_length = n_peeked;
- chain_b->flags &= VLIB_BUFFER_NON_DEFAULT_FREELIST;
chain_b->next_buffer = 0;
/* update previous buffer */
b[0]->total_length_not_including_first_buffer += n_peeked;
}
- tcp_push_hdr_i (tc, *b, tc->state, 0);
+ tcp_push_hdr_i (tc, *b, tc->state, /* compute opts */ 0, /* burst */ 0);
+ if (seq_gt (tc->snd_nxt, tc->snd_una_max))
+ tc->snd_una_max = tc->snd_nxt;
}
ASSERT (n_bytes > 0);
* Reset congestion control, switch cwnd to loss window and try again.
*/
static void
-tcp_rtx_timeout_cc (tcp_connection_t * tc)
+tcp_rxt_timeout_cc (tcp_connection_t * tc)
{
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 6);
tc->prev_ssthresh = tc->ssthresh;
tc->prev_cwnd = tc->cwnd;
tcp_cc_fastrecovery_exit (tc);
/* Start again from the beginning */
- tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss);
+ tc->cc_algo->congestion (tc);
tc->cwnd = tcp_loss_wnd (tc);
tc->snd_congestion = tc->snd_una_max;
tc->rtt_ts = 0;
+ tc->cwnd_acc_bytes = 0;
+
tcp_recovery_on (tc);
}
-static void
+static inline void
tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
{
tcp_main_t *tm = vnet_get_tcp_main ();
tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID;
}
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
+
if (tc->state >= TCP_STATE_ESTABLISHED)
{
/* Lost FIN, retransmit and return */
return;
}
- /* Shouldn't be here */
- if (tc->snd_una == tc->snd_una_max)
+ /* Shouldn't be here. This condition is tricky because it has to take
+ * into account boff > 0 due to persist timeout. */
+ if ((tc->rto_boff == 0 && tc->snd_una == tc->snd_una_max)
+ || (tc->rto_boff > 0 && seq_geq (tc->snd_una, tc->snd_congestion)
+ && !tcp_flight_size (tc)))
{
- tcp_recovery_off (tc);
+ ASSERT (!tcp_in_recovery (tc));
+ tc->rto_boff = 0;
return;
}
- /* We're not in recovery so make sure rto_boff is 0 */
+ /* We're not in recovery so make sure rto_boff is 0. Can be non 0 due
+ * to persist timer timeout */
if (!tcp_in_recovery (tc) && tc->rto_boff > 0)
{
tc->rto_boff = 0;
/* First retransmit timeout */
if (tc->rto_boff == 1)
- tcp_rtx_timeout_cc (tc);
+ tcp_rxt_timeout_cc (tc);
- tc->snd_nxt = tc->snd_una;
- tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+ /* If we've sent beyond snd_congestion, update it */
+ if (seq_gt (tc->snd_una_max, tc->snd_congestion))
+ tc->snd_congestion = tc->snd_una_max;
- TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
+ tc->snd_una_max = tc->snd_nxt = tc->snd_una;
+ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
- /* Send one segment. Note that n_bytes may be zero due to buffer shortfall */
+ /* Send one segment. Note that n_bytes may be zero due to buffer
+ * shortfall */
n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
/* TODO be less aggressive about this */
if (n_bytes == 0)
{
- tcp_retransmit_timer_set (tc);
+ tcp_retransmit_timer_force_update (tc);
return;
}
tc->snd_rxt_ts = tcp_time_now ();
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
- tcp_retransmit_timer_update (tc);
+ tcp_retransmit_timer_force_update (tc);
}
/* Retransmit for SYN */
else if (tc->state == TCP_STATE_SYN_SENT)
}
b = vlib_get_buffer (vm, bi);
+ tcp_init_buffer (vm, b);
tcp_make_synack (tc, b);
TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 1);
|| tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc))
return;
- available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection);
+ available_bytes = session_tx_fifo_max_dequeue (&tc->connection);
offset = tc->snd_una_max - tc->snd_una;
/* Reprogram persist if no new bytes available to send. We may have data
* Try to force the first unsent segment (or buffer)
*/
if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
- return;
+ {
+ tcp_persist_timer_set (tc);
+ return;
+ }
b = vlib_get_buffer (vm, bi);
data = tcp_init_buffer (vm, b);
|| tc->snd_nxt == tc->snd_una_max
|| tc->rto_boff > 1));
- tcp_push_hdr_i (tc, b, tc->state, 0);
+ tcp_push_hdr_i (tc, b, tc->state, /* compute opts */ 0, /* burst */ 0);
+ tc->snd_una_max = tc->snd_nxt;
+ tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
/* Just sent new data, enable retransmit */
tcp_fast_retransmit_sack (tcp_connection_t * tc)
{
vlib_main_t *vm = vlib_get_main ();
- u32 n_written = 0, offset, max_bytes;
+ u32 n_written = 0, offset, max_bytes, n_segs = 0;
vlib_buffer_t *b = 0;
sack_scoreboard_hole_t *hole;
sack_scoreboard_t *sb;
u8 snd_limited = 0, can_rescue = 0;
ASSERT (tcp_in_fastrecovery (tc));
- TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
old_snd_nxt = tc->snd_nxt;
sb = &tc->sack_sb;
- snd_space = tcp_available_snd_space (tc);
+ snd_space = tcp_available_cc_snd_space (tc);
+
+ if (snd_space < tc->snd_mss)
+ goto done;
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
- while (hole && snd_space > 0)
+ while (hole && snd_space > 0 && n_segs++ < VLIB_FRAME_SIZE)
{
hole = scoreboard_next_rxt_hole (sb, hole,
tcp_fastrecovery_sent_1_smss (tc),
tc->snd_nxt = tc->snd_una + offset;
n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes,
&b);
- ASSERT (n_written);
+ if (!n_written)
+ goto done;
+
bi = vlib_get_buffer_index (vm, b);
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
break;
snd_space -= n_written;
}
+done:
/* If window allows, send 1 SMSS of new data */
tc->snd_nxt = old_snd_nxt;
}
/* Start resending from first un-acked segment */
old_snd_nxt = tc->snd_nxt;
tc->snd_nxt = tc->snd_una;
- snd_space = tcp_available_snd_space (tc);
+ snd_space = tcp_available_cc_snd_space (tc);
while (snd_space > 0)
{
void
tcp_fast_retransmit (tcp_connection_t * tc)
{
- if (tcp_opts_sack_permitted (&tc->rcv_opts)
- && scoreboard_first_hole (&tc->sack_sb))
+ if (tcp_opts_sack_permitted (&tc->rcv_opts)) /* choice now depends only on this test; the scoreboard_first_hole () condition was dropped */
tcp_fast_retransmit_sack (tc);
else
tcp_fast_retransmit_no_sack (tc);
}
-always_inline u32
+static u32
tcp_session_has_ooo_data (tcp_connection_t * tc)
{
stream_session_t *s = session_get (tc->c_s_index, tc->c_thread_index);
return svm_fifo_has_ooo_data (s->server_rx_fifo);
}
-always_inline uword
-tcp46_output_inline (vlib_main_t * vm,
- vlib_node_runtime_t * node,
- vlib_frame_t * from_frame, int is_ip4)
+static void /* Pick the next node for an IPv6 peer from its neighbor adjacency; writes *next0 and, on failure, *error0. */
+tcp_output_handle_link_local (tcp_connection_t * tc0, vlib_buffer_t * b0,
+ u16 * next0, u32 * error0)
{
- u32 n_left_from, next_index, *from, *to_next;
- u32 my_thread_index = vm->thread_index;
+ ip_adjacency_t *adj;
+ adj_index_t ai;
- from = vlib_frame_vector_args (from_frame);
- n_left_from = from_frame->n_vectors;
- next_index = node->cached_next_index;
- tcp_set_time_now (my_thread_index);
+ /* Not thread safe but as long as the connection exists the adj should
+ * not be removed */
+ ai = adj_nbr_find (FIB_PROTOCOL_IP6, VNET_LINK_IP6, &tc0->c_rmt_ip,
+ tc0->sw_if_index); /* look up by remote address on the connection's interface */
+ if (ai == ADJ_INDEX_INVALID) /* no adjacency found: drop */
+ {
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0;
+ *next0 = TCP_OUTPUT_NEXT_DROP;
+ *error0 = TCP_ERROR_LINK_LOCAL_RW;
+ return;
+ }
- while (n_left_from > 0)
+ adj = adj_get (ai);
+ if (PREDICT_TRUE (adj->lookup_next_index == IP_LOOKUP_NEXT_REWRITE))
+ *next0 = TCP_OUTPUT_NEXT_IP_REWRITE;
+ else if (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP)
+ *next0 = TCP_OUTPUT_NEXT_IP_ARP;
+ else /* any other adjacency type is treated as an error */
{
- u32 n_left_to_next;
+ *next0 = TCP_OUTPUT_NEXT_DROP;
+ *error0 = TCP_ERROR_LINK_LOCAL_RW;
+ }
+ vnet_buffer (b0)->ip.adj_index[VLIB_TX] = ai; /* NOTE(review): also stored on the drop branch just above */
+}
- vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+static void /* Add a trace record (TCP header plus connection copy) for the leading buffers of a frame. */
+tcp46_output_trace_frame (vlib_main_t * vm, vlib_node_runtime_t * node,
+ u32 * to_next, u32 n_bufs)
+{
+ u32 n_trace = vlib_get_trace_count (vm, node); /* read only; NOTE(review): nothing in this function writes the count back */
+ tcp_connection_t *tc;
+ tcp_tx_trace_t *t;
+ vlib_buffer_t *b;
+ tcp_header_t *th;
+ int i;
- while (n_left_from > 0 && n_left_to_next > 0)
+ for (i = 0; i < clib_min (n_trace, n_bufs); i++) /* NOTE(review): signed i compared with a u32 bound; no per-buffer VLIB_BUFFER_IS_TRACED test, unlike the removed code */
+ {
+ b = vlib_get_buffer (vm, to_next[i]);
+ th = vlib_buffer_get_current (b);
+ tc = tcp_connection_get (vnet_buffer (b)->tcp.connection_index,
+ vm->thread_index);
+ t = vlib_add_trace (vm, node, b, sizeof (*t));
+ clib_memcpy (&t->tcp_header, th, sizeof (t->tcp_header));
+ clib_memcpy (&t->tcp_connection, tc, sizeof (t->tcp_connection)); /* NOTE(review): tc is used without a null check - confirm the lookup cannot fail here */
+ }
+}
+
+always_inline void /* Prepend the IPv4 or IPv6 header for tc0 in front of the TCP header already in b0. */
+tcp_output_push_ip (vlib_main_t * vm, vlib_buffer_t * b0,
+ tcp_connection_t * tc0, u8 is_ip4)
+{
+ tcp_header_t *th0 = 0;
+
+ th0 = vlib_buffer_get_current (b0); /* taken before the IP push, so it points at the TCP header */
+ TCP_EVT_DBG (TCP_EVT_OUTPUT, tc0, th0->flags, b0->current_length);
+ if (is_ip4)
+ {
+ vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, &tc0->c_rmt_ip4,
+ IP_PROTOCOL_TCP, 1);
+ b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; /* presumably defers the TCP checksum to a later stage - confirm */
+ vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data; /* offset of the TCP header from the start of b0->data */
+ th0->checksum = 0;
+ }
+ else
+ {
+ ip6_header_t *ih0;
+ ih0 = vlib_buffer_push_ip6 (vm, b0, &tc0->c_lcl_ip6,
+ &tc0->c_rmt_ip6, IP_PROTOCOL_TCP);
+ b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
+ vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data; /* only the v6 branch records the l3 offset */
+ vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data;
+ th0->checksum = 0;
+ }
+}
+
+always_inline void /* Per-packet output checks: may redirect *next0 to drop and set *error0; otherwise clears the pending-ACK flag. */
+tcp_output_handle_packet (tcp_connection_t * tc0, vlib_buffer_t * b0,
+ u32 * error0, u16 * next0, u8 is_ip4)
+{
+
+ if (PREDICT_FALSE (tc0->state == TCP_STATE_CLOSED)) /* NOTE(review): the removed code also tested tc0 == 0 before dereferencing */
+ {
+ *error0 = TCP_ERROR_INVALID_CONNECTION;
+ *next0 = TCP_OUTPUT_NEXT_DROP;
+ return;
+ }
+
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = tc0->c_fib_index;
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0;
+
+ if (!is_ip4)
+ {
+ if (PREDICT_FALSE (ip6_address_is_link_local_unicast (&tc0->c_rmt_ip6)))
+ tcp_output_handle_link_local (tc0, b0, next0, error0); /* NOTE(review): no early return here even if the helper selected drop */
+ }
+
+ /* Filter out DUPACKs if there are no OOO segments left */
+ if (PREDICT_FALSE (vnet_buffer (b0)->tcp.flags & TCP_BUF_FLAG_DUPACK))
+ {
+ /* N.B. Should not filter burst of dupacks. Two issues:
+ * 1) dupacks open cwnd on remote peer when congested
+ * 2) acks leaving should have the latest rcv_wnd since the
+ * burst may have eaten up all of it, so only the old ones
+ * could be filtered.
+ */
+ if (!tcp_session_has_ooo_data (tc0))
{
- u32 bi0;
- vlib_buffer_t *b0;
- tcp_connection_t *tc0;
- tcp_tx_trace_t *t0;
- tcp_header_t *th0 = 0;
- u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_IP_LOOKUP;
+ *error0 = TCP_ERROR_FILTERED_DUPACKS;
+ *next0 = TCP_OUTPUT_NEXT_DROP;
+ return;
+ }
+ }
- if (n_left_from > 1)
- {
- vlib_buffer_t *pb;
- pb = vlib_get_buffer (vm, from[1]);
- vlib_prefetch_buffer_header (pb, STORE);
- CLIB_PREFETCH (pb->data, 2 * CLIB_CACHE_LINE_BYTES, STORE);
- }
+ /* Stop DELACK timer and fix flags */
+ tc0->flags &= ~(TCP_CONN_SNDACK);
+ if (!TCP_ALWAYS_ACK) /* the timer reset is skipped when TCP_ALWAYS_ACK is non-zero */
+ tcp_timer_reset (tc0, TCP_TIMER_DELACK);
+}
- bi0 = from[0];
- to_next[0] = bi0;
- from += 1;
- to_next += 1;
- n_left_from -= 1;
- n_left_to_next -= 1;
+always_inline uword /* Output loop for one frame; is_ip4 selects the v4 or v6 header push. Returns frame->n_vectors. */
+tcp46_output_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * frame, int is_ip4)
+{
+ u32 n_left_from, *from, thread_index = vm->thread_index;
+ vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b; /* buffer pointers for the whole frame plus a cursor */
+ u16 nexts[VLIB_FRAME_SIZE], *next; /* next-node choice per buffer plus a cursor */
- b0 = vlib_get_buffer (vm, bi0);
- tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
- my_thread_index);
- if (PREDICT_FALSE (tc0 == 0 || tc0->state == TCP_STATE_CLOSED))
- {
- error0 = TCP_ERROR_INVALID_CONNECTION;
- next0 = TCP_OUTPUT_NEXT_DROP;
- goto done;
- }
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ tcp_set_time_now (thread_index);
- th0 = vlib_buffer_get_current (b0);
- TCP_EVT_DBG (TCP_EVT_OUTPUT, tc0, th0->flags, b0->current_length);
+ if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE)) /* tracing is handled for the whole frame up front */
+ tcp46_output_trace_frame (vm, node, from, n_left_from);
- if (is_ip4)
- {
- vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, &tc0->c_rmt_ip4,
- IP_PROTOCOL_TCP, 1);
- b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
- vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data;
- th0->checksum = 0;
- }
- else
- {
- ip6_header_t *ih0;
- ih0 = vlib_buffer_push_ip6 (vm, b0, &tc0->c_lcl_ip6,
- &tc0->c_rmt_ip6, IP_PROTOCOL_TCP);
- b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
- vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data;
- vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data;
- th0->checksum = 0;
- }
+ vlib_get_buffers (vm, from, bufs, n_left_from); /* fill bufs from the buffer indices in 'from' */
+ b = bufs;
+ next = nexts;
- /* Filter out DUPACKs if there are no OOO segments left */
- if (PREDICT_FALSE
- (vnet_buffer (b0)->tcp.flags & TCP_BUF_FLAG_DUPACK))
- {
- /* N.B. Should not filter burst of dupacks. Two issues:
- * 1) dupacks open cwnd on remote peer when congested
- * 2) acks leaving should have the latest rcv_wnd since the
- * burst may have eaten up all of it, so only the old ones
- * could be filtered.
- */
- if (!tcp_session_has_ooo_data (tc0))
- {
- error0 = TCP_ERROR_FILTERED_DUPACKS;
- next0 = TCP_OUTPUT_NEXT_DROP;
- goto done;
- }
- }
+ while (n_left_from >= 4) /* handles two packets per pass; b[2] and b[3] are only prefetched */
+ {
+ u32 error0 = TCP_ERROR_PKTS_SENT, error1 = TCP_ERROR_PKTS_SENT;
+ tcp_connection_t *tc0, *tc1;
- /* Stop DELACK timer and fix flags */
- tc0->flags &= ~(TCP_CONN_SNDACK);
- tcp_timer_reset (tc0, TCP_TIMER_DELACK);
+ {
+ vlib_prefetch_buffer_header (b[2], STORE);
+ CLIB_PREFETCH (b[2]->data, 2 * CLIB_CACHE_LINE_BYTES, STORE);
- /* If not retransmitting
- * 1) update snd_una_max (SYN, SYNACK, FIN)
- * 2) If we're not tracking an ACK, start tracking */
- if (seq_lt (tc0->snd_una_max, tc0->snd_nxt))
- {
- tc0->snd_una_max = tc0->snd_nxt;
- if (tc0->rtt_ts == 0)
- {
- tc0->rtt_ts = tcp_time_now ();
- tc0->rtt_seq = tc0->snd_nxt;
- }
- }
+ vlib_prefetch_buffer_header (b[3], STORE);
+ CLIB_PREFETCH (b[3]->data, 2 * CLIB_CACHE_LINE_BYTES, STORE);
+ }
- /* Set the retransmit timer if not set already and not
- * doing a pure ACK */
- if (!tcp_timer_is_active (tc0, TCP_TIMER_RETRANSMIT)
- && tc0->snd_nxt != tc0->snd_una)
- {
- tcp_retransmit_timer_set (tc0);
- tc0->rto_boff = 0;
- }
+ next[0] = next[1] = TCP_OUTPUT_NEXT_IP_LOOKUP; /* default next node; the handlers below may override it */
-#if 0
- /* Make sure we haven't lost route to our peer */
- if (PREDICT_FALSE (tc0->last_fib_check
- < tc0->snd_opts.tsval + TCP_FIB_RECHECK_PERIOD))
- {
- if (PREDICT_TRUE
- (tc0->c_rmt_fei == tcp_lookup_rmt_in_fib (tc0)))
- {
- tc0->last_fib_check = tc0->snd_opts.tsval;
- }
- else
- {
- clib_warning ("lost connection to peer");
- tcp_connection_reset (tc0);
- goto done;
- }
- }
+ tc0 = tcp_connection_get (vnet_buffer (b[0])->tcp.connection_index,
+ thread_index);
+ tc1 = tcp_connection_get (vnet_buffer (b[1])->tcp.connection_index,
+ thread_index);
- /* Use pre-computed dpo to set next node */
- next0 = tc0->c_rmt_dpo.dpoi_next_node;
- vnet_buffer (b0)->ip.adj_index[VLIB_TX] = tc0->c_rmt_dpo.dpoi_index;
-#endif
+ tcp_output_push_ip (vm, b[0], tc0, is_ip4);
+ tcp_output_push_ip (vm, b[1], tc1, is_ip4);
- vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0;
- vnet_buffer (b0)->sw_if_index[VLIB_TX] = tc0->c_fib_index;
+ tcp_output_handle_packet (tc0, b[0], &error0, &next[0], is_ip4);
+ tcp_output_handle_packet (tc1, b[1], &error1, &next[1], is_ip4); /* NOTE(review): error0 and error1 are
+ * written by the handler but never read in this function; the removed
+ * code stored node->errors[error0] in b0->error - confirm intended */
- b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
- done:
- b0->error = node->errors[error0];
- if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
- {
- t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
- if (th0)
- {
- clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header));
- }
- else
- {
- memset (&t0->tcp_header, 0, sizeof (t0->tcp_header));
- }
- clib_memcpy (&t0->tcp_connection, tc0,
- sizeof (t0->tcp_connection));
- }
+ b += 2;
+ next += 2;
+ n_left_from -= 2;
+ }
+ while (n_left_from > 0) /* remaining packets, one at a time */
+ {
+ u32 error0 = TCP_ERROR_PKTS_SENT;
+ tcp_connection_t *tc0;
- vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
- n_left_to_next, bi0, next0);
+ if (n_left_from > 1)
+ {
+ vlib_prefetch_buffer_header (b[1], STORE);
+ CLIB_PREFETCH (b[1]->data, 2 * CLIB_CACHE_LINE_BYTES, STORE);
}
- vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ next[0] = TCP_OUTPUT_NEXT_IP_LOOKUP;
+ tc0 = tcp_connection_get (vnet_buffer (b[0])->tcp.connection_index,
+ thread_index);
+
+ tcp_output_push_ip (vm, b[0], tc0, is_ip4);
+ tcp_output_handle_packet (tc0, b[0], &error0, &next[0], is_ip4); /* error0 is likewise unused after this call */
+
+ b += 1;
+ next += 1;
+ n_left_from -= 1;
}
- return from_frame->n_vectors;
+ vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors); /* hand every buffer to the next node chosen in nexts */
+ return frame->n_vectors;
}
static uword
VLIB_NODE_FUNCTION_MULTIARCH (tcp6_output_node, tcp6_output);
-u32
-tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b)
-{
- tcp_connection_t *tc;
-
- tc = (tcp_connection_t *) tconn;
- tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0);
- ASSERT (seq_leq (tc->snd_una_max, tc->snd_una + tc->snd_wnd));
-
- if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc))
- {
- tc->rtt_ts = tcp_time_now ();
- tc->rtt_seq = tc->snd_nxt;
- }
- tcp_trajectory_add_start (b, 3);
- return 0;
-}
-
typedef enum _tcp_reset_next
{
TCP_RESET_NEXT_DROP,