{
TCP_OUTPUT_NEXT_DROP,
TCP_OUTPUT_NEXT_IP_LOOKUP,
+ TCP_OUTPUT_NEXT_IP_REWRITE,
+ TCP_OUTPUT_NEXT_IP_ARP,
TCP_OUTPUT_N_NEXT
} tcp_output_next_t;
#define foreach_tcp4_output_next \
_ (DROP, "error-drop") \
- _ (IP_LOOKUP, "ip4-lookup")
+ _ (IP_LOOKUP, "ip4-lookup") \
+ _ (IP_REWRITE, "ip4-rewrite") \
+ _ (IP_ARP, "ip4-arp")
#define foreach_tcp6_output_next \
_ (DROP, "error-drop") \
- _ (IP_LOOKUP, "ip6-lookup")
+ _ (IP_LOOKUP, "ip6-lookup") \
+ _ (IP_REWRITE, "ip6-rewrite") \
+ _ (IP_ARP, "ip6-discover-neighbor")
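+/* The rewrite/arp nexts let tcp-output hand a packet straight to the
+ * adjacency when ip-lookup can't be used; in this patch only the ip6
+ * variants are exercised (for link-local peers, see
+ * tcp_output_handle_link_local below), with the ip4 entries presumably
+ * added for symmetry. */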
static char *tcp_error_strings[] = {
#define tcp_error(n,s) s,
CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
tcp_tx_trace_t *t = va_arg (*args, tcp_tx_trace_t *);
- uword indent = format_get_indent (s);
+ u32 indent = format_get_indent (s);
s = format (s, "%U\n%U%U",
format_tcp_header, &t->tcp_header, 128,
}
static u8
-tcp_window_compute_scale (u32 available_space)
+tcp_window_compute_scale (u32 window)
{
u8 wnd_scale = 0;
- while (wnd_scale < TCP_MAX_WND_SCALE
- && (available_space >> wnd_scale) > TCP_WND_MAX)
+ while (wnd_scale < TCP_MAX_WND_SCALE && (window >> wnd_scale) > TCP_WND_MAX)
wnd_scale++;
return wnd_scale;
}
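/* Worked example: assuming TCP_WND_MAX is the 16-bit maximum (65535),
 * a 1MB window yields wnd_scale = 5, since (1MB >> 4) = 65536 still
 * overflows the window field while (1MB >> 5) = 32768 fits. */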
/*
* Figure out how much space we have available
*/
- available_space = stream_session_max_rx_enqueue (&tc->connection);
- max_fifo = stream_session_rx_fifo_size (&tc->connection);
+ available_space = transport_max_rx_enqueue (&tc->connection);
+ max_fifo = transport_rx_fifo_size (&tc->connection);
ASSERT (tc->rcv_opts.mss < max_fifo);
if (available_space < tc->rcv_opts.mss && available_space < max_fifo >> 3)
{
case TCP_STATE_ESTABLISHED:
case TCP_STATE_FIN_WAIT_1:
+ case TCP_STATE_CLOSED:
+ case TCP_STATE_CLOSE_WAIT:
return tcp_make_established_options (tc, opts);
case TCP_STATE_SYN_RCVD:
return tcp_make_synack_options (tc, opts);
case TCP_STATE_SYN_SENT:
return tcp_make_syn_options (opts, tc->rcv_wscale);
default:
- clib_warning ("Not handled!");
+ clib_warning ("State not handled! %d", state);
return 0;
}
}
}
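/* Mapping CLOSED/CLOSE_WAIT to established options means ACKs and FINs
 * emitted in those states still carry the negotiated options (e.g.
 * timestamps) instead of falling through to the warning default. */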
always_inline int
-tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers)
+tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u16 * n_bufs,
+ u32 wanted)
{
- u32 current_length = vec_len (tm->tx_buffers[thread_index]);
-
- vec_validate (tm->tx_buffers[thread_index],
- current_length + n_free_buffers - 1);
- _vec_len (tm->tx_buffers[thread_index]) =
- current_length + vlib_buffer_alloc_from_free_list (vlib_get_main (),
- tm->tx_buffers
- [thread_index],
- n_free_buffers,
- VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
- /* buffer shortage, report failure */
- if (vec_len (tm->tx_buffers[thread_index]) == 0)
- {
- clib_warning ("out of buffers");
- return -1;
- }
- return 0;
+ vlib_main_t *vm = vlib_get_main ();
+ u32 n_alloc;
+
+ ASSERT (wanted > *n_bufs);
+ vec_validate_aligned (tm->tx_buffers[thread_index], wanted - 1,
+ CLIB_CACHE_LINE_BYTES);
+ n_alloc = vlib_buffer_alloc (vm, &tm->tx_buffers[thread_index][*n_bufs],
+ wanted - *n_bufs);
+ *n_bufs += n_alloc;
+ _vec_len (tm->tx_buffers[thread_index]) = *n_bufs;
+ return n_alloc;
}
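/* Note the changed contract: the function now returns the number of
 * buffers actually allocated (0 signals shortage) instead of 0/-1,
 * and tracks the running total through *n_bufs. */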
always_inline int
tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx)
{
- u32 *my_tx_buffers;
u32 thread_index = vlib_get_thread_index ();
- if (PREDICT_FALSE (vec_len (tm->tx_buffers[thread_index]) == 0))
+ u16 n_bufs = vec_len (tm->tx_buffers[thread_index]);
+
+ TCP_DBG_BUFFER_ALLOC_MAYBE_FAIL (thread_index);
+
+ if (PREDICT_FALSE (!n_bufs))
{
- if (tcp_alloc_tx_buffers (tm, thread_index, VLIB_FRAME_SIZE))
- return -1;
+ if (!tcp_alloc_tx_buffers (tm, thread_index, &n_bufs, VLIB_FRAME_SIZE))
+ {
+ *bidx = ~0;
+ return -1;
+ }
}
- my_tx_buffers = tm->tx_buffers[thread_index];
- *bidx = my_tx_buffers[_vec_len (my_tx_buffers) - 1];
- _vec_len (my_tx_buffers) -= 1;
+ *bidx = tm->tx_buffers[thread_index][--n_bufs];
+ _vec_len (tm->tx_buffers[thread_index]) = n_bufs;
return 0;
}
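/* Typical caller pattern, as used throughout this file:
 *
 *   u32 bi;
 *   if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
 *     return;			/* buffer shortage, retry later */
 *   b = vlib_get_buffer (vm, bi);
 *   tcp_init_buffer (vm, b);
 */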
-always_inline void
-tcp_return_buffer (tcp_main_t * tm)
-{
- u32 *my_tx_buffers;
- u32 thread_index = vlib_get_thread_index ();
- my_tx_buffers = tm->tx_buffers[thread_index];
- _vec_len (my_tx_buffers) += 1;
-}
-
always_inline void *
tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b)
{
if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
vlib_buffer_free_one (vm, b->next_buffer);
- b->flags = 0;
+ /* Zero all flags but free list index and trace flag */
+ b->flags &= VLIB_BUFFER_NEXT_PRESENT - 1;
b->current_data = 0;
b->current_length = 0;
b->total_length_not_including_first_buffer = 0;
tcp_init_buffer (vlib_main_t * vm, vlib_buffer_t * b)
{
ASSERT ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
- b->flags = VNET_BUFFER_F_LOCALLY_ORIGINATED;
+ b->flags &= VLIB_BUFFER_NON_DEFAULT_FREELIST;
+ b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
b->total_length_not_including_first_buffer = 0;
+ b->current_data = 0;
vnet_buffer (b)->tcp.flags = 0;
-
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b);
/* Leave enough space for headers */
return vlib_buffer_make_headroom (b, MAX_HDRS_LEN);
}
/* Reset flags, make sure ack is sent */
vnet_buffer (b)->tcp.flags &= ~TCP_BUF_FLAG_DUPACK;
-
- tc->snd_nxt += 1;
 }
+
+/**
+ * Convert buffer to SYN
+ */
+void
+tcp_make_syn (tcp_connection_t * tc, vlib_buffer_t * b)
+{
+ u8 tcp_hdr_opts_len, tcp_opts_len;
+ tcp_header_t *th;
+ u16 initial_wnd;
+ tcp_options_t snd_opts;
+
+ initial_wnd = tcp_initial_window_to_advertise (tc);
+
+ /* Make and write options */
+ memset (&snd_opts, 0, sizeof (snd_opts));
+ tcp_opts_len = tcp_make_syn_options (&snd_opts, tc->rcv_wscale);
+ tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);
+
+ th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss,
+ tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN,
+ initial_wnd);
+ vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
+ tcp_options_write ((u8 *) (th + 1), &snd_opts);
}
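/* The ISS selection and snd_nxt accounting that used to live in the
 * send path (see the deletions below) are assumed to happen at
 * connection initialization now, which lets tcp_make_syn be reused
 * unchanged for both the first SYN and SYN retransmits. */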
/**
u8 tcp_opts_len, tcp_hdr_opts_len;
tcp_header_t *th;
u16 initial_wnd;
- u32 time_now;
memset (snd_opts, 0, sizeof (*snd_opts));
-
tcp_reuse_buffer (vm, b);
- /* Set random initial sequence */
- time_now = tcp_time_now ();
-
- tc->iss = random_u32 (&time_now);
- tc->snd_una = tc->iss;
- tc->snd_nxt = tc->iss + 1;
- tc->snd_una_max = tc->snd_nxt;
-
initial_wnd = tcp_initial_window_to_advertise (tc);
-
- /* Make and write options */
tcp_opts_len = tcp_make_synack_options (tc, snd_opts);
tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);
th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss,
tc->rcv_nxt, tcp_hdr_opts_len,
TCP_FLAG_SYN | TCP_FLAG_ACK, initial_wnd);
-
tcp_options_write ((u8 *) (th + 1), snd_opts);
vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK;
- /* Init retransmit timer */
- tcp_retransmit_timer_set (tc);
+ /* Init retransmit timer. Use update instead of set because the timer
+ * may already be running for a retransmitted SYN-ACK */
+ tcp_retransmit_timer_force_update (tc);
TCP_EVT_DBG (TCP_EVT_SYNACK_SENT, tc);
}
always_inline void
-tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
- u8 is_ip4)
+tcp_enqueue_to_ip_lookup_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
+ u8 is_ip4, u32 fib_index, u8 flush)
{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ u32 thread_index = vlib_get_thread_index ();
u32 *to_next, next_index;
vlib_frame_t *f;
b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
b->error = 0;
- /* Default FIB for now */
- vnet_buffer (b)->sw_if_index[VLIB_TX] = 0;
+ vnet_buffer (b)->sw_if_index[VLIB_TX] = fib_index;
+ vnet_buffer (b)->sw_if_index[VLIB_RX] = 0;
/* Send to IP lookup */
next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index;
- f = vlib_get_frame_to_node (vm, next_index);
+ tcp_trajectory_add_start (b, 1);
+
+ f = tm->ip_lookup_tx_frames[!is_ip4][thread_index];
+ if (!f)
+ {
+ f = vlib_get_frame_to_node (vm, next_index);
+ ASSERT (f);
+ tm->ip_lookup_tx_frames[!is_ip4][thread_index] = f;
+ }
- /* Enqueue the packet */
to_next = vlib_frame_vector_args (f);
- to_next[0] = bi;
- f->n_vectors = 1;
- vlib_put_frame_to_node (vm, next_index, f);
+ to_next[f->n_vectors] = bi;
+ f->n_vectors += 1;
+ if (flush || f->n_vectors == VLIB_FRAME_SIZE)
+ {
+ vlib_put_frame_to_node (vm, next_index, f);
+ tm->ip_lookup_tx_frames[!is_ip4][thread_index] = 0;
+ }
+}
+
+always_inline void
+tcp_enqueue_to_ip_lookup_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
+ u8 is_ip4, u32 fib_index)
+{
+ tcp_enqueue_to_ip_lookup_i (vm, b, bi, is_ip4, fib_index, 1);
+}
+
+always_inline void
+tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
+ u8 is_ip4, u32 fib_index)
+{
+ tcp_enqueue_to_ip_lookup_i (vm, b, bi, is_ip4, fib_index, 0);
+ if (vm->thread_index == 0 && vlib_num_workers ())
+ session_flush_frames_main_thread (vm);
}
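/* Buffers are batched into a per-thread frame and handed to
 * ip4/ip6-lookup only once the frame fills (VLIB_FRAME_SIZE) or a
 * flush is requested; the _now variant flushes eagerly, and sends from
 * the main thread also flush session frames, presumably so packets
 * aren't stranded until the next poll. */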
always_inline void
/* Decide where to send the packet */
next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index;
-
- /* Initialize the trajectory trace, if configured */
- if (VLIB_BUFFER_TRACE_TRAJECTORY > 0)
- {
- b->pre_data[0] = 1;
- b->pre_data[1] = next_index;
- }
+ tcp_trajectory_add_start (b, 2);
/* Get frame to v4/6 output node */
f = tm->tx_frames[!is_ip4][thread_index];
}
tcp_reuse_buffer (vm, b0);
+ tcp_trajectory_add_start (b0, 4);
th0 = vlib_buffer_push_tcp_net_order (b0, dst_port, src_port, seq, ack,
sizeof (tcp_header_t), flags, 0);
tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4)
{
vlib_buffer_t *b;
- u32 bi;
+ u32 bi, sw_if_index, fib_index;
tcp_main_t *tm = vnet_get_tcp_main ();
vlib_main_t *vm = vlib_get_main ();
u8 tcp_hdr_len, flags = 0;
u32 seq, ack;
ip4_header_t *ih4, *pkt_ih4;
ip6_header_t *ih6, *pkt_ih6;
+ fib_protocol_t fib_proto;
if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
return;
b = vlib_get_buffer (vm, bi);
+ sw_if_index = vnet_buffer (pkt)->sw_if_index[VLIB_RX];
+ fib_proto = is_ip4 ? FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6;
+ fib_index = fib_table_get_index_for_sw_if_index (fib_proto, sw_if_index);
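+ /* A reset may answer a segment for which no connection exists, so
+ * the fib index must be recovered from the rx interface of the
+ * offending packet rather than from a connection. */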
tcp_init_buffer (vm, b);
/* Make and write options */
ASSERT (!bogus);
}
- tcp_enqueue_to_ip_lookup (vm, b, bi, is_ip4);
+ tcp_enqueue_to_ip_lookup_now (vm, b, bi, is_ip4, fib_index);
TCP_EVT_DBG (TCP_EVT_RST_SENT, tc);
}
opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts);
ASSERT (opts_write_len == tc->snd_opts_len);
vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
- tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4);
+ if (tc->c_is_ip4)
+ {
+ ip4_header_t *ih4;
+ ih4 = vlib_buffer_push_ip4 (vm, b, &tc->c_lcl_ip.ip4,
+ &tc->c_rmt_ip.ip4, IP_PROTOCOL_TCP, 0);
+ th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih4);
+ }
+ else
+ {
+ int bogus = ~0;
+ ip6_header_t *ih6;
+ ih6 = vlib_buffer_push_ip6 (vm, b, &tc->c_lcl_ip.ip6,
+ &tc->c_rmt_ip.ip6, IP_PROTOCOL_TCP);
+ th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih6, &bogus);
+ ASSERT (!bogus);
+ }
+ tcp_enqueue_to_ip_lookup_now (vm, b, bi, tc->c_is_ip4, tc->c_fib_index);
+ TCP_EVT_DBG (TCP_EVT_RST_SENT, tc);
}
void
u32 bi;
tcp_main_t *tm = vnet_get_tcp_main ();
vlib_main_t *vm = vlib_get_main ();
- u8 tcp_hdr_opts_len, tcp_opts_len;
- tcp_header_t *th;
- u32 time_now;
- u16 initial_wnd;
- tcp_options_t snd_opts;
+
+ /*
+ * Set up retransmit and establish timers before requesting a buffer,
+ * so that we can return early if we've run out.
+ */
+ tcp_timer_set (tc, TCP_TIMER_ESTABLISH, TCP_ESTABLISH_TIME);
+ tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN,
+ tc->rto * TCP_TO_TIMER_TICK);
if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
return;
b = vlib_get_buffer (vm, bi);
tcp_init_buffer (vm, b);
-
- /* Set random initial sequence */
- time_now = tcp_time_now ();
-
- tc->iss = random_u32 (&time_now);
- tc->snd_una = tc->iss;
- tc->snd_una_max = tc->snd_nxt = tc->iss + 1;
-
- initial_wnd = tcp_initial_window_to_advertise (tc);
-
- /* Make and write options */
- memset (&snd_opts, 0, sizeof (snd_opts));
- tcp_opts_len = tcp_make_syn_options (&snd_opts, tc->rcv_wscale);
- tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);
-
- th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss,
- tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN,
- initial_wnd);
-
- tcp_options_write ((u8 *) (th + 1), &snd_opts);
+ tcp_make_syn (tc, b);
/* Measure RTT with this */
tc->rtt_ts = tcp_time_now ();
tc->rtt_seq = tc->snd_nxt;
-
- /* Start retransmit trimer */
- tcp_timer_set (tc, TCP_TIMER_RETRANSMIT_SYN, tc->rto * TCP_TO_TIMER_TICK);
tc->rto_boff = 0;
- /* Set the connection establishment timer */
- tcp_timer_set (tc, TCP_TIMER_ESTABLISH, TCP_ESTABLISH_TIME);
-
tcp_push_ip_hdr (tm, tc, b);
- tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4);
+ tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4, tc->c_fib_index);
TCP_EVT_DBG (TCP_EVT_SYN_SENT, tc);
}
}
/**
- * Flush both v4 and v6 tx frames for thread index
+ * Flush ip lookup tx frames populated by timer pops
+ */
+always_inline void
+tcp_flush_frame_to_ip_lookup (vlib_main_t * vm, u8 thread_index, u8 is_ip4)
+{
+ if (tcp_main.ip_lookup_tx_frames[!is_ip4][thread_index])
+ {
+ u32 next_index;
+ next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index;
+ vlib_put_frame_to_node (vm, next_index,
+ tcp_main.ip_lookup_tx_frames[!is_ip4]
+ [thread_index]);
+ tcp_main.ip_lookup_tx_frames[!is_ip4][thread_index] = 0;
+ }
+}
+
+/**
+ * Flush v4 and v6 tcp and ip-lookup tx frames for thread index
*/
void
tcp_flush_frames_to_output (u8 thread_index)
vlib_main_t *vm = vlib_get_main ();
tcp_flush_frame_to_output (vm, thread_index, 1);
tcp_flush_frame_to_output (vm, thread_index, 0);
+ tcp_flush_frame_to_ip_lookup (vm, thread_index, 1);
+ tcp_flush_frame_to_ip_lookup (vm, thread_index, 0);
}
/**
void
tcp_send_fin (tcp_connection_t * tc)
{
- vlib_buffer_t *b;
- u32 bi;
tcp_main_t *tm = vnet_get_tcp_main ();
vlib_main_t *vm = vlib_get_main ();
+ vlib_buffer_t *b;
+ u32 bi;
+ u8 fin_snt = 0;
+ tcp_retransmit_timer_force_update (tc);
if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
return;
b = vlib_get_buffer (vm, bi);
- /* buffer will be initialized by in tcp_make_fin */
+ tcp_init_buffer (vm, b);
+ fin_snt = tc->flags & TCP_CONN_FINSNT;
+ if (fin_snt)
+ tc->snd_nxt = tc->snd_una;
tcp_make_fin (tc, b);
tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4);
- tc->flags |= TCP_CONN_FINSNT;
- tc->flags &= ~TCP_CONN_FINPNDG;
- tcp_retransmit_timer_force_update (tc);
+ if (!fin_snt)
+ {
+ tc->flags |= TCP_CONN_FINSNT;
+ tc->flags &= ~TCP_CONN_FINPNDG;
+ /* Account for the FIN */
+ tc->snd_una_max += 1;
+ tc->snd_nxt = tc->snd_una_max;
+ }
+ else
+ {
+ tc->snd_nxt = tc->snd_una_max;
+ }
TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc);
}
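/* The FIN consumes one sequence number, accounted for exactly once:
 * on first send snd_una_max is bumped; on retransmit snd_nxt is
 * rewound to snd_una so the same FIN sequence is resent, then
 * restored to snd_una_max. */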
if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
return;
b = vlib_get_buffer (vm, bi);
+ tcp_init_buffer (vm, b);
/* Fill in the ACK */
tcp_make_ack (tc, b);
/*
* Make sure we can retransmit something
*/
- available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection);
+ available_bytes = session_tx_fifo_max_dequeue (&tc->connection);
+ ASSERT (available_bytes >= offset);
available_bytes -= offset;
if (!available_bytes)
return 0;
* Allocate and fill in buffer(s)
*/
- if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
- return 0;
- *b = vlib_get_buffer (vm, bi);
- data = tcp_init_buffer (vm, *b);
-
/* Easy case, segment fits into a single buffer */
if (PREDICT_TRUE (seg_size <= tm->bytes_per_buffer))
{
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return 0;
+ *b = vlib_get_buffer (vm, bi);
+ data = tcp_init_buffer (vm, *b);
n_bytes = stream_session_peek_bytes (&tc->connection, data, offset,
max_deq_bytes);
ASSERT (n_bytes == max_deq_bytes);
vlib_buffer_t *chain_b, *prev_b;
int i;
- n_bufs_per_seg = ceil ((double) seg_size / tm->bytes_per_buffer);
-
/* Make sure we have enough buffers */
+ n_bufs_per_seg = ceil ((double) seg_size / tm->bytes_per_buffer);
available_bufs = vec_len (tm->tx_buffers[thread_index]);
if (n_bufs_per_seg > available_bufs)
{
- if (tcp_alloc_tx_buffers (tm, thread_index,
- VLIB_FRAME_SIZE - available_bufs))
+ tcp_alloc_tx_buffers (tm, thread_index, &available_bufs,
+ VLIB_FRAME_SIZE);
+
+ if (n_bufs_per_seg > available_bufs)
{
- tcp_return_buffer (tm);
+ *b = 0;
return 0;
}
}
+ tcp_get_free_buffer_index (tm, &bi);
+ ASSERT (bi != (u32) ~ 0);
+ *b = vlib_get_buffer (vm, bi);
+ data = tcp_init_buffer (vm, *b);
n_bytes = stream_session_peek_bytes (&tc->connection, data, offset,
tm->bytes_per_buffer -
MAX_HDRS_LEN);
ASSERT (n_peeked == len_to_deq);
n_bytes += n_peeked;
chain_b->current_length = n_peeked;
- chain_b->flags = 0;
chain_b->next_buffer = 0;
/* update previous buffer */
* Reset congestion control, switch cwnd to loss window and try again.
*/
static void
-tcp_rtx_timeout_cc (tcp_connection_t * tc)
+tcp_rxt_timeout_cc (tcp_connection_t * tc)
{
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 6);
tc->prev_ssthresh = tc->ssthresh;
tc->prev_cwnd = tc->cwnd;
tcp_cc_fastrecovery_exit (tc);
/* Start again from the beginning */
- tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss);
+ tc->cc_algo->congestion (tc);
tc->cwnd = tcp_loss_wnd (tc);
tc->snd_congestion = tc->snd_una_max;
+ tc->rtt_ts = 0;
+ tc->cwnd_acc_bytes = 0;
tcp_recovery_on (tc);
}
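/* After this, the handler below doubles rto on every timeout, capped
 * at TCP_RTO_MAX, e.g. (assuming a 1s initial rto) 1, 2, 4, ...
 * seconds until the cap. */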
tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID;
}
- if (!tcp_in_recovery (tc) && tc->rto_boff > 0
- && tc->state >= TCP_STATE_ESTABLISHED)
- {
- tc->rto_boff = 0;
- tcp_update_rto (tc);
- }
-
- /* Increment RTO backoff (also equal to number of retries) */
- tc->rto_boff += 1;
-
- /* Go back to first un-acked byte */
- tc->snd_nxt = tc->snd_una;
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
if (tc->state >= TCP_STATE_ESTABLISHED)
{
if (tcp_is_lost_fin (tc))
{
tcp_send_fin (tc);
+ tc->rto_boff += 1;
+ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+ return;
+ }
+
+ /* Shouldn't be here: nothing is outstanding or recovery already completed */
+ if ((tc->rto_boff == 0 && tc->snd_una == tc->snd_una_max)
+ || (tc->rto_boff > 0 && seq_geq (tc->snd_una, tc->snd_congestion)))
+ {
+ tcp_recovery_off (tc);
return;
}
+ /* We're not in recovery, so make sure rto_boff is 0 */
+ if (!tcp_in_recovery (tc) && tc->rto_boff > 0)
+ {
+ tc->rto_boff = 0;
+ tcp_update_rto (tc);
+ }
+
+ /* Increment RTO backoff (also equal to number of retries) and go back
+ * to first un-acked byte */
+ tc->rto_boff += 1;
+
/* First retransmit timeout */
if (tc->rto_boff == 1)
- tcp_rtx_timeout_cc (tc);
+ tcp_rxt_timeout_cc (tc);
- /* Exponential backoff */
+ tc->snd_una_max = tc->snd_nxt = tc->snd_una;
tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
- TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
-
/* Send one segment. Note that n_bytes may be zero due to buffer shortfall */
n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
if (n_bytes == 0)
{
- if (b)
- {
- clib_warning ("retransmit fail: %U", format_tcp_connection, tc,
- 2);
- ASSERT (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion);
- }
- /* Try again eventually */
tcp_retransmit_timer_set (tc);
return;
}
/* For first retransmit, record timestamp (Eifel detection RFC3522) */
if (tc->rto_boff == 1)
tc->snd_rxt_ts = tcp_time_now ();
+
+ tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+ tcp_retransmit_timer_update (tc);
}
- /* Retransmit for SYN/SYNACK */
- else if (tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_SYN_SENT)
+ /* Retransmit for SYN */
+ else if (tc->state == TCP_STATE_SYN_SENT)
{
/* Half-open connection actually moved to established but we were
 * waiting for the syn retransmit timer to pop so that cleanup runs
 * on the right thread. */
if (tc->flags & TCP_CONN_HALF_OPEN_DONE)
{
- ASSERT (tc->state == TCP_STATE_SYN_SENT);
if (tcp_half_open_connection_cleanup (tc))
{
clib_warning ("could not remove half-open connection");
/* Try without increasing RTO a number of times. If this fails,
* start growing RTO exponentially */
+ tc->rto_boff += 1;
if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+ tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN,
+ tc->rto * TCP_TO_TIMER_TICK);
+
if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
- {
- clib_warning ("tcp_get_free_buffer_index FAIL");
- return;
- }
+ return;
+
b = vlib_get_buffer (vm, bi);
tcp_init_buffer (vm, b);
- tcp_push_hdr_i (tc, b, tc->state, 1);
+ tcp_make_syn (tc, b);
- /* Account for the SYN */
- tc->snd_nxt += 1;
tc->rtt_ts = 0;
- TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc,
- (tc->state == TCP_STATE_SYN_SENT ? 0 : 1));
+ TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 0);
+
+ /* This goes straight to ipx_lookup. Retransmit timer set already */
+ tcp_push_ip_hdr (tm, tc, b);
+ tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4, tc->c_fib_index);
}
- else
+ /* Retransmit SYN-ACK */
+ else if (tc->state == TCP_STATE_SYN_RCVD)
{
- ASSERT (tc->state == TCP_STATE_CLOSED);
- clib_warning ("connection closed ...");
- return;
- }
+ tc->rto_boff += 1;
+ if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
+ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+ tc->rtt_ts = 0;
- if (!is_syn)
- {
- tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ {
+ tcp_retransmit_timer_force_update (tc);
+ return;
+ }
- /* Re-enable retransmit timer */
- tcp_retransmit_timer_set (tc);
+ b = vlib_get_buffer (vm, bi);
+ tcp_init_buffer (vm, b);
+ tcp_make_synack (tc, b);
+ TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 1);
+
+ /* Retransmit timer already updated, just enqueue to output */
+ tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
}
else
{
- ASSERT (tc->state == TCP_STATE_SYN_SENT);
-
- /* This goes straight to ipx_lookup */
- tcp_push_ip_hdr (tm, tc, b);
- tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4);
-
- /* Re-enable retransmit timer */
- tcp_timer_set (tc, TCP_TIMER_RETRANSMIT_SYN,
- tc->rto * TCP_TO_TIMER_TICK);
+ ASSERT (tc->state == TCP_STATE_CLOSED);
+ return;
}
}
u32 thread_index = vlib_get_thread_index ();
tcp_connection_t *tc;
vlib_buffer_t *b;
- u32 bi, old_snd_nxt, max_snd_bytes, available_bytes, offset;
+ u32 bi, max_snd_bytes, available_bytes, offset;
int n_bytes = 0;
u8 *data;
|| tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc))
return;
- available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection);
+ available_bytes = session_tx_fifo_max_dequeue (&tc->connection);
offset = tc->snd_una_max - tc->snd_una;
/* Reprogram persist if no new bytes available to send. We may have data
n_bytes = stream_session_peek_bytes (&tc->connection, data, offset,
max_snd_bytes);
b->current_length = n_bytes;
- ASSERT (n_bytes != 0 && (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1
- || tcp_timer_is_active (tc,
- TCP_TIMER_RETRANSMIT)));
+ ASSERT (n_bytes != 0 && (tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)
+ || tc->snd_nxt == tc->snd_una_max
+ || tc->rto_boff > 1));
- /* Allow updating of snd_una_max but don't update snd_nxt */
- old_snd_nxt = tc->snd_nxt;
tcp_push_hdr_i (tc, b, tc->state, 0);
- tc->snd_nxt = old_snd_nxt;
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
/* Just sent new data, enable retransmit */
tcp_fast_retransmit_sack (tcp_connection_t * tc)
{
vlib_main_t *vm = vlib_get_main ();
- u32 n_written = 0, offset, max_bytes;
+ u32 n_written = 0, offset, max_bytes, n_segs = 0;
vlib_buffer_t *b = 0;
sack_scoreboard_hole_t *hole;
sack_scoreboard_t *sb;
u8 snd_limited = 0, can_rescue = 0;
ASSERT (tcp_in_fastrecovery (tc));
- TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
old_snd_nxt = tc->snd_nxt;
sb = &tc->sack_sb;
- snd_space = tcp_available_snd_space (tc);
+ snd_space = tcp_available_cc_snd_space (tc);
+ if (snd_space < tc->snd_mss)
+ goto done;
+
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
- while (hole && snd_space > 0)
+ while (hole && snd_space > 0 && n_segs++ < VLIB_FRAME_SIZE)
{
hole = scoreboard_next_rxt_hole (sb, hole,
tcp_fastrecovery_sent_1_smss (tc),
tc->snd_nxt = tc->snd_una + offset;
n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes,
&b);
- ASSERT (n_written);
+ if (!n_written)
+ goto done;
+
bi = vlib_get_buffer_index (vm, b);
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
break;
snd_space -= n_written;
}
+done:
/* If window allows, send 1 SMSS of new data */
tc->snd_nxt = old_snd_nxt;
}
/* Start resending from first un-acked segment */
old_snd_nxt = tc->snd_nxt;
tc->snd_nxt = tc->snd_una;
- snd_space = tcp_available_snd_space (tc);
+ snd_space = tcp_available_cc_snd_space (tc);
while (snd_space > 0)
{
void
tcp_fast_retransmit (tcp_connection_t * tc)
{
- if (tcp_opts_sack_permitted (&tc->rcv_opts)
- && scoreboard_first_hole (&tc->sack_sb))
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
tcp_fast_retransmit_sack (tc);
else
tcp_fast_retransmit_no_sack (tc);
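/* The scoreboard first-hole check was dropped: with SACK permitted the
 * SACK path is always taken and now bails out itself (via done:) when
 * snd_space or prepared bytes come up empty. */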
always_inline u32
tcp_session_has_ooo_data (tcp_connection_t * tc)
{
- stream_session_t *s =
- stream_session_get (tc->c_s_index, tc->c_thread_index);
+ stream_session_t *s = session_get (tc->c_s_index, tc->c_thread_index);
return svm_fifo_has_ooo_data (s->server_rx_fifo);
}
+static void
+tcp_output_handle_link_local (tcp_connection_t * tc0, vlib_buffer_t * b0,
+ u32 * next0, u32 * error0)
+{
+ ip_adjacency_t *adj;
+ adj_index_t ai;
+
+ /* Not thread safe but as long as the connection exists the adj should
+ * not be removed */
+ ai = adj_nbr_find (FIB_PROTOCOL_IP6, VNET_LINK_IP6, &tc0->c_rmt_ip,
+ tc0->sw_if_index);
+ if (ai == ADJ_INDEX_INVALID)
+ {
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0;
+ *next0 = TCP_OUTPUT_NEXT_DROP;
+ *error0 = TCP_ERROR_LINK_LOCAL_RW;
+ return;
+ }
+
+ adj = adj_get (ai);
+ if (PREDICT_TRUE (adj->lookup_next_index == IP_LOOKUP_NEXT_REWRITE))
+ *next0 = TCP_OUTPUT_NEXT_IP_REWRITE;
+ else if (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP)
+ *next0 = TCP_OUTPUT_NEXT_IP_ARP;
+ else
+ {
+ *next0 = TCP_OUTPUT_NEXT_DROP;
+ *error0 = TCP_ERROR_LINK_LOCAL_RW;
+ }
+ vnet_buffer (b0)->ip.adj_index[VLIB_TX] = ai;
+}
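+/* Link-local ip6 addresses are scoped to an interface, so a fib lookup
+ * can't resolve them; instead the neighbor adjacency is found directly
+ * and the packet is steered to ip6-rewrite or neighbor discovery. */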
+
always_inline uword
tcp46_output_inline (vlib_main_t * vm,
vlib_node_runtime_t * node,
tcp_header_t *th0 = 0;
u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_IP_LOOKUP;
+ if (n_left_from > 1)
+ {
+ vlib_buffer_t *pb;
+ pb = vlib_get_buffer (vm, from[1]);
+ vlib_prefetch_buffer_header (pb, STORE);
+ CLIB_PREFETCH (pb->data, 2 * CLIB_CACHE_LINE_BYTES, STORE);
+ }
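+ /* Prefetching the next buffer's header and first data lines lets the
+ * header writes for this packet overlap the memory fetches for the
+ * next one. */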
+
bi0 = from[0];
to_next[0] = bi0;
from += 1;
th0 = vlib_buffer_get_current (b0);
TCP_EVT_DBG (TCP_EVT_OUTPUT, tc0, th0->flags, b0->current_length);
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = tc0->c_fib_index;
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0;
if (is_ip4)
{
vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data;
vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data;
th0->checksum = 0;
+
+ if (PREDICT_FALSE
+ (ip6_address_is_link_local_unicast (&tc0->c_rmt_ip6)))
+ tcp_output_handle_link_local (tc0, b0, &next0, &error0);
}
/* Filter out DUPACKs if there are no OOO segments left */
if (PREDICT_FALSE
(vnet_buffer (b0)->tcp.flags & TCP_BUF_FLAG_DUPACK))
{
+ /* N.B. Should not filter burst of dupacks. Two issues:
+ * 1) dupacks open cwnd on remote peer when congested
+ * 2) acks leaving should have the latest rcv_wnd since the
+ * burst may have eaten up all of it, so only the old ones
+ * could be filtered.
+ */
if (!tcp_session_has_ooo_data (tc0))
{
error0 = TCP_ERROR_FILTERED_DUPACKS;
vnet_buffer (b0)->ip.adj_index[VLIB_TX] = tc0->c_rmt_dpo.dpoi_index;
#endif
- vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0;
- vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0;
-
- b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
done:
b0->error = node->errors[error0];
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
tc->rtt_ts = tcp_time_now ();
tc->rtt_seq = tc->snd_nxt;
}
+ tcp_trajectory_add_start (b, 3);
return 0;
}
}
/* Prepare to send to IP lookup */
- vnet_buffer (b0)->sw_if_index[VLIB_TX] = 0;
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0;
next0 = TCP_RESET_NEXT_IP_LOOKUP;
done: