vlib_node_registration_t tcp4_output_node;
vlib_node_registration_t tcp6_output_node;
-typedef enum _tcp_output_nect
+typedef enum _tcp_output_next
{
TCP_OUTPUT_NEXT_DROP,
TCP_OUTPUT_NEXT_IP_LOOKUP,
}
/**
- * TCP's IW as recommended by RFC6928
+ * Update max segment size we're able to process.
+ *
+ * The value is constrained by our interface's MTU and IP options. It is
+ * also what we advertise to our peer.
+ */
+void
+tcp_update_rcv_mss (tcp_connection_t * tc)
+{
+ /* TODO find our iface MTU */
+ tc->mss = dummy_mtu;
+}
+
+/**
+ * TCP's initial window
*/
always_inline u32
tcp_initial_wnd_unscaled (tcp_connection_t * tc)
{
- return TCP_IW_N_SEGMENTS * tc->mss;
+ /* RFC 6928 recommends the value below. However, at the time our connections
+ * are initialized, fifos may not be allocated. Therefore, advertise the
+ * smallest possible unscaled window size and update once fifos are
+ * assigned to the session.
+ */
+ /*
+ tcp_update_rcv_mss (tc);
+ TCP_IW_N_SEGMENTS * tc->mss;
+ */
+ return TCP_MIN_RX_FIFO_SIZE;
}
-/**
- * Update max segment size we're able to process.
- *
- * The value is constrained by our interface's MTU and IP options. It is
- * also what we advertise to our peer.
- */
-void
-tcp_update_rcv_mss (tcp_connection_t * tc)
-{
- /* TODO find our iface MTU */
- tc->mss = dummy_mtu;
-}
-
/**
* Update snd_mss to reflect the effective segment size that we can send
* by taking into account all TCP options, including SACKs
tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP;
}
-#define tcp_get_free_buffer_index(tm, bidx) \
-do { \
- u32 *my_tx_buffers, n_free_buffers; \
- u32 thread_index = vlib_get_thread_index(); \
- my_tx_buffers = tm->tx_buffers[thread_index]; \
- if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \
- { \
- n_free_buffers = 32; /* TODO config or macro */ \
- vec_validate (my_tx_buffers, n_free_buffers - 1); \
- _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \
- tm->vlib_main, my_tx_buffers, n_free_buffers, \
- VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \
- tm->tx_buffers[thread_index] = my_tx_buffers; \
- } \
- /* buffer shortage */ \
- if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \
- return; \
- *bidx = my_tx_buffers[_vec_len (my_tx_buffers)-1]; \
- _vec_len (my_tx_buffers) -= 1; \
-} while (0)
-
-#define tcp_return_buffer(tm) \
-do { \
- u32 *my_tx_buffers; \
- u32 thread_index = vlib_get_thread_index(); \
- my_tx_buffers = tm->tx_buffers[thread_index]; \
- _vec_len (my_tx_buffers) +=1; \
-} while (0)
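+/**
+ * Allocate a tx buffer from the per-thread cache.
+ *
+ * Refills the cache with a frame's worth of buffers from the default
+ * free list whenever it runs empty. Returns 0 on success and -1 on
+ * buffer shortage, so callers can check the result and bail out.
+ */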
+always_inline int
+tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx)
+{
+ u32 *my_tx_buffers, n_free_buffers;
+ u32 thread_index = vlib_get_thread_index ();
+ my_tx_buffers = tm->tx_buffers[thread_index];
+ if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0))
+ {
+ n_free_buffers = VLIB_FRAME_SIZE;
+ vec_validate (my_tx_buffers, n_free_buffers - 1);
+ _vec_len (my_tx_buffers) =
+ vlib_buffer_alloc_from_free_list (vlib_get_main (), my_tx_buffers,
+ n_free_buffers,
+ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+ /* buffer shortage, report failure */
+ if (vec_len (my_tx_buffers) == 0)
+ {
+ clib_warning ("out of buffers");
+ return -1;
+ }
+ tm->tx_buffers[thread_index] = my_tx_buffers;
+ }
+ *bidx = my_tx_buffers[_vec_len (my_tx_buffers) - 1];
+ _vec_len (my_tx_buffers) -= 1;
+ return 0;
+}
+
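+/**
+ * Return the most recently allocated buffer to the per-thread cache.
+ *
+ * Only valid immediately after a successful tcp_get_free_buffer_index,
+ * since it simply grows the cached vector back over the last index that
+ * was handed out.
+ */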
+always_inline void
+tcp_return_buffer (tcp_main_t * tm)
+{
+ u32 *my_tx_buffers;
+ u32 thread_index = vlib_get_thread_index ();
+ my_tx_buffers = tm->tx_buffers[thread_index];
+ _vec_len (my_tx_buffers) += 1;
+}
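+
+/* Usage sketch: allocate, bail out on shortage, and hand the buffer back
+ * if it ends up unused, as the persist and retransmit paths below do:
+ *
+ *   if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ *     return;
+ *   b = vlib_get_buffer (vm, bi);
+ *   if (n_bytes <= 0)
+ *     {
+ *       tcp_return_buffer (tm);
+ *       return;
+ *     }
+ */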
always_inline void
tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b)
/* Init retransmit timer */
tcp_retransmit_timer_set (tc);
+ TCP_EVT_DBG (TCP_EVT_SYNACK_SENT, tc);
}
always_inline void
u32 *to_next, next_index;
vlib_frame_t *f;
- b->flags |= VNET_BUFFER_LOCALLY_ORIGINATED;
+ b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
b->error = 0;
/* Default FIB for now */
if (is_ip4)
{
ih4 = vlib_buffer_push_ip4 (vm, b0, &dst_ip40, &src_ip40,
- IP_PROTOCOL_TCP);
+ IP_PROTOCOL_TCP, 1);
th0->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ih4);
}
else
* Send reset without reusing existing buffer
*/
void
-tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4)
+tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4)
{
vlib_buffer_t *b;
u32 bi;
ip4_header_t *ih4, *pkt_ih4;
ip6_header_t *ih6, *pkt_ih6;
- tcp_get_free_buffer_index (tm, &bi);
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+
b = vlib_get_buffer (vm, bi);
/* Leave enough space for headers */
{
flags = TCP_FLAG_RST;
seq = pkt_th->ack_number;
- ack = 0;
+ ack = (tc && tc->state >= TCP_STATE_SYN_RCVD) ? tc->rcv_nxt : 0;
}
else
{
{
ASSERT ((pkt_ih4->ip_version_and_header_length & 0xF0) == 0x40);
ih4 = vlib_buffer_push_ip4 (vm, b, &pkt_ih4->dst_address,
- &pkt_ih4->src_address, IP_PROTOCOL_TCP);
+ &pkt_ih4->src_address, IP_PROTOCOL_TCP, 1);
th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih4);
}
else
}
tcp_enqueue_to_ip_lookup (vm, b, bi, is_ip4);
+ TCP_EVT_DBG (TCP_EVT_RST_SENT, tc);
}
void
tcp_push_ip_hdr (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b)
{
tcp_header_t *th = vlib_buffer_get_current (b);
-
+ vlib_main_t *vm = vlib_get_main ();
if (tc->c_is_ip4)
{
ip4_header_t *ih;
- ih = vlib_buffer_push_ip4 (tm->vlib_main, b, &tc->c_lcl_ip4,
- &tc->c_rmt_ip4, IP_PROTOCOL_TCP);
- th->checksum = ip4_tcp_udp_compute_checksum (tm->vlib_main, b, ih);
+ ih = vlib_buffer_push_ip4 (vm, b, &tc->c_lcl_ip4,
+ &tc->c_rmt_ip4, IP_PROTOCOL_TCP, 1);
+ th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih);
}
else
{
ip6_header_t *ih;
int bogus = ~0;
- ih = vlib_buffer_push_ip6 (tm->vlib_main, b, &tc->c_lcl_ip6,
+ ih = vlib_buffer_push_ip6 (vm, b, &tc->c_lcl_ip6,
&tc->c_rmt_ip6, IP_PROTOCOL_TCP);
- th->checksum = ip6_tcp_udp_icmp_compute_checksum (tm->vlib_main, b, ih,
- &bogus);
+ th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih, &bogus);
ASSERT (!bogus);
}
}
u16 initial_wnd;
tcp_options_t snd_opts;
- tcp_get_free_buffer_index (tm, &bi);
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+
b = vlib_get_buffer (vm, bi);
/* Leave enough space for headers */
tcp_push_ip_hdr (tm, tc, b);
tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4);
+ TCP_EVT_DBG (TCP_EVT_SYN_SENT, tc);
}
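+/**
+ * Enqueue a buffer to the tcp4/6 output node.
+ *
+ * Buffers are batched into a per-thread frame that is handed off only
+ * once it fills up or a flush is requested, instead of putting a
+ * single-vector frame to the node for every packet.
+ */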
always_inline void
-tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4)
+tcp_enqueue_to_output_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
+ u8 is_ip4, u8 flush)
{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ u32 thread_index = vlib_get_thread_index ();
u32 *to_next, next_index;
vlib_frame_t *f;
- b->flags |= VNET_BUFFER_LOCALLY_ORIGINATED;
+ b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
b->error = 0;
/* Decide where to send the packet */
next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index;
- /* Enqueue the packet */
- f = vlib_get_frame_to_node (vm, next_index);
+ /* Initialize the trajectory trace, if configured */
+ if (VLIB_BUFFER_TRACE_TRAJECTORY > 0)
+ {
+ b->pre_data[0] = 1;
+ b->pre_data[1] = next_index;
+ }
+
+ /* Get frame to v4/6 output node */
+ f = tm->tx_frames[!is_ip4][thread_index];
+ if (!f)
+ {
+ f = vlib_get_frame_to_node (vm, next_index);
+ ASSERT (f);
+ tm->tx_frames[!is_ip4][thread_index] = f;
+ }
to_next = vlib_frame_vector_args (f);
- to_next[0] = bi;
- f->n_vectors = 1;
- vlib_put_frame_to_node (vm, next_index, f);
+ to_next[f->n_vectors] = bi;
+ f->n_vectors += 1;
+ if (flush || f->n_vectors == VLIB_FRAME_SIZE)
+ {
+ vlib_put_frame_to_node (vm, next_index, f);
+ tm->tx_frames[!is_ip4][thread_index] = 0;
+ }
+}
+
+always_inline void
+tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4)
+{
+ tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 0);
+}
+
+always_inline void
+tcp_enqueue_to_output_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
+ u8 is_ip4)
+{
+ tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 1);
+}
+
+/**
+ * Flush tx frame populated by retransmits and timer pops
+ */
+void
+tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4)
+{
+ if (tcp_main.tx_frames[!is_ip4][thread_index])
+ {
+ u32 next_index;
+ next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index;
+ vlib_put_frame_to_node (vm, next_index,
+ tcp_main.tx_frames[!is_ip4][thread_index]);
+ tcp_main.tx_frames[!is_ip4][thread_index] = 0;
+ }
+}
+
+/**
+ * Flush both v4 and v6 tx frames for the given thread index
+ */
+void
+tcp_flush_frames_to_output (u8 thread_index)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ tcp_flush_frame_to_output (vm, thread_index, 1);
+ tcp_flush_frame_to_output (vm, thread_index, 0);
}
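+
+/* Hypothetical call-site sketch: the dispatch loop that generates
+ * retransmits and timer pops is expected to flush pending frames once
+ * per iteration, e.g.:
+ *
+ *   u32 thread_index = vlib_get_thread_index ();
+ *   ... process timer expirations, queue (re)transmissions ...
+ *   tcp_flush_frames_to_output (thread_index);
+ */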
/**
tcp_main_t *tm = vnet_get_tcp_main ();
vlib_main_t *vm = vlib_get_main ();
- tcp_get_free_buffer_index (tm, &bi);
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
b = vlib_get_buffer (vm, bi);
/* Leave enough space for headers */
vlib_buffer_make_headroom (b, MAX_HDRS_LEN);
tcp_make_fin (tc, b);
- tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+ tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4);
tc->flags |= TCP_CONN_FINSNT;
tcp_retransmit_timer_force_update (tc);
TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc);
u32 bi;
/* Get buffer */
- tcp_get_free_buffer_index (tm, &bi);
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
b = vlib_get_buffer (vm, bi);
/* Fill in the ACK */
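+ /* RFC 5681 sec. 3.1: ssthresh = max (FlightSize / 2, 2 * SMSS) */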
tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss);
tc->cwnd = tcp_loss_wnd (tc);
tc->snd_congestion = tc->snd_una_max;
+
tcp_recovery_on (tc);
}
if (is_syn)
{
tc = tcp_half_open_connection_get (index);
+ tc->timers[TCP_TIMER_RETRANSMIT_SYN] = TCP_TIMER_HANDLE_INVALID;
}
else
{
tc = tcp_connection_get (index, thread_index);
+ tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID;
}
- /* Make sure timer handle is set to invalid */
- tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID;
-
if (!tcp_in_recovery (tc) && tc->rto_boff > 0
&& tc->state >= TCP_STATE_ESTABLISHED)
{
/* Go back to first un-acked byte */
tc->snd_nxt = tc->snd_una;
- tcp_get_free_buffer_index (tm, &bi);
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+
b = vlib_get_buffer (vm, bi);
if (tc->state >= TCP_STATE_ESTABLISHED)
/* Lost FIN, retransmit and return */
if (tc->flags & TCP_CONN_FINSNT)
{
+ tcp_return_buffer (tm);
tcp_send_fin (tc);
return;
}
tcp_retransmit_timer_set (tc);
ASSERT (0 || (tc->rto_boff > 1
&& tc->snd_una == tc->snd_congestion));
+ tcp_return_buffer (tm);
return;
}
/* Retransmit for SYN/SYNACK */
else if (tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_SYN_SENT)
{
+ /* The half-open connection actually moved to established, but we were
+ * waiting for the SYN retransmit timer to pop so that cleanup runs on
+ * the right thread. */
+ if (tc->flags & TCP_CONN_HALF_OPEN_DONE)
+ {
+ ASSERT (tc->state == TCP_STATE_SYN_SENT);
+ if (tcp_half_open_connection_cleanup (tc))
+ {
+ clib_warning ("could not remove half-open connection");
+ ASSERT (0);
+ }
+ tcp_return_buffer (tm);
+ return;
+ }
+
/* Retry a number of times without increasing the RTO. If this fails,
* start growing the RTO exponentially */
if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
vlib_buffer_make_headroom (b, MAX_HDRS_LEN);
-
tcp_push_hdr_i (tc, b, tc->state, 1);
/* Account for the SYN */
tc->snd_nxt += 1;
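+ /* Karn's algorithm: cancel any in-progress RTT measurement, since a
+ * sample taken across a retransmission would be ambiguous */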
+ tc->rtt_ts = 0;
+ TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc,
+ (tc->state == TCP_STATE_SYN_SENT ? 0 : 1));
}
else
{
ASSERT (tc->state == TCP_STATE_CLOSED);
clib_warning ("connection closed ...");
+ tcp_return_buffer (tm);
return;
}
{
ASSERT (tc->state == TCP_STATE_SYN_SENT);
- TCP_EVT_DBG (TCP_EVT_SYN_RTX, tc);
-
/* This goes straight to ipx_lookup */
tcp_push_ip_hdr (tm, tc, b);
tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4);
tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID;
/* Problem already solved or worse */
- if (tc->state == TCP_STATE_CLOSED
+ if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED
|| tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc))
return;
tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
/* Try to force the first unsent segment */
- tcp_get_free_buffer_index (tm, &bi);
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+
b = vlib_get_buffer (vm, bi);
tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
/* Nothing to send */
if (n_bytes <= 0)
{
- clib_warning ("persist found nothing to send");
+ // clib_warning ("persist found nothing to send");
tcp_return_buffer (tm);
return;
}
tc->snd_nxt = tc->snd_una;
/* Get buffer */
- tcp_get_free_buffer_index (tm, &bi);
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+
b = vlib_get_buffer (vm, bi);
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
while (hole && snd_space > 0)
{
- tcp_get_free_buffer_index (tm, &bi);
- b = vlib_get_buffer (vm, bi);
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+ b = vlib_get_buffer (vm, bi);
hole = scoreboard_next_rxt_hole (sb, hole,
tcp_fastrecovery_sent_1_smss (tc),
&can_rescue, &snd_limited);
while (snd_space > 0)
{
- tcp_get_free_buffer_index (tm, &bi);
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
b = vlib_get_buffer (vm, bi);
-
offset += n_written;
n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space);
if (is_ip4)
{
- ip4_header_t *ih0;
- ih0 = vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4,
- &tc0->c_rmt_ip4, IP_PROTOCOL_TCP);
- th0->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ih0);
+ vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, &tc0->c_rmt_ip4,
+ IP_PROTOCOL_TCP, 1);
+ b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
+ vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data;
+ th0->checksum = 0;
}
else
{
ip6_header_t *ih0;
- int bogus = ~0;
-
ih0 = vlib_buffer_push_ip6 (vm, b0, &tc0->c_lcl_ip6,
&tc0->c_rmt_ip6, IP_PROTOCOL_TCP);
- th0->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b0, ih0,
- &bogus);
- ASSERT (!bogus);
+ b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
+ vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data;
+ vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data;
+ th0->checksum = 0;
}
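+ /* With the offload flag set and header offsets recorded, checksum
+ * computation is deferred to the output path or NIC, so th0->checksum
+ * is zeroed here rather than computed inline */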
/* Filter out DUPACKs if there are no OOO segments left */
/* Stop DELACK timer and fix flags */
tc0->flags &= ~(TCP_CONN_SNDACK);
- if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK))
- {
- tcp_timer_reset (tc0, TCP_TIMER_DELACK);
- }
+ tcp_timer_reset (tc0, TCP_TIMER_DELACK);
/* If not retransmitting
* 1) update snd_una_max (SYN, SYNACK, FIN)
tc0->rto_boff = 0;
}
- /* set fib index to default and lookup node */
- /* XXX network virtualization (vrf/vni) */
+#if 0
+ /* Make sure we haven't lost route to our peer */
+ if (PREDICT_FALSE (tc0->last_fib_check
+ < tc0->snd_opts.tsval + TCP_FIB_RECHECK_PERIOD))
+ {
+ if (PREDICT_TRUE
+ (tc0->c_rmt_fei == tcp_lookup_rmt_in_fib (tc0)))
+ {
+ tc0->last_fib_check = tc0->snd_opts.tsval;
+ }
+ else
+ {
+ clib_warning ("lost connection to peer");
+ tcp_connection_reset (tc0);
+ goto done;
+ }
+ }
+
+ /* Use pre-computed dpo to set next node */
+ next0 = tc0->c_rmt_dpo.dpoi_next_node;
+ vnet_buffer (b0)->ip.adj_index[VLIB_TX] = tc0->c_rmt_dpo.dpoi_index;
+#endif
+
vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0;
- vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0;
- b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED;
+ b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
done:
b0->error = node->errors[error0];
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
tc = (tcp_connection_t *) tconn;
tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0);
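+ /* Only start an RTT measurement if none is in flight and we're not in
+ * recovery, where retransmissions would skew the sample */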
- if (tc->rtt_ts == 0)
+ if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc))
{
tc->rtt_ts = tcp_time_now ();
tc->rtt_seq = tc->snd_nxt;
done:
b0->error = node->errors[error0];
- b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED;
+ b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
th0 = vlib_buffer_get_current (b0);