always_inline int
tcp_segment_check_paws (tcp_connection_t * tc)
{
- /* XXX normally test for timestamp should be lt instead of leq, but for
- * local testing this is not enough */
return tcp_opts_tstamp (&tc->opt) && tc->tsval_recent
&& timestamp_lt (tc->opt.tsval, tc->tsval_recent);
}
if (tc0->rcv_wnd == 0
&& tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number)
{
- /* Make it look as if there's nothing to dequeue */
- vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number;
+ /* TODO Should segment be tagged? */
}
else
{
if (tc->rtt_seq && seq_gt (ack, tc->rtt_seq) && !tc->rto_boff)
{
mrtt = tcp_time_now () - tc->rtt_ts;
- tc->rtt_seq = 0;
}
/* As per RFC7323 TSecr can be used for RTTM only if the segment advances
tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
+ /* Allow measuring of RTT and make sure boff is 0 */
+ tc->rtt_seq = 0;
+ tc->rto_boff = 0;
+
return 1;
}
stream_session_dequeue_drop (&tc->connection, tc->bytes_acked);
/* Update rtt and rto */
- if (tcp_update_rtt (tc, ack))
- {
- /* Good ACK received and valid RTT, make sure retransmit backoff is 0 */
- tc->rto_boff = 0;
- }
+ tcp_update_rtt (tc, ack);
}
/**
tc->snd_wl1 = seq;
tc->snd_wl2 = ack;
TCP_EVT_DBG (TCP_EVT_SND_WND, tc);
+
+ /* Set probe timer if we just got 0 wnd */
+ if (tc->snd_wnd < tc->snd_mss
+ && !tcp_timer_is_active (tc, TCP_TIMER_PERSIST))
+ tcp_persist_timer_set (tc);
+ else
+ tcp_persist_timer_reset (tc);
}
}
void
tcp_cc_recover (tcp_connection_t * tc)
{
+ /* TODO: check if time to recover was small. It might be that RTO popped
+ * too soon.
+ */
+
tc->cc_algo->recovered (tc);
tc->rtx_bytes = 0;
tc->cc_algo->rcv_ack (tc);
tc->tsecr_last_ack = tc->opt.tsecr;
- tcp_fastrecovery_1_smss_off (tc);
- tcp_fastrecovery_off (tc);
+ tcp_cong_recovery_off (tc);
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
}
{
u8 partial_ack;
- if (tcp_in_recovery (tc))
+ if (tcp_in_cong_recovery (tc))
{
partial_ack = seq_lt (tc->snd_una, tc->snd_congestion);
if (!partial_ack)
/* In case snd_nxt is still in the past and output tries to
* shove some new bytes */
- tc->snd_nxt = tc->snd_una;
+ tc->snd_nxt = tc->snd_una_max;
/* XXX need proper RFC6675 support */
- if (tc->sack_sb.last_sacked_bytes)
+ if (tc->sack_sb.last_sacked_bytes && !tcp_in_recovery (tc))
{
tcp_fast_retransmit (tc);
}
{
/* Retransmit first unacked segment */
tcp_retransmit_first_unacked (tc);
- /* If window allows, send 1 SMSS of new data */
- if (seq_lt (tc->snd_nxt, tc->snd_congestion))
- tc->snd_nxt = tc->snd_congestion;
}
}
}
return -1;
}
- tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
- *error = TCP_ERROR_ACK_FUTURE;
TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 2,
vnet_buffer (b)->tcp.ack_number);
+
+ tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
+ *error = TCP_ERROR_ACK_FUTURE;
}
/* If old ACK, probably it's an old dupack */
* timer. */
if (tc->bytes_acked)
{
- TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc, vnet_buffer (b)->tcp.ack_number);
+ TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
/* Updates congestion control (slow start/congestion avoidance) */
tcp_cc_rcv_ack (tc, b);
tc->rcv_nxt += written;
/* Depending on how fast the app is, all remaining buffers in burst will
- * not be enqueued. Should we inform peer of the damage? XXX */
+ * not be enqueued. Inform peer */
+ tc->flags |= TCP_CONN_SNDACK;
+
return TCP_ERROR_PARTIALLY_ENQUEUED;
}
else
{
+ tc->flags |= TCP_CONN_SNDACK;
return TCP_ERROR_FIFO_FULL;
}
u16 data_len)
{
stream_session_t *s0;
- u32 offset, seq;
+ u32 offset;
int rv;
/* Pure ACK. Do nothing */
}
s0 = stream_session_get (tc->c_s_index, tc->c_thread_index);
- seq = vnet_buffer (b)->tcp.seq_number;
- offset = seq - tc->rcv_nxt;
+ offset = vnet_buffer (b)->tcp.seq_number - tc->irs;
- rv = svm_fifo_enqueue_with_offset (s0->server_rx_fifo, s0->pid, offset,
- data_len, vlib_buffer_get_current (b));
+ clib_warning ("ooo: offset %d len %d", offset, data_len);
+
+ rv = svm_fifo_enqueue_with_offset (s0->server_rx_fifo, offset, data_len,
+ vlib_buffer_get_current (b));
/* Nothing written */
if (rv)
/* Get the newest segment from the fifo */
newest = svm_fifo_newest_ooo_segment (s0->server_rx_fifo);
- start = tc->rcv_nxt + ooo_segment_offset (s0->server_rx_fifo, newest);
- end = tc->rcv_nxt + ooo_segment_end_offset (s0->server_rx_fifo, newest);
+ start = ooo_segment_offset (s0->server_rx_fifo, newest);
+ end = ooo_segment_end_offset (s0->server_rx_fifo, newest);
tcp_update_sack_list (tc, start, end);
}
{
/* Old sequence numbers allowed through because they overlapped
* the rx window */
+
if (seq_lt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt))
{
error = TCP_ERROR_SEGMENT_OLD;
goto done;
}
- if (PREDICT_FALSE (error == TCP_ERROR_FIFO_FULL))
- *next0 = TCP_NEXT_DROP;
-
/* Check if ACK can be delayed */
- if (!tcp_can_delack (tc))
- {
- /* Nothing to do for pure ACKs XXX */
- if (n_data_bytes == 0)
- goto done;
-
- *next0 = tcp_next_output (tc->c_is_ip4);
- tcp_make_ack (tc, b);
- }
- else
+ if (tcp_can_delack (tc))
{
if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK))
tcp_timer_set (tc, TCP_TIMER_DELACK, TCP_DELACK_TIME);
+ goto done;
}
+ *next0 = tcp_next_output (tc->c_is_ip4);
+ tcp_make_ack (tc, b);
+
done:
return error;
}
+typedef struct
+{
+ tcp_header_t tcp_header;
+ tcp_connection_t tcp_connection;
+} tcp_rx_trace_t;
+
+u8 *
+format_tcp_rx_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
+ uword indent = format_get_indent (s);
+
+ s = format (s, "%U\n%U%U",
+ format_tcp_header, &t->tcp_header, 128,
+ format_white_space, indent,
+ format_tcp_connection_verbose, &t->tcp_connection);
+
+ return s;
+}
+
+u8 *
+format_tcp_rx_trace_short (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
+
+ s = format (s, "%d -> %d (%U)",
+ clib_net_to_host_u16 (t->tcp_header.src_port),
+ clib_net_to_host_u16 (t->tcp_header.dst_port), format_tcp_state,
+ &t->tcp_connection.state);
+
+ return s;
+}
+
always_inline void
tcp_established_inc_counter (vlib_main_t * vm, u8 is_ip4, u8 evt, u8 val)
{
vlib_frame_t * from_frame, int is_ip4)
{
u32 n_left_from, next_index, *from, *to_next;
- u32 my_thread_index = vm->cpu_index, errors = 0;
+ u32 my_thread_index = vm->thread_index, errors = 0;
tcp_main_t *tm = vnet_get_tcp_main ();
+ u8 is_fin = 0;
from = vlib_frame_vector_args (from_frame);
n_left_from = from_frame->n_vectors;
{
u32 bi0;
vlib_buffer_t *b0;
+ tcp_rx_trace_t *t0;
tcp_header_t *th0 = 0;
tcp_connection_t *tc0;
ip4_header_t *ip40;
n_advance_bytes0 += sizeof (ip60[0]);
}
+ is_fin = (th0->flags & TCP_FLAG_FIN) != 0;
+
/* SYNs, FINs and data consume sequence numbers */
vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number
- + tcp_is_syn (th0) + tcp_is_fin (th0) + n_data_bytes0;
+ + tcp_is_syn (th0) + is_fin + n_data_bytes0;
/* TODO header prediction fast path */
vlib_buffer_advance (b0, n_advance_bytes0);
error0 = tcp_segment_rcv (tm, tc0, b0, n_data_bytes0, &next0);
+ /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a
+ * dangling reference. */
+
/* 8: check the FIN bit */
- if (tcp_fin (th0))
+ if (is_fin)
{
/* Enter CLOSE-WAIT and notify session. Don't send ACK, instead
* wait for session to call close. To avoid lingering
b0->error = node->errors[error0];
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
-
+ t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
+ clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header));
+ clib_memcpy (&t0->tcp_connection, tc0,
+ sizeof (t0->tcp_connection));
}
vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
foreach_tcp_state_next
#undef _
},
+ .format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
foreach_tcp_state_next
#undef _
},
+ .format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
{
tcp_main_t *tm = vnet_get_tcp_main ();
u32 n_left_from, next_index, *from, *to_next;
- u32 my_thread_index = vm->cpu_index, errors = 0;
+ u32 my_thread_index = vm->thread_index, errors = 0;
u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP;
from = vlib_frame_vector_args (from_frame);
{
u32 bi0, ack0, seq0;
vlib_buffer_t *b0;
+ tcp_rx_trace_t *t0;
tcp_header_t *tcp0 = 0;
tcp_connection_t *tc0;
ip4_header_t *ip40;
b0->error = error0 ? node->errors[error0] : 0;
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
-
+ t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
+ clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header));
+ clib_memcpy (&t0->tcp_connection, tc0,
+ sizeof (t0->tcp_connection));
}
vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
foreach_tcp_state_next
#undef _
},
+ .format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
#define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
foreach_tcp_state_next
#undef _
- }
-,};
+ },
+ .format_trace = format_tcp_rx_trace_short,
+};
/* *INDENT-ON* */
VLIB_NODE_FUNCTION_MULTIARCH (tcp6_syn_sent_node, tcp6_syn_sent_rcv);
{
tcp_main_t *tm = vnet_get_tcp_main ();
u32 n_left_from, next_index, *from, *to_next;
- u32 my_thread_index = vm->cpu_index, errors = 0;
+ u32 my_thread_index = vm->thread_index, errors = 0;
from = vlib_frame_vector_args (from_frame);
n_left_from = from_frame->n_vectors;
{
u32 bi0;
vlib_buffer_t *b0;
+ tcp_rx_trace_t *t0;
tcp_header_t *tcp0 = 0;
tcp_connection_t *tc0;
ip4_header_t *ip40;
case TCP_STATE_ESTABLISHED:
case TCP_STATE_FIN_WAIT_1:
case TCP_STATE_FIN_WAIT_2:
+ vlib_buffer_advance (b0, n_advance_bytes0);
error0 = tcp_segment_rcv (tm, tc0, b0, n_data_bytes0, &next0);
break;
case TCP_STATE_CLOSE_WAIT:
drop:
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
-
+ t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
+ clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header));
+ clib_memcpy (&t0->tcp_connection, tc0,
+ sizeof (t0->tcp_connection));
}
vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
foreach_tcp_state_next
#undef _
},
+ .format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
foreach_tcp_state_next
#undef _
},
+ .format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
vlib_frame_t * from_frame, int is_ip4)
{
u32 n_left_from, next_index, *from, *to_next;
- u32 my_thread_index = vm->cpu_index;
+ u32 my_thread_index = vm->thread_index;
tcp_main_t *tm = vnet_get_tcp_main ();
u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP;
{
u32 bi0;
vlib_buffer_t *b0;
+ tcp_rx_trace_t *t0;
tcp_header_t *th0 = 0;
tcp_connection_t *lc0;
ip4_header_t *ip40;
child0->irs = vnet_buffer (b0)->tcp.seq_number;
child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1;
+ child0->rcv_las = child0->rcv_nxt;
child0->state = TCP_STATE_SYN_RCVD;
/* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK}
drop:
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
-
+ t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
+ clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header));
+ clib_memcpy (&t0->tcp_connection, lc0,
+ sizeof (t0->tcp_connection));
}
b0->error = node->errors[error0];
foreach_tcp_state_next
#undef _
},
+ .format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
foreach_tcp_state_next
#undef _
},
+ .format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
_ (ESTABLISHED, "tcp6-established") \
_ (RESET, "tcp6-reset")
-typedef struct
-{
- u16 src_port;
- u16 dst_port;
- u8 state;
-} tcp_rx_trace_t;
-
-u8 *
-format_tcp_rx_trace (u8 * s, va_list * args)
-{
- CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
- CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
- tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
-
- s = format (s, "TCP: src-port %d dst-port %U%s\n",
- clib_net_to_host_u16 (t->src_port),
- clib_net_to_host_u16 (t->dst_port), format_tcp_state, t->state);
-
- return s;
-}
-
#define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN)
always_inline uword
vlib_frame_t * from_frame, int is_ip4)
{
u32 n_left_from, next_index, *from, *to_next;
- u32 my_thread_index = vm->cpu_index;
+ u32 my_thread_index = vm->thread_index;
tcp_main_t *tm = vnet_get_tcp_main ();
from = vlib_frame_vector_args (from_frame);
{
u32 bi0;
vlib_buffer_t *b0;
+ tcp_rx_trace_t *t0;
tcp_header_t *tcp0 = 0;
tcp_connection_t *tc0;
ip4_header_t *ip40;
if (PREDICT_FALSE (error0 == TCP_ERROR_DISPATCH))
{
+ tcp_state_t state0 = tc0->state;
/* Overload tcp flags to store state */
vnet_buffer (b0)->tcp.flags = tc0->state;
+ clib_warning ("disp error state %U flags %U",
+ format_tcp_state, &state0,
+ format_tcp_flags, flags0);
}
}
else
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
-
+ t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
+ clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header));
+ if (tc0)
+ clib_memcpy (&t0->tcp_connection, tc0, sizeof (*tc0));
}
vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
/* *INDENT-ON* */
VLIB_NODE_FUNCTION_MULTIARCH (tcp6_input_node, tcp6_input);
-void
-tcp_update_time (f64 now, u32 thread_index)
-{
- tcp_main_t *tm = vnet_get_tcp_main ();
- tw_timer_expire_timers_16t_2w_512sl (&tm->timer_wheels[thread_index], now);
-}
static void
tcp_dispatch_table_init (tcp_main_t * tm)