}
}
+static void
+tcp_handle_rst (tcp_connection_t * tc)
+{
+ switch (tc->rst_state)
+ {
+ case TCP_STATE_SYN_RCVD:
+ /* Cleanup everything. App wasn't notified yet */
+ session_transport_delete_notify (&tc->connection);
+ tcp_connection_cleanup (tc);
+ break;
+ case TCP_STATE_SYN_SENT:
+ session_stream_connect_notify (&tc->connection, 1 /* fail */ );
+ tcp_connection_cleanup (tc);
+ break;
+ case TCP_STATE_ESTABLISHED:
+ session_transport_reset_notify (&tc->connection);
+ session_transport_closed_notify (&tc->connection);
+ break;
+ case TCP_STATE_CLOSE_WAIT:
+ case TCP_STATE_FIN_WAIT_1:
+ case TCP_STATE_FIN_WAIT_2:
+ case TCP_STATE_CLOSING:
+ case TCP_STATE_LAST_ACK:
+ session_transport_closed_notify (&tc->connection);
+ break;
+ case TCP_STATE_CLOSED:
+ case TCP_STATE_TIME_WAIT:
+ break;
+ default:
+ TCP_DBG ("reset state: %u", tc->state);
+ }
+}
+
+static void
+tcp_program_reset_ntf (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
+{
+ if (!tcp_disconnect_pending (tc))
+ {
+ tc->rst_state = tc->state;
+ vec_add1 (wrk->pending_resets, tc->c_c_index);
+ tcp_disconnect_pending_on (tc);
+ }
+}
+
+/**
+ * Handle reset packet
+ *
+ * Programs disconnect/reset notification that should be sent
+ * later by calling @ref tcp_handle_disconnects
+ */
+static void
+tcp_rcv_rst (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
+{
+ TCP_EVT (TCP_EVT_RST_RCVD, tc);
+ switch (tc->state)
+ {
+ case TCP_STATE_SYN_RCVD:
+ tcp_program_reset_ntf (wrk, tc);
+ tcp_connection_set_state (tc, TCP_STATE_CLOSED);
+ break;
+ case TCP_STATE_SYN_SENT:
+ /* Do not program ntf because the connection is half-open */
+ tcp_handle_rst (tc);
+ break;
+ case TCP_STATE_ESTABLISHED:
+ tcp_connection_timers_reset (tc);
+ tcp_cong_recovery_off (tc);
+ tcp_program_reset_ntf (wrk, tc);
+ tcp_connection_set_state (tc, TCP_STATE_CLOSED);
+ tcp_program_cleanup (wrk, tc);
+ break;
+ case TCP_STATE_CLOSE_WAIT:
+ case TCP_STATE_FIN_WAIT_1:
+ case TCP_STATE_FIN_WAIT_2:
+ case TCP_STATE_CLOSING:
+ case TCP_STATE_LAST_ACK:
+ tcp_connection_timers_reset (tc);
+ tcp_cong_recovery_off (tc);
+ tcp_program_reset_ntf (wrk, tc);
+ /* Make sure we mark the session as closed. In some states we may
+ * be still trying to send data */
+ tcp_connection_set_state (tc, TCP_STATE_CLOSED);
+ tcp_program_cleanup (wrk, tc);
+ break;
+ case TCP_STATE_CLOSED:
+ case TCP_STATE_TIME_WAIT:
+ break;
+ default:
+ TCP_DBG ("reset state: %u", tc->state);
+ }
+}
+
/**
* Validate incoming segment as per RFC793 p. 69 and RFC1323 p. 19
*
/* 2nd: check the RST bit */
if (PREDICT_FALSE (tcp_rst (th0)))
{
- tcp_connection_reset (tc0);
+ tcp_rcv_rst (wrk, tc0);
*error0 = TCP_ERROR_RST_RCVD;
goto error;
}
if (!right)
{
sb->sacked_bytes = sb->high_sacked - ack;
+ sb->last_sacked_bytes = sb->sacked_bytes
+ - (old_sacked - sb->last_bytes_delivered);
return;
}
sb->last_bytes_delivered = 0;
sb->rxt_sacked = 0;
- if (!tcp_opts_sack (&tc->rcv_opts)
+ if (!tcp_opts_sack (&tc->rcv_opts) && !sb->sacked_bytes
&& sb->head == TCP_INVALID_SACK_HOLE_INDEX)
return;
{
if (seq_lt (blk->start, blk->end)
&& seq_gt (blk->start, tc->snd_una)
- && seq_gt (blk->start, ack)
- && seq_lt (blk->start, tc->snd_nxt)
- && seq_leq (blk->end, tc->snd_nxt))
+ && seq_gt (blk->start, ack) && seq_leq (blk->end, tc->snd_nxt))
{
blk++;
continue;
hole = pool_elt_at_index (sb->holes, sb->head);
if (PREDICT_FALSE (sb->is_reneging))
- sb->last_bytes_delivered += hole->start - tc->snd_una;
+ {
+ sb->last_bytes_delivered += clib_min (hole->start - tc->snd_una,
+ ack - tc->snd_una);
+ sb->is_reneging = seq_lt (ack, hole->start);
+ }
while (hole && blk_index < vec_len (rcv_sacks))
{
}
else
{
- tcp_persist_timer_reset (tc);
+ if (PREDICT_FALSE (tcp_timer_is_active (tc, TCP_TIMER_PERSIST)))
+ tcp_persist_timer_reset (tc);
+
if (PREDICT_FALSE (!tcp_in_recovery (tc) && tc->rto_boff > 0))
{
tc->rto_boff = 0;
tc->rcv_dupacks += 1;
TCP_EVT (TCP_EVT_DUPACK_RCVD, tc, 1);
}
- tc->rxt_delivered = clib_max (tc->rxt_delivered + tc->bytes_acked,
+ tc->rxt_delivered = clib_min (tc->rxt_delivered + tc->bytes_acked,
tc->snd_rxt_bytes);
if (is_dack)
tc->prr_delivered += clib_min (tc->snd_mss,
tcp_cc_rcv_cong_ack (tc, TCP_CC_PARTIALACK, rs);
}
+static void
+tcp_handle_old_ack (tcp_connection_t * tc, tcp_rate_sample_t * rs)
+{
+ if (!tcp_in_cong_recovery (tc))
+ return;
+
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
+ tcp_rcv_sacks (tc, tc->snd_una);
+
+ tc->bytes_acked = 0;
+
+ if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
+ tcp_bt_sample_delivery_rate (tc, rs);
+
+ tcp_cc_handle_event (tc, rs, 1);
+}
+
/**
* Check if duplicate ack as per RFC5681 Sec. 2
*/
tc->errors.below_ack_wnd += 1;
*error = TCP_ERROR_ACK_OLD;
TCP_EVT (TCP_EVT_ACK_RCV_ERR, tc, 1, vnet_buffer (b)->tcp.ack_number);
- if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
- tcp_cc_handle_event (tc, 0, 1);
+
+ if (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una - tc->rcv_wnd))
+ return -1;
+
+ tcp_handle_old_ack (tc, &rs);
+
/* Don't drop yet */
return 0;
}
static void
tcp_handle_disconnects (tcp_worker_ctx_t * wrk)
{
- u32 thread_index, *pending_disconnects;
+ u32 thread_index, *pending_disconnects, *pending_resets;
tcp_connection_t *tc;
int i;
- if (!vec_len (wrk->pending_disconnects))
- return;
+ if (vec_len (wrk->pending_disconnects))
+ {
+ thread_index = wrk->vm->thread_index;
+ pending_disconnects = wrk->pending_disconnects;
+ for (i = 0; i < vec_len (pending_disconnects); i++)
+ {
+ tc = tcp_connection_get (pending_disconnects[i], thread_index);
+ tcp_disconnect_pending_off (tc);
+ session_transport_closing_notify (&tc->connection);
+ }
+ _vec_len (wrk->pending_disconnects) = 0;
+ }
- thread_index = wrk->vm->thread_index;
- pending_disconnects = wrk->pending_disconnects;
- for (i = 0; i < vec_len (pending_disconnects); i++)
+ if (vec_len (wrk->pending_resets))
{
- tc = tcp_connection_get (pending_disconnects[i], thread_index);
- tcp_disconnect_pending_off (tc);
- session_transport_closing_notify (&tc->connection);
+ thread_index = wrk->vm->thread_index;
+ pending_resets = wrk->pending_resets;
+ for (i = 0; i < vec_len (pending_resets); i++)
+ {
+ tc = tcp_connection_get (pending_resets[i], thread_index);
+ tcp_disconnect_pending_off (tc);
+ tcp_handle_rst (tc);
+ }
+ _vec_len (wrk->pending_resets) = 0;
}
- _vec_len (wrk->pending_disconnects) = 0;
}
static void
* retransmissions since we may not have any data to send */
if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt))
{
- tcp_program_ack (tc);
+ tcp_program_dupack (tc);
+ tc->errors.below_data_wnd++;
error = TCP_ERROR_SEGMENT_OLD;
goto done;
}
CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
+ tcp_connection_t *tc = &t->tcp_connection;
u32 indent = format_get_indent (s);
- s = format (s, "%U\n%U%U",
- format_tcp_header, &t->tcp_header, 128,
- format_white_space, indent,
- format_tcp_connection, &t->tcp_connection, 1);
+ s = format (s, "%U state %U\n%U%U", format_tcp_connection_id, tc,
+ format_tcp_state, tc->state, format_white_space, indent,
+ format_tcp_header, &t->tcp_header, 128);
return s;
}
return tc;
}
+static tcp_connection_t *
+tcp_lookup_listener (vlib_buffer_t * b, u32 fib_index, int is_ip4)
+{
+ session_t *s;
+
+ if (is_ip4)
+ {
+ ip4_header_t *ip4 = vlib_buffer_get_current (b);
+ tcp_header_t *tcp = tcp_buffer_hdr (b);
+ s = session_lookup_listener4 (fib_index,
+ &ip4->dst_address,
+ tcp->dst_port, TRANSPORT_PROTO_TCP, 1);
+ }
+ else
+ {
+ ip6_header_t *ip6 = vlib_buffer_get_current (b);
+ tcp_header_t *tcp = tcp_buffer_hdr (b);
+ s = session_lookup_listener6 (fib_index,
+ &ip6->dst_address,
+ tcp->dst_port, TRANSPORT_PROTO_TCP, 1);
+
+ }
+ if (PREDICT_TRUE (s != 0))
+ return tcp_get_connection_from_transport (transport_get_listener
+ (TRANSPORT_PROTO_TCP,
+ s->connection_index));
+ else
+ return 0;
+}
+
always_inline void
tcp_check_tx_offload (tcp_connection_t * tc, int is_ipv4)
{
}
lb = load_balance_get (lb_idx);
+ if (PREDICT_FALSE (lb->lb_n_buckets > 1))
+ return;
dpo = load_balance_get_bucket_i (lb, 0);
- sw_if_idx = dpo->dpoi_index;
- hw_if = vnet_get_sup_hw_interface (vnm, sw_if_idx);
+ sw_if_idx = dpo_get_urpf (dpo);
+ if (PREDICT_FALSE (sw_if_idx == ~0))
+ return;
+ hw_if = vnet_get_sup_hw_interface (vnm, sw_if_idx);
if (hw_if->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO)
tc->cfg_flags |= TCP_CFG_F_TSO;
}
/* If ACK is acceptable, signal client that peer is not
* willing to accept connection and drop connection*/
if (tcp_ack (tcp0))
- tcp_connection_reset (tc0);
+ tcp_rcv_rst (wrk, tc0);
error0 = TCP_ERROR_RST_RCVD;
goto drop;
}
my_thread_index);
tcp_inc_counter (syn_sent, TCP_ERROR_MSG_QUEUE_FULL, errors);
vlib_buffer_free (vm, first_buffer, from_frame->n_vectors);
+ tcp_handle_disconnects (wrk);
return from_frame->n_vectors;
}
/* Make sure the segment is exactly right */
if (tc0->rcv_nxt != vnet_buffer (b0)->tcp.seq_number || is_fin0)
{
- tcp_connection_reset (tc0);
+ tcp_send_reset_w_pkt (tc0, b0, thread_index, is_ip4);
error0 = TCP_ERROR_SEGMENT_INVALID;
goto drop;
}
*/
if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
{
- tcp_connection_reset (tc0);
+ tcp_send_reset_w_pkt (tc0, b0, thread_index, is_ip4);
+ error0 = TCP_ERROR_SEGMENT_INVALID;
goto drop;
}
if (session_stream_accept_notify (&tc0->connection))
{
error0 = TCP_ERROR_MSG_QUEUE_FULL;
- tcp_connection_reset (tc0);
+ tcp_send_reset (tc0);
+ session_transport_delete_notify (&tc0->connection);
+ tcp_connection_cleanup (tc0);
goto drop;
}
error0 = TCP_ERROR_ACK_OK;
if (tc0->flags & TCP_CONN_FINRCVD)
{
tcp_connection_set_state (tc0, TCP_STATE_CLOSED);
- tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE,
- tcp_cfg.cleanup_time);
session_transport_closed_notify (&tc0->connection);
+ tcp_program_cleanup (wrk, tc0);
goto drop;
}
* we can't ensure that we have no packets already enqueued
* to output. Rely instead on the waitclose timer */
tcp_connection_timers_reset (tc0);
- tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.cleanup_time);
+ tcp_program_cleanup (tcp_get_worker (tc0->c_thread_index), tc0);
goto drop;
{
u32 n_left_from, *from, n_syns = 0, *first_buffer;
u32 my_thread_index = vm->thread_index;
+ tcp_connection_t *tc0;
from = first_buffer = vlib_frame_vector_args (from_frame);
n_left_from = from_frame->n_vectors;
n_left_from -= 1;
b0 = vlib_get_buffer (vm, bi0);
- lc0 = tcp_listener_get (vnet_buffer (b0)->tcp.connection_index);
if (is_ip4)
{
ip40 = vlib_buffer_get_current (b0);
- th0 = ip4_next_header (ip40);
+ th0 = tcp_buffer_hdr (b0);
}
else
{
ip60 = vlib_buffer_get_current (b0);
- th0 = ip6_next_header (ip60);
+ th0 = tcp_buffer_hdr (b0);
+ }
+
+ lc0 = tcp_listener_get (vnet_buffer (b0)->tcp.connection_index);
+ if (PREDICT_FALSE (lc0 == 0))
+ {
+ tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
+ my_thread_index);
+ if (tc0->state != TCP_STATE_TIME_WAIT)
+ {
+ error0 = TCP_ERROR_CREATE_EXISTS;
+ goto drop;
+ }
+ lc0 = tcp_lookup_listener (b0, tc0->c_fib_index, is_ip4);
+ /* clean up the old session */
+ tcp_connection_del (tc0);
}
/* Create child session. For syn-flood protection use filter */
static inline void
tcp_input_dispatch_buffer (tcp_main_t * tm, tcp_connection_t * tc,
- vlib_buffer_t * b, u16 * next, u32 * error)
+ vlib_buffer_t * b, u16 * next,
+ vlib_node_runtime_t * error_node)
{
tcp_header_t *tcp;
+ u32 error;
u8 flags;
tcp = tcp_buffer_hdr (b);
flags = tcp->flags & filter_flags;
*next = tm->dispatch_table[tc->state][flags].next;
- *error = tm->dispatch_table[tc->state][flags].error;
+ error = tm->dispatch_table[tc->state][flags].error;
tc->segs_in += 1;
- if (PREDICT_FALSE (*error == TCP_ERROR_DISPATCH
- || *next == TCP_INPUT_NEXT_RESET))
+ if (PREDICT_FALSE (error != TCP_ERROR_NONE))
{
- /* Overload tcp flags to store state */
- tcp_state_t state = tc->state;
- vnet_buffer (b)->tcp.flags = tc->state;
-
- if (*error == TCP_ERROR_DISPATCH)
+ b->error = error_node->errors[error];
+ if (error == TCP_ERROR_DISPATCH)
clib_warning ("tcp conn %u disp error state %U flags %U",
- tc->c_c_index, format_tcp_state, state,
+ tc->c_c_index, format_tcp_state, tc->state,
format_tcp_flags, (int) flags);
}
}
tcp_main_t *tm = vnet_get_tcp_main ();
vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
u16 nexts[VLIB_FRAME_SIZE], *next;
+ vlib_node_runtime_t *error_node;
tcp_set_time_now (tcp_get_worker (thread_index));
+ error_node = vlib_node_get_runtime (vm, tcp_node_index (input, is_ip4));
from = vlib_frame_vector_args (frame);
n_left_from = frame->n_vectors;
vlib_get_buffers (vm, from, bufs, n_left_from);
vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index;
vnet_buffer (b[1])->tcp.connection_index = tc1->c_c_index;
- tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], &error0);
- tcp_input_dispatch_buffer (tm, tc1, b[1], &next[1], &error1);
+ tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], error_node);
+ tcp_input_dispatch_buffer (tm, tc1, b[1], &next[1], error_node);
}
else
{
{
ASSERT (tcp_lookup_is_valid (tc0, b[0], tcp_buffer_hdr (b[0])));
vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index;
- tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], &error0);
+ tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], error_node);
}
else
- tcp_input_set_error_next (tm, &next[0], &error0, is_ip4);
+ {
+ tcp_input_set_error_next (tm, &next[0], &error0, is_ip4);
+ b[0]->error = error_node->errors[error0];
+ }
if (PREDICT_TRUE (tc1 != 0))
{
ASSERT (tcp_lookup_is_valid (tc1, b[1], tcp_buffer_hdr (b[1])));
vnet_buffer (b[1])->tcp.connection_index = tc1->c_c_index;
- tcp_input_dispatch_buffer (tm, tc1, b[1], &next[1], &error1);
+ tcp_input_dispatch_buffer (tm, tc1, b[1], &next[1], error_node);
}
else
- tcp_input_set_error_next (tm, &next[1], &error1, is_ip4);
+ {
+ tcp_input_set_error_next (tm, &next[1], &error1, is_ip4);
+ b[1]->error = error_node->errors[error1];
+ }
}
b += 2;
{
ASSERT (tcp_lookup_is_valid (tc0, b[0], tcp_buffer_hdr (b[0])));
vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index;
- tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], &error0);
+ tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], error_node);
}
else
- tcp_input_set_error_next (tm, &next[0], &error0, is_ip4);
+ {
+ tcp_input_set_error_next (tm, &next[0], &error0, is_ip4);
+ b[0]->error = error_node->errors[error0];
+ }
b += 1;
next += 1;
_(LISTEN, TCP_FLAG_FIN | TCP_FLAG_RST, TCP_INPUT_NEXT_DROP,
TCP_ERROR_SEGMENT_INVALID);
_(LISTEN, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP,
- TCP_ERROR_NONE);
+ TCP_ERROR_SEGMENT_INVALID);
_(LISTEN, TCP_FLAG_FIN | TCP_FLAG_SYN, TCP_INPUT_NEXT_DROP,
TCP_ERROR_SEGMENT_INVALID);
_(LISTEN, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP,
TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
- _(TIME_WAIT, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(TIME_WAIT, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE);
_(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(TIME_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
_(CLOSED, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP,
TCP_ERROR_CONNECTION_CLOSED);
- _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE);
- _(CLOSED, TCP_FLAG_SYN, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE);
+ _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED);
+ _(CLOSED, TCP_FLAG_SYN, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED);
_(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET,
- TCP_ERROR_NONE);
+ TCP_ERROR_CONNECTION_CLOSED);
#undef _
}