* SEG.TSval */
else if (!tcp_rst (th0))
{
- tcp_program_ack (wrk, tc0);
+ tcp_program_ack (tc0);
TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp);
goto error;
}
}
else
{
- tcp_program_ack (wrk, tc0);
+ tcp_program_ack (tc0);
TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, tc0);
*error0 = TCP_ERROR_SYN_ACKS_RCVD;
}
*error0 = TCP_ERROR_RCV_WND;
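+ /* rcv_las is rcv_nxt at the last ack sent; a seq_end below it means
+ * the peer retransmitted data we have already acked */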
+ tc0->errors.below_data_wnd += seq_lt (vnet_buffer (b0)->tcp.seq_end,
+ tc0->rcv_las);
+
/* If not RST, send dup ack */
if (!tcp_rst (th0))
{
- tcp_program_dupack (wrk, tc0);
+ tcp_program_dupack (tc0);
TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp);
}
goto error;
/* 4th: check the SYN bit (in window) */
if (PREDICT_FALSE (tcp_syn (th0)))
{
+ /* As per RFC5961, send a challenge ack instead of a reset */
+ tcp_program_ack (tc0);
*error0 = TCP_ERROR_SPURIOUS_SYN;
- tcp_send_reset (tc0);
goto error;
}
}
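+/**
+ * Auxiliary ack handler for states where congestion control is not
+ * needed, e.g. syn-rcvd and closing states: validates the ack and,
+ * if acceptable, updates snd_una.
+ */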
always_inline int
-tcp_rcv_ack_is_acceptable (tcp_connection_t * tc0, vlib_buffer_t * tb0)
+tcp_rcv_ack_no_cc (tcp_connection_t * tc, vlib_buffer_t * b, u32 * error)
{
/* SND.UNA =< SEG.ACK =< SND.NXT */
- return (seq_leq (tc0->snd_una, vnet_buffer (tb0)->tcp.ack_number)
- && seq_leq (vnet_buffer (tb0)->tcp.ack_number, tc0->snd_una_max));
+ if (!(seq_leq (tc->snd_una, vnet_buffer (b)->tcp.ack_number)
+ && seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
+ {
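+ /* snd_nxt may have been pulled back after an rto, so an ack for bytes
+ * up to the old tail (snd_una_max) is still acceptable */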
+ if (seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max)
+ && seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una))
+ {
+ tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
+ goto acceptable;
+ }
+ *error = TCP_ERROR_ACK_INVALID;
+ return -1;
+ }
+
+acceptable:
+ tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una;
+ tc->snd_una = vnet_buffer (b)->tcp.ack_number;
+ *error = TCP_ERROR_ACK_OK;
+ return 0;
}
/**
* seq_lt (tc->snd_una, ack). This is a condition for calling update_rtt */
else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr)
{
- u32 now = tcp_time_now_w_thread (tc->c_thread_index);
+ u32 now = tcp_tstamp (tc);
mrtt = clib_max (now - tc->rcv_opts.tsecr, 1);
}
sack_scoreboard_hole_t *left, *right;
u32 bytes = 0, blks = 0;
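+ /* bytes newly marked lost while processing this ack */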
+ sb->last_lost_bytes = 0;
sb->lost_bytes = 0;
sb->sacked_bytes = 0;
left = scoreboard_last_hole (sb);
do
{
sb->lost_bytes += scoreboard_hole_bytes (right);
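+ /* only count bytes of holes not already marked lost */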
+ sb->last_lost_bytes += left->is_lost ? 0 : left->end - left->start;
left->is_lost = 1;
left = scoreboard_prev_hole (sb, right);
if (left)
sb->high_sacked = 0;
sb->high_rxt = 0;
sb->lost_bytes = 0;
+ sb->last_lost_bytes = 0;
sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
}
#endif /* CLIB_MARCH_VARIANT */
}
#ifndef CLIB_MARCH_VARIANT
+
void
tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
{
- sack_scoreboard_t *sb = &tc->sack_sb;
- sack_block_t *blk, tmp;
sack_scoreboard_hole_t *hole, *next_hole, *last_hole;
u32 blk_index = 0, old_sacked_bytes, hole_index;
+ sack_scoreboard_t *sb = &tc->sack_sb;
+ sack_block_t *blk, tmp;
int i, j;
sb->last_sacked_bytes = 0;
sb->last_bytes_delivered += sb->high_sacked - hole->end;
}
}
-
scoreboard_remove_hole (sb, hole);
hole = next_hole;
}
scoreboard_update_bytes (tc, sb);
sb->last_sacked_bytes = sb->sacked_bytes
- (old_sacked_bytes - sb->last_bytes_delivered);
+
ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc));
- ASSERT (sb->sacked_bytes == 0 || sb->sacked_bytes < tc->snd_una_max -
- seq_max (tc->snd_una, ack) || tcp_in_recovery (tc));
+ ASSERT (sb->sacked_bytes == 0 || tcp_in_recovery (tc)
+ || sb->sacked_bytes < tc->snd_nxt - seq_max (tc->snd_una, ack));
ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc)
|| sb->holes[sb->head].start == ack + sb->snd_una_adv);
+ ASSERT (sb->last_lost_bytes <= sb->lost_bytes);
+
TCP_EVT_DBG (TCP_EVT_CC_SCOREBOARD, tc);
}
#endif /* CLIB_MARCH_VARIANT */
tc->prev_ssthresh = tc->ssthresh;
tc->prev_cwnd = tc->cwnd;
tc->cc_algo->congestion (tc);
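+ /* stats: number of congestion (fast recovery) episodes */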
+ tc->fr_occurences += 1;
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4);
}
#endif /* CLIB_MARCH_VARIANT */
#ifndef CLIB_MARCH_VARIANT
void
-tcp_cc_fastrecovery_exit (tcp_connection_t * tc)
+tcp_cc_fastrecovery_clear (tcp_connection_t * tc)
{
- tc->cc_algo->recovered (tc);
tc->snd_rxt_bytes = 0;
tc->rcv_dupacks = 0;
- tc->snd_rxt_bytes = 0;
tc->rtt_ts = 0;
tcp_fastrecovery_off (tc);
tcp_fastrecovery_first_off (tc);
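+ /* drop any fast retransmit still pending for this connection */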
+ tc->flags &= ~TCP_CONN_FRXT_PENDING;
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
}
}
else if (tcp_in_fastrecovery (tc))
{
- tcp_cc_fastrecovery_exit (tc);
+ tcp_cc_fastrecovery_clear (tc);
}
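+ /* let the cc algorithm know the congestion signal was spurious */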
+ tcp_cc_undo_recovery (tc);
ASSERT (tc->rto_boff == 0);
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 5);
}
if (tcp_in_recovery (tc))
tcp_cc_recovery_exit (tc);
else if (tcp_in_fastrecovery (tc))
- tcp_cc_fastrecovery_exit (tc);
+ {
+ tcp_cc_recovered (tc);
+ tcp_cc_fastrecovery_clear (tc);
+ }
ASSERT (tc->rto_boff == 0);
ASSERT (!tcp_in_cong_recovery (tc));
}
static void
-tcp_cc_update (tcp_connection_t * tc, vlib_buffer_t * b)
+tcp_cc_update (tcp_connection_t * tc, tcp_rate_sample_t * rs)
{
ASSERT (!tcp_in_cong_recovery (tc) || tcp_is_lost_fin (tc));
/* Congestion avoidance */
- tcp_cc_rcv_ack (tc);
+ tcp_cc_rcv_ack (tc, rs);
/* If a cumulative ack, make sure dupacks is 0 */
tc->rcv_dupacks = 0;
|| tcp_should_fastrecover_sack (tc));
}
-#ifndef CLIB_MARCH_VARIANT
-void
-tcp_program_fastretransmit (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
-{
- if (!(tc->flags & TCP_CONN_FRXT_PENDING))
- {
- vec_add1 (wrk->pending_fast_rxt, tc->c_c_index);
- tc->flags |= TCP_CONN_FRXT_PENDING;
- }
-}
-
-void
-tcp_do_fastretransmits (tcp_worker_ctx_t * wrk)
-{
- u32 *ongoing_fast_rxt, burst_bytes, sent_bytes, thread_index;
- u32 max_burst_size, burst_size, n_segs = 0, n_segs_now;
- tcp_connection_t *tc;
- u64 last_cpu_time;
- int i;
-
- if (vec_len (wrk->pending_fast_rxt) == 0
- && vec_len (wrk->postponed_fast_rxt) == 0)
- return;
-
- thread_index = wrk->vm->thread_index;
- last_cpu_time = wrk->vm->clib_time.last_cpu_time;
- ongoing_fast_rxt = wrk->ongoing_fast_rxt;
- vec_append (ongoing_fast_rxt, wrk->postponed_fast_rxt);
- vec_append (ongoing_fast_rxt, wrk->pending_fast_rxt);
-
- _vec_len (wrk->postponed_fast_rxt) = 0;
- _vec_len (wrk->pending_fast_rxt) = 0;
-
- max_burst_size = VLIB_FRAME_SIZE / vec_len (ongoing_fast_rxt);
- max_burst_size = clib_max (max_burst_size, 1);
-
- for (i = 0; i < vec_len (ongoing_fast_rxt); i++)
- {
- tc = tcp_connection_get (ongoing_fast_rxt[i], thread_index);
- if (!tcp_in_fastrecovery (tc))
- {
- tc->flags &= ~TCP_CONN_FRXT_PENDING;
- continue;
- }
-
- if (n_segs >= VLIB_FRAME_SIZE)
- {
- vec_add1 (wrk->postponed_fast_rxt, ongoing_fast_rxt[i]);
- continue;
- }
-
- tc->flags &= ~TCP_CONN_FRXT_PENDING;
- burst_size = clib_min (max_burst_size, VLIB_FRAME_SIZE - n_segs);
- burst_bytes = transport_connection_tx_pacer_burst (&tc->connection,
- last_cpu_time);
- burst_size = clib_min (burst_size, burst_bytes / tc->snd_mss);
- if (!burst_size)
- {
- tcp_program_fastretransmit (wrk, tc);
- continue;
- }
-
- n_segs_now = tcp_fast_retransmit (wrk, tc, burst_size);
- sent_bytes = clib_min (n_segs_now * tc->snd_mss, burst_bytes);
- transport_connection_tx_pacer_update_bytes (&tc->connection,
- sent_bytes);
- n_segs += n_segs_now;
- }
- _vec_len (ongoing_fast_rxt) = 0;
- wrk->ongoing_fast_rxt = ongoing_fast_rxt;
-}
-#endif /* CLIB_MARCH_VARIANT */
-
/**
* One function to rule them all ... and in the darkness bind them
*/
static void
-tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
+tcp_cc_handle_event (tcp_connection_t * tc, tcp_rate_sample_t * rs,
+ u32 is_dack)
{
u32 rxt_delivered;
{
if (tc->bytes_acked)
goto partial_ack;
- tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index), tc);
+ tcp_program_fastretransmit (tc);
return;
}
/*
if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked)
{
ASSERT (tcp_in_fastrecovery (tc));
- tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
+ tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs);
return;
}
else if (tcp_should_fastrecover (tc))
}
tcp_cc_init_congestion (tc);
- tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
+ tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs);
if (tcp_opts_sack_permitted (&tc->rcv_opts))
{
pacer_wnd = clib_max (0.1 * tc->cwnd, 2 * tc->snd_mss);
tcp_connection_tx_pacer_reset (tc, pacer_wnd,
0 /* start bucket */ );
- tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index),
- tc);
+ tcp_program_fastretransmit (tc);
return;
}
else if (!tc->bytes_acked
|| (tc->bytes_acked && !tcp_in_cong_recovery (tc)))
{
- tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
+ tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs);
return;
}
else
}
/* Treat as congestion avoidance ack */
- tcp_cc_rcv_ack (tc);
+ tcp_cc_rcv_ack (tc, rs);
return;
}
/* Post RTO timeout don't try anything fancy */
if (tcp_in_recovery (tc))
{
- tcp_cc_rcv_ack (tc);
+ tcp_cc_rcv_ack (tc, rs);
transport_add_tx_event (&tc->connection);
return;
}
else
{
tcp_fastrecovery_first_on (tc);
- /* Reuse last bytes delivered to track total bytes acked */
- tc->sack_sb.last_bytes_delivered += tc->bytes_acked;
if (tc->snd_rxt_bytes > tc->bytes_acked)
tc->snd_rxt_bytes -= tc->bytes_acked;
else
tc->snd_rxt_bytes = 0;
}
- tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK);
+ tcp_cc_rcv_cong_ack (tc, TCP_CC_PARTIALACK, rs);
/*
* Since this was a partial ack, try to retransmit some more data
*/
- tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index), tc);
+ tcp_program_fastretransmit (tc);
}
/**
tcp_header_t * th, u32 * error)
{
u32 prev_snd_wnd, prev_snd_una;
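+ /* delivery rate sample for this ack, filled in only if rate
+ * sampling is enabled for the connection */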
+ tcp_rate_sample_t rs = { 0 };
u8 is_dack;
TCP_EVT_DBG (TCP_EVT_CC_STAT, tc);
{
/* We've probably entered recovery and the peer still has some
* of the data we've sent. Update snd_nxt and accept the ack */
- if (seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max))
+ if (seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max)
+ && seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una))
{
tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
goto process_ack;
}
+ tc->errors.above_ack_wnd += 1;
*error = TCP_ERROR_ACK_FUTURE;
TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 0,
vnet_buffer (b)->tcp.ack_number);
/* If old ACK, probably it's an old dupack */
if (PREDICT_FALSE (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)))
{
+ tc->errors.below_ack_wnd += 1;
*error = TCP_ERROR_ACK_OLD;
TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1,
vnet_buffer (b)->tcp.ack_number);
if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
- tcp_cc_handle_event (tc, 1);
+ tcp_cc_handle_event (tc, 0, 1);
/* Don't drop yet */
return 0;
}
/*
* Looks okay, process feedback
*/
+
if (tcp_opts_sack_permitted (&tc->rcv_opts))
tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
tcp_update_rtt (tc, vnet_buffer (b)->tcp.ack_number);
}
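+ /* if requested, e.g. by the cc algorithm, compute a delivery rate
+ * sample from the newly acked bytes */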
+ if (tc->flags & TCP_CONN_RATE_SAMPLE)
+ tcp_bt_sample_delivery_rate (tc, &rs);
+
TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
/*
if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack))
{
- tcp_cc_handle_event (tc, is_dack);
+ tcp_cc_handle_event (tc, &rs, is_dack);
+ tc->dupacks_in += is_dack;
if (!tcp_in_cong_recovery (tc))
{
*error = TCP_ERROR_ACK_OK;
/*
* Update congestion control (slow start/congestion avoidance)
*/
- tcp_cc_update (tc, b);
+ tcp_cc_update (tc, &rs);
*error = TCP_ERROR_ACK_OK;
return 0;
}
tcp_rcv_fin (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b,
u32 * error)
{
+ /* Reject out-of-order fins */
+ if (vnet_buffer (b)->tcp.seq_end != tc->rcv_nxt)
+ return;
+
/* Account for the FIN and send ack */
tc->rcv_nxt += 1;
- tcp_program_ack (wrk, tc);
+ tcp_program_ack (tc);
/* Enter CLOSE-WAIT and notify session. To avoid lingering
* in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
tcp_connection_set_state (tc, TCP_STATE_CLOSE_WAIT);
ASSERT (data_len);
written = session_enqueue_stream_connection (&tc->connection, b, 0,
1 /* queue event */ , 1);
+ tc->bytes_in += written;
TCP_EVT_DBG (TCP_EVT_INPUT, tc, 0, data_len, written);
}
TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, data_len);
+ tc->bytes_in += data_len;
/* Update SACK list if in use */
if (tcp_opts_sack_permitted (&tc->rcv_opts))
newest = svm_fifo_newest_ooo_segment (s0->rx_fifo);
if (newest)
{
- offset = ooo_segment_offset (s0->rx_fifo, newest);
+ offset = ooo_segment_offset_prod (s0->rx_fifo, newest);
ASSERT (offset <= vnet_buffer (b)->tcp.seq_number - tc->rcv_nxt);
start = tc->rcv_nxt + offset;
end = start + ooo_segment_length (s0->rx_fifo, newest);
{
/* Send ack if ... */
if (TCP_ALWAYS_ACK
- /* just sent a rcv wnd 0 */
- || (tc->flags & TCP_CONN_SENT_RCV_WND0) != 0
+ /* just sent a rcv wnd 0
+ || (tc->flags & TCP_CONN_SENT_RCV_WND0) != 0 */
/* constrained to send ack */
|| (tc->flags & TCP_CONN_SNDACK) != 0
/* we're almost out of tx wnd */
vlib_buffer_advance (b, vnet_buffer (b)->tcp.data_offset);
n_data_bytes = vnet_buffer (b)->tcp.data_len;
ASSERT (n_data_bytes);
+ tc->data_segs_in += 1;
/* Handle out-of-order data */
if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt))
* retransmissions since we may not have any data to send */
if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt))
{
- tcp_program_ack (wrk, tc);
+ tcp_program_ack (tc);
error = TCP_ERROR_SEGMENT_OLD;
goto done;
}
/* RFC2581: Enqueue and send DUPACK for fast retransmit */
error = tcp_session_enqueue_ooo (tc, b, n_data_bytes);
- tcp_program_dupack (wrk, tc);
+ tcp_program_dupack (tc);
TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc, vnet_buffer (b)->tcp);
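+ /* stats: seq_end past rcv_las + rcv_wnd is beyond the window
+ * we last advertised */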
+ tc->errors.above_data_wnd += seq_gt (vnet_buffer (b)->tcp.seq_end,
+ tc->rcv_las + tc->rcv_wnd);
goto done;
}
goto done;
}
- tcp_program_ack (wrk, tc);
+ tcp_program_ack (tc);
done:
return error;
new_tc0->c_thread_index = my_thread_index;
new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end;
new_tc0->irs = seq0;
- new_tc0->timers[TCP_TIMER_ESTABLISH_AO] = TCP_TIMER_HANDLE_INVALID;
new_tc0->timers[TCP_TIMER_RETRANSMIT_SYN] = TCP_TIMER_HANDLE_INVALID;
new_tc0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX];
* allocate session send reset */
if (session_stream_connect_notify (&new_tc0->connection, 0))
{
- clib_warning ("connect notify fail");
tcp_send_reset_w_pkt (new_tc0, b0, my_thread_index, is_ip4);
tcp_connection_cleanup (new_tc0);
+ error0 = TCP_ERROR_CREATE_SESSION_FAIL;
goto drop;
}
tcp_connection_cleanup (new_tc0);
tcp_send_reset_w_pkt (tc0, b0, my_thread_index, is_ip4);
TCP_EVT_DBG (TCP_EVT_RST_SENT, tc0);
+ error0 = TCP_ERROR_CREATE_SESSION_FAIL;
goto drop;
}
}
else
{
- tcp_program_ack (wrk, new_tc0);
+ tcp_program_ack (new_tc0);
}
drop:
switch (tc0->state)
{
case TCP_STATE_SYN_RCVD:
+
+ /* Make sure the segment is exactly right */
+ if (tc0->rcv_nxt != vnet_buffer (b0)->tcp.seq_number || is_fin0)
+ {
+ tcp_connection_reset (tc0);
+ error0 = TCP_ERROR_SEGMENT_INVALID;
+ goto drop;
+ }
+
/*
* If the segment acknowledgment is not acceptable, form a
* reset segment,
* <SEQ=SEG.ACK><CTL=RST>
* and send it.
*/
- if (!tcp_rcv_ack_is_acceptable (tc0, b0))
+ if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
{
tcp_connection_reset (tc0);
- error0 = TCP_ERROR_ACK_INVALID;
- goto drop;
- }
-
- /* Make sure the ack is exactly right */
- if (tc0->rcv_nxt != vnet_buffer (b0)->tcp.seq_number || is_fin0)
- {
- tcp_connection_reset (tc0);
- error0 = TCP_ERROR_SEGMENT_INVALID;
goto drop;
}
/* Reset SYN-ACK retransmit and SYN_RCV establish timers */
tcp_retransmit_timer_reset (tc0);
- tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH);
if (session_stream_accept_notify (&tc0->connection))
{
error0 = TCP_ERROR_MSG_QUEUE_FULL;
max_dequeue = transport_max_tx_dequeue (&tc0->connection);
if (max_dequeue <= tc0->burst_acked)
tcp_send_fin (tc0);
+ /* If a fin was received and data was acked, extend wait */
+ else if ((tc0->flags & TCP_CONN_FINRCVD) && tc0->bytes_acked)
+ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE,
+ TCP_CLOSEWAIT_TIME);
}
/* If FIN is ACKed */
else if (tc0->snd_una == tc0->snd_nxt)
{
- tcp_connection_set_state (tc0, TCP_STATE_FIN_WAIT_2);
-
/* Stop all retransmit timers because we have nothing more
- * to send. Enable waitclose though because we're willing to
- * wait for peer's FIN but not indefinitely. */
+ * to send. */
tcp_connection_timers_reset (tc0);
+
+ /* We already have a FIN but didn't transition to CLOSING
+ * because of outstanding tx data. Close the connection. */
+ if (tc0->flags & TCP_CONN_FINRCVD)
+ {
+ tcp_connection_set_state (tc0, TCP_STATE_CLOSED);
+ tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
+ session_transport_closed_notify (&tc0->connection);
+ goto drop;
+ }
+
+ tcp_connection_set_state (tc0, TCP_STATE_FIN_WAIT_2);
+ /* Enable waitclose because we're willing to wait for peer's
+ * FIN but not indefinitely. */
tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
/* Don't try to deq the FIN acked */
/* In addition to the processing for the ESTABLISHED state, if
* the retransmission queue is empty, the user's CLOSE can be
* acknowledged ("ok") but do not delete the TCB. */
- if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
+ if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
goto drop;
tc0->burst_acked = 0;
break;
if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
goto drop;
- if (tc0->flags & TCP_CONN_FINPNDG)
- {
- /* TX fifo finally drained */
- if (!transport_max_tx_dequeue (&tc0->connection))
- {
- tcp_send_fin (tc0);
- tcp_connection_timers_reset (tc0);
- tcp_connection_set_state (tc0, TCP_STATE_LAST_ACK);
- tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
- }
- }
+ if (!(tc0->flags & TCP_CONN_FINPNDG))
+ break;
+
+ /* Still have outstanding tx data */
+ max_dequeue = transport_max_tx_dequeue (&tc0->connection);
+ if (max_dequeue > tc0->burst_acked)
+ break;
+
+ tcp_send_fin (tc0);
+ tcp_connection_timers_reset (tc0);
+ tcp_connection_set_state (tc0, TCP_STATE_LAST_ACK);
+ tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
break;
case TCP_STATE_CLOSING:
/* In addition to the processing for the ESTABLISHED state, if
* the ACK acknowledges our FIN then enter the TIME-WAIT state,
* otherwise ignore the segment. */
- if (!tcp_rcv_ack_is_acceptable (tc0, b0))
- {
- error0 = TCP_ERROR_ACK_INVALID;
- goto drop;
- }
+ if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
+ goto drop;
- error0 = TCP_ERROR_ACK_OK;
- tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
- /* Ack moved snd_una beyond snd_nxt so reprogram fin */
- if (seq_gt (tc0->snd_una, tc0->snd_nxt))
- {
- tc0->snd_nxt = tc0->snd_una;
- tc0->flags &= ~TCP_CONN_FINSNT;
- goto drop;
- }
+ if (tc0->snd_una != tc0->snd_nxt)
+ goto drop;
tcp_connection_timers_reset (tc0);
tcp_connection_set_state (tc0, TCP_STATE_TIME_WAIT);
tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME);
+ session_transport_closed_notify (&tc0->connection);
goto drop;
break;
* acknowledgment of our FIN. If our FIN is now acknowledged,
* delete the TCB, enter the CLOSED state, and return. */
- if (!tcp_rcv_ack_is_acceptable (tc0, b0))
- {
- error0 = TCP_ERROR_ACK_INVALID;
- goto drop;
- }
- error0 = TCP_ERROR_ACK_OK;
- tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
+ if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
+ goto drop;
+
/* Apparently our ACK for the peer's FIN was lost */
if (is_fin0 && tc0->snd_una != tc0->snd_nxt)
{
}
tcp_connection_set_state (tc0, TCP_STATE_CLOSED);
+ session_transport_closed_notify (&tc0->connection);
/* Don't free the connection from the data path since
* we can't ensure that we have no packets already enqueued
* retransmission of the remote FIN. Acknowledge it, and restart
* the 2 MSL timeout. */
- if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
+ if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
goto drop;
if (!is_fin0)
goto drop;
- tcp_program_ack (wrk, tc0);
+ tcp_program_ack (tc0);
tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME);
goto drop;
case TCP_STATE_ESTABLISHED:
/* Account for the FIN and send ack */
tc0->rcv_nxt += 1;
- tcp_program_ack (wrk, tc0);
+ tcp_program_ack (tc0);
tcp_connection_set_state (tc0, TCP_STATE_CLOSE_WAIT);
tcp_program_disconnect (wrk, tc0);
tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
break;
case TCP_STATE_FIN_WAIT_1:
tc0->rcv_nxt += 1;
- tcp_connection_set_state (tc0, TCP_STATE_CLOSING);
+
if (tc0->flags & TCP_CONN_FINPNDG)
{
- /* Drop all outstanding tx data. */
- session_tx_fifo_dequeue_drop (&tc0->connection,
- transport_max_tx_dequeue
- (&tc0->connection));
- /* Make it look as if we've recovered, if needed */
- if (tcp_in_cong_recovery (tc0))
- {
- scoreboard_clear (&tc0->sack_sb);
- tcp_fastrecovery_off (tc0);
- tcp_recovery_off (tc0);
- tcp_connection_timers_reset (tc0);
- tc0->snd_nxt = tc0->snd_una;
- }
- tcp_send_fin (tc0);
+ /* If data is outstanding, stay in FIN_WAIT_1 and try to finish
+ * sending it. Since we already received a fin, do not wait
+ * for too long. */
+ tc0->flags |= TCP_CONN_FINRCVD;
+ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
}
else
- tcp_program_ack (wrk, tc0);
- /* Wait for ACK for our FIN but not forever */
- tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+ {
+ tcp_connection_set_state (tc0, TCP_STATE_CLOSING);
+ tcp_program_ack (tc0);
+ /* Wait for ACK for our FIN but not forever */
+ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+ }
break;
case TCP_STATE_FIN_WAIT_2:
/* Got FIN, send ACK! Be more aggressive with resource cleanup */
tcp_connection_set_state (tc0, TCP_STATE_TIME_WAIT);
tcp_connection_timers_reset (tc0);
tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME);
- tcp_program_ack (wrk, tc0);
+ tcp_program_ack (tc0);
+ session_transport_closed_notify (&tc0->connection);
break;
case TCP_STATE_TIME_WAIT:
/* Remain in the TIME-WAIT state. Restart the time-wait
thread_index);
tcp_inc_counter (rcv_process, TCP_ERROR_MSG_QUEUE_FULL, errors);
tcp_handle_postponed_dequeues (wrk);
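+ /* flush session disconnect notifications queued while processing
+ * this frame */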
+ tcp_handle_disconnects (wrk);
vlib_buffer_free (vm, first_buffer, from_frame->n_vectors);
return from_frame->n_vectors;
tcp_connection_init_vars (child0);
child0->rto = TCP_RTO_MIN;
- TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0, 1);
if (session_stream_accept (&child0->connection, lc0->c_s_index,
- 0 /* notify */ ))
+ lc0->c_thread_index, 0 /* notify */ ))
{
tcp_connection_cleanup (child0);
error0 = TCP_ERROR_CREATE_SESSION_FAIL;
goto drop;
}
+ TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0, 1);
child0->tx_fifo_size = transport_tx_fifo_size (&child0->connection);
tcp_send_synack (child0);
- tcp_timer_set (child0, TCP_TIMER_ESTABLISH, TCP_SYN_RCVD_TIME);
drop:
}
}
-static inline tcp_connection_t *
+always_inline tcp_connection_t *
tcp_input_lookup_buffer (vlib_buffer_t * b, u8 thread_index, u32 * error,
- u8 is_ip4)
+ u8 is_ip4, u8 is_nolookup)
{
u32 fib_index = vnet_buffer (b)->ip.fib_index;
int n_advance_bytes, n_data_bytes;
return 0;
}
- tc = session_lookup_connection_wt4 (fib_index, &ip4->dst_address,
- &ip4->src_address, tcp->dst_port,
- tcp->src_port, TRANSPORT_PROTO_TCP,
- thread_index, &result);
+ if (!is_nolookup)
+ tc = session_lookup_connection_wt4 (fib_index, &ip4->dst_address,
+ &ip4->src_address, tcp->dst_port,
+ tcp->src_port,
+ TRANSPORT_PROTO_TCP, thread_index,
+ &result);
}
else
{
*error = TCP_ERROR_LENGTH;
return 0;
}
- if (PREDICT_FALSE
- (ip6_address_is_link_local_unicast (&ip6->dst_address)))
+
+ if (!is_nolookup)
{
- ip4_main_t *im = &ip4_main;
- fib_index = vec_elt (im->fib_index_by_sw_if_index,
- vnet_buffer (b)->sw_if_index[VLIB_RX]);
- }
+ if (PREDICT_FALSE
+ (ip6_address_is_link_local_unicast (&ip6->dst_address)))
+ {
+ ip4_main_t *im = &ip4_main;
+ fib_index = vec_elt (im->fib_index_by_sw_if_index,
+ vnet_buffer (b)->sw_if_index[VLIB_RX]);
+ }
- tc = session_lookup_connection_wt6 (fib_index, &ip6->dst_address,
- &ip6->src_address, tcp->dst_port,
- tcp->src_port, TRANSPORT_PROTO_TCP,
- thread_index, &result);
+ tc = session_lookup_connection_wt6 (fib_index, &ip6->dst_address,
+ &ip6->src_address,
+ tcp->dst_port, tcp->src_port,
+ TRANSPORT_PROTO_TCP,
+ thread_index, &result);
+ }
}
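+ /* connection index was resolved upstream, e.g. by the session layer,
+ * and stored in the buffer metadata */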
+ if (is_nolookup)
+ tc =
+ (transport_connection_t *) tcp_connection_get (vnet_buffer (b)->
+ tcp.connection_index,
+ thread_index);
+
vnet_buffer (b)->tcp.seq_number = clib_net_to_host_u32 (tcp->seq_number);
vnet_buffer (b)->tcp.ack_number = clib_net_to_host_u32 (tcp->ack_number);
vnet_buffer (b)->tcp.data_offset = n_advance_bytes;
flags = tcp->flags & filter_flags;
*next = tm->dispatch_table[tc->state][flags].next;
*error = tm->dispatch_table[tc->state][flags].error;
+ tc->segs_in += 1;
if (PREDICT_FALSE (*error == TCP_ERROR_DISPATCH
|| *next == TCP_INPUT_NEXT_RESET))
always_inline uword
tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
- vlib_frame_t * frame, int is_ip4)
+ vlib_frame_t * frame, int is_ip4, u8 is_nolookup)
{
u32 n_left_from, *from, thread_index = vm->thread_index;
tcp_main_t *tm = vnet_get_tcp_main ();
next[0] = next[1] = TCP_INPUT_NEXT_DROP;
- tc0 = tcp_input_lookup_buffer (b[0], thread_index, &error0, is_ip4);
- tc1 = tcp_input_lookup_buffer (b[1], thread_index, &error1, is_ip4);
+ tc0 = tcp_input_lookup_buffer (b[0], thread_index, &error0, is_ip4,
+ is_nolookup);
+ tc1 = tcp_input_lookup_buffer (b[1], thread_index, &error1, is_ip4,
+ is_nolookup);
if (PREDICT_TRUE (!tc0 + !tc1 == 0))
{
}
next[0] = TCP_INPUT_NEXT_DROP;
- tc0 = tcp_input_lookup_buffer (b[0], thread_index, &error0, is_ip4);
+ tc0 = tcp_input_lookup_buffer (b[0], thread_index, &error0, is_ip4,
+ is_nolookup);
if (PREDICT_TRUE (tc0 != 0))
{
ASSERT (tcp_lookup_is_valid (tc0, tcp_buffer_hdr (b[0])));
return frame->n_vectors;
}
+VLIB_NODE_FN (tcp4_input_nolookup_node) (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ,
+ 1 /* is_nolookup */ );
+}
+
+VLIB_NODE_FN (tcp6_input_nolookup_node) (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ,
+ 1 /* is_nolookup */ );
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (tcp4_input_nolookup_node) =
+{
+ .name = "tcp4-input-nolookup",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+ .n_next_nodes = TCP_INPUT_N_NEXT,
+ .next_nodes =
+ {
+#define _(s,n) [TCP_INPUT_NEXT_##s] = n,
+ foreach_tcp4_input_next
+#undef _
+ },
+ .format_buffer = format_tcp_header,
+ .format_trace = format_tcp_rx_trace,
+};
+/* *INDENT-ON* */
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (tcp6_input_nolookup_node) =
+{
+ .name = "tcp6-input-nolookup",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+ .n_next_nodes = TCP_INPUT_N_NEXT,
+ .next_nodes =
+ {
+#define _(s,n) [TCP_INPUT_NEXT_##s] = n,
+ foreach_tcp6_input_next
+#undef _
+ },
+ .format_buffer = format_tcp_header,
+ .format_trace = format_tcp_rx_trace,
+};
+/* *INDENT-ON* */
+
VLIB_NODE_FN (tcp4_input_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_frame_t * from_frame)
{
- return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ );
+ return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ,
+ 0 /* is_nolookup */ );
}
VLIB_NODE_FN (tcp6_input_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_frame_t * from_frame)
{
- return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ );
+ return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ,
+ 0 /* is_nolookup */ );
}
/* *INDENT-OFF* */
TCP_ERROR_NONE);
_(CLOSING, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(CLOSING, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
+ TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(CLOSING, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
/* FIN confirming that the peer (app) has closed */