X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;ds=sidebyside;f=src%2Fvnet%2Ftcp%2Ftcp_input.c;h=5ec4099fa4633556a1affd4ed63165debdb03ef5;hb=607ece36b6f0fc6fe1122db58f2eb7d3713f34cc;hp=b8c889ee1cc7f3c901ae83e9132c1f314377fea9;hpb=d2f5174dd045da53395939cd55a0f6e2821f6dcf;p=vpp.git diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index b8c889ee1cc..5ec4099fa46 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -149,7 +149,7 @@ tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end) { ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval)); tc->tsval_recent = tc->rcv_opts.tsval; - tc->tsval_recent_age = tcp_time_now_w_thread (tc->c_thread_index); + tc->tsval_recent_age = tcp_time_tstamp (tc->c_thread_index); } } @@ -164,7 +164,7 @@ tcp_handle_rst (tcp_connection_t * tc) tcp_connection_cleanup (tc); break; case TCP_STATE_SYN_SENT: - session_stream_connect_notify (&tc->connection, 1 /* fail */ ); + session_stream_connect_notify (&tc->connection, SESSION_E_REFUSED); tcp_connection_cleanup (tc); break; case TCP_STATE_ESTABLISHED: @@ -215,6 +215,7 @@ tcp_rcv_rst (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) break; case TCP_STATE_SYN_SENT: /* Do not program ntf because the connection is half-open */ + tc->rst_state = tc->state; tcp_handle_rst (tc); break; case TCP_STATE_ESTABLISHED: @@ -287,7 +288,7 @@ tcp_segment_validate (tcp_worker_ctx_t * wrk, tcp_connection_t * tc0, /* If it just so happens that a segment updates tsval_recent for a * segment over 24 days old, invalidate tsval_recent. */ if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE, - tcp_time_now_w_thread (tc0->c_thread_index))) + tcp_time_tstamp (tc0->c_thread_index))) { tc0->tsval_recent = tc0->rcv_opts.tsval; clib_warning ("paws failed: 24-day old segment"); @@ -403,7 +404,7 @@ tcp_rcv_ack_no_cc (tcp_connection_t * tc, vlib_buffer_t * b, u32 * error) if (!(seq_leq (tc->snd_una, vnet_buffer (b)->tcp.ack_number) && seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt))) { - if (seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max) + if (seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt) && seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)) { tc->snd_nxt = vnet_buffer (b)->tcp.ack_number; @@ -423,31 +424,26 @@ acceptable: /** * Compute smoothed RTT as per VJ's '88 SIGCOMM and RFC6298 * - * Note that although the original article, srtt and rttvar are scaled + * Note that although in the original article srtt and rttvar are scaled * to minimize round-off errors, here we don't. Instead, we rely on * better precision time measurements. + * + * A known limitation of the algorithm is that a drop in rtt results in a + * rttvar increase and bigger RTO. + * + * mrtt must be provided in @ref TCP_TICK multiples, i.e., in us. Note that + * timestamps are measured as ms ticks so they must be converted before + * calling this function. */ static void tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) { int err, diff; - if (tc->srtt != 0) - { - err = mrtt - tc->srtt; - - /* XXX Drop in RTT results in RTTVAR increase and bigger RTO. - * The increase should be bound */ - tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1); - diff = (clib_abs (err) - (int) tc->rttvar) >> 2; - tc->rttvar = clib_max ((int) tc->rttvar + diff, 1); - } - else - { - /* First measurement. */ - tc->srtt = mrtt; - tc->rttvar = mrtt >> 1; - } + err = mrtt - tc->srtt; + tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1); + diff = (clib_abs (err) - (int) tc->rttvar) >> 2; + tc->rttvar = clib_max ((int) tc->rttvar + diff, 1); } static inline void @@ -505,8 +501,8 @@ tcp_update_rtt (tcp_connection_t * tc, tcp_rate_sample_t * rs, u32 ack) * seq_lt (tc->snd_una, ack). This is a condition for calling update_rtt */ else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr) { - u32 now = tcp_tstamp (tc); - mrtt = clib_max (now - tc->rcv_opts.tsecr, 1); + mrtt = clib_max (tcp_tstamp (tc) - tc->rcv_opts.tsecr, 1); + mrtt *= TCP_TSTP_TO_HZ; } estimate_rtt: @@ -542,8 +538,8 @@ tcp_estimate_initial_rtt (tcp_connection_t * tc) } else { - mrtt = tcp_time_now_w_thread (thread_index) - tc->rcv_opts.tsecr; - mrtt = clib_max (mrtt, 1); + mrtt = tcp_tstamp (tc) - tc->rcv_opts.tsecr; + mrtt = clib_max (mrtt, 1) * TCP_TSTP_TO_HZ; /* Due to retransmits we don't know the initial mrtt */ if (tc->rto_boff && mrtt > 1 * THZ) mrtt = 1 * THZ; @@ -551,7 +547,11 @@ tcp_estimate_initial_rtt (tcp_connection_t * tc) } if (mrtt > 0 && mrtt < TCP_RTT_MAX) - tcp_estimate_rtt (tc, mrtt); + { + /* First measurement as per RFC 6298 */ + tc->srtt = mrtt; + tc->rttvar = mrtt >> 1; + } tcp_update_rto (tc); } @@ -580,13 +580,7 @@ tcp_handle_postponed_dequeues (tcp_worker_ctx_t * wrk) /* Dequeue the newly ACKed bytes */ session_tx_fifo_dequeue_drop (&tc->connection, tc->burst_acked); - tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); - - if (PREDICT_FALSE (tc->flags & TCP_CONN_PSH_PENDING)) - { - if (seq_leq (tc->psh_seq, tc->snd_una)) - tc->flags &= ~TCP_CONN_PSH_PENDING; - } + tcp_validate_txf_size (tc, tc->snd_nxt - tc->snd_una); if (tcp_is_descheduled (tc)) tcp_reschedule (tc); @@ -689,7 +683,7 @@ tcp_cc_init_congestion (tcp_connection_t * tc) * three segments that have left the network and should've been * buffered at the receiver XXX */ if (!tcp_opts_sack_permitted (&tc->rcv_opts)) - tc->cwnd += 3 * tc->snd_mss; + tc->cwnd += TCP_DUPACK_THRESHOLD * tc->snd_mss; tc->fr_occurences += 1; TCP_EVT (TCP_EVT_CC_EVT, tc, 4); @@ -720,14 +714,6 @@ tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) return (tcp_cc_is_spurious_timeout_rxt (tc)); } -static inline u8 -tcp_should_fastrecover_sack (tcp_connection_t * tc) -{ - return (tc->sack_sb.lost_bytes - || ((TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss - < tc->sack_sb.sacked_bytes)); -} - static inline u8 tcp_should_fastrecover (tcp_connection_t * tc, u8 has_sack) { @@ -752,8 +738,7 @@ tcp_should_fastrecover (tcp_connection_t * tc, u8 has_sack) return 0; } } - return ((tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) - || tcp_should_fastrecover_sack (tc)); + return tc->sack_sb.lost_bytes || tc->rcv_dupacks >= tc->sack_sb.reorder; } static int @@ -1008,7 +993,7 @@ tcp_rcv_ack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b, { /* We've probably entered recovery and the peer still has some * of the data we've sent. Update snd_nxt and accept the ack */ - if (seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max) + if (seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt) && seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)) { tc->snd_nxt = vnet_buffer (b)->tcp.ack_number; @@ -1202,7 +1187,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, } /* Update SACK list if need be */ - if (tcp_opts_sack_permitted (&tc->rcv_opts)) + if (tcp_opts_sack_permitted (&tc->rcv_opts) && vec_len (tc->snd_sacks)) { /* Remove SACK blocks that have been delivered */ tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt); @@ -1307,6 +1292,10 @@ tcp_segment_rcv (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, ASSERT (n_data_bytes); tc->data_segs_in += 1; + /* Make sure we don't consume trailing bytes */ + if (PREDICT_FALSE (b->current_length > n_data_bytes)) + b->current_length = n_data_bytes; + /* Handle out-of-order data */ if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt)) { @@ -1787,7 +1776,7 @@ tcp_check_tx_offload (tcp_connection_t * tc, int is_ipv4) return; hw_if = vnet_get_sup_hw_interface (vnm, sw_if_idx); - if (hw_if->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO) + if (hw_if->caps & VNET_HW_INTERFACE_CAP_SUPPORTS_TCP_GSO) tc->cfg_flags |= TCP_CFG_F_TSO; } @@ -1823,13 +1812,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } - /* Half-open completed recently but the connection was't removed - * yet by the owning thread */ + /* Half-open completed or cancelled recently but the connection + * was't removed yet by the owning thread */ if (PREDICT_FALSE (tc0->flags & TCP_CONN_HALF_OPEN_DONE)) { - /* Make sure the connection actually exists */ - ASSERT (tcp_lookup_connection (tc0->c_fib_index, b0, - my_thread_index, is_ip4)); error0 = TCP_ERROR_SPURIOUS_SYN_ACK; goto drop; } @@ -1934,7 +1920,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (tcp_opts_tstamp (&new_tc0->rcv_opts)) { new_tc0->tsval_recent = new_tc0->rcv_opts.tsval; - new_tc0->tsval_recent_age = tcp_time_now (); + new_tc0->tsval_recent_age = tcp_time_tstamp (my_thread_index); } if (tcp_opts_wscale (&new_tc0->rcv_opts)) @@ -1972,6 +1958,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto cleanup_ho; } + transport_fifos_init_ooo (&new_tc0->connection); new_tc0->tx_fifo_size = transport_tx_fifo_size (&new_tc0->connection); /* Update rtt with the syn-ack sample */ @@ -1995,6 +1982,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto cleanup_ho; } + transport_fifos_init_ooo (&new_tc0->connection); new_tc0->tx_fifo_size = transport_tx_fifo_size (&new_tc0->connection); new_tc0->rtt_ts = 0; @@ -2389,6 +2377,9 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, case TCP_STATE_FIN_WAIT_2: if (vnet_buffer (b0)->tcp.data_len) error0 = tcp_segment_rcv (wrk, tc0, b0); + /* Don't accept out of order fins lower */ + if (vnet_buffer (b0)->tcp.seq_end != tc0->rcv_nxt) + goto drop; break; case TCP_STATE_CLOSE_WAIT: case TCP_STATE_CLOSING: @@ -2570,20 +2561,31 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, b = vlib_get_buffer (vm, bi); - lc = tcp_listener_get (vnet_buffer (b)->tcp.connection_index); - if (PREDICT_FALSE (lc == 0)) + /* Flags initialized with connection state after lookup */ + if (vnet_buffer (b)->tcp.flags == TCP_STATE_LISTEN) + { + lc = tcp_listener_get (vnet_buffer (b)->tcp.connection_index); + } + else { tcp_connection_t *tc; tc = tcp_connection_get (vnet_buffer (b)->tcp.connection_index, thread_index); if (tc->state != TCP_STATE_TIME_WAIT) { + lc = 0; error = TCP_ERROR_CREATE_EXISTS; goto done; } lc = tcp_lookup_listener (b, tc->c_fib_index, is_ip4); /* clean up the old session */ tcp_connection_del (tc); + /* listener was cleaned up */ + if (!lc) + { + error = TCP_ERROR_NO_LISTENER; + goto done; + } } /* Make sure connection wasn't just created */ @@ -2630,6 +2632,13 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_connection_init_vars (child); child->rto = TCP_RTO_MIN; + /* + * This initializes elog track, must be done before synack. + * We also do it before possible tcp_connection_cleanup() as it + * generates TCP_EVT_DELETE event. + */ + TCP_EVT (TCP_EVT_SYN_RCVD, child, 1); + if (session_stream_accept (&child->connection, lc->c_s_index, lc->c_thread_index, 0 /* notify */ )) { @@ -2638,23 +2647,17 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto done; } + transport_fifos_init_ooo (&child->connection); child->tx_fifo_size = transport_tx_fifo_size (&child->connection); - /* This initializes elog track, must be done before synack */ - TCP_EVT (TCP_EVT_SYN_RCVD, child, 1); - tcp_send_synack (child); done: if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED)) { - tcp_rx_trace_t *t; - t = vlib_add_trace (vm, node, b, sizeof (*t)); - clib_memcpy_fast (&t->tcp_header, tcp_buffer_hdr (b), - sizeof (t->tcp_header)); - clib_memcpy_fast (&t->tcp_connection, lc, - sizeof (t->tcp_connection)); + tcp_rx_trace_t *t = vlib_add_trace (vm, node, b, sizeof (*t)); + tcp_set_rx_trace_data (t, lc, tcp_buffer_hdr (b), b, is_ip4); } n_syns += (error == TCP_ERROR_NONE); @@ -2804,6 +2807,10 @@ tcp_input_dispatch_buffer (tcp_main_t * tm, tcp_connection_t * tc, error = tm->dispatch_table[tc->state][flags].error; tc->segs_in += 1; + /* Track connection state when packet was received. It helps + * @ref tcp46_listen_inline detect port reuse */ + vnet_buffer (b)->tcp.flags = tc->state; + if (PREDICT_FALSE (error != TCP_ERROR_NONE)) { b->error = error_node->errors[error]; @@ -2823,7 +2830,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b; u16 nexts[VLIB_FRAME_SIZE], *next; - tcp_set_time_now (tcp_get_worker (thread_index)); + tcp_update_time_now (tcp_get_worker (thread_index)); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -3043,6 +3050,12 @@ VLIB_REGISTER_NODE (tcp6_input_node) = /* *INDENT-ON* */ #ifndef CLIB_MARCH_VARIANT +void +tcp_check_gso (tcp_connection_t *tc) +{ + tcp_check_tx_offload (tc, tc->c_is_ip4); +} + static void tcp_dispatch_table_init (tcp_main_t * tm) { @@ -3219,12 +3232,14 @@ do { \ _(FIN_WAIT_2, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(FIN_WAIT_2, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(FIN_WAIT_2, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(CLOSE_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(CLOSE_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(CLOSE_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(CLOSE_WAIT, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(CLOSE_WAIT, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID); _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);