X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp_input.c;h=7f325719980880d337f8b5066c91294034132392;hb=e52eafd0471d7a6d2aca992d654786008d9a0f87;hp=713e11fd896470cea0e583248df1d0c92c3f1624;hpb=0765d97abef74727c040d2eaf9112865d59f1593;p=vpp.git diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c old mode 100755 new mode 100644 index 713e11fd896..7f325719980 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -215,6 +215,7 @@ tcp_rcv_rst (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) break; case TCP_STATE_SYN_SENT: /* Do not program ntf because the connection is half-open */ + tc->rst_state = tc->state; tcp_handle_rst (tc); break; case TCP_STATE_ESTABLISHED: @@ -426,8 +427,6 @@ acceptable: * Note that although the original article, srtt and rttvar are scaled * to minimize round-off errors, here we don't. Instead, we rely on * better precision time measurements. - * - * TODO support us rtt resolution */ static void tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) @@ -452,16 +451,28 @@ tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) } } +static inline void +tcp_estimate_rtt_us (tcp_connection_t * tc, f64 mrtt) +{ + tc->mrtt_us = tc->mrtt_us + (mrtt - tc->mrtt_us) * 0.125; +} + /** - * Update RTT estimate and RTO timer + * Update rtt estimate * - * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK - * timing. Middle boxes are known to fiddle with TCP options so we - * should give higher priority to ACK timing. + * We have potentially three sources of rtt measurements: * - * This should be called only if previously sent bytes have been acked. + * TSOPT difference between current and echoed timestamp. It has ms + * precision and can be computed per ack + * ACK timing one sequence number is tracked per rtt with us (micro second) + * precision. + * rate sample if enabled, all outstanding bytes are tracked with us + * precision. Every ack and sack are a rtt sample * - * return 1 if valid rtt 0 otherwise + * Middle boxes are known to fiddle with TCP options so we give higher + * priority to ACK timing. + * + * For now, rate sample rtts are only used under congestion. */ static int tcp_update_rtt (tcp_connection_t * tc, tcp_rate_sample_t * rs, u32 ack) @@ -473,19 +484,19 @@ tcp_update_rtt (tcp_connection_t * tc, tcp_rate_sample_t * rs, u32 ack) if (tcp_in_cong_recovery (tc)) { /* Accept rtt estimates for samples that have not been retransmitted */ - if ((tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE) - && !(rs->flags & TCP_BTS_IS_RXT)) - { - mrtt = rs->rtt_time * THZ; - goto estimate_rtt; - } - goto done; + if (!(tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE) + || (rs->flags & TCP_BTS_IS_RXT)) + goto done; + if (rs->rtt_time) + tcp_estimate_rtt_us (tc, rs->rtt_time); + mrtt = rs->rtt_time * THZ; + goto estimate_rtt; } if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq)) { f64 sample = tcp_time_now_us (tc->c_thread_index) - tc->rtt_ts; - tc->mrtt_us = tc->mrtt_us + (sample - tc->mrtt_us) * 0.125; + tcp_estimate_rtt_us (tc, sample); mrtt = clib_max ((u32) (sample * THZ), 1); /* Allow measuring of a new RTT */ tc->rtt_ts = 0; @@ -1048,10 +1059,11 @@ process_ack: if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE) tcp_bt_sample_delivery_rate (tc, &rs); - if (tc->bytes_acked) + if (tc->bytes_acked + tc->sack_sb.last_sacked_bytes) { - tcp_program_dequeue (wrk, tc); tcp_update_rtt (tc, &rs, vnet_buffer (b)->tcp.ack_number); + if (tc->bytes_acked) + tcp_program_dequeue (wrk, tc); } TCP_EVT (TCP_EVT_ACK_RCVD, tc); @@ -1252,26 +1264,6 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, return TCP_ERROR_ENQUEUED_OOO; } -/** - * Check if ACK could be delayed. If ack can be delayed, it should return - * true for a full frame. If we're always acking return 0. - */ -always_inline int -tcp_can_delack (tcp_connection_t * tc) -{ - /* Send ack if ... */ - if (TCP_ALWAYS_ACK - /* just sent a rcv wnd 0 - || (tc->flags & TCP_CONN_SENT_RCV_WND0) != 0 */ - /* constrained to send ack */ - || (tc->flags & TCP_CONN_SNDACK) != 0 - /* we're almost out of tx wnd */ - || tcp_available_cc_snd_space (tc) < 4 * tc->snd_mss) - return 0; - - return 1; -} - static int tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop) { @@ -1360,14 +1352,6 @@ in_order: /* In order data, enqueue. Fifo figures out by itself if any out-of-order * segments can be enqueued after fifo tail offset changes. */ error = tcp_session_enqueue_data (tc, b, n_data_bytes); - if (tcp_can_delack (tc)) - { - if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK)) - tcp_timer_set (&wrk->timer_wheel, tc, TCP_TIMER_DELACK, - tcp_cfg.delack_time); - goto done; - } - tcp_program_ack (tc); done: @@ -1948,11 +1932,6 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, new_tc0->timers[TCP_TIMER_RETRANSMIT_SYN] = TCP_TIMER_HANDLE_INVALID; new_tc0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX]; - /* If this is not the owning thread, wait for syn retransmit to - * expire and cleanup then */ - if (tcp_half_open_connection_cleanup (tc0)) - tc0->flags |= TCP_CONN_HALF_OPEN_DONE; - if (tcp_opts_tstamp (&new_tc0->rcv_opts)) { new_tc0->tsval_recent = new_tc0->rcv_opts.tsval; @@ -1985,12 +1964,13 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Notify app that we have connection. If session layer can't * allocate session send reset */ - if (session_stream_connect_notify (&new_tc0->connection, 0)) + if (session_stream_connect_notify (&new_tc0->connection, + SESSION_E_NONE)) { tcp_send_reset_w_pkt (new_tc0, b0, my_thread_index, is_ip4); tcp_connection_cleanup (new_tc0); error0 = TCP_ERROR_CREATE_SESSION_FAIL; - goto drop; + goto cleanup_ho; } new_tc0->tx_fifo_size = @@ -2006,13 +1986,14 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, new_tc0->state = TCP_STATE_SYN_RCVD; /* Notify app that we have connection */ - if (session_stream_connect_notify (&new_tc0->connection, 0)) + if (session_stream_connect_notify (&new_tc0->connection, + SESSION_E_NONE)) { tcp_connection_cleanup (new_tc0); tcp_send_reset_w_pkt (tc0, b0, my_thread_index, is_ip4); TCP_EVT (TCP_EVT_RST_SENT, tc0); error0 = TCP_ERROR_CREATE_SESSION_FAIL; - goto drop; + goto cleanup_ho; } new_tc0->tx_fifo_size = @@ -2021,7 +2002,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_init_snd_vars (new_tc0); tcp_send_synack (new_tc0); error0 = TCP_ERROR_SYNS_RCVD; - goto drop; + goto cleanup_ho; } if (!(new_tc0->cfg_flags & TCP_CFG_F_NO_TSO)) @@ -2042,6 +2023,13 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_send_ack (new_tc0); } + cleanup_ho: + + /* If this is not the owning thread, wait for syn retransmit to + * expire and cleanup then */ + if (tcp_half_open_connection_cleanup (tc0)) + tc0->flags |= TCP_CONN_HALF_OPEN_DONE; + drop: tcp_inc_counter (syn_sent, error0, 1); @@ -2643,6 +2631,13 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_connection_init_vars (child); child->rto = TCP_RTO_MIN; + /* + * This initializes elog track, must be done before synack. + * We also do it before possible tcp_connection_cleanup() as it + * generates TCP_EVT_DELETE event. + */ + TCP_EVT (TCP_EVT_SYN_RCVD, child, 1); + if (session_stream_accept (&child->connection, lc->c_s_index, lc->c_thread_index, 0 /* notify */ )) { @@ -2655,8 +2650,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_send_synack (child); - TCP_EVT (TCP_EVT_SYN_RCVD, child, 1); - done: if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED)) @@ -2834,11 +2827,9 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_main_t *tm = vnet_get_tcp_main (); vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b; u16 nexts[VLIB_FRAME_SIZE], *next; - vlib_node_runtime_t *error_node; tcp_set_time_now (tcp_get_worker (thread_index)); - error_node = vlib_node_get_runtime (vm, tcp_node_index (input, is_ip4)); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; vlib_get_buffers (vm, from, bufs, n_left_from); @@ -2874,8 +2865,8 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index; vnet_buffer (b[1])->tcp.connection_index = tc1->c_c_index; - tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], error_node); - tcp_input_dispatch_buffer (tm, tc1, b[1], &next[1], error_node); + tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], node); + tcp_input_dispatch_buffer (tm, tc1, b[1], &next[1], node); } else { @@ -2883,24 +2874,24 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { ASSERT (tcp_lookup_is_valid (tc0, b[0], tcp_buffer_hdr (b[0]))); vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index; - tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], error_node); + tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], node); } else { tcp_input_set_error_next (tm, &next[0], &error0, is_ip4); - b[0]->error = error_node->errors[error0]; + b[0]->error = node->errors[error0]; } if (PREDICT_TRUE (tc1 != 0)) { ASSERT (tcp_lookup_is_valid (tc1, b[1], tcp_buffer_hdr (b[1]))); vnet_buffer (b[1])->tcp.connection_index = tc1->c_c_index; - tcp_input_dispatch_buffer (tm, tc1, b[1], &next[1], error_node); + tcp_input_dispatch_buffer (tm, tc1, b[1], &next[1], node); } else { tcp_input_set_error_next (tm, &next[1], &error1, is_ip4); - b[1]->error = error_node->errors[error1]; + b[1]->error = node->errors[error1]; } } @@ -2926,12 +2917,12 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { ASSERT (tcp_lookup_is_valid (tc0, b[0], tcp_buffer_hdr (b[0]))); vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index; - tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], error_node); + tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], node); } else { tcp_input_set_error_next (tm, &next[0], &error0, is_ip4); - b[0]->error = error_node->errors[error0]; + b[0]->error = node->errors[error0]; } b += 1; @@ -3239,6 +3230,7 @@ do { \ _(CLOSE_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(CLOSE_WAIT, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(CLOSE_WAIT, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID); _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);