X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp_input.c;h=6809a9173df6d33db78d367418a38eef4332aa78;hb=178cf493d009995b28fdf220f04c98860ff79a9b;hp=9c303eb01a5fb58f31fedc4a2338853914b3e607;hpb=7ac053b27fee8f9e437cf7b61357943356381061;p=vpp.git diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 9c303eb01a5..6809a9173df 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -455,8 +455,11 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq)) { - tc->mrtt_us = tcp_time_now_us (tc->c_thread_index) - tc->rtt_ts; - mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1); + f64 sample = tcp_time_now_us (tc->c_thread_index) - tc->rtt_ts; + tc->mrtt_us = tc->mrtt_us + (sample - tc->mrtt_us) * 0.125; + mrtt = clib_max ((u32) (sample * THZ), 1); + /* Allow measuring of a new RTT */ + tc->rtt_ts = 0; } /* As per RFC7323 TSecr can be used for RTTM only if the segment advances * snd_una, i.e., the left side of the send window: @@ -475,9 +478,6 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) done: - /* Allow measuring of a new RTT */ - tc->rtt_ts = 0; - /* If we got here something must've been ACKed so make sure boff is 0, * even if mrtt is not valid since we update the rto lower */ tc->rto_boff = 0; @@ -486,6 +486,29 @@ done: return 0; } +static void +tcp_estimate_initial_rtt (tcp_connection_t * tc) +{ + u8 thread_index = vlib_num_workers ()? 1 : 0; + int mrtt; + + if (tc->rtt_ts) + { + tc->mrtt_us = tcp_time_now_us (thread_index) - tc->rtt_ts; + mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1); + tc->rtt_ts = 0; + } + else + { + mrtt = tcp_time_now_w_thread (thread_index) - tc->rcv_opts.tsecr; + tc->mrtt_us = (f64) mrtt *TCP_TICK; + + } + + if (mrtt > 0 && mrtt < TCP_RTT_MAX) + tcp_estimate_rtt (tc, mrtt); +} + /** * Dequeue bytes for connections that have received acks in last burst */ @@ -506,6 +529,9 @@ tcp_handle_postponed_dequeues (tcp_worker_ctx_t * wrk) tc = tcp_connection_get (pending_deq_acked[i], thread_index); tc->flags &= ~TCP_CONN_DEQ_PENDING; + if (PREDICT_FALSE (!tc->burst_acked)) + continue; + /* Dequeue the newly ACKed bytes */ stream_session_dequeue_drop (&tc->connection, tc->burst_acked); tc->burst_acked = 0; @@ -514,6 +540,11 @@ tcp_handle_postponed_dequeues (tcp_worker_ctx_t * wrk) /* If everything has been acked, stop retransmit timer * otherwise update. */ tcp_retransmit_timer_update (tc); + + /* If not congested, update pacer based on our new + * cwnd estimate */ + if (!tcp_in_fastrecovery (tc)) + tcp_connection_tx_pacer_update (tc); } _vec_len (wrk->pending_deq_acked) = 0; } @@ -1084,6 +1115,7 @@ tcp_cc_recovery_exit (tcp_connection_t * tc) tcp_update_rto (tc); tc->snd_rxt_ts = 0; tc->snd_nxt = tc->snd_una_max; + tc->rtt_ts = 0; tcp_recovery_off (tc); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); } @@ -1096,9 +1128,9 @@ tcp_cc_fastrecovery_exit (tcp_connection_t * tc) tc->rcv_dupacks = 0; tc->snd_nxt = tc->snd_una_max; tc->snd_rxt_bytes = 0; + tc->rtt_ts = 0; tcp_fastrecovery_off (tc); - tcp_fastrecovery_1_smss_off (tc); tcp_fastrecovery_first_off (tc); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); @@ -1381,6 +1413,10 @@ partial_ack: * Legitimate ACK. 1) See if we can exit recovery */ + /* Update the pacing rate. For the first partial ack we move from + * the artificially constrained rate to the one after congestion */ + tcp_connection_tx_pacer_update (tc); + if (seq_geq (tc->snd_una, tc->snd_congestion)) { tcp_retransmit_timer_update (tc); @@ -1403,12 +1439,8 @@ partial_ack: * Legitimate ACK. 2) If PARTIAL ACK try to retransmit */ - /* Update the pacing rate. For the first partial ack we move from - * the artificially constrained rate to the one after congestion */ - tcp_connection_tx_pacer_update (tc); - /* XXX limit this only to first partial ack? */ - tcp_retransmit_timer_force_update (tc); + tcp_retransmit_timer_update (tc); /* RFC6675: If the incoming ACK is a cumulative acknowledgment, * reset dupacks to 0. Also needed if in congestion recovery */ @@ -1567,6 +1599,54 @@ process_ack: return 0; } +static void +tcp_program_disconnect (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) +{ + if (!tcp_disconnect_pending (tc)) + { + vec_add1 (wrk->pending_disconnects, tc->c_c_index); + tcp_disconnect_pending_on (tc); + } +} + +static void +tcp_handle_disconnects (tcp_worker_ctx_t * wrk) +{ + u32 thread_index, *pending_disconnects; + tcp_connection_t *tc; + int i; + + if (!vec_len (wrk->pending_disconnects)) + return; + + thread_index = wrk->vm->thread_index; + pending_disconnects = wrk->pending_disconnects; + for (i = 0; i < vec_len (pending_disconnects); i++) + { + tc = tcp_connection_get (pending_disconnects[i], thread_index); + tcp_disconnect_pending_off (tc); + stream_session_disconnect_notify (&tc->connection); + } + _vec_len (wrk->pending_disconnects) = 0; +} + +static void +tcp_rcv_fin (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b, + u32 * error) +{ + /* Enter CLOSE-WAIT and notify session. To avoid lingering + * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */ + /* Account for the FIN if nothing else was received */ + if (vnet_buffer (b)->tcp.data_len == 0) + tc->rcv_nxt += 1; + tcp_program_ack (wrk, tc); + tc->state = TCP_STATE_CLOSE_WAIT; + tcp_program_disconnect (wrk, tc); + tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); + TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc); + *error = TCP_ERROR_FIN_RCVD; +} + static u8 tcp_sack_vector_is_sane (sack_block_t * sacks) { @@ -1910,13 +1990,14 @@ tcp_set_rx_trace_data (tcp_rx_trace_t * t0, tcp_connection_t * tc0, { if (tc0) { - clib_memcpy (&t0->tcp_connection, tc0, sizeof (t0->tcp_connection)); + clib_memcpy_fast (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); } else { th0 = tcp_buffer_hdr (b0); } - clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + clib_memcpy_fast (&t0->tcp_header, th0, sizeof (t0->tcp_header)); } static void @@ -2066,19 +2147,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* 8: check the FIN bit */ if (PREDICT_FALSE (is_fin)) - { - /* Enter CLOSE-WAIT and notify session. To avoid lingering - * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */ - /* Account for the FIN if nothing else was received */ - if (vnet_buffer (b0)->tcp.data_len == 0) - tc0->rcv_nxt += 1; - tcp_program_ack (wrk, tc0); - tc0->state = TCP_STATE_CLOSE_WAIT; - stream_session_disconnect_notify (&tc0->connection); - tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); - TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); - error0 = TCP_ERROR_FIN_RCVD; - } + tcp_rcv_fin (wrk, tc0, b0, &error0); done: tcp_inc_err_counter (err_counters, error0, 1); @@ -2089,6 +2158,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, err_counters[TCP_ERROR_EVENT_FIFO_FULL] = errors; tcp_store_err_counters (established, err_counters); tcp_handle_postponed_dequeues (wrk); + tcp_handle_disconnects (wrk); vlib_buffer_free (vm, first_buffer, frame->n_vectors); return frame->n_vectors; @@ -2374,7 +2444,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Valid SYN or SYN-ACK. Move connection from half-open pool to * current thread pool. */ pool_get (tm->connections[my_thread_index], new_tc0); - clib_memcpy (new_tc0, tc0, sizeof (*new_tc0)); + clib_memcpy_fast (new_tc0, tc0, sizeof (*new_tc0)); new_tc0->c_c_index = new_tc0 - tm->connections[my_thread_index]; new_tc0->c_thread_index = my_thread_index; new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end; @@ -2426,8 +2496,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } + new_tc0->tx_fifo_size = + transport_tx_fifo_size (&new_tc0->connection); /* Update rtt with the syn-ack sample */ - tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number); + tcp_estimate_initial_rtt (new_tc0); TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, new_tc0); error0 = TCP_ERROR_SYN_ACKS_RCVD; } @@ -2445,8 +2517,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } - tc0->rtt_ts = 0; - tcp_init_snd_vars (tc0); + new_tc0->tx_fifo_size = + transport_tx_fifo_size (&new_tc0->connection); + new_tc0->rtt_ts = 0; + tcp_init_snd_vars (new_tc0); tcp_send_synack (new_tc0); error0 = TCP_ERROR_SYNS_RCVD; goto drop; @@ -2471,8 +2545,9 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (PREDICT_FALSE ((b0->flags & VLIB_BUFFER_IS_TRACED) && tcp0 != 0)) { t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); - clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header)); - clib_memcpy (&t0->tcp_connection, tc0, sizeof (t0->tcp_connection)); + clib_memcpy_fast (&t0->tcp_header, tcp0, sizeof (t0->tcp_header)); + clib_memcpy_fast (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); } } @@ -2636,7 +2711,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } /* Update rtt and rto */ - tcp_update_rtt (tc0, vnet_buffer (b0)->tcp.ack_number); + tcp_estimate_initial_rtt (tc0); /* Switch state to ESTABLISHED */ tc0->state = TCP_STATE_ESTABLISHED; @@ -2687,6 +2762,12 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * wait for peer's FIN but not indefinitely. */ tcp_connection_timers_reset (tc0); tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + + /* Don't try to deq the FIN acked */ + if (tc0->burst_acked > 1) + stream_session_dequeue_drop (&tc0->connection, + tc0->burst_acked - 1); + tc0->burst_acked = 0; } break; case TCP_STATE_FIN_WAIT_2: @@ -2695,6 +2776,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * acknowledged ("ok") but do not delete the TCB. */ if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0)) goto drop; + tc0->burst_acked = 0; break; case TCP_STATE_CLOSE_WAIT: /* Do the same processing as for the ESTABLISHED state. */ @@ -3005,10 +3087,10 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } else { - clib_memcpy (&child0->c_lcl_ip6, &ip60->dst_address, - sizeof (ip6_address_t)); - clib_memcpy (&child0->c_rmt_ip6, &ip60->src_address, - sizeof (ip6_address_t)); + clib_memcpy_fast (&child0->c_lcl_ip6, &ip60->dst_address, + sizeof (ip6_address_t)); + clib_memcpy_fast (&child0->c_rmt_ip6, &ip60->src_address, + sizeof (ip6_address_t)); } if (tcp_options_parse (th0, &child0->rcv_opts)) @@ -3050,6 +3132,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } + child0->tx_fifo_size = transport_tx_fifo_size (&child0->connection); tcp_send_synack (child0); tcp_timer_set (child0, TCP_TIMER_ESTABLISH, TCP_SYN_RCVD_TIME); @@ -3058,8 +3141,9 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); - clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); - clib_memcpy (&t0->tcp_connection, lc0, sizeof (t0->tcp_connection)); + clib_memcpy_fast (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + clib_memcpy_fast (&t0->tcp_connection, lc0, + sizeof (t0->tcp_connection)); } n_syns += (error0 == TCP_ERROR_NONE);