From: Florin Coras
Date: Mon, 5 Nov 2018 19:06:53 +0000 (-0800)
Subject: tcp: dequeue acked only once per burst
X-Git-Tag: v19.04-rc0~460
X-Git-Url: https://gerrit.fd.io/r/gitweb?p=vpp.git;a=commitdiff_plain;h=9ece3c03133309dda1f7f7f292bd071fa1ccb0f1

tcp: dequeue acked only once per burst

Avoid dequeuing acked bytes more than once per burst for a connection.
Although the fifos do not use locks, size decrements are atomic and
therefore rely on locked instructions, so deferring the dequeue to once
per burst reduces the number of such operations per connection. A
simplified sketch of this pattern follows the diff below.

Change-Id: Id65f4ea40b2c10057461402dfd0393034e6472d5
Signed-off-by: Florin Coras
---

diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index f8e74a88fcf..a466e3c935c 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -1322,12 +1322,14 @@ tcp_main_enable (vlib_main_t * vm)
 
   for (thread = 0; thread < num_threads; thread++)
     {
-      vec_validate (tm->wrk_ctx[thread].pending_fast_rxt, 0);
-      vec_validate (tm->wrk_ctx[thread].ongoing_fast_rxt, 0);
-      vec_validate (tm->wrk_ctx[thread].postponed_fast_rxt, 0);
+      vec_validate (tm->wrk_ctx[thread].pending_fast_rxt, 255);
+      vec_validate (tm->wrk_ctx[thread].ongoing_fast_rxt, 255);
+      vec_validate (tm->wrk_ctx[thread].postponed_fast_rxt, 255);
+      vec_validate (tm->wrk_ctx[thread].pending_deq_acked, 255);
       vec_reset_length (tm->wrk_ctx[thread].pending_fast_rxt);
       vec_reset_length (tm->wrk_ctx[thread].ongoing_fast_rxt);
       vec_reset_length (tm->wrk_ctx[thread].postponed_fast_rxt);
+      vec_reset_length (tm->wrk_ctx[thread].pending_deq_acked);
       tm->wrk_ctx[thread].vm = vlib_mains[thread];
 
       /*
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index bd5e4f71bdb..480b924c882 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -122,6 +122,7 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler;
   _(FINPNDG, "FIN pending")                     \
   _(FRXT_PENDING, "Fast-retransmit pending")    \
   _(FRXT_FIRST, "Fast-retransmit first again")  \
+  _(DEQ_PENDING, "Pending dequeue acked")       \
 
 typedef enum _tcp_connection_flag_bits
 {
@@ -308,6 +309,7 @@ typedef struct _tcp_connection
   u32 prev_ssthresh;    /**< ssthresh before congestion */
   u32 prev_cwnd;        /**< ssthresh before congestion */
   u32 bytes_acked;      /**< Bytes acknowledged by current segment */
+  u32 burst_acked;      /**< Bytes acknowledged in current burst */
   u32 snd_rxt_bytes;    /**< Retransmitted bytes */
   u32 snd_rxt_ts;       /**< Timestamp when first packet is retransmitted */
   u32 tsecr_last_ack;   /**< Timestamp echoed to us in last healthy ACK */
@@ -392,6 +394,7 @@ typedef struct tcp_worker_ctx_
                                              now doing fast rxt */
   u32 *postponed_fast_rxt;              /**< vector of connections that will do
                                              fast rxt */
+  u32 *pending_deq_acked;
   vlib_main_t *vm;                      /**< pointer to vm */
 
   CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 0b79a6699d7..9fc601b9788 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -494,23 +494,46 @@ done:
 }
 
 /**
- * Dequeue bytes that have been acked and while at it update RTT estimates.
+ * Dequeue bytes for connections that have received acks in last burst
  */
 static void
-tcp_dequeue_acked (tcp_connection_t * tc, u32 ack)
+tcp_handle_postponed_dequeues (tcp_worker_ctx_t * wrk)
 {
-  /* Dequeue the newly ACKed add SACKed bytes */
-  stream_session_dequeue_drop (&tc->connection,
-                               tc->bytes_acked + tc->sack_sb.snd_una_adv);
+  u32 thread_index = wrk->vm->thread_index;
+  u32 *pending_deq_acked;
+  tcp_connection_t *tc;
+  int i;
+
+  if (!vec_len (wrk->pending_deq_acked))
+    return;
 
-  tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
+  pending_deq_acked = wrk->pending_deq_acked;
+  for (i = 0; i < vec_len (pending_deq_acked); i++)
+    {
+      tc = tcp_connection_get (pending_deq_acked[i], thread_index);
+      tc->flags &= ~TCP_CONN_DEQ_PENDING;
 
-  /* Update rtt and rto */
-  tcp_update_rtt (tc, ack);
+      /* Dequeue the newly ACKed bytes */
+      stream_session_dequeue_drop (&tc->connection, tc->burst_acked);
+      tc->burst_acked = 0;
+      tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
 
-  /* If everything has been acked, stop retransmit timer
-   * otherwise update. */
-  tcp_retransmit_timer_update (tc);
+      /* If everything has been acked, stop retransmit timer
+       * otherwise update. */
+      tcp_retransmit_timer_update (tc);
+    }
+  _vec_len (wrk->pending_deq_acked) = 0;
+}
+
+static void
+tcp_program_dequeue (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
+{
+  if (!(tc->flags & TCP_CONN_DEQ_PENDING))
+    {
+      vec_add1 (wrk->pending_deq_acked, tc->c_c_index);
+      tc->flags |= TCP_CONN_DEQ_PENDING;
+    }
+  tc->burst_acked += tc->bytes_acked + tc->sack_sb.snd_una_adv;
 }
 
 /**
@@ -1023,7 +1046,7 @@ tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd)
       tc->snd_wl2 = ack;
       TCP_EVT_DBG (TCP_EVT_SND_WND, tc);
 
-      if (tc->snd_wnd < tc->snd_mss)
+      if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss))
        {
          /* Set persist timer if not set and we just got 0 wnd */
          if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST)
@@ -1033,7 +1056,7 @@ tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd)
       else
        {
          tcp_persist_timer_reset (tc);
-         if (!tcp_in_recovery (tc) && tc->rto_boff > 0)
+         if (PREDICT_FALSE (!tcp_in_recovery (tc) && tc->rto_boff > 0))
            {
              tc->rto_boff = 0;
              tcp_update_rto (tc);
@@ -1452,7 +1475,7 @@ partial_ack:
  * Process incoming ACK
  */
 static int
-tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
+tcp_rcv_ack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b,
             tcp_header_t * th, u32 * next, u32 * error)
 {
   u32 prev_snd_wnd, prev_snd_una;
@@ -1522,7 +1545,10 @@ process_ack:
   tcp_validate_txf_size (tc, tc->bytes_acked);
 
   if (tc->bytes_acked)
-    tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number);
+    {
+      tcp_program_dequeue (wrk, tc);
+      tcp_update_rtt (tc, vnet_buffer (b)->tcp.ack_number);
+    }
 
   TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
 
@@ -1992,6 +2018,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                          vlib_frame_t * frame, int is_ip4)
 {
   u32 thread_index = vm->thread_index, errors = 0;
+  tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
   u32 n_left_from, next_index, *from, *to_next;
   u16 err_counters[TCP_N_ERROR] = { 0 };
   u8 is_fin = 0;
@@ -2062,7 +2089,8 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
            }
 
          /* 5: check the ACK field */
-         if (PREDICT_FALSE (tcp_rcv_ack (tc0, b0, th0, &next0, &error0)))
+         if (PREDICT_FALSE (tcp_rcv_ack (wrk, tc0, b0, th0, &next0,
+                                         &error0)))
            {
              tcp_maybe_inc_err_counter (err_counters, error0);
              goto done;
@@ -2107,7 +2135,8 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                                              thread_index);
   err_counters[TCP_ERROR_EVENT_FIFO_FULL] = errors;
   tcp_store_err_counters (established, err_counters);
-  tcp_flush_frame_to_output (tcp_get_worker (thread_index), is_ip4);
+  tcp_handle_postponed_dequeues (wrk);
+  tcp_flush_frame_to_output (wrk, is_ip4);
   return frame->n_vectors;
 }
 
@@ -2588,6 +2617,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 {
   u32 n_left_from, next_index, *from, *to_next, n_fins = 0;
   u32 my_thread_index = vm->thread_index, errors = 0;
+  tcp_worker_ctx_t *wrk = tcp_get_worker (my_thread_index);
 
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
@@ -2705,7 +2735,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
        case TCP_STATE_ESTABLISHED:
          /* We can get packets in established state here because they
           * were enqueued before state change */
-         if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+         if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
            {
              tcp_maybe_inc_counter (rcv_process, error0, 1);
              goto drop;
@@ -2716,7 +2746,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          /* In addition to the processing for the ESTABLISHED state, if
           * our FIN is now acknowledged then enter FIN-WAIT-2 and
           * continue processing in that state. */
-         if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+         if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
            {
              tcp_maybe_inc_counter (rcv_process, error0, 1);
              goto drop;
@@ -2746,7 +2776,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          /* In addition to the processing for the ESTABLISHED state, if
           * the retransmission queue is empty, the user's CLOSE can be
           * acknowledged ("ok") but do not delete the TCB. */
-         if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+         if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
            {
              tcp_maybe_inc_counter (rcv_process, error0, 1);
              goto drop;
@@ -2754,7 +2784,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          break;
        case TCP_STATE_CLOSE_WAIT:
          /* Do the same processing as for the ESTABLISHED state. */
-         if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+         if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
            {
              tcp_maybe_inc_counter (rcv_process, error0, 1);
              goto drop;
@@ -2776,7 +2806,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          /* In addition to the processing for the ESTABLISHED state, if
           * the ACK acknowledges our FIN then enter the TIME-WAIT state,
           * otherwise ignore the segment. */
-         if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+         if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
            {
              tcp_maybe_inc_counter (rcv_process, error0, 1);
              goto drop;
@@ -2824,7 +2854,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
           * retransmission of the remote FIN. Acknowledge it, and restart
           * the 2 MSL timeout.
           */
-         if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+         if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
            {
              tcp_maybe_inc_counter (rcv_process, error0, 1);
              goto drop;
@@ -2937,6 +2967,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                                                 my_thread_index);
   tcp_inc_counter (rcv_process, TCP_ERROR_EVENT_FIFO_FULL, errors);
   tcp_inc_counter (rcv_process, TCP_ERROR_FIN_RCVD, n_fins);
+  tcp_handle_postponed_dequeues (wrk);
   return from_frame->n_vectors;
 }
 
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index b15cf9b362b..29a919bd160 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -1308,7 +1308,7 @@ tcp_prepare_segment (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
        }
     }
 
-  tcp_get_free_buffer_index (wrk, &bi);
+  (void) tcp_get_free_buffer_index (wrk, &bi);
   ASSERT (bi != (u32) ~ 0);
   *b = vlib_get_buffer (vm, bi);
   data = tcp_init_buffer (vm, *b);
@@ -1908,7 +1908,7 @@ send_unsent:
 
   /* RFC 6582: Send a new segment if permitted by the new value of cwnd. */
   snd_space = tcp_available_cc_snd_space (tc);
-  if (snd_space < tc->snd_mss)
+  if (snd_space < tc->snd_mss || tc->snd_mss == 0)
     goto done;
 
   max_deq = session_tx_fifo_max_dequeue (&tc->connection);
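
The commit message is terse about the payoff, so here is a minimal standalone
sketch of the deferral pattern the patch introduces: per-ACK processing only
accumulates acked bytes and flags the connection as pending, and the fifo
size, which is shared with the application side and therefore updated with
locked read-modify-write instructions, is decremented once per burst. All
names and types below (conn_t, worker_t, program_dequeue, flush_dequeues,
fifo_dequeue_drop) are simplified stand-ins invented for illustration, not
VPP APIs; the real implementation is tcp_program_dequeue() and
tcp_handle_postponed_dequeues() in the diff above.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define CONN_DEQ_PENDING (1 << 0)
#define MAX_CONNS 16

typedef struct
{
  atomic_uint fifo_size;   /* shared with the app thread, hence atomic ops */
  uint32_t burst_acked;    /* bytes acked since the last flush */
  uint32_t flags;
  uint32_t index;
} conn_t;

typedef struct
{
  conn_t conns[MAX_CONNS];       /* per-worker connection pool */
  uint32_t pending[MAX_CONNS];   /* connections with a postponed dequeue */
  uint32_t n_pending;
} worker_t;

/* Each drop is one locked read-modify-write on the shared fifo size. */
static void
fifo_dequeue_drop (conn_t * c, uint32_t bytes)
{
  atomic_fetch_sub (&c->fifo_size, bytes);
}

/* Per ACK: only accumulate; do not touch the shared fifo yet. */
static void
program_dequeue (worker_t * w, conn_t * c, uint32_t bytes_acked)
{
  if (!(c->flags & CONN_DEQ_PENDING))
    {
      w->pending[w->n_pending++] = c->index;
      c->flags |= CONN_DEQ_PENDING;
    }
  c->burst_acked += bytes_acked;
}

/* Once per burst: a single atomic op per connection, however many ACKs. */
static void
flush_dequeues (worker_t * w)
{
  for (uint32_t i = 0; i < w->n_pending; i++)
    {
      conn_t *c = &w->conns[w->pending[i]];
      fifo_dequeue_drop (c, c->burst_acked);
      c->burst_acked = 0;
      c->flags &= ~CONN_DEQ_PENDING;
    }
  w->n_pending = 0;
}

int
main (void)
{
  worker_t w = { 0 };
  conn_t *c = &w.conns[0];
  atomic_init (&c->fifo_size, 3000);

  /* A burst of three ACKs for the same connection ... */
  program_dequeue (&w, c, 1000);
  program_dequeue (&w, c, 1000);
  program_dequeue (&w, c, 1000);
  /* ... costs one locked instruction instead of three. */
  flush_dequeues (&w);

  printf ("fifo size after burst: %u\n", atomic_load (&c->fifo_size));
  return 0;
}

In the patch itself the flush runs once per dispatched frame per worker, at
the end of tcp46_established_inline (just before tcp_flush_frame_to_output)
and at the end of tcp46_rcv_process_inline.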