X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp_input.c;h=9fc601b9788bbdbbbaaced281d6a2a7837ffcc59;hb=9ece3c03133309dda1f7f7f292bd071fa1ccb0f1;hp=154b9ac836382c46c366757571de1d39f6a445a3;hpb=e55a6d7a97044c2f4fd0231242e062924d75c7b6;p=vpp.git diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 154b9ac8363..9fc601b9788 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -258,7 +258,7 @@ tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end) { ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval)); tc->tsval_recent = tc->rcv_opts.tsval; - tc->tsval_recent_age = tcp_time_now (); + tc->tsval_recent_age = tcp_time_now_w_thread (tc->c_thread_index); } } @@ -308,7 +308,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, /* If it just so happens that a segment updates tsval_recent for a * segment over 24 days old, invalidate tsval_recent. */ if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE, - tcp_time_now ())) + tcp_time_now_w_thread (tc0->c_thread_index))) { /* Age isn't reset until we get a valid tsval (bsd inspired) */ tc0->tsval_recent = 0; @@ -470,7 +470,8 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) * seq_lt (tc->snd_una, ack). This is a condition for calling update_rtt */ else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr) { - mrtt = clib_max (tcp_time_now () - tc->rcv_opts.tsecr, 1); + u32 now = tcp_time_now_w_thread (tc->c_thread_index); + mrtt = clib_max (now - tc->rcv_opts.tsecr, 1); } /* Ignore dubious measurements */ @@ -493,23 +494,46 @@ done: } /** - * Dequeue bytes that have been acked and while at it update RTT estimates. + * Dequeue bytes for connections that have received acks in last burst */ static void -tcp_dequeue_acked (tcp_connection_t * tc, u32 ack) +tcp_handle_postponed_dequeues (tcp_worker_ctx_t * wrk) { - /* Dequeue the newly ACKed add SACKed bytes */ - stream_session_dequeue_drop (&tc->connection, - tc->bytes_acked + tc->sack_sb.snd_una_adv); + u32 thread_index = wrk->vm->thread_index; + u32 *pending_deq_acked; + tcp_connection_t *tc; + int i; + + if (!vec_len (wrk->pending_deq_acked)) + return; + + pending_deq_acked = wrk->pending_deq_acked; + for (i = 0; i < vec_len (pending_deq_acked); i++) + { + tc = tcp_connection_get (pending_deq_acked[i], thread_index); + tc->flags &= ~TCP_CONN_DEQ_PENDING; - tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); + /* Dequeue the newly ACKed bytes */ + stream_session_dequeue_drop (&tc->connection, tc->burst_acked); + tc->burst_acked = 0; + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); - /* Update rtt and rto */ - tcp_update_rtt (tc, ack); + /* If everything has been acked, stop retransmit timer + * otherwise update. */ + tcp_retransmit_timer_update (tc); + } + _vec_len (wrk->pending_deq_acked) = 0; +} - /* If everything has been acked, stop retransmit timer - * otherwise update. */ - tcp_retransmit_timer_update (tc); +static void +tcp_program_dequeue (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) +{ + if (!(tc->flags & TCP_CONN_DEQ_PENDING)) + { + vec_add1 (wrk->pending_deq_acked, tc->c_c_index); + tc->flags |= TCP_CONN_DEQ_PENDING; + } + tc->burst_acked += tc->bytes_acked + tc->sack_sb.snd_una_adv; } /** @@ -718,8 +742,7 @@ scoreboard_update_bytes (tcp_connection_t * tc, sack_scoreboard_t * sb) sack_scoreboard_hole_t * scoreboard_next_rxt_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * start, - u8 have_sent_1_smss, - u8 * can_rescue, u8 * snd_limited) + u8 have_unsent, u8 * can_rescue, u8 * snd_limited) { sack_scoreboard_hole_t *hole = 0; @@ -741,11 +764,11 @@ scoreboard_next_rxt_hole (sack_scoreboard_t * sb, } else { - /* Rule (2): output takes care of transmitting new data */ - if (!have_sent_1_smss) + /* Rule (2): available unsent data */ + if (have_unsent) { - hole = 0; sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + return 0; } /* Rule (3): if hole not lost */ else if (seq_lt (hole->start, sb->high_sacked)) @@ -771,16 +794,17 @@ scoreboard_next_rxt_hole (sack_scoreboard_t * sb, } static void -scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 seq) +scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 snd_una) { sack_scoreboard_hole_t *hole; hole = scoreboard_first_hole (sb); if (hole) { - seq = seq_gt (seq, hole->start) ? seq : hole->start; + snd_una = seq_gt (snd_una, hole->start) ? snd_una : hole->start; sb->cur_rxt_hole = sb->head; } - sb->high_rxt = seq; + sb->high_rxt = snd_una; + sb->rescue_rxt = snd_una - 1; } void @@ -1022,7 +1046,7 @@ tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) tc->snd_wl2 = ack; TCP_EVT_DBG (TCP_EVT_SND_WND, tc); - if (tc->snd_wnd < tc->snd_mss) + if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss)) { /* Set persist timer if not set and we just got 0 wnd */ if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST) @@ -1032,7 +1056,7 @@ tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) else { tcp_persist_timer_reset (tc); - if (!tcp_in_recovery (tc) && tc->rto_boff > 0) + if (PREDICT_FALSE (!tcp_in_recovery (tc) && tc->rto_boff > 0)) { tc->rto_boff = 0; tcp_update_rto (tc); @@ -1084,10 +1108,6 @@ tcp_cc_fastrecovery_exit (tcp_connection_t * tc) tcp_fastrecovery_1_smss_off (tc); tcp_fastrecovery_first_off (tc); - /* Update pacer because our cwnd changed. Also makes sure - * that we recompute the max burst size */ - tcp_update_pacer (tc); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); } @@ -1185,9 +1205,8 @@ tcp_should_fastrecover (tcp_connection_t * tc) } void -tcp_program_fastretransmit (tcp_connection_t * tc) +tcp_program_fastretransmit (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) { - tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[tc->c_thread_index]; if (!(tc->flags & TCP_CONN_FRXT_PENDING)) { vec_add1 (wrk->pending_fast_rxt, tc->c_c_index); @@ -1196,11 +1215,10 @@ tcp_program_fastretransmit (tcp_connection_t * tc) } void -tcp_do_fastretransmits (u32 thread_index) +tcp_do_fastretransmits (tcp_worker_ctx_t * wrk) { - tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[thread_index]; + u32 *ongoing_fast_rxt, burst_bytes, sent_bytes, thread_index; u32 max_burst_size, burst_size, n_segs = 0, n_segs_now; - u32 *ongoing_fast_rxt, burst_bytes, sent_bytes; tcp_connection_t *tc; u64 last_cpu_time; int i; @@ -1209,6 +1227,7 @@ tcp_do_fastretransmits (u32 thread_index) && vec_len (wrk->postponed_fast_rxt) == 0) return; + thread_index = wrk->vm->thread_index; last_cpu_time = wrk->vm->clib_time.last_cpu_time; ongoing_fast_rxt = wrk->ongoing_fast_rxt; vec_append (ongoing_fast_rxt, wrk->postponed_fast_rxt); @@ -1217,7 +1236,7 @@ tcp_do_fastretransmits (u32 thread_index) _vec_len (wrk->postponed_fast_rxt) = 0; _vec_len (wrk->pending_fast_rxt) = 0; - max_burst_size = VLIB_FRAME_SIZE / vec_len (wrk->ongoing_fast_rxt); + max_burst_size = VLIB_FRAME_SIZE / vec_len (ongoing_fast_rxt); max_burst_size = clib_max (max_burst_size, 1); for (i = 0; i < vec_len (ongoing_fast_rxt); i++) @@ -1240,15 +1259,14 @@ tcp_do_fastretransmits (u32 thread_index) burst_size = clib_min (burst_size, burst_bytes / tc->snd_mss); if (!burst_size) { - tcp_program_fastretransmit (tc); + tcp_program_fastretransmit (wrk, tc); continue; } - n_segs_now = tcp_fast_retransmit (tc, burst_size); + n_segs_now = tcp_fast_retransmit (wrk, tc, burst_size); sent_bytes = clib_min (n_segs_now * tc->snd_mss, burst_bytes); transport_connection_tx_pacer_update_bytes (&tc->connection, sent_bytes); - n_segs += n_segs_now; } _vec_len (ongoing_fast_rxt) = 0; @@ -1267,7 +1285,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) { if (tc->bytes_acked) goto partial_ack; - tcp_program_fastretransmit (tc); + tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index), tc); return; } /* @@ -1291,7 +1309,8 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) } else if (tcp_should_fastrecover (tc)) { - u32 byte_rate; + u32 pacer_wnd; + ASSERT (!tcp_in_fastrecovery (tc)); /* Heuristic to catch potential late dupacks @@ -1310,7 +1329,6 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) { tc->cwnd = tc->ssthresh; scoreboard_init_high_rxt (&tc->sack_sb, tc->snd_una); - tc->sack_sb.rescue_rxt = tc->snd_una - 1; } else { @@ -1320,9 +1338,12 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) tc->cwnd = tc->ssthresh + 3 * tc->snd_mss; } - byte_rate = (0.3 * tc->cwnd) / ((f64) TCP_TICK * tc->srtt); - transport_connection_tx_pacer_init (&tc->connection, byte_rate, 0); - tcp_program_fastretransmit (tc); + /* Constrain rate until we get a partial ack */ + pacer_wnd = clib_max (0.1 * tc->cwnd, 2 * tc->snd_mss); + tcp_connection_tx_pacer_reset (tc, pacer_wnd, + 0 /* start bucket */ ); + tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index), + tc); return; } else if (!tc->bytes_acked @@ -1389,6 +1410,10 @@ partial_ack: * Legitimate ACK. 2) If PARTIAL ACK try to retransmit */ + /* Update the pacing rate. For the first partial ack we move from + * the artificially constrained rate to the one after congestion */ + tcp_connection_tx_pacer_update (tc); + /* XXX limit this only to first partial ack? */ tcp_retransmit_timer_force_update (tc); @@ -1424,10 +1449,14 @@ partial_ack: { /* Apparently all retransmitted holes have been acked */ tc->snd_rxt_bytes = 0; + tc->sack_sb.high_rxt = tc->snd_una; } } else { + tcp_fastrecovery_first_on (tc); + /* Reuse last bytes delivered to track total bytes acked */ + tc->sack_sb.last_bytes_delivered += tc->bytes_acked; if (tc->snd_rxt_bytes > tc->bytes_acked) tc->snd_rxt_bytes -= tc->bytes_acked; else @@ -1439,14 +1468,14 @@ partial_ack: /* * Since this was a partial ack, try to retransmit some more data */ - tcp_program_fastretransmit (tc); + tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index), tc); } /** * Process incoming ACK */ static int -tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, +tcp_rcv_ack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b, tcp_header_t * th, u32 * next, u32 * error) { u32 prev_snd_wnd, prev_snd_una; @@ -1475,7 +1504,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, { tcp_make_ack (tc, b); *next = tcp_next_output (tc->c_is_ip4); - *error = TCP_ERROR_ACK_INVALID; + *error = TCP_ERROR_ACK_FUTURE; TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 0, vnet_buffer (b)->tcp.ack_number); return -1; @@ -1485,7 +1514,6 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, vnet_buffer (b)->tcp.ack_number); tc->snd_nxt = vnet_buffer (b)->tcp.ack_number; - *error = TCP_ERROR_ACK_FUTURE; } /* If old ACK, probably it's an old dupack */ @@ -1517,7 +1545,10 @@ process_ack: tcp_validate_txf_size (tc, tc->bytes_acked); if (tc->bytes_acked) - tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number); + { + tcp_program_dequeue (wrk, tc); + tcp_update_rtt (tc, vnet_buffer (b)->tcp.ack_number); + } TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); @@ -1987,6 +2018,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, int is_ip4) { u32 thread_index = vm->thread_index, errors = 0; + tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index); u32 n_left_from, next_index, *from, *to_next; u16 err_counters[TCP_N_ERROR] = { 0 }; u8 is_fin = 0; @@ -2057,7 +2089,8 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } /* 5: check the ACK field */ - if (PREDICT_FALSE (tcp_rcv_ack (tc0, b0, th0, &next0, &error0))) + if (PREDICT_FALSE (tcp_rcv_ack (wrk, tc0, b0, th0, &next0, + &error0))) { tcp_maybe_inc_err_counter (err_counters, error0); goto done; @@ -2102,7 +2135,8 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, thread_index); err_counters[TCP_ERROR_EVENT_FIFO_FULL] = errors; tcp_store_err_counters (established, err_counters); - tcp_flush_frame_to_output (vm, thread_index, is_ip4); + tcp_handle_postponed_dequeues (wrk); + tcp_flush_frame_to_output (wrk, is_ip4); return frame->n_vectors; } @@ -2583,6 +2617,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 n_left_from, next_index, *from, *to_next, n_fins = 0; u32 my_thread_index = vm->thread_index, errors = 0; + tcp_worker_ctx_t *wrk = tcp_get_worker (my_thread_index); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -2700,7 +2735,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, case TCP_STATE_ESTABLISHED: /* We can get packets in established state here because they * were enqueued before state change */ - if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0)) { tcp_maybe_inc_counter (rcv_process, error0, 1); goto drop; @@ -2711,7 +2746,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* In addition to the processing for the ESTABLISHED state, if * our FIN is now acknowledged then enter FIN-WAIT-2 and * continue processing in that state. */ - if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0)) { tcp_maybe_inc_counter (rcv_process, error0, 1); goto drop; @@ -2741,7 +2776,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* In addition to the processing for the ESTABLISHED state, if * the retransmission queue is empty, the user's CLOSE can be * acknowledged ("ok") but do not delete the TCB. */ - if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0)) { tcp_maybe_inc_counter (rcv_process, error0, 1); goto drop; @@ -2749,7 +2784,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, break; case TCP_STATE_CLOSE_WAIT: /* Do the same processing as for the ESTABLISHED state. */ - if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0)) { tcp_maybe_inc_counter (rcv_process, error0, 1); goto drop; @@ -2771,7 +2806,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* In addition to the processing for the ESTABLISHED state, if * the ACK acknowledges our FIN then enter the TIME-WAIT state, * otherwise ignore the segment. */ - if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0)) { tcp_maybe_inc_counter (rcv_process, error0, 1); goto drop; @@ -2819,7 +2854,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * retransmission of the remote FIN. Acknowledge it, and restart * the 2 MSL timeout. */ - if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0)) { tcp_maybe_inc_counter (rcv_process, error0, 1); goto drop; @@ -2932,6 +2967,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, my_thread_index); tcp_inc_counter (rcv_process, TCP_ERROR_EVENT_FIFO_FULL, errors); tcp_inc_counter (rcv_process, TCP_ERROR_FIN_RCVD, n_fins); + tcp_handle_postponed_dequeues (wrk); return from_frame->n_vectors; } @@ -3393,7 +3429,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b; u16 nexts[VLIB_FRAME_SIZE], *next; - tcp_set_time_now (thread_index); + tcp_set_time_now (tcp_get_worker (thread_index)); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors;