{
ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval));
tc->tsval_recent = tc->rcv_opts.tsval;
- tc->tsval_recent_age = tcp_time_now ();
+ tc->tsval_recent_age = tcp_time_now_w_thread (tc->c_thread_index);
}
}
/* If tsval_recent was last updated more than 24 days ago
* (TCP_PAWS_IDLE), invalidate it. */
if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE,
- tcp_time_now ()))
+ tcp_time_now_w_thread (tc0->c_thread_index)))
{
/* Age isn't reset until we get a valid tsval (BSD inspired) */
tc0->tsval_recent = 0;
* seq_lt (tc->snd_una, ack). This is a condition for calling update_rtt */
else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr)
{
- mrtt = clib_max (tcp_time_now () - tc->rcv_opts.tsecr, 1);
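+ /* RTT sample is the time elapsed since the echoed timestamp was
+ * sent; floor at 1 tick to avoid a zero measurement */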
+ u32 now = tcp_time_now_w_thread (tc->c_thread_index);
+ mrtt = clib_max (now - tc->rcv_opts.tsecr, 1);
}
/* Ignore dubious measurements */
}
/**
- * Dequeue bytes that have been acked and while at it update RTT estimates.
+ * Dequeue bytes for connections that have received acks in the last burst
*/
static void
-tcp_dequeue_acked (tcp_connection_t * tc, u32 ack)
+tcp_handle_postponed_dequeues (tcp_worker_ctx_t * wrk)
{
- /* Dequeue the newly ACKed add SACKed bytes */
- stream_session_dequeue_drop (&tc->connection,
- tc->bytes_acked + tc->sack_sb.snd_una_adv);
+ u32 thread_index = wrk->vm->thread_index;
+ u32 *pending_deq_acked;
+ tcp_connection_t *tc;
+ int i;
+
+ if (!vec_len (wrk->pending_deq_acked))
+ return;
+
+ pending_deq_acked = wrk->pending_deq_acked;
+ for (i = 0; i < vec_len (pending_deq_acked); i++)
+ {
+ tc = tcp_connection_get (pending_deq_acked[i], thread_index);
+ tc->flags &= ~TCP_CONN_DEQ_PENDING;
- tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
+ /* Dequeue the newly ACKed bytes */
+ stream_session_dequeue_drop (&tc->connection, tc->burst_acked);
+ tc->burst_acked = 0;
+ tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
- /* Update rtt and rto */
- tcp_update_rtt (tc, ack);
+ /* If everything has been acked, stop the retransmit timer,
+ * otherwise update it. */
+ tcp_retransmit_timer_update (tc);
+ }
+ _vec_len (wrk->pending_deq_acked) = 0;
+}
- /* If everything has been acked, stop retransmit timer
- * otherwise update. */
- tcp_retransmit_timer_update (tc);
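+/**
+ * Flag connection for dequeue at the end of the burst and accumulate the
+ * newly acked and sacked bytes
+ */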
+static void
+tcp_program_dequeue (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
+{
+ if (!(tc->flags & TCP_CONN_DEQ_PENDING))
+ {
+ vec_add1 (wrk->pending_deq_acked, tc->c_c_index);
+ tc->flags |= TCP_CONN_DEQ_PENDING;
+ }
+ tc->burst_acked += tc->bytes_acked + tc->sack_sb.snd_una_adv;
}
/**
sack_scoreboard_hole_t *
scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
sack_scoreboard_hole_t * start,
- u8 have_sent_1_smss,
- u8 * can_rescue, u8 * snd_limited)
+ u8 have_unsent, u8 * can_rescue, u8 * snd_limited)
{
sack_scoreboard_hole_t *hole = 0;
}
else
{
- /* Rule (2): output takes care of transmitting new data */
- if (!have_sent_1_smss)
+ /* Rule (2): available unsent data; the output path transmits it */
+ if (have_unsent)
{
- hole = 0;
sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+ return 0;
}
/* Rule (3): if hole not lost */
else if (seq_lt (hole->start, sb->high_sacked))
}
static void
-scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 seq)
+scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 snd_una)
{
sack_scoreboard_hole_t *hole;
hole = scoreboard_first_hole (sb);
if (hole)
{
- seq = seq_gt (seq, hole->start) ? seq : hole->start;
+ snd_una = seq_gt (snd_una, hole->start) ? snd_una : hole->start;
sb->cur_rxt_hole = sb->head;
}
- sb->high_rxt = seq;
+ sb->high_rxt = snd_una;
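+ /* Keep rescue_rxt below snd_una so a rescue retransmit is
+ * allowed for this window (RFC 6675) */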
+ sb->rescue_rxt = snd_una - 1;
}
void
tc->snd_wl2 = ack;
TCP_EVT_DBG (TCP_EVT_SND_WND, tc);
- if (tc->snd_wnd < tc->snd_mss)
+ if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss))
{
/* Set persist timer if not set and we just got 0 wnd */
if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST)
else
{
tcp_persist_timer_reset (tc);
- if (!tcp_in_recovery (tc) && tc->rto_boff > 0)
+ if (PREDICT_FALSE (!tcp_in_recovery (tc) && tc->rto_boff > 0))
{
tc->rto_boff = 0;
tcp_update_rto (tc);
tcp_fastrecovery_1_smss_off (tc);
tcp_fastrecovery_first_off (tc);
- /* Update pacer because our cwnd changed. Also makes sure
- * that we recompute the max burst size */
- tcp_update_pacer (tc);
-
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
}
}
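+/**
+ * Queue connection for fast retransmit on its worker, at most once
+ */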
void
-tcp_program_fastretransmit (tcp_connection_t * tc)
+tcp_program_fastretransmit (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
{
- tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[tc->c_thread_index];
if (!(tc->flags & TCP_CONN_FRXT_PENDING))
{
vec_add1 (wrk->pending_fast_rxt, tc->c_c_index);
}
void
-tcp_do_fastretransmits (u32 thread_index)
+tcp_do_fastretransmits (tcp_worker_ctx_t * wrk)
{
- tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[thread_index];
+ u32 *ongoing_fast_rxt, burst_bytes, sent_bytes, thread_index;
u32 max_burst_size, burst_size, n_segs = 0, n_segs_now;
- u32 *ongoing_fast_rxt, burst_bytes, sent_bytes;
tcp_connection_t *tc;
u64 last_cpu_time;
int i;
&& vec_len (wrk->postponed_fast_rxt) == 0)
return;
+ thread_index = wrk->vm->thread_index;
last_cpu_time = wrk->vm->clib_time.last_cpu_time;
ongoing_fast_rxt = wrk->ongoing_fast_rxt;
vec_append (ongoing_fast_rxt, wrk->postponed_fast_rxt);
_vec_len (wrk->postponed_fast_rxt) = 0;
_vec_len (wrk->pending_fast_rxt) = 0;
- max_burst_size = VLIB_FRAME_SIZE / vec_len (wrk->ongoing_fast_rxt);
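+ /* Spread one frame's worth of segments across all pending connections */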
+ max_burst_size = VLIB_FRAME_SIZE / vec_len (ongoing_fast_rxt);
max_burst_size = clib_max (max_burst_size, 1);
for (i = 0; i < vec_len (ongoing_fast_rxt); i++)
burst_size = clib_min (burst_size, burst_bytes / tc->snd_mss);
if (!burst_size)
{
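+ /* Pacer allows no segments right now; retry on the next dispatch */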
- tcp_program_fastretransmit (tc);
+ tcp_program_fastretransmit (wrk, tc);
continue;
}
- n_segs_now = tcp_fast_retransmit (tc, burst_size);
+ n_segs_now = tcp_fast_retransmit (wrk, tc, burst_size);
sent_bytes = clib_min (n_segs_now * tc->snd_mss, burst_bytes);
transport_connection_tx_pacer_update_bytes (&tc->connection,
sent_bytes);
-
n_segs += n_segs_now;
}
_vec_len (ongoing_fast_rxt) = 0;
{
if (tc->bytes_acked)
goto partial_ack;
- tcp_program_fastretransmit (tc);
+ tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index), tc);
return;
}
/*
}
else if (tcp_should_fastrecover (tc))
{
- u32 byte_rate;
+ u32 pacer_wnd;
+
ASSERT (!tcp_in_fastrecovery (tc));
/* Heuristic to catch potential late dupacks
{
tc->cwnd = tc->ssthresh;
scoreboard_init_high_rxt (&tc->sack_sb, tc->snd_una);
- tc->sack_sb.rescue_rxt = tc->snd_una - 1;
}
else
{
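+ /* Inflate cwnd for the 3 dupacks that triggered fast recovery
+ * (RFC 5681) */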
tc->cwnd = tc->ssthresh + 3 * tc->snd_mss;
}
- byte_rate = (0.3 * tc->cwnd) / ((f64) TCP_TICK * tc->srtt);
- transport_connection_tx_pacer_init (&tc->connection, byte_rate, 0);
- tcp_program_fastretransmit (tc);
+ /* Constrain the pacing rate until we get a partial ack:
+ * 10% of cwnd, but no less than 2 segments */
+ pacer_wnd = clib_max (0.1 * tc->cwnd, 2 * tc->snd_mss);
+ tcp_connection_tx_pacer_reset (tc, pacer_wnd,
+ 0 /* start bucket */ );
+ tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index),
+ tc);
return;
}
else if (!tc->bytes_acked
* Legitimate ACK. 2) If PARTIAL ACK try to retransmit
*/
+ /* Update the pacing rate. On the first partial ack we move from the
+ * artificially constrained rate to one based on the updated cwnd */
+ tcp_connection_tx_pacer_update (tc);
+
/* XXX limit this only to first partial ack? */
tcp_retransmit_timer_force_update (tc);
{
/* Apparently all retransmitted holes have been acked */
tc->snd_rxt_bytes = 0;
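+ /* Reset high_rxt so retransmissions can restart from snd_una */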
+ tc->sack_sb.high_rxt = tc->snd_una;
}
}
else
{
+ tcp_fastrecovery_first_on (tc);
+ /* Reuse last_bytes_delivered to track the total bytes acked */
+ tc->sack_sb.last_bytes_delivered += tc->bytes_acked;
if (tc->snd_rxt_bytes > tc->bytes_acked)
tc->snd_rxt_bytes -= tc->bytes_acked;
else
/*
* Since this was a partial ack, try to retransmit some more data
*/
- tcp_program_fastretransmit (tc);
+ tcp_program_fastretransmit (tcp_get_worker (tc->c_thread_index), tc);
}
/**
* Process incoming ACK
*/
static int
-tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
+tcp_rcv_ack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b,
tcp_header_t * th, u32 * next, u32 * error)
{
u32 prev_snd_wnd, prev_snd_una;
{
tcp_make_ack (tc, b);
*next = tcp_next_output (tc->c_is_ip4);
- *error = TCP_ERROR_ACK_INVALID;
+ *error = TCP_ERROR_ACK_FUTURE;
TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 0,
vnet_buffer (b)->tcp.ack_number);
return -1;
vnet_buffer (b)->tcp.ack_number);
tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
- *error = TCP_ERROR_ACK_FUTURE;
}
/* If old ACK, probably it's an old dupack */
tcp_validate_txf_size (tc, tc->bytes_acked);
if (tc->bytes_acked)
- tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number);
+ {
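+ /* Postpone the fifo dequeue until the end of the burst */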
+ tcp_program_dequeue (wrk, tc);
+ tcp_update_rtt (tc, vnet_buffer (b)->tcp.ack_number);
+ }
TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
vlib_frame_t * frame, int is_ip4)
{
u32 thread_index = vm->thread_index, errors = 0;
+ tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
u32 n_left_from, next_index, *from, *to_next;
u16 err_counters[TCP_N_ERROR] = { 0 };
u8 is_fin = 0;
}
/* 5: check the ACK field */
- if (PREDICT_FALSE (tcp_rcv_ack (tc0, b0, th0, &next0, &error0)))
+ if (PREDICT_FALSE (tcp_rcv_ack (wrk, tc0, b0, th0, &next0,
+ &error0)))
{
tcp_maybe_inc_err_counter (err_counters, error0);
goto done;
thread_index);
err_counters[TCP_ERROR_EVENT_FIFO_FULL] = errors;
tcp_store_err_counters (established, err_counters);
- tcp_flush_frame_to_output (vm, thread_index, is_ip4);
+ tcp_handle_postponed_dequeues (wrk);
+ tcp_flush_frame_to_output (wrk, is_ip4);
return frame->n_vectors;
}
{
u32 n_left_from, next_index, *from, *to_next, n_fins = 0;
u32 my_thread_index = vm->thread_index, errors = 0;
+ tcp_worker_ctx_t *wrk = tcp_get_worker (my_thread_index);
from = vlib_frame_vector_args (from_frame);
n_left_from = from_frame->n_vectors;
case TCP_STATE_ESTABLISHED:
/* We can get packets in established state here because they
* were enqueued before state change */
- if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
{
tcp_maybe_inc_counter (rcv_process, error0, 1);
goto drop;
/* In addition to the processing for the ESTABLISHED state, if
* our FIN is now acknowledged then enter FIN-WAIT-2 and
* continue processing in that state. */
- if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
{
tcp_maybe_inc_counter (rcv_process, error0, 1);
goto drop;
/* In addition to the processing for the ESTABLISHED state, if
* the retransmission queue is empty, the user's CLOSE can be
* acknowledged ("ok") but do not delete the TCB. */
- if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
{
tcp_maybe_inc_counter (rcv_process, error0, 1);
goto drop;
break;
case TCP_STATE_CLOSE_WAIT:
/* Do the same processing as for the ESTABLISHED state. */
- if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
{
tcp_maybe_inc_counter (rcv_process, error0, 1);
goto drop;
/* In addition to the processing for the ESTABLISHED state, if
* the ACK acknowledges our FIN then enter the TIME-WAIT state,
* otherwise ignore the segment. */
- if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
{
tcp_maybe_inc_counter (rcv_process, error0, 1);
goto drop;
* retransmission of the remote FIN. Acknowledge it, and restart
* the 2 MSL timeout. */
- if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &next0, &error0))
{
tcp_maybe_inc_counter (rcv_process, error0, 1);
goto drop;
my_thread_index);
tcp_inc_counter (rcv_process, TCP_ERROR_EVENT_FIFO_FULL, errors);
tcp_inc_counter (rcv_process, TCP_ERROR_FIN_RCVD, n_fins);
+ tcp_handle_postponed_dequeues (wrk);
return from_frame->n_vectors;
}
vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
u16 nexts[VLIB_FRAME_SIZE], *next;
- tcp_set_time_now (thread_index);
+ tcp_set_time_now (tcp_get_worker (thread_index));
from = vlib_frame_vector_args (frame);
n_left_from = frame->n_vectors;