tcp: fast retransmit pacing  (refs/changes/58/15658/4)
author     Florin Coras <fcoras@cisco.com>
           Thu, 1 Nov 2018 06:09:22 +0000 (23:09 -0700)
committer  Dave Barach <openvpp@barachs.net>
           Thu, 1 Nov 2018 20:43:29 +0000 (20:43 +0000)
Force pacing for fast retransmit to avoid bursts of retransmitted
packets.

Change-Id: I2ff42c328899b36322c4de557b1f7d853dba8fe2
Signed-off-by: Florin Coras <fcoras@cisco.com>
src/vnet/session/session_node.c
src/vnet/session/transport.c
src/vnet/session/transport_interface.h
src/vnet/tcp/tcp.c
src/vnet/tcp/tcp.h
src/vnet/tcp/tcp_debug.h
src/vnet/tcp/tcp_input.c
src/vnet/tcp/tcp_output.c
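
The pacer behind these changes is a token bucket: byte credit accrues once per
elapsed pacer period and a single burst is capped at max_burst_size. The hunk
in transport.c below only commits an update when at least one full period has
passed and the accrued credit is worth taking (more than 10 bytes), so
fractional progress is not lost to back-to-back calls. A minimal standalone
sketch of that logic, with names borrowed from the patch and types simplified
(an illustration, not the actual VPP spacer_t):

#include <stdint.h>

typedef struct
{
  uint64_t bucket;            /* accumulated byte credit */
  uint64_t tokens_per_period; /* bytes credited per pacer period */
  uint64_t last_update;       /* pacer time of the last credit */
  uint32_t max_burst_size;    /* cap on a single burst, in bytes */
} toy_pacer_t;

/* Credit the bucket for elapsed periods and return the burst allowed now;
 * mirrors what spacer_max_burst () does after this patch. */
static uint32_t
toy_pacer_burst (toy_pacer_t * p, uint64_t time_now)
{
  uint64_t n_periods = time_now - p->last_update;
  uint64_t inc;

  if (n_periods > 0 && (inc = n_periods * p->tokens_per_period) > 10)
    {
      p->last_update = time_now;
      p->bucket += inc;
    }
  return p->bucket < p->max_burst_size ? (uint32_t) p->bucket
                                       : p->max_burst_size;
}

/* Charge bytes actually sent against the bucket, roughly what
 * transport_connection_tx_pacer_update_bytes () ends up doing. */
static void
toy_pacer_charge (toy_pacer_t * p, uint32_t bytes)
{
  p->bucket = bytes > p->bucket ? 0 : p->bucket - bytes;
}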

diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c
index 6ef461a..3413172 100644
--- a/src/vnet/session/session_node.c
+++ b/src/vnet/session/session_node.c
@@ -570,7 +570,8 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
   ctx->tc = session_tx_get_transport (ctx, peek_data);
   ctx->snd_mss = ctx->transport_vft->send_mss (ctx->tc);
   ctx->snd_space =
-    transport_connection_max_tx_burst (ctx->tc, vm->clib_time.last_cpu_time);
+    transport_connection_snd_space (ctx->tc, vm->clib_time.last_cpu_time,
+                                   ctx->snd_mss);
   if (ctx->snd_space == 0 || ctx->snd_mss == 0)
     {
       vec_add1 (wrk->pending_event_vector, *e);
diff --git a/src/vnet/session/transport.c b/src/vnet/session/transport.c
index fefedca..0dd9ccd 100644
--- a/src/vnet/session/transport.c
+++ b/src/vnet/session/transport.c
@@ -497,9 +497,14 @@ static inline u32
 spacer_max_burst (spacer_t * pacer, u64 norm_time_now)
 {
   u64 n_periods = norm_time_now - pacer->last_update;
+  u64 inc;
+
+  if (n_periods > 0 && (inc = n_periods * pacer->tokens_per_period) > 10)
+    {
+      pacer->last_update = norm_time_now;
+      pacer->bucket += inc;
+    }
 
-  pacer->last_update = norm_time_now;
-  pacer->bucket += n_periods * pacer->tokens_per_period;
   return clib_min (pacer->bucket, pacer->max_burst_size);
 }
 
@@ -525,17 +530,22 @@ spacer_set_pace_rate (spacer_t * pacer, u64 rate_bytes_per_sec)
 
 void
 transport_connection_tx_pacer_init (transport_connection_t * tc,
-                                   u32 rate_bytes_per_sec, u32 burst_bytes)
+                                   u32 rate_bytes_per_sec,
+                                   u32 initial_bucket)
 {
   vlib_main_t *vm = vlib_get_main ();
   u64 time_now = vm->clib_time.last_cpu_time;
   spacer_t *pacer = &tc->pacer;
+  f64 dispatch_period;
+  u32 burst_size;
 
   tc->flags |= TRANSPORT_CONNECTION_F_IS_TX_PACED;
-  spacer_update_max_burst_size (&tc->pacer, burst_bytes);
+  dispatch_period = transport_dispatch_period (tc->thread_index);
+  burst_size = rate_bytes_per_sec * dispatch_period;
+  spacer_update_max_burst_size (&tc->pacer, burst_size);
   spacer_set_pace_rate (&tc->pacer, rate_bytes_per_sec);
   pacer->last_update = time_now >> SPACER_CPU_TICKS_PER_PERIOD_SHIFT;
-  pacer->bucket = burst_bytes;
+  pacer->bucket = initial_bucket;
 }
 
 void
@@ -550,17 +560,24 @@ transport_connection_tx_pacer_update (transport_connection_t * tc,
 }
 
 u32
-transport_connection_max_tx_burst (transport_connection_t * tc, u64 time_now)
+transport_connection_tx_pacer_burst (transport_connection_t * tc,
+                                    u64 time_now)
+{
+  time_now >>= SPACER_CPU_TICKS_PER_PERIOD_SHIFT;
+  return spacer_max_burst (&tc->pacer, time_now);
+}
+
+u32
+transport_connection_snd_space (transport_connection_t * tc, u64 time_now,
+                               u16 mss)
 {
   u32 snd_space, max_paced_burst;
-  u32 mss;
 
   snd_space = tp_vfts[tc->proto].send_space (tc);
   if (transport_connection_is_tx_paced (tc))
     {
       time_now >>= SPACER_CPU_TICKS_PER_PERIOD_SHIFT;
       max_paced_burst = spacer_max_burst (&tc->pacer, time_now);
-      mss = tp_vfts[tc->proto].send_mss (tc);
       max_paced_burst = (max_paced_burst < mss) ? 0 : max_paced_burst;
       snd_space = clib_min (snd_space, max_paced_burst);
       snd_space = snd_space - snd_space % mss;
@@ -576,6 +593,13 @@ transport_connection_update_tx_stats (transport_connection_t * tc, u32 bytes)
     spacer_update_bucket (&tc->pacer, bytes);
 }
 
+void
+transport_connection_tx_pacer_update_bytes (transport_connection_t * tc,
+                                           u32 bytes)
+{
+  spacer_update_bucket (&tc->pacer, bytes);
+}
+
 void
 transport_init_tx_pacers_period (void)
 {
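
Since the session layer already knows the connection's snd_mss, it now passes
it down and the paced send space reduces to: take the transport's advertised
space, clamp it to the pacer's current allowance (zero when that allowance is
below one mss), and round down to whole segments. A sketch of that clamping,
with hypothetical parameter names standing in for the values computed above:

#include <stdint.h>

/* Clamping as done in transport_connection_snd_space () for a paced
 * connection: snd_space comes from the transport's send_space () and
 * paced_burst from spacer_max_burst (). */
static uint32_t
paced_snd_space (uint32_t snd_space, uint32_t paced_burst, uint16_t mss)
{
  if (mss == 0)
    return 0;                         /* caller treats this as "try later" */
  if (paced_burst < mss)
    paced_burst = 0;                  /* never emit a sub-mss burst */
  if (snd_space > paced_burst)
    snd_space = paced_burst;          /* pacer is the tighter limit */
  return snd_space - snd_space % mss; /* whole segments only */
}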
diff --git a/src/vnet/session/transport_interface.h b/src/vnet/session/transport_interface.h
index ce3bb7f..b7aa4b7 100644
--- a/src/vnet/session/transport_interface.h
+++ b/src/vnet/session/transport_interface.h
@@ -111,7 +111,7 @@ void transport_enable_disable (vlib_main_t * vm, u8 is_en);
  */
 void transport_connection_tx_pacer_init (transport_connection_t * tc,
                                         u32 rate_bytes_per_sec,
-                                        u32 burst_bytes);
+                                        u32 initial_bucket);
 
 /**
  * Update tx pacer pacing rate
@@ -127,9 +127,13 @@ void transport_connection_tx_pacer_update (transport_connection_t * tc,
  *
  * @param tc           transport connection
  * @param time_now     current cpu time as returned by @ref clib_cpu_time_now
+ * @param mss          transport's mss
  */
-u32 transport_connection_max_tx_burst (transport_connection_t * tc,
-                                      u64 time_now);
+u32 transport_connection_snd_space (transport_connection_t * tc,
+                                   u64 time_now, u16 mss);
+
+u32 transport_connection_tx_pacer_burst (transport_connection_t * tc,
+                                        u64 time_now);
 
 /**
  * Initialize period for tx pacers
@@ -163,6 +167,10 @@ u8 *format_transport_pacer (u8 * s, va_list * args);
 void transport_connection_update_tx_stats (transport_connection_t * tc,
                                           u32 bytes);
 
+void
+transport_connection_tx_pacer_update_bytes (transport_connection_t * tc,
+                                           u32 bytes);
+
 #endif /* SRC_VNET_SESSION_TRANSPORT_INTERFACE_H_ */
 
 /*
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index 8c0c5da..bdf9c7a 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -558,10 +558,11 @@ tcp_init_snd_vars (tcp_connection_t * tc)
 void
 tcp_enable_pacing (tcp_connection_t * tc)
 {
-  u32 max_burst, byte_rate;
-  max_burst = 16 * tc->snd_mss;
+  u32 initial_bucket, byte_rate;
+  initial_bucket = 16 * tc->snd_mss;
   byte_rate = 2 << 16;
-  transport_connection_tx_pacer_init (&tc->connection, byte_rate, max_burst);
+  transport_connection_tx_pacer_init (&tc->connection, byte_rate,
+                                     initial_bucket);
   tc->mrtt_us = (u32) ~ 0;
 }
 
@@ -1318,6 +1319,7 @@ tcp_main_enable (vlib_main_t * vm)
 
   num_threads = 1 /* main thread */  + vtm->n_threads;
   vec_validate (tm->connections, num_threads - 1);
+  vec_validate (tm->wrk_ctx, num_threads - 1);
 
   /*
    * Preallocate connections. Assume that thread 0 won't
@@ -1339,6 +1341,13 @@ tcp_main_enable (vlib_main_t * vm)
       if (preallocated_connections_per_thread)
        pool_init_fixed (tm->connections[thread],
                         preallocated_connections_per_thread);
+      vec_validate (tm->wrk_ctx[thread].pending_fast_rxt, 0);
+      vec_validate (tm->wrk_ctx[thread].ongoing_fast_rxt, 0);
+      vec_validate (tm->wrk_ctx[thread].postponed_fast_rxt, 0);
+      vec_reset_length (tm->wrk_ctx[thread].pending_fast_rxt);
+      vec_reset_length (tm->wrk_ctx[thread].ongoing_fast_rxt);
+      vec_reset_length (tm->wrk_ctx[thread].postponed_fast_rxt);
+      tm->wrk_ctx[thread].vm = vlib_mains[thread];
     }
 
   /*
@@ -1358,7 +1367,6 @@ tcp_main_enable (vlib_main_t * vm)
       clib_spinlock_init (&tm->half_open_lock);
     }
 
-  vec_validate (tm->wrk_ctx, num_threads - 1);
   tcp_initialize_timer_wheels (tm);
 
   tm->bytes_per_buffer = vlib_buffer_free_list_buffer_size
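
For scale, the bootstrap values chosen by tcp_enable_pacing () above work out
as follows; the 1460-byte mss is an illustrative assumption, not part of the
patch:

#include <stdint.h>

/* Worked example of the tcp_enable_pacing () bootstrap, assuming mss = 1460. */
static void
toy_pacing_bootstrap (void)
{
  uint32_t snd_mss = 1460;                /* illustrative mss */
  uint32_t initial_bucket = 16 * snd_mss; /* 23360 B: ~16 segments may leave at once */
  uint32_t byte_rate = 2 << 16;           /* 131072 B/s until cwnd/srtt are measured */
  (void) initial_bucket;
  (void) byte_rate;
  /* transport_connection_tx_pacer_init () additionally caps a single burst at
   * byte_rate * dispatch_period, so the low bootstrap rate keeps steady-state
   * bursts small until tcp_update_pacer () adjusts the rate from the measured
   * cwnd and rtt. */
}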
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 4ba3d5e..f7424c3 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -160,7 +160,7 @@ enum
 };
 
 #define TCP_SCOREBOARD_TRACE (0)
-#define TCP_MAX_SACK_BLOCKS 32 /**< Max number of SACK blocks stored */
+#define TCP_MAX_SACK_BLOCKS 256        /**< Max number of SACK blocks stored */
 #define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0)
 
 typedef struct _scoreboard_trace_elt
@@ -390,6 +390,9 @@ typedef struct tcp_worker_ctx_
                                                     needing fast rxt */
   u32 *ongoing_fast_rxt;                       /**< vector of connections
                                                     now doing fast rxt */
+  u32 *postponed_fast_rxt;                     /**< vector of connections
+                                                    that will do fast rxt */
+  vlib_main_t *vm;                             /**< pointer to vm */
 
     CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
   u8 cached_opts[40];                          /**< cached 'on the wire'
@@ -722,8 +725,8 @@ always_inline void
 tcp_cc_rcv_ack (tcp_connection_t * tc)
 {
   tc->cc_algo->rcv_ack (tc);
-  tcp_update_pacer (tc);
   tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+  tcp_update_pacer (tc);
 }
 
 always_inline void
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index 8f626b1..cd4a6f0 100755
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h
@@ -627,10 +627,8 @@ if (_av > 0)                                                               \
 
 #if TCP_DEBUG_CC
 
-#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...)                     \
+#define TCP_EVT_CC_EVT_PRINT(_tc, _sub_evt)                            \
 {                                                                      \
-  if (_tc->snd_una != _tc->iss)                                                \
-    TCP_EVT_CC_STAT_PRINT (_tc);                                       \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
     .format = "cc: %s snd_space %u snd_una %u out %u flight %u",       \
@@ -638,8 +636,8 @@ if (_av > 0)                                                                \
     .n_enum_strings = 7,                                               \
     .enum_strings = {                                                  \
       "fast-rxt",                                                      \
-      "rxt-timeout",                                                   \
       "first-rxt",                                                     \
+      "rxt-timeout",                                                   \
       "recovered",                                                     \
       "congestion",                                                    \
       "undo",                                                          \
@@ -653,8 +651,18 @@ if (_av > 0)                                                               \
   ed->data[3] = tcp_bytes_out(_tc);                                    \
   ed->data[4] = tcp_flight_size (_tc);                                 \
 }
+
+#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...)                     \
+{                                                                      \
+  if (_tc->snd_una != _tc->iss)                                                \
+    TCP_EVT_CC_STAT_PRINT (_tc);                                       \
+  if ((_sub_evt <= 1 && TCP_DEBUG_CC > 1)                              \
+      || (_sub_evt > 1 && TCP_DEBUG_CC > 0))                           \
+      TCP_EVT_CC_EVT_PRINT (_tc, _sub_evt);                            \
+}
 #else
-#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...)
+#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...)                     \
+
 #endif
 
 #if TCP_DEBUG_CC > 1
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index ac0e996..154b9ac 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -1199,67 +1199,60 @@ void
 tcp_do_fastretransmits (u32 thread_index)
 {
   tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[thread_index];
-  u32 max_burst_size, burst_size, n_segs = 0;
+  u32 max_burst_size, burst_size, n_segs = 0, n_segs_now;
+  u32 *ongoing_fast_rxt, burst_bytes, sent_bytes;
   tcp_connection_t *tc;
+  u64 last_cpu_time;
   int i;
 
-  if (vec_len (wrk->pending_fast_rxt) == 0)
+  if (vec_len (wrk->pending_fast_rxt) == 0
+      && vec_len (wrk->postponed_fast_rxt) == 0)
     return;
 
-  vec_append (wrk->ongoing_fast_rxt, wrk->pending_fast_rxt);
-  vec_reset_length (wrk->pending_fast_rxt);
+  last_cpu_time = wrk->vm->clib_time.last_cpu_time;
+  ongoing_fast_rxt = wrk->ongoing_fast_rxt;
+  vec_append (ongoing_fast_rxt, wrk->postponed_fast_rxt);
+  vec_append (ongoing_fast_rxt, wrk->pending_fast_rxt);
+
+  _vec_len (wrk->postponed_fast_rxt) = 0;
+  _vec_len (wrk->pending_fast_rxt) = 0;
 
   max_burst_size = VLIB_FRAME_SIZE / vec_len (wrk->ongoing_fast_rxt);
   max_burst_size = clib_max (max_burst_size, 1);
 
-  for (i = 0; i < vec_len (wrk->ongoing_fast_rxt); i++)
+  for (i = 0; i < vec_len (ongoing_fast_rxt); i++)
     {
-      tc = tcp_connection_get (wrk->ongoing_fast_rxt[i], thread_index);
+      if (n_segs >= VLIB_FRAME_SIZE)
+       {
+         vec_add1 (wrk->postponed_fast_rxt, ongoing_fast_rxt[i]);
+         continue;
+       }
+
+      tc = tcp_connection_get (ongoing_fast_rxt[i], thread_index);
       tc->flags &= ~TCP_CONN_FRXT_PENDING;
 
       if (!tcp_in_fastrecovery (tc))
        continue;
 
-      /* TODO tx pacer instead of this */
-      if (n_segs >= VLIB_FRAME_SIZE)
+      burst_size = clib_min (max_burst_size, VLIB_FRAME_SIZE - n_segs);
+      burst_bytes = transport_connection_tx_pacer_burst (&tc->connection,
+                                                        last_cpu_time);
+      burst_size = clib_min (burst_size, burst_bytes / tc->snd_mss);
+      if (!burst_size)
        {
          tcp_program_fastretransmit (tc);
          continue;
        }
 
-      burst_size = clib_min (max_burst_size, VLIB_FRAME_SIZE - n_segs);
+      n_segs_now = tcp_fast_retransmit (tc, burst_size);
+      sent_bytes = clib_min (n_segs_now * tc->snd_mss, burst_bytes);
+      transport_connection_tx_pacer_update_bytes (&tc->connection,
+                                                 sent_bytes);
 
-      if (tc->cwnd > tc->ssthresh + 3 * tc->snd_mss)
-       {
-         /* The first segment MUST be retransmitted */
-         if (tcp_retransmit_first_unacked (tc))
-           {
-             tcp_program_fastretransmit (tc);
-             continue;
-           }
-
-         /* Post retransmit update cwnd to ssthresh and account for the
-          * three segments that have left the network and should've been
-          * buffered at the receiver XXX */
-         tc->cwnd = tc->ssthresh + 3 * tc->snd_mss;
-
-         /* If cwnd allows, send more data */
-         if (tcp_opts_sack_permitted (&tc->rcv_opts))
-           {
-             scoreboard_init_high_rxt (&tc->sack_sb,
-                                       tc->snd_una + tc->snd_mss);
-             tc->sack_sb.rescue_rxt = tc->snd_una - 1;
-             n_segs += tcp_fast_retransmit_sack (tc, burst_size);
-           }
-         else
-           {
-             n_segs += tcp_fast_retransmit_no_sack (tc, burst_size);
-           }
-       }
-      else
-       n_segs += tcp_fast_retransmit (tc, burst_size);
+      n_segs += n_segs_now;
     }
-  vec_reset_length (wrk->ongoing_fast_rxt);
+  _vec_len (ongoing_fast_rxt) = 0;
+  wrk->ongoing_fast_rxt = ongoing_fast_rxt;
 }
 
 /**
@@ -1298,6 +1291,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
        }
       else if (tcp_should_fastrecover (tc))
        {
+         u32 byte_rate;
          ASSERT (!tcp_in_fastrecovery (tc));
 
          /* Heuristic to catch potential late dupacks
@@ -1313,8 +1307,21 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
          tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
 
          if (tcp_opts_sack_permitted (&tc->rcv_opts))
-           tc->sack_sb.high_rxt = tc->snd_una;
+           {
+             tc->cwnd = tc->ssthresh;
+             scoreboard_init_high_rxt (&tc->sack_sb, tc->snd_una);
+             tc->sack_sb.rescue_rxt = tc->snd_una - 1;
+           }
+         else
+           {
+             /* Post retransmit update cwnd to ssthresh and account for the
+              * three segments that have left the network and should've been
+              * buffered at the receiver XXX */
+             tc->cwnd = tc->ssthresh + 3 * tc->snd_mss;
+           }
 
+         byte_rate = (0.3 * tc->cwnd) / ((f64) TCP_TICK * tc->srtt);
+         transport_connection_tx_pacer_init (&tc->connection, byte_rate, 0);
          tcp_program_fastretransmit (tc);
          return;
        }
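
Two pieces drive the pacing of fast retransmits above. First, on entering fast
recovery the pacer is re-armed with an empty bucket and a rate of
0.3 * cwnd / (TCP_TICK * srtt), i.e. roughly 30% of the recovery cwnd per
smoothed rtt, so a window's worth of retransmissions is spread over a few rtts
instead of leaving back to back. With, say, cwnd = ssthresh = 32768 bytes and
srtt = 100 ms (illustrative numbers), that is 0.3 * 32768 / 0.1 ≈ 98 kB/s, or
about 67 full-size segments per second at a 1460-byte mss. Second,
tcp_do_fastretransmits () caps each connection's burst by its fair share of
the frame, the room left in the frame, and the pacer's current credit, then
charges the pacer for what was actually sent. A sketch of that per-connection
budget (names mirror the patch; this is not the full loop):

#include <stdint.h>

/* Per-connection segment budget as computed in tcp_do_fastretransmits ().
 * A result of 0 means "don't burst now": the connection is reprogrammed via
 * tcp_program_fastretransmit () and tried again on a later dispatch. */
static uint32_t
frxt_burst_segs (uint32_t max_burst_size, uint32_t frame_segs_left,
                 uint32_t pacer_bytes, uint16_t snd_mss)
{
  uint32_t burst, paced_segs;

  if (snd_mss == 0)
    return 0;
  burst = max_burst_size < frame_segs_left ? max_burst_size : frame_segs_left;
  paced_segs = pacer_bytes / snd_mss;
  return burst < paced_segs ? burst : paced_segs;
}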
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index f14a612..81579ef 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -1452,10 +1452,10 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
       tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID;
     }
 
-  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
-
   if (tc->state >= TCP_STATE_ESTABLISHED)
     {
+      TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
+
       /* Lost FIN, retransmit and return */
       if (tcp_is_lost_fin (tc))
        {
@@ -1536,6 +1536,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
          return;
        }
 
+      TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
+
       /* Try without increasing RTO a number of times. If this fails,
        * start growing RTO exponentially */
       tc->rto_boff += 1;
@@ -1562,6 +1564,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
   /* Retransmit SYN-ACK */
   else if (tc->state == TCP_STATE_SYN_RCVD)
     {
+      TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
+
       tc->rto_boff += 1;
       if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
        tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
@@ -1693,7 +1697,7 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc)
   old_snd_nxt = tc->snd_nxt;
   tc->snd_nxt = tc->snd_una;
 
-  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
+  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
 
   n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
   if (!n_bytes)