tcp: fast retransmit improvements 26/15426/18
authorFlorin Coras <fcoras@cisco.com>
Fri, 19 Oct 2018 23:26:24 +0000 (16:26 -0700)
committerDamjan Marion <dmarion@me.com>
Tue, 23 Oct 2018 19:35:51 +0000 (19:35 +0000)
Patch is too large to be ported to 18.10 just days before release.

- handle fast retransmits outside of established node and limit the
retransmit burst size to avoid tx losses and worsening congestion.
- in the absence of a tx pacer, use slow start after fast retransmit
exits
- add fast retransmit heuristic that re-retries sending the first
segment if everything else fails
- fine tuning

Change-Id: I84a2ab8fbba8b97f1d2b26584dc11a1e2c33c8d2
Signed-off-by: Florin Coras <fcoras@cisco.com>
src/vnet/tcp/tcp.c
src/vnet/tcp/tcp.h
src/vnet/tcp/tcp_debug.h
src/vnet/tcp/tcp_input.c
src/vnet/tcp/tcp_output.c

index e32b5c4..cb05b8c 100644 (file)
@@ -950,7 +950,8 @@ format_tcp_scoreboard (u8 * s, va_list * args)
 
   hole = scoreboard_first_hole (sb);
   if (hole)
-    s = format (s, "\n head %u tail %u holes:\n", sb->head, sb->tail);
+    s = format (s, "\n head %u tail %u %u holes:\n", sb->head, sb->tail,
+               pool_elts (sb->holes));
 
   while (hole)
     {
@@ -1027,7 +1028,7 @@ tcp_snd_space_inline (tcp_connection_t * tc)
 {
   int snd_space, snt_limited;
 
-  if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0))
+  if (PREDICT_TRUE (!tcp_in_fastrecovery (tc)))
     {
       snd_space = tcp_available_output_snd_space (tc);
 
@@ -1047,16 +1048,6 @@ tcp_snd_space_inline (tcp_connection_t * tc)
       return tcp_round_snd_space (tc, snd_space);
     }
 
-  if (tcp_in_recovery (tc))
-    {
-      tc->snd_nxt = tc->snd_una_max;
-      snd_space = tcp_available_snd_wnd (tc) - tc->snd_rxt_bytes
-       - (tc->snd_una_max - tc->snd_congestion);
-      if (snd_space <= 0 || (tc->snd_una_max - tc->snd_una) >= tc->snd_wnd)
-       return 0;
-      return tcp_round_snd_space (tc, snd_space);
-    }
-
   /* RFC 5681: When previously unsent data is available and the new value of
    * cwnd and the receiver's advertised window allow, a TCP SHOULD send 1*SMSS
    * bytes of previously unsent data. */
@@ -1103,6 +1094,7 @@ tcp_update_time (f64 now, u8 thread_index)
   tw_timer_expire_timers_16t_2w_512sl (&tcp_main.
                                       wrk_ctx[thread_index].timer_wheel,
                                       now);
+  tcp_do_fastretransmits (thread_index);
   tcp_flush_frames_to_output (thread_index);
 }
 
index 165659b..a036072 100644 (file)
@@ -120,6 +120,8 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler;
   _(FR_1_SMSS, "Sent 1 SMSS")                  \
   _(HALF_OPEN_DONE, "Half-open completed")     \
   _(FINPNDG, "FIN pending")                    \
+  _(FRXT_PENDING, "Fast-retransmit pending")   \
+  _(FRXT_FIRST, "Fast-retransmit first again") \
 
 typedef enum _tcp_connection_flag_bits
 {
@@ -345,6 +347,9 @@ struct _tcp_cc_algorithm
 #define tcp_fastrecovery_sent_1_smss(tc) ((tc)->flags & TCP_CONN_FR_1_SMSS)
 #define tcp_fastrecovery_1_smss_on(tc) ((tc)->flags |= TCP_CONN_FR_1_SMSS)
 #define tcp_fastrecovery_1_smss_off(tc) ((tc)->flags &= ~TCP_CONN_FR_1_SMSS)
+#define tcp_fastrecovery_first(tc) ((tc)->flags & TCP_CONN_FRXT_FIRST)
+#define tcp_fastrecovery_first_on(tc) ((tc)->flags |= TCP_CONN_FRXT_FIRST)
+#define tcp_fastrecovery_first_off(tc) ((tc)->flags &= ~TCP_CONN_FRXT_FIRST)
 
 #define tcp_in_cong_recovery(tc) ((tc)->flags &                \
          (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY))
@@ -354,6 +359,7 @@ tcp_cong_recovery_off (tcp_connection_t * tc)
 {
   tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY);
   tcp_fastrecovery_1_smss_off (tc);
+  tcp_fastrecovery_first_off (tc);
 }
 
 typedef enum _tcp_error
@@ -379,9 +385,15 @@ typedef struct tcp_worker_ctx_
                                                     output nodes */
   vlib_frame_t *ip_lookup_tx_frames[2];                /**< tx frames for ip 4/6
                                                     lookup nodes */
+  u32 *pending_fast_rxt;                       /**< vector of connections
+                                                    needing fast rxt */
+  u32 *ongoing_fast_rxt;                       /**< vector of connections
+                                                    now doing fast rxt */
+
     CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
   u8 cached_opts[40];                          /**< cached 'on the wire'
                                                     options for bursts */
+
 } tcp_worker_ctx_t;
 
 typedef struct _tcp_main
@@ -542,6 +554,8 @@ void tcp_update_burst_snd_vars (tcp_connection_t * tc);
 void tcp_update_rto (tcp_connection_t * tc);
 void tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4);
 void tcp_flush_frames_to_output (u8 thread_index);
+void tcp_program_fastretransmit (tcp_connection_t * tc);
+void tcp_do_fastretransmits (u32 thread_index);
 
 always_inline u32
 tcp_end_seq (tcp_header_t * th, u32 len)
@@ -659,10 +673,10 @@ tcp_is_lost_fin (tcp_connection_t * tc)
 }
 
 u32 tcp_snd_space (tcp_connection_t * tc);
-void tcp_retransmit_first_unacked (tcp_connection_t * tc);
-void tcp_fast_retransmit_no_sack (tcp_connection_t * tc);
-void tcp_fast_retransmit_sack (tcp_connection_t * tc);
-void tcp_fast_retransmit (tcp_connection_t * tc);
+int tcp_retransmit_first_unacked (tcp_connection_t * tc);
+int tcp_fast_retransmit_no_sack (tcp_connection_t * tc, u32 burst_size);
+int tcp_fast_retransmit_sack (tcp_connection_t * tc, u32 burst_size);
+int tcp_fast_retransmit (tcp_connection_t * tc, u32 burst_size);
 void tcp_cc_init_congestion (tcp_connection_t * tc);
 void tcp_cc_fastrecovery_exit (tcp_connection_t * tc);
 
index ccf12da..8f626b1 100755 (executable)
@@ -629,6 +629,8 @@ if (_av > 0)                                                                \
 
 #define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...)                     \
 {                                                                      \
+  if (_tc->snd_una != _tc->iss)                                                \
+    TCP_EVT_CC_STAT_PRINT (_tc);                                       \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
     .format = "cc: %s snd_space %u snd_una %u out %u flight %u",       \
@@ -788,9 +790,11 @@ if (TCP_DEBUG_CC > 1)                                                      \
 
 #define STATS_INTERVAL 1
 
-#define TCP_EVT_CC_RTO_STAT_HANDLER(_tc, ...)                          \
-{                                                                      \
-if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now())           \
+#define tcp_cc_time_to_print_stats(_tc)                                        \
+  _tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()              \
+  || tcp_in_fastrecovery (_tc)                                         \
+
+#define TCP_EVT_CC_RTO_STAT_PRINT(_tc)                                 \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
@@ -801,29 +805,40 @@ if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now())              \
   ed->data[0] = _tc->rto;                                              \
   ed->data[1] = _tc->srtt;                                             \
   ed->data[2] = _tc->rttvar;                                           \
-}                                                                      \
 }
-#define TCP_EVT_CC_SND_STAT_HANDLER(_tc, ...)                          \
+
+#define TCP_EVT_CC_RTO_STAT_HANDLER(_tc, ...)                          \
+{                                                                      \
+if (tcp_cc_time_to_print_stats (_tc))                                  \
 {                                                                      \
-if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now())           \
+  TCP_EVT_CC_RTO_STAT_PRINT (_tc);                                     \
+}                                                                      \
+}
+
+#define TCP_EVT_CC_SND_STAT_PRINT(_tc)                                 \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "snd_stat: dack %u sacked %u lost %u out %u rxt %u",     \
+    .format = "snd_stat: cc_space %u sacked %u lost %u out %u rxt %u", \
     .format_args = "i4i4i4i4i4",                                       \
   };                                                                   \
   DECLARE_ETD(_tc, _e, 5);                                             \
-  ed->data[0] = _tc->rcv_dupacks;                                      \
+  ed->data[0] = tcp_available_cc_snd_space (_tc);                      \
   ed->data[1] = _tc->sack_sb.sacked_bytes;                             \
   ed->data[2] = _tc->sack_sb.lost_bytes;                               \
   ed->data[3] = tcp_bytes_out (_tc);                                   \
   ed->data[3] = _tc->snd_rxt_bytes;                                    \
-}                                                                      \
 }
 
-#define TCP_EVT_CC_STAT_HANDLER(_tc, ...)                              \
+#define TCP_EVT_CC_SND_STAT_HANDLER(_tc, ...)                          \
+{                                                                      \
+if (tcp_cc_time_to_print_stats (_tc))                                  \
 {                                                                      \
-if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now())           \
+    TCP_EVT_CC_SND_STAT_PRINT(_tc);                                    \
+}                                                                      \
+}
+
+#define TCP_EVT_CC_STAT_PRINT(_tc)                                     \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
@@ -836,7 +851,15 @@ if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now())               \
   ed->data[2] = tcp_snd_space (_tc);                                   \
   ed->data[3] = _tc->ssthresh;                                         \
   ed->data[4] = _tc->snd_wnd;                                          \
-  TCP_EVT_CC_RTO_STAT_HANDLER (_tc);                                   \
+  TCP_EVT_CC_RTO_STAT_PRINT (_tc);                                     \
+  TCP_EVT_CC_SND_STAT_PRINT (_tc);                                     \
+}
+
+#define TCP_EVT_CC_STAT_HANDLER(_tc, ...)                              \
+{                                                                      \
+if (tcp_cc_time_to_print_stats (_tc))                                  \
+{                                                                      \
+  TCP_EVT_CC_STAT_PRINT (_tc);                                         \
   _tc->c_cc_stat_tstamp = tcp_time_now();                              \
 }                                                                      \
 }
index 4e3987e..39a538b 100644 (file)
@@ -749,7 +749,7 @@ scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
       /* Rule (3): if hole not lost */
       else if (seq_lt (hole->start, sb->high_sacked))
        {
-         *snd_limited = 1;
+         *snd_limited = 0;
          sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
        }
       /* Rule (4): if hole beyond high_sacked */
@@ -993,10 +993,10 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
   sb->last_sacked_bytes = sb->sacked_bytes
     - (old_sacked_bytes - sb->last_bytes_delivered);
   ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc));
-  ASSERT (sb->sacked_bytes == 0
+  ASSERT (sb->sacked_bytes == 0 || tcp_in_recovery (tc)
          || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack));
   ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max
-         - seq_max (tc->snd_una, ack));
+         - seq_max (tc->snd_una, ack) || tcp_in_recovery (tc));
   ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc)
          || sb->holes[sb->head].start == ack + sb->snd_una_adv);
   TCP_EVT_DBG (TCP_EVT_CC_SCOREBOARD, tc);
@@ -1052,6 +1052,9 @@ tcp_cc_init_congestion (tcp_connection_t * tc)
   tcp_fastrecovery_on (tc);
   tc->snd_congestion = tc->snd_una_max;
   tc->cwnd_acc_bytes = 0;
+  tc->snd_rxt_bytes = 0;
+  tc->prev_ssthresh = tc->ssthresh;
+  tc->prev_cwnd = tc->cwnd;
   tc->cc_algo->congestion (tc);
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4);
 }
@@ -1074,8 +1077,14 @@ tcp_cc_fastrecovery_exit (tcp_connection_t * tc)
   tc->snd_rxt_bytes = 0;
   tc->rcv_dupacks = 0;
   tc->snd_nxt = tc->snd_una_max;
+  tc->snd_rxt_bytes = 0;
+
+  /* HACK: since we don't have an output pacer, force slow start */
+  tc->cwnd = 20 * tc->snd_mss;
+
   tcp_fastrecovery_off (tc);
   tcp_fastrecovery_1_smss_off (tc);
+  tcp_fastrecovery_first_off (tc);
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
 }
 
@@ -1088,13 +1097,14 @@ tcp_cc_congestion_undo (tcp_connection_t * tc)
   tc->rcv_dupacks = 0;
   if (tcp_in_recovery (tc))
     tcp_cc_recovery_exit (tc);
+  else if (tcp_in_fastrecovery (tc))
+    tcp_cc_fastrecovery_exit (tc);
   ASSERT (tc->rto_boff == 0);
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 5);
-  /* TODO extend for fastrecovery */
 }
 
-static u8
-tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
+static inline u8
+tcp_cc_is_spurious_timeout_rxt (tcp_connection_t * tc)
 {
   return (tcp_in_recovery (tc) && tc->rto_boff == 1
          && tc->snd_rxt_ts
@@ -1102,6 +1112,20 @@ tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
          && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts));
 }
 
+static inline u8
+tcp_cc_is_spurious_fast_rxt (tcp_connection_t * tc)
+{
+  return (tcp_in_fastrecovery (tc)
+         && tc->cwnd > tc->ssthresh + 3 * tc->snd_mss);
+}
+
+static u8
+tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
+{
+  return (tcp_cc_is_spurious_timeout_rxt (tc)
+         || tcp_cc_is_spurious_fast_rxt (tc));
+}
+
 static int
 tcp_cc_recover (tcp_connection_t * tc)
 {
@@ -1158,6 +1182,84 @@ tcp_should_fastrecover (tcp_connection_t * tc)
          || tcp_should_fastrecover_sack (tc));
 }
 
+void
+tcp_program_fastretransmit (tcp_connection_t * tc)
+{
+  tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[tc->c_thread_index];
+  if (!(tc->flags & TCP_CONN_FRXT_PENDING))
+    {
+      vec_add1 (wrk->pending_fast_rxt, tc->c_c_index);
+      tc->flags |= TCP_CONN_FRXT_PENDING;
+    }
+}
+
+void
+tcp_do_fastretransmits (u32 thread_index)
+{
+  tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[thread_index];
+  u32 max_burst_size, burst_size, n_segs = 0;
+  tcp_connection_t *tc;
+  int i;
+
+  if (vec_len (wrk->pending_fast_rxt) == 0)
+    return;
+
+  vec_append (wrk->ongoing_fast_rxt, wrk->pending_fast_rxt);
+  vec_reset_length (wrk->pending_fast_rxt);
+
+  max_burst_size = VLIB_FRAME_SIZE / vec_len (wrk->ongoing_fast_rxt);
+  max_burst_size = clib_max (max_burst_size, 1);
+
+  for (i = 0; i < vec_len (wrk->ongoing_fast_rxt); i++)
+    {
+      tc = tcp_connection_get (wrk->ongoing_fast_rxt[i], thread_index);
+      tc->flags &= ~TCP_CONN_FRXT_PENDING;
+
+      if (!tcp_in_fastrecovery (tc))
+       continue;
+
+      /* TODO tx pacer instead of this */
+      if (n_segs >= VLIB_FRAME_SIZE)
+       {
+         tcp_program_fastretransmit (tc);
+         continue;
+       }
+
+      burst_size = clib_min (max_burst_size, VLIB_FRAME_SIZE - n_segs);
+
+      if (tc->cwnd > tc->ssthresh + 3 * tc->snd_mss)
+       {
+         /* The first segment MUST be retransmitted */
+         if (tcp_retransmit_first_unacked (tc))
+           {
+             tcp_program_fastretransmit (tc);
+             continue;
+           }
+
+         /* Post retransmit update cwnd to ssthresh and account for the
+          * three segments that have left the network and should've been
+          * buffered at the receiver XXX */
+         tc->cwnd = tc->ssthresh + 3 * tc->snd_mss;
+
+         /* If cwnd allows, send more data */
+         if (tcp_opts_sack_permitted (&tc->rcv_opts))
+           {
+             scoreboard_init_high_rxt (&tc->sack_sb,
+                                       tc->snd_una + tc->snd_mss);
+             tc->sack_sb.rescue_rxt = tc->snd_una - 1;
+             n_segs += tcp_fast_retransmit_sack (tc, burst_size);
+           }
+         else
+           {
+             n_segs += tcp_fast_retransmit_no_sack (tc, burst_size);
+           }
+       }
+      else
+       n_segs += tcp_fast_retransmit (tc, burst_size);
+    }
+  vec_reset_length (wrk->ongoing_fast_rxt);
+}
+
 /**
  * One function to rule them all ... and in the darkness bind them
  */
@@ -1170,7 +1272,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
     {
       if (tc->bytes_acked)
        goto partial_ack;
-      tcp_fast_retransmit (tc);
+      tcp_program_fastretransmit (tc);
       return;
     }
   /*
@@ -1196,20 +1298,10 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
        {
          ASSERT (!tcp_in_fastrecovery (tc));
 
-         /* If of of the two conditions lower hold, reset dupacks because
-          * we're probably after timeout (RFC6582 heuristics).
-          * If Cumulative ack does not cover more than congestion threshold,
-          * and:
-          * 1) The following doesn't hold: The congestion window is greater
-          *    than SMSS bytes and the difference between highest_ack
-          *    and prev_highest_ack is at most 4*SMSS bytes
-          * 2) Echoed timestamp in the last non-dup ack does not equal the
-          *    stored timestamp
-          */
-         if (seq_leq (tc->snd_una, tc->snd_congestion)
-             && ((!(tc->cwnd > tc->snd_mss
-                    && tc->bytes_acked <= 4 * tc->snd_mss))
-                 || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
+         /* Heuristic to catch potential late dupacks
+          * after fast retransmit exits */
+         if (is_dack && tc->snd_una == tc->snd_congestion
+             && timestamp_leq (tc->rcv_opts.tsecr, tc->tsecr_last_ack))
            {
              tc->rcv_dupacks = 0;
              return;
@@ -1218,26 +1310,10 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
          tcp_cc_init_congestion (tc);
          tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
 
-         /* The first segment MUST be retransmitted */
-         tcp_retransmit_first_unacked (tc);
-
-         /* Post retransmit update cwnd to ssthresh and account for the
-          * three segments that have left the network and should've been
-          * buffered at the receiver XXX */
-         tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss;
-         ASSERT (tc->cwnd >= tc->snd_mss);
-
-         /* If cwnd allows, send more data */
          if (tcp_opts_sack_permitted (&tc->rcv_opts))
-           {
-             scoreboard_init_high_rxt (&tc->sack_sb,
-                                       tc->snd_una + tc->snd_mss);
-             tcp_fast_retransmit_sack (tc);
-           }
-         else
-           {
-             tcp_fast_retransmit_no_sack (tc);
-           }
+           tc->sack_sb.high_rxt = tc->snd_una;
+
+         tcp_program_fastretransmit (tc);
          return;
        }
       else if (!tc->bytes_acked
@@ -1249,6 +1325,28 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
       else
        goto partial_ack;
     }
+  /* Don't allow entry in fast recovery if still in recovery, for now */
+  else if (0 && is_dack && tcp_in_recovery (tc))
+    {
+      /* If of of the two conditions lower hold, reset dupacks because
+       * we're probably after timeout (RFC6582 heuristics).
+       * If Cumulative ack does not cover more than congestion threshold,
+       * and:
+       * 1) The following doesn't hold: The congestion window is greater
+       *    than SMSS bytes and the difference between highest_ack
+       *    and prev_highest_ack is at most 4*SMSS bytes
+       * 2) Echoed timestamp in the last non-dup ack does not equal the
+       *    stored timestamp
+       */
+      if (seq_leq (tc->snd_una, tc->snd_congestion)
+         && ((!(tc->cwnd > tc->snd_mss
+                && tc->bytes_acked <= 4 * tc->snd_mss))
+             || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
+       {
+         tc->rcv_dupacks = 0;
+         return;
+       }
+    }
 
   if (!tc->bytes_acked)
     return;
@@ -1259,14 +1357,11 @@ partial_ack:
   /*
    * Legitimate ACK. 1) See if we can exit recovery
    */
-  /* XXX limit this only to first partial ack? */
-  if (seq_lt (tc->snd_una, tc->snd_congestion))
-    tcp_retransmit_timer_force_update (tc);
-  else
-    tcp_retransmit_timer_update (tc);
 
   if (seq_geq (tc->snd_una, tc->snd_congestion))
     {
+      tcp_retransmit_timer_update (tc);
+
       /* If spurious return, we've already updated everything */
       if (tcp_cc_recover (tc))
        {
@@ -1286,6 +1381,9 @@ partial_ack:
    * Legitimate ACK. 2) If PARTIAL ACK try to retransmit
    */
 
+  /* XXX limit this only to first partial ack? */
+  tcp_retransmit_timer_force_update (tc);
+
   /* RFC6675: If the incoming ACK is a cumulative acknowledgment,
    * reset dupacks to 0. Also needed if in congestion recovery */
   tc->rcv_dupacks = 0;
@@ -1300,24 +1398,33 @@ partial_ack:
     }
 
   /* Remove retransmitted bytes that have been delivered */
-  ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv
-         >= tc->sack_sb.last_bytes_delivered
-         || (tc->flags & TCP_CONN_FINSNT));
-
-  if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt))
+  if (tcp_opts_sack_permitted (&tc->rcv_opts))
     {
+      ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv
+             >= tc->sack_sb.last_bytes_delivered
+             || (tc->flags & TCP_CONN_FINSNT));
+
       /* If we have sacks and we haven't gotten an ack beyond high_rxt,
        * remove sacked bytes delivered */
-      rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv
-       - tc->sack_sb.last_bytes_delivered;
-      ASSERT (tc->snd_rxt_bytes >= rxt_delivered);
-      tc->snd_rxt_bytes -= rxt_delivered;
+      if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt))
+       {
+         rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv
+           - tc->sack_sb.last_bytes_delivered;
+         ASSERT (tc->snd_rxt_bytes >= rxt_delivered);
+         tc->snd_rxt_bytes -= rxt_delivered;
+       }
+      else
+       {
+         /* Apparently all retransmitted holes have been acked */
+         tc->snd_rxt_bytes = 0;
+       }
     }
   else
     {
-      /* Either all retransmitted holes have been acked, or we're
-       * "in the blind" and retransmitting segment by segment */
-      tc->snd_rxt_bytes = 0;
+      if (tc->snd_rxt_bytes > tc->bytes_acked)
+       tc->snd_rxt_bytes -= tc->bytes_acked;
+      else
+       tc->snd_rxt_bytes = 0;
     }
 
   tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK);
@@ -1325,7 +1432,7 @@ partial_ack:
   /*
    * Since this was a partial ack, try to retransmit some more data
    */
-  tcp_fast_retransmit (tc);
+  tcp_program_fastretransmit (tc);
 }
 
 /**
index ed1c641..2e6036b 100644 (file)
@@ -1409,7 +1409,11 @@ tcp_rxt_timeout_cc (tcp_connection_t * tc)
 
   /* Cleanly recover cc (also clears up fast retransmit) */
   if (tcp_in_fastrecovery (tc))
-    tcp_cc_fastrecovery_exit (tc);
+    {
+      /* TODO be less aggressive about this */
+      scoreboard_clear (&tc->sack_sb);
+      tcp_cc_fastrecovery_exit (tc);
+    }
 
   /* Start again from the beginning */
   tc->cc_algo->congestion (tc);
@@ -1487,6 +1491,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
       /* First retransmit timeout */
       if (tc->rto_boff == 1)
        tcp_rxt_timeout_cc (tc);
+      else
+       scoreboard_clear (&tc->sack_sb);
 
       /* If we've sent beyond snd_congestion, update it */
       if (seq_gt (tc->snd_una_max, tc->snd_congestion))
@@ -1499,9 +1505,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
        * shortfall */
       n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
 
-      /* TODO be less aggressive about this */
-      scoreboard_clear (&tc->sack_sb);
-
       if (n_bytes == 0)
        {
          tcp_retransmit_timer_force_update (tc);
@@ -1680,7 +1683,7 @@ tcp_timer_persist_handler (u32 index)
 /**
  * Retransmit first unacked segment
  */
-void
+int
 tcp_retransmit_first_unacked (tcp_connection_t * tc)
 {
   vlib_main_t *vm = vlib_get_main ();
@@ -1691,20 +1694,23 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc)
   tc->snd_nxt = tc->snd_una;
 
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
+
   n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
   if (!n_bytes)
-    return;
+    return -1;
+
   bi = vlib_get_buffer_index (vm, b);
   tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
-
   tc->snd_nxt = old_snd_nxt;
+
+  return 0;
 }
 
 /**
  * Do fast retransmit with SACKs
  */
-void
-tcp_fast_retransmit_sack (tcp_connection_t * tc)
+int
+tcp_fast_retransmit_sack (tcp_connection_t * tc, u32 burst_size)
 {
   vlib_main_t *vm = vlib_get_main ();
   u32 n_written = 0, offset, max_bytes, n_segs = 0;
@@ -1720,13 +1726,16 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc)
   old_snd_nxt = tc->snd_nxt;
   sb = &tc->sack_sb;
   snd_space = tcp_available_cc_snd_space (tc);
+  hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
 
   if (snd_space < tc->snd_mss)
-    goto done;
+    {
+      tcp_program_fastretransmit (tc);
+      goto done;
+    }
 
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
-  hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
-  while (hole && snd_space > 0 && n_segs++ < VLIB_FRAME_SIZE)
+  while (snd_space > 0 && n_segs < burst_size)
     {
       hole = scoreboard_next_rxt_hole (sb, hole,
                                       tcp_fastrecovery_sent_1_smss (tc),
@@ -1736,7 +1745,21 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc)
          if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una)
                               || seq_gt (sb->rescue_rxt,
                                          tc->snd_congestion)))
-           break;
+           {
+             if (tcp_fastrecovery_first (tc))
+               break;
+
+             /* We tend to lose the first segment. Try re-resending
+              * it but only once and after we've tried everything */
+             hole = scoreboard_first_hole (sb);
+             if (hole && hole->start == tc->snd_una)
+               {
+                 tcp_retransmit_first_unacked (tc);
+                 tcp_fastrecovery_first_on (tc);
+                 n_segs += 1;
+               }
+             break;
+           }
 
          /* If rescue rxt undefined or less than snd_una then one segment of
           * up to SMSS octets that MUST include the highest outstanding
@@ -1756,6 +1779,7 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc)
 
          bi = vlib_get_buffer_index (vm, b);
          tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+         n_segs += 1;
          break;
        }
 
@@ -1776,22 +1800,27 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc)
       tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
       ASSERT (n_written <= snd_space);
       snd_space -= n_written;
+      n_segs += 1;
     }
 
+  if (hole)
+    tcp_program_fastretransmit (tc);
+
 done:
   /* If window allows, send 1 SMSS of new data */
   tc->snd_nxt = old_snd_nxt;
+  return n_segs;
 }
 
 /**
  * Fast retransmit without SACK info
  */
-void
-tcp_fast_retransmit_no_sack (tcp_connection_t * tc)
+int
+tcp_fast_retransmit_no_sack (tcp_connection_t * tc, u32 burst_size)
 {
   vlib_main_t *vm = vlib_get_main ();
   u32 n_written = 0, offset = 0, bi, old_snd_nxt;
-  int snd_space;
+  int snd_space, n_segs = 0;
   vlib_buffer_t *b;
 
   ASSERT (tcp_in_fastrecovery (tc));
@@ -1802,7 +1831,7 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc)
   tc->snd_nxt = tc->snd_una;
   snd_space = tcp_available_cc_snd_space (tc);
 
-  while (snd_space > 0)
+  while (snd_space > 0 && n_segs < burst_size)
     {
       offset += n_written;
       n_written = tcp_prepare_retransmit_segment (tc, offset, snd_space, &b);
@@ -1814,22 +1843,29 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc)
       bi = vlib_get_buffer_index (vm, b);
       tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
       snd_space -= n_written;
+      n_segs += 1;
     }
 
+  /* More data to resend */
+  if (seq_lt (tc->snd_nxt, tc->snd_congestion))
+    tcp_program_fastretransmit (tc);
+
   /* Restore snd_nxt. If window allows, send 1 SMSS of new data */
   tc->snd_nxt = old_snd_nxt;
+
+  return n_segs;
 }
 
 /**
  * Do fast retransmit
  */
-void
-tcp_fast_retransmit (tcp_connection_t * tc)
+int
+tcp_fast_retransmit (tcp_connection_t * tc, u32 burst_size)
 {
   if (tcp_opts_sack_permitted (&tc->rcv_opts))
-    tcp_fast_retransmit_sack (tc);
+    return tcp_fast_retransmit_sack (tc, burst_size);
   else
-    tcp_fast_retransmit_no_sack (tc);
+    return tcp_fast_retransmit_no_sack (tc, burst_size);
 }
 
 static u32