tcp: loss recovery improvements/fixes 08/12708/10
author Florin Coras <fcoras@cisco.com>
Thu, 24 May 2018 04:01:30 +0000 (21:01 -0700)
committer Damjan Marion <dmarion.lists@gmail.com>
Sat, 26 May 2018 18:56:43 +0000 (18:56 +0000)
- fix newreno cwnd computation
- reset snd_una_max on entering recovery
- accept acks beyond snd_nxt but less than snd_congestion when in
recovery
- avoid entering fast recovery multiple times when using sacks
- avoid as much as possible sending small segments when doing fast
retransmit
- more event logging

Change-Id: I19dd151d7704e39d4eae06de3a26f5e124875366
Signed-off-by: Florin Coras <fcoras@cisco.com>
src/vnet/session/session.c
src/vnet/session/session_node.c
src/vnet/tcp/tcp.c
src/vnet/tcp/tcp.h
src/vnet/tcp/tcp_debug.h
src/vnet/tcp/tcp_input.c
src/vnet/tcp/tcp_newreno.c
src/vnet/tcp/tcp_output.c

index 2697c26..c016325 100644 (file)
@@ -774,7 +774,7 @@ stream_session_reset_notify (transport_connection_t * tc)
   stream_session_t *s;
   application_t *app;
   s = session_get (tc->s_index, tc->thread_index);
-
+  s->session_state = SESSION_STATE_CLOSED;
   app = application_get (s->app_index);
   app->cb_fns.session_reset_callback (s);
 }
index 46fc4dc..e046efb 100644 (file)
@@ -260,6 +260,8 @@ session_tx_not_ready (stream_session_t * s, u8 peek_data)
        * session is not ready or closed */
       if (s->session_state < SESSION_STATE_READY)
        return 1;
+      if (s->session_state == SESSION_STATE_CLOSED)
+       return 2;
     }
   return 0;
 }
@@ -364,11 +366,12 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
   session_tx_context_t *ctx = &smm->ctx[thread_index];
   transport_proto_t tp;
   vlib_buffer_t *pb;
-  u16 n_bufs;
+  u16 n_bufs, rv;
 
-  if (PREDICT_FALSE (session_tx_not_ready (s, peek_data)))
+  if (PREDICT_FALSE ((rv = session_tx_not_ready (s, peek_data))))
     {
-      vec_add1 (smm->pending_event_vector[thread_index], *e);
+      if (rv < 2)
+       vec_add1 (smm->pending_event_vector[thread_index], *e);
       return 0;
     }
 
index 25292d1..15ac7d3 100644 (file)
@@ -734,9 +734,9 @@ format_tcp_vars (u8 * s, va_list * args)
   s = format (s, " snd_wnd %u rcv_wnd %u snd_wl1 %u snd_wl2 %u\n",
              tc->snd_wnd, tc->rcv_wnd, tc->snd_wl1 - tc->irs,
              tc->snd_wl2 - tc->iss);
-  s = format (s, " flight size %u send space %u rcv_wnd_av %d\n",
+  s = format (s, " flight size %u out space %u cc space %u rcv_wnd_av %u\n",
              tcp_flight_size (tc), tcp_available_output_snd_space (tc),
-             tcp_rcv_wnd_available (tc));
+             tcp_available_cc_snd_space (tc), tcp_rcv_wnd_available (tc));
   s = format (s, " cong %U ", format_tcp_congestion_status, tc);
   s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n",
              tc->cwnd, tc->ssthresh, tc->snd_rxt_bytes, tc->bytes_acked);
@@ -1022,7 +1022,7 @@ tcp_snd_space (tcp_connection_t * tc)
    * bytes of previously unsent data. */
   if (tcp_in_fastrecovery (tc) && !tcp_fastrecovery_sent_1_smss (tc))
     {
-      if (tcp_available_output_snd_space (tc) < tc->snd_mss)
+      if (tcp_available_cc_snd_space (tc) < tc->snd_mss)
        return 0;
       tcp_fastrecovery_1_smss_on (tc);
       return tc->snd_mss;
index 837b5b4..10aa721 100644 (file)
@@ -119,7 +119,7 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler;
   _(FAST_RECOVERY, "Fast Recovery")            \
   _(FR_1_SMSS, "Sent 1 SMSS")                  \
   _(HALF_OPEN_DONE, "Half-open completed")     \
-  _(FINPNDG, "FIN pending")
+  _(FINPNDG, "FIN pending")                    \
 
 typedef enum _tcp_connection_flag_bits
 {
@@ -617,7 +617,7 @@ tcp_available_output_snd_space (const tcp_connection_t * tc)
  * Estimate of how many bytes we can still push into the network
  */
 always_inline u32
-tcp_available_snd_space (const tcp_connection_t * tc)
+tcp_available_cc_snd_space (const tcp_connection_t * tc)
 {
   u32 available_wnd = tcp_available_snd_wnd (tc);
   u32 flight_size = tcp_flight_size (tc);
@@ -652,6 +652,7 @@ fib_node_index_t tcp_lookup_rmt_in_fib (tcp_connection_t * tc);
 
 /* Made public for unit testing only */
 void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end);
+u32 tcp_sack_list_bytes (tcp_connection_t * tc);
 
 always_inline u32
 tcp_time_now (void)
@@ -791,7 +792,6 @@ tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer)
 void
 scoreboard_remove_hole (sack_scoreboard_t * sb,
                        sack_scoreboard_hole_t * hole);
-void scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb);
 sack_scoreboard_hole_t *scoreboard_insert_hole (sack_scoreboard_t * sb,
                                                u32 prev_index, u32 start,
                                                u32 end);
index 4af4f2e..a52efe0 100755 (executable)
@@ -56,6 +56,9 @@
   _(CC_PACK, "cc partial ack")         \
   _(CC_STAT, "cc stats")               \
   _(CC_RTO_STAT, "cc rto stats")       \
+  _(CC_SCOREBOARD, "scoreboard stats") \
+  _(CC_SACKS, "snd sacks stats")       \
+  _(CC_INPUT, "ooo data delivered")    \
   _(SEG_INVALID, "invalid segment")    \
   _(PAWS_FAIL, "failed paws check")    \
   _(ACK_RCV_ERR, "invalid ack")                \
@@ -192,7 +195,7 @@ typedef enum _tcp_dbg_evt
   ed->data[0] = _tc->c_c_index;                                                \
 }
 
-#define TCP_EVT_SYN_RCVD_HANDLER(_tc,_init, ...)                               \
+#define TCP_EVT_SYN_RCVD_HANDLER(_tc,_init, ...)                       \
 {                                                                      \
   if (_init)                                                           \
     TCP_EVT_INIT_HANDLER(_tc, 0);                                      \
@@ -277,9 +280,9 @@ typedef enum _tcp_dbg_evt
   };                                                                   \
   DECLARE_ETD(_tc, _e, 4);                                             \
   ed->data[0] = _tc->iss;                                              \
-  ed->data[1] = _tc->snd_una - _tc->iss;                                       \
+  ed->data[1] = _tc->snd_una - _tc->iss;                               \
   ed->data[2] = _tc->snd_una_max - _tc->iss;                           \
-  ed->data[3] = _tc->snd_nxt - _tc->iss;                                       \
+  ed->data[3] = _tc->snd_nxt - _tc->iss;                               \
   TCP_EVT_STATE_CHANGE_HANDLER(_tc);                                   \
 }
 
@@ -288,14 +291,14 @@ typedef enum _tcp_dbg_evt
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
     .format = "synack-tx: iss %u irs %u snd_una %u snd_nxt %u rcv_nxt %u",\
-    .format_args = "i4i4i4i4i4",                                               \
+    .format_args = "i4i4i4i4i4",                                       \
   };                                                                   \
   DECLARE_ETD(_tc, _e, 5);                                             \
   ed->data[0] = _tc->iss;                                              \
   ed->data[1] = _tc->irs;                                              \
-  ed->data[2] = _tc->snd_una - _tc->iss;                                       \
-  ed->data[3] = _tc->snd_nxt - _tc->iss;                                       \
-  ed->data[4] = _tc->rcv_nxt - _tc->irs;                                       \
+  ed->data[2] = _tc->snd_una - _tc->iss;                               \
+  ed->data[3] = _tc->snd_nxt - _tc->iss;                               \
+  ed->data[4] = _tc->rcv_nxt - _tc->irs;                               \
 }
 
 #define TCP_EVT_SYNACK_RCVD_HANDLER(_tc, ...)                          \
@@ -303,14 +306,14 @@ typedef enum _tcp_dbg_evt
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
     .format = "synack-rx: iss %u irs %u snd_una %u snd_nxt %u rcv_nxt %u",\
-    .format_args = "i4i4i4i4i4",                                               \
+    .format_args = "i4i4i4i4i4",                                       \
   };                                                                   \
   DECLARE_ETD(_tc, _e, 5);                                             \
   ed->data[0] = _tc->iss;                                              \
   ed->data[1] = _tc->irs;                                              \
-  ed->data[2] = _tc->snd_una - _tc->iss;                                       \
-  ed->data[3] = _tc->snd_nxt - _tc->iss;                                       \
-  ed->data[4] = _tc->rcv_nxt - _tc->irs;                                       \
+  ed->data[2] = _tc->snd_una - _tc->iss;                               \
+  ed->data[3] = _tc->snd_nxt - _tc->iss;                               \
+  ed->data[4] = _tc->rcv_nxt - _tc->irs;                               \
   TCP_EVT_STATE_CHANGE_HANDLER(_tc);                                   \
 }
 
@@ -371,7 +374,7 @@ if (_tc)                                                            \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
     .format = "%s-rxt: iss %u irs %u snd_nxt %u rcv_nxt %u",           \
-    .format_args = "t4i4i4i4i4",                                               \
+    .format_args = "t4i4i4i4i4",                                       \
     .n_enum_strings = 2,                                               \
     .enum_strings = {                                                  \
        "syn",                                                          \
@@ -382,10 +385,9 @@ if (_tc)                                                           \
   ed->data[0] = _type;                                                 \
   ed->data[1] = _tc->iss;                                              \
   ed->data[2] = _tc->irs;                                              \
-  ed->data[3] = _tc->snd_nxt - _tc->iss;                                       \
-  ed->data[4] = _tc->rcv_nxt - _tc->irs;                                       \
+  ed->data[3] = _tc->snd_nxt - _tc->iss;                               \
+  ed->data[4] = _tc->rcv_nxt - _tc->irs;                               \
 }
-
 #else
 #define TCP_EVT_SYN_SENT_HANDLER(_tc, ...)
 #define TCP_EVT_SYNACK_SENT_HANDLER(_tc, ...)
@@ -399,6 +401,81 @@ if (_tc)                                                           \
 #endif
 
 #if TCP_DEBUG_SM > 1
+#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _btcp, ...)                   \
+{                                                                      \
+  ELOG_TYPE_DECLARE (_e) =                                             \
+  {                                                                    \
+    .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u rcv_wnd %u",\
+    .format_args = "i4i4i4i4i4",                                       \
+  };                                                                   \
+  DECLARE_ETD(_tc, _e, 5);                                             \
+  ed->data[0] = _btcp.seq_number - _tc->irs;                           \
+  ed->data[1] = _btcp.seq_end - _tc->irs;                              \
+  ed->data[2] = _tc->rcv_las - _tc->irs;                               \
+  ed->data[3] = _tc->rcv_nxt - _tc->irs;                               \
+  ed->data[4] = _tc->rcv_wnd;                                          \
+}
+
+#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...)                        \
+{                                                                      \
+  ELOG_TYPE_DECLARE (_e) =                                             \
+  {                                                                    \
+    .format = "paws-err: seq %u end %u tsval %u tsval_recent %u",      \
+    .format_args = "i4i4i4i4",                                         \
+  };                                                                   \
+  DECLARE_ETD(_tc, _e, 4);                                             \
+  ed->data[0] = _seq - _tc->irs;                                       \
+  ed->data[1] = _end - _tc->irs;                                       \
+  ed->data[2] = _tc->rcv_opts.tsval;                                   \
+  ed->data[3] = _tc->tsval_recent;                                     \
+}
+
+#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...)             \
+{                                                                      \
+  ELOG_TYPE_DECLARE (_e) =                                             \
+  {                                                                    \
+    .format = "ack-err: %s ack %u snd_una %u snd_nxt %u una_max %u",   \
+    .format_args = "t4i4i4i4i4",                                       \
+    .n_enum_strings = 3,                                               \
+    .enum_strings = {                                                  \
+      "invalid",                                                       \
+      "old",                                                           \
+      "future",                                                                \
+    },                                                                         \
+  };                                                                   \
+  DECLARE_ETD(_tc, _e, 5);                                             \
+  ed->data[0] = _type;                                                 \
+  ed->data[1] = _ack - _tc->iss;                                       \
+  ed->data[2] = _tc->snd_una - _tc->iss;                               \
+  ed->data[3] = _tc->snd_nxt - _tc->iss;                               \
+  ed->data[4] = _tc->snd_una_max - _tc->iss;                           \
+}
+
+#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...)            \
+{                                                                      \
+if (_av > 0)                                                           \
+{                                                                      \
+  ELOG_TYPE_DECLARE (_e) =                                             \
+  {                                                                    \
+    .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u",  \
+    .format_args = "i4i4i4i4i4",                                       \
+  };                                                                   \
+  DECLARE_ETD(_tc, _e, 5);                                             \
+  ed->data[0] = _tc->rcv_wnd;                                          \
+  ed->data[1] = _obs;                                                  \
+  ed->data[2] = _av;                                                   \
+  ed->data[3] = _tc->rcv_nxt - _tc->irs;                               \
+  ed->data[4] = _tc->rcv_las - _tc->irs;                               \
+}                                                                      \
+}
+#else
+#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _btcp, ...)
+#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...)
+#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...)
+#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...)
+#endif
+
+#if TCP_DEBUG_SM > 2
 
 #define TCP_EVT_ACK_SENT_HANDLER(_tc, ...)                             \
 {                                                                      \
@@ -505,90 +582,18 @@ if (_tc)                                                          \
                    _tc_index);                                         \
     }                                                                  \
 }
-
-#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...)              \
-{                                                                      \
-  ELOG_TYPE_DECLARE (_e) =                                             \
-  {                                                                    \
-    .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u rcv_wnd %u",\
-    .format_args = "i4i4i4i4i4",                                       \
-  };                                                                   \
-  DECLARE_ETD(_tc, _e, 5);                                             \
-  ed->data[0] = _seq - _tc->irs;                                       \
-  ed->data[1] = _end - _tc->irs;                                       \
-  ed->data[2] = _tc->rcv_las - _tc->irs;                               \
-  ed->data[3] = _tc->rcv_nxt - _tc->irs;                               \
-  ed->data[4] = _tc->rcv_wnd;                                          \
-}
-
-#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...)                        \
-{                                                                      \
-  ELOG_TYPE_DECLARE (_e) =                                             \
-  {                                                                    \
-    .format = "paws-err: seq %u end %u tsval %u tsval_recent %u",      \
-    .format_args = "i4i4i4i4",                                         \
-  };                                                                   \
-  DECLARE_ETD(_tc, _e, 4);                                             \
-  ed->data[0] = _seq - _tc->irs;                                       \
-  ed->data[1] = _end - _tc->irs;                                       \
-  ed->data[2] = _tc->rcv_opts.tsval;                                   \
-  ed->data[3] = _tc->tsval_recent;                                     \
-}
-
-#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...)             \
-{                                                                      \
-  ELOG_TYPE_DECLARE (_e) =                                             \
-  {                                                                    \
-    .format = "ack-err: %s ack %u snd_una %u snd_nxt %u una_max %u",   \
-    .format_args = "t4i4i4i4i4",                                       \
-    .n_enum_strings = 3,                                               \
-    .enum_strings = {                                                  \
-      "invalid",                                                       \
-      "old",                                                           \
-      "future",                                                                \
-    },                                                                         \
-  };                                                                   \
-  DECLARE_ETD(_tc, _e, 5);                                             \
-  ed->data[0] = _type;                                                 \
-  ed->data[1] = _ack - _tc->iss;                                       \
-  ed->data[2] = _tc->snd_una - _tc->iss;                               \
-  ed->data[3] = _tc->snd_nxt - _tc->iss;                               \
-  ed->data[4] = _tc->snd_una_max - _tc->iss;                           \
-}
-
-#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...)            \
-{                                                                      \
-if (_av > 0)                                                           \
-{                                                                      \
-  ELOG_TYPE_DECLARE (_e) =                                             \
-  {                                                                    \
-    .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u",  \
-    .format_args = "i4i4i4i4i4",                                       \
-  };                                                                   \
-  DECLARE_ETD(_tc, _e, 5);                                             \
-  ed->data[0] = _tc->rcv_wnd;                                          \
-  ed->data[1] = _obs;                                                  \
-  ed->data[2] = _av;                                                   \
-  ed->data[3] = _tc->rcv_nxt - _tc->irs;                               \
-  ed->data[4] = _tc->rcv_las - _tc->irs;                               \
-}                                                                      \
-}
 #else
 #define TCP_EVT_ACK_SENT_HANDLER(_tc, ...)
 #define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...)
 #define TCP_EVT_PKTIZE_HANDLER(_tc, ...)
 #define TCP_EVT_INPUT_HANDLER(_tc, _type, _len, _written, ...)
 #define TCP_EVT_TIMER_POP_HANDLER(_tc_index, _timer_id, ...)
-#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...)
-#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...)
-#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...)
-#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...)
 #endif
 
 /*
  * State machine verbose
  */
-#if TCP_DEBUG_SM > 2
+#if TCP_DEBUG_SM > 3
 #define TCP_EVT_SND_WND_HANDLER(_tc, ...)                              \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
@@ -626,9 +631,9 @@ if (_av > 0)                                                                \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "cc: %s snd_space %u snd_cong %u rxt_bytes %u",          \
-    .format_args = "t4i4i4i4",                                         \
-    .n_enum_strings = 6,                                               \
+    .format = "cc: %s snd_space %u snd_una %u out %u flight %u",       \
+    .format_args = "t4i4i4i4i4",                                       \
+    .n_enum_strings = 7,                                               \
     .enum_strings = {                                                  \
       "fast-rxt",                                                      \
       "rxt-timeout",                                                   \
@@ -636,13 +641,15 @@ if (_av > 0)                                                              \
       "recovered",                                                     \
       "congestion",                                                    \
       "undo",                                                          \
+      "recovery",                                                      \
     },                                                                 \
   };                                                                   \
-  DECLARE_ETD(_tc, _e, 4);                                             \
+  DECLARE_ETD(_tc, _e, 5);                                             \
   ed->data[0] = _sub_evt;                                              \
-  ed->data[1] = tcp_available_snd_space (_tc);                         \
-  ed->data[2] = _tc->snd_congestion - _tc->iss;                                \
-  ed->data[3] = _tc->snd_rxt_bytes;                                    \
+  ed->data[1] = tcp_available_cc_snd_space (_tc);                      \
+  ed->data[2] = _tc->snd_una - _tc->iss;                               \
+  ed->data[3] = tcp_bytes_out(_tc);                                    \
+  ed->data[4] = tcp_flight_size (_tc);                                 \
 }
 
 #define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...)              \
@@ -659,19 +666,19 @@ if (_av > 0)                                                              \
   ed->data[3] = _tc->snd_rxt_bytes;                                    \
 }
 
-#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...)                          \
+#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, _btcp, ...)                   \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "dack-tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av_wnd %u snd_wnd %u",\
+    .format = "dack-tx: rcv_nxt %u seq %u rcv_wnd %u snd_nxt %u av_wnd %u",\
     .format_args = "i4i4i4i4i4",                                       \
   };                                                                   \
   DECLARE_ETD(_tc, _e, 5);                                             \
   ed->data[0] = _tc->rcv_nxt - _tc->irs;                               \
-  ed->data[1] = _tc->rcv_wnd;                                          \
-  ed->data[2] = _tc->snd_nxt - _tc->iss;                               \
-  ed->data[3] = tcp_available_snd_wnd(_tc);                            \
-  ed->data[4] = _tc->snd_wnd;                                          \
+  ed->data[1] = _btcp.seq_number - _tc->irs;                           \
+  ed->data[2] = _tc->rcv_wnd;                                          \
+  ed->data[3] = _tc->snd_nxt - _tc->iss;                               \
+  ed->data[4] = tcp_available_snd_wnd(_tc);                            \
 }
 
 #define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...)                          \
@@ -700,12 +707,75 @@ if (_av > 0)                                                              \
   ed->data[0] = _tc->snd_una - _tc->iss;                               \
   ed->data[1] = _tc->snd_una_max - _tc->iss;                           \
 }
+#define TCP_EVT_CC_SCOREBOARD_HANDLER(_tc, ...)                                \
+{                                                                      \
+if (TCP_DEBUG_CC > 1 && _tc->sack_sb.last_sacked_bytes)                        \
+  {                                                                    \
+    ELOG_TYPE_DECLARE (_e) =                                           \
+    {                                                                  \
+      .format = "sb1: holes %u lost %u sacked %u high %u highrxt %u",  \
+      .format_args = "i4i4i4i4i4",                                     \
+    };                                                                 \
+    DECLARE_ETD(_tc, _e, 5);                                           \
+    ed->data[0] = pool_elts(_tc->sack_sb.holes);                       \
+    ed->data[1] = _tc->sack_sb.lost_bytes;                             \
+    ed->data[2] = _tc->sack_sb.sacked_bytes;                           \
+    ed->data[3] = _tc->sack_sb.high_sacked - _tc->iss;                 \
+    ed->data[4] = _tc->sack_sb.high_rxt - _tc->iss;                    \
+  }                                                                    \
+if (TCP_DEBUG_CC > 1 && _tc->sack_sb.last_sacked_bytes)                        \
+  {                                                                    \
+    sack_scoreboard_hole_t *hole;                                      \
+    hole = scoreboard_first_hole (&_tc->sack_sb);                      \
+    ELOG_TYPE_DECLARE (_e) =                                           \
+    {                                                                  \
+      .format = "sb2: first start: %u end %u last start %u end %u",    \
+      .format_args = "i4i4i4i4",                                       \
+    };                                                                 \
+    DECLARE_ETD(_tc, _e, 4);                                           \
+    ed->data[0] = hole ? hole->start - _tc->iss : 0;                   \
+    ed->data[1] = hole ? hole->end - _tc->iss : 0;                     \
+    hole = scoreboard_last_hole (&_tc->sack_sb);                       \
+    ed->data[2] = hole ? hole->start - _tc->iss : 0;                   \
+    ed->data[3] = hole ? hole->end - _tc->iss : 0;                     \
+  }                                                                    \
+}
+#define TCP_EVT_CC_SACKS_HANDLER(_tc, ...)                             \
+{                                                                      \
+if (TCP_DEBUG_CC > 1)                                                  \
+  {                                                                    \
+    ELOG_TYPE_DECLARE (_e) =                                           \
+    {                                                                  \
+      .format = "sacks: blocks %u bytes %u",                           \
+      .format_args = "i4i4",                                           \
+    };                                                                 \
+    DECLARE_ETD(_tc, _e, 2);                                           \
+    ed->data[0] = vec_len (_tc->snd_sacks);                            \
+    ed->data[1] = tcp_sack_list_bytes (_tc);                           \
+  }                                                                    \
+}
+#define TCP_EVT_CC_INPUT_HANDLER(_tc, _len, _written, ...)             \
+{                                                                      \
+  ELOG_TYPE_DECLARE (_e) =                                             \
+  {                                                                    \
+    .format = "cc input: len %u written %d rcv_nxt %u rcv_wnd(o) %d",  \
+    .format_args = "i4i4i4i4",                                         \
+  };                                                                   \
+  DECLARE_ETD(_tc, _e, 4);                                             \
+  ed->data[0] = _len;                                                  \
+  ed->data[1] = _written;                                              \
+  ed->data[2] = _tc->rcv_nxt - _tc->irs;                               \
+  ed->data[3] = _tc->rcv_wnd - (_tc->rcv_nxt - _tc->rcv_las);          \
+}
 #else
 #define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...)
-#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, _btcp, ...)
 #define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...)
 #define TCP_EVT_CC_PACK_HANDLER(_tc, ...)
 #define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...)
+#define TCP_EVT_CC_SCOREBOARD_HANDLER(_tc, ...)
+#define TCP_EVT_CC_SACKS_HANDLER(_tc, ...)
+#define TCP_EVT_CC_INPUT_HANDLER(_tc, _len, _written, ...)
 #endif
 
 /*
index c864320..19ecc7d 100644 (file)
@@ -275,6 +275,14 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
                      vlib_buffer_t * b0, tcp_header_t * th0,
                      u32 * next0, u32 * error0)
 {
+  /* We could get a burst of RSTs interleaved with acks */
+  if (PREDICT_FALSE (tc0->state == TCP_STATE_CLOSED))
+    {
+      tcp_send_reset (tc0);
+      *error0 = TCP_ERROR_CONNECTION_CLOSED;
+      goto drop;
+    }
+
   if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0)))
     {
       *error0 = TCP_ERROR_SEGMENT_INVALID;
@@ -292,13 +300,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
     {
       *error0 = TCP_ERROR_PAWS;
       if (CLIB_DEBUG > 2)
-       {
-         clib_warning ("paws failed\n%U", format_tcp_connection, tc0, 2);
-         clib_warning ("seq %u seq_end %u ack %u",
-                       vnet_buffer (b0)->tcp.seq_number - tc0->irs,
-                       vnet_buffer (b0)->tcp.seq_end - tc0->irs,
-                       vnet_buffer (b0)->tcp.ack_number - tc0->iss);
-       }
+       clib_warning ("paws failed\n%U", format_tcp_connection, tc0, 2);
       TCP_EVT_DBG (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number,
                   vnet_buffer (b0)->tcp.seq_end);
 
@@ -317,7 +319,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
          if (!tcp_rst (th0))
            {
              tcp_make_ack (tc0, b0);
-             TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0);
+             TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp);
              goto error;
            }
        }
@@ -329,7 +331,6 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
                               vnet_buffer (b0)->tcp.seq_end))
     {
       *error0 = TCP_ERROR_RCV_WND;
-
       /* If our window is 0 and the packet is in sequence, let it pass
        * through for ack processing. It should be dropped later. */
       if (!(tc0->rcv_wnd == 0
@@ -339,7 +340,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
          if (!tcp_rst (th0))
            {
              tcp_make_ack (tc0, b0);
-             TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0);
+             TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp);
              goto error;
            }
          goto drop;
@@ -889,13 +890,14 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
   scoreboard_update_bytes (tc, sb);
   sb->last_sacked_bytes = sb->sacked_bytes
     - (old_sacked_bytes - sb->last_bytes_delivered);
-  ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes);
+  ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc));
   ASSERT (sb->sacked_bytes == 0
          || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack));
   ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max
          - seq_max (tc->snd_una, ack));
   ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc)
          || sb->holes[sb->head].start == ack + sb->snd_una_adv);
+  TCP_EVT_DBG (TCP_EVT_CC_SCOREBOARD, tc);
 }
 
 /**
@@ -1063,11 +1065,18 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
 {
   u32 rxt_delivered;
 
+  if (tcp_in_fastrecovery (tc) && tcp_opts_sack_permitted (&tc->rcv_opts))
+    {
+      if (tc->bytes_acked)
+       goto partial_ack;
+      tcp_fast_retransmit (tc);
+      return;
+    }
   /*
    * Duplicate ACK. Check if we should enter fast recovery, or if already in
    * it account for the bytes that left the network.
    */
-  if (is_dack && !tcp_in_recovery (tc))
+  else if (is_dack && !tcp_in_recovery (tc))
     {
       TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1);
       ASSERT (tc->snd_una != tc->snd_una_max
@@ -1128,7 +1137,6 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
            {
              tcp_fast_retransmit_no_sack (tc);
            }
-
          return;
        }
       else if (!tc->bytes_acked
@@ -1237,6 +1245,16 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
   /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */
   if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
     {
+      /* When we entered recovery, we reset snd_nxt to snd_una. Seems peer
+       * still has the data so accept the ack */
+      if (tcp_in_recovery (tc)
+         && seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_congestion)
+         && seq_geq (vnet_buffer (b)->tcp.ack_number, tc->snd_una))
+       {
+         tc->snd_una_max = tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
+         goto process_ack;
+       }
+
       /* If we have outstanding data and this is within the window, accept it,
        * probably retransmit has timed out. Otherwise ACK segment and then
        * drop it */
@@ -1264,9 +1282,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
       TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1,
                   vnet_buffer (b)->tcp.ack_number);
       if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
-       {
-         tcp_cc_handle_event (tc, 1);
-       }
+       tcp_cc_handle_event (tc, 1);
       /* Don't drop yet */
       return 0;
     }
@@ -1274,7 +1290,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
   /*
    * Looks okay, process feedback
    */
-
+process_ack:
   if (tcp_opts_sack_permitted (&tc->rcv_opts))
     tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
 
@@ -1390,6 +1406,15 @@ tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end)
   ASSERT (tcp_sack_vector_is_sane (tc->snd_sacks));
 }
 
+u32				/* Returns the sum of (end - start) over every entry of tc->snd_sacks. */
+tcp_sack_list_bytes (tcp_connection_t * tc)	/* tc: connection whose snd_sacks vector is read; nothing is written through tc here */
+{
+  u32 bytes = 0, i;		/* bytes: running total, returned as 0 when the loop body never runs */
+  for (i = 0; i < vec_len (tc->snd_sacks); i++)	/* visit each entry of the snd_sacks vector in index order */
+    bytes += tc->snd_sacks[i].end - tc->snd_sacks[i].start;	/* span of entry i; presumably start/end are sequence numbers so this is a byte count -- confirm against the sack block type */
+  return bytes;
+}
+
 /** Enqueue data for delivery to application */
 always_inline int
 tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
@@ -1416,6 +1441,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
 
       /* Send ACK confirming the update */
       tc->flags |= TCP_CONN_SNDACK;
+      TCP_EVT_DBG (TCP_EVT_CC_INPUT, tc, data_len, written);
     }
   else if (written > 0)
     {
@@ -1488,6 +1514,7 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b,
          end = start + ooo_segment_length (s0->server_rx_fifo, newest);
          tcp_update_sack_list (tc, start, end);
          svm_fifo_newest_ooo_segment_reset (s0->server_rx_fifo);
+         TCP_EVT_DBG (TCP_EVT_CC_SACKS, tc);
        }
     }
 
@@ -1508,7 +1535,7 @@ tcp_can_delack (tcp_connection_t * tc)
       /* constrained to send ack */
       || (tc->flags & TCP_CONN_SNDACK) != 0
       /* we're almost out of tx wnd */
-      || tcp_available_snd_space (tc) < 4 * tc->snd_mss)
+      || tcp_available_cc_snd_space (tc) < 4 * tc->snd_mss)
     return 0;
 
   return 1;
@@ -1592,7 +1619,7 @@ tcp_segment_rcv (tcp_connection_t * tc, vlib_buffer_t * b, u32 * next0)
       *next0 = tcp_next_output (tc->c_is_ip4);
       tcp_make_ack (tc, b);
       vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_DUPACK;
-      TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc);
+      TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc, vnet_buffer (b)->tcp);
       goto done;
     }
 
@@ -1773,9 +1800,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                                                   &error0)))
            {
              tcp_maybe_inc_err_counter (err_counters, error0);
-             TCP_EVT_DBG (TCP_EVT_SEG_INVALID, tc0,
-                          vnet_buffer (b0)->tcp.seq_number,
-                          vnet_buffer (b0)->tcp.seq_end);
+             TCP_EVT_DBG (TCP_EVT_SEG_INVALID, tc0, vnet_buffer (b0)->tcp);
              goto done;
            }
 
index 0f43d21..a9ec58c 100644 (file)
@@ -41,8 +41,8 @@ newreno_rcv_ack (tcp_connection_t * tc)
       if (tc->cwnd_acc_bytes >= tc->cwnd)
        {
          u32 inc = tc->cwnd_acc_bytes / tc->cwnd;
-         tc->cwnd += inc * tc->snd_mss;
          tc->cwnd_acc_bytes -= inc * tc->cwnd;
+         tc->cwnd += inc * tc->snd_mss;
        }
       tc->cwnd = clib_min (tc->cwnd,
                           transport_tx_fifo_size (&tc->connection));
index 2745065..a036287 100644 (file)
@@ -389,6 +389,7 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts,
     {
     case TCP_STATE_ESTABLISHED:
     case TCP_STATE_FIN_WAIT_1:
+    case TCP_STATE_CLOSED:
       return tcp_make_established_options (tc, opts);
     case TCP_STATE_SYN_RCVD:
       return tcp_make_synack_options (tc, opts);
@@ -1337,8 +1338,9 @@ done:
  * Reset congestion control, switch cwnd to loss window and try again.
  */
 static void
-tcp_rtx_timeout_cc (tcp_connection_t * tc)
+tcp_rxt_timeout_cc (tcp_connection_t * tc)
 {
+  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 6);
   tc->prev_ssthresh = tc->ssthresh;
   tc->prev_cwnd = tc->cwnd;
 
@@ -1383,6 +1385,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
       tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID;
     }
 
+  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
+
   if (tc->state >= TCP_STATE_ESTABLISHED)
     {
       /* Lost FIN, retransmit and return */
@@ -1414,13 +1418,11 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
 
       /* First retransmit timeout */
       if (tc->rto_boff == 1)
-       tcp_rtx_timeout_cc (tc);
+       tcp_rxt_timeout_cc (tc);
 
       tc->snd_una_max = tc->snd_nxt = tc->snd_una;
       tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
 
-      TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
-
       /* Send one segment. Note that n_bytes may be zero due to buffer shortfall  */
       n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
 
@@ -1627,7 +1629,7 @@ void
 tcp_fast_retransmit_sack (tcp_connection_t * tc)
 {
   vlib_main_t *vm = vlib_get_main ();
-  u32 n_written = 0, offset, max_bytes;
+  u32 n_written = 0, offset, max_bytes, n_segs = 0;
   vlib_buffer_t *b = 0;
   sack_scoreboard_hole_t *hole;
   sack_scoreboard_t *sb;
@@ -1636,14 +1638,17 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc)
   u8 snd_limited = 0, can_rescue = 0;
 
   ASSERT (tcp_in_fastrecovery (tc));
-  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
 
   old_snd_nxt = tc->snd_nxt;
   sb = &tc->sack_sb;
-  snd_space = tcp_available_snd_space (tc);
+  snd_space = tcp_available_cc_snd_space (tc);
 
+  if (snd_space < tc->snd_mss)
+    goto done;
+
+  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
   hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
-  while (hole && snd_space > 0)
+  while (hole && snd_space > 0 && n_segs++ < VLIB_FRAME_SIZE)
     {
       hole = scoreboard_next_rxt_hole (sb, hole,
                                       tcp_fastrecovery_sent_1_smss (tc),
@@ -1717,7 +1722,7 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc)
   /* Start resending from first un-acked segment */
   old_snd_nxt = tc->snd_nxt;
   tc->snd_nxt = tc->snd_una;
-  snd_space = tcp_available_snd_space (tc);
+  snd_space = tcp_available_cc_snd_space (tc);
 
   while (snd_space > 0)
     {
@@ -1743,8 +1748,7 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc)
 void
 tcp_fast_retransmit (tcp_connection_t * tc)
 {
-  if (tcp_opts_sack_permitted (&tc->rcv_opts)
-      && scoreboard_first_hole (&tc->sack_sb))
+  if (tcp_opts_sack_permitted (&tc->rcv_opts))
     tcp_fast_retransmit_sack (tc);
   else
     tcp_fast_retransmit_no_sack (tc);