{
   ooo_segment_t *s;
   u32 index, bytes = 0, diff;
+  u32 cursize;
+
+  /* read cursize, which can only increase while we're working */
+  cursize = svm_fifo_max_dequeue (f);
 
   s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head);
 
   /* If tail is adjacent to an ooo segment, 'consume' it */
   if (diff == 0)
     {
-      bytes = ((f->nitems - f->cursize) >= s->length) ? s->length :
-       f->nitems - f->cursize;
+      bytes = ((f->nitems - cursize) >= s->length) ? s->length :
+       f->nitems - cursize;
 
       f->tail += bytes;
       f->tail %= f->nitems;
   u32 total_copy_bytes, first_copy_bytes, second_copy_bytes;
   u32 cursize, nitems;
 
-  if (PREDICT_FALSE (f->cursize == f->nitems))
+  /* read cursize, which can only increase while we're working */
+  cursize = svm_fifo_max_dequeue (f);
+
+  if (PREDICT_FALSE (cursize == f->nitems))
     return -2;                 /* fifo stuffed */
 
-  /* read cursize, which can only decrease while we're working */
-  cursize = f->cursize;
   nitems = f->nitems;
 
   /* Number of bytes we're going to copy */
 
   ASSERT (offset > 0);
 
-  /* read cursize, which can only decrease while we're working */
-  cursize = f->cursize;
+  /* read cursize, which can only increase while we're working */
+  cursize = svm_fifo_max_dequeue (f);
   nitems = f->nitems;
 
   /* Will this request fit? */
   u32 total_copy_bytes, first_copy_bytes, second_copy_bytes;
   u32 cursize, nitems;
 
-  if (PREDICT_FALSE (f->cursize == 0))
+  /* read cursize, which can only increase while we're working */
+  cursize = svm_fifo_max_dequeue (f);
+  if (PREDICT_FALSE (cursize == 0))
     return -2;                 /* nothing in the fifo */
 
-  /* read cursize, which can only increase while we're working */
-  cursize = f->cursize;
   nitems = f->nitems;
 
   /* Number of bytes we're going to copy */
   u32 total_copy_bytes, first_copy_bytes, second_copy_bytes;
   u32 cursize, nitems, real_head;
 
-  if (PREDICT_FALSE (f->cursize == 0))
+  /* read cursize, which can only increase while we're working */
+  cursize = svm_fifo_max_dequeue (f);
+  if (PREDICT_FALSE (cursize == 0))
     return -2;                 /* nothing in the fifo */
 
-  /* read cursize, which can only increase while we're working */
-  cursize = f->cursize;
   nitems = f->nitems;
   real_head = f->head + offset;
   real_head = real_head >= nitems ? real_head - nitems : real_head;
   u32 total_drop_bytes, first_drop_bytes, second_drop_bytes;
   u32 cursize, nitems;
 
-  if (PREDICT_FALSE (f->cursize == 0))
+  /* read cursize, which can only increase while we're working */
+  cursize = svm_fifo_max_dequeue (f);
+  if (PREDICT_FALSE (cursize == 0))
     return -2;                 /* nothing in the fifo */
 
-  /* read cursize, which can only increase while we're working */
-  cursize = f->cursize;
   nitems = f->nitems;
 
   /* Number of bytes we're going to drop */
 
 
 typedef struct
 {
+  volatile u32 cursize;                /**< current fifo size */
+  u32 nitems;
+    CLIB_CACHE_LINE_ALIGN_MARK (end_cursize);
+
   pthread_mutex_t mutex;       /* 8 bytes */
   pthread_cond_t condvar;      /* 8 bytes */
   svm_lock_tag_t tag;
 
-  volatile u32 cursize;                /**< current fifo size */
   volatile u8 has_event;       /**< non-zero if deq event exists */
   u32 owner_pid;
-  u32 nitems;
 
   /* Backpointers */
   u32 server_session_index;
 static inline u32
 svm_fifo_max_enqueue (svm_fifo_t * f)
 {
-  return f->nitems - f->cursize;
+  return f->nitems - svm_fifo_max_dequeue (f);
 }
 
 static inline u8
 
 
   /* Nothing to read return */
   if (max_dequeue0 == 0)
-    {
-      return 0;
-    }
+    return 0;
 
   /* Ensure we're not writing more than transport window allows */
-  max_len_to_snd0 = clib_min (max_dequeue0, snd_space0);
-
-  /* TODO check if transport is willing to send len_to_snd0
-   * bytes (Nagle) */
+  if (max_dequeue0 < snd_space0)
+    {
+      /* Constrained by tx queue. Try to send only fully formed segments */
+      max_len_to_snd0 = (max_dequeue0 > snd_mss0) ?
+       max_dequeue0 - max_dequeue0 % snd_mss0 : max_dequeue0;
+      /* TODO Nagle ? */
+    }
+  else
+    {
+      max_len_to_snd0 = snd_space0;
+    }
 
   n_frame_bytes = snd_mss0 * VLIB_FRAME_SIZE;
   n_frames_per_evt = ceil ((double) max_len_to_snd0 / n_frame_bytes);
   int n_tx_packets = 0;
   u32 my_thread_index = vm->cpu_index;
   int i, rv;
+  f64 now = vlib_time_now (vm);
+
+  SESSION_EVT_DBG (SESSION_EVT_POLL_GAP_TRACK, smm, my_thread_index);
 
   /*
    *  Update TCP time
    */
-  tcp_update_time (vlib_time_now (vm), my_thread_index);
+  tcp_update_time (now, my_thread_index);
 
   /*
    * Get vpp queue events
 
                                        u8 * added_a_segment)
 {
   svm_fifo_segment_private_t *fifo_segment;
-  u32 fifo_size, default_fifo_size = 128 << 10;        /* TODO config */
+  u32 fifo_size, default_fifo_size = 1 << 16;  /* TODO config */
   int i;
 
   *added_a_segment = 0;
   vec_validate (smm->current_enqueue_epoch, num_threads - 1);
   vec_validate (smm->vpp_event_queues, num_threads - 1);
 
+#if SESSION_DBG
+  vec_validate (smm->last_event_poll_by_thread, num_threads - 1);
+#endif
+
   /* $$$$ preallocate hack config parameter */
   for (i = 0; i < 200000; i++)
     {
 
 #include <vlibmemory/api.h>
 #include <vppinfra/sparse_vec.h>
 #include <svm/svm_fifo_segment.h>
+#include <vnet/session/session_debug.h>
 
 #define HALF_OPEN_LOOKUP_INVALID_VALUE ((u64)~0)
 #define INVALID_INDEX ((u32)~0)
   FIFO_EVENT_BUILTIN_RX
 } fifo_event_type_t;
 
-#define foreach_session_input_error                                         \
+#define foreach_session_input_error                                            \
 _(NO_SESSION, "No session drops")                                       \
 _(NO_LISTENER, "No listener for dst port drops")                        \
 _(ENQUEUED, "Packets pushed into rx fifo")                              \
   /* Convenience */
   vlib_main_t *vlib_main;
   vnet_main_t *vnet_main;
+
+#if SESSION_DBG
+  /**
+   * last event poll time by thread
+   * Debug only. Will cause false cache-line sharing as-is
+   */
+  f64 *last_event_poll_by_thread;
+#endif
+
 };
 
 extern session_manager_main_t session_manager_main;
 
 #define SRC_VNET_SESSION_SESSION_DEBUG_H_
 
 #include <vnet/session/transport.h>
-#include <vnet/session/session.h>
 #include <vlib/vlib.h>
 
 #define foreach_session_dbg_evt                \
   _(ENQ, "enqueue")                    \
   _(DEQ, "dequeue")                    \
-  _(DEQ_NODE, "dequeue")
+  _(DEQ_NODE, "dequeue")               \
+  _(POLL_GAP_TRACK, "poll gap track")  \
 
 typedef enum _session_evt_dbg
 {
 
 #define SESSION_DBG (0)
 #define SESSION_DEQ_NODE_EVTS (0)
+#define SESSION_EVT_POLL_DBG (1)
 
 #if TRANSPORT_DEBUG && SESSION_DBG
 
 #define SESSION_EVT_DEQ_NODE_HANDLER(_node_evt)
 #endif
 
+#if SESSION_DBG && SESSION_EVT_POLL_DBG
+#define SESSION_EVT_POLL_GAP(_smm, _my_thread_index)                   \
+{                                                                      \
+  ELOG_TYPE_DECLARE (_e) =                                             \
+  {                                                                    \
+    .format = "nixon-gap: %d MS",                                      \
+    .format_args = "i4",                                               \
+  };                                                                   \
+  DEC_SESSION_ED(_e, 1);                                               \
+  ed->data[0] =        (u32) ((now -                                           \
+    _smm->last_event_poll_by_thread[my_thread_index])*1000.0);         \
+}
+#define SESSION_EVT_POLL_GAP_TRACK_HANDLER(_smm, _my_thread_index)     \
+{                                                                      \
+  if (PREDICT_TRUE(                                                    \
+             smm->last_event_poll_by_thread[my_thread_index] != 0.0))  \
+    if (now > smm->last_event_poll_by_thread[_my_thread_index] + 500e-6)\
+       SESSION_EVT_POLL_GAP(smm, my_thread_index);                     \
+  _smm->last_event_poll_by_thread[my_thread_index] = now;              \
+}
+
+#else
+#define SESSION_EVT_POLL_GAP(_smm, _my_thread_index)
+#define SESSION_EVT_POLL_GAP_TRACK_HANDLER(_smm, _my_thread_index)
+#endif
+
 #define CONCAT_HELPER(_a, _b) _a##_b
 #define CC(_a, _b) CONCAT_HELPER(_a, _b)
-
 #define SESSION_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args)
 
 #else
 
 #include <vpp/api/vpe_all_api_h.h>
 #undef vl_printfun
 
+#define TCP_BUILTIN_CLIENT_DBG (1)
+#define TCP_BUILTIN_CLIENT_VPP_THREAD (0)
+#define TCP_BUILTIN_CLIENT_PTHREAD (!TCP_BUILTIN_CLIENT_VPP_THREAD)
+
 static void
 send_test_chunk (tclient_main_t * tm, session_t * s)
 {
   session_fifo_event_t evt;
   static int serial_number = 0;
   int rv;
+  test_buf_offset = s->bytes_sent % vec_len (test_data);
+  bytes_this_chunk = vec_len (test_data) - test_buf_offset;
 
-  while (s->bytes_to_send > 0)
-    {
+  bytes_this_chunk = bytes_this_chunk < s->bytes_to_send
+    ? bytes_this_chunk : s->bytes_to_send;
 
-      test_buf_offset = s->bytes_sent % vec_len (test_data);
-      bytes_this_chunk = vec_len (test_data) - test_buf_offset;
+  rv = svm_fifo_enqueue_nowait (s->server_tx_fifo, 0 /*pid */ ,
+                               bytes_this_chunk,
+                               test_data + test_buf_offset);
 
-      bytes_this_chunk = bytes_this_chunk < s->bytes_to_send
-       ? bytes_this_chunk : s->bytes_to_send;
+  /* If we managed to enqueue data... */
+  if (rv > 0)
+    {
+      if (TCP_BUILTIN_CLIENT_DBG)
+       {
+          /* *INDENT-OFF* */
+          ELOG_TYPE_DECLARE (e) =
+            {
+              .format = "tx-enq: %d bytes",
+              .format_args = "i4",
+            };
+          /* *INDENT-ON* */
+         struct
+         {
+           u32 data[1];
+         } *ed;
+         ed = ELOG_DATA (&vlib_global_main.elog_main, e);
+         ed->data[0] = rv;
+       }
 
-      rv = svm_fifo_enqueue_nowait (s->server_tx_fifo, 0 /*pid */ ,
-                                   bytes_this_chunk,
-                                   test_data + test_buf_offset);
+      /* Account for it... */
+      s->bytes_to_send -= rv;
+      s->bytes_sent += rv;
 
-      if (rv > 0)
+      /* Poke the TCP state machine */
+      if (svm_fifo_set_event (s->server_tx_fifo))
        {
-         s->bytes_to_send -= rv;
-         s->bytes_sent += rv;
+         /* Fabricate TX event, send to vpp */
+         evt.fifo = s->server_tx_fifo;
+         evt.event_type = FIFO_EVENT_SERVER_TX;
+         evt.event_id = serial_number++;
 
-         if (svm_fifo_set_event (s->server_tx_fifo))
-           {
-             /* Fabricate TX event, send to vpp */
-             evt.fifo = s->server_tx_fifo;
-             evt.event_type = FIFO_EVENT_SERVER_TX;
-             evt.event_id = serial_number++;
-
-             unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt,
-                                           0 /* do wait for mutex */ );
-           }
+         unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt,
+                                       0 /* do wait for mutex */ );
        }
     }
 }
 receive_test_chunk (tclient_main_t * tm, session_t * s)
 {
   svm_fifo_t *rx_fifo = s->server_rx_fifo;
-  int n_read, bytes, i;
+  int n_read, test_bytes = 0;
 
-  bytes = svm_fifo_max_dequeue (rx_fifo);
   /* Allow enqueuing of new event */
-  svm_fifo_unset_event (rx_fifo);
+  // svm_fifo_unset_event (rx_fifo);
 
-  /* Read the bytes */
-  do
+  n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (tm->rx_buf),
+                                   tm->rx_buf);
+  if (n_read > 0)
     {
-      n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (tm->rx_buf),
-                                       tm->rx_buf);
-      if (n_read > 0)
+      if (TCP_BUILTIN_CLIENT_DBG)
        {
-         bytes -= n_read;
+          /* *INDENT-OFF* */
+          ELOG_TYPE_DECLARE (e) =
+            {
+              .format = "rx-deq: %d bytes",
+              .format_args = "i4",
+            };
+          /* *INDENT-ON* */
+         struct
+         {
+           u32 data[1];
+         } *ed;
+         ed = ELOG_DATA (&vlib_global_main.elog_main, e);
+         ed->data[0] = n_read;
+       }
+
+      if (test_bytes)
+       {
+         int i;
          for (i = 0; i < n_read; i++)
            {
              if (tm->rx_buf[i] != ((s->bytes_received + i) & 0xff))
                {
                  clib_warning ("read %d error at byte %lld, 0x%x not 0x%x",
-                               n_read, s->bytes_received + i,
-                               tm->rx_buf[i],
+                               n_read, s->bytes_received + i, tm->rx_buf[i],
                                ((s->bytes_received + i) & 0xff));
                }
            }
-         s->bytes_to_receive -= n_read;
-         s->bytes_received += n_read;
        }
-
+      s->bytes_to_receive -= n_read;
+      s->bytes_received += n_read;
     }
-  while (n_read < 0 || bytes > 0);
 }
 
+#if TCP_BUILTIN_CLIENT_VPP_THREAD
+static void
+#else
 static void *
+#endif
 tclient_thread_fn (void *arg)
 {
   tclient_main_t *tm = &tclient_main;
     pthread_sigmask (SIG_SETMASK, &s, 0);
   }
 
+  clib_per_cpu_mheaps[os_get_cpu_number ()] = clib_per_cpu_mheaps[0];
+
   while (1)
     {
       /* Wait until we're told to get busy */
 
       /* Disconnect sessions... */
       vec_reset_length (session_indices);
-      pool_foreach (sp, tm->sessions, (
-                                       {
-                                       vec_add1 (session_indices,
-                                                 sp - tm->sessions);
-                                       }
-                   ));
+
+      /* *INDENT-OFF* */
+      pool_foreach (sp, tm->sessions, ({
+       vec_add1 (session_indices, sp - tm->sessions);
+      }));
+      /* *INDENT-ON* */
 
       for (i = 0; i < vec_len (session_indices); i++)
        {
        }
     }
   /* NOTREACHED */
+#if TCP_BUILTIN_CLIENT_PTHREAD
   return 0;
+#endif
 }
 
 /* So we don't get "no handler for... " msgs */
                             unformat_input_t * input,
                             vlib_cli_command_t * cmd)
 {
-  u8 *connect_uri = (u8 *) "tcp://6.0.1.2/1234";
+  u8 *connect_uri = (u8 *) "tcp://6.0.1.1/1234";
   u8 *uri;
   tclient_main_t *tm = &tclient_main;
   int i;
        ;
       else if (unformat (input, "iterations %d", &tm->n_iterations))
        ;
-      else if (unformat (input, "bytes %d", &tm->bytes_to_send))
+      else if (unformat (input, "bytes %lld", &tm->bytes_to_send))
        ;
       else if (unformat (input, "uri %s", &tm->connect_uri))
        ;
 
   create_api_loopback (tm);
 
+#if TCP_BUILTIN_CLIENT_PTHREAD
   /* Start a transmit thread */
   if (tm->client_thread_handle == 0)
     {
       int rv = pthread_create (&tm->client_thread_handle,
-                              NULL /*attr */ , tclient_thread_fn, 0);
+                              NULL /*attr */ ,
+                              tclient_thread_fn, 0);
       if (rv)
        {
          tm->client_thread_handle = 0;
          return clib_error_return (0, "pthread_create returned %d", rv);
        }
     }
+#endif
 
   /* Fire off connect requests, in something approaching a normal manner */
   for (i = 0; i < n_clients; i++)
   return 0;
 }
 
+#if TCP_BUILTIN_CLIENT_VPP_THREAD
+/* *INDENT-OFF* */
+VLIB_REGISTER_THREAD (builtin_client_reg, static) = {
+  .name = "tcp-builtin-client",
+  .function = tclient_thread_fn,
+  .fixed_count = 1,
+  .count = 1,
+  .no_data_structure_clone = 1,
+};
+/* *INDENT-ON* */
+#endif
+
 /* *INDENT-OFF* */
 VLIB_CLI_COMMAND (test_clients_command, static) =
 {
 
     {
       /* XXX timeout for session that are stuck */
 
+    rx_event:
       /* Program self-tap to retry */
       if (svm_fifo_set_event (rx_fifo))
        {
 
   n_written =
     svm_fifo_enqueue_nowait (tx_fifo, 0, actual_transfer, bsm->rx_buf);
-  ASSERT (n_written == max_transfer);
+
+  if (n_written != max_transfer)
+    clib_warning ("short trout!");
 
   if (svm_fifo_set_event (tx_fifo))
     {
                                    (u8 *) & evt, 0 /* do wait for mutex */ );
     }
 
+  if (PREDICT_FALSE (max_enqueue < max_dequeue))
+    goto rx_event;
+
   return 0;
 }
 
   a->session_cb_vft = &builtin_session_cb_vft;
   a->options = options;
   a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 128 << 20;
-  a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 64 << 10;
-  a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 64 << 10;
+  a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 1 << 16;
+  a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 1 << 16;
   a->segment_name = segment_name;
   a->segment_name_length = ARRAY_LEN (segment_name);
 
 
       /* If we can't write at least a segment, don't try at all */
       if (snd_space < tc->snd_mss)
        return 0;
-      return snd_space;
+
+      /* round down to mss multiple */
+      return snd_space - (snd_space % tc->snd_mss);
     }
 
   /* If in fast recovery, send 1 SMSS if wnd allows */
 {
     tcp_timer_retransmit_handler,
     tcp_timer_delack_handler,
-    0,
+    tcp_timer_persist_handler,
     tcp_timer_keep_handler,
     tcp_timer_waitclose_handler,
     tcp_timer_retransmit_syn_handler,
 
 
 extern timer_expiration_handler tcp_timer_delack_handler;
 extern timer_expiration_handler tcp_timer_retransmit_handler;
+extern timer_expiration_handler tcp_timer_persist_handler;
 extern timer_expiration_handler tcp_timer_retransmit_syn_handler;
 
 #define TCP_TIMER_HANDLE_INVALID ((u32) ~0)
 
 #define tcp_fastrecovery_on(tc) (tc)->flags |= TCP_CONN_FAST_RECOVERY
 #define tcp_fastrecovery_off(tc) (tc)->flags &= ~TCP_CONN_FAST_RECOVERY
+#define tcp_recovery_on(tc) (tc)->flags |= TCP_CONN_RECOVERY
+#define tcp_recovery_off(tc) (tc)->flags &= ~TCP_CONN_RECOVERY
 #define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY)
-#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY))
+#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_RECOVERY))
 #define tcp_in_slowstart(tc) (tc->cwnd < tc->ssthresh)
 #define tcp_fastrecovery_sent_1_smss(tc) ((tc)->flags & TCP_CONN_FR_1_SMSS)
 #define tcp_fastrecovery_1_smss_on(tc) ((tc)->flags |= TCP_CONN_FR_1_SMSS)
 #define tcp_fastrecovery_1_smss_off(tc) ((tc)->flags &= ~TCP_CONN_FR_1_SMSS)
 
+#define tcp_in_cong_recovery(tc) ((tc)->flags &                \
+         (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY))
+
+always_inline void
+tcp_cong_recovery_off (tcp_connection_t * tc)
+{
+  tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY);
+  tcp_fastrecovery_1_smss_off (tc);
+}
+
 typedef enum
 {
   TCP_IP4,
   tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT);
 }
 
+always_inline void
+tcp_persist_timer_set (tcp_connection_t * tc)
+{
+  /* Reuse RTO. It's backed off in handler */
+  tcp_timer_set (tc, TCP_TIMER_PERSIST,
+                clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
+}
+
+always_inline void
+tcp_persist_timer_update (tcp_connection_t * tc)
+{
+  tcp_timer_update (tc, TCP_TIMER_PERSIST,
+                   clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
+}
+
+always_inline void
+tcp_persist_timer_reset (tcp_connection_t * tc)
+{
+  tcp_timer_reset (tc, TCP_TIMER_PERSIST);
+}
+
 always_inline u8
 tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer)
 {
 
   _(UNBIND, "unbind")                  \
   _(DELETE, "delete")                  \
   _(SYN_SENT, "SYN sent")              \
+  _(SYN_RTX, "SYN retransmit")         \
   _(FIN_SENT, "FIN sent")              \
   _(ACK_SENT, "ACK sent")              \
   _(DUPACK_SENT, "DUPACK sent")                \
   _(CC_PACK, "cc partial ack")         \
   _(SEG_INVALID, "invalid segment")    \
   _(ACK_RCV_ERR, "invalid ack")                \
+  _(RCV_WND_SHRUNK, "shrunk rcv_wnd")  \
 
 typedef enum _tcp_dbg
 {
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "ack_prep: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u",   \
-    .format_args = "i4i4i4i4",                                         \
+    .format = "ack_tx: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u snd_wnd %u",\
+    .format_args = "i4i4i4i4i4",                                       \
   };                                                                   \
-  DECLARE_ETD(_tc, _e, 4);                                             \
+  DECLARE_ETD(_tc, _e, 5);                                             \
   ed->data[0] = _tc->rcv_nxt - _tc->rcv_las;                           \
   ed->data[1] = _tc->rcv_nxt - _tc->irs;                               \
   ed->data[2] = _tc->rcv_wnd;                                          \
   ed->data[3] = _tc->snd_nxt - _tc->iss;                               \
+  ed->data[4] = _tc->snd_wnd;                                          \
 }
 
 #define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...)                          \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "dack_tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av-wnd %u",   \
-    .format_args = "i4i4i4i4",                                         \
+    .format = "dack_tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av_wnd %u snd_wnd %u",\
+    .format_args = "i4i4i4i4i4",                                       \
   };                                                                   \
-  DECLARE_ETD(_tc, _e, 4);                                             \
+  DECLARE_ETD(_tc, _e, 5);                                             \
   ed->data[0] = _tc->rcv_nxt - _tc->irs;                               \
   ed->data[1] = _tc->rcv_wnd;                                          \
   ed->data[2] = _tc->snd_nxt - _tc->iss;                               \
   ed->data[3] = tcp_available_wnd(_tc);                                        \
+  ed->data[4] = _tc->snd_wnd;                                          \
 }
 
 #define TCP_EVT_SYN_SENT_HANDLER(_tc, ...)                             \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "SYNtx: iss %u",                                 \
+    .format = "SYNtx: iss %u",                                         \
+    .format_args = "i4",                                               \
+  };                                                                   \
+  DECLARE_ETD(_tc, _e, 1);                                             \
+  ed->data[0] = _tc->iss;                                              \
+}
+
+#define TCP_EVT_SYN_RTX_HANDLER(_tc, ...)                              \
+{                                                                      \
+  ELOG_TYPE_DECLARE (_e) =                                             \
+  {                                                                    \
+    .format = "SYNrtx: iss %u",                                                \
     .format_args = "i4",                                               \
   };                                                                   \
   DECLARE_ETD(_tc, _e, 1);                                             \
   ed->data[1] = _tc->rcv_nxt - _tc->irs;                               \
 }
 
-#define TCP_EVT_ACK_RCVD_HANDLER(_tc, _ack, ...)                       \
+#define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...)                             \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "acked: %u snd_una %u ack %u cwnd %u inflight %u",       \
+    .format = "acked: %u snd_una %u snd_wnd %u cwnd %u inflight %u",   \
     .format_args = "i4i4i4i4i4",                                       \
   };                                                                   \
   DECLARE_ETD(_tc, _e, 5);                                             \
   ed->data[0] = _tc->bytes_acked;                                      \
   ed->data[1] = _tc->snd_una - _tc->iss;                               \
-  ed->data[2] = _ack - _tc->iss;                                       \
+  ed->data[2] = _tc->snd_wnd;                                          \
   ed->data[3] = _tc->cwnd;                                             \
   ed->data[4] = tcp_flight_size(_tc);                                  \
 }
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "dack_rx: snd_una %u cwnd %u snd_wnd %u inflight %u",    \
-    .format_args = "i4i4i4i4",                                         \
+    .format = "dack_rx: snd_una %u cwnd %u snd_wnd %u flight %u rcv_wnd %u",\
+    .format_args = "i4i4i4i4i4",                                       \
   };                                                                   \
-  DECLARE_ETD(_tc, _e, 4);                                             \
+  DECLARE_ETD(_tc, _e, 5);                                             \
   ed->data[0] = _tc->snd_una - _tc->iss;                               \
   ed->data[1] = _tc->cwnd;                                             \
   ed->data[2] = _tc->snd_wnd;                                          \
   ed->data[3] = tcp_flight_size(_tc);                                  \
+  ed->data[4] = _tc->rcv_wnd;                                          \
 }
 
 #define TCP_EVT_PKTIZE_HANDLER(_tc, ...)                               \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "in: %s len %u written %d rcv_nxt %u free wnd %d",       \
+    .format = "in: %s len %u written %d rcv_nxt %u rcv_wnd(o) %d",     \
     .format_args = "t4i4i4i4i4",                                       \
     .n_enum_strings = 2,                                               \
     .enum_strings = {                                                  \
     .enum_strings = {                                                  \
       "retransmit",                                                    \
       "delack",                                                        \
-      "BUG",                                                           \
+      "persist",                                                       \
       "keep",                                                          \
       "waitclose",                                                     \
       "retransmit syn",                                                \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u wnd %u",   \
+    .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u rcv_wnd %u",\
     .format_args = "i4i4i4i4i4",                                       \
   };                                                                   \
   DECLARE_ETD(_tc, _e, 5);                                             \
 #define TCP_EVT_CC_PACK_HANDLER(_tc, ...)
 #endif
 
+#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...)            \
+{                                                                      \
+if (_av > 0)                                                           \
+{                                                                      \
+  ELOG_TYPE_DECLARE (_e) =                                             \
+  {                                                                    \
+    .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u",  \
+    .format_args = "i4i4i4i4i4",                                       \
+  };                                                                   \
+  DECLARE_ETD(_tc, _e, 5);                                             \
+  ed->data[0] = _tc->rcv_wnd;                                          \
+  ed->data[1] = _obs;                                                  \
+  ed->data[2] = _av;                                                   \
+  ed->data[3] = _tc->rcv_nxt - _tc->irs;                               \
+  ed->data[4] = _tc->rcv_las - _tc->irs;                               \
+}                                                                      \
+}
+
 #if TCP_DBG_VERBOSE
 #define TCP_EVT_SND_WND_HANDLER(_tc, ...)                              \
 {                                                                      \
 
       if (tc0->rcv_wnd == 0
          && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number)
        {
-         /* Make it look as if there's nothing to dequeue */
-         vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number;
+         /* TODO Should segment be tagged?  */
        }
       else
        {
   if (tc->rtt_seq && seq_gt (ack, tc->rtt_seq) && !tc->rto_boff)
     {
       mrtt = tcp_time_now () - tc->rtt_ts;
-      tc->rtt_seq = 0;
     }
 
   /* As per RFC7323 TSecr can be used for RTTM only if the segment advances
 
   tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
 
+  /* Allow measuring of RTT and make sure boff is 0 */
+  tc->rtt_seq = 0;
+  tc->rto_boff = 0;
+
   return 1;
 }
 
   stream_session_dequeue_drop (&tc->connection, tc->bytes_acked);
 
   /* Update rtt and rto */
-  if (tcp_update_rtt (tc, ack))
-    {
-      /* Good ACK received and valid RTT, make sure retransmit backoff is 0 */
-      tc->rto_boff = 0;
-    }
+  tcp_update_rtt (tc, ack);
 }
 
 /**
       tc->snd_wl1 = seq;
       tc->snd_wl2 = ack;
       TCP_EVT_DBG (TCP_EVT_SND_WND, tc);
+
+      /* Set probe timer if we just got 0 wnd */
+      if (tc->snd_wnd < tc->snd_mss
+         && !tcp_timer_is_active (tc, TCP_TIMER_PERSIST))
+       tcp_persist_timer_set (tc);
+      else
+       tcp_persist_timer_reset (tc);
     }
 }
 
 void
 tcp_cc_recover (tcp_connection_t * tc)
 {
+  /* TODO: check if time to recover was small. It might be that RTO popped
+   * too soon.
+   */
+
   tc->cc_algo->recovered (tc);
 
   tc->rtx_bytes = 0;
   tc->cc_algo->rcv_ack (tc);
   tc->tsecr_last_ack = tc->opt.tsecr;
 
-  tcp_fastrecovery_1_smss_off (tc);
-  tcp_fastrecovery_off (tc);
+  tcp_cong_recovery_off (tc);
 
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
 }
 {
   u8 partial_ack;
 
-  if (tcp_in_recovery (tc))
+  if (tcp_in_cong_recovery (tc))
     {
       partial_ack = seq_lt (tc->snd_una, tc->snd_congestion);
       if (!partial_ack)
 
          /* In case snd_nxt is still in the past and output tries to
           * shove some new bytes */
-         tc->snd_nxt = tc->snd_una;
+         tc->snd_nxt = tc->snd_una_max;
 
          /* XXX need proper RFC6675 support */
-         if (tc->sack_sb.last_sacked_bytes)
+         if (tc->sack_sb.last_sacked_bytes && !tcp_in_recovery (tc))
            {
              tcp_fast_retransmit (tc);
            }
            {
              /* Retransmit first unacked segment */
              tcp_retransmit_first_unacked (tc);
-             /* If window allows, send 1 SMSS of new data */
-             if (seq_lt (tc->snd_nxt, tc->snd_congestion))
-               tc->snd_nxt = tc->snd_congestion;
            }
        }
     }
          return -1;
        }
 
-      tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
-      *error = TCP_ERROR_ACK_FUTURE;
       TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 2,
                   vnet_buffer (b)->tcp.ack_number);
+
+      tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
+      *error = TCP_ERROR_ACK_FUTURE;
     }
 
   /* If old ACK, probably it's an old dupack */
    * timer. */
   if (tc->bytes_acked)
     {
-      TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc, vnet_buffer (b)->tcp.ack_number);
+      TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
 
       /* Updates congestion control (slow start/congestion avoidance) */
       tcp_cc_rcv_ack (tc, b);
       tc->rcv_nxt += written;
 
       /* Depending on how fast the app is, all remaining buffers in burst will
-       * not be enqueued. Should we inform peer of the damage? XXX */
+       * not be enqueued. Inform peer */
+      tc->flags |= TCP_CONN_SNDACK;
+
       return TCP_ERROR_PARTIALLY_ENQUEUED;
     }
   else
     {
+      tc->flags |= TCP_CONN_SNDACK;
       return TCP_ERROR_FIFO_FULL;
     }
 
       goto done;
     }
 
-  if (PREDICT_FALSE (error == TCP_ERROR_FIFO_FULL))
-    *next0 = TCP_NEXT_DROP;
-
   /* Check if ACK can be delayed */
-  if (!tcp_can_delack (tc))
-    {
-      /* Nothing to do for pure ACKs XXX */
-      if (n_data_bytes == 0)
-       goto done;
-
-      *next0 = tcp_next_output (tc->c_is_ip4);
-      tcp_make_ack (tc, b);
-    }
-  else
+  if (tcp_can_delack (tc))
     {
       if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK))
        tcp_timer_set (tc, TCP_TIMER_DELACK, TCP_DELACK_TIME);
+      goto done;
     }
 
+  *next0 = tcp_next_output (tc->c_is_ip4);
+  tcp_make_ack (tc, b);
+
 done:
   return error;
 }
 
          child0->irs = vnet_buffer (b0)->tcp.seq_number;
          child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1;
+         child0->rcv_las = child0->rcv_nxt;
          child0->state = TCP_STATE_SYN_RCVD;
 
          /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK}
 
   max_fifo = stream_session_fifo_size (&tc->connection);
 
   ASSERT (tc->opt.mss < max_fifo);
-
-  if (available_space < tc->opt.mss && available_space < max_fifo / 8)
+  if (available_space < tc->opt.mss && available_space < max_fifo >> 3)
     available_space = 0;
 
   /*
   /* Bad. Thou shalt not shrink */
   if (available_space < observed_wnd)
     {
-      /* Does happen! */
       wnd = observed_wnd;
+      TCP_EVT_DBG (TCP_EVT_RCV_WND_SHRUNK, tc, observed_wnd, available_space);
     }
   else
     {
       wnd = available_space;
     }
 
-  if (wnd && ((wnd << tc->rcv_wscale) >> tc->rcv_wscale != wnd))
-    wnd += 1 << tc->rcv_wscale;
+  /* Make sure we have a multiple of rcv_wscale */
+  if (wnd && tc->rcv_wscale)
+    {
+      wnd &= ~(1 << tc->rcv_wscale);
+      if (wnd == 0)
+       wnd = 1 << tc->rcv_wscale;
+    }
 
   tc->rcv_wnd = clib_min (wnd, TCP_WND_MAX << tc->rcv_wscale);
 }
 
   tcp_reuse_buffer (vm, b);
   tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK);
-  vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK;
   TCP_EVT_DBG (TCP_EVT_ACK_SENT, tc);
+  vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK;
+  tc->rcv_las = tc->rcv_nxt;
 }
 
 /**
   vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
 
   tc->snd_nxt += data_len;
+
   /* TODO this is updated in output as well ... */
   if (tc->snd_nxt > tc->snd_una_max)
     tc->snd_una_max = tc->snd_nxt;
 
   /* Fill in the ACK */
   tcp_make_ack (tc, b);
-
   tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
 }
 
 
   tc = tcp_connection_get (index, thread_index);
   tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID;
-//  tc->flags &= ~TCP_CONN_DELACK;
   tcp_send_ack (tc);
 }
 
  * Reset congestion control, switch cwnd to loss window and try again.
  */
 static void
-tcp_rtx_timeout_cc_recover (tcp_connection_t * tc)
+tcp_rtx_timeout_cc (tcp_connection_t * tc)
 {
   /* Cleanly recover cc (also clears up fast retransmit) */
   if (tcp_in_fastrecovery (tc))
     }
 
   /* Start again from the beginning */
+  tcp_recovery_on (tc);
   tc->cwnd = tcp_loss_wnd (tc);
   tc->snd_congestion = tc->snd_una_max;
 }
     {
       /* First retransmit timeout */
       if (tc->rto_boff == 1)
-       tcp_rtx_timeout_cc_recover (tc);
+       tcp_rtx_timeout_cc (tc);
 
       /* Exponential backoff */
       tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
     {
       ASSERT (tc->state == TCP_STATE_SYN_SENT);
 
+      TCP_EVT_DBG (TCP_EVT_SYN_RTX, tc);
+
       /* This goes straight to ipx_lookup */
       tcp_push_ip_hdr (tm, tc, b);
       tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4);
   tcp_timer_retransmit_handler_i (index, 1);
 }
 
+/**
+ * Got 0 snd_wnd from peer, try to do something about it.
+ *
+ */
+void
+tcp_timer_persist_handler (u32 index)
+{
+  tcp_main_t *tm = vnet_get_tcp_main ();
+  vlib_main_t *vm = vlib_get_main ();
+  u32 thread_index = os_get_cpu_number ();
+  tcp_connection_t *tc;
+  vlib_buffer_t *b;
+  u32 bi, n_bytes;
+
+  tc = tcp_connection_get (index, thread_index);
+
+  /* Make sure timer handle is set to invalid */
+  tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID;
+
+  /* Problem already solved or worse */
+  if (tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc))
+    return;
+
+  /* Increment RTO backoff */
+  tc->rto_boff += 1;
+  tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+
+  /* Try to force the first unsent segment  */
+  tcp_get_free_buffer_index (tm, &bi);
+  b = vlib_get_buffer (vm, bi);
+  n_bytes = stream_session_peek_bytes (&tc->connection,
+                                      vlib_buffer_get_current (b),
+                                      tc->snd_una_max - tc->snd_una,
+                                      tc->snd_mss);
+  /* Nothing to send */
+  if (n_bytes == 0)
+    {
+      tcp_return_buffer (tm);
+      return;
+    }
+
+  b->current_length = n_bytes;
+  tcp_push_hdr_i (tc, b, tc->state);
+  tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+
+  /* Re-enable persist timer */
+  tcp_persist_timer_set (tc);
+}
+
 /**
  * Retransmit first unacked segment
  */
                }
            }
 
-         /* Retransmitted SYNs do reach this but it should be harmless */
-         tc0->rcv_las = tc0->rcv_nxt;
-
          /* Stop DELACK timer and fix flags */
          tc0->flags &= ~(TCP_CONN_SNDACK);
          if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK))