Improvements to tcp rx path and debugging 11/7611/9
author Florin Coras <fcoras@cisco.com>
Tue, 18 Jul 2017 09:38:03 +0000 (05:38 -0400)
committer Florin Coras <fcoras@cisco.com>
Fri, 21 Jul 2017 23:20:09 +0000 (19:20 -0400)
- Increment rcv_nxt for fin packets
- Call tcp_segment_rcv only if buffer has data
- Parse rcv opts before deleting half-open connection
- Fix initial rcv_wnd
- Improved event logging

Change-Id: I9b83c04f432c4cec832c480b03e534deff02c3b1
Signed-off-by: Florin Coras <fcoras@cisco.com>
13 files changed:
src/vnet/session/node.c
src/vnet/session/session.c
src/vnet/session/session.h
src/vnet/session/session_api.c
src/vnet/session/session_cli.c
src/vnet/tcp/builtin_client.c
src/vnet/tcp/builtin_server.c
src/vnet/tcp/tcp.c
src/vnet/tcp/tcp.h
src/vnet/tcp/tcp_debug.h
src/vnet/tcp/tcp_input.c
src/vnet/tcp/tcp_output.c
src/vnet/tcp/tcp_test.c

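diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c
index 983b78b..8d703b0 100644 (file)
--- a/src/vnet/session/node.c
+++ b/src/vnet/session/node.c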
@@ -443,6 +443,79 @@ dump_thread_0_event_queue (void)
     }
 }
 
+static u8
+session_node_cmp_event (session_fifo_event_t * e, svm_fifo_t * f)
+{
+  stream_session_t *s;
+  switch (e->event_type)
+    {
+    case FIFO_EVENT_APP_RX:
+    case FIFO_EVENT_APP_TX:
+    case FIFO_EVENT_BUILTIN_RX:
+      if (e->fifo == f)
+       return 1;
+      break;
+    case FIFO_EVENT_DISCONNECT:
+      break;
+    case FIFO_EVENT_RPC:
+      s = stream_session_get_from_handle (e->session_handle);
+      if (!s)
+       {
+         clib_warning ("session has event but doesn't exist!");
+         break;
+       }
+      if (s->server_rx_fifo == f || s->server_tx_fifo == f)
+       return 1;
+      break;
+    default:
+      break;
+    }
+  return 0;
+}
+
+u8
+session_node_lookup_fifo_event (svm_fifo_t * f, session_fifo_event_t * e)
+{
+  session_manager_main_t *smm = vnet_get_session_manager_main ();
+  unix_shared_memory_queue_t *q;
+  session_fifo_event_t *pending_event_vector, *evt;
+  int i, index, found = 0;
+  i8 *headp;
+  u8 thread_index;
+
+  ASSERT (e);
+  thread_index = f->master_thread_index;
+  /*
+   * Search evt queue
+   */
+  q = smm->vpp_event_queues[thread_index];
+  index = q->head;
+  for (i = 0; i < q->cursize; i++)
+    {
+      headp = (i8 *) (&q->data[0] + q->elsize * index);
+      clib_memcpy (e, headp, q->elsize);
+      found = session_node_cmp_event (e, f);
+      if (found)
+       break;
+      if (++index == q->maxsize)
+       index = 0;
+    }
+  /*
+   * Search pending events vector
+   */
+  pending_event_vector = smm->pending_event_vector[thread_index];
+  vec_foreach (evt, pending_event_vector)
+  {
+    found = session_node_cmp_event (evt, f);
+    if (found)
+      {
+       clib_memcpy (e, evt, sizeof (*evt));
+       break;
+      }
+  }
+  return found;
+}
+
 static uword
 session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
                       vlib_frame_t * frame)
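diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c
index 2c2a27c..09bc00e 100644 (file)
--- a/src/vnet/session/session.c
+++ b/src/vnet/session/session.c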
@@ -32,6 +32,22 @@ static transport_proto_vft_t *tp_vfts;
 
 session_manager_main_t session_manager_main;
 
+transport_connection_t *
+stream_session_lookup_half_open (transport_connection_t * tc)
+{
+  session_manager_main_t *smm = &session_manager_main;
+  session_kv4_t kv4;
+  int rv;
+  if (tc->is_ip4)
+    {
+      make_v4_ss_kv_from_tc (&kv4, tc);
+      rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4);
+      if (rv == 0)
+       return tp_vfts[tc->proto].get_half_open (kv4.value & 0xFFFFFFFFULL);
+    }
+  return 0;
+}
+
 /*
  * Session lookup key; (src-ip, dst-ip, src-port, dst-port, session-type)
  * Value: (owner thread index << 32 | session_index);
@@ -501,7 +517,7 @@ stream_session_create_i (segment_manager_t * sm, transport_connection_t * tc,
   tc->s_index = s->session_index;
 
   /* Add to the main lookup table */
-  value = (((u64) thread_index) << 32) | (u64) s->session_index;
+  value = stream_session_handle (s);
   stream_session_table_add_for_tc (tc, value);
 
   *ret_s = s;
@@ -817,8 +833,18 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst,
     }
 
   /* Notify client */
-  app->cb_fns.session_connected_callback (app->index, api_context, new_s,
-                                         is_fail);
+  if (app->cb_fns.session_connected_callback (app->index, api_context, new_s,
+                                             is_fail))
+    {
+      clib_warning ("failed to notify app");
+      if (!is_fail)
+       stream_session_disconnect (new_s);
+    }
+  else
+    {
+      if (!is_fail)
+       new_s->session_state = SESSION_STATE_READY;
+    }
 
   /* Cleanup session lookup */
   stream_session_half_open_table_del (smm, sst, tc);
@@ -862,15 +888,19 @@ void
 stream_session_delete (stream_session_t * s)
 {
   session_manager_main_t *smm = vnet_get_session_manager_main ();
+  int rv;
 
   /* Delete from the main lookup table. */
-  stream_session_table_del (smm, s);
+  if ((rv = stream_session_table_del (smm, s)))
+    clib_warning ("hash delete error, rv %d", rv);
 
   /* Cleanup fifo segments */
   segment_manager_dealloc_fifos (s->svm_segment_index, s->server_rx_fifo,
                                 s->server_tx_fifo);
 
   pool_put (smm->sessions[s->thread_index], s);
+  if (CLIB_DEBUG)
+    memset (s, 0xFA, sizeof (*s));
 }
 
 /**
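diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h
index 6069c57..6c61632 100644 (file)
--- a/src/vnet/session/session.h
+++ b/src/vnet/session/session.h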
@@ -170,6 +170,8 @@ typedef int
 extern session_fifo_rx_fn session_tx_fifo_peek_and_snd;
 extern session_fifo_rx_fn session_tx_fifo_dequeue_and_snd;
 
+u8 session_node_lookup_fifo_event (svm_fifo_t * f, session_fifo_event_t * e);
+
 struct _session_manager_main
 {
   /** Lookup tables for established sessions and listeners */
@@ -289,6 +291,8 @@ transport_connection_t *stream_session_lookup_transport6 (ip6_address_t * lcl,
 
 stream_session_t *stream_session_lookup_listener (ip46_address_t * lcl,
                                                  u16 lcl_port, u8 proto);
+transport_connection_t
+  * stream_session_lookup_half_open (transport_connection_t * tc);
 void stream_session_table_add_for_tc (transport_connection_t * tc, u64 value);
 int stream_session_table_del_for_tc (transport_connection_t * tc);
 
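diff --git a/src/vnet/session/session_api.c b/src/vnet/session/session_api.c
index 60f764a..6bee3e2 100755 (executable)
--- a/src/vnet/session/session_api.c
+++ b/src/vnet/session/session_api.c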
@@ -184,13 +184,6 @@ send_session_connected_callback (u32 app_index, u32 api_context,
     }
 
   vl_msg_api_send_shmem (q, (u8 *) & mp);
-
-  /* Remove client if connect failed */
-  if (!is_fail)
-    {
-      s->session_state = SESSION_STATE_READY;
-    }
-
   return 0;
 }
 
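diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c
index e8e6f99..4d43297 100755 (executable)
--- a/src/vnet/session/session_cli.c
+++ b/src/vnet/session/session_cli.c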
@@ -19,8 +19,24 @@ u8 *
 format_stream_session_fifos (u8 * s, va_list * args)
 {
   stream_session_t *ss = va_arg (*args, stream_session_t *);
+  int verbose = va_arg (*args, int);
+  session_fifo_event_t _e, *e = &_e;
+  u8 found;
+
   s = format (s, " Rx fifo: %U", format_svm_fifo, ss->server_rx_fifo, 1);
+  if (verbose > 2 && ss->server_rx_fifo->has_event)
+    {
+      found = session_node_lookup_fifo_event (ss->server_rx_fifo, e);
+      s = format (s, " session node event: %s\n",
+                 found ? "found" : "not found");
+    }
   s = format (s, " Tx fifo: %U", format_svm_fifo, ss->server_tx_fifo, 1);
+  if (verbose > 2 && ss->server_tx_fifo->has_event)
+    {
+      found = session_node_lookup_fifo_event (ss->server_tx_fifo, e);
+      s = format (s, " session node event: %s\n",
+                 found ? "found" : "not found");
+    }
   return s;
 }
 
@@ -55,7 +71,7 @@ format_stream_session (u8 * s, va_list * args)
       if (verbose == 1)
        s = format (s, "%v", str);
       if (verbose > 1)
-       s = format (s, "%U", format_stream_session_fifos, ss);
+       s = format (s, "%U", format_stream_session_fifos, ss, verbose);
     }
   else if (ss->session_state == SESSION_STATE_LISTENING)
     {
@@ -75,7 +91,7 @@ format_stream_session (u8 * s, va_list * args)
       if (verbose == 1)
        s = format (s, "%v", str);
       if (verbose > 1)
-       s = format (s, "%U", format_stream_session_fifos, ss);
+       s = format (s, "%U", format_stream_session_fifos, ss, verbose);
     }
   else
     {
@@ -248,7 +264,7 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input,
 
   if (one_session)
     {
-      vlib_cli_output (vm, "%U", format_stream_session, s, 2);
+      vlib_cli_output (vm, "%U", format_stream_session, s, 3);
       return 0;
     }
 
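diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c
index a92baca..744f50e 100644 (file)
--- a/src/vnet/tcp/builtin_client.c
+++ b/src/vnet/tcp/builtin_client.c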
@@ -410,9 +410,6 @@ builtin_session_connected_callback (u32 app_index, u32 api_context,
       return -1;
     }
 
-  /* Mark vpp session as connected */
-  s->session_state = SESSION_STATE_READY;
-
   tm->our_event_queue = session_manager_get_vpp_event_queue (s->thread_index);
   tm->vpp_event_queue = session_manager_get_vpp_event_queue (s->thread_index);
 
@@ -466,6 +463,7 @@ builtin_session_reset_callback (stream_session_t * s)
 {
   if (s->session_state == SESSION_STATE_READY)
     clib_warning ("Reset active connection %U", format_stream_session, s, 2);
+  stream_session_cleanup (s);
   return;
 }
 
@@ -478,6 +476,11 @@ builtin_session_create_callback (stream_session_t * s)
 static void
 builtin_session_disconnect_callback (stream_session_t * s)
 {
+  tclient_main_t *tm = &tclient_main;
+  vnet_disconnect_args_t _a, *a = &_a;
+  a->handle = stream_session_handle (s);
+  a->app_index = tm->app_index;
+  vnet_disconnect_session (a);
   return;
 }
 
@@ -521,7 +524,7 @@ attach_builtin_test_clients_app (void)
   options[SESSION_OPTIONS_ACCEPT_COOKIE] = 0x12345678;
   options[SESSION_OPTIONS_SEGMENT_SIZE] = (2ULL << 32);
   options[SESSION_OPTIONS_RX_FIFO_SIZE] = tm->fifo_size;
-  options[SESSION_OPTIONS_TX_FIFO_SIZE] = tm->fifo_size / 2;
+  options[SESSION_OPTIONS_TX_FIFO_SIZE] = tm->fifo_size;
   options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = tm->private_segment_count;
   options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = tm->private_segment_size;
   options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = prealloc_fifos;
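diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c
index 4ecaf56..3416678 100644 (file)
--- a/src/vnet/tcp/builtin_server.c
+++ b/src/vnet/tcp/builtin_server.c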
@@ -213,15 +213,15 @@ builtin_server_rx_callback (stream_session_t * s)
          q = bsm->vpp_queue[thread_index];
          if (PREDICT_FALSE (q->cursize == q->maxsize))
            clib_warning ("out of event queue space");
-         else if (unix_shared_memory_queue_add (q, (u8 *) & evt, 0     /* don't wait for mutex */
-                  ))
+         else if (unix_shared_memory_queue_add (q, (u8 *) & evt, 0))
            clib_warning ("failed to enqueue self-tap");
 
-         bsm->rx_retries[thread_index][s->session_index]++;
          if (bsm->rx_retries[thread_index][s->session_index] == 500000)
            {
              clib_warning ("session stuck: %U", format_stream_session, s, 2);
            }
+         if (bsm->rx_retries[thread_index][s->session_index] < 500001)
+           bsm->rx_retries[thread_index][s->session_index]++;
        }
 
       return 0;
@@ -303,7 +303,7 @@ create_api_loopback (vlib_main_t * vm)
 
   /* Wait for reply */
   bsm->node_index = vlib_get_current_process (vm)->node_runtime.node_index;
-  vlib_process_wait_for_event_or_clock (vm, 1.0);
+  vlib_process_wait_for_event_or_clock (vm, 2.0);
   event_type = vlib_process_get_events (vm, &event_data);
   switch (event_type)
     {
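diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index 8ed325d..a221415 100644 (file)
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c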
@@ -163,6 +163,33 @@ tcp_connection_del (tcp_connection_t * tc)
   tcp_connection_cleanup (tc);
 }
 
+/**
+ * Cleanup half-open connection
+ */
+void
+tcp_half_open_connection_del (tcp_connection_t * tc)
+{
+  tcp_main_t *tm = vnet_get_tcp_main ();
+  if (CLIB_DEBUG)
+    memset (tc, 0xFA, sizeof (*tc));
+  clib_spinlock_lock (&tm->half_open_lock);
+  pool_put (tm->half_open_connections, tc);
+  clib_spinlock_unlock (&tm->half_open_lock);
+}
+
+tcp_connection_t *
+tcp_connection_new (u8 thread_index)
+{
+  tcp_main_t *tm = vnet_get_tcp_main ();
+  tcp_connection_t *tc;
+
+  pool_get (tm->connections[thread_index], tc);
+  memset (tc, 0, sizeof (*tc));
+  tc->c_c_index = tc - tm->connections[thread_index];
+  tc->c_thread_index = thread_index;
+  return tc;
+}
+
 /** Notify session that connection has been reset.
  *
  * Switch state to closed and wait for session to call cleanup.
@@ -170,6 +197,7 @@ tcp_connection_del (tcp_connection_t * tc)
 void
 tcp_connection_reset (tcp_connection_t * tc)
 {
+  TCP_EVT_DBG (TCP_EVT_RST_RCVD, tc);
   switch (tc->state)
     {
     case TCP_STATE_SYN_RCVD:
@@ -178,12 +206,18 @@ tcp_connection_reset (tcp_connection_t * tc)
       tcp_connection_cleanup (tc);
       break;
     case TCP_STATE_SYN_SENT:
+      /* XXX remove sst from call */
+      stream_session_connect_notify (&tc->connection, tc->connection.proto,
+                                    1 /* fail */ );
+      tcp_connection_cleanup (tc);
+      break;
     case TCP_STATE_ESTABLISHED:
     case TCP_STATE_CLOSE_WAIT:
     case TCP_STATE_FIN_WAIT_1:
     case TCP_STATE_FIN_WAIT_2:
     case TCP_STATE_CLOSING:
       tc->state = TCP_STATE_CLOSED;
+      TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc);
 
       /* Make sure all timers are cleared */
       tcp_connection_timers_reset (tc);
@@ -227,6 +261,7 @@ tcp_connection_close (tcp_connection_t * tc)
     tc->state = TCP_STATE_CLOSED;
   else if (tc->state == TCP_STATE_CLOSE_WAIT)
     tc->state = TCP_STATE_LAST_ACK;
+  TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc);
 
   /* If in CLOSED and WAITCLOSE timer is not set, delete connection now */
   if (tc->timers[TCP_TIMER_WAITCLOSE] == TCP_TIMER_HANDLE_INVALID
@@ -250,6 +285,7 @@ tcp_session_cleanup (u32 conn_index, u32 thread_index)
 
   /* Wait for the session tx events to clear */
   tc->state = TCP_STATE_CLOSED;
+  TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc);
   tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
 }
 
@@ -287,7 +323,7 @@ ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4)
  * Allocate local port and add if successful add entry to local endpoint
  * table to mark the pair as used.
  */
-u16
+int
 tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip)
 {
   transport_endpoint_t *tep;
@@ -484,7 +520,7 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4)
   fib_node_index_t fei;
   u32 sw_if_index;
   ip46_address_t lcl_addr;
-  u16 lcl_port;
+  int lcl_port;
 
   /*
    * Find the local address and allocate port
@@ -500,12 +536,19 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4)
 
   /* Couldn't find route to destination. Bail out. */
   if (fei == FIB_NODE_INDEX_INVALID)
-    return -1;
+    {
+      clib_warning ("no route to destination");
+      return -1;
+    }
 
   sw_if_index = fib_entry_get_resolving_interface (fei);
 
   if (sw_if_index == (u32) ~ 0)
-    return -1;
+    {
+      clib_warning ("no resolving interface for %U", format_ip46_address,
+                   rmt_addr, IP46_TYPE_IP4);
+      return -1;
+    }
 
   if (is_ip4)
     {
@@ -570,11 +613,9 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4)
   /* The other connection vars will be initialized after SYN ACK */
   tcp_connection_timers_init (tc);
 
-  tcp_send_syn (tc);
-
-  tc->state = TCP_STATE_SYN_SENT;
-
   TCP_EVT_DBG (TCP_EVT_OPEN, tc);
+  tc->state = TCP_STATE_SYN_SENT;
+  tcp_send_syn (tc);
 
   return tc->c_c_index;
 }
@@ -1206,7 +1247,7 @@ tcp_main_enable (vlib_main_t * vm)
   clib_bihash_init_24_8 (&tm->local_endpoints_table, "local endpoint table",
                         200000 /* $$$$ config parameter nbuckets */ ,
                         (64 << 20) /*$$$ config parameter table size */ );
-
+  clib_spinlock_init (&tm->half_open_lock);
   return error;
 }
 
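diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index fd0d02b..89c3061 100644 (file)
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h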
@@ -33,6 +33,7 @@
 
 #define TCP_DUPACK_THRESHOLD   3
 #define TCP_MAX_RX_FIFO_SIZE   4 << 20
+#define TCP_MIN_RX_FIFO_SIZE   4 << 10
 #define TCP_IW_N_SEGMENTS      10
 #define TCP_ALWAYS_ACK         1       /**< On/off delayed acks */
 #define TCP_USE_SACKS          1       /**< Disable only for testing */
@@ -371,11 +372,9 @@ typedef struct _tcp_main
   /* Per worker-thread timer wheel for connections timers */
   tw_timer_wheel_16t_2w_512sl_t *timer_wheels;
 
-//  /* Convenience per worker-thread vector of connections to DELACK */
-//  u32 **delack_connections;
-
   /* Pool of half-open connections on which we've sent a SYN */
   tcp_connection_t *half_open_connections;
+  clib_spinlock_t half_open_lock;
 
   /* Pool of local TCP endpoints */
   transport_endpoint_t *local_endpoints;
@@ -455,6 +454,8 @@ tcp_get_connection_from_transport (transport_connection_t * tconn)
 void tcp_connection_close (tcp_connection_t * tc);
 void tcp_connection_cleanup (tcp_connection_t * tc);
 void tcp_connection_del (tcp_connection_t * tc);
+void tcp_half_open_connection_del (tcp_connection_t * tc);
+tcp_connection_t *tcp_connection_new (u8 thread_index);
 void tcp_connection_reset (tcp_connection_t * tc);
 
 u8 *format_tcp_connection_id (u8 * s, va_list * args);
@@ -472,13 +473,15 @@ tcp_listener_get (u32 tli)
 always_inline tcp_connection_t *
 tcp_half_open_connection_get (u32 conn_index)
 {
+  if (pool_is_free_index (tcp_main.half_open_connections, conn_index))
+    return 0;
   return pool_elt_at_index (tcp_main.half_open_connections, conn_index);
 }
 
 void tcp_make_ack (tcp_connection_t * ts, vlib_buffer_t * b);
 void tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b);
 void tcp_make_synack (tcp_connection_t * ts, vlib_buffer_t * b);
-void tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4);
+void tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4);
 void tcp_send_syn (tcp_connection_t * tc);
 void tcp_send_fin (tcp_connection_t * tc);
 void tcp_init_mss (tcp_connection_t * tc);
@@ -658,7 +661,6 @@ tcp_timer_update (tcp_connection_t * tc, u8 timer_id, u32 interval)
                                 tc->c_c_index, timer_id, interval);
 }
 
-/* XXX Switch retransmit to faster TW */
 always_inline void
 tcp_retransmit_timer_set (tcp_connection_t * tc)
 {
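diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index be51bca..e3da56f 100755 (executable)
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h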
 #include <vlib/vlib.h>
 
 #define TCP_DEBUG (1)
-#define TCP_DEBUG_SM (0)
-#define TCP_DEBUG_CC (1)
-#define TCP_DEBUG_CC_STAT (1)
-#define TCP_DEBUG_SM_VERBOSE (0)
+#define TCP_DEBUG_SM (2)
+#define TCP_DEBUG_CC (0)
+#define TCP_DEBUG_CC_STAT (0)
 
 #define foreach_tcp_dbg_evt            \
   _(INIT, "")                          \
@@ -33,7 +32,9 @@
   _(UNBIND, "unbind")                  \
   _(DELETE, "delete")                  \
   _(SYN_SENT, "SYN sent")              \
-  _(SYN_RTX, "SYN retransmit")         \
+  _(SYNACK_SENT, "SYNACK sent")                \
+  _(SYNACK_RCVD, "SYNACK rcvd")                \
+  _(SYN_RXT, "SYN retransmit")         \
   _(FIN_SENT, "FIN sent")              \
   _(ACK_SENT, "ACK sent")              \
   _(DUPACK_SENT, "DUPACK sent")                \
@@ -43,6 +44,7 @@
   _(DUPACK_RCVD, "DUPACK rcvd")                \
   _(FIN_RCVD, "FIN rcvd")              \
   _(RST_RCVD, "RST rcvd")              \
+  _(STATE_CHANGE, "state change")      \
   _(PKTIZE, "packetize")               \
   _(INPUT, "in")                       \
   _(SND_WND, "snd_wnd update")         \
@@ -96,11 +98,64 @@ typedef enum _tcp_dbg_evt
   ed = ELOG_TRACK_DATA (&vlib_global_main.elog_main,                   \
                        _e, _tc->c_elog_track)
 
-#define TCP_EVT_INIT_HANDLER(_tc, _fmt, ...)                           \
+#define TCP_DBG_IP_TAG_LCL(_tc)                                                \
 {                                                                      \
-  _tc->c_elog_track.name =                                             \
-       (char *) format (0, _fmt, _tc->c_c_index, 0);                   \
+  if (_tc->c_is_ip4)                                                   \
+    {                                                                  \
+      ELOG_TYPE_DECLARE (_e) =                                         \
+      {                                                                        \
+        .format = "lcl: %d.%d.%d.%d:%d",                               \
+        .format_args = "i4i4i4i4i4",                                   \
+      };                                                               \
+      DECLARE_ETD(_tc, _e, 5);                                         \
+      ed->data[0] = _tc->c_lcl_ip.ip4.as_u8[0];                                \
+      ed->data[1] = _tc->c_lcl_ip.ip4.as_u8[1];                                \
+      ed->data[2] = _tc->c_lcl_ip.ip4.as_u8[2];                                \
+      ed->data[3] = _tc->c_lcl_ip.ip4.as_u8[3];                                \
+      ed->data[4] = clib_net_to_host_u16(_tc->c_lcl_port);             \
+    }                                                                  \
+}
+
+#define TCP_DBG_IP_TAG_RMT(_tc)                                                \
+{                                                                      \
+  if (_tc->c_is_ip4)                                                   \
+    {                                                                  \
+      ELOG_TYPE_DECLARE (_e) =                                         \
+      {                                                                        \
+        .format = "rmt: %d.%d.%d.%d:%d",                               \
+        .format_args = "i4i4i4i4i4",                                   \
+      };                                                               \
+      DECLARE_ETD(_tc, _e, 5);                                         \
+      ed->data[0] = _tc->c_rmt_ip.ip4.as_u8[0];                                \
+      ed->data[1] = _tc->c_rmt_ip.ip4.as_u8[1];                                \
+      ed->data[2] = _tc->c_rmt_ip.ip4.as_u8[2];                                \
+      ed->data[3] = _tc->c_rmt_ip.ip4.as_u8[3];                                \
+      ed->data[4] = clib_net_to_host_u16(_tc->c_rmt_port);             \
+    }                                                                  \
+}
+
+#define TCP_EVT_INIT_HANDLER(_tc, _is_l, ...)                          \
+{                                                                      \
+  char *_fmt = _is_l ? "l[%d].%d:%d%c" : "[%d].%d:%d->.%d:%d%c";       \
+  if (_tc->c_is_ip4)                                                   \
+    {                                                                  \
+      _tc->c_elog_track.name =                                         \
+       (char *) format (0, _fmt, _tc->c_thread_index,                  \
+                        _tc->c_lcl_ip.ip4.as_u8[3],                    \
+                        clib_net_to_host_u16(_tc->c_lcl_port),         \
+                        _tc->c_rmt_ip.ip4.as_u8[3],                    \
+                        clib_net_to_host_u16(_tc->c_rmt_port), 0);     \
+    }                                                                  \
+  else                                                                 \
+      _tc->c_elog_track.name =                                         \
+       (char *) format (0, _fmt, _tc->c_thread_index,                  \
+                        _tc->c_lcl_ip.ip6.as_u8[15],                   \
+                        clib_net_to_host_u16(_tc->c_lcl_port),         \
+                        _tc->c_rmt_ip.ip6.as_u8[15],                   \
+                        clib_net_to_host_u16(_tc->c_rmt_port), 0);     \
   elog_track_register (&vlib_global_main.elog_main, &_tc->c_elog_track);\
+  TCP_DBG_IP_TAG_LCL(_tc);                                             \
+  TCP_DBG_IP_TAG_RMT(_tc);                                             \
 }
 
 #define TCP_EVT_DEALLOC_HANDLER(_tc, ...)                              \
@@ -110,7 +165,7 @@ typedef enum _tcp_dbg_evt
 
 #define TCP_EVT_OPEN_HANDLER(_tc, ...)                                 \
 {                                                                      \
-  TCP_EVT_INIT_HANDLER(_tc, "s%d%c");                                  \
+  TCP_EVT_INIT_HANDLER(_tc, 0);                                                \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
     .format = "open: index %d",                                                \
@@ -133,7 +188,7 @@ typedef enum _tcp_dbg_evt
 
 #define TCP_EVT_BIND_HANDLER(_tc, ...)                                 \
 {                                                                      \
-  TCP_EVT_INIT_HANDLER(_tc, "l%d%c");                                  \
+  TCP_EVT_INIT_HANDLER(_tc, 1);                                                \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
     .format = "bind: listener %d",                                     \
@@ -166,18 +221,6 @@ typedef enum _tcp_dbg_evt
   TCP_EVT_DEALLOC_HANDLER(_tc);                                                \
 }
 
-#define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...)                             \
-{                                                                      \
-  TCP_EVT_INIT_HANDLER(_tc, "s%d%c");                                  \
-  ELOG_TYPE_DECLARE (_e) =                                             \
-  {                                                                    \
-    .format = "SYNrx: irs %u",                                         \
-    .format_args = "i4",                                               \
-  };                                                                   \
-  DECLARE_ETD(_tc, _e, 1);                                             \
-  ed->data[0] = _tc->irs;                                              \
-}
-
 #define CONCAT_HELPER(_a, _b) _a##_b
 #define CC(_a, _b) CONCAT_HELPER(_a, _b)
 #define TCP_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args)
@@ -190,63 +233,86 @@ typedef enum _tcp_dbg_evt
  */
 #if TCP_DEBUG_SM
 
-#define TCP_EVT_ACK_SENT_HANDLER(_tc, ...)                             \
+#define TCP_EVT_STATE_CHANGE_HANDLER(_tc, ...)                         \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "ack_tx: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u snd_wnd %u",\
-    .format_args = "i4i4i4i4i4",                                       \
+    .format = "state: %s",                                             \
+    .format_args = "t4",                                               \
+    .n_enum_strings = 11,                                              \
+    .enum_strings = {                                                  \
+      "closed",                                                                \
+      "listen",                                                        \
+      "syn-sent",                                                      \
+      "syn-rcvd",                                                      \
+      "established",                                                   \
+      "close_wait",                                                    \
+      "fin-wait-1",                                                    \
+      "last-ack",                                                      \
+      "closing",                                                       \
+      "fin-wait-2",                                                    \
+      "time-wait",                                                     \
+    },                                                                 \
   };                                                                   \
-  DECLARE_ETD(_tc, _e, 5);                                             \
-  ed->data[0] = _tc->rcv_nxt - _tc->rcv_las;                           \
-  ed->data[1] = _tc->rcv_nxt - _tc->irs;                               \
-  ed->data[2] = _tc->rcv_wnd;                                          \
-  ed->data[3] = _tc->snd_nxt - _tc->iss;                               \
-  ed->data[4] = _tc->snd_wnd;                                          \
+  DECLARE_ETD(_tc, _e, 1);                                             \
+  ed->data[0] = _tc->state;                                            \
 }
 
-#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...)                          \
+#define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...)                             \
 {                                                                      \
+  TCP_EVT_INIT_HANDLER(_tc, 0);                                                \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "dack_tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av_wnd %u snd_wnd %u",\
-    .format_args = "i4i4i4i4i4",                                       \
+    .format = "syn-rx: irs %u",                                                \
+    .format_args = "i4",                                               \
   };                                                                   \
-  DECLARE_ETD(_tc, _e, 5);                                             \
-  ed->data[0] = _tc->rcv_nxt - _tc->irs;                               \
-  ed->data[1] = _tc->rcv_wnd;                                          \
-  ed->data[2] = _tc->snd_nxt - _tc->iss;                               \
-  ed->data[3] = tcp_available_wnd(_tc);                                        \
-  ed->data[4] = _tc->snd_wnd;                                          \
+  DECLARE_ETD(_tc, _e, 1);                                             \
+  ed->data[0] = _tc->irs;                                              \
+  TCP_EVT_STATE_CHANGE_HANDLER(_tc);                                   \
 }
 
 #define TCP_EVT_SYN_SENT_HANDLER(_tc, ...)                             \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "SYNtx: iss %u",                                         \
+    .format = "syn-tx: iss %u",                                                \
     .format_args = "i4",                                               \
   };                                                                   \
   DECLARE_ETD(_tc, _e, 1);                                             \
   ed->data[0] = _tc->iss;                                              \
+  TCP_EVT_STATE_CHANGE_HANDLER(_tc);                                   \
 }
 
-#define TCP_EVT_SYN_RTX_HANDLER(_tc, ...)                              \
+#define TCP_EVT_SYNACK_SENT_HANDLER(_tc, ...)                          \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "SYNrtx: iss %u",                                                \
-    .format_args = "i4",                                               \
+    .format = "synack-tx: iss %u irs %u",                              \
+    .format_args = "i4i4",                                             \
   };                                                                   \
-  DECLARE_ETD(_tc, _e, 1);                                             \
+  DECLARE_ETD(_tc, _e, 2);                                             \
+  ed->data[0] = _tc->iss;                                              \
+  ed->data[1] = _tc->irs;                                              \
+}
+
+#define TCP_EVT_SYNACK_RCVD_HANDLER(_tc, ...)                          \
+{                                                                      \
+  ELOG_TYPE_DECLARE (_e) =                                             \
+  {                                                                    \
+    .format = "synack-rx: iss %u irs %u",                              \
+    .format_args = "i4i4",                                             \
+  };                                                                   \
+  DECLARE_ETD(_tc, _e, 2);                                             \
   ed->data[0] = _tc->iss;                                              \
+  ed->data[1] = _tc->irs;                                              \
+  TCP_EVT_STATE_CHANGE_HANDLER(_tc);                                   \
 }
 
 #define TCP_EVT_FIN_SENT_HANDLER(_tc, ...)                             \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "FINtx: snd_nxt %d rcv_nxt %d",                          \
+    .format = "fin-tx: snd_nxt %d rcv_nxt %d",                         \
     .format_args = "i4i4",                                             \
   };                                                                   \
   DECLARE_ETD(_tc, _e, 2);                                             \
@@ -258,19 +324,20 @@ typedef enum _tcp_dbg_evt
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "RSTtx: snd_nxt %d rcv_nxt %d",                          \
+    .format = "rst-tx: snd_nxt %d rcv_nxt %d",                         \
     .format_args = "i4i4",                                             \
   };                                                                   \
   DECLARE_ETD(_tc, _e, 2);                                             \
   ed->data[0] = _tc->snd_nxt - _tc->iss;                               \
   ed->data[1] = _tc->rcv_nxt - _tc->irs;                               \
+  TCP_EVT_STATE_CHANGE_HANDLER(_tc);                                   \
 }
 
 #define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...)                             \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "FINrx: snd_nxt %d rcv_nxt %d",                          \
+    .format = "fin-rx: snd_nxt %d rcv_nxt %d",                         \
     .format_args = "i4i4",                                             \
   };                                                                   \
   DECLARE_ETD(_tc, _e, 2);                                             \
@@ -282,7 +349,7 @@ typedef enum _tcp_dbg_evt
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "RSTrx: snd_nxt %d rcv_nxt %d",                          \
+    .format = "rst-rx: snd_nxt %d rcv_nxt %d",                         \
     .format_args = "i4i4",                                             \
   };                                                                   \
   DECLARE_ETD(_tc, _e, 2);                                             \
@@ -290,6 +357,67 @@ typedef enum _tcp_dbg_evt
   ed->data[1] = _tc->rcv_nxt - _tc->irs;                               \
 }
 
+#define TCP_EVT_SYN_RXT_HANDLER(_tc, _type, ...)                       \
+{                                                                      \
+  ELOG_TYPE_DECLARE (_e) =                                             \
+  {                                                                    \
+    .format = "%s-rxt: iss %u",                                                \
+    .format_args = "t4i4",                                             \
+    .n_enum_strings = 2,                                               \
+    .enum_strings = {                                                  \
+       "syn",                                                          \
+        "syn-ack",                                                     \
+    },                                                                 \
+  };                                                                   \
+  DECLARE_ETD(_tc, _e, 2);                                             \
+  ed->data[0] = _type;                                                 \
+  ed->data[1] = _tc->iss;                                              \
+}
+
+#else
+#define TCP_EVT_SYN_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_SYNACK_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_SYNACK_RCVD_HANDLER(_tc, ...)
+#define TCP_EVT_SYN_RXT_HANDLER(_tc, ...)
+#define TCP_EVT_FIN_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_RST_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...)
+#define TCP_EVT_RST_RCVD_HANDLER(_tc, ...)
+#define TCP_EVT_STATE_CHANGE_HANDLER(_tc, ...)
+#endif
+
+#if TCP_DEBUG_SM > 1
+
+#define TCP_EVT_ACK_SENT_HANDLER(_tc, ...)                             \
+{                                                                      \
+  ELOG_TYPE_DECLARE (_e) =                                             \
+  {                                                                    \
+    .format = "ack-tx: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u snd_wnd %u",\
+    .format_args = "i4i4i4i4i4",                                       \
+  };                                                                   \
+  DECLARE_ETD(_tc, _e, 5);                                             \
+  ed->data[0] = _tc->rcv_nxt - _tc->rcv_las;                           \
+  ed->data[1] = _tc->rcv_nxt - _tc->irs;                               \
+  ed->data[2] = _tc->rcv_wnd;                                          \
+  ed->data[3] = _tc->snd_nxt - _tc->iss;                               \
+  ed->data[4] = _tc->snd_wnd;                                          \
+}
+
+#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...)                          \
+{                                                                      \
+  ELOG_TYPE_DECLARE (_e) =                                             \
+  {                                                                    \
+    .format = "dack-tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av_wnd %u snd_wnd %u",\
+    .format_args = "i4i4i4i4i4",                                       \
+  };                                                                   \
+  DECLARE_ETD(_tc, _e, 5);                                             \
+  ed->data[0] = _tc->rcv_nxt - _tc->irs;                               \
+  ed->data[1] = _tc->rcv_wnd;                                          \
+  ed->data[2] = _tc->snd_nxt - _tc->iss;                               \
+  ed->data[3] = tcp_available_wnd(_tc);                                        \
+  ed->data[4] = _tc->snd_wnd;                                          \
+}
+
 #define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...)                             \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
@@ -309,7 +437,7 @@ typedef enum _tcp_dbg_evt
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "dack_rx: snd_una %u cwnd %u snd_wnd %u flight %u rcv_wnd %u",\
+    .format = "dack-rx: snd_una %u cwnd %u snd_wnd %u flight %u rcv_wnd %u",\
     .format_args = "i4i4i4i4i4",                                       \
   };                                                                   \
   DECLARE_ETD(_tc, _e, 5);                                             \
@@ -370,7 +498,7 @@ typedef enum _tcp_dbg_evt
     }                                                                  \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "TimerPop: %s (%d)",                                     \
+    .format = "timer-pop: %s (%d)",                                    \
     .format_args = "t4i4",                                             \
     .n_enum_strings = 7,                                               \
     .enum_strings = {                                                  \
@@ -391,7 +519,8 @@ typedef enum _tcp_dbg_evt
     }                                                                  \
   else                                                                 \
     {                                                                  \
-      clib_warning ("pop for unexisting connection %d", _tc_index);    \
+      clib_warning ("pop %d for unexisting connection %d", _timer_id,  \
+                   _tc_index);                                         \
     }                                                                  \
 }
 
@@ -414,7 +543,7 @@ typedef enum _tcp_dbg_evt
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "paws fail: seq %u end %u tsval %u tsval_recent %u",     \
+    .format = "paws-err: seq %u end %u tsval %u tsval_recent %u",      \
     .format_args = "i4i4i4i4",                                         \
   };                                                                   \
   DECLARE_ETD(_tc, _e, 4);                                             \
@@ -465,12 +594,6 @@ if (_av > 0)                                                               \
 #else
 #define TCP_EVT_ACK_SENT_HANDLER(_tc, ...)
 #define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...)
-#define TCP_EVT_SYN_SENT_HANDLER(_tc, ...)
-#define TCP_EVT_SYN_RTX_HANDLER(_tc, ...)
-#define TCP_EVT_FIN_SENT_HANDLER(_tc, ...)
-#define TCP_EVT_RST_SENT_HANDLER(_tc, ...)
-#define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...)
-#define TCP_EVT_RST_RCVD_HANDLER(_tc, ...)
 #define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...)
 #define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...)
 #define TCP_EVT_PKTIZE_HANDLER(_tc, ...)
@@ -485,12 +608,12 @@ if (_av > 0)                                                              \
 /*
  * State machine verbose
  */
-#if TCP_DBG_SM_VERBOSE
+#if TCP_DEBUG_SM > 2
 #define TCP_EVT_SND_WND_HANDLER(_tc, ...)                              \
 {                                                                      \
   ELOG_TYPE_DECLARE (_e) =                                             \
   {                                                                    \
-    .format = "snd_wnd update: %u ",                                   \
+    .format = "snd-wnd update: %u ",                                   \
     .format_args = "i4",                                               \
   };                                                                   \
   DECLARE_ETD(_tc, _e, 1);                                             \
@@ -617,6 +740,7 @@ if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now())                \
 #define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...)
 #define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...)
 #define TCP_EVT_CC_PACK_HANDLER(_tc, ...)
+#define TCP_EVT_CC_STAT_HANDLER(_tc, ...)
 #endif
 
 #endif /* SRC_VNET_TCP_TCP_DEBUG_H_ */
index bc7d901..cc5cecd 100644 (file)
@@ -349,7 +349,10 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
   /* 4th: check the SYN bit */
   if (tcp_syn (th0))
     {
-      tcp_send_reset (b0, tc0->c_is_ip4);
+      /* TODO implement RFC 5961 */
+      tcp_make_ack (tc0, b0);
+      *next0 = tcp_next_output (tc0->c_is_ip4);
+      TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0);
       return -1;
     }
 
@@ -1246,8 +1249,6 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
    * Looks okay, process feedback
    */
 
-  TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
-
   if (tcp_opts_sack_permitted (&tc->rcv_opts))
     tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
 
@@ -1263,6 +1264,8 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
   if (tc->bytes_acked)
     tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number);
 
+  TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
+
   /*
    * Check if we have congestion event
    */
@@ -1496,9 +1499,13 @@ tcp_can_delack (tcp_connection_t * tc)
 
 static int
 tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b,
-                u16 n_data_bytes, u32 * next0)
+                u32 * next0)
 {
-  u32 error = 0, n_bytes_to_drop;
+  u32 error = 0, n_bytes_to_drop, n_data_bytes;
+
+  vlib_buffer_advance (b, vnet_buffer (b)->tcp.data_offset);
+  n_data_bytes = vnet_buffer (b)->tcp.data_len;
+  ASSERT (n_data_bytes);
 
   /* Handle out-of-order data */
   if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt))
@@ -1512,7 +1519,12 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b,
 
          /* Completely in the past (possible retransmit) */
          if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt))
-           goto done;
+           {
+             /* Ack retransmissions since we may not have any data to send */
+             tcp_make_ack (tc, b);
+             *next0 = tcp_next_output (tc->c_is_ip4);
+             goto done;
+           }
 
          /* Chop off the bytes in the past */
          n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number;
@@ -1550,12 +1562,6 @@ in_order:
    * segments can be enqueued after fifo tail offset changes. */
   error = tcp_session_enqueue_data (tc, b, n_data_bytes);
 
-  if (n_data_bytes == 0)
-    {
-      *next0 = TCP_NEXT_DROP;
-      goto done;
-    }
-
   /* Check if ACK can be delayed */
   if (tcp_can_delack (tc))
     {
@@ -1680,7 +1686,9 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
            }
 
          th0 = tcp_buffer_hdr (b0);
-         is_fin = (th0->flags & TCP_FLAG_FIN) != 0;
+         /* N.B. buffer is rewritten if segment is out-of-order, leaving th0
+          * a dangling reference. Read the FIN flag before that can happen. */
+         is_fin = tcp_is_fin (th0);
 
          /* SYNs, FINs and data consume sequence numbers */
          vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number
@@ -1700,29 +1708,23 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
          /* 5: check the ACK field  */
          if (tcp_rcv_ack (tc0, b0, th0, &next0, &error0))
-           {
-             goto done;
-           }
+           goto done;
 
          /* 6: check the URG bit TODO */
 
          /* 7: process the segment text */
-
-         vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset);
-         error0 = tcp_segment_rcv (tm, tc0, b0,
-                                   vnet_buffer (b0)->tcp.data_len, &next0);
-
-         /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a
-          * dangling reference. */
+         if (vnet_buffer (b0)->tcp.data_len)
+           error0 = tcp_segment_rcv (tm, tc0, b0, &next0);
 
          /* 8: check the FIN bit */
-         if (is_fin)
+         if (PREDICT_FALSE (is_fin))
            {
              /* Enter CLOSE-WAIT and notify session. Don't send ACK, instead
               * wait for session to call close. To avoid lingering
               * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
              tc0->state = TCP_STATE_CLOSE_WAIT;
              TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
+             tc0->rcv_nxt += (vnet_buffer (b0)->tcp.data_len == 0);
              stream_session_disconnect_notify (&tc0->connection);
              tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
            }
@@ -1856,6 +1858,21 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          seq0 = vnet_buffer (b0)->tcp.seq_number;
          tcp0 = tcp_buffer_hdr (b0);
 
+         if (!tc0)
+           {
+             ip4_header_t *ip40 = vlib_buffer_get_current (b0);
+             tcp0 = ip4_next_header (ip40);
+             tc0 =
+               (tcp_connection_t *)
+               stream_session_lookup_transport_wt4 (&ip40->dst_address,
+                                                    &ip40->src_address,
+                                                    tcp0->dst_port,
+                                                    tcp0->src_port,
+                                                    SESSION_TYPE_IP4_TCP,
+                                                    my_thread_index);
+             ASSERT (0);
+             goto drop;
+           }
          if (PREDICT_FALSE
              (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0)))
            goto drop;
@@ -1881,8 +1898,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt)
                {
                  if (!tcp_rst (tcp0))
-                   tcp_send_reset (b0, is_ip4);
-
+                   tcp_send_reset (tc0, b0, is_ip4);
                  goto drop;
                }
 
@@ -1900,11 +1916,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              /* If ACK is acceptable, signal client that peer is not
               * willing to accept connection and drop connection*/
              if (tcp_ack (tcp0))
-               {
-                 stream_session_connect_notify (&tc0->connection, sst,
-                                                1 /* fail */ );
-                 tcp_connection_cleanup (tc0);
-               }
+               tcp_connection_reset (tc0);
              goto drop;
            }
 
@@ -1920,6 +1932,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          if (!tcp_syn (tcp0))
            goto drop;
 
+         /* Parse options */
+         if (tcp_options_parse (tcp0, &tc0->rcv_opts))
+           goto drop;
+
          /* Stop connection establishment and retransmit timers */
          tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH);
          tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN);
@@ -1928,19 +1944,11 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
           * current thread pool. */
          pool_get (tm->connections[my_thread_index], new_tc0);
          clib_memcpy (new_tc0, tc0, sizeof (*new_tc0));
-
-         new_tc0->c_thread_index = my_thread_index;
          new_tc0->c_c_index = new_tc0 - tm->connections[my_thread_index];
-
-         /* Cleanup half-open connection XXX lock */
-         pool_put (tm->half_open_connections, tc0);
-
+         new_tc0->c_thread_index = my_thread_index;
          new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end;
          new_tc0->irs = seq0;
-
-         /* Parse options */
-         if (tcp_options_parse (tcp0, &new_tc0->rcv_opts))
-           goto drop;
+         tcp_half_open_connection_del (tc0);
 
          if (tcp_opts_tstamp (&new_tc0->rcv_opts))
            {
@@ -1959,7 +1967,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          tcp_connection_init_vars (new_tc0);
 
          /* SYN-ACK: See if we can switch to ESTABLISHED state */
-         if (tcp_ack (tcp0))
+         if (PREDICT_TRUE (tcp_ack (tcp0)))
            {
              /* Our SYN is ACKed: we have iss < ack = snd_una */
 
@@ -1976,7 +1984,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                                                 0))
                {
                  tcp_connection_cleanup (new_tc0);
-                 tcp_send_reset (b0, is_ip4);
+                 tcp_send_reset (tc0, b0, is_ip4);
                  goto drop;
                }
 
@@ -1986,6 +1994,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              /* Update rtt with the syn-ack sample */
              new_tc0->bytes_acked = 1;
              tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number);
+             TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, new_tc0);
            }
          /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */
          else
@@ -1997,12 +2006,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                  (&new_tc0->connection, sst, 0))
                {
                  tcp_connection_cleanup (new_tc0);
-                 tcp_send_reset (b0, is_ip4);
+                 tcp_send_reset (tc0, b0, is_ip4);
+                 TCP_EVT_DBG (TCP_EVT_RST_SENT, tc0);
                  goto drop;
                }
 
              tc0->rtt_ts = 0;
-
              tcp_make_synack (new_tc0, b0);
              next0 = tcp_next_output (is_ip4);
 
@@ -2010,12 +2019,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
            }
 
          /* Read data, if any */
-         if (vnet_buffer (b0)->tcp.data_len)
+         if (PREDICT_FALSE (vnet_buffer (b0)->tcp.data_len))
            {
-             vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset);
-             error0 = tcp_segment_rcv (tm, new_tc0, b0,
-                                       vnet_buffer (b0)->tcp.data_len,
-                                       &next0);
+             ASSERT (0);
+             error0 = tcp_segment_rcv (tm, new_tc0, b0, &next0);
              if (error0 == TCP_ERROR_PURE_ACK)
                error0 = TCP_ERROR_SYN_ACKS_RCVD;
            }
@@ -2114,6 +2121,7 @@ VLIB_REGISTER_NODE (tcp6_syn_sent_node) =
 /* *INDENT-ON* */
 
 VLIB_NODE_FUNCTION_MULTIARCH (tcp6_syn_sent_node, tcp6_syn_sent_rcv);
+
 /**
  * Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED
  * as per RFC793 p. 64
@@ -2202,7 +2210,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
               */
              if (!tcp_rcv_ack_is_acceptable (tc0, b0))
                {
-                 tcp_send_reset (b0, is_ip4);
+                 tcp_send_reset (tc0, b0, is_ip4);
                  goto drop;
                }
 
@@ -2243,6 +2251,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                {
                  ASSERT (tcp_fin (tcp0));
                  tc0->state = TCP_STATE_FIN_WAIT_2;
+                 TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
+
                  /* Stop all timers, 2MSL will be set lower */
                  tcp_connection_timers_reset (tc0);
                }
@@ -2269,6 +2279,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
              /* XXX test that send queue empty */
              tc0->state = TCP_STATE_TIME_WAIT;
+             TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
              goto drop;
 
              break;
@@ -2289,6 +2300,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                }
 
              tc0->state = TCP_STATE_CLOSED;
+             TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
 
              /* Don't delete the connection/session yet. Instead, wait a
               * reasonable amount of time until the pipes are cleared. In
@@ -2329,10 +2341,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
            case TCP_STATE_ESTABLISHED:
            case TCP_STATE_FIN_WAIT_1:
            case TCP_STATE_FIN_WAIT_2:
-             vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset);
-             error0 = tcp_segment_rcv (tm, tc0, b0,
-                                       vnet_buffer (b0)->tcp.data_len,
-                                       &next0);
+             if (vnet_buffer (b0)->tcp.data_len)
+               error0 = tcp_segment_rcv (tm, tc0, b0, &next0);
              break;
            case TCP_STATE_CLOSE_WAIT:
            case TCP_STATE_CLOSING:
@@ -2357,6 +2367,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              next0 = tcp_next_output (tc0->c_is_ip4);
              stream_session_disconnect_notify (&tc0->connection);
              tc0->state = TCP_STATE_CLOSE_WAIT;
+             TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
              break;
            case TCP_STATE_CLOSE_WAIT:
            case TCP_STATE_CLOSING:
@@ -2367,6 +2378,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              tc0->state = TCP_STATE_TIME_WAIT;
              tcp_connection_timers_reset (tc0);
              tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+             TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
              break;
            case TCP_STATE_FIN_WAIT_2:
              /* Got FIN, send ACK! */
@@ -2375,6 +2387,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
              tcp_make_ack (tc0, b0);
              next0 = tcp_next_output (is_ip4);
+             TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
              break;
            case TCP_STATE_TIME_WAIT:
              /* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait
@@ -2486,7 +2499,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 {
   u32 n_left_from, next_index, *from, *to_next;
   u32 my_thread_index = vm->thread_index;
-  tcp_main_t *tm = vnet_get_tcp_main ();
   u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP;
 
   from = vlib_frame_vector_args (from_frame);
@@ -2549,14 +2561,10 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          /* 3. check for a SYN (did that already) */
 
          /* Create child session and send SYN-ACK */
-         pool_get (tm->connections[my_thread_index], child0);
-         memset (child0, 0, sizeof (*child0));
-
-         child0->c_c_index = child0 - tm->connections[my_thread_index];
+         child0 = tcp_connection_new (my_thread_index);
          child0->c_lcl_port = lc0->c_lcl_port;
          child0->c_rmt_port = th0->src_port;
          child0->c_is_ip4 = is_ip4;
-         child0->c_thread_index = my_thread_index;
          child0->state = TCP_STATE_SYN_RCVD;
 
          if (is_ip4)
@@ -2605,7 +2613,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
 
          tcp_connection_init_vars (child0);
-
          TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0);
 
          /* Reuse buffer to make syn-ack and send */
@@ -2722,6 +2729,31 @@ typedef enum _tcp_input_next
 
 #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN)
 
+static u8
+tcp_lookup_is_valid (tcp_connection_t * tc, tcp_header_t * hdr)
+{
+  transport_connection_t *tmp;
+  if (!tc)
+    return 1;
+
+  u8 is_valid = (tc->c_lcl_port == hdr->dst_port
+                && (tc->state == TCP_STATE_LISTEN
+                    || tc->c_rmt_port == hdr->src_port));
+
+  if (!is_valid)
+    {
+      if ((tmp = stream_session_lookup_half_open (&tc->connection)))
+       {
+         if (tmp->lcl_port == hdr->dst_port
+             && tmp->rmt_port == hdr->src_port)
+           {
+             clib_warning ("half-open is valid!");
+           }
+       }
+    }
+  return is_valid;
+}
+
 always_inline uword
 tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                    vlib_frame_t * from_frame, int is_ip4)
@@ -2774,7 +2806,6 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              n_data_bytes0 = clib_net_to_host_u16 (ip40->length)
                - n_advance_bytes0;
 
-             /* lookup session */
              tc0 =
                (tcp_connection_t *)
                stream_session_lookup_transport_wt4 (&ip40->dst_address,
@@ -2783,6 +2814,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                                                     tcp0->src_port,
                                                     SESSION_TYPE_IP4_TCP,
                                                     my_thread_index);
+             ASSERT (tcp_lookup_is_valid (tc0, tcp0));
            }
          else
            {
@@ -2795,12 +2827,13 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
              tc0 =
                (tcp_connection_t *)
-               stream_session_lookup_transport_wt6 (&ip60->src_address,
-                                                    &ip60->dst_address,
-                                                    tcp0->src_port,
+               stream_session_lookup_transport_wt6 (&ip60->dst_address,
+                                                    &ip60->src_address,
                                                     tcp0->dst_port,
+                                                    tcp0->src_port,
                                                     SESSION_TYPE_IP6_TCP,
                                                     my_thread_index);
+             ASSERT (tcp_lookup_is_valid (tc0, tcp0));
            }
 
          /* Length check */
index 35f3eba..5e9ecf1 100644 (file)
@@ -75,12 +75,34 @@ tcp_window_compute_scale (u32 available_space)
 }
 
 /**
- * TCP's IW as recommended by RFC6928
+ * Update max segment size we're able to process.
+ *
+ * The value is constrained by our interface's MTU and IP options. It is
+ * also what we advertise to our peer.
+ */
+void
+tcp_update_rcv_mss (tcp_connection_t * tc)
+{
+  /* TODO find our iface MTU */
+  tc->mss = dummy_mtu;
+}
+
+/**
+ * TCP's initial window
  */
 always_inline u32
 tcp_initial_wnd_unscaled (tcp_connection_t * tc)
 {
-  return TCP_IW_N_SEGMENTS * tc->mss;
+  /* RFC 6928 recommends the value shown (commented out) below. However, at
+   * the time our connections are initialized, fifos may not be allocated.
+   * Therefore, advertise the smallest possible unscaled window size and
+   * update once fifos are assigned to the session.
+   */
+  /*
+     tcp_update_rcv_mss (tc);
+     TCP_IW_N_SEGMENTS * tc->mss;
+   */
+  return TCP_MIN_RX_FIFO_SIZE;
 }
 
 /**
@@ -372,19 +394,6 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts,
     }
 }
 
-/**
- * Update max segment size we're able to process.
- *
- * The value is constrained by our interface's MTU and IP options. It is
- * also what we advertise to our peer.
- */
-void
-tcp_update_rcv_mss (tcp_connection_t * tc)
-{
-  /* TODO find our iface MTU */
-  tc->mss = dummy_mtu;
-}
-
 /**
  * Update snd_mss to reflect the effective segment size that we can send
  * by taking into account all TCP options, including SACKs
@@ -576,6 +585,7 @@ tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b)
 
   /* Init retransmit timer */
   tcp_retransmit_timer_set (tc);
+  TCP_EVT_DBG (TCP_EVT_SYNACK_SENT, tc);
 }
 
 always_inline void
@@ -684,7 +694,7 @@ tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0,
  *  Send reset without reusing existing buffer
  */
 void
-tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4)
+tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4)
 {
   vlib_buffer_t *b;
   u32 bi;
@@ -720,7 +730,7 @@ tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4)
     {
       flags = TCP_FLAG_RST;
       seq = pkt_th->ack_number;
-      ack = 0;
+      ack = (tc && tc->state >= TCP_STATE_SYN_RCVD) ? tc->rcv_nxt : 0;
     }
   else
     {
@@ -754,6 +764,7 @@ tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4)
     }
 
   tcp_enqueue_to_ip_lookup (vm, b, bi, is_ip4);
+  TCP_EVT_DBG (TCP_EVT_RST_SENT, tc);
 }
 
 void
@@ -839,6 +850,7 @@ tcp_send_syn (tcp_connection_t * tc)
 
   tcp_push_ip_hdr (tm, tc, b);
   tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4);
+  TCP_EVT_DBG (TCP_EVT_SYN_SENT, tc);
 }
 
 always_inline void
@@ -1148,12 +1160,13 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
        tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
 
       vlib_buffer_make_headroom (b, MAX_HDRS_LEN);
-
       tcp_push_hdr_i (tc, b, tc->state, 1);
 
       /* Account for the SYN */
       tc->snd_nxt += 1;
       tc->rtt_ts = 0;
+      TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc,
+                  (tc->state == TCP_STATE_SYN_SENT ? 0 : 1));
     }
   else
     {
@@ -1173,8 +1186,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
     {
       ASSERT (tc->state == TCP_STATE_SYN_SENT);
 
-      TCP_EVT_DBG (TCP_EVT_SYN_RTX, tc);
-
       /* This goes straight to ipx_lookup */
       tcp_push_ip_hdr (tm, tc, b);
       tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4);
index f37ba96..5c40ddf 100644 (file)
@@ -1550,6 +1550,101 @@ tcp_test_fifo (vlib_main_t * vm, unformat_input_t * input)
   return res;
 }
 
+/**
+ * Unit test for the session lookup table.
+ *
+ * Builds two session/connection pairs from the index-0 pools, registers only
+ * the first (tc1) in the lookup table and checks that tc1 is found while tc2
+ * never is, across an add / delete / re-add cycle. Returns 0 at the end.
+ */
+static int
+tcp_test_lookup (vlib_main_t * vm, unformat_input_t * input)
+{
+  session_manager_main_t *smm = &session_manager_main;
+  tcp_main_t *tm = &tcp_main;
+  transport_connection_t _tc1, *tc1 = &_tc1, _tc2, *tc2 = &_tc2, *tconn;
+  tcp_connection_t *tc;
+  stream_session_t *s;
+  u8 cmp = 0;
+
+  /* First session/connection pair; its transport part is copied to tc1 */
+  pool_get (smm->sessions[0], s);
+  memset (s, 0, sizeof (*s));
+  s->session_index = s - smm->sessions[0];
+
+  pool_get (tm->connections[0], tc);
+  memset (tc, 0, sizeof (*tc));
+  tc->connection.c_index = tc - tm->connections[0];
+  tc->connection.s_index = s->session_index;
+  s->connection_index = tc->connection.c_index;
+
+  tc->connection.lcl_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000101);
+  tc->connection.rmt_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000103);
+  tc->connection.lcl_port = 35051;
+  tc->connection.rmt_port = 53764;
+  tc->connection.proto = 0;
+  clib_memcpy (tc1, &tc->connection, sizeof (*tc1));
+
+  /* Second pair: other remote address and local port, copied to tc2 */
+  pool_get (smm->sessions[0], s);
+  memset (s, 0, sizeof (*s));
+  s->session_index = s - smm->sessions[0];
+  pool_get (tm->connections[0], tc);
+  memset (tc, 0, sizeof (*tc));
+  tc->connection.c_index = tc - tm->connections[0];
+  tc->connection.s_index = s->session_index;
+  s->connection_index = tc->connection.c_index;
+
+  tc->connection.lcl_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000101);
+  tc->connection.rmt_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000102);
+  tc->connection.lcl_port = 38225;
+  tc->connection.rmt_port = 53764;
+  tc->connection.proto = 0;
+  clib_memcpy (tc2, &tc->connection, sizeof (*tc2));
+
+  /* Confirm that connection lookup works; a null result fails the checks */
+  stream_session_table_add_for_tc (tc1, tc1->s_index);
+  tconn = stream_session_lookup_transport_wt4 (&tc1->lcl_ip.ip4,
+                                              &tc1->rmt_ip.ip4,
+                                              tc1->lcl_port, tc1->rmt_port,
+                                              tc1->proto, 0);
+  cmp = (tconn != 0
+        && memcmp (&tconn->rmt_ip, &tc1->rmt_ip, sizeof (tc1->rmt_ip)) == 0);
+  TCP_TEST ((cmp), "rmt ip is identical %d", cmp);
+  cmp = (tconn != 0 && tconn->lcl_port == tc1->lcl_port);
+  TCP_TEST ((cmp), "lcl port is identical %d", cmp);
+
+  /* Non-existing connection lookup should not work */
+  tconn = stream_session_lookup_transport_wt4 (&tc2->lcl_ip.ip4,
+                                              &tc2->rmt_ip.ip4,
+                                              tc2->lcl_port, tc2->rmt_port,
+                                              tc2->proto, 0);
+  TCP_TEST ((tconn == 0), "lookup result should be null");
+
+  /* Delete tc1 and lookup both connections again; neither may be found */
+  stream_session_table_del_for_tc (tc1);
+  tconn = stream_session_lookup_transport_wt4 (&tc1->lcl_ip.ip4,
+                                              &tc1->rmt_ip.ip4,
+                                              tc1->lcl_port, tc1->rmt_port,
+                                              tc1->proto, 0);
+  TCP_TEST ((tconn == 0), "lookup result should be null");
+  tconn = stream_session_lookup_transport_wt4 (&tc2->lcl_ip.ip4,
+                                              &tc2->rmt_ip.ip4,
+                                              tc2->lcl_port, tc2->rmt_port,
+                                              tc2->proto, 0);
+  TCP_TEST ((tconn == 0), "lookup result should be null");
+
+  /* Re-add tc1 and lookup tc2, which must still be absent */
+  stream_session_table_add_for_tc (tc1, tc1->s_index);
+  tconn = stream_session_lookup_transport_wt4 (&tc2->lcl_ip.ip4,
+                                              &tc2->rmt_ip.ip4,
+                                              tc2->lcl_port, tc2->rmt_port,
+                                              tc2->proto, 0);
+  TCP_TEST ((tconn == 0), "lookup result should be null");
+
+  return 0;
+}
+
 static int
 tcp_test_session (vlib_main_t * vm, unformat_input_t * input)
 {
@@ -1632,6 +1727,10 @@ tcp_test (vlib_main_t * vm,
        {
          res = tcp_test_session (vm, input);
        }
+      else if (unformat (input, "lookup"))
+       {
+         res = tcp_test_lookup (vm, input);
+       }
       else
        break;
     }