Improvements to tcp rx path and debugging
[vpp.git] / src / vnet / tcp / tcp_input.c
index bc7d901..cc5cecd 100644 (file)
@@ -349,7 +349,10 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
   /* 4th: check the SYN bit */
   if (tcp_syn (th0))
     {
-      tcp_send_reset (b0, tc0->c_is_ip4);
+      /* TODO implement RFC 5961 */
+      tcp_make_ack (tc0, b0);
+      *next0 = tcp_next_output (tc0->c_is_ip4);
+      TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0);
       return -1;
     }
 
@@ -1246,8 +1249,6 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
    * Looks okay, process feedback
    */
 
-  TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
-
   if (tcp_opts_sack_permitted (&tc->rcv_opts))
     tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
 
@@ -1263,6 +1264,8 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
   if (tc->bytes_acked)
     tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number);
 
+  TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
+
   /*
    * Check if we have congestion event
    */
@@ -1496,9 +1499,13 @@ tcp_can_delack (tcp_connection_t * tc)
 
 static int
 tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b,
-                u16 n_data_bytes, u32 * next0)
+                u32 * next0)
 {
-  u32 error = 0, n_bytes_to_drop;
+  u32 error = 0, n_bytes_to_drop, n_data_bytes;
+
+  vlib_buffer_advance (b, vnet_buffer (b)->tcp.data_offset);
+  n_data_bytes = vnet_buffer (b)->tcp.data_len;
+  ASSERT (n_data_bytes);
 
   /* Handle out-of-order data */
   if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt))
@@ -1512,7 +1519,12 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b,
 
          /* Completely in the past (possible retransmit) */
          if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt))
-           goto done;
+           {
+             /* Ack retransmissions since we may not have any data to send */
+             tcp_make_ack (tc, b);
+             *next0 = tcp_next_output (tc->c_is_ip4);
+             goto done;
+           }
 
          /* Chop off the bytes in the past */
          n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number;
@@ -1550,12 +1562,6 @@ in_order:
    * segments can be enqueued after fifo tail offset changes. */
   error = tcp_session_enqueue_data (tc, b, n_data_bytes);
 
-  if (n_data_bytes == 0)
-    {
-      *next0 = TCP_NEXT_DROP;
-      goto done;
-    }
-
   /* Check if ACK can be delayed */
   if (tcp_can_delack (tc))
     {
@@ -1680,7 +1686,9 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
            }
 
          th0 = tcp_buffer_hdr (b0);
-         is_fin = (th0->flags & TCP_FLAG_FIN) != 0;
+         /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a
+          * dangling reference. */
+         is_fin = tcp_is_fin (th0);
 
          /* SYNs, FINs and data consume sequence numbers */
          vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number
@@ -1700,29 +1708,23 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
          /* 5: check the ACK field  */
          if (tcp_rcv_ack (tc0, b0, th0, &next0, &error0))
-           {
-             goto done;
-           }
+           goto done;
 
          /* 6: check the URG bit TODO */
 
          /* 7: process the segment text */
-
-         vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset);
-         error0 = tcp_segment_rcv (tm, tc0, b0,
-                                   vnet_buffer (b0)->tcp.data_len, &next0);
-
-         /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a
-          * dangling reference. */
+         if (vnet_buffer (b0)->tcp.data_len)
+           error0 = tcp_segment_rcv (tm, tc0, b0, &next0);
 
          /* 8: check the FIN bit */
-         if (is_fin)
+         if (PREDICT_FALSE (is_fin))
            {
              /* Enter CLOSE-WAIT and notify session. Don't send ACK, instead
               * wait for session to call close. To avoid lingering
               * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
              tc0->state = TCP_STATE_CLOSE_WAIT;
              TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
+             tc0->rcv_nxt += (vnet_buffer (b0)->tcp.data_len == 0);
              stream_session_disconnect_notify (&tc0->connection);
              tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
            }
@@ -1856,6 +1858,21 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          seq0 = vnet_buffer (b0)->tcp.seq_number;
          tcp0 = tcp_buffer_hdr (b0);
 
+         if (!tc0)
+           {
+             ip4_header_t *ip40 = vlib_buffer_get_current (b0);
+             tcp0 = ip4_next_header (ip40);
+             tc0 =
+               (tcp_connection_t *)
+               stream_session_lookup_transport_wt4 (&ip40->dst_address,
+                                                    &ip40->src_address,
+                                                    tcp0->dst_port,
+                                                    tcp0->src_port,
+                                                    SESSION_TYPE_IP4_TCP,
+                                                    my_thread_index);
+             ASSERT (0);
+             goto drop;
+           }
          if (PREDICT_FALSE
              (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0)))
            goto drop;
@@ -1881,8 +1898,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt)
                {
                  if (!tcp_rst (tcp0))
-                   tcp_send_reset (b0, is_ip4);
-
+                   tcp_send_reset (tc0, b0, is_ip4);
                  goto drop;
                }
 
@@ -1900,11 +1916,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              /* If ACK is acceptable, signal client that peer is not
               * willing to accept connection and drop connection*/
              if (tcp_ack (tcp0))
-               {
-                 stream_session_connect_notify (&tc0->connection, sst,
-                                                1 /* fail */ );
-                 tcp_connection_cleanup (tc0);
-               }
+               tcp_connection_reset (tc0);
              goto drop;
            }
 
@@ -1920,6 +1932,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          if (!tcp_syn (tcp0))
            goto drop;
 
+         /* Parse options */
+         if (tcp_options_parse (tcp0, &tc0->rcv_opts))
+           goto drop;
+
          /* Stop connection establishment and retransmit timers */
          tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH);
          tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN);
@@ -1928,19 +1944,11 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
           * current thread pool. */
          pool_get (tm->connections[my_thread_index], new_tc0);
          clib_memcpy (new_tc0, tc0, sizeof (*new_tc0));
-
-         new_tc0->c_thread_index = my_thread_index;
          new_tc0->c_c_index = new_tc0 - tm->connections[my_thread_index];
-
-         /* Cleanup half-open connection XXX lock */
-         pool_put (tm->half_open_connections, tc0);
-
+         new_tc0->c_thread_index = my_thread_index;
          new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end;
          new_tc0->irs = seq0;
-
-         /* Parse options */
-         if (tcp_options_parse (tcp0, &new_tc0->rcv_opts))
-           goto drop;
+         tcp_half_open_connection_del (tc0);
 
          if (tcp_opts_tstamp (&new_tc0->rcv_opts))
            {
@@ -1959,7 +1967,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          tcp_connection_init_vars (new_tc0);
 
          /* SYN-ACK: See if we can switch to ESTABLISHED state */
-         if (tcp_ack (tcp0))
+         if (PREDICT_TRUE (tcp_ack (tcp0)))
            {
              /* Our SYN is ACKed: we have iss < ack = snd_una */
 
@@ -1976,7 +1984,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                                                 0))
                {
                  tcp_connection_cleanup (new_tc0);
-                 tcp_send_reset (b0, is_ip4);
+                 tcp_send_reset (tc0, b0, is_ip4);
                  goto drop;
                }
 
@@ -1986,6 +1994,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              /* Update rtt with the syn-ack sample */
              new_tc0->bytes_acked = 1;
              tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number);
+             TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, new_tc0);
            }
          /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */
          else
@@ -1997,12 +2006,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                  (&new_tc0->connection, sst, 0))
                {
                  tcp_connection_cleanup (new_tc0);
-                 tcp_send_reset (b0, is_ip4);
+                 tcp_send_reset (tc0, b0, is_ip4);
+                 TCP_EVT_DBG (TCP_EVT_RST_SENT, tc0);
                  goto drop;
                }
 
              tc0->rtt_ts = 0;
-
              tcp_make_synack (new_tc0, b0);
              next0 = tcp_next_output (is_ip4);
 
@@ -2010,12 +2019,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
            }
 
          /* Read data, if any */
-         if (vnet_buffer (b0)->tcp.data_len)
+         if (PREDICT_FALSE (vnet_buffer (b0)->tcp.data_len))
            {
-             vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset);
-             error0 = tcp_segment_rcv (tm, new_tc0, b0,
-                                       vnet_buffer (b0)->tcp.data_len,
-                                       &next0);
+             ASSERT (0);
+             error0 = tcp_segment_rcv (tm, new_tc0, b0, &next0);
              if (error0 == TCP_ERROR_PURE_ACK)
                error0 = TCP_ERROR_SYN_ACKS_RCVD;
            }
@@ -2114,6 +2121,7 @@ VLIB_REGISTER_NODE (tcp6_syn_sent_node) =
 /* *INDENT-ON* */
 
 VLIB_NODE_FUNCTION_MULTIARCH (tcp6_syn_sent_node, tcp6_syn_sent_rcv);
+
 /**
  * Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED
  * as per RFC793 p. 64
@@ -2202,7 +2210,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
               */
              if (!tcp_rcv_ack_is_acceptable (tc0, b0))
                {
-                 tcp_send_reset (b0, is_ip4);
+                 tcp_send_reset (tc0, b0, is_ip4);
                  goto drop;
                }
 
@@ -2243,6 +2251,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                {
                  ASSERT (tcp_fin (tcp0));
                  tc0->state = TCP_STATE_FIN_WAIT_2;
+                 TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
+
                  /* Stop all timers, 2MSL will be set lower */
                  tcp_connection_timers_reset (tc0);
                }
@@ -2269,6 +2279,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
              /* XXX test that send queue empty */
              tc0->state = TCP_STATE_TIME_WAIT;
+             TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
              goto drop;
 
              break;
@@ -2289,6 +2300,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                }
 
              tc0->state = TCP_STATE_CLOSED;
+             TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
 
              /* Don't delete the connection/session yet. Instead, wait a
               * reasonable amount of time until the pipes are cleared. In
@@ -2329,10 +2341,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
            case TCP_STATE_ESTABLISHED:
            case TCP_STATE_FIN_WAIT_1:
            case TCP_STATE_FIN_WAIT_2:
-             vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset);
-             error0 = tcp_segment_rcv (tm, tc0, b0,
-                                       vnet_buffer (b0)->tcp.data_len,
-                                       &next0);
+             if (vnet_buffer (b0)->tcp.data_len)
+               error0 = tcp_segment_rcv (tm, tc0, b0, &next0);
              break;
            case TCP_STATE_CLOSE_WAIT:
            case TCP_STATE_CLOSING:
@@ -2357,6 +2367,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              next0 = tcp_next_output (tc0->c_is_ip4);
              stream_session_disconnect_notify (&tc0->connection);
              tc0->state = TCP_STATE_CLOSE_WAIT;
+             TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
              break;
            case TCP_STATE_CLOSE_WAIT:
            case TCP_STATE_CLOSING:
@@ -2367,6 +2378,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              tc0->state = TCP_STATE_TIME_WAIT;
              tcp_connection_timers_reset (tc0);
              tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+             TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
              break;
            case TCP_STATE_FIN_WAIT_2:
              /* Got FIN, send ACK! */
@@ -2375,6 +2387,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
              tcp_make_ack (tc0, b0);
              next0 = tcp_next_output (is_ip4);
+             TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
              break;
            case TCP_STATE_TIME_WAIT:
              /* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait
@@ -2486,7 +2499,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 {
   u32 n_left_from, next_index, *from, *to_next;
   u32 my_thread_index = vm->thread_index;
-  tcp_main_t *tm = vnet_get_tcp_main ();
   u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP;
 
   from = vlib_frame_vector_args (from_frame);
@@ -2549,14 +2561,10 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          /* 3. check for a SYN (did that already) */
 
          /* Create child session and send SYN-ACK */
-         pool_get (tm->connections[my_thread_index], child0);
-         memset (child0, 0, sizeof (*child0));
-
-         child0->c_c_index = child0 - tm->connections[my_thread_index];
+         child0 = tcp_connection_new (my_thread_index);
          child0->c_lcl_port = lc0->c_lcl_port;
          child0->c_rmt_port = th0->src_port;
          child0->c_is_ip4 = is_ip4;
-         child0->c_thread_index = my_thread_index;
          child0->state = TCP_STATE_SYN_RCVD;
 
          if (is_ip4)
@@ -2605,7 +2613,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
 
          tcp_connection_init_vars (child0);
-
          TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0);
 
          /* Reuse buffer to make syn-ack and send */
@@ -2722,6 +2729,31 @@ typedef enum _tcp_input_next
 
 #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN)
 
+static u8
+tcp_lookup_is_valid (tcp_connection_t * tc, tcp_header_t * hdr)
+{
+  transport_connection_t *tmp;
+  if (!tc)
+    return 1;
+
+  u8 is_valid = (tc->c_lcl_port == hdr->dst_port
+                && (tc->state == TCP_STATE_LISTEN
+                    || tc->c_rmt_port == hdr->src_port));
+
+  if (!is_valid)
+    {
+      if ((tmp = stream_session_lookup_half_open (&tc->connection)))
+       {
+         if (tmp->lcl_port == hdr->dst_port
+             && tmp->rmt_port == hdr->src_port)
+           {
+             clib_warning ("half-open is valid!");
+           }
+       }
+    }
+  return is_valid;
+}
+
 always_inline uword
 tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                    vlib_frame_t * from_frame, int is_ip4)
@@ -2774,7 +2806,6 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              n_data_bytes0 = clib_net_to_host_u16 (ip40->length)
                - n_advance_bytes0;
 
-             /* lookup session */
              tc0 =
                (tcp_connection_t *)
                stream_session_lookup_transport_wt4 (&ip40->dst_address,
@@ -2783,6 +2814,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                                                     tcp0->src_port,
                                                     SESSION_TYPE_IP4_TCP,
                                                     my_thread_index);
+             ASSERT (tcp_lookup_is_valid (tc0, tcp0));
            }
          else
            {
@@ -2795,12 +2827,13 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
              tc0 =
                (tcp_connection_t *)
-               stream_session_lookup_transport_wt6 (&ip60->src_address,
-                                                    &ip60->dst_address,
-                                                    tcp0->src_port,
+               stream_session_lookup_transport_wt6 (&ip60->dst_address,
+                                                    &ip60->src_address,
                                                     tcp0->dst_port,
+                                                    tcp0->src_port,
                                                     SESSION_TYPE_IP6_TCP,
                                                     my_thread_index);
+             ASSERT (tcp_lookup_is_valid (tc0, tcp0));
            }
 
          /* Length check */