VPP-659 TCP improvements
[vpp.git] / src / vnet / tcp / tcp_input.c
index daa0683..f19fbf8 100644 (file)
@@ -274,10 +274,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
   /* 2nd: check the RST bit */
   if (tcp_rst (th0))
     {
-      /* Notify session that connection has been reset. Switch
-       * state to closed and await for session to do the cleanup. */
-      stream_session_reset_notify (&tc0->connection);
-      tc0->state = TCP_STATE_CLOSED;
+      tcp_connection_reset (tc0);
       return -1;
     }
 
@@ -711,7 +708,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
   if (tcp_opts_sack_permitted (&tc->opt))
     tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
 
-  new_snd_wnd = clib_net_to_host_u32 (th->window) << tc->snd_wscale;
+  new_snd_wnd = clib_net_to_host_u16 (th->window) << tc->snd_wscale;
 
   if (tcp_ack_is_dupack (tc, b, new_snd_wnd))
     {
@@ -1023,6 +1020,12 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
                                    my_thread_index);
 
+         if (PREDICT_FALSE (tc0 == 0))
+           {
+             error0 = TCP_ERROR_INVALID_CONNECTION;
+             goto drop;
+           }
+
          /* Checksum computed by ipx_local no need to compute again */
 
          if (is_ip4)
@@ -1072,12 +1075,12 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          /* 8: check the FIN bit */
          if (tcp_fin (th0))
            {
-             /* Send ACK and enter CLOSE-WAIT */
-             tcp_make_ack (tc0, b0);
-             tcp_connection_force_ack (tc0, b0);
-             next0 = tcp_next_output (tc0->c_is_ip4);
+             /* Enter CLOSE-WAIT and notify session. Don't send ACK, instead
+              * wait for session to call close. To avoid lingering
+              * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
              tc0->state = TCP_STATE_CLOSE_WAIT;
              stream_session_disconnect_notify (&tc0->connection);
+             tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
            }
 
        drop:
@@ -1320,7 +1323,6 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
          /* Parse options */
          tcp_options_parse (tcp0, &new_tc0->opt);
-         tcp_connection_init_vars (new_tc0);
 
          if (tcp_opts_tstamp (&new_tc0->opt))
            {
@@ -1331,11 +1333,13 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          if (tcp_opts_wscale (&new_tc0->opt))
            new_tc0->snd_wscale = new_tc0->opt.wscale;
 
-         new_tc0->snd_wnd = clib_net_to_host_u32 (tcp0->window)
-           << new_tc0->snd_wscale;
+         /* No scaling */
+         new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window);
          new_tc0->snd_wl1 = seq0;
          new_tc0->snd_wl2 = ack0;
 
+         tcp_connection_init_vars (new_tc0);
+
          /* SYN-ACK: See if we can switch to ESTABLISHED state */
          if (tcp_ack (tcp0))
            {
@@ -1345,6 +1349,9 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              new_tc0->snd_una = ack0;
              new_tc0->state = TCP_STATE_ESTABLISHED;
 
+             /* Make sure rcv_las is initialized for the window computation */
+             new_tc0->rcv_las = new_tc0->rcv_nxt;
+
              /* Notify app that we have connection */
              stream_session_connect_notify (&new_tc0->connection, sst, 0);
 
@@ -1464,7 +1471,7 @@ VLIB_REGISTER_NODE (tcp6_syn_sent_node) =
 
 VLIB_NODE_FUNCTION_MULTIARCH (tcp6_syn_sent_node, tcp6_syn_sent_rcv);
 /**
- * Handles reception for all states except LISTEN, SYN-SEND and ESTABLISHED
+ * Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED
  * as per RFC793 p. 64
  */
 always_inline uword
@@ -1507,6 +1514,11 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          b0 = vlib_get_buffer (vm, bi0);
          tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
                                    my_thread_index);
+         if (PREDICT_FALSE (tc0 == 0))
+           {
+             error0 = TCP_ERROR_INVALID_CONNECTION;
+             goto drop;
+           }
 
          /* Checksum computed by ipx_local no need to compute again */
 
@@ -1575,7 +1587,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
              /* Initialize session variables */
              tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
-             tc0->snd_wnd = clib_net_to_host_u32 (tcp0->window)
+             tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
                << tc0->opt.wscale;
              tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
              tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
@@ -1583,7 +1595,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              /* Shoulder tap the server */
              stream_session_accept_notify (&tc0->connection);
 
-             tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN);
+             /* Reset SYN-ACK retransmit timer */
+             tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT);
              break;
            case TCP_STATE_ESTABLISHED:
              /* We can get packets in established state here because they
@@ -1598,9 +1611,14 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
               * continue processing in that state. */
              if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
                goto drop;
-             tc0->state = TCP_STATE_FIN_WAIT_2;
-             /* Stop all timers, 2MSL will be set lower */
-             tcp_connection_timers_reset (tc0);
+
+             /* If FIN is ACKed */
+             if (tc0->snd_una == tc0->snd_una_max)
+               {
+                 tc0->state = TCP_STATE_FIN_WAIT_2;
+                 /* Stop all timers, 2MSL will be set lower */
+                 tcp_connection_timers_reset (tc0);
+               }
              break;
            case TCP_STATE_FIN_WAIT_2:
              /* In addition to the processing for the ESTABLISHED state, if
@@ -1635,7 +1653,17 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              if (!tcp_rcv_ack_is_acceptable (tc0, b0))
                goto drop;
 
-             tcp_connection_del (tc0);
+             tc0->state = TCP_STATE_CLOSED;
+
+             /* Don't delete the connection/session yet. Instead, wait a
+              * reasonable amount of time until the pipes are cleared. In
+              * particular, this makes sure that we won't have dead sessions
+              * when processing events on the tx path */
+             tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
+
+             /* Stop retransmit */
+             tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT);
+
              goto drop;
 
              break;
@@ -1680,7 +1708,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
            case TCP_STATE_SYN_RCVD:
              /* Send FIN-ACK notify app and enter CLOSE-WAIT */
              tcp_connection_timers_reset (tc0);
-             tcp_make_finack (tc0, b0);
+             tcp_make_fin (tc0, b0);
              next0 = tcp_next_output (tc0->c_is_ip4);
              stream_session_disconnect_notify (&tc0->connection);
              tc0->state = TCP_STATE_CLOSE_WAIT;
@@ -1693,12 +1721,12 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
            case TCP_STATE_FIN_WAIT_1:
              tc0->state = TCP_STATE_TIME_WAIT;
              tcp_connection_timers_reset (tc0);
-             tcp_timer_set (tc0, TCP_TIMER_2MSL, TCP_2MSL_TIME);
+             tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
              break;
            case TCP_STATE_FIN_WAIT_2:
              /* Got FIN, send ACK! */
              tc0->state = TCP_STATE_TIME_WAIT;
-             tcp_timer_set (tc0, TCP_TIMER_2MSL, TCP_2MSL_TIME);
+             tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
              tcp_make_ack (tc0, b0);
              next0 = tcp_next_output (is_ip4);
              break;
@@ -1706,7 +1734,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              /* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait
               * timeout.
               */
-             tcp_timer_update (tc0, TCP_TIMER_2MSL, TCP_2MSL_TIME);
+             tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
              break;
            }
 
@@ -1899,7 +1927,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
            }
 
          tcp_options_parse (th0, &child0->opt);
-         tcp_connection_init_vars (child0);
 
          child0->irs = vnet_buffer (b0)->tcp.seq_number;
          child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1;
@@ -1913,6 +1940,16 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              child0->tsval_recent_age = tcp_time_now ();
            }
 
+         if (tcp_opts_wscale (&child0->opt))
+           child0->snd_wscale = child0->opt.wscale;
+
+         /* No scaling */
+         child0->snd_wnd = clib_net_to_host_u16 (th0->window);
+         child0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
+         child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
+
+         tcp_connection_init_vars (child0);
+
          /* Reuse buffer to make syn-ack and send */
          tcp_make_synack (child0, b0);
          next0 = tcp_next_output (is_ip4);
@@ -1923,7 +1960,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
            }
 
-         b0->error = error0 ? node->errors[error0] : 0;
+         b0->error = node->errors[error0];
 
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
@@ -2069,7 +2106,6 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
   u32 n_left_from, next_index, *from, *to_next;
   u32 my_thread_index = vm->cpu_index;
   tcp_main_t *tm = vnet_get_tcp_main ();
-  session_manager_main_t *ssm = vnet_get_session_manager_main ();
 
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
@@ -2101,6 +2137,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          n_left_to_next -= 1;
 
          b0 = vlib_get_buffer (vm, bi0);
+         vnet_buffer (b0)->tcp.flags = 0;
 
          if (is_ip4)
            {
@@ -2109,26 +2146,26 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
              /* lookup session */
              tc0 =
-               (tcp_connection_t *) stream_session_lookup_transport4 (ssm,
-                                                                      &ip40->dst_address,
-                                                                      &ip40->src_address,
-                                                                      tcp0->dst_port,
-                                                                      tcp0->src_port,
-                                                                      SESSION_TYPE_IP4_TCP,
-                                                                      my_thread_index);
+               (tcp_connection_t *)
+               stream_session_lookup_transport4 (&ip40->dst_address,
+                                                 &ip40->src_address,
+                                                 tcp0->dst_port,
+                                                 tcp0->src_port,
+                                                 SESSION_TYPE_IP4_TCP,
+                                                 my_thread_index);
            }
          else
            {
              ip60 = vlib_buffer_get_current (b0);
              tcp0 = ip6_next_header (ip60);
              tc0 =
-               (tcp_connection_t *) stream_session_lookup_transport6 (ssm,
-                                                                      &ip60->src_address,
-                                                                      &ip60->dst_address,
-                                                                      tcp0->src_port,
-                                                                      tcp0->dst_port,
-                                                                      SESSION_TYPE_IP6_TCP,
-                                                                      my_thread_index);
+               (tcp_connection_t *)
+               stream_session_lookup_transport6 (&ip60->src_address,
+                                                 &ip60->dst_address,
+                                                 tcp0->src_port,
+                                                 tcp0->dst_port,
+                                                 SESSION_TYPE_IP6_TCP,
+                                                 my_thread_index);
            }
 
          /* Session exists */
@@ -2156,7 +2193,6 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              /* Send reset */
              next0 = TCP_INPUT_NEXT_RESET;
              error0 = TCP_ERROR_NO_LISTENER;
-             vnet_buffer (b0)->tcp.flags = 0;
            }
 
          b0->error = error0 ? node->errors[error0] : 0;
@@ -2276,6 +2312,7 @@ do {                                                              \
   _(ESTABLISHED, TCP_FLAG_FIN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
   _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED,
     TCP_ERROR_NONE);
+  _(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
   /* ACK or FIN-ACK to our FIN */
   _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
   _(FIN_WAIT_1, TCP_FLAG_ACK | TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS,