tcp: retransmit and multi-buffer segment fixes and improvements
[vpp.git] / src / vnet / tcp / tcp_input.c
index 2d36c85..95f9ade 100644 (file)
@@ -1363,7 +1363,7 @@ always_inline int
 tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
                          u16 data_len)
 {
-  int written;
+  int written, error = TCP_ERROR_ENQUEUED;
 
   ASSERT (seq_geq (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
 
@@ -1381,12 +1381,12 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
   /* Update rcv_nxt */
   if (PREDICT_TRUE (written == data_len))
     {
-      tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end;
+      tc->rcv_nxt += written;
     }
   /* If more data written than expected, account for out-of-order bytes. */
   else if (written > data_len)
     {
-      tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end + written - data_len;
+      tc->rcv_nxt += written;
 
       /* Send ACK confirming the update */
       tc->flags |= TCP_CONN_SNDACK;
@@ -1400,7 +1400,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
        * not be enqueued. Inform peer */
       tc->flags |= TCP_CONN_SNDACK;
 
-      return TCP_ERROR_PARTIALLY_ENQUEUED;
+      error = TCP_ERROR_PARTIALLY_ENQUEUED;
     }
   else
     {
@@ -1415,7 +1415,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
       tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt);
     }
 
-  return TCP_ERROR_ENQUEUED;
+  return error;
 }
 
 /** Enqueue out-of-order data */
@@ -1495,10 +1495,10 @@ tcp_can_delack (tcp_connection_t * tc)
 static int
 tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop)
 {
-  u32 discard;
+  u32 discard, first = b->current_length;
   vlib_main_t *vm = vlib_get_main ();
 
-  /* Handle multi segment packets */
+  /* Handle multi-buffer segments */
   if (n_bytes_to_drop > b->current_length)
     {
       if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT))
@@ -1511,7 +1511,12 @@ tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop)
          n_bytes_to_drop -= discard;
        }
       while (n_bytes_to_drop);
+      if (n_bytes_to_drop > first)
+       b->total_length_not_including_first_buffer -= n_bytes_to_drop - first;
     }
+  else
+    vlib_buffer_advance (b, n_bytes_to_drop);
+  vnet_buffer (b)->tcp.data_len -= n_bytes_to_drop;
   return 0;
 }
 
@@ -1877,26 +1882,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          tc0 =
            tcp_half_open_connection_get (vnet_buffer (b0)->
                                          tcp.connection_index);
+         ASSERT (tc0);
 
          ack0 = vnet_buffer (b0)->tcp.ack_number;
          seq0 = vnet_buffer (b0)->tcp.seq_number;
          tcp0 = tcp_buffer_hdr (b0);
 
-         if (!tc0)
-           {
-             ip4_header_t *ip40 = vlib_buffer_get_current (b0);
-             tcp0 = ip4_next_header (ip40);
-             tc0 =
-               (tcp_connection_t *)
-               stream_session_lookup_transport_wt4 (&ip40->dst_address,
-                                                    &ip40->src_address,
-                                                    tcp0->dst_port,
-                                                    tcp0->src_port,
-                                                    SESSION_TYPE_IP4_TCP,
-                                                    my_thread_index);
-             ASSERT (0);
-             goto drop;
-           }
          if (PREDICT_FALSE
              (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0)))
            goto drop;
@@ -1922,7 +1913,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt)
                {
                  if (!tcp_rst (tcp0))
-                   tcp_send_reset (tc0, b0, is_ip4);
+                   tcp_send_reset_w_pkt (tc0, b0, is_ip4);
                  goto drop;
                }
 
@@ -2009,7 +2000,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
               * allocate session send reset */
              if (stream_session_connect_notify (&new_tc0->connection, 0))
                {
-                 tcp_send_reset (new_tc0, b0, is_ip4);
+                 tcp_send_reset_w_pkt (new_tc0, b0, is_ip4);
                  tcp_connection_cleanup (new_tc0);
                  goto drop;
                }
@@ -2031,7 +2022,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              if (stream_session_connect_notify (&new_tc0->connection, 0))
                {
                  tcp_connection_cleanup (new_tc0);
-                 tcp_send_reset (tc0, b0, is_ip4);
+                 tcp_send_reset_w_pkt (tc0, b0, is_ip4);
                  TCP_EVT_DBG (TCP_EVT_RST_SENT, tc0);
                  goto drop;
                }
@@ -2235,7 +2226,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
               */
              if (!tcp_rcv_ack_is_acceptable (tc0, b0))
                {
-                 tcp_send_reset (tc0, b0, is_ip4);
+                 tcp_send_reset_w_pkt (tc0, b0, is_ip4);
                  goto drop;
                }
 
@@ -2254,8 +2245,9 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
              stream_session_accept_notify (&tc0->connection);
 
-             /* Reset SYN-ACK retransmit timer */
+             /* Reset SYN-ACK retransmit and SYN_RCV establish timers */
              tcp_retransmit_timer_reset (tc0);
+             tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH);
              break;
            case TCP_STATE_ESTABLISHED:
              /* We can get packets in established state here because they
@@ -2281,13 +2273,21 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              /* If FIN is ACKed */
              else if (tc0->snd_una == tc0->snd_una_max)
                {
-                 ASSERT (tcp_fin (tcp0));
                  tc0->rcv_nxt += 1;
                  tc0->state = TCP_STATE_FIN_WAIT_2;
                  TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
 
-                 /* Stop all timers, 2MSL will be set lower */
-                 tcp_connection_timers_reset (tc0);
+                 if (tcp_fin (tcp0))
+                   {
+                     /* Stop all timers, 2MSL will be set lower */
+                     tcp_connection_timers_reset (tc0);
+                   }
+                 else
+                   {
+                     /* Wait for peer to finish sending its data */
+                     tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE,
+                                       TCP_2MSL_TIME);
+                   }
                }
              break;
            case TCP_STATE_FIN_WAIT_2:
@@ -2296,8 +2296,6 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
               * acknowledged ("ok") but do not delete the TCB. */
              if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
                goto drop;
-
-             /* check if rtx queue is empty and ack CLOSE TODO */
              break;
            case TCP_STATE_CLOSE_WAIT:
              /* Do the same processing as for the ESTABLISHED state. */
@@ -2311,9 +2309,9 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
                goto drop;
 
-             /* XXX test that send queue empty */
              tc0->state = TCP_STATE_TIME_WAIT;
              TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
+             tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
              goto drop;
 
              break;
@@ -2409,10 +2407,12 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              /* move along .. */
              break;
            case TCP_STATE_FIN_WAIT_1:
-             tc0->state = TCP_STATE_TIME_WAIT;
-             tcp_connection_timers_reset (tc0);
-             tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+             tc0->state = TCP_STATE_CLOSING;
+             tcp_make_ack (tc0, b0);
+             next0 = tcp_next_output (is_ip4);
              TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
+             /* Wait for ACK but not forever */
+             tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
              break;
            case TCP_STATE_FIN_WAIT_2:
              /* Got FIN, send ACK! */
@@ -2652,6 +2652,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          /* Reuse buffer to make syn-ack and send */
          tcp_make_synack (child0, b0);
          next0 = tcp_next_output (is_ip4);
+         tcp_timer_set (child0, TCP_TIMER_ESTABLISH, TCP_SYN_RCVD_TIME);
 
        drop:
          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))