tcp: retransmit and multi-buffer segment fixes and improvements
[vpp.git] / src / vnet / tcp / tcp_input.c
index d32b4fc..95f9ade 100644 (file)
@@ -492,14 +492,6 @@ tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd,
          && (prev_snd_wnd == tc->snd_wnd));
 }
 
-static u8
-tcp_is_lost_fin (tcp_connection_t * tc)
-{
-  if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1)
-    return 1;
-  return 0;
-}
-
 /**
  * Checks if ack is a congestion control event.
  */
@@ -1162,7 +1154,8 @@ partial_ack:
 
   /* Remove retransmitted bytes that have been delivered */
   ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv
-         >= tc->sack_sb.last_bytes_delivered);
+         >= tc->sack_sb.last_bytes_delivered
+         || (tc->flags & TCP_CONN_FINSNT));
 
   if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt))
     {
@@ -1273,6 +1266,8 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
   if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack))
     {
       tcp_cc_handle_event (tc, is_dack);
+      if (!tcp_in_cong_recovery (tc))
+       return 0;
       *error = TCP_ERROR_ACK_DUP;
       TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1);
       return vnet_buffer (b)->tcp.data_len ? 0 : -1;
@@ -1368,7 +1363,7 @@ always_inline int
 tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
                          u16 data_len)
 {
-  int written;
+  int written, error = TCP_ERROR_ENQUEUED;
 
   ASSERT (seq_geq (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
 
@@ -1386,12 +1381,12 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
   /* Update rcv_nxt */
   if (PREDICT_TRUE (written == data_len))
     {
-      tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end;
+      tc->rcv_nxt += written;
     }
   /* If more data written than expected, account for out-of-order bytes. */
   else if (written > data_len)
     {
-      tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end + written - data_len;
+      tc->rcv_nxt += written;
 
       /* Send ACK confirming the update */
       tc->flags |= TCP_CONN_SNDACK;
@@ -1405,7 +1400,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
        * not be enqueued. Inform peer */
       tc->flags |= TCP_CONN_SNDACK;
 
-      return TCP_ERROR_PARTIALLY_ENQUEUED;
+      error = TCP_ERROR_PARTIALLY_ENQUEUED;
     }
   else
     {
@@ -1420,7 +1415,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
       tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt);
     }
 
-  return TCP_ERROR_ENQUEUED;
+  return error;
 }
 
 /** Enqueue out-of-order data */
@@ -1497,6 +1492,34 @@ tcp_can_delack (tcp_connection_t * tc)
   return 1;
 }
 
+/**
+ * Discard bytes from the front of a (possibly chained) segment.
+ *
+ * Advances the current-data pointer of the head buffer and, when the drop
+ * count exceeds the head's payload, of as many chained buffers as needed.
+ * All length bookkeeping is applied to the head buffer: its
+ * total_length_not_including_first_buffer is reduced by the bytes removed
+ * from chained buffers and its tcp.data_len by the full drop count.
+ *
+ * @param b                head buffer of the segment
+ * @param n_bytes_to_drop  number of leading payload bytes to discard
+ * @return 0 on success, -1 if the chain does not hold n_bytes_to_drop bytes.
+ *         If the shortfall is detected on the head buffer nothing is
+ *         modified; if it is detected further down the chain, the buffers
+ *         already walked stay advanced. The visible caller abandons the
+ *         segment (goto done) on failure.
+ */
+static int
+tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop)
+{
+  vlib_main_t *vm = vlib_get_main ();
+  /* Keep the head: the loop below moves b down the chain, but the segment
+   * metadata (chain length, tcp.data_len) lives in the first buffer. */
+  vlib_buffer_t *head = b;
+  u32 first = b->current_length;
+  u32 left = n_bytes_to_drop;
+  u32 discard;
+
+  /* Handle multi-buffer segments */
+  if (n_bytes_to_drop > first)
+    {
+      if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT))
+        return -1;
+
+      for (;;)
+        {
+          discard = clib_min (left, b->current_length);
+          vlib_buffer_advance (b, discard);
+          left -= discard;
+          if (!left)
+            break;
+
+          /* Only follow next_buffer when the flag says it is valid */
+          if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT))
+            return -1;
+          b = vlib_get_buffer (vm, b->next_buffer);
+        }
+
+      /* Everything beyond the head's original payload was taken from the
+       * chained buffers. */
+      head->total_length_not_including_first_buffer -=
+        n_bytes_to_drop - first;
+    }
+  else
+    {
+      vlib_buffer_advance (b, n_bytes_to_drop);
+    }
+
+  /* Use the original count: 'left' is zero by now */
+  vnet_buffer (head)->tcp.data_len -= n_bytes_to_drop;
+  return 0;
+}
+
 static int
 tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b,
                 u32 * next0)
@@ -1530,7 +1553,8 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b,
          n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number;
          n_data_bytes -= n_bytes_to_drop;
          vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt;
-         vlib_buffer_advance (b, n_bytes_to_drop);
+         if (tcp_buffer_discard_bytes (b, n_bytes_to_drop))
+           goto done;
 
          goto in_order;
        }
@@ -1724,9 +1748,13 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
               * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
              tc0->state = TCP_STATE_CLOSE_WAIT;
              TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
-             tc0->rcv_nxt += (vnet_buffer (b0)->tcp.data_len == 0);
+             if (vnet_buffer (b0)->tcp.data_len == 0)
+               {
+                 tc0->rcv_nxt += 1;
+                 next0 = TCP_ESTABLISHED_NEXT_DROP;
+               }
              stream_session_disconnect_notify (&tc0->connection);
-             tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
+             tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
            }
 
        done:
@@ -1747,6 +1775,8 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
   errors = session_manager_flush_enqueue_events (my_thread_index);
   tcp_established_inc_counter (vm, is_ip4, TCP_ERROR_EVENT_FIFO_FULL, errors);
+  tcp_flush_frame_to_output (vm, my_thread_index, is_ip4);
+
   return from_frame->n_vectors;
 }
 
@@ -1819,7 +1849,6 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
   tcp_main_t *tm = vnet_get_tcp_main ();
   u32 n_left_from, next_index, *from, *to_next;
   u32 my_thread_index = vm->thread_index, errors = 0;
-  u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP;
 
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
@@ -1853,26 +1882,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          tc0 =
            tcp_half_open_connection_get (vnet_buffer (b0)->
                                          tcp.connection_index);
+         ASSERT (tc0);
 
          ack0 = vnet_buffer (b0)->tcp.ack_number;
          seq0 = vnet_buffer (b0)->tcp.seq_number;
          tcp0 = tcp_buffer_hdr (b0);
 
-         if (!tc0)
-           {
-             ip4_header_t *ip40 = vlib_buffer_get_current (b0);
-             tcp0 = ip4_next_header (ip40);
-             tc0 =
-               (tcp_connection_t *)
-               stream_session_lookup_transport_wt4 (&ip40->dst_address,
-                                                    &ip40->src_address,
-                                                    tcp0->dst_port,
-                                                    tcp0->src_port,
-                                                    SESSION_TYPE_IP4_TCP,
-                                                    my_thread_index);
-             ASSERT (0);
-             goto drop;
-           }
          if (PREDICT_FALSE
              (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0)))
            goto drop;
@@ -1898,7 +1913,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt)
                {
                  if (!tcp_rst (tcp0))
-                   tcp_send_reset (tc0, b0, is_ip4);
+                   tcp_send_reset_w_pkt (tc0, b0, is_ip4);
                  goto drop;
                }
 
@@ -1936,10 +1951,6 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          if (tcp_options_parse (tcp0, &tc0->rcv_opts))
            goto drop;
 
-         /* Stop connection establishment and retransmit timers */
-         tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH);
-         tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN);
-
          /* Valid SYN or SYN-ACK. Move connection from half-open pool to
           * current thread pool. */
          pool_get (tm->connections[my_thread_index], new_tc0);
@@ -1948,7 +1959,14 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          new_tc0->c_thread_index = my_thread_index;
          new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end;
          new_tc0->irs = seq0;
-         tcp_half_open_connection_del (tc0);
+         new_tc0->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID;
+         new_tc0->timers[TCP_TIMER_RETRANSMIT_SYN] =
+           TCP_TIMER_HANDLE_INVALID;
+
+         /* If this is not the owning thread, wait for syn retransmit to
+          * expire and cleanup then */
+         if (tcp_half_open_connection_cleanup (tc0))
+           tc0->flags |= TCP_CONN_HALF_OPEN_DONE;
 
          if (tcp_opts_tstamp (&new_tc0->rcv_opts))
            {
@@ -1980,11 +1998,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
              /* Notify app that we have connection. If session layer can't
               * allocate session send reset */
-             if (stream_session_connect_notify (&new_tc0->connection, sst,
-                                                0))
+             if (stream_session_connect_notify (&new_tc0->connection, 0))
                {
+                 tcp_send_reset_w_pkt (new_tc0, b0, is_ip4);
                  tcp_connection_cleanup (new_tc0);
-                 tcp_send_reset (tc0, b0, is_ip4);
                  goto drop;
                }
 
@@ -2002,11 +2019,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              new_tc0->state = TCP_STATE_SYN_RCVD;
 
              /* Notify app that we have connection */
-             if (stream_session_connect_notify
-                 (&new_tc0->connection, sst, 0))
+             if (stream_session_connect_notify (&new_tc0->connection, 0))
                {
                  tcp_connection_cleanup (new_tc0);
-                 tcp_send_reset (tc0, b0, is_ip4);
+                 tcp_send_reset_w_pkt (tc0, b0, is_ip4);
                  TCP_EVT_DBG (TCP_EVT_RST_SENT, tc0);
                  goto drop;
                }
@@ -2210,7 +2226,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
               */
              if (!tcp_rcv_ack_is_acceptable (tc0, b0))
                {
-                 tcp_send_reset (tc0, b0, is_ip4);
+                 tcp_send_reset_w_pkt (tc0, b0, is_ip4);
                  goto drop;
                }
 
@@ -2229,8 +2245,9 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
              stream_session_accept_notify (&tc0->connection);
 
-             /* Reset SYN-ACK retransmit timer */
+             /* Reset SYN-ACK retransmit and SYN_RCV establish timers */
              tcp_retransmit_timer_reset (tc0);
+             tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH);
              break;
            case TCP_STATE_ESTABLISHED:
              /* We can get packets in established state here because they
@@ -2246,15 +2263,31 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
                goto drop;
 
+             /* Still have to send the FIN */
+             if (tc0->flags & TCP_CONN_FINPNDG)
+               {
+                 /* TX fifo finally drained */
+                 if (!stream_session_tx_fifo_max_dequeue (&tc0->connection))
+                   tcp_send_fin (tc0);
+               }
              /* If FIN is ACKed */
-             if (tc0->snd_una == tc0->snd_una_max)
+             else if (tc0->snd_una == tc0->snd_una_max)
                {
-                 ASSERT (tcp_fin (tcp0));
+                 tc0->rcv_nxt += 1;
                  tc0->state = TCP_STATE_FIN_WAIT_2;
                  TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
 
-                 /* Stop all timers, 2MSL will be set lower */
-                 tcp_connection_timers_reset (tc0);
+                 if (tcp_fin (tcp0))
+                   {
+                     /* Stop all timers, 2MSL will be set lower */
+                     tcp_connection_timers_reset (tc0);
+                   }
+                 else
+                   {
+                     /* Wait for peer to finish sending its data */
+                     tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE,
+                                       TCP_2MSL_TIME);
+                   }
                }
              break;
            case TCP_STATE_FIN_WAIT_2:
@@ -2263,7 +2296,6 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
               * acknowledged ("ok") but do not delete the TCB. */
              if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
                goto drop;
-             /* check if rtx queue is empty and ack CLOSE TODO */
              break;
            case TCP_STATE_CLOSE_WAIT:
              /* Do the same processing as for the ESTABLISHED state. */
@@ -2277,9 +2309,9 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
                goto drop;
 
-             /* XXX test that send queue empty */
              tc0->state = TCP_STATE_TIME_WAIT;
              TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
+             tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
              goto drop;
 
              break;
@@ -2375,16 +2407,18 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              /* move along .. */
              break;
            case TCP_STATE_FIN_WAIT_1:
-             tc0->state = TCP_STATE_TIME_WAIT;
-             tcp_connection_timers_reset (tc0);
-             tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+             tc0->state = TCP_STATE_CLOSING;
+             tcp_make_ack (tc0, b0);
+             next0 = tcp_next_output (is_ip4);
              TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
+             /* Wait for ACK but not forever */
+             tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
              break;
            case TCP_STATE_FIN_WAIT_2:
              /* Got FIN, send ACK! */
              tc0->state = TCP_STATE_TIME_WAIT;
              tcp_connection_timers_reset (tc0);
-             tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
+             tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
              tcp_make_ack (tc0, b0);
              next0 = tcp_next_output (is_ip4);
              TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
@@ -2618,6 +2652,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          /* Reuse buffer to make syn-ack and send */
          tcp_make_synack (child0, b0);
          next0 = tcp_next_output (is_ip4);
+         tcp_timer_set (child0, TCP_TIMER_ESTABLISH, TCP_SYN_RCVD_TIME);
 
        drop:
          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
@@ -2745,7 +2780,7 @@ tcp_lookup_is_valid (tcp_connection_t * tc, tcp_header_t * hdr)
       if ((tmp =
           stream_session_half_open_lookup (&tc->c_lcl_ip, &tc->c_rmt_ip,
                                            tc->c_lcl_port, tc->c_rmt_port,
-                                           tc->c_proto)))
+                                           tc->c_transport_proto)))
        {
          if (tmp->lcl_port == hdr->dst_port
              && tmp->rmt_port == hdr->src_port)
@@ -2767,8 +2802,8 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
-
   next_index = node->cached_next_index;
+  tcp_set_time_now (my_thread_index);
 
   while (n_left_from > 0)
     {