tcp: force deschedule if no send space available
[vpp.git] / src / vnet / tcp / tcp_input.c
index f981351..bac4147 100755 (executable)
@@ -266,6 +266,98 @@ tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end)
     }
 }
 
+static void
+tcp_handle_rst (tcp_connection_t * tc)
+{
+  switch (tc->rst_state)
+    {
+    case TCP_STATE_SYN_RCVD:
+      /* Cleanup everything. App wasn't notified yet */
+      session_transport_delete_notify (&tc->connection);
+      tcp_connection_cleanup (tc);
+      break;
+    case TCP_STATE_SYN_SENT:
+      session_stream_connect_notify (&tc->connection, 1 /* fail */ );
+      tcp_connection_cleanup (tc);
+      break;
+    case TCP_STATE_ESTABLISHED:
+      session_transport_reset_notify (&tc->connection);
+      session_transport_closed_notify (&tc->connection);
+      break;
+    case TCP_STATE_CLOSE_WAIT:
+    case TCP_STATE_FIN_WAIT_1:
+    case TCP_STATE_FIN_WAIT_2:
+    case TCP_STATE_CLOSING:
+    case TCP_STATE_LAST_ACK:
+      session_transport_closed_notify (&tc->connection);
+      break;
+    case TCP_STATE_CLOSED:
+    case TCP_STATE_TIME_WAIT:
+      break;
+    default:
+      TCP_DBG ("reset state: %u", tc->state);
+    }
+}
+
+static void
+tcp_program_reset_ntf (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
+{
+  if (!tcp_disconnect_pending (tc))
+    {
+      tc->rst_state = tc->state;
+      vec_add1 (wrk->pending_resets, tc->c_c_index);
+      tcp_disconnect_pending_on (tc);
+    }
+}
+
+/**
+ * Handle reset packet
+ *
+ * Programs disconnect/reset notification that should be sent
+ * later by calling @ref tcp_handle_disconnects
+ */
+static void
+tcp_rcv_rst (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
+{
+  TCP_EVT (TCP_EVT_RST_RCVD, tc);
+  switch (tc->state)
+    {
+    case TCP_STATE_SYN_RCVD:
+      tcp_program_reset_ntf (wrk, tc);
+      tcp_connection_set_state (tc, TCP_STATE_CLOSED);
+      break;
+    case TCP_STATE_SYN_SENT:
+      /* Do not program ntf because the connection is half-open */
+      tcp_handle_rst (tc);
+      break;
+    case TCP_STATE_ESTABLISHED:
+      tcp_connection_timers_reset (tc);
+      tcp_cong_recovery_off (tc);
+      tcp_program_reset_ntf (wrk, tc);
+      tcp_connection_set_state (tc, TCP_STATE_CLOSED);
+      tcp_program_cleanup (wrk, tc);
+      break;
+    case TCP_STATE_CLOSE_WAIT:
+    case TCP_STATE_FIN_WAIT_1:
+    case TCP_STATE_FIN_WAIT_2:
+    case TCP_STATE_CLOSING:
+    case TCP_STATE_LAST_ACK:
+      tcp_connection_timers_reset (tc);
+      tcp_cong_recovery_off (tc);
+      tcp_program_reset_ntf (wrk, tc);
+      /* Make sure we mark the session as closed. In some states we may
+       * be still trying to send data */
+      tcp_connection_set_state (tc, TCP_STATE_CLOSED);
+      tcp_program_cleanup (wrk, tc);
+      break;
+    case TCP_STATE_CLOSED:
+    case TCP_STATE_TIME_WAIT:
+      break;
+    default:
+      TCP_DBG ("reset state: %u", tc->state);
+    }
+}
+
 /**
  * Validate incoming segment as per RFC793 p. 69 and RFC1323 p. 19
  *
@@ -392,7 +484,7 @@ tcp_segment_validate (tcp_worker_ctx_t * wrk, tcp_connection_t * tc0,
   /* 2nd: check the RST bit */
   if (PREDICT_FALSE (tcp_rst (th0)))
     {
-      tcp_connection_reset (tc0);
+      tcp_rcv_rst (wrk, tc0);
       *error0 = TCP_ERROR_RST_RCVD;
       goto error;
     }
@@ -608,6 +700,9 @@ tcp_handle_postponed_dequeues (tcp_worker_ctx_t * wrk)
            tc->flags &= ~TCP_CONN_PSH_PENDING;
        }
 
+      if (tcp_is_descheduled (tc))
+       tcp_reschedule (tc);
+
       /* If everything has been acked, stop retransmit timer
        * otherwise update. */
       tcp_retransmit_timer_update (tc);
@@ -1002,9 +1097,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
     {
       if (seq_lt (blk->start, blk->end)
          && seq_gt (blk->start, tc->snd_una)
-         && seq_gt (blk->start, ack)
-         && seq_lt (blk->start, tc->snd_nxt)
-         && seq_leq (blk->end, tc->snd_nxt))
+         && seq_gt (blk->start, ack) && seq_leq (blk->end, tc->snd_nxt))
        {
          blk++;
          continue;
@@ -1222,7 +1315,12 @@ tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd)
        }
       else
        {
-         tcp_persist_timer_reset (tc);
+         if (PREDICT_FALSE (tcp_timer_is_active (tc, TCP_TIMER_PERSIST)))
+           tcp_persist_timer_reset (tc);
+
+         if (PREDICT_FALSE (tcp_is_descheduled (tc)))
+           tcp_reschedule (tc);
+
          if (PREDICT_FALSE (!tcp_in_recovery (tc) && tc->rto_boff > 0))
            {
              tc->rto_boff = 0;
@@ -1373,6 +1471,7 @@ tcp_cc_recover (tcp_connection_t * tc)
   ASSERT (tc->rto_boff == 0);
   ASSERT (!tcp_in_cong_recovery (tc));
   ASSERT (tcp_scoreboard_is_sane_post_recovery (tc));
+
   return is_spurious;
 }
 
@@ -1676,22 +1775,35 @@ tcp_program_disconnect (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
 static void
 tcp_handle_disconnects (tcp_worker_ctx_t * wrk)
 {
-  u32 thread_index, *pending_disconnects;
+  u32 thread_index, *pending_disconnects, *pending_resets;
   tcp_connection_t *tc;
   int i;
 
-  if (!vec_len (wrk->pending_disconnects))
-    return;
+  if (vec_len (wrk->pending_disconnects))
+    {
+      thread_index = wrk->vm->thread_index;
+      pending_disconnects = wrk->pending_disconnects;
+      for (i = 0; i < vec_len (pending_disconnects); i++)
+       {
+         tc = tcp_connection_get (pending_disconnects[i], thread_index);
+         tcp_disconnect_pending_off (tc);
+         session_transport_closing_notify (&tc->connection);
+       }
+      _vec_len (wrk->pending_disconnects) = 0;
+    }
 
-  thread_index = wrk->vm->thread_index;
-  pending_disconnects = wrk->pending_disconnects;
-  for (i = 0; i < vec_len (pending_disconnects); i++)
+  if (vec_len (wrk->pending_resets))
     {
-      tc = tcp_connection_get (pending_disconnects[i], thread_index);
-      tcp_disconnect_pending_off (tc);
-      session_transport_closing_notify (&tc->connection);
+      thread_index = wrk->vm->thread_index;
+      pending_resets = wrk->pending_resets;
+      for (i = 0; i < vec_len (pending_resets); i++)
+       {
+         tc = tcp_connection_get (pending_resets[i], thread_index);
+         tcp_disconnect_pending_off (tc);
+         tcp_handle_rst (tc);
+       }
+      _vec_len (wrk->pending_resets) = 0;
     }
-  _vec_len (wrk->pending_disconnects) = 0;
 }
 
 static void
@@ -1972,7 +2084,8 @@ tcp_segment_rcv (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
           * retransmissions since we may not have any data to send */
          if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt))
            {
-             tcp_program_ack (tc);
+             tcp_program_dupack (tc);
+             tc->errors.below_data_wnd++;
              error = TCP_ERROR_SEGMENT_OLD;
              goto done;
            }
@@ -2556,7 +2669,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          /* If ACK is acceptable, signal client that peer is not
           * willing to accept connection and drop connection*/
          if (tcp_ack (tcp0))
-           tcp_connection_reset (tc0);
+           tcp_rcv_rst (wrk, tc0);
          error0 = TCP_ERROR_RST_RCVD;
          goto drop;
        }
@@ -2701,6 +2814,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                                              my_thread_index);
   tcp_inc_counter (syn_sent, TCP_ERROR_MSG_QUEUE_FULL, errors);
   vlib_buffer_free (vm, first_buffer, from_frame->n_vectors);
+  tcp_handle_disconnects (wrk);
 
   return from_frame->n_vectors;
 }
@@ -2837,7 +2951,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          /* Make sure the segment is exactly right */
          if (tc0->rcv_nxt != vnet_buffer (b0)->tcp.seq_number || is_fin0)
            {
-             tcp_connection_reset (tc0);
+             tcp_send_reset_w_pkt (tc0, b0, thread_index, is_ip4);
              error0 = TCP_ERROR_SEGMENT_INVALID;
              goto drop;
            }
@@ -2850,7 +2964,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
           */
          if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
            {
-             tcp_connection_reset (tc0);
+             tcp_send_reset_w_pkt (tc0, b0, thread_index, is_ip4);
+             error0 = TCP_ERROR_SEGMENT_INVALID;
              goto drop;
            }
 
@@ -2877,7 +2992,9 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          if (session_stream_accept_notify (&tc0->connection))
            {
              error0 = TCP_ERROR_MSG_QUEUE_FULL;
-             tcp_connection_reset (tc0);
+             tcp_send_reset (tc0);
+             session_transport_delete_notify (&tc0->connection);
+             tcp_connection_cleanup (tc0);
              goto drop;
            }
          error0 = TCP_ERROR_ACK_OK;
@@ -2920,9 +3037,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              if (tc0->flags & TCP_CONN_FINRCVD)
                {
                  tcp_connection_set_state (tc0, TCP_STATE_CLOSED);
-                 tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE,
-                                tcp_cfg.cleanup_time);
                  session_transport_closed_notify (&tc0->connection);
+                 tcp_program_cleanup (wrk, tc0);
                  goto drop;
                }
 
@@ -3003,7 +3119,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
           * we can't ensure that we have no packets already enqueued
           * to output. Rely instead on the waitclose timer */
          tcp_connection_timers_reset (tc0);
-         tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.cleanup_time);
+         tcp_program_cleanup (tcp_get_worker (tc0->c_thread_index), tc0);
 
          goto drop;
 
@@ -3594,14 +3710,10 @@ tcp_input_dispatch_buffer (tcp_main_t * tm, tcp_connection_t * tc,
 
   if (PREDICT_FALSE (error != TCP_ERROR_NONE))
     {
-      /* Overload tcp flags to store state */
-      tcp_state_t state = tc->state;
-      vnet_buffer (b)->tcp.flags = tc->state;
-
       b->error = error_node->errors[error];
       if (error == TCP_ERROR_DISPATCH)
        clib_warning ("tcp conn %u disp error state %U flags %U",
-                     tc->c_c_index, format_tcp_state, state,
+                     tc->c_c_index, format_tcp_state, tc->state,
                      format_tcp_flags, (int) flags);
     }
 }