session: first approximation implementation of tls
[vpp.git] / src / vnet / session / session_node.c
index 8d703b0..9cd0ef1 100644 (file)
 #include <math.h>
 #include <vlib/vlib.h>
 #include <vnet/vnet.h>
-#include <vnet/tcp/tcp.h>
 #include <vppinfra/elog.h>
+#include <vnet/session/transport.h>
 #include <vnet/session/application.h>
 #include <vnet/session/session_debug.h>
-#include <vlibmemory/unix_shared_memory_queue.h>
+#include <svm/queue.h>
 
 vlib_node_registration_t session_queue_node;
 
@@ -64,31 +64,29 @@ static char *session_queue_error_strings[] = {
 #undef _
 };
 
-static u32 session_type_to_next[] = {
-  SESSION_QUEUE_NEXT_TCP_IP4_OUTPUT,
-  SESSION_QUEUE_NEXT_IP4_LOOKUP,
-  SESSION_QUEUE_NEXT_TCP_IP6_OUTPUT,
-  SESSION_QUEUE_NEXT_IP6_LOOKUP,
-};
-
 always_inline void
 session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm,
                            u8 thread_index, svm_fifo_t * fifo,
                            vlib_buffer_t * b0, u32 bi0, u8 n_bufs_per_seg,
-                           u32 * left_to_snd0, u16 * n_bufs, u32 * rx_offset,
-                           u16 deq_per_buf, u8 peek_data)
+                           u32 left_from_seg, u32 * left_to_snd0,
+                           u16 * n_bufs, u32 * tx_offset, u16 deq_per_buf,
+                           u8 peek_data)
 {
   vlib_buffer_t *chain_b0, *prev_b0;
-  u32 chain_bi0;
+  u32 chain_bi0, to_deq;
   u16 len_to_deq0, n_bytes_read;
   u8 *data0, j;
 
+  b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+  b0->total_length_not_including_first_buffer = 0;
+
   chain_bi0 = bi0;
   chain_b0 = b0;
+  to_deq = left_from_seg;
   for (j = 1; j < n_bufs_per_seg; j++)
     {
       prev_b0 = chain_b0;
-      len_to_deq0 = clib_min (*left_to_snd0, deq_per_buf);
+      len_to_deq0 = clib_min (to_deq, deq_per_buf);
 
       *n_bufs -= 1;
       chain_bi0 = smm->tx_buffers[thread_index][*n_bufs];
@@ -99,8 +97,8 @@ session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm,
       data0 = vlib_buffer_get_current (chain_b0);
       if (peek_data)
        {
-         n_bytes_read = svm_fifo_peek (fifo, *rx_offset, len_to_deq0, data0);
-         *rx_offset += n_bytes_read;
+         n_bytes_read = svm_fifo_peek (fifo, *tx_offset, len_to_deq0, data0);
+         *tx_offset += n_bytes_read;
        }
       else
        {
@@ -117,10 +115,13 @@ session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm,
       /* update current buffer */
       chain_b0->next_buffer = 0;
 
-      *left_to_snd0 -= n_bytes_read;
-      if (*left_to_snd0 == 0)
+      to_deq -= n_bytes_read;
+      if (to_deq == 0)
        break;
     }
+  ASSERT (to_deq == 0
+         && b0->total_length_not_including_first_buffer == left_from_seg);
+  *left_to_snd0 -= left_from_seg;
 }
 
 always_inline int
@@ -132,21 +133,23 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
 {
   u32 n_trace = vlib_get_trace_count (vm, node);
   u32 left_to_snd0, max_len_to_snd0, len_to_deq0, snd_space0;
-  u32 n_bufs_per_evt, n_frames_per_evt;
+  u32 n_bufs_per_evt, n_frames_per_evt, n_bufs_per_frame;
   transport_connection_t *tc0;
   transport_proto_vft_t *transport_vft;
+  transport_proto_t tp;
   u32 next_index, next0, *to_next, n_left_to_next, bi0;
   vlib_buffer_t *b0;
-  u32 rx_offset = 0, max_dequeue0, n_bytes_per_seg;
+  u32 tx_offset = 0, max_dequeue0, n_bytes_per_seg, left_for_seg;
   u16 snd_mss0, n_bufs_per_seg, n_bufs;
   u8 *data0;
   int i, n_bytes_read;
-  u32 n_bytes_per_buf, deq_per_buf;
+  u32 n_bytes_per_buf, deq_per_buf, deq_per_first_buf;
   u32 buffers_allocated, buffers_allocated_this_call;
 
-  next_index = next0 = session_type_to_next[s0->session_type];
+  next_index = next0 = smm->session_type_to_next[s0->session_type];
 
-  transport_vft = session_get_transport_vft (s0->session_type);
+  tp = session_get_transport_proto (s0);
+  transport_vft = transport_protocol_get_vft (tp);
   tc0 = transport_vft->get_connection (s0->connection_index, thread_index);
 
   /* Make sure we have space to send and there's something to dequeue */
@@ -160,21 +163,25 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
       return 0;
     }
 
+  /* Allow enqueuing of a new event */
+  svm_fifo_unset_event (s0->server_tx_fifo);
+
+  /* Check how much we can pull. */
+  max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo);
+
   if (peek_data)
     {
-      /* Offset in rx fifo from where to peek data  */
-      rx_offset = transport_vft->tx_fifo_offset (tc0);
+      /* Offset in tx fifo from where to peek data */
+      tx_offset = transport_vft->tx_fifo_offset (tc0);
+      if (PREDICT_FALSE (tx_offset >= max_dequeue0))
+       max_dequeue0 = 0;
+      else
+       max_dequeue0 -= tx_offset;
     }
 
-  /* Check how much we can pull. If buffering, subtract the offset */
-  max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo) - rx_offset;
-
   /* Nothing to read return */
   if (max_dequeue0 == 0)
-    {
-      svm_fifo_unset_event (s0->server_tx_fifo);
-      return 0;
-    }
+    return 0;
 
   /* Ensure we're not writing more than transport window allows */
   if (max_dequeue0 < snd_space0)
@@ -186,57 +193,60 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
     }
   else
     {
+      /* Expectation is that snd_space0 is already a multiple of snd_mss */
       max_len_to_snd0 = snd_space0;
     }
 
   n_bytes_per_buf = vlib_buffer_free_list_buffer_size
     (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+  ASSERT (n_bytes_per_buf > MAX_HDRS_LEN);
   n_bytes_per_seg = MAX_HDRS_LEN + snd_mss0;
   n_bufs_per_seg = ceil ((double) n_bytes_per_seg / n_bytes_per_buf);
-  n_bufs_per_evt = (ceil ((double) max_len_to_snd0 / n_bytes_per_seg))
-    * n_bufs_per_seg;
+  n_bufs_per_evt = ceil ((double) max_len_to_snd0 / n_bytes_per_seg);
   n_frames_per_evt = ceil ((double) n_bufs_per_evt / VLIB_FRAME_SIZE);
+  n_bufs_per_frame = n_bufs_per_seg * VLIB_FRAME_SIZE;
 
   deq_per_buf = clib_min (snd_mss0, n_bytes_per_buf);
+  deq_per_first_buf = clib_min (snd_mss0, n_bytes_per_buf - MAX_HDRS_LEN);
 
   n_bufs = vec_len (smm->tx_buffers[thread_index]);
   left_to_snd0 = max_len_to_snd0;
   for (i = 0; i < n_frames_per_evt; i++)
     {
       /* Make sure we have at least one full frame of buffers ready */
-      if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE))
+      if (PREDICT_FALSE (n_bufs < n_bufs_per_frame))
        {
          vec_validate (smm->tx_buffers[thread_index],
-                       n_bufs + 2 * VLIB_FRAME_SIZE - 1);
-
+                       n_bufs + n_bufs_per_frame - 1);
          buffers_allocated = 0;
          do
            {
-             buffers_allocated_this_call =
-               vlib_buffer_alloc
-               (vm,
-                &smm->tx_buffers[thread_index][n_bufs + buffers_allocated],
-                2 * VLIB_FRAME_SIZE - buffers_allocated);
+             buffers_allocated_this_call = vlib_buffer_alloc (vm,
+                                                              &smm->tx_buffers
+                                                              [thread_index]
+                                                              [n_bufs +
+                                                               buffers_allocated],
+                                                              n_bufs_per_frame
+                                                              -
+                                                              buffers_allocated);
              buffers_allocated += buffers_allocated_this_call;
            }
          while (buffers_allocated_this_call > 0
-                && ((buffers_allocated + n_bufs < VLIB_FRAME_SIZE)));
+                && ((buffers_allocated + n_bufs < n_bufs_per_frame)));
 
          n_bufs += buffers_allocated;
-
          _vec_len (smm->tx_buffers[thread_index]) = n_bufs;
 
-         if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE))
+         if (PREDICT_FALSE (n_bufs < n_bufs_per_frame))
            {
              vec_add1 (smm->pending_event_vector[thread_index], *e0);
              return -1;
            }
+         ASSERT (n_bufs >= n_bufs_per_frame);
        }
-      /* Allow enqueuing of a new event */
-      svm_fifo_unset_event (s0->server_tx_fifo);
 
       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
-      while (left_to_snd0 && n_left_to_next >= n_bufs_per_seg)
+      while (left_to_snd0 && n_left_to_next)
        {
          /*
           * Handle first buffer in chain separately
@@ -245,7 +255,6 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
          /* Get free buffer */
          ASSERT (n_bufs >= 1);
          bi0 = smm->tx_buffers[thread_index][--n_bufs];
-         ASSERT (bi0);
          _vec_len (smm->tx_buffers[thread_index]) = n_bufs;
 
          /* usual speculation, or the enqueue_x1 macro will barf */
@@ -255,31 +264,30 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
 
          b0 = vlib_get_buffer (vm, bi0);
          b0->error = 0;
-         b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID
-           | VNET_BUFFER_F_LOCALLY_ORIGINATED;
+         b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
          b0->current_data = 0;
          b0->total_length_not_including_first_buffer = 0;
 
-         len_to_deq0 = clib_min (left_to_snd0, deq_per_buf);
-
+         len_to_deq0 = clib_min (left_to_snd0, deq_per_first_buf);
          data0 = vlib_buffer_make_headroom (b0, MAX_HDRS_LEN);
          if (peek_data)
            {
-             n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, rx_offset,
+             n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, tx_offset,
                                            len_to_deq0, data0);
+             if (n_bytes_read <= 0)
+               goto dequeue_fail;
              /* Keep track of progress locally, transport is also supposed to
               * increment it independently when pushing the header */
-             rx_offset += n_bytes_read;
+             tx_offset += n_bytes_read;
            }
          else
            {
              n_bytes_read = svm_fifo_dequeue_nowait (s0->server_tx_fifo,
                                                      len_to_deq0, data0);
+             if (n_bytes_read <= 0)
+               goto dequeue_fail;
            }
 
-         if (n_bytes_read <= 0)
-           goto dequeue_fail;
-
          b0->current_length = n_bytes_read;
 
          left_to_snd0 -= n_bytes_read;
@@ -288,12 +296,15 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
          /*
           * Fill in the remaining buffers in the chain, if any
           */
-         if (PREDICT_FALSE (n_bufs_per_seg > 1))
-           session_tx_fifo_chain_tail (smm, vm, thread_index,
-                                       s0->server_tx_fifo, b0, bi0,
-                                       n_bufs_per_seg, &left_to_snd0,
-                                       &n_bufs, &rx_offset, deq_per_buf,
-                                       peek_data);
+         if (PREDICT_FALSE (n_bufs_per_seg > 1 && left_to_snd0))
+           {
+             left_for_seg = clib_min (snd_mss0 - n_bytes_read, left_to_snd0);
+             session_tx_fifo_chain_tail (smm, vm, thread_index,
+                                         s0->server_tx_fifo, b0, bi0,
+                                         n_bufs_per_seg, left_for_seg,
+                                         &left_to_snd0, &n_bufs, &tx_offset,
+                                         deq_per_buf, peek_data);
+           }
 
          /* Ask transport to push header after current_length and
           * total_length_not_including_first_buffer are updated */
@@ -301,15 +312,15 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
 
          /* *INDENT-OFF* */
          SESSION_EVT_DBG(SESSION_EVT_DEQ, s0, ({
-             ed->data[0] = e0->event_id;
+             ed->data[0] = e0->event_type;
              ed->data[1] = max_dequeue0;
              ed->data[2] = len_to_deq0;
              ed->data[3] = left_to_snd0;
          }));
          /* *INDENT-ON* */
 
-
          VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
+
          if (PREDICT_FALSE (n_trace > 0))
            {
              session_queue_trace_t *t0;
@@ -378,12 +389,24 @@ session_tx_fifo_dequeue_and_snd (vlib_main_t * vm, vlib_node_runtime_t * node,
                                         n_tx_pkts, 0);
 }
 
+int
+session_tx_fifo_dequeue_internal (vlib_main_t * vm,
+                                 vlib_node_runtime_t * node,
+                                 session_manager_main_t * smm,
+                                 session_fifo_event_t * e0,
+                                 stream_session_t * s0, u32 thread_index,
+                                 int *n_tx_pkts)
+{                              /* Tx handler that builds no buffers itself: it clears the
+                                * tx fifo event and hands the session to the
+                                * builtin_app_tx_callback of the application looked up
+                                * through s0->opaque, returning the callback's result.
+                                * vm, node, smm, e0, thread_index and n_tx_pkts are not
+                                * used here; *n_tx_pkts is left unchanged. */
+  application_t *app;
+  app = application_get (s0->opaque);  /* s0->opaque is passed as the application index */
+  svm_fifo_unset_event (s0->server_tx_fifo);   /* cleared before the callback is invoked */
+  return app->cb_fns.builtin_app_tx_callback (s0);
+}
+
 always_inline stream_session_t *
 session_event_get_session (session_fifo_event_t * e, u8 thread_index)
 {
-  ASSERT (e->fifo->master_thread_index == thread_index);
-  return stream_session_get_if_valid (e->fifo->master_session_index,
-                                     thread_index);
+  return session_get_if_valid (e->fifo->master_session_index, thread_index);
 }
 
 void
@@ -397,7 +420,7 @@ dump_thread_0_event_queue (void)
   int i, index;
   i8 *headp;
 
-  unix_shared_memory_queue_t *q;
+  svm_queue_t *q;
   q = smm->vpp_event_queues[my_thread_index];
 
   index = q->head;
@@ -415,7 +438,7 @@ dump_thread_0_event_queue (void)
          break;
 
        case FIFO_EVENT_DISCONNECT:
-         s0 = stream_session_get_from_handle (e->session_handle);
+         s0 = session_get_from_handle (e->session_handle);
          fformat (stdout, "[%04d] disconnect session %d\n", i,
                   s0->session_index);
          break;
@@ -458,7 +481,7 @@ session_node_cmp_event (session_fifo_event_t * e, svm_fifo_t * f)
     case FIFO_EVENT_DISCONNECT:
       break;
     case FIFO_EVENT_RPC:
-      s = stream_session_get_from_handle (e->session_handle);
+      s = session_get_from_handle (e->session_handle);
       if (!s)
        {
          clib_warning ("session has event but doesn't exist!");
@@ -477,7 +500,7 @@ u8
 session_node_lookup_fifo_event (svm_fifo_t * f, session_fifo_event_t * e)
 {
   session_manager_main_t *smm = vnet_get_session_manager_main ();
-  unix_shared_memory_queue_t *q;
+  svm_queue_t *q;
   session_fifo_event_t *pending_event_vector, *evt;
   int i, index, found = 0;
   i8 *headp;
@@ -496,7 +519,7 @@ session_node_lookup_fifo_event (svm_fifo_t * f, session_fifo_event_t * e)
       clib_memcpy (e, headp, q->elsize);
       found = session_node_cmp_event (e, f);
       if (found)
-       break;
+       return 1;
       if (++index == q->maxsize)
        index = 0;
     }
@@ -521,10 +544,10 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
                       vlib_frame_t * frame)
 {
   session_manager_main_t *smm = vnet_get_session_manager_main ();
-  session_fifo_event_t *my_pending_event_vector, *e;
+  session_fifo_event_t *my_pending_event_vector, *pending_disconnects, *e;
   session_fifo_event_t *my_fifo_events;
   u32 n_to_dequeue, n_events;
-  unix_shared_memory_queue_t *q;
+  svm_queue_t *q;
   application_t *app;
   int n_tx_packets = 0;
   u32 my_thread_index = vm->thread_index;
@@ -535,9 +558,9 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   SESSION_EVT_DBG (SESSION_EVT_POLL_GAP_TRACK, smm, my_thread_index);
 
   /*
-   *  Update TCP time
+   *  Update transport time
    */
-  tcp_update_time (now, my_thread_index);
+  transport_update_time (now, my_thread_index);
 
   /*
    * Get vpp queue events
@@ -551,8 +574,10 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   /* min number of events we can dequeue without blocking */
   n_to_dequeue = q->cursize;
   my_pending_event_vector = smm->pending_event_vector[my_thread_index];
+  pending_disconnects = smm->pending_disconnects[my_thread_index];
 
-  if (n_to_dequeue == 0 && vec_len (my_pending_event_vector) == 0)
+  if (!n_to_dequeue && !vec_len (my_pending_event_vector)
+      && !vec_len (pending_disconnects))
     return 0;
 
   SESSION_EVT_DBG (SESSION_EVT_DEQ_NODE, 0);
@@ -575,7 +600,7 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   for (i = 0; i < n_to_dequeue; i++)
     {
       vec_add2 (my_fifo_events, e, 1);
-      unix_shared_memory_queue_sub_raw (q, (u8 *) e);
+      svm_queue_sub_raw (q, (u8 *) e);
     }
 
   /* The other side of the connection is not polling */
@@ -584,9 +609,11 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   pthread_mutex_unlock (&q->mutex);
 
   vec_append (my_fifo_events, my_pending_event_vector);
+  vec_append (my_fifo_events, smm->pending_disconnects[my_thread_index]);
 
   _vec_len (my_pending_event_vector) = 0;
   smm->pending_event_vector[my_thread_index] = my_pending_event_vector;
+  _vec_len (smm->pending_disconnects[my_thread_index]) = 0;
 
 skip_dequeue:
   n_events = vec_len (my_fifo_events);
@@ -602,14 +629,18 @@ skip_dequeue:
        case FIFO_EVENT_APP_TX:
          s0 = session_event_get_session (e0, my_thread_index);
 
-         if (CLIB_DEBUG && !s0)
+         if (PREDICT_FALSE (!s0))
            {
              clib_warning ("It's dead, Jim!");
              continue;
            }
-
-         if (PREDICT_FALSE (s0->session_state == SESSION_STATE_CLOSED))
-           continue;
+         /* Can retransmit for closed sessions but can't send anything if
+          * the session is not ready yet, so postpone the event */
+         if (PREDICT_FALSE (s0->session_state < SESSION_STATE_READY))
+           {
+             vec_add1 (smm->pending_event_vector[my_thread_index], *e0);
+             continue;
+           }
          /* Spray packets in per session type frames, since they go to
           * different nodes */
          rv = (smm->session_tx_fns[s0->session_type]) (vm, node, smm, e0, s0,
@@ -624,14 +655,23 @@ skip_dequeue:
            }
          break;
        case FIFO_EVENT_DISCONNECT:
-         s0 = stream_session_get_from_handle (e0->session_handle);
-         stream_session_disconnect (s0);
+         /* Make sure disconnects run after the pending list is drained */
+         if (!e0->postponed)
+           {
+             e0->postponed = 1;
+             vec_add1 (smm->pending_disconnects[my_thread_index], *e0);
+             continue;
+           }
+         s0 = session_get_from_handle (e0->session_handle);
+         stream_session_disconnect_transport (s0);
          break;
        case FIFO_EVENT_BUILTIN_RX:
          s0 = session_event_get_session (e0, my_thread_index);
+         if (PREDICT_FALSE (!s0))
+           continue;
          svm_fifo_unset_event (s0->server_rx_fifo);
          app = application_get (s0->app_index);
-         app->cb_fns.builtin_server_rx_callback (s0);
+         app->cb_fns.builtin_app_rx_callback (s0);
          break;
        case FIFO_EVENT_RPC:
          fp = e0->rpc_args.fp;
@@ -663,19 +703,29 @@ VLIB_REGISTER_NODE (session_queue_node) =
   .type = VLIB_NODE_TYPE_INPUT,
   .n_errors = ARRAY_LEN (session_queue_error_strings),
   .error_strings = session_queue_error_strings,
-  .n_next_nodes = SESSION_QUEUE_N_NEXT,
   .state = VLIB_NODE_STATE_DISABLED,
-  .next_nodes =
-  {
-      [SESSION_QUEUE_NEXT_DROP] = "error-drop",
-      [SESSION_QUEUE_NEXT_IP4_LOOKUP] = "ip4-lookup",
-      [SESSION_QUEUE_NEXT_IP6_LOOKUP] = "ip6-lookup",
-      [SESSION_QUEUE_NEXT_TCP_IP4_OUTPUT] = "tcp4-output",
-      [SESSION_QUEUE_NEXT_TCP_IP6_OUTPUT] = "tcp6-output",
-  },
 };
 /* *INDENT-ON* */
 
+static clib_error_t *
+session_queue_exit (vlib_main_t * vm)
+{
+  if (vec_len (vlib_mains) < 2)
+    return 0;                  /* fewer than two vlib mains: nothing to shut off */
+
+  /*
+   * Shut off (especially) worker-thread session nodes.
+   * Otherwise, vpp can crash as the main thread unmaps the
+   * API segment.
+   *
+   * The disable call is bracketed by the worker thread barrier
+   * sync/release pair. This function always returns 0, i.e. it
+   * never reports an error.
+   */
+  vlib_worker_thread_barrier_sync (vm);
+  session_node_enable_disable (0 /* is_enable */ );
+  vlib_worker_thread_barrier_release (vm);
+  return 0;
+}
+
+VLIB_MAIN_LOOP_EXIT_FUNCTION (session_queue_exit);
+
 /*
  * fd.io coding-style-patch-verification: ON
  *