udp/session: refactor to support dgram mode
[vpp.git] / src / vnet / session / session_node.c
index 7811617..1471696 100644 (file)
 #include <math.h>
 #include <vlib/vlib.h>
 #include <vnet/vnet.h>
-#include <vnet/tcp/tcp.h>
 #include <vppinfra/elog.h>
+#include <vnet/session/transport.h>
 #include <vnet/session/application.h>
 #include <vnet/session/session_debug.h>
-#include <vlibmemory/unix_shared_memory_queue.h>
+#include <svm/queue.h>
 
 vlib_node_registration_t session_queue_node;
 
@@ -64,20 +64,13 @@ static char *session_queue_error_strings[] = {
 #undef _
 };
 
-static u32 session_type_to_next[] = {
-  SESSION_QUEUE_NEXT_TCP_IP4_OUTPUT,
-  SESSION_QUEUE_NEXT_IP4_LOOKUP,
-  SESSION_QUEUE_NEXT_TCP_IP6_OUTPUT,
-  SESSION_QUEUE_NEXT_IP6_LOOKUP,
-};
-
 always_inline void
 session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm,
                            u8 thread_index, svm_fifo_t * fifo,
                            vlib_buffer_t * b0, u32 bi0, u8 n_bufs_per_seg,
                            u32 left_from_seg, u32 * left_to_snd0,
                            u16 * n_bufs, u32 * tx_offset, u16 deq_per_buf,
-                           u8 peek_data)
+                           u8 peek_data, transport_tx_fn_type_t tx_type)
 {
   vlib_buffer_t *chain_b0, *prev_b0;
   u32 chain_bi0, to_deq;
@@ -109,7 +102,23 @@ session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm,
        }
       else
        {
-         n_bytes_read = svm_fifo_dequeue_nowait (fifo, len_to_deq0, data0);
+         if (tx_type == TRANSPORT_TX_DGRAM)
+           {
+             session_dgram_hdr_t *hdr;
+             u16 deq_now;
+             hdr = (session_dgram_hdr_t *) svm_fifo_head (fifo);
+             deq_now = clib_min (hdr->data_length - hdr->data_offset,
+                                 len_to_deq0);
+             n_bytes_read = svm_fifo_peek (fifo, hdr->data_offset, deq_now,
+                                           data0);
+             ASSERT (n_bytes_read > 0);
+
+             hdr->data_offset += n_bytes_read;
+             if (hdr->data_offset == hdr->data_length)
+               svm_fifo_dequeue_drop (fifo, hdr->data_length);
+           }
+         else
+           n_bytes_read = svm_fifo_dequeue_nowait (fifo, len_to_deq0, data0);
        }
       ASSERT (n_bytes_read == len_to_deq0);
       chain_b0->current_length = n_bytes_read;
@@ -143,6 +152,7 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
   u32 n_bufs_per_evt, n_frames_per_evt, n_bufs_per_frame;
   transport_connection_t *tc0;
   transport_proto_vft_t *transport_vft;
+  transport_proto_t tp;
   u32 next_index, next0, *to_next, n_left_to_next, bi0;
   vlib_buffer_t *b0;
   u32 tx_offset = 0, max_dequeue0, n_bytes_per_seg, left_for_seg;
@@ -150,12 +160,36 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
   u8 *data0;
   int i, n_bytes_read;
   u32 n_bytes_per_buf, deq_per_buf, deq_per_first_buf;
-  u32 buffers_allocated, buffers_allocated_this_call;
-
-  next_index = next0 = session_type_to_next[s0->session_type];
+  u32 bufs_alloc, bufs_now;
+  session_dgram_hdr_t hdr;
 
-  transport_vft = transport_protocol_get_vft (s0->session_type);
-  tc0 = transport_vft->get_connection (s0->connection_index, thread_index);
+  next_index = next0 = smm->session_type_to_next[s0->session_type];
+  tp = session_get_transport_proto (s0);
+  transport_vft = transport_protocol_get_vft (tp);
+  if (peek_data)
+    {
+      if (PREDICT_FALSE (s0->session_state < SESSION_STATE_READY))
+       {
+         /* Can retransmit for closed sessions but can't send new data if
+          * session is not ready or closed */
+         vec_add1 (smm->pending_event_vector[thread_index], *e0);
+         return 0;
+       }
+      tc0 =
+       transport_vft->get_connection (s0->connection_index, thread_index);
+    }
+  else
+    {
+      if (s0->session_state == SESSION_STATE_LISTENING)
+       tc0 = transport_vft->get_listener (s0->connection_index);
+      else
+       {
+         if (PREDICT_FALSE (s0->session_state == SESSION_STATE_CLOSED))
+           return 0;
+         tc0 = transport_vft->get_connection (s0->connection_index,
+                                              thread_index);
+       }
+    }
 
   /* Make sure we have space to send and there's something to dequeue */
   snd_mss0 = transport_vft->send_mss (tc0);
@@ -173,20 +207,26 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
 
   /* Check how much we can pull. */
   max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo);
-
   if (peek_data)
     {
       /* Offset in rx fifo from where to peek data */
       tx_offset = transport_vft->tx_fifo_offset (tc0);
       if (PREDICT_FALSE (tx_offset >= max_dequeue0))
-       max_dequeue0 = 0;
-      else
-       max_dequeue0 -= tx_offset;
+       return 0;
+      max_dequeue0 -= tx_offset;
     }
-
-  /* Nothing to read return */
-  if (max_dequeue0 == 0)
-    return 0;
+  else
+    {
+      if (transport_vft->tx_type == TRANSPORT_TX_DGRAM)
+       {
+         if (max_dequeue0 < sizeof (hdr))
+           return 0;
+         svm_fifo_peek (s0->server_tx_fifo, 0, sizeof (hdr), (u8 *) & hdr);
+         ASSERT (hdr.data_length > hdr.data_offset);
+         max_dequeue0 = hdr.data_length - hdr.data_offset;
+       }
+    }
+  ASSERT (max_dequeue0 > 0);
 
   /* Ensure we're not writing more than transport window allows */
   if (max_dequeue0 < snd_space0)
@@ -223,23 +263,19 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
        {
          vec_validate (smm->tx_buffers[thread_index],
                        n_bufs + n_bufs_per_frame - 1);
-         buffers_allocated = 0;
+         bufs_alloc = 0;
          do
            {
-             buffers_allocated_this_call = vlib_buffer_alloc (vm,
-                                                              &smm->tx_buffers
-                                                              [thread_index]
-                                                              [n_bufs +
-                                                               buffers_allocated],
-                                                              n_bufs_per_frame
-                                                              -
-                                                              buffers_allocated);
-             buffers_allocated += buffers_allocated_this_call;
+             bufs_now =
+               vlib_buffer_alloc (vm,
+                                  &smm->tx_buffers[thread_index][n_bufs +
+                                                                 bufs_alloc],
+                                  n_bufs_per_frame - bufs_alloc);
+             bufs_alloc += bufs_now;
            }
-         while (buffers_allocated_this_call > 0
-                && ((buffers_allocated + n_bufs < n_bufs_per_frame)));
+         while (bufs_now > 0 && ((bufs_alloc + n_bufs < n_bufs_per_frame)));
 
-         n_bufs += buffers_allocated;
+         n_bufs += bufs_alloc;
          _vec_len (smm->tx_buffers[thread_index]) = n_bufs;
 
          if (PREDICT_FALSE (n_bufs < n_bufs_per_frame))
@@ -257,6 +293,15 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
           * Handle first buffer in chain separately
           */
 
+         len_to_deq0 = clib_min (left_to_snd0, deq_per_first_buf);
+         if (left_to_snd0 > len_to_deq0 && n_left_to_next > 1)
+           {
+             vlib_buffer_t *pb;
+             u32 pbi = smm->tx_buffers[thread_index][n_bufs - 2];
+             pb = vlib_get_buffer (vm, pbi);
+             vlib_prefetch_buffer_header (pb, LOAD);
+           }
+
          /* Get free buffer */
          ASSERT (n_bufs >= 1);
          bi0 = smm->tx_buffers[thread_index][--n_bufs];
@@ -269,11 +314,10 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
 
          b0 = vlib_get_buffer (vm, bi0);
          b0->error = 0;
-         b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
+         b0->flags = VNET_BUFFER_F_LOCALLY_ORIGINATED;
          b0->current_data = 0;
          b0->total_length_not_including_first_buffer = 0;
 
-         len_to_deq0 = clib_min (left_to_snd0, deq_per_first_buf);
          data0 = vlib_buffer_make_headroom (b0, MAX_HDRS_LEN);
          if (peek_data)
            {
@@ -287,14 +331,42 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
            }
          else
            {
-             n_bytes_read = svm_fifo_dequeue_nowait (s0->server_tx_fifo,
-                                                     len_to_deq0, data0);
-             if (n_bytes_read <= 0)
-               goto dequeue_fail;
+             if (transport_vft->tx_type == TRANSPORT_TX_DGRAM)
+               {
+                 svm_fifo_t *f = s0->server_tx_fifo;
+                 u16 deq_now;
+                 u32 offset;
+
+                 ASSERT (hdr.data_length > hdr.data_offset);
+                 deq_now = clib_min (hdr.data_length - hdr.data_offset,
+                                     len_to_deq0);
+                 offset = hdr.data_offset + SESSION_CONN_HDR_LEN;
+                 n_bytes_read = svm_fifo_peek (f, offset, deq_now, data0);
+                 if (PREDICT_FALSE (n_bytes_read <= 0))
+                   goto dequeue_fail;
+
+                 if (s0->session_state == SESSION_STATE_LISTENING)
+                   {
+                     ip_copy (&tc0->rmt_ip, &hdr.rmt_ip, tc0->is_ip4);
+                     tc0->rmt_port = hdr.rmt_port;
+                   }
+                 hdr.data_offset += n_bytes_read;
+                 if (hdr.data_offset == hdr.data_length)
+                   {
+                     offset = hdr.data_length + SESSION_CONN_HDR_LEN;
+                     svm_fifo_dequeue_drop (f, offset);
+                   }
+               }
+             else
+               {
+                 n_bytes_read = svm_fifo_dequeue_nowait (s0->server_tx_fifo,
+                                                         len_to_deq0, data0);
+                 if (n_bytes_read <= 0)
+                   goto dequeue_fail;
+               }
            }
 
          b0->current_length = n_bytes_read;
-
          left_to_snd0 -= n_bytes_read;
          *n_tx_packets = *n_tx_packets + 1;
 
@@ -308,7 +380,8 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
                                          s0->server_tx_fifo, b0, bi0,
                                          n_bufs_per_seg, left_for_seg,
                                          &left_to_snd0, &n_bufs, &tx_offset,
-                                         deq_per_buf, peek_data);
+                                         deq_per_buf, peek_data,
+                                         transport_vft->tx_type);
            }
 
          /* Ask transport to push header after current_length and
@@ -325,8 +398,6 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
          /* *INDENT-ON* */
 
          VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
-         if (VLIB_BUFFER_TRACE_TRAJECTORY)
-           b0->pre_data[1] = 3;
 
          if (PREDICT_FALSE (n_trace > 0))
            {
@@ -348,12 +419,18 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
 
   /* If we couldn't dequeue all bytes mark as partially read */
   if (max_len_to_snd0 < max_dequeue0)
+    if (svm_fifo_set_event (s0->server_tx_fifo))
+      vec_add1 (smm->pending_event_vector[thread_index], *e0);
+
+  if (!peek_data && transport_vft->tx_type == TRANSPORT_TX_DGRAM)
     {
-      /* If we don't already have new event */
-      if (svm_fifo_set_event (s0->server_tx_fifo))
-       {
-         vec_add1 (smm->pending_event_vector[thread_index], *e0);
-       }
+      /* Fix dgram pre header */
+      if (max_len_to_snd0 < max_dequeue0)
+       svm_fifo_overwrite_head (s0->server_tx_fifo, (u8 *) & hdr,
+                                sizeof (session_dgram_pre_hdr_t));
+      /* More data needs to be read */
+      else if (svm_fifo_max_dequeue (s0->server_tx_fifo) > 0)
+       vec_add1 (smm->pending_event_vector[thread_index], *e0);
     }
   return 0;
 
@@ -363,7 +440,6 @@ dequeue_fail:
    * read, return buff to free list and return
    */
   clib_warning ("dequeue fail");
-
   if (svm_fifo_set_event (s0->server_tx_fifo))
     {
       vec_add1 (smm->pending_event_vector[thread_index], *e0);
@@ -396,6 +472,20 @@ session_tx_fifo_dequeue_and_snd (vlib_main_t * vm, vlib_node_runtime_t * node,
                                         n_tx_pkts, 0);
 }
 
+/**
+ * Tx function for sessions owned by built-in (in-process) applications.
+ *
+ * Instead of draining the tx fifo here, clear the fifo's pending tx event
+ * and hand control to the app's builtin_app_tx_callback so the app drains
+ * its own fifo. The unused parameters (vm, node, smm, e0, thread_index,
+ * n_tx_pkts) keep the signature compatible with the other entries in the
+ * smm->session_tx_fns dispatch table.
+ *
+ * NOTE(review): the app is looked up via s0->opaque, while the builtin rx
+ * path uses application_get (s0->app_index) -- confirm both fields hold
+ * the app index for builtin apps.
+ */
int
+session_tx_fifo_dequeue_internal (vlib_main_t * vm,
+                                 vlib_node_runtime_t * node,
+                                 session_manager_main_t * smm,
+                                 session_fifo_event_t * e0,
+                                 stream_session_t * s0, u32 thread_index,
+                                 int *n_tx_pkts)
+{
+  application_t *app;
+  app = application_get (s0->opaque);
+  svm_fifo_unset_event (s0->server_tx_fifo);
+  return app->cb_fns.builtin_app_tx_callback (s0);
+}
+
 always_inline stream_session_t *
 session_event_get_session (session_fifo_event_t * e, u8 thread_index)
 {
@@ -413,7 +503,7 @@ dump_thread_0_event_queue (void)
   int i, index;
   i8 *headp;
 
-  unix_shared_memory_queue_t *q;
+  svm_queue_t *q;
   q = smm->vpp_event_queues[my_thread_index];
 
   index = q->head;
@@ -493,7 +583,7 @@ u8
 session_node_lookup_fifo_event (svm_fifo_t * f, session_fifo_event_t * e)
 {
   session_manager_main_t *smm = vnet_get_session_manager_main ();
-  unix_shared_memory_queue_t *q;
+  svm_queue_t *q;
   session_fifo_event_t *pending_event_vector, *evt;
   int i, index, found = 0;
   i8 *headp;
@@ -512,7 +602,7 @@ session_node_lookup_fifo_event (svm_fifo_t * f, session_fifo_event_t * e)
       clib_memcpy (e, headp, q->elsize);
       found = session_node_cmp_event (e, f);
       if (found)
-       break;
+       return 1;
       if (++index == q->maxsize)
        index = 0;
     }
@@ -540,7 +630,7 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   session_fifo_event_t *my_pending_event_vector, *pending_disconnects, *e;
   session_fifo_event_t *my_fifo_events;
   u32 n_to_dequeue, n_events;
-  unix_shared_memory_queue_t *q;
+  svm_queue_t *q;
   application_t *app;
   int n_tx_packets = 0;
   u32 my_thread_index = vm->thread_index;
@@ -551,9 +641,9 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   SESSION_EVT_DBG (SESSION_EVT_POLL_GAP_TRACK, smm, my_thread_index);
 
   /*
-   *  Update TCP time
+   *  Update transport time
    */
-  tcp_update_time (now, my_thread_index);
+  transport_update_time (now, my_thread_index);
 
   /*
    * Get vpp queue events
@@ -593,7 +683,7 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   for (i = 0; i < n_to_dequeue; i++)
     {
       vec_add2 (my_fifo_events, e, 1);
-      unix_shared_memory_queue_sub_raw (q, (u8 *) e);
+      svm_queue_sub_raw (q, (u8 *) e);
     }
 
   /* The other side of the connection is not polling */
@@ -627,10 +717,6 @@ skip_dequeue:
              clib_warning ("It's dead, Jim!");
              continue;
            }
-         /* Can retransmit for closed sessions but can't do anything if
-          * session is not ready or closed */
-         if (PREDICT_FALSE (s0->session_state < SESSION_STATE_READY))
-           continue;
          /* Spray packets in per session type frames, since they go to
           * different nodes */
          rv = (smm->session_tx_fns[s0->session_type]) (vm, node, smm, e0, s0,
@@ -653,7 +739,7 @@ skip_dequeue:
              continue;
            }
          s0 = session_get_from_handle (e0->session_handle);
-         stream_session_disconnect (s0);
+         stream_session_disconnect_transport (s0);
          break;
        case FIFO_EVENT_BUILTIN_RX:
          s0 = session_event_get_session (e0, my_thread_index);
@@ -661,7 +747,7 @@ skip_dequeue:
            continue;
          svm_fifo_unset_event (s0->server_rx_fifo);
          app = application_get (s0->app_index);
-         app->cb_fns.builtin_server_rx_callback (s0);
+         app->cb_fns.builtin_app_rx_callback (s0);
          break;
        case FIFO_EVENT_RPC:
          fp = e0->rpc_args.fp;
@@ -693,19 +779,29 @@ VLIB_REGISTER_NODE (session_queue_node) =
   .type = VLIB_NODE_TYPE_INPUT,
   .n_errors = ARRAY_LEN (session_queue_error_strings),
   .error_strings = session_queue_error_strings,
-  .n_next_nodes = SESSION_QUEUE_N_NEXT,
   .state = VLIB_NODE_STATE_DISABLED,
-  .next_nodes =
-  {
-      [SESSION_QUEUE_NEXT_DROP] = "error-drop",
-      [SESSION_QUEUE_NEXT_IP4_LOOKUP] = "ip4-lookup",
-      [SESSION_QUEUE_NEXT_IP6_LOOKUP] = "ip6-lookup",
-      [SESSION_QUEUE_NEXT_TCP_IP4_OUTPUT] = "tcp4-output",
-      [SESSION_QUEUE_NEXT_TCP_IP6_OUTPUT] = "tcp6-output",
-  },
 };
 /* *INDENT-ON* */
 
+/**
+ * Main-loop exit hook: disable session queue nodes before vpp shuts down.
+ *
+ * A no-op in single-threaded setups (vec_len (vlib_mains) < 2). With
+ * workers, the disable is done under the worker barrier so all worker
+ * threads are quiesced while session_node_enable_disable runs.
+ */
+static clib_error_t *
+session_queue_exit (vlib_main_t * vm)
+{
+  if (vec_len (vlib_mains) < 2)
+    return 0;
+
+  /*
+   * Shut off (especially) worker-thread session nodes.
+   * Otherwise, vpp can crash as the main thread unmaps the
+   * API segment.
+   */
+  vlib_worker_thread_barrier_sync (vm);
+  session_node_enable_disable (0 /* is_enable */ );
+  vlib_worker_thread_barrier_release (vm);
+  return 0;
+}
+
+/* Registered to run once, on main-loop exit. */
+VLIB_MAIN_LOOP_EXIT_FUNCTION (session_queue_exit);
+
 /*
  * fd.io coding-style-patch-verification: ON
  *