vcl: ldp support SO_ORIGINAL_DST
[vpp.git] / src / vcl / vppcom.c
index 9e7af0a..06a345d 100644 (file)
@@ -351,6 +351,11 @@ vcl_session_accepted_handler (vcl_worker_t * wrk, session_accepted_msg_t * mp,
 
   session->vpp_handle = mp->handle;
   session->session_state = VCL_STATE_READY;
+  if (mp->rmt.is_ip4)
+    {
+      session->original_dst_ip4 = mp->original_dst_ip4;
+      session->original_dst_port = mp->original_dst_port;
+    }
   session->transport.rmt_port = mp->rmt.port;
   session->transport.is_ip4 = mp->rmt.is_ip4;
   clib_memcpy_fast (&session->transport.rmt_ip, &mp->rmt.ip,
@@ -969,7 +974,7 @@ vcl_session_app_del_segment_handler (vcl_worker_t * wrk, void *data)
 {
   session_app_del_segment_msg_t *msg = (session_app_del_segment_msg_t *) data;
   vcl_segment_detach (msg->segment_handle);
-  VDBG (1, "Unmapped segment: %d", msg->segment_handle);
+  VDBG (1, "Unmapped segment: %lx", msg->segment_handle);
 }
 
 static void
@@ -2864,19 +2869,27 @@ vppcom_epoll_ctl (uint32_t vep_handle, int op, uint32_t session_handle,
       s->vep.et_mask = VEP_DEFAULT_ET_MASK;
       s->vep.lt_next = VCL_INVALID_SESSION_INDEX;
       s->vep.ev = *event;
+      s->vep.ev.events |= EPOLLHUP | EPOLLERR;
       s->flags &= ~VCL_SESSION_F_IS_VEP;
       s->flags |= VCL_SESSION_F_IS_VEP_SESSION;
       vep_session->vep.next_sh = session_handle;
 
-      if (event->events & EPOLLOUT)
-       vcl_session_add_want_deq_ntf (s, SVM_FIFO_WANT_DEQ_NOTIF_IF_FULL);
-
-      /* Generate EPOLLOUT if tx fifo not full */
-      if ((event->events & EPOLLOUT) && (vcl_session_write_ready (s) > 0))
+      if ((event->events & EPOLLOUT))
        {
-         vcl_epoll_ctl_add_unhandled_event (wrk, s, event->events & EPOLLET,
-                                            SESSION_IO_EVT_TX);
-         add_evt = 1;
+         int write_ready = vcl_session_write_ready (s);
+
+         vcl_session_add_want_deq_ntf (s, SVM_FIFO_WANT_DEQ_NOTIF_IF_FULL);
+         if (write_ready > 0)
+           {
+             /* Generate EPOLLOUT if tx fifo not full */
+             vcl_epoll_ctl_add_unhandled_event (
+               wrk, s, event->events & EPOLLET, SESSION_IO_EVT_TX);
+             add_evt = 1;
+           }
+         else
+           {
+             vcl_session_add_want_deq_ntf (s, SVM_FIFO_WANT_DEQ_NOTIF);
+           }
        }
       /* Generate EPOLLIN if rx fifo has data */
       if ((event->events & EPOLLIN) && (vcl_session_read_ready (s) > 0))
@@ -2922,18 +2935,23 @@ vppcom_epoll_ctl (uint32_t vep_handle, int op, uint32_t session_handle,
          goto done;
        }
 
-      if (event->events & EPOLLOUT)
-       vcl_session_add_want_deq_ntf (s, SVM_FIFO_WANT_DEQ_NOTIF_IF_FULL);
-      else
-       vcl_session_del_want_deq_ntf (s, SVM_FIFO_WANT_DEQ_NOTIF_IF_FULL);
-
-      /* Generate EPOLLOUT if session write ready nd event was not on */
-      if ((event->events & EPOLLOUT) && !(s->vep.ev.events & EPOLLOUT) &&
-         (vcl_session_write_ready (s) > 0))
+      /* Generate EPOLLOUT if session write ready and event was not on */
+      if ((event->events & EPOLLOUT) && !(s->vep.ev.events & EPOLLOUT))
        {
-         vcl_epoll_ctl_add_unhandled_event (wrk, s, event->events & EPOLLET,
-                                            SESSION_IO_EVT_TX);
+         /* Fifo size load acq synchronized with update store rel */
+         int write_ready = vcl_session_write_ready (s);
+
+         vcl_session_add_want_deq_ntf (s, SVM_FIFO_WANT_DEQ_NOTIF_IF_FULL);
+         if (write_ready > 0)
+           vcl_epoll_ctl_add_unhandled_event (wrk, s, event->events & EPOLLET,
+                                              SESSION_IO_EVT_TX);
+         else
+           /* Request deq ntf in case dequeue happened while updating flag */
+           vcl_session_add_want_deq_ntf (s, SVM_FIFO_WANT_DEQ_NOTIF);
        }
+      else if (!(event->events & EPOLLOUT))
+       vcl_session_del_want_deq_ntf (s, SVM_FIFO_WANT_DEQ_NOTIF_IF_FULL);
+
       /* Generate EPOLLIN if session read ready and event was not on */
       if ((event->events & EPOLLIN) && !(s->vep.ev.events & EPOLLIN) &&
          (vcl_session_read_ready (s) > 0))
@@ -2943,6 +2961,7 @@ vppcom_epoll_ctl (uint32_t vep_handle, int op, uint32_t session_handle,
        }
       s->vep.et_mask = VEP_DEFAULT_ET_MASK;
       s->vep.ev = *event;
+      s->vep.ev.events |= EPOLLHUP | EPOLLERR;
 
       VDBG (1, "EPOLL_CTL_MOD: vep_sh %u, sh %u, events 0x%x, data 0x%llx!",
            vep_handle, session_handle, event->events, event->data.u64);
@@ -3021,6 +3040,14 @@ done:
   return rv;
 }
 
+always_inline u8
+vcl_ep_session_needs_evt (vcl_session_t *s, u32 evt)
+{
+  /* Returns non-zero only if at least one bit of evt is set in the session's
+   * epoll event mask and lt_next is VCL_INVALID_SESSION_INDEX. The mask test
+   * fails when evt was never requested or when the mask was reset to zero
+   * after a hup was reported. A valid lt_next presumably means the session
+   * is already linked in the level-trigger list and is reported from there;
+   * NOTE(review): confirm against the lt list handling. */
+  return ((s->vep.ev.events & evt) &&
+         s->vep.lt_next == VCL_INVALID_SESSION_INDEX);
+}
+
 static inline void
 vcl_epoll_wait_handle_mq_event (vcl_worker_t * wrk, session_event_t * e,
                                struct epoll_event *events, u32 * num_ev)
@@ -3040,11 +3067,10 @@ vcl_epoll_wait_handle_mq_event (vcl_worker_t * wrk, session_event_t * e,
       if (vcl_session_is_closed (s))
        break;
       vcl_fifo_rx_evt_valid_or_break (s);
-      session_events = s->vep.ev.events;
-      if (!(EPOLLIN & s->vep.ev.events) ||
-         (s->flags & VCL_SESSION_F_HAS_RX_EVT) ||
-         (s->vep.lt_next != VCL_INVALID_SESSION_INDEX))
+      if (!vcl_ep_session_needs_evt (s, EPOLLIN) ||
+         (s->flags & VCL_SESSION_F_HAS_RX_EVT))
        break;
+      session_events = s->vep.ev.events;
       add_event = 1;
       events[*num_ev].events = EPOLLIN;
       session_evt_data = s->vep.ev.data.u64;
@@ -3057,9 +3083,9 @@ vcl_epoll_wait_handle_mq_event (vcl_worker_t * wrk, session_event_t * e,
        break;
       svm_fifo_reset_has_deq_ntf (vcl_session_is_ct (s) ? s->ct_tx_fifo :
                                                                s->tx_fifo);
-      session_events = s->vep.ev.events;
-      if (!(EPOLLOUT & session_events))
+      if (!vcl_ep_session_needs_evt (s, EPOLLOUT))
        break;
+      session_events = s->vep.ev.events;
       add_event = 1;
       events[*num_ev].events = EPOLLOUT;
       session_evt_data = s->vep.ev.data.u64;
@@ -3069,13 +3095,10 @@ vcl_epoll_wait_handle_mq_event (vcl_worker_t * wrk, session_event_t * e,
        s = vcl_session_accepted (wrk, (session_accepted_msg_t *) e->data);
       else
        s = vcl_session_get (wrk, e->session_index);
-      if (!s)
+      if (!s || !vcl_ep_session_needs_evt (s, EPOLLIN))
        break;
-      session_events = s->vep.ev.events;
       sid = s->session_index;
-      if (!(EPOLLIN & session_events) ||
-         (s->vep.lt_next != VCL_INVALID_SESSION_INDEX))
-       break;
+      session_events = s->vep.ev.events;
       add_event = 1;
       events[*num_ev].events = EPOLLIN;
       session_evt_data = s->vep.ev.data.u64;
@@ -3089,19 +3112,20 @@ vcl_epoll_wait_handle_mq_event (vcl_worker_t * wrk, session_event_t * e,
       else
        sid = e->session_index;
       s = vcl_session_get (wrk, sid);
-      if (vcl_session_is_closed (s))
-       break;
-      session_events = s->vep.ev.events;
-      /* Generate EPOLLOUT because there's no connected event */
-      if (!(EPOLLOUT & session_events))
+      if (vcl_session_is_closed (s) || !vcl_ep_session_needs_evt (s, EPOLLOUT))
        break;
       /* We didn't have a fifo when the event was added */
       vcl_session_add_want_deq_ntf (s, SVM_FIFO_WANT_DEQ_NOTIF_IF_FULL);
       add_event = 1;
+      session_events = s->vep.ev.events;
+      /* Generate EPOLLOUT because there's no connected event */
       events[*num_ev].events = EPOLLOUT;
       session_evt_data = s->vep.ev.data.u64;
       if (s->session_state == VCL_STATE_DETACHED)
-       events[*num_ev].events |= EPOLLHUP;
+       {
+         events[*num_ev].events |= EPOLLHUP;
+         s->vep.ev.events = 0;
+       }
       break;
     case SESSION_CTRL_EVT_DISCONNECTED:
       if (!e->postponed)
@@ -3114,8 +3138,7 @@ vcl_epoll_wait_handle_mq_event (vcl_worker_t * wrk, session_event_t * e,
          s = vcl_session_get (wrk, e->session_index);
          s->flags &= ~VCL_SESSION_F_PENDING_DISCONNECT;
        }
-      if (vcl_session_is_closed (s) ||
-         !(s->flags & VCL_SESSION_F_IS_VEP_SESSION))
+      if (vcl_session_is_closed (s) || !vcl_ep_session_needs_evt (s, EPOLLHUP))
        {
          if (s && (s->flags & VCL_SESSION_F_PENDING_FREE))
            vcl_session_free (wrk, s);
@@ -3139,7 +3162,7 @@ vcl_epoll_wait_handle_mq_event (vcl_worker_t * wrk, session_event_t * e,
          events[*num_ev].events = EPOLLHUP;
        }
       session_evt_data = s->vep.ev.data.u64;
-
+      s->vep.ev.events = 0;
       break;
     case SESSION_CTRL_EVT_BOUND:
       vcl_session_bound_handler (wrk, (session_bound_msg_t *) e->data);
@@ -3157,8 +3180,7 @@ vcl_epoll_wait_handle_mq_event (vcl_worker_t * wrk, session_event_t * e,
          s = vcl_session_get (wrk, sid);
          s->flags &= ~VCL_SESSION_F_PENDING_DISCONNECT;
        }
-      if (vcl_session_is_closed (s) ||
-         !(s->flags & VCL_SESSION_F_IS_VEP_SESSION))
+      if (vcl_session_is_closed (s) || !vcl_ep_session_needs_evt (s, EPOLLHUP))
        {
          if (s && (s->flags & VCL_SESSION_F_PENDING_FREE))
            vcl_session_free (wrk, s);
@@ -3177,6 +3199,7 @@ vcl_epoll_wait_handle_mq_event (vcl_worker_t * wrk, session_event_t * e,
          events[*num_ev].events |= EPOLLIN;
        }
       session_evt_data = s->vep.ev.data.u64;
+      s->vep.ev.events = 0;
       break;
     case SESSION_CTRL_EVT_UNLISTEN_REPLY:
       vcl_session_unlisten_reply_handler (wrk, e->data);
@@ -3209,11 +3232,13 @@ vcl_epoll_wait_handle_mq_event (vcl_worker_t * wrk, session_event_t * e,
 
   if (add_event)
     {
+      ASSERT (s->flags & VCL_SESSION_F_IS_VEP_SESSION);
       events[*num_ev].data.u64 = session_evt_data;
       if (EPOLLONESHOT & session_events)
        {
          s = vcl_session_get (wrk, sid);
-         s->vep.ev.events = 0;
+         if (!(events[*num_ev].events & EPOLLHUP))
+           s->vep.ev.events = EPOLLHUP | EPOLLERR;
        }
       else if (!(EPOLLET & session_events))
        {
@@ -3367,6 +3392,11 @@ vcl_epoll_wait_handle_lt (vcl_worker_t *wrk, struct epoll_event *events,
       s = vcl_session_get (wrk, next);
       next = s->vep.lt_next;
 
+      if (s->vep.ev.events == 0)
+       {
+         vec_add1 (to_remove, s->session_index);
+         continue;
+       }
       if ((s->vep.ev.events & EPOLLIN) && (rv = vcl_session_read_ready (s)))
        {
          add_event = 1;
@@ -3389,11 +3419,13 @@ vcl_epoll_wait_handle_lt (vcl_worker_t *wrk, struct epoll_event *events,
        {
          events[*n_evts].events = evt_flags;
          events[*n_evts].data.u64 = evt_data;
+         if (EPOLLONESHOT & s->vep.ev.events)
+           s->vep.ev.events = EPOLLHUP | EPOLLERR;
+         if (evt_flags & EPOLLHUP)
+           s->vep.ev.events = 0;
          *n_evts += 1;
          add_event = 0;
          evt_flags = 0;
-         if (EPOLLONESHOT & s->vep.ev.events)
-           s->vep.ev.events = 0;
          if (*n_evts == maxevents)
            {
              wrk->ep_lt_current = next;
@@ -3584,6 +3616,33 @@ vppcom_session_attr (uint32_t session_handle, uint32_t op,
        rv = VPPCOM_EINVAL;
       break;
 
+    case VPPCOM_ATTR_GET_ORIGINAL_DST:
+      if (!session->transport.is_ip4)
+       {
+         /* original dst is currently supported for ipv4 only */
+         rv = VPPCOM_EAFNOSUPPORT;
+         break;
+       }
+      if (PREDICT_TRUE (buffer && buflen && (*buflen >= sizeof (*ep)) &&
+                       ep->ip))
+       {
+         ep->is_ip4 = session->transport.is_ip4;
+         ep->port = session->original_dst_port;
+         clib_memcpy_fast (ep->ip, &session->original_dst_ip4,
+                           sizeof (ip4_address_t));
+         *buflen = sizeof (*ep);
+         VDBG (1,
+               "VPPCOM_ATTR_GET_ORIGINAL_DST: sh %u, is_ip4 = %u, addr = %U"
+               " port %d",
+               session_handle, ep->is_ip4, vcl_format_ip4_address,
+               (ip4_address_t *) (&session->original_dst_ip4),
+               ep->is_ip4 ? IP46_TYPE_IP4 : IP46_TYPE_IP6,
+               clib_net_to_host_u16 (ep->port));
+       }
+      else
+       rv = VPPCOM_EINVAL;
+      break;
+
     case VPPCOM_ATTR_SET_LCL_ADDR:
       if (PREDICT_TRUE (buffer && buflen &&
                        (*buflen >= sizeof (*ep)) && ep->ip))