X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Fdevices%2Fvirtio%2Fvhost_user_input.c;h=11d45812e396c13e08bb4594e99acd991915dca3;hb=97c998c28;hp=291d687ab6ad76375d8282b2136bc95076f27957;hpb=b7b929931a07fbb27b43d5cd105f366c3e29807e;p=vpp.git diff --git a/src/vnet/devices/virtio/vhost_user_input.c b/src/vnet/devices/virtio/vhost_user_input.c index 291d687ab6a..11d45812e39 100644 --- a/src/vnet/devices/virtio/vhost_user_input.c +++ b/src/vnet/devices/virtio/vhost_user_input.c @@ -39,6 +39,7 @@ #include #include +#include #include #include @@ -65,7 +66,7 @@ */ #define VHOST_USER_RX_COPY_THRESHOLD 64 -vlib_node_registration_t vhost_user_input_node; +extern vlib_node_registration_t vhost_user_input_node; #define foreach_vhost_user_input_func_error \ _(NO_ERROR, "no error") \ @@ -92,10 +93,10 @@ static __clib_unused char *vhost_user_input_func_error_strings[] = { static_always_inline void vhost_user_rx_trace (vhost_trace_t * t, vhost_user_intf_t * vui, u16 qid, - vlib_buffer_t * b, vhost_user_vring_t * txvq) + vlib_buffer_t * b, vhost_user_vring_t * txvq, + u16 last_avail_idx) { vhost_user_main_t *vum = &vhost_user_main; - u32 last_avail_idx = txvq->last_avail_idx; u32 desc_current = txvq->avail->ring[last_avail_idx & txvq->qsz_mask]; vring_desc_t *hdr_desc = 0; virtio_net_hdr_mrg_rxbuf_t *hdr; @@ -162,8 +163,8 @@ vhost_user_input_copy (vhost_user_intf_t * vui, vhost_copy_t * cpy, CLIB_PREFETCH (src2, 64, LOAD); CLIB_PREFETCH (src3, 64, LOAD); - clib_memcpy ((void *) cpy[0].dst, src0, cpy[0].len); - clib_memcpy ((void *) cpy[1].dst, src1, cpy[1].len); + clib_memcpy_fast ((void *) cpy[0].dst, src0, cpy[0].len); + clib_memcpy_fast ((void *) cpy[1].dst, src1, cpy[1].len); copy_len -= 2; cpy += 2; } @@ -172,7 +173,7 @@ vhost_user_input_copy (vhost_user_intf_t * vui, vhost_copy_t * cpy, { if (PREDICT_FALSE (!(src0 = map_guest_mem (vui, cpy->src, map_hint)))) return 1; - clib_memcpy ((void *) cpy->dst, src0, cpy->len); + clib_memcpy_fast ((void *) cpy->dst, src0, cpy->len); copy_len -= 1; cpy += 1; } @@ -195,25 +196,27 @@ vhost_user_rx_discard_packet (vlib_main_t * vm, */ u32 discarded_packets = 0; u32 avail_idx = txvq->avail->idx; + u16 mask = txvq->qsz_mask; + u16 last_avail_idx = txvq->last_avail_idx; + u16 last_used_idx = txvq->last_used_idx; while (discarded_packets != discard_max) { - if (avail_idx == txvq->last_avail_idx) + if (avail_idx == last_avail_idx) goto out; - u16 desc_chain_head = - txvq->avail->ring[txvq->last_avail_idx & txvq->qsz_mask]; - txvq->last_avail_idx++; - txvq->used->ring[txvq->last_used_idx & txvq->qsz_mask].id = - desc_chain_head; - txvq->used->ring[txvq->last_used_idx & txvq->qsz_mask].len = 0; - vhost_user_log_dirty_ring (vui, txvq, - ring[txvq->last_used_idx & txvq->qsz_mask]); - txvq->last_used_idx++; + u16 desc_chain_head = txvq->avail->ring[last_avail_idx & mask]; + last_avail_idx++; + txvq->used->ring[last_used_idx & mask].id = desc_chain_head; + txvq->used->ring[last_used_idx & mask].len = 0; + vhost_user_log_dirty_ring (vui, txvq, ring[last_used_idx & mask]); + last_used_idx++; discarded_packets++; } out: - CLIB_MEMORY_BARRIER (); + txvq->last_avail_idx = last_avail_idx; + txvq->last_used_idx = last_used_idx; + CLIB_MEMORY_STORE_BARRIER (); txvq->used->idx = txvq->last_used_idx; vhost_user_log_dirty_ring (vui, txvq, idx); return discarded_packets; @@ -222,7 +225,7 @@ out: /* * In case of overflow, we need to rewind the array of allocated buffers. 
*/ -static __clib_unused void +static_always_inline void vhost_user_input_rewind_buffers (vlib_main_t * vm, vhost_cpu_t * cpu, vlib_buffer_t * b_head) { @@ -241,27 +244,117 @@ vhost_user_input_rewind_buffers (vlib_main_t * vm, cpu->rx_buffers_len++; } -static __clib_unused u32 +static_always_inline void +vhost_user_handle_rx_offload (vlib_buffer_t * b0, u8 * b0_data, + virtio_net_hdr_t * hdr) +{ + u8 l4_hdr_sz = 0; + + if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) + { + u8 l4_proto = 0; + ethernet_header_t *eh = (ethernet_header_t *) b0_data; + u16 ethertype = clib_net_to_host_u16 (eh->type); + u16 l2hdr_sz = sizeof (ethernet_header_t); + + if (ethernet_frame_is_tagged (ethertype)) + { + ethernet_vlan_header_t *vlan = (ethernet_vlan_header_t *) (eh + 1); + + ethertype = clib_net_to_host_u16 (vlan->type); + l2hdr_sz += sizeof (*vlan); + if (ethertype == ETHERNET_TYPE_VLAN) + { + vlan++; + ethertype = clib_net_to_host_u16 (vlan->type); + l2hdr_sz += sizeof (*vlan); + } + } + vnet_buffer (b0)->l2_hdr_offset = 0; + vnet_buffer (b0)->l3_hdr_offset = l2hdr_sz; + vnet_buffer (b0)->l4_hdr_offset = hdr->csum_start; + b0->flags |= (VNET_BUFFER_F_L2_HDR_OFFSET_VALID | + VNET_BUFFER_F_L3_HDR_OFFSET_VALID | + VNET_BUFFER_F_L4_HDR_OFFSET_VALID | + VNET_BUFFER_F_OFFLOAD_IP_CKSUM); + + if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP4)) + { + ip4_header_t *ip4 = (ip4_header_t *) (b0_data + l2hdr_sz); + l4_proto = ip4->protocol; + b0->flags |= VNET_BUFFER_F_IS_IP4; + } + else if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP6)) + { + ip6_header_t *ip6 = (ip6_header_t *) (b0_data + l2hdr_sz); + l4_proto = ip6->protocol; + b0->flags |= VNET_BUFFER_F_IS_IP6; + } + + if (l4_proto == IP_PROTOCOL_TCP) + { + tcp_header_t *tcp = (tcp_header_t *) + (b0_data + vnet_buffer (b0)->l4_hdr_offset); + l4_hdr_sz = tcp_header_bytes (tcp); + tcp->checksum = 0; + b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; + } + else if (l4_proto == IP_PROTOCOL_UDP) + { + udp_header_t *udp = + (udp_header_t *) (b0_data + vnet_buffer (b0)->l4_hdr_offset); + l4_hdr_sz = sizeof (*udp); + udp->checksum = 0; + b0->flags |= VNET_BUFFER_F_OFFLOAD_UDP_CKSUM; + } + } + + if (hdr->gso_type == VIRTIO_NET_HDR_GSO_UDP) + { + vnet_buffer2 (b0)->gso_size = hdr->gso_size; + vnet_buffer2 (b0)->gso_l4_hdr_sz = l4_hdr_sz; + b0->flags |= VNET_BUFFER_F_GSO; + } + else if (hdr->gso_type == VIRTIO_NET_HDR_GSO_TCPV4) + { + vnet_buffer2 (b0)->gso_size = hdr->gso_size; + vnet_buffer2 (b0)->gso_l4_hdr_sz = l4_hdr_sz; + b0->flags |= (VNET_BUFFER_F_GSO | VNET_BUFFER_F_IS_IP4); + } + else if (hdr->gso_type == VIRTIO_NET_HDR_GSO_TCPV6) + { + vnet_buffer2 (b0)->gso_size = hdr->gso_size; + vnet_buffer2 (b0)->gso_l4_hdr_sz = l4_hdr_sz; + b0->flags |= (VNET_BUFFER_F_GSO | VNET_BUFFER_F_IS_IP6); + } +} + +static_always_inline u32 vhost_user_if_input (vlib_main_t * vm, vhost_user_main_t * vum, vhost_user_intf_t * vui, u16 qid, vlib_node_runtime_t * node, - vnet_hw_interface_rx_mode mode) + vnet_hw_interface_rx_mode mode, u8 enable_csum) { vhost_user_vring_t *txvq = &vui->vrings[VHOST_VRING_IDX_TX (qid)]; + vnet_feature_main_t *fm = &feature_main; u16 n_rx_packets = 0; u32 n_rx_bytes = 0; u16 n_left; u32 n_left_to_next, *to_next; u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; u32 n_trace = vlib_get_trace_count (vm, node); + u32 buffer_data_size = vlib_buffer_get_default_data_size (vm); u32 map_hint = 0; - u16 thread_index = vm->thread_index; + vhost_cpu_t *cpu = &vum->cpus[vm->thread_index]; u16 copy_len = 0; + u8 feature_arc_idx = fm->device_input_feature_arc_index; + 
u32 current_config_index = ~(u32) 0; + u16 mask = txvq->qsz_mask; /* The descriptor table is not ready yet */ if (PREDICT_FALSE (txvq->avail == 0)) - return 0; + goto done; { /* do we have pending interrupts ? */ @@ -296,13 +389,13 @@ vhost_user_if_input (vlib_main_t * vm, } if (PREDICT_FALSE (txvq->avail->flags & 0xFFFE)) - return 0; + goto done; n_left = (u16) (txvq->avail->idx - txvq->last_avail_idx); /* nothing to do */ if (PREDICT_FALSE (n_left == 0)) - return 0; + goto done; if (PREDICT_FALSE (!vui->admin_up || !(txvq->enabled))) { @@ -315,10 +408,10 @@ vhost_user_if_input (vlib_main_t * vm, */ vhost_user_rx_discard_packet (vm, vui, txvq, VHOST_USER_DOWN_DISCARD_COUNT); - return 0; + goto done; } - if (PREDICT_FALSE (n_left == (txvq->qsz_mask + 1))) + if (PREDICT_FALSE (n_left == (mask + 1))) { /* * Informational error logging when VPP is not @@ -333,277 +426,313 @@ vhost_user_if_input (vlib_main_t * vm, /* * For small packets (<2kB), we will not need more than one vlib buffer - * per packet. In case packets are bigger, we will just yeld at some point + * per packet. In case packets are bigger, we will just yield at some point * in the loop and come back later. This is not an issue as for big packet, * processing cost really comes from the memory copy. * The assumption is that big packets will fit in 40 buffers. */ - if (PREDICT_FALSE (vum->cpus[thread_index].rx_buffers_len < n_left + 1 || - vum->cpus[thread_index].rx_buffers_len < 40)) + if (PREDICT_FALSE (cpu->rx_buffers_len < n_left + 1 || + cpu->rx_buffers_len < 40)) { - u32 curr_len = vum->cpus[thread_index].rx_buffers_len; - vum->cpus[thread_index].rx_buffers_len += - vlib_buffer_alloc_from_free_list (vm, - vum->cpus[thread_index].rx_buffers + - curr_len, - VHOST_USER_RX_BUFFERS_N - curr_len, - VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + u32 curr_len = cpu->rx_buffers_len; + cpu->rx_buffers_len += + vlib_buffer_alloc (vm, cpu->rx_buffers + curr_len, + VHOST_USER_RX_BUFFERS_N - curr_len); if (PREDICT_FALSE - (vum->cpus[thread_index].rx_buffers_len < - VHOST_USER_RX_BUFFER_STARVATION)) + (cpu->rx_buffers_len < VHOST_USER_RX_BUFFER_STARVATION)) { /* In case of buffer starvation, discard some packets from the queue * and log the event. * We keep doing best effort for the remaining packets. */ - u32 flush = (n_left + 1 > vum->cpus[thread_index].rx_buffers_len) ? - n_left + 1 - vum->cpus[thread_index].rx_buffers_len : 1; + u32 flush = (n_left + 1 > cpu->rx_buffers_len) ? + n_left + 1 - cpu->rx_buffers_len : 1; flush = vhost_user_rx_discard_packet (vm, vui, txvq, flush); n_left -= flush; vlib_increment_simple_counter (vnet_main. 
interface_main.sw_if_counters + VNET_INTERFACE_COUNTER_DROP, - vlib_get_thread_index (), - vui->sw_if_index, flush); + vm->thread_index, vui->sw_if_index, + flush); vlib_error_count (vm, vhost_user_input_node.index, VHOST_USER_INPUT_FUNC_ERROR_NO_BUFFER, flush); } } + if (PREDICT_FALSE (vnet_have_features (feature_arc_idx, vui->sw_if_index))) + { + vnet_feature_config_main_t *cm; + cm = &fm->feature_config_mains[feature_arc_idx]; + current_config_index = vec_elt (cm->config_index_by_sw_if_index, + vui->sw_if_index); + vnet_get_config_data (&cm->config_main, ¤t_config_index, + &next_index, 0); + } + + u16 last_avail_idx = txvq->last_avail_idx; + u16 last_used_idx = txvq->last_used_idx; + + vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next); + + if (next_index == VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT) + { + /* give some hints to ethernet-input */ + vlib_next_frame_t *nf; + vlib_frame_t *f; + ethernet_input_frame_t *ef; + nf = vlib_node_runtime_get_next_frame (vm, node, next_index); + f = vlib_get_frame (vm, nf->frame); + f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX; + + ef = vlib_frame_scalar_args (f); + ef->sw_if_index = vui->sw_if_index; + ef->hw_if_index = vui->hw_if_index; + vlib_frame_no_append (f); + } + while (n_left > 0) { - vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + vlib_buffer_t *b_head, *b_current; + u32 bi_current; + u16 desc_current; + u32 desc_data_offset; + vring_desc_t *desc_table = txvq->desc; - while (n_left > 0 && n_left_to_next > 0) + if (PREDICT_FALSE (cpu->rx_buffers_len <= 1)) { - vlib_buffer_t *b_head, *b_current; - u32 bi_current; - u16 desc_current; - u32 desc_data_offset; - vring_desc_t *desc_table = txvq->desc; + /* Not enough rx_buffers + * Note: We yeld on 1 so we don't need to do an additional + * check for the next buffer prefetch. + */ + n_left = 0; + break; + } + + desc_current = txvq->avail->ring[last_avail_idx & mask]; + cpu->rx_buffers_len--; + bi_current = cpu->rx_buffers[cpu->rx_buffers_len]; + b_head = b_current = vlib_get_buffer (vm, bi_current); + to_next[0] = bi_current; //We do that now so we can forget about bi_current + to_next++; + n_left_to_next--; + + vlib_prefetch_buffer_with_index + (vm, cpu->rx_buffers[cpu->rx_buffers_len - 1], LOAD); + + /* Just preset the used descriptor id and length for later */ + txvq->used->ring[last_used_idx & mask].id = desc_current; + txvq->used->ring[last_used_idx & mask].len = 0; + vhost_user_log_dirty_ring (vui, txvq, ring[last_used_idx & mask]); - if (PREDICT_FALSE (vum->cpus[thread_index].rx_buffers_len <= 1)) + /* The buffer should already be initialized */ + b_head->total_length_not_including_first_buffer = 0; + b_head->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + + if (PREDICT_FALSE (n_trace)) + { + vlib_trace_buffer (vm, node, next_index, b_head, + /* follow_chain */ 0); + vhost_trace_t *t0 = + vlib_add_trace (vm, node, b_head, sizeof (t0[0])); + vhost_user_rx_trace (t0, vui, qid, b_head, txvq, last_avail_idx); + n_trace--; + vlib_set_trace_count (vm, node, n_trace); + } + + /* This depends on the setup but is very consistent + * So I think the CPU branch predictor will make a pretty good job + * at optimizing the decision. 
*/ + u8 indirect = 0; + if (txvq->desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT) + { + desc_table = map_guest_mem (vui, txvq->desc[desc_current].addr, + &map_hint); + desc_current = 0; + indirect = 1; + if (PREDICT_FALSE (desc_table == 0)) { - /* Not enough rx_buffers - * Note: We yeld on 1 so we don't need to do an additional - * check for the next buffer prefetch. - */ - n_left = 0; - break; + vlib_error_count (vm, node->node_index, + VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL, 1); + goto out; } + } + + if (PREDICT_TRUE (vui->is_any_layout) || + (!(desc_table[desc_current].flags & VIRTQ_DESC_F_NEXT))) + { + /* ANYLAYOUT or single buffer */ + desc_data_offset = vui->virtio_net_hdr_sz; + } + else + { + /* CSR case without ANYLAYOUT, skip 1st buffer */ + desc_data_offset = desc_table[desc_current].len; + } + + if (enable_csum) + { + virtio_net_hdr_mrg_rxbuf_t *hdr; + u8 *b_data; + u16 current = desc_current; + u32 data_offset = desc_data_offset; - desc_current = - txvq->avail->ring[txvq->last_avail_idx & txvq->qsz_mask]; - vum->cpus[thread_index].rx_buffers_len--; - bi_current = (vum->cpus[thread_index].rx_buffers) - [vum->cpus[thread_index].rx_buffers_len]; - b_head = b_current = vlib_get_buffer (vm, bi_current); - to_next[0] = bi_current; //We do that now so we can forget about bi_current - to_next++; - n_left_to_next--; - - vlib_prefetch_buffer_with_index (vm, - (vum-> - cpus[thread_index].rx_buffers) - [vum->cpus[thread_index]. - rx_buffers_len - 1], LOAD); - - /* Just preset the used descriptor id and length for later */ - txvq->used->ring[txvq->last_used_idx & txvq->qsz_mask].id = - desc_current; - txvq->used->ring[txvq->last_used_idx & txvq->qsz_mask].len = 0; - vhost_user_log_dirty_ring (vui, txvq, - ring[txvq->last_used_idx & - txvq->qsz_mask]); - - /* The buffer should already be initialized */ - b_head->total_length_not_including_first_buffer = 0; - b_head->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; - - if (PREDICT_FALSE (n_trace)) + if ((data_offset == desc_table[current].len) && + (desc_table[current].flags & VIRTQ_DESC_F_NEXT)) { - //TODO: next_index is not exactly known at that point - vlib_trace_buffer (vm, node, next_index, b_head, - /* follow_chain */ 0); - vhost_trace_t *t0 = - vlib_add_trace (vm, node, b_head, sizeof (t0[0])); - vhost_user_rx_trace (t0, vui, qid, b_head, txvq); - n_trace--; - vlib_set_trace_count (vm, node, n_trace); + current = desc_table[current].next; + data_offset = 0; } - - /* This depends on the setup but is very consistent - * So I think the CPU branch predictor will make a pretty good job - * at optimizing the decision. 
*/ - if (txvq->desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT) + hdr = map_guest_mem (vui, desc_table[current].addr, &map_hint); + if (PREDICT_FALSE (hdr == 0)) + { + vlib_error_count (vm, node->node_index, + VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL, 1); + goto out; + } + b_data = (u8 *) hdr + data_offset; + if (indirect) { - desc_table = map_guest_mem (vui, txvq->desc[desc_current].addr, - &map_hint); - desc_current = 0; - if (PREDICT_FALSE (desc_table == 0)) + hdr = map_guest_mem (vui, desc_table[desc_current].addr, + &map_hint); + if (PREDICT_FALSE (hdr == 0)) { vlib_error_count (vm, node->node_index, VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL, 1); goto out; } } + vhost_user_handle_rx_offload (b_head, b_data, &hdr->hdr); + } - if (PREDICT_TRUE (vui->is_any_layout) || - (!(desc_table[desc_current].flags & VIRTQ_DESC_F_NEXT))) - { - /* ANYLAYOUT or single buffer */ - desc_data_offset = vui->virtio_net_hdr_sz; - } - else - { - /* CSR case without ANYLAYOUT, skip 1st buffer */ - desc_data_offset = desc_table[desc_current].len; - } - - while (1) + while (1) + { + /* Get more input if necessary. Or end of packet. */ + if (desc_data_offset == desc_table[desc_current].len) { - /* Get more input if necessary. Or end of packet. */ - if (desc_data_offset == desc_table[desc_current].len) + if (PREDICT_FALSE (desc_table[desc_current].flags & + VIRTQ_DESC_F_NEXT)) + { + desc_current = desc_table[desc_current].next; + desc_data_offset = 0; + } + else { - if (PREDICT_FALSE (desc_table[desc_current].flags & - VIRTQ_DESC_F_NEXT)) - { - desc_current = desc_table[desc_current].next; - desc_data_offset = 0; - } - else - { - goto out; - } + goto out; } + } - /* Get more output if necessary. Or end of packet. */ - if (PREDICT_FALSE - (b_current->current_length == VLIB_BUFFER_DATA_SIZE)) + /* Get more output if necessary. Or end of packet. */ + if (PREDICT_FALSE (b_current->current_length == buffer_data_size)) + { + if (PREDICT_FALSE (cpu->rx_buffers_len == 0)) { - if (PREDICT_FALSE - (vum->cpus[thread_index].rx_buffers_len == 0)) - { - /* Cancel speculation */ - to_next--; - n_left_to_next++; - - /* - * Checking if there are some left buffers. - * If not, just rewind the used buffers and stop. - * Note: Scheduled copies are not cancelled. This is - * not an issue as they would still be valid. Useless, - * but valid. - */ - vhost_user_input_rewind_buffers (vm, - &vum->cpus - [thread_index], - b_head); - n_left = 0; - goto stop; - } - - /* Get next output */ - vum->cpus[thread_index].rx_buffers_len--; - u32 bi_next = - (vum->cpus[thread_index].rx_buffers)[vum->cpus - [thread_index].rx_buffers_len]; - b_current->next_buffer = bi_next; - b_current->flags |= VLIB_BUFFER_NEXT_PRESENT; - bi_current = bi_next; - b_current = vlib_get_buffer (vm, bi_current); + /* Cancel speculation */ + to_next--; + n_left_to_next++; + + /* + * Checking if there are some left buffers. + * If not, just rewind the used buffers and stop. + * Note: Scheduled copies are not cancelled. This is + * not an issue as they would still be valid. Useless, + * but valid. + */ + vhost_user_input_rewind_buffers (vm, cpu, b_head); + n_left = 0; + goto stop; } - /* Prepare a copy order executed later for the data */ - vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len]; - copy_len++; - u32 desc_data_l = - desc_table[desc_current].len - desc_data_offset; - cpy->len = VLIB_BUFFER_DATA_SIZE - b_current->current_length; - cpy->len = (cpy->len > desc_data_l) ? 
desc_data_l : cpy->len; - cpy->dst = (uword) (vlib_buffer_get_current (b_current) + - b_current->current_length); - cpy->src = desc_table[desc_current].addr + desc_data_offset; - - desc_data_offset += cpy->len; - - b_current->current_length += cpy->len; - b_head->total_length_not_including_first_buffer += cpy->len; + /* Get next output */ + cpu->rx_buffers_len--; + u32 bi_next = cpu->rx_buffers[cpu->rx_buffers_len]; + b_current->next_buffer = bi_next; + b_current->flags |= VLIB_BUFFER_NEXT_PRESENT; + bi_current = bi_next; + b_current = vlib_get_buffer (vm, bi_current); } - out: - CLIB_PREFETCH (&n_left, sizeof (n_left), LOAD); - - n_rx_bytes += b_head->total_length_not_including_first_buffer; - n_rx_packets++; + /* Prepare a copy order executed later for the data */ + ASSERT (copy_len < VHOST_USER_COPY_ARRAY_N); + vhost_copy_t *cpy = &cpu->copy[copy_len]; + copy_len++; + u32 desc_data_l = desc_table[desc_current].len - desc_data_offset; + cpy->len = buffer_data_size - b_current->current_length; + cpy->len = (cpy->len > desc_data_l) ? desc_data_l : cpy->len; + cpy->dst = (uword) (vlib_buffer_get_current (b_current) + + b_current->current_length); + cpy->src = desc_table[desc_current].addr + desc_data_offset; + + desc_data_offset += cpy->len; + + b_current->current_length += cpy->len; + b_head->total_length_not_including_first_buffer += cpy->len; + } - b_head->total_length_not_including_first_buffer -= - b_head->current_length; + out: - /* consume the descriptor and return it as used */ - txvq->last_avail_idx++; - txvq->last_used_idx++; + n_rx_bytes += b_head->total_length_not_including_first_buffer; + n_rx_packets++; - VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b_head); + b_head->total_length_not_including_first_buffer -= + b_head->current_length; - vnet_buffer (b_head)->sw_if_index[VLIB_RX] = vui->sw_if_index; - vnet_buffer (b_head)->sw_if_index[VLIB_TX] = (u32) ~ 0; - b_head->error = 0; + /* consume the descriptor and return it as used */ + last_avail_idx++; + last_used_idx++; - { - u32 next0 = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b_head); - /* redirect if feature path enabled */ - vnet_feature_start_device_input_x1 (vui->sw_if_index, &next0, - b_head); + vnet_buffer (b_head)->sw_if_index[VLIB_RX] = vui->sw_if_index; + vnet_buffer (b_head)->sw_if_index[VLIB_TX] = (u32) ~ 0; + b_head->error = 0; - u32 bi = to_next[-1]; //Cannot use to_next[-1] in the macro - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, - to_next, n_left_to_next, - bi, next0); - } + if (current_config_index != ~(u32) 0) + { + b_head->current_config_index = current_config_index; + vnet_buffer (b_head)->feature_arc_index = feature_arc_idx; + } - n_left--; + n_left--; - /* - * Although separating memory copies from virtio ring parsing - * is beneficial, we can offer to perform the copies from time - * to time in order to free some space in the ring. - */ - if (PREDICT_FALSE (copy_len >= VHOST_USER_RX_COPY_THRESHOLD)) + /* + * Although separating memory copies from virtio ring parsing + * is beneficial, we can offer to perform the copies from time + * to time in order to free some space in the ring. 
+ */ + if (PREDICT_FALSE (copy_len >= VHOST_USER_RX_COPY_THRESHOLD)) + { + if (PREDICT_FALSE (vhost_user_input_copy (vui, cpu->copy, + copy_len, &map_hint))) { - if (PREDICT_FALSE - (vhost_user_input_copy (vui, vum->cpus[thread_index].copy, - copy_len, &map_hint))) - { - vlib_error_count (vm, node->node_index, - VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL, 1); - } - copy_len = 0; - - /* give buffers back to driver */ - CLIB_MEMORY_BARRIER (); - txvq->used->idx = txvq->last_used_idx; - vhost_user_log_dirty_ring (vui, txvq, idx); + vlib_error_count (vm, node->node_index, + VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL, 1); } + copy_len = 0; + + /* give buffers back to driver */ + CLIB_MEMORY_STORE_BARRIER (); + txvq->used->idx = last_used_idx; + vhost_user_log_dirty_ring (vui, txvq, idx); } - stop: - vlib_put_next_frame (vm, node, next_index, n_left_to_next); } +stop: + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + + txvq->last_used_idx = last_used_idx; + txvq->last_avail_idx = last_avail_idx; /* Do the memory copies */ - if (PREDICT_FALSE - (vhost_user_input_copy (vui, vum->cpus[thread_index].copy, - copy_len, &map_hint))) + if (PREDICT_FALSE (vhost_user_input_copy (vui, cpu->copy, copy_len, + &map_hint))) { vlib_error_count (vm, node->node_index, VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL, 1); } /* give buffers back to driver */ - CLIB_MEMORY_BARRIER (); + CLIB_MEMORY_STORE_BARRIER (); txvq->used->idx = txvq->last_used_idx; vhost_user_log_dirty_ring (vui, txvq, idx); @@ -620,11 +749,12 @@ vhost_user_if_input (vlib_main_t * vm, /* increase rx counters */ vlib_increment_combined_counter (vnet_main.interface_main.combined_sw_if_counters - + VNET_INTERFACE_COUNTER_RX, - vlib_get_thread_index (), vui->sw_if_index, n_rx_packets, n_rx_bytes); + + VNET_INTERFACE_COUNTER_RX, vm->thread_index, vui->sw_if_index, + n_rx_packets, n_rx_bytes); - vnet_device_increment_rx_packets (thread_index, n_rx_packets); + vnet_device_increment_rx_packets (vm->thread_index, n_rx_packets); +done: return n_rx_packets; } @@ -641,13 +771,19 @@ VLIB_NODE_FN (vhost_user_input_node) (vlib_main_t * vm, vec_foreach (dq, rt->devices_and_queues) { - if (clib_smp_swap (&dq->interrupt_pending, 0) || - (node->state == VLIB_NODE_STATE_POLLING)) + if ((node->state == VLIB_NODE_STATE_POLLING) || + clib_atomic_swap_acq_n (&dq->interrupt_pending, 0)) { vui = pool_elt_at_index (vum->vhost_user_interfaces, dq->dev_instance); - n_rx_packets = vhost_user_if_input (vm, vum, vui, dq->queue_id, node, - dq->mode); + if (vui->features & (1ULL << FEAT_VIRTIO_NET_F_CSUM)) + n_rx_packets += + vhost_user_if_input (vm, vum, vui, dq->queue_id, node, dq->mode, + 1); + else + n_rx_packets += + vhost_user_if_input (vm, vum, vui, dq->queue_id, node, dq->mode, + 0); } } @@ -659,6 +795,7 @@ VLIB_REGISTER_NODE (vhost_user_input_node) = { .type = VLIB_NODE_TYPE_INPUT, .name = "vhost-user-input", .sibling_of = "device-input", + .flags = VLIB_NODE_FLAG_TRACE_SUPPORTED, /* Will be enabled if/when hardware is detected. */ .state = VLIB_NODE_STATE_DISABLED,
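
Editor's note: the hunks above replace per-iteration reads and writes of txvq->last_avail_idx / txvq->last_used_idx with local copies that are written back once, and swap CLIB_MEMORY_BARRIER() for CLIB_MEMORY_STORE_BARRIER() before publishing used->idx to the driver. The standalone sketch below illustrates that pattern only; the types and names (vring_sketch_t, consume_descriptors) are invented for the example, and a generic release fence stands in for VPP's barrier macro. It is not VPP code.

/*
 * Sketch: consume a split virtio ring using local copies of the
 * avail/used indices, then publish used->idx once behind a store
 * barrier, mirroring vhost_user_rx_discard_packet() after this diff.
 */
#include <stdint.h>

typedef struct { uint32_t id; uint32_t len; } ring_used_elem_t;

typedef struct
{
  /* shared with the guest driver */
  volatile uint16_t *avail_idx;	/* plays the role of txvq->avail->idx */
  uint16_t *avail_ring;		/* txvq->avail->ring[] */
  volatile uint16_t *used_idx;	/* txvq->used->idx */
  ring_used_elem_t *used_ring;	/* txvq->used->ring[] */
  /* private to the device side */
  uint16_t last_avail_idx;
  uint16_t last_used_idx;
  uint16_t qsz_mask;		/* ring size - 1, power of two rings */
} vring_sketch_t;

static uint32_t
consume_descriptors (vring_sketch_t * vq, uint32_t max)
{
  /* Work on local copies; the shared ring indices are read once and
   * written back once instead of on every loop iteration. */
  uint16_t mask = vq->qsz_mask;
  uint16_t last_avail_idx = vq->last_avail_idx;
  uint16_t last_used_idx = vq->last_used_idx;
  uint16_t avail_idx = *vq->avail_idx;
  uint32_t n = 0;

  while (n != max && avail_idx != last_avail_idx)
    {
      uint16_t head = vq->avail_ring[last_avail_idx & mask];
      last_avail_idx++;
      vq->used_ring[last_used_idx & mask].id = head;
      vq->used_ring[last_used_idx & mask].len = 0;
      last_used_idx++;
      n++;
    }

  vq->last_avail_idx = last_avail_idx;
  vq->last_used_idx = last_used_idx;

  /* Make the used-ring entries visible before the index that publishes
   * them; stand-in for CLIB_MEMORY_STORE_BARRIER(). */
  __atomic_thread_fence (__ATOMIC_RELEASE);
  *vq->used_idx = last_used_idx;
  return n;
}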
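
A second note on the new enable_csum parameter: VLIB_NODE_FN (vhost_user_input_node) now calls vhost_user_if_input() with a literal 1 or 0 depending on whether FEAT_VIRTIO_NET_F_CSUM was negotiated, and because the helper is static_always_inline the compiler can emit two specialized bodies, so the offload-parsing branch disappears from the per-packet path when checksum offload is off. Below is a minimal, self-contained illustration of that idiom; every name in it is invented for the example and none of it comes from the VPP tree.

/* Constant-propagation idiom: pass a compile-time constant flag into an
 * always-inline helper so each caller gets a specialized copy. */
#include <stdint.h>

#define always_inline_fn static inline __attribute__ ((__always_inline__))

always_inline_fn uint32_t
process_ring (uint32_t n_pkts, int enable_csum)
{
  uint32_t work = 0;
  for (uint32_t i = 0; i < n_pkts; i++)
    {
      work += 1;		/* common per-packet work */
      if (enable_csum)
	work += 2;		/* offload parsing only in the csum variant */
    }
  return work;
}

uint32_t
input_node_fn (uint32_t n_pkts, uint64_t features, uint64_t csum_feature_bit)
{
  /* The flag is a literal at each call site, so the inliner resolves the
   * branch inside process_ring() at build time. */
  if (features & csum_feature_bit)
    return process_ring (n_pkts, 1 /* enable_csum */);
  else
    return process_ring (n_pkts, 0 /* enable_csum */);
}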