IP6 SR multicast replicator
[vpp.git] / vnet / vnet / devices / dpdk / device.c
index 72df02a..20c8b8f 100644 (file)
@@ -16,6 +16,7 @@
 #include <vppinfra/vec.h>
 #include <vppinfra/format.h>
 #include <vlib/unix/cj.h>
+#include <assert.h>
 
 #include <vnet/ethernet/ethernet.h>
 #include <vnet/devices/dpdk/dpdk.h>
@@ -42,7 +43,41 @@ static char * dpdk_tx_func_error_strings[] = {
 #undef _
 };
 
-static struct rte_mbuf * dpdk_replicate_packet_mb (vlib_buffer_t * b)
+/* Ask DPDK to make `address` (read as a struct ether_addr) the default MAC
+ * of the port behind hi.  Returns NULL on success, else a clib error. */
+clib_error_t *
+dpdk_set_mac_address (vnet_hw_interface_t * hi, char * address)
+{
+  dpdk_main_t * dm = &dpdk_main;
+  dpdk_device_t * xd = vec_elt_at_index (dm->devices, hi->dev_instance);
+  int rv;
+
+  rv = rte_eth_dev_default_mac_addr_set (xd->device_index,
+                                         (struct ether_addr *) address);
+  if (rv == 0)
+    return NULL;
+
+  return clib_error_return (0, "mac address set failed: %d", rv);
+}
+
+/* Hand the naddr-entry mc_addr_vec list to rte_eth_dev_set_mc_addr_list()
+ * for the port behind hi.  Returns NULL when it reports 0, else an error. */
+clib_error_t *
+dpdk_set_mc_filter (vnet_hw_interface_t * hi,
+                    struct ether_addr mc_addr_vec[], int naddr)
+{
+  dpdk_main_t * dm = &dpdk_main;
+  dpdk_device_t * xd = vec_elt_at_index (dm->devices, hi->dev_instance);
+  int rv = rte_eth_dev_set_mc_addr_list (xd->device_index, mc_addr_vec,
+                                         naddr);
+
+  if (rv == 0)
+    return NULL;
+
+  return clib_error_return (0, "mc addr list failed: %d", rv);
+}
+
+struct rte_mbuf * dpdk_replicate_packet_mb (vlib_buffer_t * b)
 {
   vlib_main_t * vm = vlib_get_main();
   vlib_buffer_main_t * bm = vm->buffer_main;
@@ -52,7 +87,7 @@ static struct rte_mbuf * dpdk_replicate_packet_mb (vlib_buffer_t * b)
   unsigned socket_id = rte_socket_id();
 
   ASSERT (bm->pktmbuf_pools[socket_id]);
-  pkt_mb = ((struct rte_mbuf *)b)-1;
+  pkt_mb = rte_mbuf_from_vlib_buffer(b);
   nb_segs = pkt_mb->nb_segs;
   for (nb_segs_left = nb_segs; nb_segs_left; nb_segs_left--)
     {
@@ -100,7 +135,7 @@ static struct rte_mbuf * dpdk_replicate_packet_mb (vlib_buffer_t * b)
       rte_pktmbuf_data_len (new_mb) = pkt_mb->data_len;
       copy_bytes = pkt_mb->data_len + RTE_PKTMBUF_HEADROOM;
       ASSERT(copy_bytes <= pkt_mb->buf_len);
-      memcpy(new_mb->buf_addr, pkt_mb->buf_addr, copy_bytes);
+      clib_memcpy(new_mb->buf_addr, pkt_mb->buf_addr, copy_bytes);
 
       prev_mb_next = &new_mb->next;
       pkt_mb = pkt_mb->next;
@@ -112,14 +147,73 @@ static struct rte_mbuf * dpdk_replicate_packet_mb (vlib_buffer_t * b)
   return first_mb;
 }
 
-typedef struct {
-  u32 buffer_index;
-  u16 device_index;
-  u8 queue_index;
-  struct rte_mbuf mb;
-  /* Copy of VLIB buffer; packet data stored in pre_data. */
-  vlib_buffer_t buffer;
-} dpdk_tx_dma_trace_t;
+/*
+ * Build a zero-copy replica of the mbuf chain that backs vlib buffer b.
+ *
+ * rte_pktmbuf_clone() walks the whole segment chain on its own and attaches
+ * one indirect mbuf per segment, so it is invoked once on the head segment.
+ * Invoking it once per segment would re-clone the remaining tail each time
+ * and orphan (leak) the tail produced by the previous call.
+ *
+ * Returns the head of the cloned chain, or NULL if the source chain has
+ * fewer segments than its nb_segs field states, nb_segs is zero, or the
+ * per-socket pktmbuf pool cannot supply enough mbufs.
+ */
+struct rte_mbuf * dpdk_zerocopy_replicate_packet_mb (vlib_buffer_t * b)
+{
+  vlib_main_t * vm = vlib_get_main();
+  vlib_buffer_main_t * bm = vm->buffer_main;
+  struct rte_mbuf * first_mb, * new_mb, * pkt_mb, * seg_mb;
+  u8 nb_segs, nb_segs_left;
+  unsigned socket_id = rte_socket_id();
+
+  ASSERT (bm->pktmbuf_pools[socket_id]);
+  pkt_mb = rte_mbuf_from_vlib_buffer(b);
+  nb_segs = pkt_mb->nb_segs;
+
+  /*
+   * Check that the chain really holds nb_segs segments before cloning.
+   */
+  seg_mb = pkt_mb;
+  for (nb_segs_left = nb_segs; nb_segs_left; nb_segs_left--)
+    {
+      if (PREDICT_FALSE(seg_mb == 0))
+        {
+          clib_warning ("Missing %d mbuf chain segment(s):   "
+                        "(nb_segs = %d, nb_segs_left = %d)!",
+                        nb_segs - nb_segs_left, nb_segs, nb_segs_left);
+          return NULL;
+        }
+      seg_mb = seg_mb->next;
+    }
+  ASSERT(seg_mb == 0);
+  if (PREDICT_FALSE(nb_segs == 0))
+    return NULL;
+
+  first_mb = rte_pktmbuf_clone(pkt_mb, bm->pktmbuf_pools[socket_id]);
+  if (PREDICT_FALSE(first_mb == 0))
+    return NULL;
+
+  /*
+   * Copy packet info into 1st segment.
+   */
+  rte_pktmbuf_pkt_len (first_mb) = pkt_mb->pkt_len;
+  first_mb->nb_segs = pkt_mb->nb_segs;
+  first_mb->port = pkt_mb->port;
+#ifdef DAW_FIXME // TX Offload support TBD
+  first_mb->vlan_macip = pkt_mb->vlan_macip;
+  first_mb->hash = pkt_mb->hash;
+  first_mb->ol_flags = pkt_mb->ol_flags;
+#endif
+
+  /* Mirror each segment's data length; no payload bytes are copied. */
+  for (seg_mb = pkt_mb, new_mb = first_mb; seg_mb != 0 && new_mb != 0;
+       seg_mb = seg_mb->next, new_mb = new_mb->next)
+    rte_pktmbuf_data_len (new_mb) = seg_mb->data_len;
+
+  __rte_mbuf_sanity_check(first_mb, 1);
+  return first_mb;
+}
 
 static void
 dpdk_tx_trace_buffer (dpdk_main_t * dm,
@@ -133,15 +227,15 @@ dpdk_tx_trace_buffer (dpdk_main_t * dm,
   dpdk_tx_dma_trace_t * t0;
   struct rte_mbuf * mb;
 
-  mb = ((struct rte_mbuf *)buffer)-1;
+  mb = rte_mbuf_from_vlib_buffer(buffer);
 
   t0 = vlib_add_trace (vm, node, buffer, sizeof (t0[0]));
   t0->queue_index = queue_id;
   t0->device_index = xd->device_index;
   t0->buffer_index = buffer_index;
-  memcpy (&t0->mb, mb, sizeof (t0->mb));
-  memcpy (&t0->buffer, buffer, sizeof (buffer[0]) - sizeof (buffer->pre_data));
-  memcpy (t0->buffer.pre_data, buffer->data + buffer->current_data,
+  clib_memcpy (&t0->mb, mb, sizeof (t0->mb));
+  clib_memcpy (&t0->buffer, buffer, sizeof (buffer[0]) - sizeof (buffer->pre_data));
+  clib_memcpy (t0->buffer.pre_data, buffer->data + buffer->current_data,
          sizeof (t0->buffer.pre_data));
 }
 
@@ -220,11 +314,13 @@ u32 tx_burst_vector_internal (vlib_main_t * vm,
        * This device only supports one TX queue,
        * and we're running multi-threaded...
        */
-      if (PREDICT_FALSE(xd->lockp != 0))
+      if (PREDICT_FALSE(xd->dev_type != VNET_DPDK_DEV_VHOST_USER &&
+        xd->lockp != 0))
         {
-          queue_id = 0;
-          while (__sync_lock_test_and_set (xd->lockp, 1))
-            /* zzzz */;
+          queue_id = queue_id % xd->tx_q_used;
+          while (__sync_lock_test_and_set (xd->lockp[queue_id], 1))
+            /* zzzz */
+            queue_id = (queue_id + 1) % xd->tx_q_used;
         }
 
       if (PREDICT_TRUE(xd->dev_type == VNET_DPDK_DEV_ETH)) 
@@ -261,20 +357,50 @@ u32 tx_burst_vector_internal (vlib_main_t * vm,
         } 
       else if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
         {
+          u32 offset = 0;
+          if (xd->need_txlock) {
+            queue_id = 0;
+            while (__sync_lock_test_and_set (xd->lockp[queue_id], 1));
+          }
+#if RTE_VERSION >= RTE_VERSION_NUM(2, 2, 0, 0)
+          else {
+              dpdk_device_and_queue_t * dq;
+              vec_foreach (dq, dm->devices_by_cpu[vm->cpu_index])
+              {
+                if (xd->device_index == dq->device)
+                    break; 
+              }
+              assert (dq);
+              offset = dq->queue_id * VIRTIO_QNUM;
+          }
+#endif
           if (PREDICT_TRUE(tx_head > tx_tail)) 
             {
+              int i; u32 bytes = 0;
+              struct rte_mbuf **pkts = &tx_vector[tx_tail];
+              for (i = 0; i < (tx_head - tx_tail); i++) {
+                  struct rte_mbuf *buff = pkts[i];
+                  bytes += rte_pktmbuf_data_len(buff);
+              } 
+                
               /* no wrap, transmit in one burst */
-              rv = rte_vhost_enqueue_burst(&xd->vu_vhost_dev, VIRTIO_RXQ,
+              rv = rte_vhost_enqueue_burst(&xd->vu_vhost_dev, offset + VIRTIO_RXQ,
                                            &tx_vector[tx_tail],
                                            (uint16_t) (tx_head-tx_tail));
               if (PREDICT_TRUE(rv > 0))
                 {
-                  if (dpdk_vhost_user_want_interrupt(xd, VIRTIO_RXQ)) {
-                    dpdk_vu_vring *vring = &(xd->vu_intf->vrings[VIRTIO_RXQ]);
+                  dpdk_vu_vring *vring = &(xd->vu_intf->vrings[offset + VIRTIO_TXQ]);
+                  vring->packets += rv;
+                  vring->bytes += bytes;
+
+                  if (dpdk_vhost_user_want_interrupt(xd, offset + VIRTIO_RXQ)) {
+                    vring = &(xd->vu_intf->vrings[offset + VIRTIO_RXQ]);
                     vring->n_since_last_int += rv;
 
-                    if (vring->n_since_last_int > dm->vhost_coalesce_frames)
-                      dpdk_vhost_user_send_interrupt(dm->vlib_main, xd, VIRTIO_RXQ);
+                    f64 now = vlib_time_now (vm);
+                    if (vring->int_deadline < now ||
+                        vring->n_since_last_int > dm->vhost_coalesce_frames)
+                      dpdk_vhost_user_send_interrupt(vm, xd, offset + VIRTIO_RXQ);
                   }
 
                   int c = rv;
@@ -289,18 +415,30 @@ u32 tx_burst_vector_internal (vlib_main_t * vm,
                * so we can try to transmit the rest. If we didn't transmit
                * everything, stop now.
                */
-              rv = rte_vhost_enqueue_burst(&xd->vu_vhost_dev, VIRTIO_RXQ,
+              int i; u32 bytes = 0;
+              struct rte_mbuf **pkts = &tx_vector[tx_tail];
+              for (i = 0; i < (DPDK_TX_RING_SIZE - tx_tail); i++) {
+                  struct rte_mbuf *buff = pkts[i];
+                  bytes += rte_pktmbuf_data_len(buff);
+              }
+              rv = rte_vhost_enqueue_burst(&xd->vu_vhost_dev, offset + VIRTIO_RXQ,
                                            &tx_vector[tx_tail], 
                                            (uint16_t) (DPDK_TX_RING_SIZE - tx_tail));
 
               if (PREDICT_TRUE(rv > 0))
                 {
-                  if (dpdk_vhost_user_want_interrupt(xd, VIRTIO_RXQ)) {
-                    dpdk_vu_vring *vring = &(xd->vu_intf->vrings[VIRTIO_RXQ]);
+                  dpdk_vu_vring *vring = &(xd->vu_intf->vrings[offset + VIRTIO_TXQ]);
+                  vring->packets += rv;
+                  vring->bytes += bytes;
+
+                  if (dpdk_vhost_user_want_interrupt(xd, offset + VIRTIO_RXQ)) {
+                    vring = &(xd->vu_intf->vrings[offset + VIRTIO_RXQ]);
                     vring->n_since_last_int += rv;
 
-                    if (vring->n_since_last_int > dm->vhost_coalesce_frames)
-                      dpdk_vhost_user_send_interrupt(dm->vlib_main, xd, VIRTIO_RXQ);
+                    f64 now = vlib_time_now (vm);
+                    if (vring->int_deadline < now ||
+                        vring->n_since_last_int > dm->vhost_coalesce_frames)
+                      dpdk_vhost_user_send_interrupt(vm, xd, offset + VIRTIO_RXQ);
                   }
 
                   int c = rv;
@@ -310,7 +448,11 @@ u32 tx_burst_vector_internal (vlib_main_t * vm,
 
               n_retry = (rv == DPDK_TX_RING_SIZE - tx_tail) ? 1 : 0;
             }
+
+          if (xd->need_txlock)
+            *xd->lockp[queue_id] = 0;
         }
+#if RTE_LIBRTE_KNI
       else if (xd->dev_type == VNET_DPDK_DEV_KNI)
         {
           if (PREDICT_TRUE(tx_head > tx_tail)) 
@@ -341,14 +483,16 @@ u32 tx_burst_vector_internal (vlib_main_t * vm,
               n_retry = (rv == DPDK_TX_RING_SIZE - tx_tail) ? 1 : 0;
             }
         } 
+#endif
       else
         {
           ASSERT(0);
           rv = 0;
         }
 
-      if (PREDICT_FALSE(xd->lockp != 0))
-          *xd->lockp = 0;
+      if (PREDICT_FALSE(xd->dev_type != VNET_DPDK_DEV_VHOST_USER &&
+            xd->lockp != 0))
+          *xd->lockp[queue_id] = 0;
 
       if (PREDICT_FALSE(rv < 0))
         {
@@ -467,7 +611,7 @@ dpdk_interface_tx (vlib_main_t * vm,
         {
           u32 bi0 = from[n_packets];
           vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
-          struct rte_mbuf *mb0 = ((struct rte_mbuf *)b0) - 1;
+          struct rte_mbuf *mb0 = rte_mbuf_from_vlib_buffer(b0);
           rte_pktmbuf_free (mb0);
         }
       return n_on_ring;
@@ -510,9 +654,9 @@ dpdk_interface_tx (vlib_main_t * vm,
       pref0 = vlib_get_buffer (vm, pi0);
       pref1 = vlib_get_buffer (vm, pi1);
 
-      prefmb0 = ((struct rte_mbuf *)pref0) - 1;
-      prefmb1 = ((struct rte_mbuf *)pref1) - 1;
-      
+      prefmb0 = rte_mbuf_from_vlib_buffer(pref0);
+      prefmb1 = rte_mbuf_from_vlib_buffer(pref1);
+
       CLIB_PREFETCH(prefmb0, CLIB_CACHE_LINE_BYTES, LOAD);
       CLIB_PREFETCH(pref0, CLIB_CACHE_LINE_BYTES, LOAD);
       CLIB_PREFETCH(prefmb1, CLIB_CACHE_LINE_BYTES, LOAD);
@@ -525,44 +669,44 @@ dpdk_interface_tx (vlib_main_t * vm,
       b0 = vlib_get_buffer (vm, bi0);
       b1 = vlib_get_buffer (vm, bi1);
 
-      mb0 = ((struct rte_mbuf *)b0) - 1;
-      mb1 = ((struct rte_mbuf *)b1) - 1;
+      mb0 = rte_mbuf_from_vlib_buffer(b0);
+      mb1 = rte_mbuf_from_vlib_buffer(b1);
 
       any_clone = b0->clone_count | b1->clone_count;
       if (PREDICT_FALSE(any_clone != 0))
         {
           if (PREDICT_FALSE(b0->clone_count != 0))
-        {
-          struct rte_mbuf * mb0_new = dpdk_replicate_packet_mb (b0);
-          if (PREDICT_FALSE(mb0_new == 0))
-            {
-              vlib_error_count (vm, node->node_index,
-                    DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
-              b0->flags |= VLIB_BUFFER_REPL_FAIL;
-            }
-          else
-            mb0 = mb0_new;
-          vec_add1 (dm->recycle[my_cpu], bi0);
-        }
+           {
+             struct rte_mbuf * mb0_new = dpdk_replicate_packet_mb (b0);
+             if (PREDICT_FALSE(mb0_new == 0))
+               {
+                 vlib_error_count (vm, node->node_index,
+                                   DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
+                 b0->flags |= VLIB_BUFFER_REPL_FAIL;
+               }
+             else
+               mb0 = mb0_new;
+             vec_add1 (dm->recycle[my_cpu], bi0);
+           }
           if (PREDICT_FALSE(b1->clone_count != 0))
-        {
-          struct rte_mbuf * mb1_new = dpdk_replicate_packet_mb (b1);
-          if (PREDICT_FALSE(mb1_new == 0))
-            {
-              vlib_error_count (vm, node->node_index,
-                    DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
-              b1->flags |= VLIB_BUFFER_REPL_FAIL;
-            }
-          else
-            mb1 = mb1_new;
-          vec_add1 (dm->recycle[my_cpu], bi1);
-        }
-    }
+           {
+             struct rte_mbuf * mb1_new = dpdk_replicate_packet_mb (b1);
+             if (PREDICT_FALSE(mb1_new == 0))
+               {
+                 vlib_error_count (vm, node->node_index,
+                                   DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
+                 b1->flags |= VLIB_BUFFER_REPL_FAIL;
+               }
+             else
+               mb1 = mb1_new;
+             vec_add1 (dm->recycle[my_cpu], bi1);
+           }
+       }
 
       delta0 = PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL) ? 0 :
-    vlib_buffer_length_in_chain (vm, b0) - (i16) mb0->pkt_len;
+       vlib_buffer_length_in_chain (vm, b0) - (i16) mb0->pkt_len;
       delta1 = PREDICT_FALSE(b1->flags & VLIB_BUFFER_REPL_FAIL) ? 0 :
-    vlib_buffer_length_in_chain (vm, b1) - (i16) mb1->pkt_len;
+       vlib_buffer_length_in_chain (vm, b1) - (i16) mb1->pkt_len;
       
       new_data_len0 = (u16)((i16) mb0->data_len + delta0);
       new_data_len1 = (u16)((i16) mb1->data_len + delta1);
@@ -577,23 +721,23 @@ dpdk_interface_tx (vlib_main_t * vm,
       mb1->pkt_len = new_pkt_len1;
 
       mb0->data_off = (PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL)) ?
-          mb0->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b0->current_data);
+       mb0->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b0->current_data);
       mb1->data_off = (PREDICT_FALSE(b1->flags & VLIB_BUFFER_REPL_FAIL)) ?
-          mb1->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b1->current_data);
+       mb1->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b1->current_data);
 
       if (PREDICT_FALSE(node->flags & VLIB_NODE_FLAG_TRACE))
-    {
+       {
           if (b0->flags & VLIB_BUFFER_IS_TRACED)
-              dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
+           dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
           if (b1->flags & VLIB_BUFFER_IS_TRACED)
-              dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi1, b1);
-    }
+           dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi1, b1);
+       }
 
       if (PREDICT_TRUE(any_clone == 0))
         {
-      tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
+         tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
           i++;
-      tx_vector[i % DPDK_TX_RING_SIZE] = mb1;
+         tx_vector[i % DPDK_TX_RING_SIZE] = mb1;
           i++;
         }
       else
@@ -601,16 +745,16 @@ dpdk_interface_tx (vlib_main_t * vm,
           /* cloning was done, need to check for failure */
           if (PREDICT_TRUE((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0))
             {
-          tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
+             tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
               i++;
             }
           if (PREDICT_TRUE((b1->flags & VLIB_BUFFER_REPL_FAIL) == 0))
             {
-          tx_vector[i % DPDK_TX_RING_SIZE] = mb1;
+             tx_vector[i % DPDK_TX_RING_SIZE] = mb1;
               i++;
             }
         }
-
+      
       n_left -= 2;
     }
   while (n_left > 0)
@@ -627,23 +771,23 @@ dpdk_interface_tx (vlib_main_t * vm,
       
       b0 = vlib_get_buffer (vm, bi0);
 
-      mb0 = ((struct rte_mbuf *)b0) - 1;
+      mb0 = rte_mbuf_from_vlib_buffer(b0);
       if (PREDICT_FALSE(b0->clone_count != 0))
-    {
-      struct rte_mbuf * mb0_new = dpdk_replicate_packet_mb (b0);
-      if (PREDICT_FALSE(mb0_new == 0))
-        {
-          vlib_error_count (vm, node->node_index,
-                DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
-          b0->flags |= VLIB_BUFFER_REPL_FAIL;
-        }
-      else
-        mb0 = mb0_new;
-      vec_add1 (dm->recycle[my_cpu], bi0);
-    }
+       {
+         struct rte_mbuf * mb0_new = dpdk_replicate_packet_mb (b0);
+         if (PREDICT_FALSE(mb0_new == 0))
+           {
+             vlib_error_count (vm, node->node_index,
+                               DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
+             b0->flags |= VLIB_BUFFER_REPL_FAIL;
+           }
+         else
+           mb0 = mb0_new;
+         vec_add1 (dm->recycle[my_cpu], bi0);
+       }
 
       delta0 = PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL) ? 0 :
-    vlib_buffer_length_in_chain (vm, b0) - (i16) mb0->pkt_len;
+       vlib_buffer_length_in_chain (vm, b0) - (i16) mb0->pkt_len;
       
       new_data_len0 = (u16)((i16) mb0->data_len + delta0);
       new_pkt_len0 = (u16)((i16) mb0->pkt_len + delta0);
@@ -652,15 +796,15 @@ dpdk_interface_tx (vlib_main_t * vm,
       mb0->data_len = new_data_len0;
       mb0->pkt_len = new_pkt_len0;
       mb0->data_off = (PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL)) ?
-          mb0->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b0->current_data);
+       mb0->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b0->current_data);
 
       if (PREDICT_FALSE(node->flags & VLIB_NODE_FLAG_TRACE))
-          if (b0->flags & VLIB_BUFFER_IS_TRACED)
-              dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
+       if (b0->flags & VLIB_BUFFER_IS_TRACED)
+         dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
 
       if (PREDICT_TRUE((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0))
         {
-      tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
+         tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
           i++;
         }
       n_left--;
@@ -707,15 +851,15 @@ dpdk_interface_tx (vlib_main_t * vm,
           vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, n_packets);
 
           vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_PKT_DROP,
-                n_packets);
+                           n_packets);
 
           while (n_packets--)
             rte_pktmbuf_free (tx_vector[ring->tx_tail + n_packets]);
         }
 
         /* Reset head/tail to avoid unnecessary wrap */
-        ring->tx_head = 0;
-        ring->tx_tail = 0;
+      ring->tx_head = 0;
+      ring->tx_tail = 0;
     }
 
   /* Recycle replicated buffers */
@@ -737,391 +881,15 @@ static int dpdk_device_renumber (vnet_hw_interface_t * hi,
   dpdk_device_t * xd = vec_elt_at_index (dm->devices, hi->dev_instance);
 
   if (!xd || xd->dev_type != VNET_DPDK_DEV_VHOST_USER) {
-      clib_warning("cannot renumber non-vhost-user interface (sw_if_index: %d)",
-              hi->sw_if_index);
-      return 0;
+    clib_warning("cannot renumber non-vhost-user interface (sw_if_index: %d)",
+                hi->sw_if_index);
+    return 0;
   }
 
   xd->vu_if_id = new_dev_instance;
   return 0;
 }
 
-static u8 * format_dpdk_device_name (u8 * s, va_list * args)
-{
-  dpdk_main_t * dm = &dpdk_main;
-  char *devname_format;
-  char *device_name;
-  u32 i = va_arg (*args, u32);
-  struct rte_eth_dev_info dev_info;
-  u8 * ret;
-
-  if (dm->interface_name_format_decimal)
-    devname_format = "%s%d/%d/%d";
-  else
-    devname_format = "%s%x/%x/%x";
-
-  if (dm->devices[i].dev_type == VNET_DPDK_DEV_KNI) {
-       return format(s, "kni%d", dm->devices[i].kni_port_id);
-  } else if (dm->devices[i].dev_type == VNET_DPDK_DEV_VHOST_USER) {
-       return format(s, "VirtualEthernet0/0/%d", dm->devices[i].vu_if_id);
-  }
-  switch (dm->devices[i].port_type)
-    {
-    case VNET_DPDK_PORT_TYPE_ETH_1G:
-      device_name = "GigabitEthernet";
-      break;
-
-    case VNET_DPDK_PORT_TYPE_ETH_10G:
-      device_name = "TenGigabitEthernet";
-      break;
-
-    case VNET_DPDK_PORT_TYPE_ETH_40G:
-      device_name = "FortyGigabitEthernet";
-      break;
-
-    case VNET_DPDK_PORT_TYPE_ETH_SWITCH:
-      device_name = "EthernetSwitch";
-      break;
-
-  #ifdef NETMAP
-    case VNET_DPDK_PORT_TYPE_NETMAP:
-       rte_eth_dev_info_get(i, &dev_info);
-       return format(s, "netmap:%s", dev_info.driver_name);
-  #endif
-
-    case VNET_DPDK_PORT_TYPE_AF_PACKET:
-      rte_eth_dev_info_get(i, &dev_info);
-      return format(s, "af_packet%d", dm->devices[i].af_packet_port_id);
-
-    default:
-    case VNET_DPDK_PORT_TYPE_UNKNOWN:
-      device_name = "UnknownEthernet";
-      break;
-    }
-
-  rte_eth_dev_info_get(i, &dev_info);
-  ret = format (s, devname_format, device_name, dev_info.pci_dev->addr.bus,
-                dev_info.pci_dev->addr.devid,
-                dev_info.pci_dev->addr.function);
-
-  /* address Chelsio cards which share PCI address */
-       if (dm->devices[i].pmd ==  VNET_DPDK_PMD_CXGBE) {
-    struct rte_eth_dev_info di;
-
-    di.pci_dev = 0;
-    rte_eth_dev_info_get(i+1, &di);
-    if (di.pci_dev && memcmp(&dev_info.pci_dev->addr, &di.pci_dev->addr,
-        sizeof(struct rte_pci_addr)) == 0)
-           return format(ret, "/0");   
-
-    di.pci_dev = 0;
-    rte_eth_dev_info_get(i-1, &di);
-    if (di.pci_dev && memcmp(&dev_info.pci_dev->addr, &di.pci_dev->addr,
-        sizeof(struct rte_pci_addr)) == 0)
-           return format(ret, "/1");   
-       }
-  return ret;
-}
-
-static u8 * format_dpdk_device_type (u8 * s, va_list * args)
-{
-  dpdk_main_t * dm = &dpdk_main;
-  char *dev_type;
-  u32 i = va_arg (*args, u32);
-
-  if (dm->devices[i].dev_type == VNET_DPDK_DEV_KNI) {
-       return format(s, "Kernel NIC Interface");
-  } else if (dm->devices[i].dev_type == VNET_DPDK_DEV_VHOST_USER) {
-       return format(s, "vhost-user interface");
-  }
-
-  switch (dm->devices[i].pmd)
-    {
-    case VNET_DPDK_PMD_E1000EM:
-       dev_type = "Intel 82540EM (e1000)";
-       break;
-
-    case VNET_DPDK_PMD_IGB:
-       dev_type = "Intel e1000";
-       break;
-
-    case VNET_DPDK_PMD_I40E:
-       dev_type = "Intel X710/XL710 Family";
-       break;
-
-    case VNET_DPDK_PMD_I40EVF:
-       dev_type = "Intel X710/XL710 Family VF";
-       break;
-
-    case VNET_DPDK_PMD_FM10K:
-       dev_type = "Intel FM10000 Family Ethernet Switch";
-       break;
-
-    case VNET_DPDK_PMD_IGBVF:
-       dev_type = "Intel e1000 VF";
-       break;
-
-    case VNET_DPDK_PMD_VIRTIO:
-       dev_type = "Red Hat Virtio";
-       break;
-
-    case VNET_DPDK_PMD_IXGBEVF:
-       dev_type = "Intel 82599 VF";
-       break;
-
-    case VNET_DPDK_PMD_IXGBE:
-       dev_type = "Intel 82599";
-       break;
-
-    case VNET_DPDK_PMD_VICE:
-    case VNET_DPDK_PMD_ENIC:
-       dev_type = "Cisco VIC";
-       break;
-
-    case VNET_DPDK_PMD_CXGBE:
-       dev_type = "Chelsio T4/T5";
-       break;
-
-    case VNET_DPDK_PMD_VMXNET3:
-       dev_type = "VMware VMXNET3";
-       break;
-
-#ifdef NETMAP
-    case VNET_DPDK_PMD_NETMAP:
-       dev_type = "Netmap/Vale";
-       break;
-#endif
-
-    case VNET_DPDK_PMD_AF_PACKET:
-  dev_type = "af_packet";
-  break;
-
-    default:
-    case VNET_DPDK_PMD_UNKNOWN:
-       dev_type = "### UNKNOWN ###";
-       break;
-    }
-
-  return format (s, dev_type);
-}
-
-static u8 * format_dpdk_link_status (u8 * s, va_list * args)
-{
-  dpdk_device_t * xd = va_arg (*args, dpdk_device_t *);
-  struct rte_eth_link * l = &xd->link;
-  vnet_main_t * vnm = vnet_get_main();
-  vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, xd->vlib_hw_if_index);
-  
-  s = format (s, "%s ", l->link_status ? "up" : "down");
-  if (l->link_status)
-    {
-      u32 promisc = rte_eth_promiscuous_get (xd->device_index);
-
-      s = format (s, "%s duplex ", (l->link_duplex == ETH_LINK_FULL_DUPLEX) ?
-                  "full" : "half");
-      s = format (s, "speed %u mtu %d %s\n", l->link_speed,
-                 hi->max_packet_bytes, promisc ? " promisc" : "");
-    }
-  else
-    s = format (s, "\n");
-
-  return s;
-}
-
-#define _line_len 72
-#define _(v, str)                                            \
-if (bitmap & v) {                                            \
-  if (format_get_indent (s) > next_split ) {                 \
-    next_split += _line_len;                                 \
-    s = format(s,"\n%U", format_white_space, indent);        \
-  }                                                          \
-  s = format(s, "%s ", str);                                 \
-}
-
-static u8 * format_dpdk_rss_hf_name(u8 * s, va_list * args)
-{
-  u64 bitmap = va_arg (*args, u64);
-  int next_split = _line_len;
-  int indent = format_get_indent (s);
-
-  if (!bitmap)
-    return format(s, "none");
-
-  foreach_dpdk_rss_hf
-
-  return s;
-}
-
-static u8 * format_dpdk_rx_offload_caps(u8 * s, va_list * args)
-{
-  u32 bitmap = va_arg (*args, u32);
-  int next_split = _line_len;
-  int indent = format_get_indent (s);
-
-  if (!bitmap)
-    return format(s, "none");
-
-  foreach_dpdk_rx_offload_caps
-
-  return s;
-}
-
-static u8 * format_dpdk_tx_offload_caps(u8 * s, va_list * args)
-{
-  u32 bitmap = va_arg (*args, u32);
-  int next_split = _line_len;
-  int indent = format_get_indent (s);
-  if (!bitmap)
-    return format(s, "none");
-
-  foreach_dpdk_tx_offload_caps
-
-  return s;
-}
-
-#undef _line_len
-#undef _
-
-static u8 * format_dpdk_device (u8 * s, va_list * args)
-{
-  u32 dev_instance = va_arg (*args, u32);
-  int verbose = va_arg (*args, int);
-  dpdk_main_t * dm = &dpdk_main;
-  dpdk_device_t * xd = vec_elt_at_index (dm->devices, dev_instance);
-  uword indent = format_get_indent (s);
-  f64 now = vlib_time_now (dm->vlib_main);
-
-  dpdk_update_counters (xd, now);
-  dpdk_update_link_state (xd, now);
-
-  s = format (s, "%U\n%Ucarrier %U",
-             format_dpdk_device_type, xd->device_index,
-             format_white_space, indent + 2,
-             format_dpdk_link_status, xd);
-
-  if (verbose > 1 && xd->dev_type == VNET_DPDK_DEV_ETH)
-    {
-      struct rte_eth_dev_info di;
-      struct rte_pci_device * pci;
-      struct rte_eth_rss_conf rss_conf;
-      int vlan_off;
-
-      rss_conf.rss_key = 0;
-      rte_eth_dev_info_get(xd->device_index, &di);
-      rte_eth_dev_rss_hash_conf_get(xd->device_index, &rss_conf);
-      pci = di.pci_dev;
-
-      if (pci)
-        s = format(s, "%Upci id:            device %04x:%04x subsystem %04x:%04x\n"
-                      "%Upci address:       %04x:%02x:%02x.%02x\n",
-                   format_white_space, indent + 2,
-                   pci->id.vendor_id, pci->id.device_id,
-                   pci->id.subsystem_vendor_id,
-                   pci->id.subsystem_device_id,
-                   format_white_space, indent + 2,
-                   pci->addr.domain, pci->addr.bus,
-                   pci->addr.devid, pci->addr.function);
-      s = format(s, "%Umax rx packet len: %d\n",
-                 format_white_space, indent + 2, di.max_rx_pktlen);
-      s = format(s, "%Upromiscuous:       unicast %s all-multicast %s\n",
-                 format_white_space, indent + 2,
-                 rte_eth_promiscuous_get(xd->device_index) ? "on" : "off",
-                 rte_eth_promiscuous_get(xd->device_index) ? "on" : "off");
-      vlan_off = rte_eth_dev_get_vlan_offload(xd->device_index);
-      s = format(s, "%Uvlan offload:      strip %s filter %s qinq %s\n",
-                 format_white_space, indent + 2,
-                 vlan_off & ETH_VLAN_STRIP_OFFLOAD ? "on" : "off",
-                 vlan_off & ETH_VLAN_FILTER_OFFLOAD ? "on" : "off",
-                 vlan_off & ETH_VLAN_EXTEND_OFFLOAD ? "on" : "off");
-      s = format(s, "%Uqueue size (max):  rx %d (%d) tx %d (%d)\n",
-                 format_white_space, indent + 2,
-                 xd->rx_q_used, di.max_rx_queues,
-                 xd->tx_q_used, di.max_tx_queues);
-      s = format(s, "%Urx offload caps:   %U\n",
-                 format_white_space, indent + 2,
-                 format_dpdk_rx_offload_caps, di.rx_offload_capa);
-      s = format(s, "%Utx offload caps:   %U\n",
-                 format_white_space, indent + 2,
-                 format_dpdk_tx_offload_caps, di.tx_offload_capa);
-      s = format(s, "%Urss active:        %U\n"
-                    "%Urss supported:     %U\n",
-                 format_white_space, indent + 2,
-                 format_dpdk_rss_hf_name, rss_conf.rss_hf,
-                 format_white_space, indent + 2,
-                 format_dpdk_rss_hf_name, di.flow_type_rss_offloads);
-    }
-
-  if (xd->cpu_socket > -1)
-    s = format (s, "%Ucpu socket %d",
-                format_white_space, indent + 2,
-                xd->cpu_socket);
-
-  /* $$$ MIB counters  */
-
-  {
-#define _(N, V)                                                        \
-    if (xd->stats.V != 0)                                      \
-      s = format (s, "\n%U%-40U%16Ld",                         \
-                 format_white_space, indent + 2,               \
-                 format_c_identifier, #N, xd->stats.V);
-    
-    foreach_dpdk_counter
-#undef _
-  }
-
-  u8 * xs = 0;
-  struct rte_eth_xstats * xstat;
-
-  vec_foreach(xstat, xd->xstats)
-    {
-      if (xstat->value)
-        {
-          /* format_c_identifier don't like c strings inside vector */
-          u8 * name = format(0,"%s", xstat->name);
-          xs = format(xs, "\n%U%-38U%16Ld",
-                      format_white_space, indent + 4,
-                      format_c_identifier, name, xstat->value);
-          vec_free(name);
-        }
-    }
-
-  if (xs)
-    {
-      s = format(s, "\n%Uextended stats:%v",
-                 format_white_space, indent + 2, xs);
-      vec_free(xs);
-    }
-
-  return s;
-}
-
-static u8 * format_dpdk_tx_dma_trace (u8 * s, va_list * va)
-{
-  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
-  CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
-  CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main();
-  dpdk_tx_dma_trace_t * t = va_arg (*va, dpdk_tx_dma_trace_t *);
-  dpdk_main_t * dm = &dpdk_main;
-  dpdk_device_t * xd = vec_elt_at_index (dm->devices, t->device_index);
-  uword indent = format_get_indent (s);
-  vnet_sw_interface_t * sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index);
-
-  s = format (s, "%U tx queue %d",
-             format_vnet_sw_interface_name, vnm, sw,
-             t->queue_index);
-
-  s = format (s, "\n%Ubuffer 0x%x: %U",
-             format_white_space, indent,
-             t->buffer_index,
-             format_vlib_buffer, &t->buffer);
-
-  s = format (s, "\n%U%U", format_white_space, indent,
-             format_ethernet_header_with_length, t->buffer.pre_data,
-             sizeof (t->buffer.pre_data));
-  
-  return s;
-}
-
 static void dpdk_clear_hw_interface_counters (u32 instance)
 {
   dpdk_main_t * dm = &dpdk_main;
@@ -1135,19 +903,38 @@ static void dpdk_clear_hw_interface_counters (u32 instance)
    */
   if (xd->admin_up != 0xff)
     {
-      rte_eth_stats_reset (xd->device_index);
-      memset (&xd->last_stats, 0, sizeof (xd->last_stats));
+      /*
+       * Set the "last_cleared_stats" to the current stats, so that
+       * things appear to clear from a display perspective.
+       */
       dpdk_update_counters (xd, vlib_time_now (dm->vlib_main));
+
+      clib_memcpy (&xd->last_cleared_stats, &xd->stats, sizeof(xd->stats));
+      clib_memcpy (xd->last_cleared_xstats, xd->xstats,
+             vec_len(xd->last_cleared_xstats) *
+             sizeof(xd->last_cleared_xstats[0]));
     }
   else
     {
-      rte_eth_stats_reset (xd->device_index);
-      memset(&xd->stats, 0, sizeof(xd->stats));
+      /*
+       * Internally rte_eth_xstats_reset() is calling rte_eth_stats_reset(),
+       * so we're only calling xstats_reset() here.
+       */
+      rte_eth_xstats_reset (xd->device_index);
+      memset (&xd->stats, 0, sizeof(xd->stats));
       memset (&xd->last_stats, 0, sizeof (xd->last_stats));
     }
-  rte_eth_xstats_reset(xd->device_index);
+
+  if (PREDICT_FALSE(xd->dev_type == VNET_DPDK_DEV_VHOST_USER)) {
+    int i;
+    for (i = 0; i < xd->rx_q_used * VIRTIO_QNUM; i++) {
+      xd->vu_intf->vrings[i].packets = 0;
+      xd->vu_intf->vrings[i].bytes = 0;
+    }
+  }
 }
 
+#ifdef RTE_LIBRTE_KNI
 static int
 kni_config_network_if(u8 port_id, u8 if_up)
 {
@@ -1192,6 +979,7 @@ kni_change_mtu(u8 port_id, unsigned new_mtu)
 
   return 0;
 }
+#endif
 
 static clib_error_t *
 dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
@@ -1202,6 +990,7 @@ dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
   dpdk_device_t * xd = vec_elt_at_index (dm->devices, hif->dev_instance);
   int rv = 0;
 
+#ifdef RTE_LIBRTE_KNI
   if (xd->dev_type == VNET_DPDK_DEV_KNI)
   {
       if (is_up)
@@ -1212,7 +1001,7 @@ dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
           vlib_buffer_main_t * bm = vm->buffer_main;
           memset(&conf, 0, sizeof(conf));
           snprintf(conf.name, RTE_KNI_NAMESIZE, "vpp%u", xd->kni_port_id);
-          conf.mbuf_size = MBUF_SIZE;
+          conf.mbuf_size = VLIB_BUFFER_DATA_SIZE;
           memset(&ops, 0, sizeof(ops));
           ops.port_id = xd->kni_port_id;
           ops.change_mtu = kni_change_mtu;
@@ -1236,6 +1025,7 @@ dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
       }
       return 0;
   }
+#endif
   if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
     {
       if (is_up)
@@ -1281,6 +1071,17 @@ dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
     }
   else
     {
+      /*
+       * DAW-FIXME: VMXNET3 device stop/start doesn't work,
+       * therefore fake the stop in the dpdk driver by
+       * silently dropping all of the incoming pkts instead of
+       * stopping the driver / hardware.
+       */
+      if (xd->pmd != VNET_DPDK_PMD_VMXNET3)
+         xd->admin_up = 0;
+      else
+         xd->admin_up = ~0;
+
       rte_eth_allmulticast_disable (xd->device_index);
       vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0);
 
@@ -1291,12 +1092,7 @@ dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
        * stopping the driver / hardware.
        */
       if (xd->pmd != VNET_DPDK_PMD_VMXNET3)
-       {
          rte_eth_dev_stop (xd->device_index);
-         xd->admin_up = 0;
-       }
-      else
-         xd->admin_up = ~0;
     }
 
   if (rv < 0)
@@ -1344,8 +1140,10 @@ dpdk_subif_add_del_function (vnet_main_t * vnm,
 
   if (xd->dev_type != VNET_DPDK_DEV_ETH)
         return 0;
-  /* currently we program VLANS only for IXGBE VF */
-  if (xd->pmd != VNET_DPDK_PMD_IXGBEVF)
+
+  /* currently we program VLANS only for IXGBE VF and I40E VF */
+  if ((xd->pmd != VNET_DPDK_PMD_IXGBEVF) &&
+      (xd->pmd != VNET_DPDK_PMD_I40EVF))
         return 0;
 
   if (t->sub.eth.flags.no_tags == 1)
@@ -1505,3 +1303,81 @@ int rte_delay_us_override (unsigned us) {
     }
   return 0; // no override
 }
+
+/*
+ * Return a copy of the DPDK port stats for hw_if_index in dest.
+ * Returns 0 on success, or a clib error if dest or the device is NULL. */
+clib_error_t*
+dpdk_get_hw_interface_stats (u32 hw_if_index, struct rte_eth_stats* dest)
+{
+  dpdk_main_t * dm = &dpdk_main;
+  vnet_main_t * vnm = vnet_get_main();
+  vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, hw_if_index);
+  dpdk_device_t * xd = vec_elt_at_index (dm->devices, hi->dev_instance);
+  /* NOTE: both checks below run only after the lookups above. */
+  if (!dest) {
+     return clib_error_return (0, "Missing or NULL argument");
+  }
+  if (!xd) {
+     return clib_error_return (0, "Unable to get DPDK device from HW interface");
+  }
+  /* Call dpdk_update_counters with the current vlib time before copying. */
+  dpdk_update_counters (xd, vlib_time_now (dm->vlib_main));
+  /* Copies sizeof(xd->stats) bytes, so dest must be at least that large. */
+  clib_memcpy(dest, &xd->stats, sizeof(xd->stats));
+  return (0);
+}
+
+/*
+ * Return the number of dpdk mbufs, read from the num_mbufs field of dpdk_main.
+ */
+u32 dpdk_num_mbufs (void)
+{
+  dpdk_main_t * dm = &dpdk_main;
+  /* NOTE(review): total vs. per-pool count is not visible here - confirm. */
+  return dm->num_mbufs;
+}
+
+/*
+ * Return the current value of the io_thread_release field of dpdk_main.
+ */
+int dpdk_io_thread_release (void)
+{
+  dpdk_main_t * dm = &dpdk_main;
+  /* NOTE(review): the value's meaning is set by its writers, not shown here. */
+  return dm->io_thread_release;
+}
+
+/*
+ * Return the pmd type (xd->pmd) of the dpdk device behind hardware interface hi.
+ */
+dpdk_pmd_t dpdk_get_pmd_type (vnet_hw_interface_t *hi)
+{
+  dpdk_main_t   * dm = &dpdk_main;
+  dpdk_device_t * xd;
+  /* hi must be non-NULL; <assert.h> assert is compiled out under NDEBUG. */
+  assert (hi);
+  /* hi->dev_instance is the index of this interface's entry in dm->devices. */
+  xd = vec_elt_at_index (dm->devices, hi->dev_instance);
+  /* Guard against a NULL element pointer before dereferencing it. */
+  assert (xd);
+
+  return xd->pmd;
+}
+
+/*
+ * Return the cpu socket (xd->cpu_socket) of the dpdk device behind interface hi.
+ */
+i8 dpdk_get_cpu_socket (vnet_hw_interface_t *hi)
+{
+  dpdk_main_t   * dm = &dpdk_main;
+  dpdk_device_t * xd;
+  /* hi must be non-NULL; <assert.h> assert is compiled out under NDEBUG. */
+  assert (hi);
+  /* hi->dev_instance is the index of this interface's entry in dm->devices. */
+  xd = vec_elt_at_index(dm->devices, hi->dev_instance);
+  /* Guard against a NULL element pointer before dereferencing it. */
+  assert (xd);
+
+  return xd->cpu_socket;
+}