vpp_lite: add cpu pinning support (VPP-467)
[vpp.git] / vnet / vnet / devices / dpdk / init.c
index 705c037..73edc4a 100644 (file)
@@ -71,7 +71,7 @@ dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd)
 
   ASSERT (os_get_cpu_number () == 0);
 
-  if (xd->admin_up)
+  if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
     {
       vnet_hw_interface_set_flags (dm->vnet_main, xd->vlib_hw_if_index, 0);
       rte_eth_dev_stop (xd->device_index);
@@ -123,7 +123,7 @@ dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd)
                                  xd->device_index, rv);
     }
 
-  if (xd->admin_up)
+  if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
     {
       int rv;
       rv = rte_eth_dev_start (xd->device_index);
@@ -143,12 +143,16 @@ dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags)
 
   if (ETHERNET_INTERFACE_FLAG_CONFIG_PROMISC (flags))
     {
-      old = xd->promisc;
-      xd->promisc = flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL;
+      old = (xd->flags & DPDK_DEVICE_FLAG_PROMISC) != 0;
 
-      if (xd->admin_up)
+      if (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL)
+       xd->flags |= DPDK_DEVICE_FLAG_PROMISC;
+      else
+       xd->flags &= ~DPDK_DEVICE_FLAG_PROMISC;
+
+      if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
        {
-         if (xd->promisc)
+         if (xd->flags & DPDK_DEVICE_FLAG_PROMISC)
            rte_eth_promiscuous_enable (xd->device_index);
          else
            rte_eth_promiscuous_disable (xd->device_index);
@@ -181,7 +185,7 @@ dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags)
 
          xd->port_conf.rxmode.max_rx_pkt_len = hi->max_packet_bytes;
 
-         if (xd->admin_up)
+         if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
            rte_eth_dev_stop (xd->device_index);
 
          rv = rte_eth_dev_configure
@@ -194,7 +198,7 @@ dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags)
 
          rte_eth_dev_set_mtu (xd->device_index, hi->max_packet_bytes);
 
-         if (xd->admin_up)
+         if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
            {
              int rv = rte_eth_dev_start (xd->device_index);
              if (rv < 0)
@@ -247,10 +251,10 @@ dpdk_lib_init (dpdk_main_t * dm)
   dpdk_device_t *xd;
   vlib_pci_addr_t last_pci_addr;
   u32 last_pci_addr_port = 0;
-  vlib_thread_registration_t *tr;
-  uword *p;
+  vlib_thread_registration_t *tr, *tr_hqos;
+  uword *p, *p_hqos;
 
-  u32 next_cpu = 0;
+  u32 next_cpu = 0, next_hqos_cpu = 0;
   u8 af_packet_port_id = 0;
   last_pci_addr.as_u32 = ~0;
 
@@ -276,6 +280,30 @@ dpdk_lib_init (dpdk_main_t * dm)
   vec_validate_aligned (dm->workers, tm->n_vlib_mains - 1,
                        CLIB_CACHE_LINE_BYTES);
 
+  dm->hqos_cpu_first_index = 0;
+  dm->hqos_cpu_count = 0;
+
+  /* find out which cpus will be used for I/O TX */
+  p_hqos = hash_get_mem (tm->thread_registrations_by_name, "hqos-threads");
+  tr_hqos = p_hqos ? (vlib_thread_registration_t *) p_hqos[0] : 0;
+
+  if (tr_hqos && tr_hqos->count > 0)
+    {
+      dm->hqos_cpu_first_index = tr_hqos->first_index;
+      dm->hqos_cpu_count = tr_hqos->count;
+    }
+
+  vec_validate_aligned (dm->devices_by_hqos_cpu, tm->n_vlib_mains - 1,
+                       CLIB_CACHE_LINE_BYTES);
+
+  vec_validate_aligned (dm->hqos_threads, tm->n_vlib_mains - 1,
+                       CLIB_CACHE_LINE_BYTES);
+
+#ifdef NETMAP
+  if (rte_netmap_probe () < 0)
+    return clib_error_return (0, "rte netmap probe failed");
+#endif
+
   nports = rte_eth_dev_count ();
   if (nports < 1)
     {
@@ -401,7 +429,7 @@ dpdk_lib_init (dpdk_main_t * dm)
       else
        xd->rx_q_used = 1;
 
-      xd->dev_type = VNET_DPDK_DEV_ETH;
+      xd->flags |= DPDK_DEVICE_FLAG_PMD;
 
       /* workaround for drivers not setting driver_name */
       if ((!dev_info.driver_name) && (dev_info.pci_dev))
@@ -624,7 +652,7 @@ dpdk_lib_init (dpdk_main_t * dm)
          /* *INDENT-OFF* */
          clib_bitmap_foreach (i, devconf->workers, ({
            int cpu = dm->input_cpu_first_index + i;
-           unsigned lcore = vlib_worker_threads[cpu].dpdk_lcore_id;
+           unsigned lcore = vlib_worker_threads[cpu].lcore_id;
            vec_validate(xd->cpu_socket_id_by_queue, q);
            xd->cpu_socket_id_by_queue[q] = rte_lcore_to_socket_id(lcore);
            vec_add2(dm->devices_by_cpu[cpu], dq, 1);
@@ -637,7 +665,7 @@ dpdk_lib_init (dpdk_main_t * dm)
        for (q = 0; q < xd->rx_q_used; q++)
          {
            int cpu = dm->input_cpu_first_index + next_cpu;
-           unsigned lcore = vlib_worker_threads[cpu].dpdk_lcore_id;
+           unsigned lcore = vlib_worker_threads[cpu].lcore_id;
 
            /*
             * numa node for worker thread handling this queue
@@ -658,11 +686,47 @@ dpdk_lib_init (dpdk_main_t * dm)
              next_cpu = 0;
          }
 
+
+      if (devconf->hqos_enabled)
+       {
+         xd->flags |= DPDK_DEVICE_FLAG_HQOS;
+
+         if (devconf->hqos.hqos_thread_valid)
+           {
+             int cpu = dm->hqos_cpu_first_index + devconf->hqos.hqos_thread;
+
+             if (devconf->hqos.hqos_thread >= dm->hqos_cpu_count)
+               return clib_error_return (0, "invalid HQoS thread index");
+
+             vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1);
+             dq->device = xd->device_index;
+             dq->queue_id = 0;
+           }
+         else
+           {
+             int cpu = dm->hqos_cpu_first_index + next_hqos_cpu;
+
+             if (dm->hqos_cpu_count == 0)
+               return clib_error_return (0, "no HQoS threads available");
+
+             vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1);
+             dq->device = xd->device_index;
+             dq->queue_id = 0;
+
+             next_hqos_cpu++;
+             if (next_hqos_cpu == dm->hqos_cpu_count)
+               next_hqos_cpu = 0;
+
+             devconf->hqos.hqos_thread_valid = 1;
+             devconf->hqos.hqos_thread = cpu;
+           }
+       }
+
       vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains,
                            CLIB_CACHE_LINE_BYTES);
       for (j = 0; j < tm->n_vlib_mains; j++)
        {
-         vec_validate_ha (xd->tx_vectors[j], DPDK_TX_RING_SIZE,
+         vec_validate_ha (xd->tx_vectors[j], xd->nb_tx_desc,
                           sizeof (tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES);
          vec_reset_length (xd->tx_vectors[j]);
        }
@@ -681,6 +745,13 @@ dpdk_lib_init (dpdk_main_t * dm)
       if (rv)
        return rv;
 
+      if (devconf->hqos_enabled)
+       {
+         rv = dpdk_port_setup_hqos (xd, &devconf->hqos);
+         if (rv < 0)
+           return rv;
+       }
+
       /* count the number of descriptors used for this device */
       nb_desc += xd->nb_rx_desc + xd->nb_tx_desc * xd->tx_q_used;
 
@@ -751,7 +822,7 @@ dpdk_lib_init (dpdk_main_t * dm)
 
          /* Create vnet interface */
          vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES);
-         xd->dev_type = VNET_DPDK_DEV_KNI;
+         xd->flags |= DPDK_DEVICE_FLAG_KNI;
 
          xd->device_index = xd - dm->devices;
          ASSERT (nports + i == xd->device_index);
@@ -771,7 +842,7 @@ dpdk_lib_init (dpdk_main_t * dm)
                                CLIB_CACHE_LINE_BYTES);
          for (j = 0; j < tm->n_vlib_mains; j++)
            {
-             vec_validate_ha (xd->tx_vectors[j], DPDK_TX_RING_SIZE,
+             vec_validate_ha (xd->tx_vectors[j], xd->nb_tx_desc,
                               sizeof (tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES);
              vec_reset_length (xd->tx_vectors[j]);
            }
@@ -829,18 +900,16 @@ dpdk_bind_devices_to_uio (dpdk_config_main_t * conf)
   vlib_pci_main_t *pm = &pci_main;
   clib_error_t *error;
   vlib_pci_device_t *d;
-  pci_config_header_t *c;
   u8 *pci_addr = 0;
   int num_whitelisted = vec_len (conf->dev_confs);
 
   /* *INDENT-OFF* */
   pool_foreach (d, pm->pci_devs, ({
     dpdk_device_config_t * devconf = 0;
-    c = &d->config0.header;
     vec_reset_length (pci_addr);
     pci_addr = format (pci_addr, "%U%c", format_vlib_pci_addr, &d->bus_address, 0);
 
-    if (c->device_class != PCI_CLASS_NETWORK_ETHERNET)
+    if (d->device_class != PCI_CLASS_NETWORK_ETHERNET)
       continue;
 
     if (num_whitelisted)
@@ -854,24 +923,24 @@ dpdk_bind_devices_to_uio (dpdk_config_main_t * conf)
       }
 
     /* virtio */
-    if (c->vendor_id == 0x1af4 && c->device_id == 0x1000)
+    if (d->vendor_id == 0x1af4 && d->device_id == 0x1000)
       ;
     /* vmxnet3 */
-    else if (c->vendor_id == 0x15ad && c->device_id == 0x07b0)
+    else if (d->vendor_id == 0x15ad && d->device_id == 0x07b0)
       ;
     /* all Intel devices */
-    else if (c->vendor_id == 0x8086)
+    else if (d->vendor_id == 0x8086)
       ;
     /* Cisco VIC */
-    else if (c->vendor_id == 0x1137 && c->device_id == 0x0043)
+    else if (d->vendor_id == 0x1137 && d->device_id == 0x0043)
       ;
     /* Chelsio T4/T5 */
-    else if (c->vendor_id == 0x1425 && (c->device_id & 0xe000) == 0x4000)
+    else if (d->vendor_id == 0x1425 && (d->device_id & 0xe000) == 0x4000)
       ;
     else
       {
         clib_warning ("Unsupported Ethernet PCI device 0x%04x:0x%04x found "
-                     "at PCI address %s\n", (u16) c->vendor_id, (u16) c->device_id,
+                     "at PCI address %s\n", (u16) d->vendor_id, (u16) d->device_id,
                      pci_addr);
         continue;
       }
@@ -925,6 +994,8 @@ dpdk_device_config (dpdk_config_main_t * conf, vlib_pci_addr_t pci_addr,
     }
 
   devconf->pci_addr.as_u32 = pci_addr.as_u32;
+  devconf->hqos_enabled = 0;
+  dpdk_device_config_hqos_default (&devconf->hqos);
 
   if (!input)
     return 0;
@@ -954,6 +1025,19 @@ dpdk_device_config (dpdk_config_main_t * conf, vlib_pci_addr_t pci_addr,
        devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_OFF;
       else if (unformat (input, "vlan-strip-offload on"))
        devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_ON;
+      else
+       if (unformat
+           (input, "hqos %U", unformat_vlib_cli_sub_input, &sub_input))
+       {
+         devconf->hqos_enabled = 1;
+         error = unformat_hqos (&sub_input, &devconf->hqos);
+         if (error)
+           break;
+       }
+      else if (unformat (input, "hqos"))
+       {
+         devconf->hqos_enabled = 1;
+       }
       else
        {
          error = clib_error_return (0, "unknown input `%U'",
@@ -1129,6 +1213,8 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input)
        else if (unformat (input, "default"))
        ;
 
+      else if (unformat_skip_white_space (input))
+       ;
       else
        {
          error = clib_error_return (0, "unknown input `%U'",
@@ -1195,57 +1281,22 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input)
       /* *INDENT-OFF* */
       clib_bitmap_foreach (c, tm->cpu_socket_bitmap, (
         {
-         u32 pages_avail, page_size, mem;
-         u8 *s = 0;
-          u8 *p = 0;
-         char * numa_path = "/sys/devices/system/node/node%u/";
-          char * nonnuma_path = "/sys/kernel/mm/";
-          char * suffix = "hugepages/hugepages-%ukB/free_hugepages%c";
-          char * path = NULL;
-          struct stat sb_numa, sb_nonnuma;
-
-          p = format(p, numa_path, c);
-          if (stat(numa_path, &sb_numa) < 0)
-            sb_numa.st_mode = 0;
-
-          if (stat(nonnuma_path, &sb_nonnuma) < 0)
-            sb_nonnuma.st_mode = 0;
-
-          if (S_ISDIR(sb_numa.st_mode)) {
-            path = (char*)format((u8*)path, "%s%s", p, suffix);
-          } else if (S_ISDIR(sb_nonnuma.st_mode)) {
-            path = (char*)format((u8*)path, "%s%s", nonnuma_path, suffix);
-          } else {
-            use_1g = 0;
-            use_2m = 0;
-            vec_free(p);
-            break;
-          }
+         int pages_avail, page_size, mem;
 
          vec_validate(mem_by_socket, c);
          mem = mem_by_socket[c];
 
          page_size = 1024;
-         pages_avail = 0;
-         s = format (s, path, page_size * 1024, 0);
-         vlib_sysfs_read ((char *) s, "%u", &pages_avail);
-         vec_reset_length (s);
+         pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024);
 
-         if (page_size * pages_avail < mem)
+         if (pages_avail < 0 || page_size * pages_avail < mem)
            use_1g = 0;
 
          page_size = 2;
-         pages_avail = 0;
-         s = format (s, path, page_size * 1024, 0);
-         vlib_sysfs_read ((char *) s, "%u", &pages_avail);
-         vec_reset_length (s);
+         pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024);
 
-         if (page_size * pages_avail < mem)
+         if (pages_avail < 0 || page_size * pages_avail < mem)
            use_2m = 0;
-
-         vec_free(s);
-         vec_free(p);
-         vec_free(path);
       }));
       /* *INDENT-ON* */
 
@@ -1454,7 +1505,7 @@ dpdk_update_link_state (dpdk_device_t * xd, f64 now)
   u8 hw_flags_chg = 0;
 
   /* only update link state for PMD interfaces */
-  if (xd->dev_type != VNET_DPDK_DEV_ETH)
+  if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0)
     return;
 
   xd->time_last_link_update = now ? now : xd->time_last_link_update;
@@ -1479,13 +1530,13 @@ dpdk_update_link_state (dpdk_device_t * xd, f64 now)
       } *ed;
       ed = ELOG_DATA (&vm->elog_main, e);
       ed->sw_if_index = xd->vlib_sw_if_index;
-      ed->admin_up = xd->admin_up;
+      ed->admin_up = (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) != 0;
       ed->old_link_state = (u8)
        vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index);
       ed->new_link_state = (u8) xd->link.link_status;
     }
 
-  if ((xd->admin_up == 1) &&
+  if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) &&
       ((xd->link.link_status != 0) ^
        vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index)))
     {
@@ -1686,6 +1737,7 @@ dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f)
                    clib_memcpy (bhi->hw_address, addr, 6);
                    clib_memcpy (bei->address, addr, 6);
                    /* Init l3 packet size allowed on bonded interface */
+                   bhi->max_packet_bytes = ETHERNET_MAX_PACKET_BYTES;
                    bhi->max_l3_packet_bytes[VLIB_RX] =
                      bhi->max_l3_packet_bytes[VLIB_TX] =
                      ETHERNET_MAX_PACKET_BYTES - sizeof (ethernet_header_t);
@@ -1711,6 +1763,17 @@ dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f)
                          vnet_get_sw_interface (vnm, sdev->vlib_sw_if_index);
                        shi->bond_info = VNET_HW_INTERFACE_BOND_INFO_SLAVE;
                        ssi->flags |= VNET_SW_INTERFACE_FLAG_BOND_SLAVE;
+
+                       /* Cap l3 packet size at the lowest among slaves */
+                       if (bhi->max_l3_packet_bytes[VLIB_RX] >
+                           shi->max_l3_packet_bytes[VLIB_RX])
+                         bhi->max_l3_packet_bytes[VLIB_RX] =
+                           bhi->max_l3_packet_bytes[VLIB_TX] =
+                           shi->max_l3_packet_bytes[VLIB_RX];
+
+                       /* Cap max packet size at the lowest among slaves */
+                       if (bhi->max_packet_bytes > shi->max_packet_bytes)
+                         bhi->max_packet_bytes = shi->max_packet_bytes;
                      }
                  }
              }
@@ -1741,7 +1804,7 @@ dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f)
          dpdk_update_link_state (xd, now);
 
 #if DPDK_VHOST_USER
-       if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
+       if (xd->flags & DPDK_DEVICE_FLAG_VHOST_USER)
          if (dpdk_vhost_user_process_if (vm, xd, vu_state) != 0)
            continue;
 #endif