dpdk: code preparation for bumping to DPDK 22.11
src/plugins/dpdk/device/common.c
index 2521abd..e4a79e8 100644
 #include <vnet/vnet.h>
 #include <vppinfra/vec.h>
 #include <vppinfra/format.h>
+#include <vppinfra/file.h>
+#include <vlib/unix/unix.h>
 #include <assert.h>
 
 #include <vnet/ip/ip.h>
 #include <vnet/ethernet/ethernet.h>
 #include <vnet/ethernet/arp_packet.h>
+#include <vnet/interface/rx_queue_funcs.h>
 #include <dpdk/buffer.h>
 #include <dpdk/device/dpdk.h>
 #include <dpdk/device/dpdk_priv.h>
 #include <vppinfra/error.h>
 
+/* DPDK TX offload to vnet hw interface caps mappings */
+static struct
+{
+  u64 offload;
+  vnet_hw_if_caps_t caps;
+} tx_off_caps_map[] = {
+  { RTE_ETH_TX_OFFLOAD_IPV4_CKSUM, VNET_HW_IF_CAP_TX_IP4_CKSUM },
+  { RTE_ETH_TX_OFFLOAD_TCP_CKSUM, VNET_HW_IF_CAP_TX_TCP_CKSUM },
+  { RTE_ETH_TX_OFFLOAD_UDP_CKSUM, VNET_HW_IF_CAP_TX_UDP_CKSUM },
+  { RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM, VNET_HW_IF_CAP_TX_IP4_OUTER_CKSUM },
+  { RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM, VNET_HW_IF_CAP_TX_UDP_OUTER_CKSUM },
+  { RTE_ETH_TX_OFFLOAD_TCP_TSO, VNET_HW_IF_CAP_TCP_GSO },
+  { RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO, VNET_HW_IF_CAP_VXLAN_TNL_GSO }
+};
+
 void
 dpdk_device_error (dpdk_device_t * xd, char *str, int rv)
 {
@@ -38,14 +56,16 @@ dpdk_device_error (dpdk_device_t * xd, char *str, int rv)
 void
 dpdk_device_setup (dpdk_device_t * xd)
 {
-  dpdk_main_t *dm = &dpdk_main;
   vlib_main_t *vm = vlib_get_main ();
   vnet_main_t *vnm = vnet_get_main ();
-  vlib_thread_main_t *tm = vlib_get_thread_main ();
   vnet_sw_interface_t *sw = vnet_get_sw_interface (vnm, xd->sw_if_index);
   vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, xd->hw_if_index);
+  u16 buf_sz = vlib_buffer_get_default_data_size (vm);
+  vnet_hw_if_caps_change_t caps = {};
   struct rte_eth_dev_info dev_info;
-  u64 bitmap;
+  struct rte_eth_conf conf = {};
+  u64 rxo, txo;
+  u32 max_frame_size;
   int rv;
   int j;
 
@@ -56,86 +76,166 @@ dpdk_device_setup (dpdk_device_t * xd)
 
   if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
     {
-      vnet_hw_interface_set_flags (dm->vnet_main, xd->hw_if_index, 0);
+      vnet_hw_interface_set_flags (vnm, xd->hw_if_index, 0);
       dpdk_device_stop (xd);
     }
 
-  /* Enable flow director when flows exist */
-  if (xd->pmd == VNET_DPDK_PMD_I40E)
+  rte_eth_dev_info_get (xd->port_id, &dev_info);
+
+  dpdk_log_debug ("[%u] configuring device %U", xd->port_id,
+                 format_dpdk_rte_device, dev_info.device);
+
+  /* create rx and tx offload wishlist */
+  rxo = RTE_ETH_RX_OFFLOAD_IPV4_CKSUM;
+  txo = 0;
+
+  if (xd->conf.enable_tcp_udp_checksum)
+    rxo |= RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_TCP_CKSUM;
+
+  if (xd->conf.disable_tx_checksum_offload == 0 &&
+      xd->conf.enable_outer_checksum_offload)
+    txo |=
+      RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM | RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM;
+
+  if (xd->conf.disable_tx_checksum_offload == 0)
+    txo |= RTE_ETH_TX_OFFLOAD_IPV4_CKSUM | RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
+          RTE_ETH_TX_OFFLOAD_UDP_CKSUM;
+
+  if (xd->conf.disable_multi_seg == 0)
     {
-      if ((xd->flags & DPDK_DEVICE_FLAG_RX_FLOW_OFFLOAD) != 0)
-       xd->port_conf.fdir_conf.mode = RTE_FDIR_MODE_PERFECT;
-      else
-       xd->port_conf.fdir_conf.mode = RTE_FDIR_MODE_NONE;
+      txo |= RTE_ETH_TX_OFFLOAD_MULTI_SEGS;
+      rxo |= RTE_ETH_RX_OFFLOAD_SCATTER;
+#if RTE_VERSION < RTE_VERSION_NUM(21, 11, 0, 0)
+      rxo |= DEV_RX_OFFLOAD_JUMBO_FRAME;
+#endif
     }
 
-  rte_eth_dev_info_get (xd->port_id, &dev_info);
-
-  bitmap = xd->port_conf.txmode.offloads & ~dev_info.tx_offload_capa;
-  if (bitmap)
+  if (xd->conf.enable_lro)
+    rxo |= RTE_ETH_RX_OFFLOAD_TCP_LRO;
+
+  /* per-device offload config */
+  if (xd->conf.enable_tso)
+    txo |= RTE_ETH_TX_OFFLOAD_TCP_CKSUM | RTE_ETH_TX_OFFLOAD_TCP_TSO |
+          RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO;
+
+  if (xd->conf.disable_rx_scatter)
+    rxo &= ~RTE_ETH_RX_OFFLOAD_SCATTER;
+
+  /* mask unsupported offloads */
+  rxo &= dev_info.rx_offload_capa;
+  txo &= dev_info.tx_offload_capa;
+
+  dpdk_log_debug ("[%u] Supported RX offloads: %U", xd->port_id,
+                 format_dpdk_rx_offload_caps, dev_info.rx_offload_capa);
+  dpdk_log_debug ("[%u] Configured RX offloads: %U", xd->port_id,
+                 format_dpdk_rx_offload_caps, rxo);
+  dpdk_log_debug ("[%u] Supported TX offloads: %U", xd->port_id,
+                 format_dpdk_tx_offload_caps, dev_info.tx_offload_capa);
+  dpdk_log_debug ("[%u] Configured TX offloads: %U", xd->port_id,
+                 format_dpdk_tx_offload_caps, txo);
+
+  /* finalize configuration */
+  conf.rxmode.offloads = rxo;
+  conf.txmode.offloads = txo;
+  if (rxo & RTE_ETH_RX_OFFLOAD_TCP_LRO)
+    conf.rxmode.max_lro_pkt_size = xd->conf.max_lro_pkt_size;
+
+  if (xd->conf.enable_lsc_int)
+    conf.intr_conf.lsc = 1;
+  if (xd->conf.enable_rxq_int)
+    conf.intr_conf.rxq = 1;
+
+  conf.rxmode.mq_mode = RTE_ETH_MQ_RX_NONE;
+  if (xd->conf.n_rx_queues > 1)
     {
-      dpdk_log_warn ("unsupported tx offloads requested on port %u: %U",
-                    xd->port_id, format_dpdk_tx_offload_caps, bitmap);
-      xd->port_conf.txmode.offloads ^= bitmap;
+      if (xd->conf.disable_rss == 0)
+       {
+         conf.rxmode.mq_mode = RTE_ETH_MQ_RX_RSS;
+         conf.rx_adv_conf.rss_conf.rss_hf = xd->conf.rss_hf;
+       }
     }
 
-  bitmap = xd->port_conf.rxmode.offloads & ~dev_info.rx_offload_capa;
-  if (bitmap)
+#if RTE_VERSION < RTE_VERSION_NUM(21, 11, 0, 0)
+  if (rxo & DEV_RX_OFFLOAD_JUMBO_FRAME)
+    {
+      conf.rxmode.max_rx_pkt_len = dev_info.max_rx_pktlen;
+      xd->max_supported_frame_size = dev_info.max_rx_pktlen;
+    }
+  else
     {
-      dpdk_log_warn ("unsupported rx offloads requested on port %u: %U",
-                    xd->port_id, format_dpdk_rx_offload_caps, bitmap);
-      xd->port_conf.rxmode.offloads ^= bitmap;
+      xd->max_supported_frame_size =
+       clib_min (1500 + xd->driver_frame_overhead, buf_sz);
     }
+#else
+  if (xd->conf.disable_multi_seg)
+    xd->max_supported_frame_size = clib_min (dev_info.max_rx_pktlen, buf_sz);
+  else
+    xd->max_supported_frame_size = dev_info.max_rx_pktlen;
+#endif
+
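+  /* initial frame size: default ethernet MTU plus L2 overhead, capped at
+   * what the device can support */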
+  max_frame_size = clib_min (xd->max_supported_frame_size,
+                            ethernet_main.default_mtu + hi->frame_overhead);
 
-  rv = rte_eth_dev_configure (xd->port_id, xd->rx_q_used,
-                             xd->tx_q_used, &xd->port_conf);
+#if RTE_VERSION >= RTE_VERSION_NUM(21, 11, 0, 0)
+  conf.rxmode.mtu = max_frame_size - xd->driver_frame_overhead;
+#endif
 
-  if (rv < 0)
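+  /* if the device rejects the configuration with rx queue interrupts
+   * enabled, retry once without them */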
+retry:
+  rv = rte_eth_dev_configure (xd->port_id, xd->conf.n_rx_queues,
+                             xd->conf.n_tx_queues, &conf);
+  if (rv < 0 && conf.intr_conf.rxq)
     {
-      dpdk_device_error (xd, "rte_eth_dev_configure", rv);
-      goto error;
+      conf.intr_conf.rxq = 0;
+      goto retry;
     }
 
-  vec_validate_aligned (xd->tx_queues, xd->tx_q_used - 1,
+#if RTE_VERSION < RTE_VERSION_NUM(21, 11, 0, 0)
+  rte_eth_dev_set_mtu (xd->port_id,
+                      max_frame_size - xd->driver_frame_overhead);
+#endif
+
+  hi->max_frame_size = 0;
+  vnet_hw_interface_set_max_frame_size (vnm, xd->hw_if_index, max_frame_size);
+  dpdk_log_debug ("[%u] max_frame_size %u max_supported_frame_size %u "
+                 "driver_frame_overhead %u",
+                 xd->port_id, hi->max_frame_size,
+                 xd->max_supported_frame_size, xd->driver_frame_overhead);
+
+  vec_validate_aligned (xd->tx_queues, xd->conf.n_tx_queues - 1,
                        CLIB_CACHE_LINE_BYTES);
-  for (j = 0; j < xd->tx_q_used; j++)
+  for (j = 0; j < xd->conf.n_tx_queues; j++)
     {
-      rv =
-       rte_eth_tx_queue_setup (xd->port_id, j, xd->nb_tx_desc,
-                               xd->cpu_socket, &xd->tx_conf);
+      rv = rte_eth_tx_queue_setup (xd->port_id, j, xd->conf.n_tx_desc,
+                                  xd->cpu_socket, 0);
 
       /* retry with any other CPU socket */
       if (rv < 0)
-       rv =
-         rte_eth_tx_queue_setup (xd->port_id, j,
-                                 xd->nb_tx_desc, SOCKET_ID_ANY,
-                                 &xd->tx_conf);
+       rv = rte_eth_tx_queue_setup (xd->port_id, j, xd->conf.n_tx_desc,
+                                    SOCKET_ID_ANY, 0);
       if (rv < 0)
        dpdk_device_error (xd, "rte_eth_tx_queue_setup", rv);
 
-      if (xd->tx_q_used < tm->n_vlib_mains)
-       clib_spinlock_init (&vec_elt (xd->tx_queues, j).lock);
+      clib_spinlock_init (&vec_elt (xd->tx_queues, j).lock);
     }
 
-  vec_validate_aligned (xd->rx_queues, xd->rx_q_used - 1,
+  vec_validate_aligned (xd->rx_queues, xd->conf.n_rx_queues - 1,
                        CLIB_CACHE_LINE_BYTES);
-  for (j = 0; j < xd->rx_q_used; j++)
+
+  for (j = 0; j < xd->conf.n_rx_queues; j++)
     {
       dpdk_rx_queue_t *rxq = vec_elt_at_index (xd->rx_queues, j);
-      uword tidx = vnet_get_device_input_thread_index (dm->vnet_main,
-                                                      xd->hw_if_index, j);
-      unsigned lcore = vlib_worker_threads[tidx].cpu_id;
-      u16 socket_id = rte_lcore_to_socket_id (lcore);
-      u8 bpidx = vlib_buffer_pool_get_default_for_numa (vm, socket_id);
+      u8 bpidx = vlib_buffer_pool_get_default_for_numa (
+       vm, vnet_hw_if_get_rx_queue_numa_node (vnm, rxq->queue_index));
       vlib_buffer_pool_t *bp = vlib_get_buffer_pool (vm, bpidx);
       struct rte_mempool *mp = dpdk_mempool_by_buffer_pool_index[bpidx];
 
-      rv = rte_eth_rx_queue_setup (xd->port_id, j, xd->nb_rx_desc,
+      rv = rte_eth_rx_queue_setup (xd->port_id, j, xd->conf.n_rx_desc,
                                   xd->cpu_socket, 0, mp);
 
       /* retry with any other CPU socket */
       if (rv < 0)
-       rv = rte_eth_rx_queue_setup (xd->port_id, j, xd->nb_rx_desc,
+       rv = rte_eth_rx_queue_setup (xd->port_id, j, xd->conf.n_rx_desc,
                                     SOCKET_ID_ANY, 0, mp);
 
       rxq->buffer_pool_index = bp->index;
@@ -147,7 +247,40 @@ dpdk_device_setup (dpdk_device_t * xd)
   if (vec_len (xd->errors))
     goto error;
 
-  rte_eth_dev_set_mtu (xd->port_id, hi->max_packet_bytes);
+  xd->buffer_flags =
+    (VLIB_BUFFER_TOTAL_LENGTH_VALID | VLIB_BUFFER_EXT_HDR_VALID);
+
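+  /* if the device verifies both TCP and UDP checksums, pre-mark rx buffers
+   * as L4-checksum checked */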
+  if ((rxo & (RTE_ETH_RX_OFFLOAD_TCP_CKSUM | RTE_ETH_RX_OFFLOAD_UDP_CKSUM)) ==
+      (RTE_ETH_RX_OFFLOAD_TCP_CKSUM | RTE_ETH_RX_OFFLOAD_UDP_CKSUM))
+    xd->buffer_flags |=
+      (VNET_BUFFER_F_L4_CHECKSUM_COMPUTED | VNET_BUFFER_F_L4_CHECKSUM_CORRECT);
+
+  dpdk_device_flag_set (xd, DPDK_DEVICE_FLAG_RX_IP4_CKSUM,
+                       rxo & RTE_ETH_RX_OFFLOAD_IPV4_CKSUM);
+  dpdk_device_flag_set (xd, DPDK_DEVICE_FLAG_MAYBE_MULTISEG,
+                       rxo & RTE_ETH_RX_OFFLOAD_SCATTER);
+  dpdk_device_flag_set (
+    xd, DPDK_DEVICE_FLAG_TX_OFFLOAD,
+    (txo & (RTE_ETH_TX_OFFLOAD_TCP_CKSUM | RTE_ETH_TX_OFFLOAD_UDP_CKSUM)) ==
+      (RTE_ETH_TX_OFFLOAD_TCP_CKSUM | RTE_ETH_TX_OFFLOAD_UDP_CKSUM));
+
+  /* unconditionally set mac filtering cap */
+  caps.val = caps.mask = VNET_HW_IF_CAP_MAC_FILTER;
+
+  ethernet_set_flags (vnm, xd->hw_if_index,
+                     ETHERNET_INTERFACE_FLAG_DEFAULT_L3);
+
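+  /* translate enabled DPDK TX offloads into vnet interface capabilities */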
+  for (int i = 0; i < ARRAY_LEN (tx_off_caps_map); i++)
+    {
+      __typeof__ (tx_off_caps_map[0]) *v = tx_off_caps_map + i;
+      caps.mask |= v->caps;
+      if ((v->offload & txo) == v->offload)
+       caps.val |= v->caps;
+    }
+
+  vnet_hw_if_change_caps (vnm, xd->hw_if_index, &caps);
+  xd->enabled_rx_off = rxo;
+  xd->enabled_tx_off = txo;
 
   if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
     dpdk_device_start (xd);
@@ -162,6 +295,93 @@ error:
   sw->flags |= VNET_SW_INTERFACE_FLAG_ERROR;
 }
 
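+/* rx queue interrupt handler: drain the eventfd and, unless the queue is in
+ * polling mode, mark it pending and re-arm the device interrupt */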
+static clib_error_t *
+dpdk_rx_read_ready (clib_file_t *uf)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  dpdk_main_t *dm = &dpdk_main;
+  u32 qidx = uf->private_data;
+  vnet_hw_if_rx_queue_t *rxq = vnet_hw_if_get_rx_queue (vnm, qidx);
+  dpdk_device_t *xd = vec_elt_at_index (dm->devices, rxq->dev_instance);
+
+  u64 b;
+  CLIB_UNUSED (ssize_t size) = read (uf->file_descriptor, &b, sizeof (b));
+  if (rxq->mode != VNET_HW_IF_RX_MODE_POLLING)
+    {
+      vnet_hw_if_rx_queue_set_int_pending (vnm, uf->private_data);
+      rte_eth_dev_rx_intr_enable (xd->port_id, rxq->queue_id);
+    }
+
+  return 0;
+}
+
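+/* probe rx interrupt support on queue 0; when available, register each rx
+ * queue's interrupt eventfd with the clib file poller and advertise the
+ * interrupt mode capability on the interface */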
+static void
+dpdk_setup_interrupts (dpdk_device_t *xd)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, xd->hw_if_index);
+  int int_mode = 0;
+  if (!hi)
+    return;
+
+  if (!xd->conf.enable_rxq_int)
+    return;
+
+  /* Probe for interrupt support */
+  if (rte_eth_dev_rx_intr_enable (xd->port_id, 0))
+    {
+      dpdk_log_info ("Probe for interrupt mode for device %U failed.\n",
+                    format_dpdk_device_name, xd->port_id);
+    }
+  else
+    {
+      xd->flags |= DPDK_DEVICE_FLAG_INT_SUPPORTED;
+      if (!(xd->flags & DPDK_DEVICE_FLAG_INT_UNMASKABLE))
+       rte_eth_dev_rx_intr_disable (xd->port_id, 0);
+      dpdk_log_info ("Probe for interrupt mode for device %U succeeded.\n",
+                    format_dpdk_device_name, xd->port_id);
+    }
+
+  if (xd->flags & DPDK_DEVICE_FLAG_INT_SUPPORTED)
+    {
+      int_mode = 1;
+      for (int q = 0; q < xd->conf.n_rx_queues; q++)
+       {
+         dpdk_rx_queue_t *rxq = vec_elt_at_index (xd->rx_queues, q);
+         clib_file_t f = { 0 };
+         rxq->efd = rte_eth_dev_rx_intr_ctl_q_get_fd (xd->port_id, q);
+         if (rxq->efd < 0)
+           {
+             xd->flags &= ~DPDK_DEVICE_FLAG_INT_SUPPORTED;
+             int_mode = 0;
+             break;
+           }
+         f.read_function = dpdk_rx_read_ready;
+         f.flags = UNIX_FILE_EVENT_EDGE_TRIGGERED;
+         f.file_descriptor = rxq->efd;
+         f.private_data = rxq->queue_index;
+         f.description =
+           format (0, "%U queue %u", format_dpdk_device_name, xd->port_id, q);
+         rxq->clib_file_index = clib_file_add (&file_main, &f);
+         vnet_hw_if_set_rx_queue_file_index (vnm, rxq->queue_index,
+                                             rxq->clib_file_index);
+         if (xd->flags & DPDK_DEVICE_FLAG_INT_UNMASKABLE)
+           {
+             clib_file_main_t *fm = &file_main;
+             clib_file_t *f =
+               pool_elt_at_index (fm->file_pool, rxq->clib_file_index);
+             fm->file_update (f, UNIX_FILE_UPDATE_DELETE);
+           }
+       }
+    }
+
+  if (int_mode)
+    vnet_hw_if_set_caps (vnm, hi->hw_if_index, VNET_HW_IF_CAP_INT_MODE);
+  else
+    vnet_hw_if_unset_caps (vnm, hi->hw_if_index, VNET_HW_IF_CAP_INT_MODE);
+  vnet_hw_if_update_runtime_data (vnm, xd->hw_if_index);
+}
+
 void
 dpdk_device_start (dpdk_device_t * xd)
 {
@@ -178,6 +398,13 @@ dpdk_device_start (dpdk_device_t * xd)
       return;
     }
 
+  dpdk_log_debug ("[%u] RX burst function: %U", xd->port_id,
+                 format_dpdk_burst_fn, xd, VLIB_RX);
+  dpdk_log_debug ("[%u] TX burst function: %U", xd->port_id,
+                 format_dpdk_burst_fn, xd, VLIB_TX);
+
+  dpdk_setup_interrupts (xd);
+
   if (xd->default_mac_address)
     rv = rte_eth_dev_default_mac_addr_set (xd->port_id,
                                           (void *) xd->default_mac_address);
@@ -228,10 +455,11 @@ dpdk_port_state_callback_inline (dpdk_portid_t port_id,
   rte_eth_link_get_nowait (port_id, &link);
   u8 link_up = link.link_status;
   if (link_up)
-    dpdk_log_info ("Port %d Link Up - speed %u Mbps - %s",
-                  port_id, (unsigned) link.link_speed,
-                  (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
-                  "full-duplex" : "half-duplex");
+    dpdk_log_info ("Port %d Link Up - speed %u Mbps - %s", port_id,
+                  (unsigned) link.link_speed,
+                  (link.link_duplex == RTE_ETH_LINK_FULL_DUPLEX) ?
+                          "full-duplex" :
+                          "half-duplex");
   else
     dpdk_log_info ("Port %d Link Down\n\n", port_id);
 
@@ -254,7 +482,11 @@ dpdk_get_pci_device (const struct rte_eth_dev_info *info)
   const struct rte_bus *bus;
 
   bus = rte_bus_find_by_device (info->device);
+#if RTE_VERSION >= RTE_VERSION_NUM(22, 11, 0, 0)
+  if (bus && !strcmp (rte_bus_name (bus), "pci"))
+#else
   if (bus && !strcmp (bus->name, "pci"))
+#endif
     return RTE_DEV_TO_PCI (info->device);
   else
     return NULL;
@@ -267,7 +499,11 @@ dpdk_get_vmbus_device (const struct rte_eth_dev_info *info)
   const struct rte_bus *bus;
 
   bus = rte_bus_find_by_device (info->device);
+#if RTE_VERSION >= RTE_VERSION_NUM(22, 11, 0, 0)
+  if (bus && !strcmp (rte_bus_name (bus), "vmbus"))
+#else
   if (bus && !strcmp (bus->name, "vmbus"))
+#endif
     return container_of (info->device, struct rte_vmbus_device, device);
   else
     return NULL;