2 * Copyright (c) 2015 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
15 #include <vnet/vnet.h>
16 #include <vppinfra/vec.h>
17 #include <vppinfra/error.h>
18 #include <vppinfra/format.h>
19 #include <vppinfra/bitmap.h>
20 #include <vppinfra/linux/sysfs.h>
21 #include <vlib/unix/unix.h>
24 #include <vnet/ethernet/ethernet.h>
25 #include <vnet/interface/rx_queue_funcs.h>
26 #include <dpdk/buffer.h>
27 #include <dpdk/device/dpdk.h>
28 #include <dpdk/cryptodev/cryptodev.h>
29 #include <vlib/pci/pci.h>
30 #include <vlib/vmbus/vmbus.h>
39 #include <sys/mount.h>
44 #include <dpdk/device/dpdk_priv.h>
/* File-scope definitions: the maximum Ethernet frame length constant, the
 * dpdk_main and dpdk_config_main global instances, and the LINK_STATE_ELOGS
 * macro (defined as 0 here). */
46 #define ETHER_MAX_LEN 1518 /**< Maximum frame len, including CRC. */
48 dpdk_main_t dpdk_main;
49 dpdk_config_main_t dpdk_config_main;
51 #define LINK_STATE_ELOGS 0
53 /* Port configuration, mildly modified Intel app values */
/* Pick a port type from the speed capability bits in dev_info->speed_capa.
 * The checks run from 100G down to 1G, so when several bits are set the
 * highest speed wins; VNET_DPDK_PORT_TYPE_UNKNOWN is returned when none of
 * the tested bits is set. */
55 static dpdk_port_type_t
56 port_type_from_speed_capa (struct rte_eth_dev_info *dev_info)
59 if (dev_info->speed_capa & ETH_LINK_SPEED_100G)
60 return VNET_DPDK_PORT_TYPE_ETH_100G;
61 else if (dev_info->speed_capa & ETH_LINK_SPEED_56G)
62 return VNET_DPDK_PORT_TYPE_ETH_56G;
63 else if (dev_info->speed_capa & ETH_LINK_SPEED_50G)
64 return VNET_DPDK_PORT_TYPE_ETH_50G;
65 else if (dev_info->speed_capa & ETH_LINK_SPEED_40G)
66 return VNET_DPDK_PORT_TYPE_ETH_40G;
67 else if (dev_info->speed_capa & ETH_LINK_SPEED_25G)
68 return VNET_DPDK_PORT_TYPE_ETH_25G;
69 else if (dev_info->speed_capa & ETH_LINK_SPEED_20G)
70 return VNET_DPDK_PORT_TYPE_ETH_20G;
71 else if (dev_info->speed_capa & ETH_LINK_SPEED_10G)
72 return VNET_DPDK_PORT_TYPE_ETH_10G;
73 else if (dev_info->speed_capa & ETH_LINK_SPEED_5G)
74 return VNET_DPDK_PORT_TYPE_ETH_5G;
75 else if (dev_info->speed_capa & ETH_LINK_SPEED_2_5G)
76 return VNET_DPDK_PORT_TYPE_ETH_2_5G;
77 else if (dev_info->speed_capa & ETH_LINK_SPEED_1G)
78 return VNET_DPDK_PORT_TYPE_ETH_1G;
80 return VNET_DPDK_PORT_TYPE_UNKNOWN;
/* Translate a numeric link speed (one of the ETH_SPEED_NUM_ constants) into
 * the matching VNET_DPDK_PORT_TYPE_ETH_ constant. Speeds without a case label
 * presumably end at the VNET_DPDK_PORT_TYPE_UNKNOWN return -- the switch and
 * default lines are not visible in this listing, TODO confirm. */
83 static dpdk_port_type_t
84 port_type_from_link_speed (u32 link_speed)
88 case ETH_SPEED_NUM_1G:
89 return VNET_DPDK_PORT_TYPE_ETH_1G;
90 case ETH_SPEED_NUM_2_5G:
91 return VNET_DPDK_PORT_TYPE_ETH_2_5G;
92 case ETH_SPEED_NUM_5G:
93 return VNET_DPDK_PORT_TYPE_ETH_5G;
94 case ETH_SPEED_NUM_10G:
95 return VNET_DPDK_PORT_TYPE_ETH_10G;
96 case ETH_SPEED_NUM_20G:
97 return VNET_DPDK_PORT_TYPE_ETH_20G;
98 case ETH_SPEED_NUM_25G:
99 return VNET_DPDK_PORT_TYPE_ETH_25G;
100 case ETH_SPEED_NUM_40G:
101 return VNET_DPDK_PORT_TYPE_ETH_40G;
102 case ETH_SPEED_NUM_50G:
103 return VNET_DPDK_PORT_TYPE_ETH_50G;
104 case ETH_SPEED_NUM_56G:
105 return VNET_DPDK_PORT_TYPE_ETH_56G;
106 case ETH_SPEED_NUM_100G:
107 return VNET_DPDK_PORT_TYPE_ETH_100G;
109 return VNET_DPDK_PORT_TYPE_UNKNOWN;
/* Handler for Ethernet interface flag changes on a DPDK device; it is handed
 * to ethernet_register_interface() by dpdk_lib_init.
 *   vnm   - vnet main (not referenced in the visible lines)
 *   hi    - hardware interface; hi->dev_instance indexes dm->devices
 *   flags - one of the ETHERNET_INTERFACE_FLAG_ values handled below
 * DEFAULT_L3 clears and ACCEPT_ALL sets DPDK_DEVICE_FLAG_PROMISC; the MTU
 * case copies hi->max_packet_bytes into the port's max_rx_pkt_len and re-runs
 * dpdk_device_setup(). When the device is admin-up the promiscuous setting is
 * pushed to the port. `old` captures the previous promiscuous state,
 * presumably for the return value -- the return statements are not visible in
 * this listing, TODO confirm. */
114 dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags)
116 dpdk_main_t *dm = &dpdk_main;
117 dpdk_device_t *xd = vec_elt_at_index (dm->devices, hi->dev_instance);
118 u32 old = (xd->flags & DPDK_DEVICE_FLAG_PROMISC) != 0;
122 case ETHERNET_INTERFACE_FLAG_DEFAULT_L3:
123 /* set to L3/non-promisc mode */
124 xd->flags &= ~DPDK_DEVICE_FLAG_PROMISC;
126 case ETHERNET_INTERFACE_FLAG_ACCEPT_ALL:
127 xd->flags |= DPDK_DEVICE_FLAG_PROMISC;
129 case ETHERNET_INTERFACE_FLAG_MTU:
130 xd->port_conf.rxmode.max_rx_pkt_len = hi->max_packet_bytes;
131 dpdk_device_setup (xd);
/* Only touch the hardware promiscuous state while the device is admin-up. */
137 if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
139 if (xd->flags & DPDK_DEVICE_FLAG_PROMISC)
140 rte_eth_promiscuous_enable (xd->port_id);
142 rte_eth_promiscuous_disable (xd->port_id);
/* Report whether CRC stripping is in effect for the port: non-zero when the
 * DEV_RX_OFFLOAD_KEEP_CRC bit is not set in the configured RX offloads. */
149 dpdk_port_crc_strip_enabled (dpdk_device_t * xd)
151 return !(xd->port_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC);
/* Implementation notes for check_l3cache: every directory entry under the
 * cpu0 cache directory is probed by reading its "level" file with
 * clib_sysfs_read(); a read error is freed and otherwise ignored, which
 * leaves level_cache at -1. The directory handle is closed on the level-3
 * path and again at the end of the scan.
 * NOTE(review): the path vector built by format() is not visibly released in
 * this listing -- confirm it is freed.
 * NOTE(review): the result for a failed opendir() is not visible here. */
154 /* The function check_l3cache helps check if Level 3 cache exists or not on current CPUs
155 return value 1: exist.
156 return value 0: not exist.
164 const char *sys_cache_dir = "/sys/devices/system/cpu/cpu0/cache";
165 DIR *dir_cache = opendir (sys_cache_dir);
167 if (dir_cache == NULL)
170 while ((dp = readdir (dir_cache)) != NULL)
172 if (dp->d_type == DT_DIR)
175 int level_cache = -1;
177 p = format (p, "%s/%s/%s%c", sys_cache_dir, dp->d_name, "level", 0);
178 if ((err = clib_sysfs_read ((char *) p, "%d", &level_cache)))
179 clib_error_free (err);
181 if (level_cache == 3)
183 closedir (dir_cache);
189 if (dir_cache != NULL)
190 closedir (dir_cache);
/* Turn on TCP and UDP transmit checksum offload in the port configuration and
 * flag the device with DPDK_DEVICE_FLAG_TX_OFFLOAD and
 * DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM. */
196 dpdk_enable_l4_csum_offload (dpdk_device_t * xd)
198 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_CKSUM;
199 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
200 xd->flags |= DPDK_DEVICE_FLAG_TX_OFFLOAD |
201 DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM;
/* dpdk_lib_init: walk the Ethernet ports reported by DPDK and set each one up
 * as a vnet interface. Returns a clib_error_t pointer.
 * This opening section declares locals, logs the number of available ports,
 * clears the L4 checksum flags in the buffer flags template when
 * enable_tcp_udp_checksum is configured, and initialises one buffer template
 * per vlib main (zeroed, template flags applied, TX sw_if_index set to ~0). */
204 static clib_error_t *
205 dpdk_lib_init (dpdk_main_t * dm)
207 vnet_main_t *vnm = vnet_get_main ();
209 u32 mtu, max_rx_frame;
212 vlib_main_t *vm = vlib_get_main ();
213 vlib_thread_main_t *tm = vlib_get_thread_main ();
214 vnet_device_main_t *vdm = &vnet_device_main;
215 vnet_sw_interface_t *sw;
216 vnet_hw_interface_t *hi;
218 vlib_pci_addr_t last_pci_addr;
219 u32 last_pci_addr_port = 0;
220 u8 af_packet_instance_num = 0;
221 last_pci_addr.as_u32 = ~0;
223 nports = rte_eth_dev_count_avail ();
227 dpdk_log_notice ("DPDK drivers found no Ethernet devices...");
231 dpdk_log_notice ("DPDK drivers found %d ports...", nports);
233 if (dm->conf->enable_tcp_udp_checksum)
234 dm->buffer_flags_template &= ~(VNET_BUFFER_F_L4_CHECKSUM_CORRECT
235 | VNET_BUFFER_F_L4_CHECKSUM_COMPUTED);
237 /* vlib_buffer_t template */
238 vec_validate_aligned (dm->per_thread_data, tm->n_vlib_mains - 1,
239 CLIB_CACHE_LINE_BYTES);
240 for (i = 0; i < tm->n_vlib_mains; i++)
242 dpdk_per_thread_data_t *ptd = vec_elt_at_index (dm->per_thread_data, i);
243 clib_memset (&ptd->buffer_template, 0, sizeof (vlib_buffer_t));
244 ptd->buffer_template.flags = dm->buffer_flags_template;
245 vnet_buffer (&ptd->buffer_template)->sw_if_index[VLIB_TX] = (u32) ~ 0;
/* Per-port loop head: validate the port, fetch its device info (ports whose
 * dev_info.device is null are logged and skipped), then look up a per-device
 * configuration -- by PCI address for PCI devices, or by the VMBUS address
 * parsed from the device name. A blacklisted entry means the device is
 * skipped; the default device configuration is assigned otherwise (the
 * branch structure is partly elided in this listing). */
249 RTE_ETH_FOREACH_DEV(i)
253 struct rte_eth_dev_info dev_info;
254 struct rte_pci_device *pci_dev;
255 struct rte_vmbus_device *vmbus_dev;
256 dpdk_portid_t next_port_id;
257 dpdk_device_config_t *devconf = 0;
258 vlib_pci_addr_t pci_addr;
259 vlib_vmbus_addr_t vmbus_addr;
262 if (!rte_eth_dev_is_valid_port(i))
265 rte_eth_dev_info_get (i, &dev_info);
267 if (dev_info.device == 0)
269 dpdk_log_notice ("DPDK bug: missing device info. Skipping %s device",
270 dev_info.driver_name);
274 pci_dev = dpdk_get_pci_device (&dev_info);
278 pci_addr.domain = pci_dev->addr.domain;
279 pci_addr.bus = pci_dev->addr.bus;
280 pci_addr.slot = pci_dev->addr.devid;
281 pci_addr.function = pci_dev->addr.function;
282 p = hash_get (dm->conf->device_config_index_by_pci_addr,
286 vmbus_dev = dpdk_get_vmbus_device (&dev_info);
290 unformat_input_t input_vmbus;
291 unformat_init_string (&input_vmbus, dev_info.device->name,
292 strlen (dev_info.device->name));
293 if (unformat (&input_vmbus, "%U", unformat_vlib_vmbus_addr,
296 p = mhash_get (&dm->conf->device_config_index_by_vmbus_addr,
299 unformat_free (&input_vmbus);
304 devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]);
305 /* If device is blacklisted, we should skip it */
306 if (devconf->is_blacklisted)
312 devconf = &dm->conf->default_devconf;
/* Allocate the dpdk_device_t for this port, seed the default descriptor
 * counts and the CPU socket, take the configured name, and derive
 * interface_name_suffix: representor ports use their switch port id, and
 * ports that share one PCI address get "0" for the first port and the offset
 * from that first port for the following ones. last_pci_addr is reset to ~0
 * when the shared-address run ends. */
314 /* Create vnet interface */
315 vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES);
316 xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT;
317 xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT;
318 xd->cpu_socket = (i8) rte_eth_dev_socket_id (i);
321 xd->name = devconf->name;
324 /* Handle representor devices that share the same PCI ID */
325 if (dev_info.switch_info.domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
327 if (dev_info.switch_info.port_id != (uint16_t)-1)
328 xd->interface_name_suffix = format (0, "%d", dev_info.switch_info.port_id);
330 /* Handle interface naming for devices with multiple ports sharing same PCI ID */
332 ((next_port_id = rte_eth_find_next (i + 1)) != RTE_MAX_ETHPORTS))
334 struct rte_eth_dev_info di = { 0 };
335 struct rte_pci_device *next_pci_dev;
336 rte_eth_dev_info_get (next_port_id, &di);
337 next_pci_dev = di.device ? RTE_DEV_TO_PCI (di.device) : 0;
339 pci_addr.as_u32 != last_pci_addr.as_u32 &&
340 memcmp (&pci_dev->addr, &next_pci_dev->addr,
341 sizeof (struct rte_pci_addr)) == 0)
343 xd->interface_name_suffix = format (0, "0");
344 last_pci_addr.as_u32 = pci_addr.as_u32;
345 last_pci_addr_port = i;
347 else if (pci_addr.as_u32 == last_pci_addr.as_u32)
349 xd->interface_name_suffix =
350 format (0, "%u", i - last_pci_addr_port);
354 last_pci_addr.as_u32 = ~0;
358 last_pci_addr.as_u32 = ~0;
/* Copy the driver's default TX queue configuration and build the RX/TX
 * offload masks from the device capabilities and the startup configuration:
 * IPv4 RX checksum when the device supports it, TCP/UDP and IPv4 TX checksum
 * when enable_tcp_udp_checksum is set, outer checksums when
 * enable_outer_checksum_offload is set, LRO (with a configured or default
 * maximum LRO packet size) when enable_lro is set, and the multi-segment,
 * jumbo-frame and scatter bits cleared or set depending on no_multi_seg. */
360 clib_memcpy (&xd->tx_conf, &dev_info.default_txconf,
361 sizeof (struct rte_eth_txconf));
363 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM)
365 xd->port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM;
366 xd->flags |= DPDK_DEVICE_FLAG_RX_IP4_CKSUM;
369 if (dm->conf->enable_tcp_udp_checksum)
371 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM)
372 xd->port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM;
373 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)
374 xd->port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_CKSUM;
375 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)
376 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
378 if (dm->conf->enable_outer_checksum_offload)
380 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)
381 xd->port_conf.txmode.offloads |=
382 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
383 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_OUTER_UDP_CKSUM)
384 xd->port_conf.txmode.offloads |=
385 DEV_TX_OFFLOAD_OUTER_UDP_CKSUM;
389 if (dm->conf->enable_lro)
391 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO)
393 xd->port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
394 if (devconf->max_lro_pkt_size)
395 xd->port_conf.rxmode.max_lro_pkt_size =
396 devconf->max_lro_pkt_size;
398 xd->port_conf.rxmode.max_lro_pkt_size =
399 DPDK_MAX_LRO_SIZE_DEFAULT;
402 if (dm->conf->no_multi_seg)
404 xd->port_conf.txmode.offloads &= ~DEV_TX_OFFLOAD_MULTI_SEGS;
405 xd->port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_JUMBO_FRAME;
406 xd->port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_SCATTER;
410 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MULTI_SEGS;
411 xd->port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME;
412 xd->port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_SCATTER;
413 xd->flags |= DPDK_DEVICE_FLAG_MAYBE_MULTISEG;
/* Choose the TX and RX queue counts. TX queues are capped by the device
 * maximum, the number of vlib mains and a configured num_tx_queues. When more
 * than one RX queue is configured and the device supports that many, RSS is
 * enabled: a zero rss_fn selects IP, UDP and TCP hashing, otherwise the
 * configured functions are used after logging and masking out the bits the
 * device does not support. The rx_queues vector is then sized to rx_q_used. */
416 xd->tx_q_used = clib_min (dev_info.max_tx_queues, tm->n_vlib_mains);
418 if (devconf->num_tx_queues > 0
419 && devconf->num_tx_queues < xd->tx_q_used)
420 xd->tx_q_used = clib_min (xd->tx_q_used, devconf->num_tx_queues);
422 if (devconf->num_rx_queues > 1
423 && dev_info.max_rx_queues >= devconf->num_rx_queues)
425 xd->rx_q_used = devconf->num_rx_queues;
426 xd->port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
427 if (devconf->rss_fn == 0)
428 xd->port_conf.rx_adv_conf.rss_conf.rss_hf =
429 ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP;
432 u64 unsupported_bits;
433 xd->port_conf.rx_adv_conf.rss_conf.rss_hf = devconf->rss_fn;
434 unsupported_bits = xd->port_conf.rx_adv_conf.rss_conf.rss_hf;
435 unsupported_bits &= ~dev_info.flow_type_rss_offloads;
436 if (unsupported_bits)
437 dpdk_log_warn ("Unsupported RSS hash functions: %U",
438 format_dpdk_rss_hf_name, unsupported_bits);
440 xd->port_conf.rx_adv_conf.rss_conf.rss_hf &=
441 dev_info.flow_type_rss_offloads;
446 vec_validate_aligned (xd->rx_queues, xd->rx_q_used - 1,
447 CLIB_CACHE_LINE_BYTES);
/* Mark the device with DPDK_DEVICE_FLAG_PMD, fall back to the PCI driver name
 * when dev_info.driver_name is unset, match the driver name against the
 * entries expanded through the _() macro to set xd->pmd (the macro's use site
 * is not visible in this listing), and reset the port type and descriptor
 * counts to their defaults. */
449 xd->flags |= DPDK_DEVICE_FLAG_PMD;
451 /* workaround for drivers not setting driver_name */
452 if ((!dev_info.driver_name) && (pci_dev))
453 dev_info.driver_name = pci_dev->driver->driver.name;
455 ASSERT (dev_info.driver_name);
461 #define _(s,f) else if (dev_info.driver_name && \
462 !strcmp(dev_info.driver_name, s)) \
463 xd->pmd = VNET_DPDK_PMD_##f;
469 xd->pmd = VNET_DPDK_PMD_UNKNOWN;
471 xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
472 xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT;
473 xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT;
/* Per-PMD adjustments, keyed on the VNET_DPDK_PMD_ case labels below: each
 * group sets the port type (from speed capabilities, from the current link
 * speed, or a fixed value), and where applicable the interrupt configuration,
 * the supported flow actions, descriptor counts, and the TX checksum offload
 * bits (skipped when no_tx_checksum_offload is set). The final assignment of
 * VNET_DPDK_PORT_TYPE_UNKNOWN is presumably the default case -- the switch
 * header, break and default lines are not visible in this listing. */
477 /* Drivers with valid speed_capa set */
478 case VNET_DPDK_PMD_I40E:
479 xd->flags |= DPDK_DEVICE_FLAG_INT_UNMASKABLE;
481 case VNET_DPDK_PMD_E1000EM:
482 case VNET_DPDK_PMD_IGB:
483 case VNET_DPDK_PMD_IGC:
484 case VNET_DPDK_PMD_IXGBE:
485 case VNET_DPDK_PMD_ICE:
486 xd->port_type = port_type_from_speed_capa (&dev_info);
487 xd->supported_flow_actions = VNET_FLOW_ACTION_MARK |
488 VNET_FLOW_ACTION_REDIRECT_TO_NODE |
489 VNET_FLOW_ACTION_REDIRECT_TO_QUEUE |
490 VNET_FLOW_ACTION_BUFFER_ADVANCE |
491 VNET_FLOW_ACTION_COUNT | VNET_FLOW_ACTION_DROP |
492 VNET_FLOW_ACTION_RSS;
494 if (dm->conf->no_tx_checksum_offload == 0)
496 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_CKSUM;
497 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
498 xd->flags |= DPDK_DEVICE_FLAG_TX_OFFLOAD |
499 DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM;
502 xd->port_conf.intr_conf.rxq = 1;
504 case VNET_DPDK_PMD_MLX5:
505 if (dm->conf->no_tx_checksum_offload == 0)
507 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_CKSUM;
508 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
509 xd->flags |= DPDK_DEVICE_FLAG_TX_OFFLOAD |
510 DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM;
512 xd->port_type = port_type_from_speed_capa (&dev_info);
514 case VNET_DPDK_PMD_CXGBE:
515 case VNET_DPDK_PMD_MLX4:
516 case VNET_DPDK_PMD_QEDE:
517 case VNET_DPDK_PMD_BNXT:
518 xd->port_type = port_type_from_speed_capa (&dev_info);
522 case VNET_DPDK_PMD_I40EVF:
523 xd->flags |= DPDK_DEVICE_FLAG_INT_UNMASKABLE;
525 case VNET_DPDK_PMD_IGBVF:
526 case VNET_DPDK_PMD_IXGBEVF:
527 xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF;
528 if (dm->conf->no_tx_checksum_offload == 0)
530 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_CKSUM;
531 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
533 DPDK_DEVICE_FLAG_TX_OFFLOAD |
534 DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM;
536 /* DPDK bug in multiqueue... */
537 /* xd->port_conf.intr_conf.rxq = 1; */
541 case VNET_DPDK_PMD_IAVF:
542 xd->flags |= DPDK_DEVICE_FLAG_INT_UNMASKABLE;
543 xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF;
544 xd->supported_flow_actions =
545 VNET_FLOW_ACTION_MARK | VNET_FLOW_ACTION_REDIRECT_TO_NODE |
546 VNET_FLOW_ACTION_REDIRECT_TO_QUEUE |
547 VNET_FLOW_ACTION_BUFFER_ADVANCE | VNET_FLOW_ACTION_COUNT |
548 VNET_FLOW_ACTION_DROP | VNET_FLOW_ACTION_RSS;
550 if (dm->conf->no_tx_checksum_offload == 0)
552 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_CKSUM;
553 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
555 DPDK_DEVICE_FLAG_TX_OFFLOAD |
556 DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM;
558 /* DPDK bug in multiqueue... */
559 /* xd->port_conf.intr_conf.rxq = 1; */
562 case VNET_DPDK_PMD_THUNDERX:
563 xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF;
565 if (dm->conf->no_tx_checksum_offload == 0)
567 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_CKSUM;
568 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
569 xd->flags |= DPDK_DEVICE_FLAG_TX_OFFLOAD;
573 case VNET_DPDK_PMD_ENA:
574 xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF;
575 xd->port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_SCATTER;
576 xd->port_conf.intr_conf.rxq = 1;
577 if (dm->conf->no_tx_checksum_offload == 0)
579 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
580 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_CKSUM;
581 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
582 xd->flags |= DPDK_DEVICE_FLAG_TX_OFFLOAD;
586 case VNET_DPDK_PMD_DPAA2:
587 xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
591 case VNET_DPDK_PMD_ENIC:
593 struct rte_eth_link l;
594 rte_eth_link_get_nowait (i, &l);
595 xd->port_type = port_type_from_link_speed (l.link_speed);
596 if (dm->conf->enable_tcp_udp_checksum)
597 dpdk_enable_l4_csum_offload (xd);
601 /* Intel Red Rock Canyon */
602 case VNET_DPDK_PMD_FM10K:
603 xd->port_type = VNET_DPDK_PORT_TYPE_ETH_SWITCH;
607 case VNET_DPDK_PMD_VIRTIO:
608 xd->port_conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
609 xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
610 xd->nb_rx_desc = DPDK_NB_RX_DESC_VIRTIO;
611 xd->nb_tx_desc = DPDK_NB_TX_DESC_VIRTIO;
613 * Enable use of RX interrupts if supported.
615 * There is no device flag or capability for this, so
616 * use the same check that the virtio driver does.
618 if (pci_dev && rte_intr_cap_multiple (&pci_dev->intr_handle))
619 xd->port_conf.intr_conf.rxq = 1;
623 case VNET_DPDK_PMD_VMXNET3:
624 xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
625 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MULTI_SEGS;
626 /* TCP csum offload not working although udp might work. Left
627 * disabled for now */
628 if (0 && (dm->conf->no_tx_checksum_offload == 0))
630 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
631 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_CKSUM;
632 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
633 xd->flags |= DPDK_DEVICE_FLAG_TX_OFFLOAD;
637 case VNET_DPDK_PMD_AF_PACKET:
638 xd->port_type = VNET_DPDK_PORT_TYPE_AF_PACKET;
639 xd->af_packet_instance_num = af_packet_instance_num++;
642 case VNET_DPDK_PMD_VIRTIO_USER:
643 xd->port_type = VNET_DPDK_PORT_TYPE_VIRTIO_USER;
646 case VNET_DPDK_PMD_VHOST_ETHER:
647 xd->port_type = VNET_DPDK_PORT_TYPE_VHOST_ETHER;
650 case VNET_DPDK_PMD_LIOVF_ETHER:
651 xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF;
654 case VNET_DPDK_PMD_FAILSAFE:
655 xd->port_type = VNET_DPDK_PORT_TYPE_FAILSAFE;
656 xd->port_conf.intr_conf.lsc = 1;
659 case VNET_DPDK_PMD_NETVSC:
661 struct rte_eth_link l;
662 rte_eth_link_get_nowait (i, &l);
663 xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF;
668 xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
/* Descriptor ring sizing for RX and TX: a configured num_rx_desc or
 * num_tx_desc is applied directly; the 2M-hugepage / no-L3-cache rule
 * described in the comments below sets the count to 512 instead. The
 * hugepage test compares against 2 << 20 bytes and the cache test uses
 * check_l3cache () == 0. */
671 if (devconf->num_rx_desc)
672 xd->nb_rx_desc = devconf->num_rx_desc;
675 /* If num_rx_desc is not specified by VPP user, the current CPU is working
676 with 2M page and has no L3 cache, default num_rx_desc is changed to 512
677 from original 1024 to help reduce TLB misses.
679 if ((clib_mem_get_default_hugepage_size () == 2 << 20)
680 && check_l3cache() == 0)
681 xd->nb_rx_desc = 512;
684 if (devconf->num_tx_desc)
685 xd->nb_tx_desc = devconf->num_tx_desc;
688 /* If num_tx_desc is not specified by VPP user, the current CPU is working
689 with 2M page and has no L3 cache, default num_tx_desc is changed to 512
690 from original 1024 to help reduce TLB misses.
692 if ((clib_mem_get_default_hugepage_size () == 2 << 20)
693 && check_l3cache() == 0)
694 xd->nb_tx_desc = 512;
/* Pick the MAC address and register the interface. For AF_PACKET devices a
 * pseudo-random 32-bit value seeded from the current time is copied into the
 * address starting at byte 2; rte_eth_macaddr_get() reads the address from
 * the port (presumably in the else branch, which is not visible here). The
 * device index is recorded and the Ethernet interface is registered with
 * dpdk_flag_change passed as the last argument. */
698 if (xd->pmd == VNET_DPDK_PMD_AF_PACKET)
700 f64 now = vlib_time_now (vm);
702 rnd = (u32) (now * 1e6);
703 rnd = random_u32 (&rnd);
704 clib_memcpy (addr + 2, &rnd, sizeof (rnd));
709 rte_eth_macaddr_get (i, (void *) addr);
712 xd->device_index = xd - dm->devices;
713 xd->per_interface_next_index = ~0;
715 /* assign interface to input thread */
718 error = ethernet_register_interface (
719 vnm, dpdk_device_class.index, xd->device_index,
720 /* ethernet address */ addr, &xd->hw_if_index, dpdk_flag_change);
/* Start of the MTU / maximum RX frame computation: the test below detects a
 * device whose advertised max_rx_pktlen is smaller than
 * ETHERNET_MAX_PACKET_BYTES. */
725 * Ensure default mtu is not > the mtu read from the hardware.
726 * Otherwise rte_eth_dev_configure() will fail and the port will
728 * Calculate max_frame_size and mtu supported by NIC
730 if (ETHERNET_MAX_PACKET_BYTES > dev_info.max_rx_pktlen)
733 * This device does not support the platform's max frame
734 * size. Use its advertised mru instead.
/* Derive mtu and max_rx_frame. In the small-device case both follow the
 * device limit (mtu is max_rx_pktlen minus the Ethernet header size). In the
 * other cases they are based on ETHERNET_MAX_PACKET_BYTES, with extra checks
 * that involve dpdk_port_crc_strip_enabled() and the 4-byte FCS; the
 * statements guarded by those checks are not visible in this listing. For
 * failsafe devices the MTU is re-read with rte_eth_dev_get_mtu() and
 * max_rx_frame is recomputed from it. */
736 max_rx_frame = dev_info.max_rx_pktlen;
737 mtu = dev_info.max_rx_pktlen - sizeof (ethernet_header_t);
741 /* VPP treats MTU and max_rx_pktlen both equal to
742 * ETHERNET_MAX_PACKET_BYTES, if dev_info.max_rx_pktlen >=
743 * ETHERNET_MAX_PACKET_BYTES + sizeof(ethernet_header_t)
745 if (dev_info.max_rx_pktlen >= (ETHERNET_MAX_PACKET_BYTES +
746 sizeof (ethernet_header_t)))
748 mtu = ETHERNET_MAX_PACKET_BYTES;
749 max_rx_frame = ETHERNET_MAX_PACKET_BYTES;
752 * Some platforms do not account for Ethernet FCS (4 bytes) in
753 * MTU calculations. To interop with them increase mru but only
754 * if the device's settings can support it.
756 if (dpdk_port_crc_strip_enabled (xd) &&
757 (dev_info.max_rx_pktlen >= (ETHERNET_MAX_PACKET_BYTES +
758 sizeof (ethernet_header_t) +
766 max_rx_frame = ETHERNET_MAX_PACKET_BYTES;
767 mtu = ETHERNET_MAX_PACKET_BYTES - sizeof (ethernet_header_t);
769 if (dpdk_port_crc_strip_enabled (xd) &&
770 (dev_info.max_rx_pktlen >= (ETHERNET_MAX_PACKET_BYTES + 4)))
777 if (xd->pmd == VNET_DPDK_PMD_FAILSAFE)
779 /* failsafe device numerables are reported with active device only,
780 * need to query the mtu for current device setup to overwrite
784 if (!rte_eth_dev_get_mtu (i, &dev_mtu))
787 max_rx_frame = mtu + sizeof (ethernet_header_t);
789 if (dpdk_port_crc_strip_enabled (xd))
/* Store the computed maximum RX frame length in the port configuration,
 * record the software interface index, attach the dpdk input node, and
 * register the RX queues: with a configured worker bitmap each queue is
 * registered on first_worker_thread_index + j for the bitmap entry j, and the
 * plain loop registers every queue with VNET_HW_IF_RXQ_THREAD_ANY. The
 * interface runtime data is refreshed afterwards. */
796 /*Set port rxmode config */
797 xd->port_conf.rxmode.max_rx_pkt_len = max_rx_frame;
799 sw = vnet_get_hw_sw_interface (vnm, xd->hw_if_index);
800 xd->sw_if_index = sw->sw_if_index;
801 vnet_hw_if_set_input_node (vnm, xd->hw_if_index, dpdk_input_node.index);
803 if (devconf->workers)
807 clib_bitmap_foreach (j, devconf->workers)
809 dpdk_rx_queue_t *rxq = vec_elt_at_index (xd->rx_queues, q);
810 rxq->queue_index = vnet_hw_if_register_rx_queue (
811 vnm, xd->hw_if_index, q++, vdm->first_worker_thread_index + j);
815 for (q = 0; q < xd->rx_q_used; q++)
817 dpdk_rx_queue_t *rxq = vec_elt_at_index (xd->rx_queues, q);
818 rxq->queue_index = vnet_hw_if_register_rx_queue (
819 vnm, xd->hw_if_index, q, VNET_HW_IF_RXQ_THREAD_ANY);
822 vnet_hw_if_update_runtime_data (vnm, xd->hw_if_index);
/* Fill in the vnet hardware interface: packet size limits and NUMA node from
 * the values computed earlier, MAC-filter capability plus the default L3
 * (non-promiscuous) mode, TX checksum capabilities when the device carries
 * DPDK_DEVICE_FLAG_TX_OFFLOAD and TX checksum offload is not disabled, and
 * TSO/GSO capabilities when TSO is requested. TSO needs
 * enable_tcp_udp_checksum and a TX checksum capability; the warning at the
 * end of this section is presumably emitted when that requirement is not
 * met (its else line is not visible in this listing). */
824 /*Get vnet hardware interface */
825 hi = vnet_get_hw_interface (vnm, xd->hw_if_index);
827 /*Override default max_packet_bytes and max_supported_bytes set in
828 * ethernet_register_interface() above*/
831 hi->max_packet_bytes = mtu;
832 hi->max_supported_packet_bytes = max_rx_frame;
833 hi->numa_node = xd->cpu_socket;
835 /* Indicate ability to support L3 DMAC filtering and
836 * initialize interface to L3 non-promisc mode */
837 hi->caps |= VNET_HW_INTERFACE_CAP_SUPPORTS_MAC_FILTER;
838 ethernet_set_flags (vnm, xd->hw_if_index,
839 ETHERNET_INTERFACE_FLAG_DEFAULT_L3);
842 if (dm->conf->no_tx_checksum_offload == 0)
843 if (xd->flags & DPDK_DEVICE_FLAG_TX_OFFLOAD && hi != NULL)
845 hi->caps |= VNET_HW_INTERFACE_CAP_SUPPORTS_TX_IP4_CKSUM |
846 VNET_HW_INTERFACE_CAP_SUPPORTS_TX_TCP_CKSUM |
847 VNET_HW_INTERFACE_CAP_SUPPORTS_TX_UDP_CKSUM;
848 if (dm->conf->enable_outer_checksum_offload)
850 hi->caps |= VNET_HW_INTERFACE_CAP_SUPPORTS_TX_IP4_OUTER_CKSUM |
851 VNET_HW_INTERFACE_CAP_SUPPORTS_TX_UDP_OUTER_CKSUM;
854 if (devconf->tso == DPDK_DEVICE_TSO_ON && hi != NULL)
856 /*tcp_udp checksum must be enabled*/
857 if ((dm->conf->enable_tcp_udp_checksum) &&
858 (hi->caps & VNET_HW_INTERFACE_CAP_SUPPORTS_TX_CKSUM))
860 hi->caps |= VNET_HW_INTERFACE_CAP_SUPPORTS_TCP_GSO;
861 xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
863 if (dm->conf->enable_outer_checksum_offload &&
864 (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VXLAN_TNL_TSO))
866 xd->port_conf.txmode.offloads |=
867 DEV_TX_OFFLOAD_VXLAN_TNL_TSO;
868 hi->caps |= VNET_HW_INTERFACE_CAP_SUPPORTS_VXLAN_TNL_GSO;
872 clib_warning ("%s: TCP/UDP checksum offload must be enabled",
/* Apply the accumulated configuration to the device, then set the configured
 * RSS queues (a failure is only reported with a warning) and log any errors
 * collected in xd->errors during setup. */
876 dpdk_device_setup (xd);
878 /* rss queues should be configured after dpdk_device_setup() */
879 if ((hi != NULL) && (devconf->rss_queues != NULL))
881 if (vnet_hw_interface_set_rss_queues
882 (vnet_get_main (), hi, devconf->rss_queues))
884 clib_warning ("%s: Failed to set rss queues", hi->name);
888 if (vec_len (xd->errors))
889 dpdk_log_err ("setup failed for device %U. Errors:\n %U",
890 format_dpdk_device_name, i,
891 format_dpdk_device_errors, xd);
894 * A note on Cisco VIC (PMD_ENIC) and VLAN:
896 * With Cisco VIC vNIC, every ingress packet is tagged. On a
897 * trunk vNIC (C series "standalone" server), packets on no VLAN
898 * are tagged with vlan 0. On an access vNIC (standalone or B
899 * series "blade" server), packets on the default/native VLAN
900 * are tagged with that vNIC's VLAN. VPP expects these packets
901 * to be untagged, and previously enabled VLAN strip on VIC by
902 * default. But it also broke vlan sub-interfaces.
904 * The VIC adapter has "untag default vlan" ingress VLAN rewrite
905 * mode, which removes tags from these packets. VPP now includes
906 * a local patch for the enic driver to use this untag mode, so
907 * enabling vlan stripping is no longer needed. In future, the
908 * driver + dpdk will have an API to set the mode after
909 * rte_eal_init. Then, this note and local patch will be
914 * VLAN stripping: default to VLAN strip disabled, unless specified
915 * otherwise in the startup config.
/* Apply the VLAN strip setting: read the port's current VLAN offload mask,
 * then either turn stripping on (when requested in the startup config) or,
 * if the port currently has it on, turn it off. The RX offload bit in the
 * port configuration is updated to match. */
918 vlan_off = rte_eth_dev_get_vlan_offload (xd->port_id);
919 if (devconf->vlan_strip_offload == DPDK_DEVICE_VLAN_STRIP_ON)
921 vlan_off |= ETH_VLAN_STRIP_OFFLOAD;
922 if (rte_eth_dev_set_vlan_offload (xd->port_id, vlan_off) >= 0)
923 dpdk_log_info ("VLAN strip enabled for interface\n");
925 dpdk_log_warn ("VLAN strip cannot be supported by interface\n");
926 xd->port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
930 if (vlan_off & ETH_VLAN_STRIP_OFFLOAD)
932 vlan_off &= ~ETH_VLAN_STRIP_OFFLOAD;
/* A negative return means the call failed; warn only in that case, matching
 * the success test (>= 0) used on the enable path above. */
933 if (rte_eth_dev_set_vlan_offload (xd->port_id, vlan_off) < 0)
934 dpdk_log_warn ("set VLAN offload failed\n");
936 xd->port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_VLAN_STRIP;
/* Recompute hi->max_packet_bytes from the configured max_rx_pkt_len minus the
 * Ethernet header size (the "hi NULL" warning presumably covers the case of a
 * missing hardware interface; its guard is not visible in this listing),
 * clamp the MTU to ETHER_MAX_LEN when multi-segment support is disabled, and
 * push the MTU to the port. */
940 hi->max_packet_bytes = xd->port_conf.rxmode.max_rx_pkt_len
941 - sizeof (ethernet_header_t);
943 dpdk_log_warn ("hi NULL");
945 if (dm->conf->no_multi_seg)
946 mtu = mtu > ETHER_MAX_LEN ? ETHER_MAX_LEN : mtu;
948 rte_eth_dev_set_mtu (xd->port_id, mtu);
/* Walk every PCI address returned by vlib_pci_get_all_dev_addrs() and bind
 * the devices this plugin handles to the configured UIO driver.
 *   conf - DPDK configuration; dev_confs and the PCI-address index are
 *          extended here with blacklist entries.
 * For each address the device info is fetched (a failure is logged and the
 * error freed), the device class is tested against network-Ethernet and
 * co-processor classes, any existing per-device config is looked up, and the
 * vendor/device blacklist is applied. A chain of vendor/device-id tests then
 * recognises the supported device families; devices matching none of them are
 * reported as unsupported. vmxnet3 devices get a blacklisted entry by
 * default, as does any device whose bind attempt returns an error (the error
 * is also reported). Much of the branch structure (braces, else, continue) is
 * not visible in this listing, so the exact flow should be confirmed against
 * the full source. */
957 dpdk_bind_devices_to_uio (dpdk_config_main_t * conf)
959 vlib_main_t *vm = vlib_get_main ();
962 int num_whitelisted = vec_len (conf->dev_confs);
963 vlib_pci_device_info_t *d = 0;
964 vlib_pci_addr_t *addr = 0, *addrs;
967 addrs = vlib_pci_get_all_dev_addrs ();
969 vec_foreach (addr, addrs)
971 dpdk_device_config_t * devconf = 0;
/* pci_addr is reused across iterations as a NUL-terminated address string. */
972 vec_reset_length (pci_addr);
973 pci_addr = format (pci_addr, "%U%c", format_vlib_pci_addr, addr, 0);
976 vlib_pci_free_device_info (d);
979 d = vlib_pci_get_device_info (vm, addr, &error);
982 vlib_log_warn (dpdk_main.log_default, "%U", format_clib_error, error);
983 clib_error_free (error);
987 if (d->device_class != PCI_CLASS_NETWORK_ETHERNET && d->device_class != PCI_CLASS_PROCESSOR_CO)
992 uword * p = hash_get (conf->device_config_index_by_pci_addr, addr->as_u32);
1000 devconf = pool_elt_at_index (conf->dev_confs, p[0]);
1003 /* Enforce Device blacklist by vendor and device */
1004 for (i = 0; i < vec_len (conf->blacklist_by_pci_vendor_and_device); i++)
/* Each blacklist entry packs the vendor id in the upper 16 bits and the
 * device id in the lower 16 bits. */
1007 vendor = (u16)(conf->blacklist_by_pci_vendor_and_device[i] >> 16);
1008 device = (u16)(conf->blacklist_by_pci_vendor_and_device[i] & 0xFFFF);
1009 if (d->vendor_id == vendor && d->device_id == device)
1012 * Expected case: device isn't whitelisted,
1013 * so blacklist it...
1017 /* Device is blacklisted */
1018 pool_get (conf->dev_confs, devconf);
1019 hash_set (conf->device_config_index_by_pci_addr, addr->as_u32,
1020 devconf - conf->dev_confs);
1021 devconf->pci_addr.as_u32 = addr->as_u32;
1022 devconf->dev_addr_type = VNET_DEV_ADDR_PCI;
1023 devconf->is_blacklisted = 1;
1026 else /* explicitly whitelisted, ignore the device blacklist */
1032 if (d->vendor_id == 0x1af4 &&
1033 (d->device_id == VIRTIO_PCI_LEGACY_DEVICEID_NET ||
1034 d->device_id == VIRTIO_PCI_MODERN_DEVICEID_NET))
1037 else if (d->vendor_id == 0x15ad && d->device_id == 0x07b0)
1040 * For vmxnet3 PCI, unless it is explicitly specified in the whitelist,
1041 * the default is to put it in the blacklist.
1045 pool_get (conf->dev_confs, devconf);
1046 hash_set (conf->device_config_index_by_pci_addr, addr->as_u32,
1047 devconf - conf->dev_confs);
1048 devconf->pci_addr.as_u32 = addr->as_u32;
1049 devconf->is_blacklisted = 1;
1052 /* all Intel network devices */
1053 else if (d->vendor_id == 0x8086 && d->device_class == PCI_CLASS_NETWORK_ETHERNET)
1055 /* all Intel QAT devices VFs */
1056 else if (d->vendor_id == 0x8086 && d->device_class == PCI_CLASS_PROCESSOR_CO &&
1057 (d->device_id == 0x0443 || d->device_id == 0x18a1 || d->device_id == 0x19e3 ||
1058 d->device_id == 0x37c9 || d->device_id == 0x6f55))
1061 else if (d->vendor_id == 0x1137 &&
1062 (d->device_id == 0x0043 || d->device_id == 0x0071))
1065 else if (d->vendor_id == 0x1425 && (d->device_id & 0xe000) == 0x4000)
1067 /* Amazon Elastic Network Adapter */
1068 else if (d->vendor_id == 0x1d0f && d->device_id >= 0xec20 && d->device_id <= 0xec21)
1070 /* Cavium Network Adapter */
1071 else if (d->vendor_id == 0x177d && d->device_id == 0x9712)
1073 /* Cavium FastlinQ QL41000 Series */
1074 else if (d->vendor_id == 0x1077 && d->device_id >= 0x8070 && d->device_id <= 0x8090)
1076 /* Mellanox CX3, CX3VF */
1077 else if (d->vendor_id == 0x15b3 && d->device_id >= 0x1003 && d->device_id <= 0x1004)
1081 /* Mellanox CX4, CX4VF, CX4LX, CX4LXVF, CX5, CX5VF, CX5EX, CX5EXVF */
1082 else if (d->vendor_id == 0x15b3 && d->device_id >= 0x1013 && d->device_id <= 0x101a)
1086 /* Mellanox CX6, CX6VF, CX6DX, CX6DXVF */
1087 else if (d->vendor_id == 0x15b3 && d->device_id >= 0x101b && d->device_id <= 0x101e)
/* NOTE(review): device id 0x1614 is tested twice in the explicit id list
 * below; the second test is redundant -- confirm whether another id was
 * intended. */
1091 /* Broadcom NetXtreme S, and E series only */
1092 else if (d->vendor_id == 0x14e4 &&
1093 ((d->device_id >= 0x16c0 &&
1094 d->device_id != 0x16c6 && d->device_id != 0x16c7 &&
1095 d->device_id != 0x16dd && d->device_id != 0x16f7 &&
1096 d->device_id != 0x16fd && d->device_id != 0x16fe &&
1097 d->device_id != 0x170d && d->device_id != 0x170c &&
1098 d->device_id != 0x170e && d->device_id != 0x1712 &&
1099 d->device_id != 0x1713) ||
1100 (d->device_id == 0x1604 || d->device_id == 0x1605 ||
1101 d->device_id == 0x1614 || d->device_id == 0x1606 ||
1102 d->device_id == 0x1609 || d->device_id == 0x1614)))
1106 dpdk_log_warn ("Unsupported PCI device 0x%04x:0x%04x found "
1107 "at PCI address %s\n", (u16) d->vendor_id, (u16) d->device_id,
1112 error = vlib_pci_bind_to_uio (vm, addr, (char *) conf->uio_driver_name);
/* The entry below marks the device as blacklisted and reports the bind
 * error. */
1118 pool_get (conf->dev_confs, devconf);
1119 hash_set (conf->device_config_index_by_pci_addr, addr->as_u32,
1120 devconf - conf->dev_confs);
1121 devconf->pci_addr.as_u32 = addr->as_u32;
1123 devconf->dev_addr_type = VNET_DEV_ADDR_PCI;
1124 devconf->is_blacklisted = 1;
1125 clib_error_report (error);
1129 vec_free (pci_addr);
1130 vlib_pci_free_device_info (d);
/* VMBUS counterpart of the PCI binding pass: walk every address returned by
 * vlib_vmbus_get_all_dev_addrs() and bind the device to UIO.
 *   conf - DPDK configuration; dev_confs and the VMBUS-address index are
 *          extended here with blacklist entries.
 * When a whitelist exists (dev_confs was non-empty on entry), addresses
 * without a config entry get a blacklisted entry. Addresses matching
 * blacklist_by_vmbus_addr are compared bytewise with memcmp and likewise
 * blacklisted. A device whose bind attempt returns an error is also recorded
 * as blacklisted and the error is reported. The branch structure is partly
 * elided in this listing, so the exact flow should be confirmed against the
 * full source. */
1134 dpdk_bind_vmbus_devices_to_uio (dpdk_config_main_t * conf)
1136 clib_error_t *error;
1137 vlib_vmbus_addr_t *addrs, *addr = 0;
1138 int num_whitelisted = vec_len (conf->dev_confs);
1141 addrs = vlib_vmbus_get_all_dev_addrs ();
1144 vec_foreach (addr, addrs)
1146 dpdk_device_config_t *devconf = 0;
1147 if (num_whitelisted)
1150 mhash_get (&conf->device_config_index_by_vmbus_addr, addr);
1153 /* No devices blacklisted, but have whitelisted. blacklist all
1154 * non-whitelisted */
1155 pool_get (conf->dev_confs, devconf);
1156 mhash_set (&conf->device_config_index_by_vmbus_addr, addr,
1157 devconf - conf->dev_confs, 0);
1158 devconf->vmbus_addr = *addr;
1159 devconf->dev_addr_type = VNET_DEV_ADDR_VMBUS;
1160 devconf->is_blacklisted = 1;
1165 devconf = pool_elt_at_index (conf->dev_confs, p[0]);
1168 /* Enforce Device blacklist by vmbus_addr */
1169 for (i = 0; i < vec_len (conf->blacklist_by_vmbus_addr); i++)
1171 vlib_vmbus_addr_t *a1 = &conf->blacklist_by_vmbus_addr[i];
1172 vlib_vmbus_addr_t *a2 = addr;
1173 if (memcmp (a1, a2, sizeof (vlib_vmbus_addr_t)) == 0)
1177 /* Device not whitelisted */
1178 pool_get (conf->dev_confs, devconf);
1179 mhash_set (&conf->device_config_index_by_vmbus_addr, addr,
1180 devconf - conf->dev_confs, 0);
1181 devconf->vmbus_addr = *addr;
1182 devconf->dev_addr_type = VNET_DEV_ADDR_VMBUS;
1183 devconf->is_blacklisted = 1;
1193 error = vlib_vmbus_bind_to_uio (addr);
/* The entry below marks the device as blacklisted and reports the bind
 * error. */
1198 pool_get (conf->dev_confs, devconf);
1199 mhash_set (&conf->device_config_index_by_vmbus_addr, addr,
1200 devconf - conf->dev_confs, 0);
1201 devconf->vmbus_addr = *addr;
1203 devconf->dev_addr_type = VNET_DEV_ADDR_VMBUS;
1204 devconf->is_blacklisted = 1;
1205 clib_error_report (error);
/*
 * unformat helper for the "max-simd-bitwidth" option.
 *
 * The single variadic argument is a uword * that receives the parsed
 * number; the value is then compared against DPDK_MAX_SIMD_BITWIDTH_256
 * and DPDK_MAX_SIMD_BITWIDTH_512.
 *
 * NOTE(review): "%u" is handed a uword *; confirm that the width
 * unformat stores for "%u" matches uword on every supported target.
 */
1212 unformat_max_simd_bitwidth (unformat_input_t *input, va_list *va)
1214 uword *max_simd_bitwidth = va_arg (*va, uword *);
1216 while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
1218 if (!unformat (input, "%u", max_simd_bitwidth))
/* Only the two named bit widths pass this check. */
1221 if (*max_simd_bitwidth != DPDK_MAX_SIMD_BITWIDTH_256 &&
1222 *max_simd_bitwidth != DPDK_MAX_SIMD_BITWIDTH_512)
/*
 * Create and fill the per-device configuration for one "dev ..." entry.
 *
 * conf:      DPDK configuration owning default_devconf, the dev_confs
 *            pool and the PCI / VMBus address indexes.
 * addr:      vlib_pci_addr_t * or vlib_vmbus_addr_t *, interpreted
 *            according to addr_type; dpdk_config passes 0 for
 *            "dev default".
 * addr_type: VNET_DEV_ADDR_PCI, VNET_DEV_ADDR_VMBUS or VNET_DEV_ADDR_ANY.
 * input:     per-device option text; dpdk_config passes 0 for a bare
 *            "dev <addr>" line.
 *
 * Returns 0 on success, or a clib_error_t * for a duplicate address, an
 * unknown option, or a worker / rx-queue count mismatch.
 */
1230 static clib_error_t *
1231 dpdk_device_config (dpdk_config_main_t *conf, void *addr,
1232 dpdk_device_addr_type_t addr_type, unformat_input_t *input,
1235 clib_error_t *error = 0;
1237 dpdk_device_config_t *devconf = 0;
1238 unformat_input_t sub_input;
1242 devconf = &conf->default_devconf;
/* PCI devices are indexed by the 32-bit form of their address. */
1244 else if (addr_type == VNET_DEV_ADDR_PCI)
1246 p = hash_get (conf->device_config_index_by_pci_addr,
1247 ((vlib_pci_addr_t *) (addr))->as_u32);
1251 pool_get (conf->dev_confs, devconf);
1252 hash_set (conf->device_config_index_by_pci_addr,
1253 ((vlib_pci_addr_t *) (addr))->as_u32,
1254 devconf - conf->dev_confs);
1257 return clib_error_return (0,
1258 "duplicate configuration for PCI address %U",
1259 format_vlib_pci_addr, addr);
/* VMBus devices are indexed by the whole address structure. */
1261 else if (addr_type == VNET_DEV_ADDR_VMBUS)
1263 p = mhash_get (&conf->device_config_index_by_vmbus_addr,
1264 (vlib_vmbus_addr_t *) (addr));
1268 pool_get (conf->dev_confs, devconf);
1269 mhash_set (&conf->device_config_index_by_vmbus_addr, addr,
1270 devconf - conf->dev_confs, 0);
1273 return clib_error_return (
1274 0, "duplicate configuration for VMBUS address %U",
1275 format_vlib_vmbus_addr, addr);
/* Record address, address type and the TSO default on the entry. */
1278 if (addr_type == VNET_DEV_ADDR_PCI)
1280 devconf->pci_addr.as_u32 = ((vlib_pci_addr_t *) (addr))->as_u32;
1281 devconf->tso = DPDK_DEVICE_TSO_DEFAULT;
1282 devconf->dev_addr_type = VNET_DEV_ADDR_PCI;
1284 else if (addr_type == VNET_DEV_ADDR_VMBUS)
1286 devconf->vmbus_addr = *((vlib_vmbus_addr_t *) (addr));
1287 devconf->tso = DPDK_DEVICE_TSO_DEFAULT;
1288 devconf->dev_addr_type = VNET_DEV_ADDR_VMBUS;
/* Consume per-device keywords until the input is exhausted. */
1294 unformat_skip_white_space (input);
1295 while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
1297 if (unformat (input, "num-rx-queues %u", &devconf->num_rx_queues))
1299 else if (unformat (input, "num-tx-queues %u", &devconf->num_tx_queues))
1301 else if (unformat (input, "num-rx-desc %u", &devconf->num_rx_desc))
1303 else if (unformat (input, "num-tx-desc %u", &devconf->num_tx_desc))
1305 else if (unformat (input, "name %s", &devconf->name))
1307 else if (unformat (input, "workers %U", unformat_bitmap_list,
1312 (input, "rss %U", unformat_vlib_cli_sub_input, &sub_input))
1314 error = unformat_rss_fn (&sub_input, &devconf->rss_fn);
1318 else if (unformat (input, "vlan-strip-offload off"))
1319 devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_OFF;
1320 else if (unformat (input, "vlan-strip-offload on"))
1321 devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_ON;
1322 else if (unformat (input, "tso on"))
1324 devconf->tso = DPDK_DEVICE_TSO_ON;
1326 else if (unformat (input, "tso off"))
1328 devconf->tso = DPDK_DEVICE_TSO_OFF;
1330 else if (unformat (input, "devargs %s", &devconf->devargs))
1332 else if (unformat (input, "rss-queues %U",
1333 unformat_bitmap_list, &devconf->rss_queues))
1335 else if (unformat (input, "max-lro-pkt-size %u",
1336 &devconf->max_lro_pkt_size))
1340 error = clib_error_return (0, "unknown input `%U'",
1341 format_unformat_error, input);
/* A worker list without an explicit rx queue count implies one rx
 * queue per listed worker; an explicit count must equal the number of
 * listed workers.
 * NOTE(review): the error text always formats addr with
 * format_vlib_pci_addr, although addr may be a VMBus address or 0 (the
 * "dev default" call) -- confirm this is intended. */
1349 if (devconf->workers && devconf->num_rx_queues == 0)
1350 devconf->num_rx_queues = clib_bitmap_count_set_bits (devconf->workers);
1351 else if (devconf->workers &&
1352 clib_bitmap_count_set_bits (devconf->workers) !=
1353 devconf->num_rx_queues)
1354 error = clib_error_return (0,
1355 "%U: number of worker threads must be "
1356 "equal to number of rx queues",
1357 format_vlib_pci_addr, addr);
/*
 * Read callback for the DPDK logging pipe (installed as
 * t.read_function in dpdk_config).
 *
 * Reads the pending bytes from uf->file_descriptor into a vector, then
 * emits each line through dpdk_log_notice ().
 *
 * Returns a unix clib_error_t * when read () fails with anything other
 * than EAGAIN.
 */
1362 static clib_error_t *
1363 dpdk_log_read_ready (clib_file_t * uf)
1365 unformat_input_t input;
/* Grow s by n_try bytes and read into the newly added tail. */
1372 uword len = vec_len (s);
1373 vec_resize (s, len + n_try);
1375 n = read (uf->file_descriptor, s + len, n_try);
/* NOTE(review): s does not appear to be released on this error return
 * -- confirm. */
1376 if (n < 0 && errno != EAGAIN)
1377 return clib_error_return_unix (0, "read");
/* Trim the vector to the bytes actually read; a negative n (EAGAIN)
 * contributes nothing. */
1378 _vec_len (s) = len + (n < 0 ? 0 : n);
/* Hand the buffer to unformat and log it one line at a time. */
1381 unformat_init_vector (&input, s);
1383 while (unformat_user (&input, unformat_line, &line))
1385 dpdk_log_notice ("%v", line);
1389 unformat_free (&input);
/*
 * Handler for the "dpdk" startup configuration section (registered with
 * VLIB_CONFIG_FUNCTION at the end of this block).
 *
 * Parses the section into dpdk_config_main and accumulates the argument
 * vector for rte_eal_init () in conf->eal_init_args. It then binds PCI
 * and VMBus devices to UIO when running as root, adds per-device "-a" /
 * "-b" EAL arguments, redirects the DPDK log stream into a pipe
 * serviced by dpdk_log_read_ready (), calls rte_eal_init () and finally
 * dpdk_buffer_pools_create ().
 *
 * vm:    vlib main; reassigned from vlib_get_main () before EAL init.
 * input: text of the "dpdk" section.
 *
 * Returns 0 on success or a clib_error_t *, e.g. for an out-of-range
 * blacklist vendor / device id or a failing rte_eal_init ().
 */
1393 static clib_error_t *
1394 dpdk_config (vlib_main_t * vm, unformat_input_t * input)
1396 clib_error_t *error = 0;
1397 dpdk_config_main_t *conf = &dpdk_config_main;
1398 vlib_thread_main_t *tm = vlib_get_thread_main ();
1399 dpdk_device_config_t *devconf;
1400 vlib_pci_addr_t pci_addr = { 0 };
1401 vlib_vmbus_addr_t vmbus_addr = { 0 };
1402 unformat_input_t sub_input;
1403 uword default_hugepage_sz, x;
1406 int num_whitelisted = 0;
1407 int eal_no_hugetlb = 0;
1412 u8 *huge_dir_path = 0;
1413 u32 vendor, device, domain, bus, func;
1416 format (0, "%s/hugepages%c", vlib_unix_get_runtime_dir (), 0);
/* Lookup tables from device address to index in conf->dev_confs. */
1418 conf->device_config_index_by_pci_addr = hash_create (0, sizeof (uword));
1419 mhash_init (&conf->device_config_index_by_vmbus_addr, sizeof (uword),
1420 sizeof (vlib_vmbus_addr_t));
1422 while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
1424 /* Prime the pump */
1425 if (unformat (input, "no-hugetlb"))
1427 vec_add1 (conf->eal_init_args, (u8 *) "--no-huge");
1430 else if (unformat (input, "telemetry"))
1431 conf->enable_telemetry = 1;
1433 else if (unformat (input, "enable-tcp-udp-checksum"))
1435 conf->enable_tcp_udp_checksum = 1;
/* "enable-outer-checksum-offload" is only looked for directly after
 * "enable-tcp-udp-checksum". */
1436 if (unformat (input, "enable-outer-checksum-offload"))
1437 conf->enable_outer_checksum_offload = 1;
1439 else if (unformat (input, "no-tx-checksum-offload"))
1440 conf->no_tx_checksum_offload = 1;
1442 else if (unformat (input, "decimal-interface-names"))
1443 conf->interface_name_format_decimal = 1;
1445 else if (unformat (input, "no-multi-seg"))
1446 conf->no_multi_seg = 1;
1447 else if (unformat (input, "enable-lro"))
1448 conf->enable_lro = 1;
1449 else if (unformat (input, "max-simd-bitwidth %U",
1450 unformat_max_simd_bitwidth, &conf->max_simd_bitwidth))
1452 else if (unformat (input, "dev default %U", unformat_vlib_cli_sub_input,
1456 dpdk_device_config (conf, 0, VNET_DEV_ADDR_ANY, &sub_input, 1);
1463 (input, "dev %U %U", unformat_vlib_pci_addr, &pci_addr,
1464 unformat_vlib_cli_sub_input, &sub_input))
1466 error = dpdk_device_config (conf, &pci_addr, VNET_DEV_ADDR_PCI,
1474 else if (unformat (input, "dev %U", unformat_vlib_pci_addr, &pci_addr))
1477 dpdk_device_config (conf, &pci_addr, VNET_DEV_ADDR_PCI, 0, 0);
1484 else if (unformat (input, "dev %U %U", unformat_vlib_vmbus_addr,
1485 &vmbus_addr, unformat_vlib_cli_sub_input, &sub_input))
1487 error = dpdk_device_config (conf, &vmbus_addr, VNET_DEV_ADDR_VMBUS,
1495 else if (unformat (input, "dev %U", unformat_vlib_vmbus_addr,
1499 dpdk_device_config (conf, &vmbus_addr, VNET_DEV_ADDR_VMBUS, 0, 0);
1506 else if (unformat (input, "uio-driver %s", &conf->uio_driver_name))
1508 else if (unformat (input, "socket-mem %s", &socket_mem))
1510 else if (unformat (input, "no-pci"))
1513 tmp = format (0, "--no-pci%c", 0);
1514 vec_add1 (conf->eal_init_args, tmp);
1516 else if (unformat (input, "blacklist %U", unformat_vlib_vmbus_addr,
1519 vec_add1 (conf->blacklist_by_vmbus_addr, vmbus_addr);
1523 (input, "blacklist %x:%x:%x.%x", &domain, &bus, &device, &func))
1525 tmp = format (0, "-b%c", 0);
1526 vec_add1 (conf->eal_init_args, tmp);
1528 format (0, "%04x:%02x:%02x.%x%c", domain, bus, device, func, 0);
1529 vec_add1 (conf->eal_init_args, tmp);
1531 else if (unformat (input, "blacklist %x:%x", &vendor, &device))
1533 u32 blacklist_entry;
1534 if (vendor > 0xFFFF)
1535 return clib_error_return (0, "blacklist PCI vendor out of range");
1536 if (device > 0xFFFF)
1537 return clib_error_return (0, "blacklist PCI device out of range");
/* Vendor id in the upper 16 bits, device id in the lower 16 bits. */
1538 blacklist_entry = (vendor << 16) | (device & 0xffff);
1539 vec_add1 (conf->blacklist_by_pci_vendor_and_device,
1542 else if (unformat (input, "no-vmbus"))
1545 tmp = format (0, "--no-vmbus%c", 0);
1546 vec_add1 (conf->eal_init_args, tmp);
1550 else if (unformat(input, #a)) \
1552 tmp = format (0, "--%s%c", #a, 0); \
1553 vec_add1 (conf->eal_init_args, tmp); \
1555 foreach_eal_double_hyphen_predicate_arg
1558 else if (unformat(input, #a " %s", &s)) \
1560 if (!strncmp(#a, "file-prefix", 11)) \
1562 tmp = format (0, "--%s%c", #a, 0); \
1563 vec_add1 (conf->eal_init_args, tmp); \
1565 if (!strncmp(#a, "vdev", 4)) \
1566 if (strstr((char*)s, "af_packet")) \
1567 clib_warning ("af_packet obsoleted. Use CLI 'create host-interface'."); \
1568 vec_add1 (conf->eal_init_args, s); \
1570 foreach_eal_double_hyphen_arg
1573 else if (unformat(input, #a " %s", &s)) \
1575 tmp = format (0, "-%s%c", #b, 0); \
1576 vec_add1 (conf->eal_init_args, tmp); \
1578 vec_add1 (conf->eal_init_args, s); \
1580 foreach_eal_single_hyphen_arg
1582 else if (unformat (input, "default"))
1585 else if (unformat_skip_white_space (input))
1589 error = clib_error_return (0, "unknown input `%U'",
1590 format_unformat_error, input);
/* Without an explicit "uio-driver", the driver name becomes "auto". */
1595 if (!conf->uio_driver_name)
1596 conf->uio_driver_name = format (0, "auto%c", 0);
1598 if (eal_no_hugetlb == 0)
1600 vec_add1 (conf->eal_init_args, (u8 *) "--in-memory");
1602 default_hugepage_sz = clib_mem_get_default_hugepage_size ();
1605 clib_bitmap_foreach (x, tm->cpu_socket_bitmap)
1609 /* preallocate at least 16MB of hugepages per socket,
1610 if more is needed it is up to consumer to preallocate more */
1611 n_pages = round_pow2 ((uword) 16 << 20, default_hugepage_sz);
1612 n_pages /= default_hugepage_sz;
1614 if ((e = clib_sysfs_prealloc_hugepages(x, 0, n_pages)))
1615 clib_error_report (e);
1620 /* on/off dpdk's telemetry thread */
1621 if (conf->enable_telemetry == 0)
1623 vec_add1 (conf->eal_init_args, (u8 *) "--no-telemetry");
1628 tmp = format (0, "--file-prefix%c", 0);
1629 vec_add1 (conf->eal_init_args, tmp);
1630 tmp = format (0, "vpp%c", 0);
1631 vec_add1 (conf->eal_init_args, tmp);
/* UIO binding is only attempted with effective uid 0 and when the
 * corresponding no_pci / no_vmbus flag is zero. */
1637 if (no_pci == 0 && geteuid () == 0)
1638 dpdk_bind_devices_to_uio (conf);
1640 if (no_vmbus == 0 && geteuid () == 0)
1641 dpdk_bind_vmbus_devices_to_uio (conf);
1644 if (devconf->x == 0 && conf->default_devconf.x > 0) \
1645 devconf->x = conf->default_devconf.x ;
1647 pool_foreach (devconf, conf->dev_confs) {
1649 /* default per-device config items */
1650 foreach_dpdk_device_config_item
1652 /* copy vlan_strip config from default device */
1653 _ (vlan_strip_offload)
1655 /* copy tso config from default device */
1658 /* copy tso config from default device */
/* NOTE(review): the comment above repeats the previous one verbatim;
 * confirm which item it labels. */
1661 /* copy rss_queues config from default device */
1664 /* add DPDK EAL whitelist/blacklist entry */
/* With a whitelist, non-blacklisted PCI devices are passed as
 * "-a <addr>[,<devargs>]"; without one, blacklisted PCI devices are
 * passed as "-b <addr>". */
1665 if (num_whitelisted > 0 && devconf->is_blacklisted == 0 &&
1666 devconf->dev_addr_type == VNET_DEV_ADDR_PCI)
1668 tmp = format (0, "-a%c", 0);
1669 vec_add1 (conf->eal_init_args, tmp);
1670 if (devconf->devargs)
1672 tmp = format (0, "%U,%s%c", format_vlib_pci_addr,
1673 &devconf->pci_addr, devconf->devargs, 0);
1677 tmp = format (0, "%U%c", format_vlib_pci_addr, &devconf->pci_addr, 0);
1679 vec_add1 (conf->eal_init_args, tmp);
1681 else if (num_whitelisted == 0 && devconf->is_blacklisted != 0 &&
1682 devconf->dev_addr_type == VNET_DEV_ADDR_PCI)
1684 tmp = format (0, "-b%c", 0);
1685 vec_add1 (conf->eal_init_args, tmp);
1686 tmp = format (0, "%U%c", format_vlib_pci_addr, &devconf->pci_addr, 0);
1687 vec_add1 (conf->eal_init_args, tmp);
1694 clib_warning ("socket-mem argument is deprecated");
1696 /* NULL terminate the "argv" vector, in case of stupidity */
/* The length is decremented again so that the terminator is not
 * counted as an argument. */
1697 vec_add1 (conf->eal_init_args, 0);
1698 _vec_len (conf->eal_init_args) -= 1;
1700 /* Set up DPDK eal and packet mbuf pool early. */
/* Route the DPDK log stream into a pipe: the write end is made
 * non-blocking and handed to rte_openlog_stream (), the read end is
 * registered with dpdk_log_read_ready () as its read callback. */
1702 int log_fds[2] = { 0 };
1703 if (pipe (log_fds) == 0)
1705 if (fcntl (log_fds[1], F_SETFL, O_NONBLOCK) == 0)
1707 FILE *f = fdopen (log_fds[1], "a");
1708 if (f && rte_openlog_stream (f) == 0)
1710 clib_file_t t = { 0 };
1711 t.read_function = dpdk_log_read_ready;
1712 t.file_descriptor = log_fds[0];
1713 t.description = format (0, "DPDK logging pipe");
1714 clib_file_add (&file_main, &t);
1724 vm = vlib_get_main ();
1726 /* make copy of args as rte_eal_init tends to mess up with arg array */
/* The loop starts at index 1, so element 0 is left out of the
 * printable copy. */
1727 for (i = 1; i < vec_len (conf->eal_init_args); i++)
1728 conf->eal_init_args_str = format (conf->eal_init_args_str, "%s ",
1729 conf->eal_init_args[i]);
1731 vec_terminate_c_string (conf->eal_init_args_str);
1733 dpdk_log_notice ("EAL init args: %s", conf->eal_init_args_str);
1734 ret = rte_eal_init (vec_len (conf->eal_init_args),
1735 (char **) conf->eal_init_args);
1737 /* enable the AVX-512 vPMDs in DPDK */
1738 if (clib_cpu_supports_avx512_bitalg () &&
1739 conf->max_simd_bitwidth == DPDK_MAX_SIMD_BITWIDTH_DEFAULT)
1740 rte_vect_set_max_simd_bitwidth (RTE_VECT_SIMD_512);
1741 else if (conf->max_simd_bitwidth != DPDK_MAX_SIMD_BITWIDTH_DEFAULT)
1742 rte_vect_set_max_simd_bitwidth (conf->max_simd_bitwidth ==
1743 DPDK_MAX_SIMD_BITWIDTH_256 ?
1747 /* lazy umount hugepages */
1748 umount2 ((char *) huge_dir_path, MNT_DETACH);
1749 rmdir ((char *) huge_dir_path);
1750 vec_free (huge_dir_path);
1753 return clib_error_return (0, "rte_eal_init returned %d", ret);
1755 /* main thread 1st */
1756 if ((error = dpdk_buffer_pools_create (vm)))
/* Register dpdk_config as the handler of the "dpdk" config section. */
1763 VLIB_CONFIG_FUNCTION (dpdk_config, "dpdk");
/*
 * Refresh the cached link state of one DPDK device and propagate
 * changes to the vnet hardware interface.
 *
 * xd:  device whose xd->link is re-read with rte_eth_link_get_nowait ().
 * now: timestamp stored in xd->time_last_link_update; a zero value
 *      leaves the stored timestamp unchanged.
 *
 * Only devices with DPDK_DEVICE_FLAG_PMD set are processed. Duplex,
 * speed and status are each compared against the previous link
 * snapshot.
 */
1766 dpdk_update_link_state (dpdk_device_t * xd, f64 now)
1768 vnet_main_t *vnm = vnet_get_main ();
/* Snapshot taken before xd->link is cleared and re-read below. */
1769 struct rte_eth_link prev_link = xd->link;
1771 u8 hw_flags_chg = 0;
1773 /* only update link state for PMD interfaces */
1774 if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0)
1777 xd->time_last_link_update = now ? now : xd->time_last_link_update;
1778 clib_memset (&xd->link, 0, sizeof (xd->link));
1779 rte_eth_link_get_nowait (xd->port_id, &xd->link);
/* Event logging stays disabled while LINK_STATE_ELOGS is 0. */
1781 if (LINK_STATE_ELOGS)
1783 ELOG_TYPE_DECLARE (e) =
1786 "update-link-state: sw_if_index %d, admin_up %d,"
1787 "old link_state %d new link_state %d",.format_args = "i4i1i1i1",};
1796 ed = ELOG_DATA (&vlib_global_main.elog_main, e);
1797 ed->sw_if_index = xd->sw_if_index;
1798 ed->admin_up = (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) != 0;
1799 ed->old_link_state = (u8)
1800 vnet_hw_interface_is_link_up (vnm, xd->hw_if_index);
1801 ed->new_link_state = (u8) xd->link.link_status;
1804 if ((xd->link.link_duplex != prev_link.link_duplex))
1807 switch (xd->link.link_duplex)
1809 case ETH_LINK_HALF_DUPLEX:
1810 hw_flags |= VNET_HW_INTERFACE_FLAG_HALF_DUPLEX;
1812 case ETH_LINK_FULL_DUPLEX:
1813 hw_flags |= VNET_HW_INTERFACE_FLAG_FULL_DUPLEX;
/* The speed is scaled by 1000 before being handed to vnet --
 * presumably a Mbps to kbps conversion; confirm units. */
1819 if (xd->link.link_speed != prev_link.link_speed)
1820 vnet_hw_interface_set_link_speed (vnm, xd->hw_if_index,
1821 xd->link.link_speed * 1000);
1823 if (xd->link.link_status != prev_link.link_status)
1827 if (xd->link.link_status)
1828 hw_flags |= VNET_HW_INTERFACE_FLAG_LINK_UP;
1833 if (LINK_STATE_ELOGS)
1835 ELOG_TYPE_DECLARE (e) =
1838 "update-link-state: sw_if_index %d, new flags %d",.format_args
1846 ed = ELOG_DATA (&vlib_global_main.elog_main, e);
1847 ed->sw_if_index = xd->sw_if_index;
1848 ed->flags = hw_flags;
/* Publish the accumulated flags on the hardware interface. */
1850 vnet_hw_interface_set_flags (vnm, xd->hw_if_index, hw_flags);
/*
 * Body of the "dpdk-process" process node.
 *
 * Runs dpdk_lib_init () and, when available, dpdk_cryptodev_init (),
 * sets tm->worker_thread_release and refreshes the link state of every
 * device once. It then periodically updates per-device counters and
 * link state, using dm->stat_poll_interval and
 * dm->link_state_poll_interval.
 */
1855 dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f)
1857 clib_error_t *error;
1858 dpdk_main_t *dm = &dpdk_main;
1860 vlib_thread_main_t *tm = vlib_get_thread_main ();
1862 error = dpdk_lib_init (dm);
1865 clib_error_report (error);
/* The function symbol itself is tested before the call -- presumably a
 * weak reference that may be unresolved; confirm. */
1867 if (dpdk_cryptodev_init)
1869 error = dpdk_cryptodev_init (vm);
1872 vlib_log_warn (dpdk_main.log_cryptodev, "%U", format_clib_error,
1874 clib_error_free (error);
/* Set only after the initialisation calls above have run. */
1878 tm->worker_thread_release = 1;
/* Initial link-state refresh for every device. */
1880 f64 now = vlib_time_now (vm);
1881 vec_foreach (xd, dm->devices)
1883 dpdk_update_link_state (xd, now);
1889 * check each time through the loop in case intervals are changed
1891 f64 min_wait = dm->link_state_poll_interval < dm->stat_poll_interval ?
1892 dm->link_state_poll_interval : dm->stat_poll_interval;
/* Sleep for the shorter of the two poll intervals, or until an event
 * arrives. */
1894 vlib_process_wait_for_event_or_clock (vm, min_wait);
1896 if (dm->admin_up_down_in_progress)
1897 /* skip the poll if an admin up down is in progress (on any interface) */
1900 vec_foreach (xd, dm->devices)
/* Each device is refreshed only once its own interval has elapsed. */
1902 f64 now = vlib_time_now (vm);
1903 if ((now - xd->time_last_stats_update) >= dm->stat_poll_interval)
1904 dpdk_update_counters (xd, now);
1905 if ((now - xd->time_last_link_update) >= dm->link_state_poll_interval)
1906 dpdk_update_link_state (xd, now);
/* Registers dpdk_process () as the process-type node "dpdk-process". */
1915 VLIB_REGISTER_NODE (dpdk_process_node,static) = {
1916 .function = dpdk_process,
1917 .type = VLIB_NODE_TYPE_PROCESS,
1918 .name = "dpdk-process",
/* log2 of the process stack size; 17 is 128 KiB if taken literally. */
1919 .process_log2_n_stack_bytes = 17,
/*
 * Plugin init function (registered with VLIB_INIT_FUNCTION below).
 *
 * Checks the cache-line layout of dpdk_device_t and frame_queue_trace_t
 * at compile time, points dm->conf at dpdk_config_main, seeds the EAL
 * argument vector, and sets the default buffer flags, poll intervals
 * and log classes.
 */
1923 static clib_error_t *
1924 dpdk_init (vlib_main_t * vm)
1926 dpdk_main_t *dm = &dpdk_main;
1927 clib_error_t *error = 0;
1929 /* verify that structs are cacheline aligned */
1930 STATIC_ASSERT (offsetof (dpdk_device_t, cacheline0) == 0,
1931 "Cache line marker must be 1st element in dpdk_device_t");
1932 STATIC_ASSERT (offsetof (dpdk_device_t, cacheline1) ==
1933 CLIB_CACHE_LINE_BYTES,
1934 "Data in cache line 0 is bigger than cache line size");
1935 STATIC_ASSERT (offsetof (frame_queue_trace_t, cacheline0) == 0,
1936 "Cache line marker must be 1st element in frame_queue_trace_t");
1938 dpdk_cli_reference ();
1940 dm->conf = &dpdk_config_main;
/* Presumably the argv[0] (program name) slot of the EAL argument
 * vector; dpdk_config () skips index 0 when it builds the printable
 * copy of the arguments -- confirm ordering against the config phase. */
1942 vec_add1 (dm->conf->eal_init_args, (u8 *) "vnet");
1944 /* Default vlib_buffer_t flags, DISABLES tcp/udp checksumming... */
1945 dm->buffer_flags_template = (VLIB_BUFFER_TOTAL_LENGTH_VALID |
1946 VLIB_BUFFER_EXT_HDR_VALID |
1947 VNET_BUFFER_F_L4_CHECKSUM_COMPUTED |
1948 VNET_BUFFER_F_L4_CHECKSUM_CORRECT);
/* Default poll intervals, read by dpdk_process (). */
1950 dm->stat_poll_interval = DPDK_STATS_POLL_INTERVAL;
1951 dm->link_state_poll_interval = DPDK_LINK_POLL_INTERVAL;
/* Log classes: the "dpdk" class and its "cryptodev" / "ipsec"
 * subclasses. */
1953 dm->log_default = vlib_log_register_class ("dpdk", 0);
1954 dm->log_cryptodev = vlib_log_register_class ("dpdk", "cryptodev");
1955 dm->log_ipsec = vlib_log_register_class ("dpdk", "ipsec");
1960 VLIB_INIT_FUNCTION (dpdk_init);
/*
 * Worker-thread init hook (registered with VLIB_WORKER_INIT_FUNCTION
 * below).
 *
 * Registers the calling thread with DPDK through rte_thread_register ().
 * On failure it panics, printing vm->thread_index and the text for
 * rte_errno.
 */
1962 static clib_error_t *
1963 dpdk_worker_thread_init (vlib_main_t *vm)
1965 if (rte_thread_register () < 0)
1966 clib_panic ("dpdk: cannot register thread %u - %s", vm->thread_index,
1967 rte_strerror (rte_errno));
1971 VLIB_WORKER_INIT_FUNCTION (dpdk_worker_thread_init);