4 * Copyright 2015 6WIND S.A.
5 * Copyright 2015 Mellanox.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of 6WIND S.A. nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46 #include <sys/ioctl.h>
47 #include <sys/socket.h>
48 #include <sys/utsname.h>
49 #include <netinet/in.h>
50 #include <linux/ethtool.h>
51 #include <linux/sockios.h>
52 #include <linux/version.h>
57 #include <rte_atomic.h>
58 #include <rte_ethdev.h>
59 #include <rte_bus_pci.h>
61 #include <rte_common.h>
62 #include <rte_interrupts.h>
63 #include <rte_alarm.h>
64 #include <rte_malloc.h>
67 #include "mlx5_rxtx.h"
68 #include "mlx5_utils.h"
70 /* Add defines in case the running kernel is not the same as user headers. */
71 #ifndef ETHTOOL_GLINKSETTINGS
/*
 * Compatibility fallback when the toolchain's ethtool headers predate
 * ETHTOOL_GLINKSETTINGS (added in Linux 4.5).
 * NOTE(review): this excerpt elides several struct members and the closing
 * brace of the struct — consult the full file before editing.
 */
72 struct ethtool_link_settings {
81 uint8_t eth_tp_mdix_ctrl;
/* Negative value in the kernel's reply carries the required nwords count
 * during the ETHTOOL_GLINKSETTINGS handshake. */
82 int8_t link_mode_masks_nwords;
84 uint32_t link_mode_masks[];
87 #define ETHTOOL_GLINKSETTINGS 0x0000004c
/* Link-mode bit positions, later tested via MLX5_BITSHIFT() to derive
 * ETH_LINK_SPEED_* capability flags. */
88 #define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
89 #define ETHTOOL_LINK_MODE_Autoneg_BIT 6
90 #define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
91 #define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
92 #define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
93 #define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
94 #define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
95 #define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
96 #define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
97 #define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
98 #define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
99 #define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
100 #define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
101 #define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
102 #define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
103 #define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
/* 25G/50G/100G link modes appeared in later kernels; define them only when
 * the headers do not provide them. */
105 #ifndef HAVE_ETHTOOL_LINK_MODE_25G
106 #define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
107 #define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
108 #define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
110 #ifndef HAVE_ETHTOOL_LINK_MODE_50G
111 #define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
112 #define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
114 #ifndef HAVE_ETHTOOL_LINK_MODE_100G
115 #define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
116 #define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
117 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
118 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
122 * Return private structure associated with an Ethernet device.
125 * Pointer to Ethernet device structure.
128 * Pointer to private structure.
/* Trivial accessor: the PMD's private context is stored in
 * dev->data->dev_private by the probe path. */
131 mlx5_get_priv(struct rte_eth_dev *dev)
133 return dev->data->dev_private;
137 * Get interface name from private structure.
140 * Pointer to private structure.
142 * Interface name output buffer.
145 * 0 on success, -1 on failure and errno is set.
/* Resolves the Linux netdev name backing this IB port by scanning
 * <ibdev_path>/device/net/ in sysfs and matching the entry whose
 * dev_port (or dev_id on old kernels) equals priv->port - 1. */
148 priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
152 unsigned int dev_type = 0;
/* ~0u sentinel: no previous port value seen yet. */
153 unsigned int dev_port_prev = ~0u;
154 char match[IF_NAMESIZE] = "";
157 MKSTR(path, "%s/device/net", priv->ibdev_path);
163 while ((dent = readdir(dir)) != NULL) {
164 char *name = dent->d_name;
166 unsigned int dev_port;
/* Skip "." and ".." directory entries. */
169 if ((name[0] == '.') &&
170 ((name[1] == '\0') ||
171 ((name[1] == '.') && (name[2] == '\0'))))
174 MKSTR(path, "%s/device/net/%s/%s",
175 priv->ibdev_path, name,
176 (dev_type ? "dev_id" : "dev_port"))
178 file = fopen(path, "rb");
183 * Switch to dev_id when dev_port does not exist as
184 * is the case with Linux kernel versions < 3.15.
/* dev_id is hexadecimal, dev_port decimal. */
195 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
200 * Switch to dev_id when dev_port returns the same value for
201 * all ports. May happen when using a MOFED release older than
202 * 3.0 with a Linux kernel >= 3.15.
204 if (dev_port == dev_port_prev)
206 dev_port_prev = dev_port;
207 if (dev_port == (priv->port - 1u))
208 snprintf(match, sizeof(match), "%s", name);
/* No matching netdev found: fail (errno handling elided here). */
211 if (match[0] == '\0')
213 strncpy(*ifname, match, sizeof(*ifname));
218 * Check if the counter is located on ib counters file.
224 * 1 if counter is located on ib counters file , 0 otherwise.
/* Currently only "out_of_buffer" lives under ports/1/hw_counters/. */
227 priv_is_ib_cntr(const char *cntr)
229 if (!strcmp(cntr, "out_of_buffer"))
235 * Read from sysfs entry.
238 * Pointer to private structure.
240 * Entry name relative to sysfs path.
242 * Data output buffer.
247 * 0 on success, -1 on failure and errno is set.
/* Reads up to `size` bytes from a sysfs attribute. IB hardware counters
 * are read from <ibdev_path>/ports/1/hw_counters/, everything else from
 * the netdev's sysfs directory resolved via priv_get_ifname(). */
250 priv_sysfs_read(const struct priv *priv, const char *entry,
251 char *buf, size_t size)
253 char ifname[IF_NAMESIZE];
258 if (priv_get_ifname(priv, &ifname))
261 if (priv_is_ib_cntr(entry)) {
262 MKSTR(path, "%s/ports/1/hw_counters/%s",
263 priv->ibdev_path, entry);
264 file = fopen(path, "rb");
266 MKSTR(path, "%s/device/net/%s/%s",
267 priv->ibdev_path, ifname, entry);
268 file = fopen(path, "rb");
272 ret = fread(buf, 1, size, file);
/* A short read is only an error if the stream reports one. */
274 if (((size_t)ret < size) && (ferror(file)))
284 * Write to sysfs entry.
287 * Pointer to private structure.
289 * Entry name relative to sysfs path.
296 * 0 on success, -1 on failure and errno is set.
/* Mirror of priv_sysfs_read() for writable netdev attributes
 * (e.g. "mtu", "flags"). */
299 priv_sysfs_write(const struct priv *priv, const char *entry,
300 char *buf, size_t size)
302 char ifname[IF_NAMESIZE];
307 if (priv_get_ifname(priv, &ifname))
310 MKSTR(path, "%s/device/net/%s/%s", priv->ibdev_path, ifname, entry);
312 file = fopen(path, "wb");
315 ret = fwrite(buf, 1, size, file);
/* Unlike the read path, any short write counts as failure. */
317 if (((size_t)ret < size) || (ferror(file)))
327 * Get unsigned long sysfs property.
330 * Pointer to private structure.
332 * Entry name relative to sysfs path.
334 * Value output buffer.
337 * 0 on success, -1 on failure and errno is set.
/* Reads a sysfs attribute and parses it as an unsigned long
 * (base auto-detected by strtoul). */
340 priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value)
343 unsigned long value_ret;
/* Reserve one byte for the NUL terminator added below. */
346 ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1));
348 DEBUG("cannot read %s value from sysfs: %s",
349 name, strerror(errno));
352 value_str[ret] = '\0';
354 value_ret = strtoul(value_str, NULL, 0);
356 DEBUG("invalid %s value `%s': %s", name, value_str,
365 * Set unsigned long sysfs property.
368 * Pointer to private structure.
370 * Entry name relative to sysfs path.
375 * 0 on success, -1 on failure and errno is set.
/* Formats the value as decimal text and writes it to the sysfs entry. */
378 priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value)
381 MKSTR(value_str, "%lu", value);
/* MKSTR sizes the buffer to fit, so length is sizeof - 1 (no NUL). */
383 ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1));
385 DEBUG("cannot write %s `%s' (%lu) to sysfs: %s",
386 name, value_str, value, strerror(errno));
393 * Perform ifreq ioctl() on associated Ethernet device.
396 * Pointer to private structure.
398 * Request number to pass to ioctl().
400 * Interface request structure output buffer.
403 * 0 on success, -1 on failure and errno is set.
/* Opens a throwaway datagram socket, fills ifr_name from the resolved
 * netdev name and issues the requested ioctl (socket close elided in
 * this excerpt). */
406 priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr)
408 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
413 if (priv_get_ifname(priv, &ifr->ifr_name) == 0)
414 ret = ioctl(sock, req, ifr);
420 * Return the number of active VFs for the current device.
423 * Pointer to private structure.
424 * @param[out] num_vfs
425 * Number of active VFs.
428 * 0 on success, -1 on failure and errno is set.
431 priv_get_num_vfs(struct priv *priv, uint16_t *num_vfs)
433 /* The sysfs entry name depends on the operating system. */
434 const char **name = (const char *[]){
435 "device/sriov_numvfs",
436 "device/mlx5_num_vfs",
442 unsigned long ulong_num_vfs;
/* Try each candidate entry until one reads successfully. */
444 ret = priv_get_sysfs_ulong(priv, *name, &ulong_num_vfs);
446 *num_vfs = ulong_num_vfs;
447 } while (*(++name) && ret);
455 * Pointer to private structure.
457 * MTU value output buffer.
460 * 0 on success, -1 on failure and errno is set.
/* Reads the kernel netdev MTU from the sysfs "mtu" attribute. */
463 priv_get_mtu(struct priv *priv, uint16_t *mtu)
465 unsigned long ulong_mtu;
467 if (priv_get_sysfs_ulong(priv, "mtu", &ulong_mtu) == -1)
474 * Read device counter from sysfs.
477 * Pointer to private structure.
481 * Counter output buffer.
484 * 0 on success, -1 on failure and errno is set.
/* Thin wrapper widening the sysfs unsigned long into a uint64_t counter. */
487 priv_get_cntr_sysfs(struct priv *priv, const char *name, uint64_t *cntr)
489 unsigned long ulong_ctr;
491 if (priv_get_sysfs_ulong(priv, name, &ulong_ctr) == -1)
501 * Pointer to private structure.
506 * 0 on success, -1 on failure and errno is set.
/* Writes the MTU to sysfs, then reads it back to confirm the kernel
 * actually applied the requested value. */
509 priv_set_mtu(struct priv *priv, uint16_t mtu)
513 if (priv_set_sysfs_ulong(priv, "mtu", mtu) ||
514 priv_get_mtu(priv, &new_mtu))
526 * Pointer to private structure.
528 * Bitmask for flags that must remain untouched.
530 * Bitmask for flags to modify.
533 * 0 on success, -1 on failure and errno is set.
/* Read-modify-write of the netdev "flags" sysfs attribute: bits in
 * `keep` are preserved, remaining bits are taken from `flags`. */
536 priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags)
540 if (priv_get_sysfs_ulong(priv, "flags", &tmp) == -1)
543 tmp |= (flags & (~keep));
544 return priv_set_sysfs_ulong(priv, "flags", tmp);
548 * Ethernet device configuration.
550 * Prepare the driver for a given number of TX and RX queues.
553 * Pointer to Ethernet device structure.
556 * 0 on success, errno value on failure.
/* Validates/installs the RSS key, records queue counts and resizes the
 * RETA indirection table. NOTE(review): error-return statements are
 * elided in this excerpt. */
559 dev_configure(struct rte_eth_dev *dev)
561 struct priv *priv = dev->data->dev_private;
562 unsigned int rxqs_n = dev->data->nb_rx_queues;
563 unsigned int txqs_n = dev->data->nb_tx_queues;
566 unsigned int reta_idx_n;
567 const uint8_t use_app_rss_key =
568 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
/* An application-supplied key must match the only supported length. */
570 if (use_app_rss_key &&
571 (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
572 rss_hash_default_key_len)) {
573 /* MLX5 RSS only support 40bytes key. */
576 priv->rss_conf.rss_key =
577 rte_realloc(priv->rss_conf.rss_key,
578 rss_hash_default_key_len, 0);
579 if (!priv->rss_conf.rss_key) {
580 ERROR("cannot allocate RSS hash key memory (%u)", rxqs_n);
/* Copy either the application key or the built-in default. */
583 memcpy(priv->rss_conf.rss_key,
585 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
586 rss_hash_default_key,
587 rss_hash_default_key_len);
588 priv->rss_conf.rss_key_len = rss_hash_default_key_len;
589 priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
590 priv->rxqs = (void *)dev->data->rx_queues;
591 priv->txqs = (void *)dev->data->tx_queues;
592 if (txqs_n != priv->txqs_n) {
593 INFO("%p: TX queues number update: %u -> %u",
594 (void *)dev, priv->txqs_n, txqs_n);
595 priv->txqs_n = txqs_n;
597 if (rxqs_n > priv->ind_table_max_size) {
598 ERROR("cannot handle this many RX queues (%u)", rxqs_n);
/* Unchanged RX queue count: nothing more to do. */
601 if (rxqs_n == priv->rxqs_n)
603 INFO("%p: RX queues number update: %u -> %u",
604 (void *)dev, priv->rxqs_n, rxqs_n);
605 priv->rxqs_n = rxqs_n;
606 /* If the requested number of RX queues is not a power of two, use the
607 * maximum indirection table size for better balancing.
608 * The result is always rounded to the next power of two. */
609 reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
610 priv->ind_table_max_size :
612 if (priv_rss_reta_index_resize(priv, reta_idx_n))
614 /* When the number of RX queues is not a power of two, the remaining
615 * table entries are padded with reused WQs and hashes are not spread
617 for (i = 0, j = 0; (i != reta_idx_n); ++i) {
618 (*priv->reta_idx)[i] = j;
626 * DPDK callback for Ethernet device configuration.
629 * Pointer to Ethernet device structure.
632 * 0 on success, negative errno value on failure.
/* Public entry point: delegates to dev_configure() (locking elided
 * in this excerpt). */
635 mlx5_dev_configure(struct rte_eth_dev *dev)
637 struct priv *priv = dev->data->dev_private;
641 ret = dev_configure(dev);
648 * DPDK callback to get information about the device.
651 * Pointer to Ethernet device structure.
653 * Info structure output buffer.
/* Fills rte_eth_dev_info from device attributes and the feature flags
 * cached in the private structure. */
656 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
658 struct priv *priv = mlx5_get_priv(dev);
660 char ifname[IF_NAMESIZE];
662 info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
665 /* FIXME: we should ask the device for these values. */
666 info->min_rx_bufsize = 32;
667 info->max_rx_pktlen = 65536;
669 * Since we need one CQ per QP, the limit is the minimum number
670 * between the two values.
672 max = RTE_MIN(priv->device_attr.orig_attr.max_cq,
673 priv->device_attr.orig_attr.max_qp);
674 /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
677 info->max_rx_queues = max;
678 info->max_tx_queues = max;
679 info->max_mac_addrs = RTE_DIM(priv->mac);
/* RX checksum offloads are advertised only when HW supports them. */
680 info->rx_offload_capa =
682 (DEV_RX_OFFLOAD_IPV4_CKSUM |
683 DEV_RX_OFFLOAD_UDP_CKSUM |
684 DEV_RX_OFFLOAD_TCP_CKSUM) :
686 (priv->hw_vlan_strip ? DEV_RX_OFFLOAD_VLAN_STRIP : 0) |
687 DEV_RX_OFFLOAD_TIMESTAMP;
690 info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
692 info->tx_offload_capa |=
693 (DEV_TX_OFFLOAD_IPV4_CKSUM |
694 DEV_TX_OFFLOAD_UDP_CKSUM |
695 DEV_TX_OFFLOAD_TCP_CKSUM);
697 info->tx_offload_capa |= DEV_TX_OFFLOAD_TCP_TSO;
699 info->tx_offload_capa |= (DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM |
700 DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
701 DEV_TX_OFFLOAD_GRE_TNL_TSO);
702 if (priv_get_ifname(priv, &ifname) == 0)
703 info->if_index = if_nametoindex(ifname);
/* Before configuration, reta_idx_n is 0: report the maximum size. */
704 info->reta_size = priv->reta_idx_n ?
705 priv->reta_idx_n : priv->ind_table_max_size;
706 info->hash_key_size = priv->rss_conf.rss_key_len;
707 info->speed_capa = priv->link_speed_capa;
708 info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
/* Returns the static list of packet types the RX burst functions can
 * report, or (implicitly) NULL for other burst functions. */
713 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
715 static const uint32_t ptypes[] = {
716 /* refers to rxq_cq_to_pkt_type() */
718 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
719 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
720 RTE_PTYPE_L4_NONFRAG,
724 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
725 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
726 RTE_PTYPE_INNER_L4_NONFRAG,
727 RTE_PTYPE_INNER_L4_FRAG,
728 RTE_PTYPE_INNER_L4_TCP,
729 RTE_PTYPE_INNER_L4_UDP,
/* Only the scalar and vectorized RX bursts classify packet types. */
733 if (dev->rx_pkt_burst == mlx5_rx_burst ||
734 dev->rx_pkt_burst == mlx5_rx_burst_vec)
740 * DPDK callback to retrieve physical link information.
743 * Pointer to Ethernet device structure.
744 * @param wait_to_complete
745 * Wait for request completion (ignored).
/* Legacy path: queries link state via the deprecated ETHTOOL_GSET
 * ioctl (kernels older than 4.9 — see priv_link_update()). */
748 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, int wait_to_complete)
750 struct priv *priv = mlx5_get_priv(dev);
751 struct ethtool_cmd edata = {
752 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
755 struct rte_eth_link dev_link;
758 /* priv_lock() is not taken to allow concurrent calls. */
760 (void)wait_to_complete;
761 if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) {
762 WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno));
765 memset(&dev_link, 0, sizeof(dev_link));
/* Link is "up" only when administratively up AND carrier present. */
766 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
767 (ifr.ifr_flags & IFF_RUNNING));
768 ifr.ifr_data = (void *)&edata;
769 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
770 WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
/* SPEED_UNKNOWN is reported as -1 by ethtool_cmd_speed(). */
774 link_speed = ethtool_cmd_speed(&edata);
775 if (link_speed == -1)
776 dev_link.link_speed = 0;
778 dev_link.link_speed = link_speed;
/* Translate the supported-modes mask into ETH_LINK_SPEED_* flags. */
779 priv->link_speed_capa = 0;
780 if (edata.supported & SUPPORTED_Autoneg)
781 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
782 if (edata.supported & (SUPPORTED_1000baseT_Full |
783 SUPPORTED_1000baseKX_Full))
784 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
785 if (edata.supported & SUPPORTED_10000baseKR_Full)
786 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
787 if (edata.supported & (SUPPORTED_40000baseKR4_Full |
788 SUPPORTED_40000baseCR4_Full |
789 SUPPORTED_40000baseSR4_Full |
790 SUPPORTED_40000baseLR4_Full))
791 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
792 dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
793 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
794 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
795 ETH_LINK_SPEED_FIXED);
796 if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) {
797 /* Link status changed. */
798 dev->data->dev_link = dev_link;
801 /* Link status is still the same. */
806 * Retrieve physical link information (unlocked version using new ioctl).
809 * Pointer to Ethernet device structure.
810 * @param wait_to_complete
811 * Wait for request completion (ignored).
/* Modern path: uses ETHTOOL_GLINKSETTINGS (Linux >= 4.5, reliable from
 * 4.9) which supports >32-bit link-mode masks and speeds above 40G. */
814 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete)
816 struct priv *priv = mlx5_get_priv(dev);
817 struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
819 struct rte_eth_link dev_link;
822 (void)wait_to_complete;
823 if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) {
824 WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno));
827 memset(&dev_link, 0, sizeof(dev_link));
828 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
829 (ifr.ifr_flags & IFF_RUNNING));
/* First handshake call: kernel replies with a negative nwords value
 * telling userspace how many mask words to allocate. */
830 ifr.ifr_data = (void *)&gcmd;
831 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
832 DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s",
836 gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
/* Variable-size buffer: header plus 3 masks (supported/advertising/
 * lp_advertising) of nwords 32-bit words each. */
838 alignas(struct ethtool_link_settings)
839 uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
840 sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
841 struct ethtool_link_settings *ecmd = (void *)data;
844 ifr.ifr_data = (void *)ecmd;
845 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
846 DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s",
850 dev_link.link_speed = ecmd->speed;
/* Collapse the first two mask words (bits 0-63) into one value. */
851 sc = ecmd->link_mode_masks[0] |
852 ((uint64_t)ecmd->link_mode_masks[1] << 32);
853 priv->link_speed_capa = 0;
854 if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
855 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
856 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
857 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
858 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
859 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
860 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
861 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
862 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
863 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
864 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
865 priv->link_speed_capa |= ETH_LINK_SPEED_20G;
866 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
867 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
868 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
869 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
870 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
871 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
872 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
873 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
874 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
875 priv->link_speed_capa |= ETH_LINK_SPEED_56G;
876 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
877 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
878 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
879 priv->link_speed_capa |= ETH_LINK_SPEED_25G;
880 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
881 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
882 priv->link_speed_capa |= ETH_LINK_SPEED_50G;
883 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
884 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
885 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
886 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
887 priv->link_speed_capa |= ETH_LINK_SPEED_100G;
888 dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
889 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
890 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
891 ETH_LINK_SPEED_FIXED);
892 if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) {
893 /* Link status changed. */
894 dev->data->dev_link = dev_link;
897 /* Link status is still the same. */
902 * Enable receiving and transmitting traffic.
905 * Pointer to private structure.
/* Restores the real burst callbacks and re-enables control/data flows
 * after the link comes (back) up. */
908 priv_link_start(struct priv *priv)
910 struct rte_eth_dev *dev = priv->dev;
913 priv_dev_select_tx_function(priv, dev);
914 priv_dev_select_rx_function(priv, dev);
/* Errors are logged but not propagated: best-effort recovery. */
915 err = priv_dev_traffic_enable(priv, dev);
917 ERROR("%p: error occurred while configuring control flows: %s",
918 (void *)priv, strerror(err));
919 err = priv_flow_start(priv, &priv->flows);
921 ERROR("%p: error occurred while configuring flows: %s",
922 (void *)priv, strerror(err));
926 * Disable receiving and transmitting traffic.
929 * Pointer to private structure.
/* Counterpart of priv_link_start(): tears down flows and swaps in the
 * "removed" burst stubs so data-path calls become harmless no-ops. */
932 priv_link_stop(struct priv *priv)
934 struct rte_eth_dev *dev = priv->dev;
936 priv_flow_stop(priv, &priv->flows);
937 priv_dev_traffic_disable(priv, dev);
938 dev->rx_pkt_burst = removed_rx_burst;
939 dev->tx_pkt_burst = removed_tx_burst;
943 * Retrieve physical link information and update rx/tx_pkt_burst callbacks
947 * Pointer to private structure.
948 * @param wait_to_complete
949 * Wait for request completion (ignored).
/* Picks the ethtool query flavor by running-kernel version (GLINKSETTINGS
 * is only trusted from 4.9 onward), then starts/stops traffic when the
 * link state actually changed. */
952 priv_link_update(struct priv *priv, int wait_to_complete)
954 struct rte_eth_dev *dev = priv->dev;
955 struct utsname utsname;
958 struct rte_eth_link dev_link = dev->data->dev_link;
/* Fall back to the legacy GSET ioctl when the version cannot be
 * determined or is older than 4.9. */
960 if (uname(&utsname) == -1 ||
961 sscanf(utsname.release, "%d.%d.%d",
962 &ver[0], &ver[1], &ver[2]) != 3 ||
963 KERNEL_VERSION(ver[0], ver[1], ver[2]) < KERNEL_VERSION(4, 9, 0))
964 ret = mlx5_link_update_unlocked_gset(dev, wait_to_complete);
966 ret = mlx5_link_update_unlocked_gs(dev, wait_to_complete);
967 /* If lsc interrupt is disabled, should always be ready for traffic. */
968 if (!dev->data->dev_conf.intr_conf.lsc) {
969 priv_link_start(priv);
972 /* Re-select burst callbacks only if link status has been changed. */
973 if (!ret && dev_link.link_status != dev->data->dev_link.link_status) {
974 if (dev->data->dev_link.link_status == ETH_LINK_UP)
975 priv_link_start(priv);
977 priv_link_stop(priv);
983 * Querying the link status till it changes to the desired state.
984 * Number of query attempts is bounded by MLX5_MAX_LINK_QUERY_ATTEMPTS.
987 * Pointer to private structure.
989 * Link desired status.
992 * 0 on success, negative errno value on failure.
/* Polls priv_link_update() until the link reaches `status` or the
 * attempt budget is exhausted (sleep between tries elided here). */
995 priv_force_link_status_change(struct priv *priv, int status)
999 while (try < MLX5_MAX_LINK_QUERY_ATTEMPTS) {
1000 priv_link_update(priv, 0);
1001 if (priv->dev->data->dev_link.link_status == status)
1010 * DPDK callback to retrieve physical link information.
1013 * Pointer to Ethernet device structure.
1014 * @param wait_to_complete
1015 * Wait for request completion (ignored).
/* Public dev_ops wrapper around priv_link_update() (locking elided
 * in this excerpt). */
1018 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
1020 struct priv *priv = dev->data->dev_private;
1024 ret = priv_link_update(priv, wait_to_complete);
1030 * DPDK callback to change the MTU.
1033 * Pointer to Ethernet device structure.
1038 * 0 on success, negative errno value on failure.
/* Sets the kernel interface MTU via sysfs and verifies the read-back
 * value matches before accepting the change. */
1041 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
1043 struct priv *priv = dev->data->dev_private;
1048 ret = priv_get_mtu(priv, &kern_mtu);
1051 /* Set kernel interface MTU first. */
1052 ret = priv_set_mtu(priv, mtu);
1055 ret = priv_get_mtu(priv, &kern_mtu);
1058 if (kern_mtu == mtu) {
1060 DEBUG("adapter port %u MTU set to %u", priv->port, mtu);
1066 WARN("cannot set port %u MTU to %u: %s", priv->port, mtu,
1074 * DPDK callback to get flow control status.
1077 * Pointer to Ethernet device structure.
1078 * @param[out] fc_conf
1079 * Flow control output buffer.
1082 * 0 on success, negative errno value on failure.
/* Queries pause-frame settings via ETHTOOL_GPAUSEPARAM and maps them to
 * the RTE_FC_* mode enumeration. */
1085 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
1087 struct priv *priv = dev->data->dev_private;
1089 struct ethtool_pauseparam ethpause = {
1090 .cmd = ETHTOOL_GPAUSEPARAM
1094 ifr.ifr_data = (void *)&ethpause;
1096 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
1098 WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)"
1104 fc_conf->autoneg = ethpause.autoneg;
/* Map the rx/tx pause flag pair onto the four RTE_FC_* modes. */
1105 if (ethpause.rx_pause && ethpause.tx_pause)
1106 fc_conf->mode = RTE_FC_FULL;
1107 else if (ethpause.rx_pause)
1108 fc_conf->mode = RTE_FC_RX_PAUSE;
1109 else if (ethpause.tx_pause)
1110 fc_conf->mode = RTE_FC_TX_PAUSE;
1112 fc_conf->mode = RTE_FC_NONE;
1122 * DPDK callback to modify flow control parameters.
1125 * Pointer to Ethernet device structure.
1126 * @param[in] fc_conf
1127 * Flow control parameters.
1130 * 0 on success, negative errno value on failure.
/* Inverse of mlx5_dev_get_flow_ctrl(): translates RTE_FC_* mode into
 * rx/tx pause flags and applies them via ETHTOOL_SPAUSEPARAM. */
1133 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
1135 struct priv *priv = dev->data->dev_private;
1137 struct ethtool_pauseparam ethpause = {
1138 .cmd = ETHTOOL_SPAUSEPARAM
1142 ifr.ifr_data = (void *)&ethpause;
1143 ethpause.autoneg = fc_conf->autoneg;
1144 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
1145 (fc_conf->mode & RTE_FC_RX_PAUSE))
1146 ethpause.rx_pause = 1;
1148 ethpause.rx_pause = 0;
1150 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
1151 (fc_conf->mode & RTE_FC_TX_PAUSE))
1152 ethpause.tx_pause = 1;
1154 ethpause.tx_pause = 0;
1157 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
1159 WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
1173 * Get PCI information from struct ibv_device.
1176 * Pointer to Ethernet device structure.
1177 * @param[out] pci_addr
1178 * PCI bus address output buffer.
1181 * 0 on success, -1 on failure and errno is set.
/* Parses the PCI_SLOT_NAME line of the device's sysfs uevent file to
 * recover the domain:bus:device.function address. */
1184 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
1185 struct rte_pci_addr *pci_addr)
1189 MKSTR(path, "%s/device/uevent", device->ibdev_path);
1191 file = fopen(path, "rb");
1194 while (fgets(line, sizeof(line), file) == line) {
1195 size_t len = strlen(line);
1198 /* Truncate long lines. */
1199 if (len == (sizeof(line) - 1))
1200 while (line[(len - 1)] != '\n') {
1204 line[(len - 1)] = ret;
1206 /* Extract information. */
1209 "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
1213 &pci_addr->function) == 4) {
1223 * Update the link status.
1226 * Pointer to private structure.
1229 * Zero if the callback process can be called immediately.
/* Refreshes the link and defers the LSC callback via an alarm when the
 * kernel netdev has not yet caught up with the hardware event (speed
 * and status disagree). */
1232 priv_link_status_update(struct priv *priv)
1234 struct rte_eth_link *link = &priv->dev->data->dev_link;
1236 priv_link_update(priv, 0);
1237 if (((link->link_speed == 0) && link->link_status) ||
1238 ((link->link_speed != 0) && !link->link_status)) {
1240 * Inconsistent status. Event likely occurred before the
1241 * kernel netdevice exposes the new status.
1243 if (!priv->pending_alarm) {
1244 priv->pending_alarm = 1;
1245 rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
1246 mlx5_dev_link_status_handler,
1250 } else if (unlikely(priv->pending_alarm)) {
1251 /* Link interrupt occurred while alarm is already scheduled. */
1252 priv->pending_alarm = 0;
1253 rte_eal_alarm_cancel(mlx5_dev_link_status_handler, priv->dev);
1259 * Device status handler.
1262 * Pointer to private structure.
1264 * Pointer to event flags holder.
1267 * Events bitmap of callback process which can be called immediately.
/* Drains the verbs async event queue, acknowledging each event and
 * collecting LSC/RMV notifications into a bitmap for the caller. */
1270 priv_dev_status_handler(struct priv *priv)
1272 struct ibv_async_event event;
1275 /* Read all message and acknowledge them. */
1277 if (ibv_get_async_event(priv->ctx, &event))
1279 if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
1280 event.event_type == IBV_EVENT_PORT_ERR) &&
1281 (priv->dev->data->dev_conf.intr_conf.lsc == 1))
1282 ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
1283 else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
1284 priv->dev->data->dev_conf.intr_conf.rmv == 1)
1285 ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
1287 DEBUG("event type %d on port %d not handled",
1288 event.event_type, event.element.port_num);
/* Every retrieved event must be acknowledged back to verbs. */
1289 ibv_ack_async_event(&event);
/* Suppress the LSC bit when the status update was deferred to an alarm. */
1291 if (ret & (1 << RTE_ETH_EVENT_INTR_LSC))
1292 if (priv_link_status_update(priv))
1293 ret &= ~(1 << RTE_ETH_EVENT_INTR_LSC);
1298 * Handle delayed link status event.
1301 * Registered argument.
/* Alarm callback scheduled by priv_link_status_update(); re-checks the
 * link and, if consistent, fires the application's LSC callback. */
1304 mlx5_dev_link_status_handler(void *arg)
1306 struct rte_eth_dev *dev = arg;
1307 struct priv *priv = dev->data->dev_private;
/* Spin on the trylock; bail out if the alarm is being canceled
 * concurrently (pending_alarm cleared by the canceler). */
1310 while (!priv_trylock(priv)) {
1311 /* Alarm is being canceled. */
1312 if (priv->pending_alarm == 0)
1316 priv->pending_alarm = 0;
1317 ret = priv_link_status_update(priv);
1320 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
1325 * Handle interrupts from the NIC.
1327 * @param[in] intr_handle
1328 * Interrupt handler.
1330 * Callback argument.
/* Top-level interrupt callback: gathers pending device events and
 * dispatches LSC/RMV notifications to registered applications. */
1333 mlx5_dev_interrupt_handler(void *cb_arg)
1335 struct rte_eth_dev *dev = cb_arg;
1336 struct priv *priv = dev->data->dev_private;
1340 events = priv_dev_status_handler(priv);
1342 if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
1343 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
1345 if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
1346 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL,
1351 * Handle interrupts from the socket.
1354 * Callback argument.
/* Primary-process socket callback servicing requests from secondary
 * processes (locking elided in this excerpt). */
1357 mlx5_dev_handler_socket(void *cb_arg)
1359 struct rte_eth_dev *dev = cb_arg;
1360 struct priv *priv = dev->data->dev_private;
1363 priv_socket_handle(priv);
1368 * Uninstall interrupt handler.
1371 * Pointer to private structure.
1373 * Pointer to the rte_eth_dev structure.
/* Unregisters NIC and socket interrupt callbacks, cancels any pending
 * deferred-link alarm, and resets both interrupt handles. */
1376 priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev)
1378 if (dev->data->dev_conf.intr_conf.lsc ||
1379 dev->data->dev_conf.intr_conf.rmv)
1380 rte_intr_callback_unregister(&priv->intr_handle,
1381 mlx5_dev_interrupt_handler, dev);
1382 if (priv->primary_socket)
1383 rte_intr_callback_unregister(&priv->intr_handle_socket,
1384 mlx5_dev_handler_socket, dev);
/* Clear the flag first so a concurrently-running alarm callback sees
 * the cancellation (see mlx5_dev_link_status_handler). */
1385 if (priv->pending_alarm) {
1386 priv->pending_alarm = 0;
1387 rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev);
1389 priv->intr_handle.fd = 0;
1390 priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
1391 priv->intr_handle_socket.fd = 0;
1392 priv->intr_handle_socket.type = RTE_INTR_HANDLE_UNKNOWN;
1396 * Install interrupt handler.
1399 * Pointer to private structure.
1401 * Pointer to the rte_eth_dev structure.
/* Makes the verbs async fd non-blocking, then registers the NIC
 * interrupt callback (when LSC/RMV requested) and the IPC socket
 * callback (primary process only). */
1404 priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
1408 assert(priv->ctx->async_fd > 0);
1409 flags = fcntl(priv->ctx->async_fd, F_GETFL);
1410 rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
/* Without O_NONBLOCK the handler could block; disable both interrupt
 * types rather than risk stalling the interrupt thread. */
1412 INFO("failed to change file descriptor async event queue");
1413 dev->data->dev_conf.intr_conf.lsc = 0;
1414 dev->data->dev_conf.intr_conf.rmv = 0;
1416 if (dev->data->dev_conf.intr_conf.lsc ||
1417 dev->data->dev_conf.intr_conf.rmv) {
1418 priv->intr_handle.fd = priv->ctx->async_fd;
1419 priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
1420 rte_intr_callback_register(&priv->intr_handle,
1421 mlx5_dev_interrupt_handler, dev);
1424 rc = priv_socket_init(priv);
1425 if (!rc && priv->primary_socket) {
1426 priv->intr_handle_socket.fd = priv->primary_socket;
1427 priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT;
1428 rte_intr_callback_register(&priv->intr_handle_socket,
1429 mlx5_dev_handler_socket, dev);
1434 * Change the link state (UP / DOWN).
1437 * Pointer to private data structure.
1439 * Nonzero for link up, otherwise link down.
1442 * 0 on success, errno value on failure.
/* Toggles IFF_UP through the sysfs "flags" attribute while leaving all
 * other interface flags untouched. */
1445 priv_dev_set_link(struct priv *priv, int up)
1447 return priv_set_flags(priv, ~IFF_UP, up ? IFF_UP : ~IFF_UP);
1451 * DPDK callback to bring the link DOWN.
1454 * Pointer to Ethernet device structure.
1457 * 0 on success, errno value on failure.
/* Clears IFF_UP on the backing netdev (locking elided in this excerpt). */
1460 mlx5_set_link_down(struct rte_eth_dev *dev)
1462 struct priv *priv = dev->data->dev_private;
1466 err = priv_dev_set_link(priv, 0);
1472 * DPDK callback to bring the link UP.
1475 * Pointer to Ethernet device structure.
1478 * 0 on success, errno value on failure.
/* Sets IFF_UP on the backing netdev (locking elided in this excerpt). */
1481 mlx5_set_link_up(struct rte_eth_dev *dev)
1483 struct priv *priv = dev->data->dev_private;
1487 err = priv_dev_set_link(priv, 1);
1493 * Configure the TX function to use.
1496 * Pointer to private data structure.
1498 * Pointer to rte_eth_dev structure.
/* Picks the TX burst implementation from device capabilities, in
 * decreasing order of preference: enhanced MPW (vectorized raw /
 * vectorized / scalar), MPW inline, MPW, then the plain default. */
1501 priv_dev_select_tx_function(struct priv *priv, struct rte_eth_dev *dev)
1503 assert(priv != NULL);
1504 assert(dev != NULL);
/* Scalar burst is the fallback unless a faster variant qualifies. */
1505 dev->tx_pkt_burst = mlx5_tx_burst;
1506 /* Select appropriate TX function. */
1507 if (priv->mps == MLX5_MPW_ENHANCED) {
1508 if (priv_check_vec_tx_support(priv) > 0) {
1509 if (priv_check_raw_vec_tx_support(priv) > 0)
1510 dev->tx_pkt_burst = mlx5_tx_burst_raw_vec;
1512 dev->tx_pkt_burst = mlx5_tx_burst_vec;
1513 DEBUG("selected Enhanced MPW TX vectorized function");
1515 dev->tx_pkt_burst = mlx5_tx_burst_empw;
1516 DEBUG("selected Enhanced MPW TX function");
1518 } else if (priv->mps && priv->txq_inline) {
1519 dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline;
1520 DEBUG("selected MPW inline TX function");
1521 } else if (priv->mps) {
1522 dev->tx_pkt_burst = mlx5_tx_burst_mpw;
1523 DEBUG("selected MPW TX function");
1528 * Configure the RX function to use.
1531 * Pointer to private data structure.
1533 * Pointer to rte_eth_dev structure.
/* Selects the vectorized RX burst when the device/queues support it,
 * otherwise falls back to the scalar implementation. */
1536 priv_dev_select_rx_function(struct priv *priv, struct rte_eth_dev *dev)
1538 assert(priv != NULL);
1539 assert(dev != NULL);
1540 if (priv_check_vec_rx_support(priv) > 0) {
1541 dev->rx_pkt_burst = mlx5_rx_burst_vec;
1542 DEBUG("selected RX vectorized function");
1544 dev->rx_pkt_burst = mlx5_rx_burst;