New upstream version 17.11.5
[deb_dpdk.git] / drivers / net / mlx5 / mlx5_ethdev.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright 2015 6WIND S.A.
5  *   Copyright 2015 Mellanox.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of 6WIND S.A. nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #define _GNU_SOURCE
35
36 #include <stddef.h>
37 #include <assert.h>
38 #include <inttypes.h>
39 #include <unistd.h>
40 #include <stdint.h>
41 #include <stdio.h>
42 #include <string.h>
43 #include <stdlib.h>
44 #include <errno.h>
45 #include <dirent.h>
46 #include <net/if.h>
47 #include <sys/ioctl.h>
48 #include <sys/socket.h>
49 #include <netinet/in.h>
50 #include <linux/ethtool.h>
51 #include <linux/sockios.h>
52 #include <fcntl.h>
53 #include <stdalign.h>
54 #include <sys/un.h>
55 #include <time.h>
56
57 #include <rte_atomic.h>
58 #include <rte_ethdev.h>
59 #include <rte_bus_pci.h>
60 #include <rte_mbuf.h>
61 #include <rte_common.h>
62 #include <rte_interrupts.h>
63 #include <rte_malloc.h>
64
65 #include "mlx5.h"
66 #include "mlx5_rxtx.h"
67 #include "mlx5_utils.h"
68
/*
 * Fallback values for the legacy ethtool "supported" speed bits, as found in
 * /usr/include/linux/ethtool.h. Each bit is only defined here when its
 * matching HAVE_* macro is absent.
 */
#if !defined(HAVE_SUPPORTED_40000baseKR4_Full)
#define SUPPORTED_40000baseKR4_Full (1 << 23)
#endif
#if !defined(HAVE_SUPPORTED_40000baseCR4_Full)
#define SUPPORTED_40000baseCR4_Full (1 << 24)
#endif
#if !defined(HAVE_SUPPORTED_40000baseSR4_Full)
#define SUPPORTED_40000baseSR4_Full (1 << 25)
#endif
#if !defined(HAVE_SUPPORTED_40000baseLR4_Full)
#define SUPPORTED_40000baseLR4_Full (1 << 26)
#endif
#if !defined(HAVE_SUPPORTED_56000baseKR4_Full)
#define SUPPORTED_56000baseKR4_Full (1 << 27)
#endif
#if !defined(HAVE_SUPPORTED_56000baseCR4_Full)
#define SUPPORTED_56000baseCR4_Full (1 << 28)
#endif
#if !defined(HAVE_SUPPORTED_56000baseSR4_Full)
#define SUPPORTED_56000baseSR4_Full (1 << 29)
#endif
#if !defined(HAVE_SUPPORTED_56000baseLR4_Full)
#define SUPPORTED_56000baseLR4_Full (1 << 30)
#endif
94
/*
 * Compatibility definitions for the case where the user-space headers are
 * older than the running kernel: the ETHTOOL_GLINKSETTINGS request, its
 * argument structure and the link mode bit numbers tested in this file.
 */
#if !defined(ETHTOOL_GLINKSETTINGS)
struct ethtool_link_settings {
	uint32_t cmd;
	uint32_t speed;
	uint8_t duplex;
	uint8_t port;
	uint8_t phy_address;
	uint8_t autoneg;
	uint8_t mdio_support;
	uint8_t eth_to_mdix;
	uint8_t eth_tp_mdix_ctrl;
	int8_t link_mode_masks_nwords;
	uint32_t reserved[8];
	/* Variable-length tail holding the link mode bitmaps. */
	uint32_t link_mode_masks[];
};

#define ETHTOOL_GLINKSETTINGS 0x0000004c
#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
#endif
/* 25G link modes. */
#if !defined(HAVE_ETHTOOL_LINK_MODE_25G)
#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
#endif
/* 50G link modes. */
#if !defined(HAVE_ETHTOOL_LINK_MODE_50G)
#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
#endif
/* 100G link modes. */
#if !defined(HAVE_ETHTOOL_LINK_MODE_100G)
#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
#endif
145
146 /**
147  * Get interface name from private structure.
148  *
149  * @param[in] dev
150  *   Pointer to Ethernet device.
151  * @param[out] ifname
152  *   Interface name output buffer.
153  *
154  * @return
155  *   0 on success, a negative errno value otherwise and rte_errno is set.
156  */
157 int
158 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
159 {
160         struct priv *priv = dev->data->dev_private;
161         DIR *dir;
162         struct dirent *dent;
163         unsigned int dev_type = 0;
164         unsigned int dev_port_prev = ~0u;
165         char match[IF_NAMESIZE] = "";
166
167         {
168                 MKSTR(path, "%s/device/net", priv->ibdev_path);
169
170                 dir = opendir(path);
171                 if (dir == NULL) {
172                         rte_errno = errno;
173                         return -rte_errno;
174                 }
175         }
176         while ((dent = readdir(dir)) != NULL) {
177                 char *name = dent->d_name;
178                 FILE *file;
179                 unsigned int dev_port;
180                 int r;
181
182                 if ((name[0] == '.') &&
183                     ((name[1] == '\0') ||
184                      ((name[1] == '.') && (name[2] == '\0'))))
185                         continue;
186
187                 MKSTR(path, "%s/device/net/%s/%s",
188                       priv->ibdev_path, name,
189                       (dev_type ? "dev_id" : "dev_port"));
190
191                 file = fopen(path, "rb");
192                 if (file == NULL) {
193                         if (errno != ENOENT)
194                                 continue;
195                         /*
196                          * Switch to dev_id when dev_port does not exist as
197                          * is the case with Linux kernel versions < 3.15.
198                          */
199 try_dev_id:
200                         match[0] = '\0';
201                         if (dev_type)
202                                 break;
203                         dev_type = 1;
204                         dev_port_prev = ~0u;
205                         rewinddir(dir);
206                         continue;
207                 }
208                 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
209                 fclose(file);
210                 if (r != 1)
211                         continue;
212                 /*
213                  * Switch to dev_id when dev_port returns the same value for
214                  * all ports. May happen when using a MOFED release older than
215                  * 3.0 with a Linux kernel >= 3.15.
216                  */
217                 if (dev_port == dev_port_prev)
218                         goto try_dev_id;
219                 dev_port_prev = dev_port;
220                 if (dev_port == (priv->port - 1u))
221                         snprintf(match, sizeof(match), "%s", name);
222         }
223         closedir(dir);
224         if (match[0] == '\0') {
225                 rte_errno = ENOENT;
226                 return -rte_errno;
227         }
228         strncpy(*ifname, match, sizeof(*ifname));
229         return 0;
230 }
231
232 /**
233  * Perform ifreq ioctl() on associated Ethernet device.
234  *
235  * @param[in] dev
236  *   Pointer to Ethernet device.
237  * @param req
238  *   Request number to pass to ioctl().
239  * @param[out] ifr
240  *   Interface request structure output buffer.
241  *
242  * @return
243  *   0 on success, a negative errno value otherwise and rte_errno is set.
244  */
245 int
246 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
247 {
248         int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
249         int ret = 0;
250
251         if (sock == -1) {
252                 rte_errno = errno;
253                 return -rte_errno;
254         }
255         ret = mlx5_get_ifname(dev, &ifr->ifr_name);
256         if (ret)
257                 goto error;
258         ret = ioctl(sock, req, ifr);
259         if (ret == -1) {
260                 rte_errno = errno;
261                 goto error;
262         }
263         close(sock);
264         return 0;
265 error:
266         close(sock);
267         return -rte_errno;
268 }
269
270 /**
271  * Get device MTU.
272  *
273  * @param dev
274  *   Pointer to Ethernet device.
275  * @param[out] mtu
276  *   MTU value output buffer.
277  *
278  * @return
279  *   0 on success, a negative errno value otherwise and rte_errno is set.
280  */
281 int
282 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
283 {
284         struct ifreq request;
285         int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
286
287         if (ret)
288                 return ret;
289         *mtu = request.ifr_mtu;
290         return 0;
291 }
292
293 /**
294  * Set device MTU.
295  *
296  * @param dev
297  *   Pointer to Ethernet device.
298  * @param mtu
299  *   MTU value to set.
300  *
301  * @return
302  *   0 on success, a negative errno value otherwise and rte_errno is set.
303  */
304 static int
305 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
306 {
307         struct ifreq request = { .ifr_mtu = mtu, };
308
309         return mlx5_ifreq(dev, SIOCSIFMTU, &request);
310 }
311
312 /**
313  * Set device flags.
314  *
315  * @param dev
316  *   Pointer to Ethernet device.
317  * @param keep
318  *   Bitmask for flags that must remain untouched.
319  * @param flags
320  *   Bitmask for flags to modify.
321  *
322  * @return
323  *   0 on success, a negative errno value otherwise and rte_errno is set.
324  */
325 int
326 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
327 {
328         struct ifreq request;
329         int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
330
331         if (ret)
332                 return ret;
333         request.ifr_flags &= keep;
334         request.ifr_flags |= flags & ~keep;
335         return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
336 }
337
338 /**
339  * DPDK callback for Ethernet device configuration.
340  *
341  * @param dev
342  *   Pointer to Ethernet device structure.
343  *
344  * @return
345  *   0 on success, a negative errno value otherwise and rte_errno is set.
346  */
347 int
348 mlx5_dev_configure(struct rte_eth_dev *dev)
349 {
350         struct priv *priv = dev->data->dev_private;
351         unsigned int rxqs_n = dev->data->nb_rx_queues;
352         unsigned int txqs_n = dev->data->nb_tx_queues;
353         unsigned int i;
354         unsigned int j;
355         unsigned int reta_idx_n;
356         const uint8_t use_app_rss_key =
357                 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
358         int ret = 0;
359
360         if (use_app_rss_key &&
361             (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
362              rss_hash_default_key_len)) {
363                 DRV_LOG(ERR, "port %u RSS key len must be %zu Bytes long",
364                         dev->data->port_id, rss_hash_default_key_len);
365                 rte_errno = EINVAL;
366                 return -rte_errno;
367         }
368         priv->rss_conf.rss_key =
369                 rte_realloc(priv->rss_conf.rss_key,
370                             rss_hash_default_key_len, 0);
371         if (!priv->rss_conf.rss_key) {
372                 DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)",
373                         dev->data->port_id, rxqs_n);
374                 rte_errno = ENOMEM;
375                 return -rte_errno;
376         }
377         memcpy(priv->rss_conf.rss_key,
378                use_app_rss_key ?
379                dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
380                rss_hash_default_key,
381                rss_hash_default_key_len);
382         priv->rss_conf.rss_key_len = rss_hash_default_key_len;
383         priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
384         priv->rxqs = (void *)dev->data->rx_queues;
385         priv->txqs = (void *)dev->data->tx_queues;
386         if (txqs_n != priv->txqs_n) {
387                 DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u",
388                         dev->data->port_id, priv->txqs_n, txqs_n);
389                 priv->txqs_n = txqs_n;
390         }
391         if (rxqs_n > priv->ind_table_max_size) {
392                 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
393                         dev->data->port_id, rxqs_n);
394                 rte_errno = EINVAL;
395                 return -rte_errno;
396         }
397         if (rxqs_n == priv->rxqs_n)
398                 return 0;
399         DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
400                 dev->data->port_id, priv->rxqs_n, rxqs_n);
401         priv->rxqs_n = rxqs_n;
402         /* If the requested number of RX queues is not a power of two, use the
403          * maximum indirection table size for better balancing.
404          * The result is always rounded to the next power of two. */
405         reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
406                                      priv->ind_table_max_size :
407                                      rxqs_n));
408         ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
409         if (ret)
410                 return ret;
411         if (mlx5_mr_register_memseg(dev)) {
412                 DRV_LOG(ERR, "%p: MR registration failed", (void *)dev);
413                 rte_errno = ENOMEM;
414                 return -rte_errno;
415         }
416         /* When the number of RX queues is not a power of two, the remaining
417          * table entries are padded with reused WQs and hashes are not spread
418          * uniformly. */
419         for (i = 0, j = 0; (i != reta_idx_n); ++i) {
420                 (*priv->reta_idx)[i] = j;
421                 if (++j == rxqs_n)
422                         j = 0;
423         }
424         return 0;
425 }
426
427 /**
428  * DPDK callback to get information about the device.
429  *
430  * @param dev
431  *   Pointer to Ethernet device structure.
432  * @param[out] info
433  *   Info structure output buffer.
434  */
435 void
436 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
437 {
438         struct priv *priv = dev->data->dev_private;
439         unsigned int max;
440         char ifname[IF_NAMESIZE];
441
442         info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
443         /* FIXME: we should ask the device for these values. */
444         info->min_rx_bufsize = 32;
445         info->max_rx_pktlen = 65536;
446         /*
447          * Since we need one CQ per QP, the limit is the minimum number
448          * between the two values.
449          */
450         max = RTE_MIN(priv->device_attr.orig_attr.max_cq,
451                       priv->device_attr.orig_attr.max_qp);
452         /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
453         if (max >= 65535)
454                 max = 65535;
455         info->max_rx_queues = max;
456         info->max_tx_queues = max;
457         info->max_mac_addrs = RTE_DIM(priv->mac);
458         info->rx_offload_capa =
459                 (priv->hw_csum ?
460                  (DEV_RX_OFFLOAD_IPV4_CKSUM |
461                   DEV_RX_OFFLOAD_UDP_CKSUM |
462                   DEV_RX_OFFLOAD_TCP_CKSUM) :
463                  0) |
464                 (priv->hw_vlan_strip ? DEV_RX_OFFLOAD_VLAN_STRIP : 0) |
465                 DEV_RX_OFFLOAD_TIMESTAMP;
466
467         if (!priv->mps)
468                 info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
469         if (priv->hw_csum)
470                 info->tx_offload_capa |=
471                         (DEV_TX_OFFLOAD_IPV4_CKSUM |
472                          DEV_TX_OFFLOAD_UDP_CKSUM |
473                          DEV_TX_OFFLOAD_TCP_CKSUM);
474         if (priv->tso)
475                 info->tx_offload_capa |= DEV_TX_OFFLOAD_TCP_TSO;
476         if (priv->tunnel_en)
477                 info->tx_offload_capa |= (DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM |
478                                           DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
479                                           DEV_TX_OFFLOAD_GRE_TNL_TSO);
480         if (mlx5_get_ifname(dev, &ifname) == 0)
481                 info->if_index = if_nametoindex(ifname);
482         info->reta_size = priv->reta_idx_n ?
483                 priv->reta_idx_n : priv->ind_table_max_size;
484         info->hash_key_size = rss_hash_default_key_len;
485         info->speed_capa = priv->link_speed_capa;
486         info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
487 }
488
489 /**
490  * Get supported packet types.
491  *
492  * @param dev
493  *   Pointer to Ethernet device structure.
494  *
495  * @return
496  *   A pointer to the supported Packet types array.
497  */
498 const uint32_t *
499 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
500 {
501         static const uint32_t ptypes[] = {
502                 /* refers to rxq_cq_to_pkt_type() */
503                 RTE_PTYPE_L2_ETHER,
504                 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
505                 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
506                 RTE_PTYPE_L4_NONFRAG,
507                 RTE_PTYPE_L4_FRAG,
508                 RTE_PTYPE_L4_TCP,
509                 RTE_PTYPE_L4_UDP,
510                 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
511                 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
512                 RTE_PTYPE_INNER_L4_NONFRAG,
513                 RTE_PTYPE_INNER_L4_FRAG,
514                 RTE_PTYPE_INNER_L4_TCP,
515                 RTE_PTYPE_INNER_L4_UDP,
516                 RTE_PTYPE_UNKNOWN
517         };
518
519         if (dev->rx_pkt_burst == mlx5_rx_burst ||
520             dev->rx_pkt_burst == mlx5_rx_burst_vec)
521                 return ptypes;
522         return NULL;
523 }
524
525 /**
526  * DPDK callback to retrieve physical link information.
527  *
528  * @param dev
529  *   Pointer to Ethernet device structure.
530  * @param[out] link
531  *   Storage for current link status.
532  *
533  * @return
534  *   0 on success, a negative errno value otherwise and rte_errno is set.
535  */
536 static int
537 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
538                                struct rte_eth_link *link)
539 {
540         struct priv *priv = dev->data->dev_private;
541         struct ethtool_cmd edata = {
542                 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
543         };
544         struct ifreq ifr;
545         struct rte_eth_link dev_link;
546         int link_speed = 0;
547         int ret;
548
549         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
550         if (ret) {
551                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
552                         dev->data->port_id, strerror(rte_errno));
553                 return ret;
554         }
555         memset(&dev_link, 0, sizeof(dev_link));
556         dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
557                                 (ifr.ifr_flags & IFF_RUNNING));
558         ifr.ifr_data = (void *)&edata;
559         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
560         if (ret) {
561                 DRV_LOG(WARNING,
562                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
563                         dev->data->port_id, strerror(rte_errno));
564                 return ret;
565         }
566         link_speed = ethtool_cmd_speed(&edata);
567         if (link_speed == -1)
568                 dev_link.link_speed = 0;
569         else
570                 dev_link.link_speed = link_speed;
571         priv->link_speed_capa = 0;
572         if (edata.supported & SUPPORTED_Autoneg)
573                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
574         if (edata.supported & (SUPPORTED_1000baseT_Full |
575                                SUPPORTED_1000baseKX_Full))
576                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
577         if (edata.supported & SUPPORTED_10000baseKR_Full)
578                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
579         if (edata.supported & (SUPPORTED_40000baseKR4_Full |
580                                SUPPORTED_40000baseCR4_Full |
581                                SUPPORTED_40000baseSR4_Full |
582                                SUPPORTED_40000baseLR4_Full))
583                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
584         dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
585                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
586         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
587                         ETH_LINK_SPEED_FIXED);
588         if ((dev_link.link_speed && !dev_link.link_status) ||
589             (!dev_link.link_speed && dev_link.link_status)) {
590                 rte_errno = EAGAIN;
591                 return -rte_errno;
592         }
593         *link = dev_link;
594         return 0;
595 }
596
597 /**
598  * Retrieve physical link information (unlocked version using new ioctl).
599  *
600  * @param dev
601  *   Pointer to Ethernet device structure.
602  * @param[out] link
603  *   Storage for current link status.
604  *
605  * @return
606  *   0 on success, a negative errno value otherwise and rte_errno is set.
607  */
608 static int
609 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
610                              struct rte_eth_link *link)
611
612 {
613         struct priv *priv = dev->data->dev_private;
614         struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
615         struct ifreq ifr;
616         struct rte_eth_link dev_link;
617         uint64_t sc;
618         int ret;
619
620         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
621         if (ret) {
622                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
623                         dev->data->port_id, strerror(rte_errno));
624                 return ret;
625         }
626         memset(&dev_link, 0, sizeof(dev_link));
627         dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
628                                 (ifr.ifr_flags & IFF_RUNNING));
629         ifr.ifr_data = (void *)&gcmd;
630         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
631         if (ret) {
632                 DRV_LOG(DEBUG,
633                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
634                         " failed: %s",
635                         dev->data->port_id, strerror(rte_errno));
636                 return ret;
637         }
638         gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
639
640         alignas(struct ethtool_link_settings)
641         uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
642                      sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
643         struct ethtool_link_settings *ecmd = (void *)data;
644
645         *ecmd = gcmd;
646         ifr.ifr_data = (void *)ecmd;
647         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
648         if (ret) {
649                 DRV_LOG(DEBUG,
650                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
651                         " failed: %s",
652                         dev->data->port_id, strerror(rte_errno));
653                 return ret;
654         }
655         dev_link.link_speed = ecmd->speed;
656         sc = ecmd->link_mode_masks[0] |
657                 ((uint64_t)ecmd->link_mode_masks[1] << 32);
658         priv->link_speed_capa = 0;
659         if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
660                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
661         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
662                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
663                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
664         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
665                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
666                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
667                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
668         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
669                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
670                 priv->link_speed_capa |= ETH_LINK_SPEED_20G;
671         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
672                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
673                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
674                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
675                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
676         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
677                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
678                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
679                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
680                 priv->link_speed_capa |= ETH_LINK_SPEED_56G;
681         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
682                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
683                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
684                 priv->link_speed_capa |= ETH_LINK_SPEED_25G;
685         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
686                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
687                 priv->link_speed_capa |= ETH_LINK_SPEED_50G;
688         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
689                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
690                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
691                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
692                 priv->link_speed_capa |= ETH_LINK_SPEED_100G;
693         dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
694                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
695         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
696                                   ETH_LINK_SPEED_FIXED);
697         if ((dev_link.link_speed && !dev_link.link_status) ||
698             (!dev_link.link_speed && dev_link.link_status)) {
699                 rte_errno = EAGAIN;
700                 return -rte_errno;
701         }
702         *link = dev_link;
703         return 0;
704 }
705
706 /**
707  * DPDK callback to retrieve physical link information.
708  *
709  * @param dev
710  *   Pointer to Ethernet device structure.
711  * @param wait_to_complete
712  *   Wait for request completion.
713  *
714  * @return
715  *   0 if link status was not updated, positive if it was, a negative errno
716  *   value otherwise and rte_errno is set.
717  */
718 int
719 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
720 {
721         int ret;
722         struct rte_eth_link dev_link;
723         time_t start_time = time(NULL);
724
725         do {
726                 ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
727                 if (ret)
728                         ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
729                 if (ret == 0)
730                         break;
731                 /* Handle wait to complete situation. */
732                 if (wait_to_complete && ret == -EAGAIN) {
733                         if (abs((int)difftime(time(NULL), start_time)) <
734                             MLX5_LINK_STATUS_TIMEOUT) {
735                                 usleep(0);
736                                 continue;
737                         } else {
738                                 rte_errno = EBUSY;
739                                 return -rte_errno;
740                         }
741                 } else if (ret < 0) {
742                         return ret;
743                 }
744         } while (wait_to_complete);
745         ret = !!memcmp(&dev->data->dev_link, &dev_link,
746                        sizeof(struct rte_eth_link));
747         dev->data->dev_link = dev_link;
748         return ret;
749 }
750
751 /**
752  * DPDK callback to change the MTU.
753  *
754  * @param dev
755  *   Pointer to Ethernet device structure.
756  * @param in_mtu
757  *   New MTU.
758  *
759  * @return
760  *   0 on success, a negative errno value otherwise and rte_errno is set.
761  */
762 int
763 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
764 {
765         struct priv *priv = dev->data->dev_private;
766         uint16_t kern_mtu = 0;
767         int ret;
768
769         ret = mlx5_get_mtu(dev, &kern_mtu);
770         if (ret)
771                 return ret;
772         /* Set kernel interface MTU first. */
773         ret = mlx5_set_mtu(dev, mtu);
774         if (ret)
775                 return ret;
776         ret = mlx5_get_mtu(dev, &kern_mtu);
777         if (ret)
778                 return ret;
779         if (kern_mtu == mtu) {
780                 priv->mtu = mtu;
781                 DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
782                         dev->data->port_id, mtu);
783                 return 0;
784         }
785         rte_errno = EAGAIN;
786         return -rte_errno;
787 }
788
789 /**
790  * DPDK callback to get flow control status.
791  *
792  * @param dev
793  *   Pointer to Ethernet device structure.
794  * @param[out] fc_conf
795  *   Flow control output buffer.
796  *
797  * @return
798  *   0 on success, a negative errno value otherwise and rte_errno is set.
799  */
800 int
801 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
802 {
803         struct ifreq ifr;
804         struct ethtool_pauseparam ethpause = {
805                 .cmd = ETHTOOL_GPAUSEPARAM
806         };
807         int ret;
808
809         ifr.ifr_data = (void *)&ethpause;
810         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
811         if (ret) {
812                 DRV_LOG(WARNING,
813                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
814                         " %s",
815                         dev->data->port_id, strerror(rte_errno));
816                 return ret;
817         }
818         fc_conf->autoneg = ethpause.autoneg;
819         if (ethpause.rx_pause && ethpause.tx_pause)
820                 fc_conf->mode = RTE_FC_FULL;
821         else if (ethpause.rx_pause)
822                 fc_conf->mode = RTE_FC_RX_PAUSE;
823         else if (ethpause.tx_pause)
824                 fc_conf->mode = RTE_FC_TX_PAUSE;
825         else
826                 fc_conf->mode = RTE_FC_NONE;
827         return 0;
828 }
829
830 /**
831  * DPDK callback to modify flow control parameters.
832  *
833  * @param dev
834  *   Pointer to Ethernet device structure.
835  * @param[in] fc_conf
836  *   Flow control parameters.
837  *
838  * @return
839  *   0 on success, a negative errno value otherwise and rte_errno is set.
840  */
841 int
842 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
843 {
844         struct ifreq ifr;
845         struct ethtool_pauseparam ethpause = {
846                 .cmd = ETHTOOL_SPAUSEPARAM
847         };
848         int ret;
849
850         ifr.ifr_data = (void *)&ethpause;
851         ethpause.autoneg = fc_conf->autoneg;
852         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
853             (fc_conf->mode & RTE_FC_RX_PAUSE))
854                 ethpause.rx_pause = 1;
855         else
856                 ethpause.rx_pause = 0;
857
858         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
859             (fc_conf->mode & RTE_FC_TX_PAUSE))
860                 ethpause.tx_pause = 1;
861         else
862                 ethpause.tx_pause = 0;
863         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
864         if (ret) {
865                 DRV_LOG(WARNING,
866                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
867                         " failed: %s",
868                         dev->data->port_id, strerror(rte_errno));
869                 return ret;
870         }
871         return 0;
872 }
873
874 /**
875  * Get PCI information from struct ibv_device.
876  *
877  * @param device
878  *   Pointer to Ethernet device structure.
879  * @param[out] pci_addr
880  *   PCI bus address output buffer.
881  *
882  * @return
883  *   0 on success, a negative errno value otherwise and rte_errno is set.
884  */
885 int
886 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
887                             struct rte_pci_addr *pci_addr)
888 {
889         FILE *file;
890         char line[32];
891         MKSTR(path, "%s/device/uevent", device->ibdev_path);
892
893         file = fopen(path, "rb");
894         if (file == NULL) {
895                 rte_errno = errno;
896                 return -rte_errno;
897         }
898         while (fgets(line, sizeof(line), file) == line) {
899                 size_t len = strlen(line);
900                 int ret;
901
902                 /* Truncate long lines. */
903                 if (len == (sizeof(line) - 1))
904                         while (line[(len - 1)] != '\n') {
905                                 ret = fgetc(file);
906                                 if (ret == EOF)
907                                         break;
908                                 line[(len - 1)] = ret;
909                         }
910                 /* Extract information. */
911                 if (sscanf(line,
912                            "PCI_SLOT_NAME="
913                            "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
914                            &pci_addr->domain,
915                            &pci_addr->bus,
916                            &pci_addr->devid,
917                            &pci_addr->function) == 4) {
918                         ret = 0;
919                         break;
920                 }
921         }
922         fclose(file);
923         return 0;
924 }
925
926 /**
927  * Device status handler.
928  *
929  * @param dev
930  *   Pointer to Ethernet device.
931  * @param events
932  *   Pointer to event flags holder.
933  *
934  * @return
935  *   Events bitmap of callback process which can be called immediately.
936  */
937 static uint32_t
938 mlx5_dev_status_handler(struct rte_eth_dev *dev)
939 {
940         struct priv *priv = dev->data->dev_private;
941         struct ibv_async_event event;
942         uint32_t ret = 0;
943
944         if (mlx5_link_update(dev, 0) == -EAGAIN) {
945                 usleep(0);
946                 return 0;
947         }
948         /* Read all message and acknowledge them. */
949         for (;;) {
950                 if (ibv_get_async_event(priv->ctx, &event))
951                         break;
952                 if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
953                         event.event_type == IBV_EVENT_PORT_ERR) &&
954                         (dev->data->dev_conf.intr_conf.lsc == 1))
955                         ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
956                 else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
957                         dev->data->dev_conf.intr_conf.rmv == 1)
958                         ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
959                 else
960                         DRV_LOG(DEBUG,
961                                 "port %u event type %d on not handled",
962                                 dev->data->port_id, event.event_type);
963                 ibv_ack_async_event(&event);
964         }
965         return ret;
966 }
967
968 /**
969  * Handle interrupts from the NIC.
970  *
971  * @param[in] intr_handle
972  *   Interrupt handler.
973  * @param cb_arg
974  *   Callback argument.
975  */
976 void
977 mlx5_dev_interrupt_handler(void *cb_arg)
978 {
979         struct rte_eth_dev *dev = cb_arg;
980         uint32_t events;
981
982         events = mlx5_dev_status_handler(dev);
983         if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
984                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
985                                               NULL);
986         if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
987                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL,
988                                               NULL);
989 }
990
/**
 * Handle interrupts from the socket.
 *
 * Thin adapter that forwards the callback argument to mlx5_socket_handle().
 *
 * @param cb_arg
 *   Callback argument, used as the pointer to the Ethernet device.
 */
static void
mlx5_dev_handler_socket(void *cb_arg)
{
        mlx5_socket_handle((struct rte_eth_dev *)cb_arg);
}
1004
1005 /**
1006  * Uninstall interrupt handler.
1007  *
1008  * @param dev
1009  *   Pointer to Ethernet device.
1010  */
1011 void
1012 mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev)
1013 {
1014         struct priv *priv = dev->data->dev_private;
1015
1016         if (dev->data->dev_conf.intr_conf.lsc ||
1017             dev->data->dev_conf.intr_conf.rmv)
1018                 rte_intr_callback_unregister(&priv->intr_handle,
1019                                              mlx5_dev_interrupt_handler, dev);
1020         if (priv->primary_socket)
1021                 rte_intr_callback_unregister(&priv->intr_handle_socket,
1022                                              mlx5_dev_handler_socket, dev);
1023         priv->intr_handle.fd = 0;
1024         priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
1025         priv->intr_handle_socket.fd = 0;
1026         priv->intr_handle_socket.type = RTE_INTR_HANDLE_UNKNOWN;
1027 }
1028
1029 /**
1030  * Install interrupt handler.
1031  *
1032  * @param dev
1033  *   Pointer to Ethernet device.
1034  */
1035 void
1036 mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev)
1037 {
1038         struct priv *priv = dev->data->dev_private;
1039         int ret;
1040         int flags;
1041
1042         assert(priv->ctx->async_fd > 0);
1043         flags = fcntl(priv->ctx->async_fd, F_GETFL);
1044         ret = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
1045         if (ret) {
1046                 DRV_LOG(INFO,
1047                         "port %u failed to change file descriptor async event"
1048                         " queue",
1049                         dev->data->port_id);
1050                 dev->data->dev_conf.intr_conf.lsc = 0;
1051                 dev->data->dev_conf.intr_conf.rmv = 0;
1052         }
1053         if (dev->data->dev_conf.intr_conf.lsc ||
1054             dev->data->dev_conf.intr_conf.rmv) {
1055                 priv->intr_handle.fd = priv->ctx->async_fd;
1056                 priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
1057                 rte_intr_callback_register(&priv->intr_handle,
1058                                            mlx5_dev_interrupt_handler, dev);
1059         }
1060         ret = mlx5_socket_init(dev);
1061         if (ret)
1062                 DRV_LOG(ERR, "port %u cannot initialise socket: %s",
1063                         dev->data->port_id, strerror(rte_errno));
1064         else if (priv->primary_socket) {
1065                 priv->intr_handle_socket.fd = priv->primary_socket;
1066                 priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT;
1067                 rte_intr_callback_register(&priv->intr_handle_socket,
1068                                            mlx5_dev_handler_socket, dev);
1069         }
1070 }
1071
1072 /**
1073  * DPDK callback to bring the link DOWN.
1074  *
1075  * @param dev
1076  *   Pointer to Ethernet device structure.
1077  *
1078  * @return
1079  *   0 on success, a negative errno value otherwise and rte_errno is set.
1080  */
1081 int
1082 mlx5_set_link_down(struct rte_eth_dev *dev)
1083 {
1084         return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
1085 }
1086
1087 /**
1088  * DPDK callback to bring the link UP.
1089  *
1090  * @param dev
1091  *   Pointer to Ethernet device structure.
1092  *
1093  * @return
1094  *   0 on success, a negative errno value otherwise and rte_errno is set.
1095  */
1096 int
1097 mlx5_set_link_up(struct rte_eth_dev *dev)
1098 {
1099         return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
1100 }
1101
1102 /**
1103  * Configure the TX function to use.
1104  *
1105  * @param dev
1106  *   Pointer to rte_eth_dev structure.
1107  *
1108  * @return
1109  *   Pointer to selected Tx burst function.
1110  */
1111 eth_tx_burst_t
1112 mlx5_select_tx_function(struct rte_eth_dev *dev)
1113 {
1114         struct priv *priv = dev->data->dev_private;
1115         eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst;
1116
1117         /* Select appropriate TX function. */
1118         if (priv->mps == MLX5_MPW_ENHANCED) {
1119                 if (mlx5_check_vec_tx_support(dev) > 0) {
1120                         if (mlx5_check_raw_vec_tx_support(dev) > 0)
1121                                 tx_pkt_burst = mlx5_tx_burst_raw_vec;
1122                         else
1123                                 tx_pkt_burst = mlx5_tx_burst_vec;
1124                         DRV_LOG(DEBUG,
1125                                 "port %u selected enhanced MPW Tx vectorized"
1126                                 " function",
1127                                 dev->data->port_id);
1128                 } else {
1129                         tx_pkt_burst = mlx5_tx_burst_empw;
1130                         DRV_LOG(DEBUG,
1131                                 "port %u selected enhanced MPW Tx function",
1132                                 dev->data->port_id);
1133                 }
1134         } else if (priv->mps && priv->txq_inline) {
1135                 tx_pkt_burst = mlx5_tx_burst_mpw_inline;
1136                 DRV_LOG(DEBUG, "port %u selected MPW inline Tx function",
1137                         dev->data->port_id);
1138         } else if (priv->mps) {
1139                 tx_pkt_burst = mlx5_tx_burst_mpw;
1140                 DRV_LOG(DEBUG, "port %u selected MPW Tx function",
1141                         dev->data->port_id);
1142         }
1143         return tx_pkt_burst;
1144 }
1145
1146 /**
1147  * Configure the RX function to use.
1148  *
1149  * @param dev
1150  *   Pointer to rte_eth_dev structure.
1151  *
1152  * @return
1153  *   Pointer to selected Rx burst function.
1154  */
1155 eth_rx_burst_t
1156 mlx5_select_rx_function(struct rte_eth_dev *dev)
1157 {
1158         eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
1159
1160         assert(dev != NULL);
1161         if (mlx5_check_vec_rx_support(dev) > 0) {
1162                 rx_pkt_burst = mlx5_rx_burst_vec;
1163                 DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
1164                         dev->data->port_id);
1165         }
1166         return rx_pkt_burst;
1167 }