e441483a98da28221d3c366c3248a2f818372444
[deb_dpdk.git] / drivers / net / mlx5 / mlx5_ethdev.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright 2015 6WIND S.A.
5  *   Copyright 2015 Mellanox.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of 6WIND S.A. nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #define _GNU_SOURCE
35
36 #include <stddef.h>
37 #include <assert.h>
38 #include <inttypes.h>
39 #include <unistd.h>
40 #include <stdint.h>
41 #include <stdio.h>
42 #include <string.h>
43 #include <stdlib.h>
44 #include <errno.h>
45 #include <dirent.h>
46 #include <net/if.h>
47 #include <sys/ioctl.h>
48 #include <sys/socket.h>
49 #include <netinet/in.h>
50 #include <linux/ethtool.h>
51 #include <linux/sockios.h>
52 #include <fcntl.h>
53 #include <stdalign.h>
54 #include <sys/un.h>
55 #include <time.h>
56
57 #include <rte_atomic.h>
58 #include <rte_ethdev.h>
59 #include <rte_bus_pci.h>
60 #include <rte_mbuf.h>
61 #include <rte_common.h>
62 #include <rte_interrupts.h>
63 #include <rte_malloc.h>
64
65 #include "mlx5.h"
66 #include "mlx5_rxtx.h"
67 #include "mlx5_utils.h"
68
/*
 * Supported speed values found in /usr/include/linux/ethtool.h.
 *
 * Each guard below provides a fallback value for one SUPPORTED_* bit when the
 * matching HAVE_SUPPORTED_* symbol is not defined (presumably those symbols
 * come from build-time header detection -- TODO confirm). The 40G bits are
 * tested against ethtool_cmd.supported in mlx5_link_update_unlocked_gset().
 */
#ifndef HAVE_SUPPORTED_40000baseKR4_Full
#define SUPPORTED_40000baseKR4_Full (1 << 23)
#endif
#ifndef HAVE_SUPPORTED_40000baseCR4_Full
#define SUPPORTED_40000baseCR4_Full (1 << 24)
#endif
#ifndef HAVE_SUPPORTED_40000baseSR4_Full
#define SUPPORTED_40000baseSR4_Full (1 << 25)
#endif
#ifndef HAVE_SUPPORTED_40000baseLR4_Full
#define SUPPORTED_40000baseLR4_Full (1 << 26)
#endif
#ifndef HAVE_SUPPORTED_56000baseKR4_Full
#define SUPPORTED_56000baseKR4_Full (1 << 27)
#endif
#ifndef HAVE_SUPPORTED_56000baseCR4_Full
#define SUPPORTED_56000baseCR4_Full (1 << 28)
#endif
#ifndef HAVE_SUPPORTED_56000baseSR4_Full
#define SUPPORTED_56000baseSR4_Full (1 << 29)
#endif
#ifndef HAVE_SUPPORTED_56000baseLR4_Full
#define SUPPORTED_56000baseLR4_Full (1 << 30)
#endif
94
/*
 * Add defines in case the running kernel is not the same as user headers.
 *
 * When the installed headers do not provide ETHTOOL_GLINKSETTINGS, declare
 * the request structure, the command number and the link mode bit positions
 * locally so mlx5_link_update_unlocked_gs() can still issue the request.
 * NOTE(review): the layout is presumably a copy of the kernel's
 * struct ethtool_link_settings -- confirm against linux/ethtool.h before
 * touching any field.
 */
#ifndef ETHTOOL_GLINKSETTINGS
struct ethtool_link_settings {
        uint32_t cmd;
        uint32_t speed;
        uint8_t duplex;
        uint8_t port;
        uint8_t phy_address;
        uint8_t autoneg;
        uint8_t mdio_support;
        uint8_t eth_to_mdix;
        uint8_t eth_tp_mdix_ctrl;
        /*
         * Left at 0 for the first request; the value found afterwards is
         * negated by mlx5_link_update_unlocked_gs() and used as the word
         * count for the second request.
         */
        int8_t link_mode_masks_nwords;
        uint32_t reserved[8];
        /*
         * The driver reserves 3 * link_mode_masks_nwords words here and reads
         * words 0 and 1 as one 64-bit capability mask.
         */
        uint32_t link_mode_masks[];
};

#define ETHTOOL_GLINKSETTINGS 0x0000004c
/* Bit positions within the 64-bit mask, converted with MLX5_BITSHIFT(). */
#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
#endif
/* 25G link mode bit positions (words beyond bit 31 live in mask word 1). */
#ifndef HAVE_ETHTOOL_LINK_MODE_25G
#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
#endif
/* 50G link mode bit positions. */
#ifndef HAVE_ETHTOOL_LINK_MODE_50G
#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
#endif
/* 100G link mode bit positions. */
#ifndef HAVE_ETHTOOL_LINK_MODE_100G
#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
#endif
145
146 /**
147  * Get interface name from private structure.
148  *
149  * @param[in] dev
150  *   Pointer to Ethernet device.
151  * @param[out] ifname
152  *   Interface name output buffer.
153  *
154  * @return
155  *   0 on success, a negative errno value otherwise and rte_errno is set.
156  */
157 int
158 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
159 {
160         struct priv *priv = dev->data->dev_private;
161         DIR *dir;
162         struct dirent *dent;
163         unsigned int dev_type = 0;
164         unsigned int dev_port_prev = ~0u;
165         char match[IF_NAMESIZE] = "";
166
167         {
168                 MKSTR(path, "%s/device/net", priv->ibdev_path);
169
170                 dir = opendir(path);
171                 if (dir == NULL) {
172                         rte_errno = errno;
173                         return -rte_errno;
174                 }
175         }
176         while ((dent = readdir(dir)) != NULL) {
177                 char *name = dent->d_name;
178                 FILE *file;
179                 unsigned int dev_port;
180                 int r;
181
182                 if ((name[0] == '.') &&
183                     ((name[1] == '\0') ||
184                      ((name[1] == '.') && (name[2] == '\0'))))
185                         continue;
186
187                 MKSTR(path, "%s/device/net/%s/%s",
188                       priv->ibdev_path, name,
189                       (dev_type ? "dev_id" : "dev_port"));
190
191                 file = fopen(path, "rb");
192                 if (file == NULL) {
193                         if (errno != ENOENT)
194                                 continue;
195                         /*
196                          * Switch to dev_id when dev_port does not exist as
197                          * is the case with Linux kernel versions < 3.15.
198                          */
199 try_dev_id:
200                         match[0] = '\0';
201                         if (dev_type)
202                                 break;
203                         dev_type = 1;
204                         dev_port_prev = ~0u;
205                         rewinddir(dir);
206                         continue;
207                 }
208                 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
209                 fclose(file);
210                 if (r != 1)
211                         continue;
212                 /*
213                  * Switch to dev_id when dev_port returns the same value for
214                  * all ports. May happen when using a MOFED release older than
215                  * 3.0 with a Linux kernel >= 3.15.
216                  */
217                 if (dev_port == dev_port_prev)
218                         goto try_dev_id;
219                 dev_port_prev = dev_port;
220                 if (dev_port == (priv->port - 1u))
221                         snprintf(match, sizeof(match), "%s", name);
222         }
223         closedir(dir);
224         if (match[0] == '\0') {
225                 rte_errno = ENOENT;
226                 return -rte_errno;
227         }
228         strncpy(*ifname, match, sizeof(*ifname));
229         return 0;
230 }
231
232 /**
233  * Perform ifreq ioctl() on associated Ethernet device.
234  *
235  * @param[in] dev
236  *   Pointer to Ethernet device.
237  * @param req
238  *   Request number to pass to ioctl().
239  * @param[out] ifr
240  *   Interface request structure output buffer.
241  *
242  * @return
243  *   0 on success, a negative errno value otherwise and rte_errno is set.
244  */
245 int
246 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
247 {
248         int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
249         int ret = 0;
250
251         if (sock == -1) {
252                 rte_errno = errno;
253                 return -rte_errno;
254         }
255         ret = mlx5_get_ifname(dev, &ifr->ifr_name);
256         if (ret)
257                 goto error;
258         ret = ioctl(sock, req, ifr);
259         if (ret == -1) {
260                 rte_errno = errno;
261                 goto error;
262         }
263         close(sock);
264         return 0;
265 error:
266         close(sock);
267         return -rte_errno;
268 }
269
270 /**
271  * Get device MTU.
272  *
273  * @param dev
274  *   Pointer to Ethernet device.
275  * @param[out] mtu
276  *   MTU value output buffer.
277  *
278  * @return
279  *   0 on success, a negative errno value otherwise and rte_errno is set.
280  */
281 int
282 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
283 {
284         struct ifreq request;
285         int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
286
287         if (ret)
288                 return ret;
289         *mtu = request.ifr_mtu;
290         return 0;
291 }
292
293 /**
294  * Set device MTU.
295  *
296  * @param dev
297  *   Pointer to Ethernet device.
298  * @param mtu
299  *   MTU value to set.
300  *
301  * @return
302  *   0 on success, a negative errno value otherwise and rte_errno is set.
303  */
304 static int
305 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
306 {
307         struct ifreq request = { .ifr_mtu = mtu, };
308
309         return mlx5_ifreq(dev, SIOCSIFMTU, &request);
310 }
311
312 /**
313  * Set device flags.
314  *
315  * @param dev
316  *   Pointer to Ethernet device.
317  * @param keep
318  *   Bitmask for flags that must remain untouched.
319  * @param flags
320  *   Bitmask for flags to modify.
321  *
322  * @return
323  *   0 on success, a negative errno value otherwise and rte_errno is set.
324  */
325 int
326 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
327 {
328         struct ifreq request;
329         int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
330
331         if (ret)
332                 return ret;
333         request.ifr_flags &= keep;
334         request.ifr_flags |= flags & ~keep;
335         return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
336 }
337
338 /**
339  * DPDK callback for Ethernet device configuration.
340  *
341  * @param dev
342  *   Pointer to Ethernet device structure.
343  *
344  * @return
345  *   0 on success, a negative errno value otherwise and rte_errno is set.
346  */
347 int
348 mlx5_dev_configure(struct rte_eth_dev *dev)
349 {
350         struct priv *priv = dev->data->dev_private;
351         unsigned int rxqs_n = dev->data->nb_rx_queues;
352         unsigned int txqs_n = dev->data->nb_tx_queues;
353         unsigned int i;
354         unsigned int j;
355         unsigned int reta_idx_n;
356         const uint8_t use_app_rss_key =
357                 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
358         int ret = 0;
359
360         if (use_app_rss_key &&
361             (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
362              rss_hash_default_key_len)) {
363                 DRV_LOG(ERR, "port %u RSS key len must be %zu Bytes long",
364                         dev->data->port_id, rss_hash_default_key_len);
365                 rte_errno = EINVAL;
366                 return -rte_errno;
367         }
368         priv->rss_conf.rss_key =
369                 rte_realloc(priv->rss_conf.rss_key,
370                             rss_hash_default_key_len, 0);
371         if (!priv->rss_conf.rss_key) {
372                 DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)",
373                         dev->data->port_id, rxqs_n);
374                 rte_errno = ENOMEM;
375                 return -rte_errno;
376         }
377         memcpy(priv->rss_conf.rss_key,
378                use_app_rss_key ?
379                dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
380                rss_hash_default_key,
381                rss_hash_default_key_len);
382         priv->rss_conf.rss_key_len = rss_hash_default_key_len;
383         priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
384         priv->rxqs = (void *)dev->data->rx_queues;
385         priv->txqs = (void *)dev->data->tx_queues;
386         if (txqs_n != priv->txqs_n) {
387                 DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u",
388                         dev->data->port_id, priv->txqs_n, txqs_n);
389                 priv->txqs_n = txqs_n;
390         }
391         if (rxqs_n > priv->ind_table_max_size) {
392                 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
393                         dev->data->port_id, rxqs_n);
394                 rte_errno = EINVAL;
395                 return -rte_errno;
396         }
397         if (rxqs_n == priv->rxqs_n)
398                 return 0;
399         DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
400                 dev->data->port_id, priv->rxqs_n, rxqs_n);
401         priv->rxqs_n = rxqs_n;
402         /* If the requested number of RX queues is not a power of two, use the
403          * maximum indirection table size for better balancing.
404          * The result is always rounded to the next power of two. */
405         reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
406                                      priv->ind_table_max_size :
407                                      rxqs_n));
408         ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
409         if (ret)
410                 return ret;
411         /* When the number of RX queues is not a power of two, the remaining
412          * table entries are padded with reused WQs and hashes are not spread
413          * uniformly. */
414         for (i = 0, j = 0; (i != reta_idx_n); ++i) {
415                 (*priv->reta_idx)[i] = j;
416                 if (++j == rxqs_n)
417                         j = 0;
418         }
419         return 0;
420 }
421
422 /**
423  * DPDK callback to get information about the device.
424  *
425  * @param dev
426  *   Pointer to Ethernet device structure.
427  * @param[out] info
428  *   Info structure output buffer.
429  */
430 void
431 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
432 {
433         struct priv *priv = dev->data->dev_private;
434         unsigned int max;
435         char ifname[IF_NAMESIZE];
436
437         info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
438         /* FIXME: we should ask the device for these values. */
439         info->min_rx_bufsize = 32;
440         info->max_rx_pktlen = 65536;
441         /*
442          * Since we need one CQ per QP, the limit is the minimum number
443          * between the two values.
444          */
445         max = RTE_MIN(priv->device_attr.orig_attr.max_cq,
446                       priv->device_attr.orig_attr.max_qp);
447         /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
448         if (max >= 65535)
449                 max = 65535;
450         info->max_rx_queues = max;
451         info->max_tx_queues = max;
452         info->max_mac_addrs = RTE_DIM(priv->mac);
453         info->rx_offload_capa =
454                 (priv->hw_csum ?
455                  (DEV_RX_OFFLOAD_IPV4_CKSUM |
456                   DEV_RX_OFFLOAD_UDP_CKSUM |
457                   DEV_RX_OFFLOAD_TCP_CKSUM) :
458                  0) |
459                 (priv->hw_vlan_strip ? DEV_RX_OFFLOAD_VLAN_STRIP : 0) |
460                 DEV_RX_OFFLOAD_TIMESTAMP;
461
462         if (!priv->mps)
463                 info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
464         if (priv->hw_csum)
465                 info->tx_offload_capa |=
466                         (DEV_TX_OFFLOAD_IPV4_CKSUM |
467                          DEV_TX_OFFLOAD_UDP_CKSUM |
468                          DEV_TX_OFFLOAD_TCP_CKSUM);
469         if (priv->tso)
470                 info->tx_offload_capa |= DEV_TX_OFFLOAD_TCP_TSO;
471         if (priv->tunnel_en)
472                 info->tx_offload_capa |= (DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM |
473                                           DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
474                                           DEV_TX_OFFLOAD_GRE_TNL_TSO);
475         if (mlx5_get_ifname(dev, &ifname) == 0)
476                 info->if_index = if_nametoindex(ifname);
477         info->reta_size = priv->reta_idx_n ?
478                 priv->reta_idx_n : priv->ind_table_max_size;
479         info->hash_key_size = rss_hash_default_key_len;
480         info->speed_capa = priv->link_speed_capa;
481         info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
482 }
483
484 /**
485  * Get supported packet types.
486  *
487  * @param dev
488  *   Pointer to Ethernet device structure.
489  *
490  * @return
491  *   A pointer to the supported Packet types array.
492  */
493 const uint32_t *
494 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
495 {
496         static const uint32_t ptypes[] = {
497                 /* refers to rxq_cq_to_pkt_type() */
498                 RTE_PTYPE_L2_ETHER,
499                 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
500                 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
501                 RTE_PTYPE_L4_NONFRAG,
502                 RTE_PTYPE_L4_FRAG,
503                 RTE_PTYPE_L4_TCP,
504                 RTE_PTYPE_L4_UDP,
505                 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
506                 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
507                 RTE_PTYPE_INNER_L4_NONFRAG,
508                 RTE_PTYPE_INNER_L4_FRAG,
509                 RTE_PTYPE_INNER_L4_TCP,
510                 RTE_PTYPE_INNER_L4_UDP,
511                 RTE_PTYPE_UNKNOWN
512         };
513
514         if (dev->rx_pkt_burst == mlx5_rx_burst ||
515             dev->rx_pkt_burst == mlx5_rx_burst_vec)
516                 return ptypes;
517         return NULL;
518 }
519
520 /**
521  * DPDK callback to retrieve physical link information.
522  *
523  * @param dev
524  *   Pointer to Ethernet device structure.
525  * @param[out] link
526  *   Storage for current link status.
527  *
528  * @return
529  *   0 on success, a negative errno value otherwise and rte_errno is set.
530  */
531 static int
532 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
533                                struct rte_eth_link *link)
534 {
535         struct priv *priv = dev->data->dev_private;
536         struct ethtool_cmd edata = {
537                 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
538         };
539         struct ifreq ifr;
540         struct rte_eth_link dev_link;
541         int link_speed = 0;
542         int ret;
543
544         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
545         if (ret) {
546                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
547                         dev->data->port_id, strerror(rte_errno));
548                 return ret;
549         }
550         memset(&dev_link, 0, sizeof(dev_link));
551         dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
552                                 (ifr.ifr_flags & IFF_RUNNING));
553         ifr.ifr_data = (void *)&edata;
554         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
555         if (ret) {
556                 DRV_LOG(WARNING,
557                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
558                         dev->data->port_id, strerror(rte_errno));
559                 return ret;
560         }
561         link_speed = ethtool_cmd_speed(&edata);
562         if (link_speed == -1)
563                 dev_link.link_speed = 0;
564         else
565                 dev_link.link_speed = link_speed;
566         priv->link_speed_capa = 0;
567         if (edata.supported & SUPPORTED_Autoneg)
568                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
569         if (edata.supported & (SUPPORTED_1000baseT_Full |
570                                SUPPORTED_1000baseKX_Full))
571                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
572         if (edata.supported & SUPPORTED_10000baseKR_Full)
573                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
574         if (edata.supported & (SUPPORTED_40000baseKR4_Full |
575                                SUPPORTED_40000baseCR4_Full |
576                                SUPPORTED_40000baseSR4_Full |
577                                SUPPORTED_40000baseLR4_Full))
578                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
579         dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
580                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
581         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
582                         ETH_LINK_SPEED_FIXED);
583         if ((dev_link.link_speed && !dev_link.link_status) ||
584             (!dev_link.link_speed && dev_link.link_status)) {
585                 rte_errno = EAGAIN;
586                 return -rte_errno;
587         }
588         *link = dev_link;
589         return 0;
590 }
591
592 /**
593  * Retrieve physical link information (unlocked version using new ioctl).
594  *
595  * @param dev
596  *   Pointer to Ethernet device structure.
597  * @param[out] link
598  *   Storage for current link status.
599  *
600  * @return
601  *   0 on success, a negative errno value otherwise and rte_errno is set.
602  */
603 static int
604 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
605                              struct rte_eth_link *link)
606
607 {
608         struct priv *priv = dev->data->dev_private;
609         struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
610         struct ifreq ifr;
611         struct rte_eth_link dev_link;
612         uint64_t sc;
613         int ret;
614
615         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
616         if (ret) {
617                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
618                         dev->data->port_id, strerror(rte_errno));
619                 return ret;
620         }
621         memset(&dev_link, 0, sizeof(dev_link));
622         dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
623                                 (ifr.ifr_flags & IFF_RUNNING));
624         ifr.ifr_data = (void *)&gcmd;
625         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
626         if (ret) {
627                 DRV_LOG(DEBUG,
628                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
629                         " failed: %s",
630                         dev->data->port_id, strerror(rte_errno));
631                 return ret;
632         }
633         gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
634
635         alignas(struct ethtool_link_settings)
636         uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
637                      sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
638         struct ethtool_link_settings *ecmd = (void *)data;
639
640         *ecmd = gcmd;
641         ifr.ifr_data = (void *)ecmd;
642         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
643         if (ret) {
644                 DRV_LOG(DEBUG,
645                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
646                         " failed: %s",
647                         dev->data->port_id, strerror(rte_errno));
648                 return ret;
649         }
650         dev_link.link_speed = ecmd->speed;
651         sc = ecmd->link_mode_masks[0] |
652                 ((uint64_t)ecmd->link_mode_masks[1] << 32);
653         priv->link_speed_capa = 0;
654         if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
655                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
656         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
657                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
658                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
659         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
660                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
661                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
662                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
663         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
664                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
665                 priv->link_speed_capa |= ETH_LINK_SPEED_20G;
666         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
667                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
668                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
669                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
670                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
671         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
672                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
673                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
674                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
675                 priv->link_speed_capa |= ETH_LINK_SPEED_56G;
676         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
677                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
678                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
679                 priv->link_speed_capa |= ETH_LINK_SPEED_25G;
680         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
681                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
682                 priv->link_speed_capa |= ETH_LINK_SPEED_50G;
683         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
684                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
685                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
686                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
687                 priv->link_speed_capa |= ETH_LINK_SPEED_100G;
688         dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
689                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
690         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
691                                   ETH_LINK_SPEED_FIXED);
692         if ((dev_link.link_speed && !dev_link.link_status) ||
693             (!dev_link.link_speed && dev_link.link_status)) {
694                 rte_errno = EAGAIN;
695                 return -rte_errno;
696         }
697         *link = dev_link;
698         return 0;
699 }
700
701 /**
702  * DPDK callback to retrieve physical link information.
703  *
704  * @param dev
705  *   Pointer to Ethernet device structure.
706  * @param wait_to_complete
707  *   Wait for request completion.
708  *
709  * @return
710  *   0 if link status was not updated, positive if it was, a negative errno
711  *   value otherwise and rte_errno is set.
712  */
713 int
714 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
715 {
716         int ret;
717         struct rte_eth_link dev_link;
718         time_t start_time = time(NULL);
719
720         do {
721                 ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
722                 if (ret)
723                         ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
724                 if (ret == 0)
725                         break;
726                 /* Handle wait to complete situation. */
727                 if (wait_to_complete && ret == -EAGAIN) {
728                         if (abs((int)difftime(time(NULL), start_time)) <
729                             MLX5_LINK_STATUS_TIMEOUT) {
730                                 usleep(0);
731                                 continue;
732                         } else {
733                                 rte_errno = EBUSY;
734                                 return -rte_errno;
735                         }
736                 } else if (ret < 0) {
737                         return ret;
738                 }
739         } while (wait_to_complete);
740         ret = !!memcmp(&dev->data->dev_link, &dev_link,
741                        sizeof(struct rte_eth_link));
742         dev->data->dev_link = dev_link;
743         return ret;
744 }
745
746 /**
747  * DPDK callback to change the MTU.
748  *
749  * @param dev
750  *   Pointer to Ethernet device structure.
751  * @param in_mtu
752  *   New MTU.
753  *
754  * @return
755  *   0 on success, a negative errno value otherwise and rte_errno is set.
756  */
757 int
758 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
759 {
760         struct priv *priv = dev->data->dev_private;
761         uint16_t kern_mtu = 0;
762         int ret;
763
764         ret = mlx5_get_mtu(dev, &kern_mtu);
765         if (ret)
766                 return ret;
767         /* Set kernel interface MTU first. */
768         ret = mlx5_set_mtu(dev, mtu);
769         if (ret)
770                 return ret;
771         ret = mlx5_get_mtu(dev, &kern_mtu);
772         if (ret)
773                 return ret;
774         if (kern_mtu == mtu) {
775                 priv->mtu = mtu;
776                 DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
777                         dev->data->port_id, mtu);
778                 return 0;
779         }
780         rte_errno = EAGAIN;
781         return -rte_errno;
782 }
783
784 /**
785  * DPDK callback to get flow control status.
786  *
787  * @param dev
788  *   Pointer to Ethernet device structure.
789  * @param[out] fc_conf
790  *   Flow control output buffer.
791  *
792  * @return
793  *   0 on success, a negative errno value otherwise and rte_errno is set.
794  */
795 int
796 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
797 {
798         struct ifreq ifr;
799         struct ethtool_pauseparam ethpause = {
800                 .cmd = ETHTOOL_GPAUSEPARAM
801         };
802         int ret;
803
804         ifr.ifr_data = (void *)&ethpause;
805         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
806         if (ret) {
807                 DRV_LOG(WARNING,
808                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
809                         " %s",
810                         dev->data->port_id, strerror(rte_errno));
811                 return ret;
812         }
813         fc_conf->autoneg = ethpause.autoneg;
814         if (ethpause.rx_pause && ethpause.tx_pause)
815                 fc_conf->mode = RTE_FC_FULL;
816         else if (ethpause.rx_pause)
817                 fc_conf->mode = RTE_FC_RX_PAUSE;
818         else if (ethpause.tx_pause)
819                 fc_conf->mode = RTE_FC_TX_PAUSE;
820         else
821                 fc_conf->mode = RTE_FC_NONE;
822         return 0;
823 }
824
825 /**
826  * DPDK callback to modify flow control parameters.
827  *
828  * @param dev
829  *   Pointer to Ethernet device structure.
830  * @param[in] fc_conf
831  *   Flow control parameters.
832  *
833  * @return
834  *   0 on success, a negative errno value otherwise and rte_errno is set.
835  */
836 int
837 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
838 {
839         struct ifreq ifr;
840         struct ethtool_pauseparam ethpause = {
841                 .cmd = ETHTOOL_SPAUSEPARAM
842         };
843         int ret;
844
845         ifr.ifr_data = (void *)&ethpause;
846         ethpause.autoneg = fc_conf->autoneg;
847         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
848             (fc_conf->mode & RTE_FC_RX_PAUSE))
849                 ethpause.rx_pause = 1;
850         else
851                 ethpause.rx_pause = 0;
852
853         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
854             (fc_conf->mode & RTE_FC_TX_PAUSE))
855                 ethpause.tx_pause = 1;
856         else
857                 ethpause.tx_pause = 0;
858         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
859         if (ret) {
860                 DRV_LOG(WARNING,
861                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
862                         " failed: %s",
863                         dev->data->port_id, strerror(rte_errno));
864                 return ret;
865         }
866         return 0;
867 }
868
869 /**
870  * Get PCI information from struct ibv_device.
871  *
872  * @param device
873  *   Pointer to Ethernet device structure.
874  * @param[out] pci_addr
875  *   PCI bus address output buffer.
876  *
877  * @return
878  *   0 on success, a negative errno value otherwise and rte_errno is set.
879  */
880 int
881 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
882                             struct rte_pci_addr *pci_addr)
883 {
884         FILE *file;
885         char line[32];
886         MKSTR(path, "%s/device/uevent", device->ibdev_path);
887
888         file = fopen(path, "rb");
889         if (file == NULL) {
890                 rte_errno = errno;
891                 return -rte_errno;
892         }
893         while (fgets(line, sizeof(line), file) == line) {
894                 size_t len = strlen(line);
895                 int ret;
896
897                 /* Truncate long lines. */
898                 if (len == (sizeof(line) - 1))
899                         while (line[(len - 1)] != '\n') {
900                                 ret = fgetc(file);
901                                 if (ret == EOF)
902                                         break;
903                                 line[(len - 1)] = ret;
904                         }
905                 /* Extract information. */
906                 if (sscanf(line,
907                            "PCI_SLOT_NAME="
908                            "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
909                            &pci_addr->domain,
910                            &pci_addr->bus,
911                            &pci_addr->devid,
912                            &pci_addr->function) == 4) {
913                         ret = 0;
914                         break;
915                 }
916         }
917         fclose(file);
918         return 0;
919 }
920
921 /**
922  * Device status handler.
923  *
924  * @param dev
925  *   Pointer to Ethernet device.
926  * @param events
927  *   Pointer to event flags holder.
928  *
929  * @return
930  *   Events bitmap of callback process which can be called immediately.
931  */
932 static uint32_t
933 mlx5_dev_status_handler(struct rte_eth_dev *dev)
934 {
935         struct priv *priv = dev->data->dev_private;
936         struct ibv_async_event event;
937         uint32_t ret = 0;
938
939         if (mlx5_link_update(dev, 0) == -EAGAIN) {
940                 usleep(0);
941                 return 0;
942         }
943         /* Read all message and acknowledge them. */
944         for (;;) {
945                 if (ibv_get_async_event(priv->ctx, &event))
946                         break;
947                 if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
948                         event.event_type == IBV_EVENT_PORT_ERR) &&
949                         (dev->data->dev_conf.intr_conf.lsc == 1))
950                         ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
951                 else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
952                         dev->data->dev_conf.intr_conf.rmv == 1)
953                         ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
954                 else
955                         DRV_LOG(DEBUG,
956                                 "port %u event type %d on not handled",
957                                 dev->data->port_id, event.event_type);
958                 ibv_ack_async_event(&event);
959         }
960         return ret;
961 }
962
963 /**
964  * Handle interrupts from the NIC.
965  *
966  * @param[in] intr_handle
967  *   Interrupt handler.
968  * @param cb_arg
969  *   Callback argument.
970  */
971 void
972 mlx5_dev_interrupt_handler(void *cb_arg)
973 {
974         struct rte_eth_dev *dev = cb_arg;
975         uint32_t events;
976
977         events = mlx5_dev_status_handler(dev);
978         if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
979                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
980                                               NULL);
981         if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
982                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL,
983                                               NULL);
984 }
985
/**
 * Handle interrupts from the socket.
 *
 * Thin trampoline that forwards the event to mlx5_socket_handle().
 *
 * @param cb_arg
 *   Callback argument, used as a pointer to the Ethernet device.
 */
static void
mlx5_dev_handler_socket(void *cb_arg)
{
	struct rte_eth_dev *eth_dev = cb_arg;

	mlx5_socket_handle(eth_dev);
}
999
1000 /**
1001  * Uninstall interrupt handler.
1002  *
1003  * @param dev
1004  *   Pointer to Ethernet device.
1005  */
1006 void
1007 mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev)
1008 {
1009         struct priv *priv = dev->data->dev_private;
1010
1011         if (dev->data->dev_conf.intr_conf.lsc ||
1012             dev->data->dev_conf.intr_conf.rmv)
1013                 rte_intr_callback_unregister(&priv->intr_handle,
1014                                              mlx5_dev_interrupt_handler, dev);
1015         if (priv->primary_socket)
1016                 rte_intr_callback_unregister(&priv->intr_handle_socket,
1017                                              mlx5_dev_handler_socket, dev);
1018         priv->intr_handle.fd = 0;
1019         priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
1020         priv->intr_handle_socket.fd = 0;
1021         priv->intr_handle_socket.type = RTE_INTR_HANDLE_UNKNOWN;
1022 }
1023
1024 /**
1025  * Install interrupt handler.
1026  *
1027  * @param dev
1028  *   Pointer to Ethernet device.
1029  */
1030 void
1031 mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev)
1032 {
1033         struct priv *priv = dev->data->dev_private;
1034         int ret;
1035         int flags;
1036
1037         assert(priv->ctx->async_fd > 0);
1038         flags = fcntl(priv->ctx->async_fd, F_GETFL);
1039         ret = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
1040         if (ret) {
1041                 DRV_LOG(INFO,
1042                         "port %u failed to change file descriptor async event"
1043                         " queue",
1044                         dev->data->port_id);
1045                 dev->data->dev_conf.intr_conf.lsc = 0;
1046                 dev->data->dev_conf.intr_conf.rmv = 0;
1047         }
1048         if (dev->data->dev_conf.intr_conf.lsc ||
1049             dev->data->dev_conf.intr_conf.rmv) {
1050                 priv->intr_handle.fd = priv->ctx->async_fd;
1051                 priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
1052                 rte_intr_callback_register(&priv->intr_handle,
1053                                            mlx5_dev_interrupt_handler, dev);
1054         }
1055         ret = mlx5_socket_init(dev);
1056         if (ret)
1057                 DRV_LOG(ERR, "port %u cannot initialise socket: %s",
1058                         dev->data->port_id, strerror(rte_errno));
1059         else if (priv->primary_socket) {
1060                 priv->intr_handle_socket.fd = priv->primary_socket;
1061                 priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT;
1062                 rte_intr_callback_register(&priv->intr_handle_socket,
1063                                            mlx5_dev_handler_socket, dev);
1064         }
1065 }
1066
1067 /**
1068  * DPDK callback to bring the link DOWN.
1069  *
1070  * @param dev
1071  *   Pointer to Ethernet device structure.
1072  *
1073  * @return
1074  *   0 on success, a negative errno value otherwise and rte_errno is set.
1075  */
1076 int
1077 mlx5_set_link_down(struct rte_eth_dev *dev)
1078 {
1079         return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
1080 }
1081
1082 /**
1083  * DPDK callback to bring the link UP.
1084  *
1085  * @param dev
1086  *   Pointer to Ethernet device structure.
1087  *
1088  * @return
1089  *   0 on success, a negative errno value otherwise and rte_errno is set.
1090  */
1091 int
1092 mlx5_set_link_up(struct rte_eth_dev *dev)
1093 {
1094         return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
1095 }
1096
1097 /**
1098  * Configure the TX function to use.
1099  *
1100  * @param dev
1101  *   Pointer to rte_eth_dev structure.
1102  *
1103  * @return
1104  *   Pointer to selected Tx burst function.
1105  */
1106 eth_tx_burst_t
1107 mlx5_select_tx_function(struct rte_eth_dev *dev)
1108 {
1109         struct priv *priv = dev->data->dev_private;
1110         eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst;
1111
1112         /* Select appropriate TX function. */
1113         if (priv->mps == MLX5_MPW_ENHANCED) {
1114                 if (mlx5_check_vec_tx_support(dev) > 0) {
1115                         if (mlx5_check_raw_vec_tx_support(dev) > 0)
1116                                 tx_pkt_burst = mlx5_tx_burst_raw_vec;
1117                         else
1118                                 tx_pkt_burst = mlx5_tx_burst_vec;
1119                         DRV_LOG(DEBUG,
1120                                 "port %u selected enhanced MPW Tx vectorized"
1121                                 " function",
1122                                 dev->data->port_id);
1123                 } else {
1124                         tx_pkt_burst = mlx5_tx_burst_empw;
1125                         DRV_LOG(DEBUG,
1126                                 "port %u selected enhanced MPW Tx function",
1127                                 dev->data->port_id);
1128                 }
1129         } else if (priv->mps && priv->txq_inline) {
1130                 tx_pkt_burst = mlx5_tx_burst_mpw_inline;
1131                 DRV_LOG(DEBUG, "port %u selected MPW inline Tx function",
1132                         dev->data->port_id);
1133         } else if (priv->mps) {
1134                 tx_pkt_burst = mlx5_tx_burst_mpw;
1135                 DRV_LOG(DEBUG, "port %u selected MPW Tx function",
1136                         dev->data->port_id);
1137         }
1138         return tx_pkt_burst;
1139 }
1140
1141 /**
1142  * Configure the RX function to use.
1143  *
1144  * @param dev
1145  *   Pointer to rte_eth_dev structure.
1146  *
1147  * @return
1148  *   Pointer to selected Rx burst function.
1149  */
1150 eth_rx_burst_t
1151 mlx5_select_rx_function(struct rte_eth_dev *dev)
1152 {
1153         eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
1154
1155         assert(dev != NULL);
1156         if (mlx5_check_vec_rx_support(dev) > 0) {
1157                 rx_pkt_burst = mlx5_rx_burst_vec;
1158                 DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
1159                         dev->data->port_id);
1160         }
1161         return rx_pkt_burst;
1162 }