New upstream version 17.11.3
[deb_dpdk.git] / drivers / net / mlx5 / mlx5_ethdev.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright 2015 6WIND S.A.
5  *   Copyright 2015 Mellanox.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of 6WIND S.A. nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #define _GNU_SOURCE
35
36 #include <stddef.h>
37 #include <assert.h>
38 #include <inttypes.h>
39 #include <unistd.h>
40 #include <stdint.h>
41 #include <stdio.h>
42 #include <string.h>
43 #include <stdlib.h>
44 #include <errno.h>
45 #include <dirent.h>
46 #include <net/if.h>
47 #include <sys/ioctl.h>
48 #include <sys/socket.h>
49 #include <netinet/in.h>
50 #include <linux/ethtool.h>
51 #include <linux/sockios.h>
52 #include <fcntl.h>
53 #include <stdalign.h>
54 #include <sys/un.h>
55 #include <time.h>
56
57 #include <rte_atomic.h>
58 #include <rte_ethdev.h>
59 #include <rte_bus_pci.h>
60 #include <rte_mbuf.h>
61 #include <rte_common.h>
62 #include <rte_interrupts.h>
63 #include <rte_malloc.h>
64
65 #include "mlx5.h"
66 #include "mlx5_rxtx.h"
67 #include "mlx5_utils.h"
68
69 /* Add defines in case the running kernel is not the same as user headers. */
70 #ifndef ETHTOOL_GLINKSETTINGS
/*
 * Local fallback declaration, compiled only when the user-space headers do
 * not provide ETHTOOL_GLINKSETTINGS. The layout is presumably meant to mirror
 * the kernel's own struct ethtool_link_settings -- verify against
 * <linux/ethtool.h> before changing any field.
 */
struct ethtool_link_settings {
	/* Command identifier; this file sets it to ETHTOOL_GLINKSETTINGS. */
	uint32_t cmd;
	/* Link speed; copied as-is into rte_eth_link.link_speed. */
	uint32_t speed;
	/* Duplex mode; compared against DUPLEX_HALF by this file. */
	uint8_t duplex;
	uint8_t port;
	uint8_t phy_address;
	uint8_t autoneg;
	uint8_t mdio_support;
	/*
	 * NOTE(review): the kernel header appears to call this field
	 * eth_tp_mdix; the name is kept as-is so existing users still build.
	 */
	uint8_t eth_to_mdix;
	uint8_t eth_tp_mdix_ctrl;
	/*
	 * Number of 32-bit words per link mode mask. The first query made by
	 * this file gets it back negated and flips the sign before retrying.
	 */
	int8_t link_mode_masks_nwords;
	uint32_t reserved[8];
	/* Trailing masks; this file sizes the buffer for three of them. */
	uint32_t link_mode_masks[];
};
85
86 #define ETHTOOL_GLINKSETTINGS 0x0000004c
87 #define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
88 #define ETHTOOL_LINK_MODE_Autoneg_BIT 6
89 #define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
90 #define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
91 #define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
92 #define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
93 #define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
94 #define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
95 #define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
96 #define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
97 #define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
98 #define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
99 #define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
100 #define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
101 #define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
102 #define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
103 #endif
104 #ifndef HAVE_ETHTOOL_LINK_MODE_25G
105 #define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
106 #define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
107 #define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
108 #endif
109 #ifndef HAVE_ETHTOOL_LINK_MODE_50G
110 #define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
111 #define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
112 #endif
113 #ifndef HAVE_ETHTOOL_LINK_MODE_100G
114 #define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
115 #define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
116 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
117 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
118 #endif
119
120 /**
121  * Get interface name from private structure.
122  *
123  * @param[in] dev
124  *   Pointer to Ethernet device.
125  * @param[out] ifname
126  *   Interface name output buffer.
127  *
128  * @return
129  *   0 on success, a negative errno value otherwise and rte_errno is set.
130  */
131 int
132 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
133 {
134         struct priv *priv = dev->data->dev_private;
135         DIR *dir;
136         struct dirent *dent;
137         unsigned int dev_type = 0;
138         unsigned int dev_port_prev = ~0u;
139         char match[IF_NAMESIZE] = "";
140
141         {
142                 MKSTR(path, "%s/device/net", priv->ibdev_path);
143
144                 dir = opendir(path);
145                 if (dir == NULL) {
146                         rte_errno = errno;
147                         return -rte_errno;
148                 }
149         }
150         while ((dent = readdir(dir)) != NULL) {
151                 char *name = dent->d_name;
152                 FILE *file;
153                 unsigned int dev_port;
154                 int r;
155
156                 if ((name[0] == '.') &&
157                     ((name[1] == '\0') ||
158                      ((name[1] == '.') && (name[2] == '\0'))))
159                         continue;
160
161                 MKSTR(path, "%s/device/net/%s/%s",
162                       priv->ibdev_path, name,
163                       (dev_type ? "dev_id" : "dev_port"));
164
165                 file = fopen(path, "rb");
166                 if (file == NULL) {
167                         if (errno != ENOENT)
168                                 continue;
169                         /*
170                          * Switch to dev_id when dev_port does not exist as
171                          * is the case with Linux kernel versions < 3.15.
172                          */
173 try_dev_id:
174                         match[0] = '\0';
175                         if (dev_type)
176                                 break;
177                         dev_type = 1;
178                         dev_port_prev = ~0u;
179                         rewinddir(dir);
180                         continue;
181                 }
182                 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
183                 fclose(file);
184                 if (r != 1)
185                         continue;
186                 /*
187                  * Switch to dev_id when dev_port returns the same value for
188                  * all ports. May happen when using a MOFED release older than
189                  * 3.0 with a Linux kernel >= 3.15.
190                  */
191                 if (dev_port == dev_port_prev)
192                         goto try_dev_id;
193                 dev_port_prev = dev_port;
194                 if (dev_port == (priv->port - 1u))
195                         snprintf(match, sizeof(match), "%s", name);
196         }
197         closedir(dir);
198         if (match[0] == '\0') {
199                 rte_errno = ENOENT;
200                 return -rte_errno;
201         }
202         strncpy(*ifname, match, sizeof(*ifname));
203         return 0;
204 }
205
206 /**
207  * Perform ifreq ioctl() on associated Ethernet device.
208  *
209  * @param[in] dev
210  *   Pointer to Ethernet device.
211  * @param req
212  *   Request number to pass to ioctl().
213  * @param[out] ifr
214  *   Interface request structure output buffer.
215  *
216  * @return
217  *   0 on success, a negative errno value otherwise and rte_errno is set.
218  */
219 int
220 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
221 {
222         int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
223         int ret = 0;
224
225         if (sock == -1) {
226                 rte_errno = errno;
227                 return -rte_errno;
228         }
229         ret = mlx5_get_ifname(dev, &ifr->ifr_name);
230         if (ret)
231                 goto error;
232         ret = ioctl(sock, req, ifr);
233         if (ret == -1) {
234                 rte_errno = errno;
235                 goto error;
236         }
237         close(sock);
238         return 0;
239 error:
240         close(sock);
241         return -rte_errno;
242 }
243
244 /**
245  * Get device MTU.
246  *
247  * @param dev
248  *   Pointer to Ethernet device.
249  * @param[out] mtu
250  *   MTU value output buffer.
251  *
252  * @return
253  *   0 on success, a negative errno value otherwise and rte_errno is set.
254  */
255 int
256 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
257 {
258         struct ifreq request;
259         int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
260
261         if (ret)
262                 return ret;
263         *mtu = request.ifr_mtu;
264         return 0;
265 }
266
267 /**
268  * Set device MTU.
269  *
270  * @param dev
271  *   Pointer to Ethernet device.
272  * @param mtu
273  *   MTU value to set.
274  *
275  * @return
276  *   0 on success, a negative errno value otherwise and rte_errno is set.
277  */
278 static int
279 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
280 {
281         struct ifreq request = { .ifr_mtu = mtu, };
282
283         return mlx5_ifreq(dev, SIOCSIFMTU, &request);
284 }
285
286 /**
287  * Set device flags.
288  *
289  * @param dev
290  *   Pointer to Ethernet device.
291  * @param keep
292  *   Bitmask for flags that must remain untouched.
293  * @param flags
294  *   Bitmask for flags to modify.
295  *
296  * @return
297  *   0 on success, a negative errno value otherwise and rte_errno is set.
298  */
299 int
300 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
301 {
302         struct ifreq request;
303         int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
304
305         if (ret)
306                 return ret;
307         request.ifr_flags &= keep;
308         request.ifr_flags |= flags & ~keep;
309         return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
310 }
311
312 /**
313  * DPDK callback for Ethernet device configuration.
314  *
315  * @param dev
316  *   Pointer to Ethernet device structure.
317  *
318  * @return
319  *   0 on success, a negative errno value otherwise and rte_errno is set.
320  */
321 int
322 mlx5_dev_configure(struct rte_eth_dev *dev)
323 {
324         struct priv *priv = dev->data->dev_private;
325         unsigned int rxqs_n = dev->data->nb_rx_queues;
326         unsigned int txqs_n = dev->data->nb_tx_queues;
327         unsigned int i;
328         unsigned int j;
329         unsigned int reta_idx_n;
330         const uint8_t use_app_rss_key =
331                 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
332         int ret = 0;
333
334         if (use_app_rss_key &&
335             (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
336              rss_hash_default_key_len)) {
337                 DRV_LOG(ERR, "port %u RSS key len must be %zu Bytes long",
338                         dev->data->port_id, rss_hash_default_key_len);
339                 rte_errno = EINVAL;
340                 return -rte_errno;
341         }
342         priv->rss_conf.rss_key =
343                 rte_realloc(priv->rss_conf.rss_key,
344                             rss_hash_default_key_len, 0);
345         if (!priv->rss_conf.rss_key) {
346                 DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)",
347                         dev->data->port_id, rxqs_n);
348                 rte_errno = ENOMEM;
349                 return -rte_errno;
350         }
351         memcpy(priv->rss_conf.rss_key,
352                use_app_rss_key ?
353                dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
354                rss_hash_default_key,
355                rss_hash_default_key_len);
356         priv->rss_conf.rss_key_len = rss_hash_default_key_len;
357         priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
358         priv->rxqs = (void *)dev->data->rx_queues;
359         priv->txqs = (void *)dev->data->tx_queues;
360         if (txqs_n != priv->txqs_n) {
361                 DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u",
362                         dev->data->port_id, priv->txqs_n, txqs_n);
363                 priv->txqs_n = txqs_n;
364         }
365         if (rxqs_n > priv->ind_table_max_size) {
366                 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
367                         dev->data->port_id, rxqs_n);
368                 rte_errno = EINVAL;
369                 return -rte_errno;
370         }
371         if (rxqs_n == priv->rxqs_n)
372                 return 0;
373         DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
374                 dev->data->port_id, priv->rxqs_n, rxqs_n);
375         priv->rxqs_n = rxqs_n;
376         /* If the requested number of RX queues is not a power of two, use the
377          * maximum indirection table size for better balancing.
378          * The result is always rounded to the next power of two. */
379         reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
380                                      priv->ind_table_max_size :
381                                      rxqs_n));
382         ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
383         if (ret)
384                 return ret;
385         /* When the number of RX queues is not a power of two, the remaining
386          * table entries are padded with reused WQs and hashes are not spread
387          * uniformly. */
388         for (i = 0, j = 0; (i != reta_idx_n); ++i) {
389                 (*priv->reta_idx)[i] = j;
390                 if (++j == rxqs_n)
391                         j = 0;
392         }
393         return 0;
394 }
395
396 /**
397  * DPDK callback to get information about the device.
398  *
399  * @param dev
400  *   Pointer to Ethernet device structure.
401  * @param[out] info
402  *   Info structure output buffer.
403  */
404 void
405 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
406 {
407         struct priv *priv = dev->data->dev_private;
408         unsigned int max;
409         char ifname[IF_NAMESIZE];
410
411         info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
412         /* FIXME: we should ask the device for these values. */
413         info->min_rx_bufsize = 32;
414         info->max_rx_pktlen = 65536;
415         /*
416          * Since we need one CQ per QP, the limit is the minimum number
417          * between the two values.
418          */
419         max = RTE_MIN(priv->device_attr.orig_attr.max_cq,
420                       priv->device_attr.orig_attr.max_qp);
421         /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
422         if (max >= 65535)
423                 max = 65535;
424         info->max_rx_queues = max;
425         info->max_tx_queues = max;
426         info->max_mac_addrs = RTE_DIM(priv->mac);
427         info->rx_offload_capa =
428                 (priv->hw_csum ?
429                  (DEV_RX_OFFLOAD_IPV4_CKSUM |
430                   DEV_RX_OFFLOAD_UDP_CKSUM |
431                   DEV_RX_OFFLOAD_TCP_CKSUM) :
432                  0) |
433                 (priv->hw_vlan_strip ? DEV_RX_OFFLOAD_VLAN_STRIP : 0) |
434                 DEV_RX_OFFLOAD_TIMESTAMP;
435
436         if (!priv->mps)
437                 info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
438         if (priv->hw_csum)
439                 info->tx_offload_capa |=
440                         (DEV_TX_OFFLOAD_IPV4_CKSUM |
441                          DEV_TX_OFFLOAD_UDP_CKSUM |
442                          DEV_TX_OFFLOAD_TCP_CKSUM);
443         if (priv->tso)
444                 info->tx_offload_capa |= DEV_TX_OFFLOAD_TCP_TSO;
445         if (priv->tunnel_en)
446                 info->tx_offload_capa |= (DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM |
447                                           DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
448                                           DEV_TX_OFFLOAD_GRE_TNL_TSO);
449         if (mlx5_get_ifname(dev, &ifname) == 0)
450                 info->if_index = if_nametoindex(ifname);
451         info->reta_size = priv->reta_idx_n ?
452                 priv->reta_idx_n : priv->ind_table_max_size;
453         info->hash_key_size = rss_hash_default_key_len;
454         info->speed_capa = priv->link_speed_capa;
455         info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
456 }
457
458 /**
459  * Get supported packet types.
460  *
461  * @param dev
462  *   Pointer to Ethernet device structure.
463  *
464  * @return
465  *   A pointer to the supported Packet types array.
466  */
467 const uint32_t *
468 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
469 {
470         static const uint32_t ptypes[] = {
471                 /* refers to rxq_cq_to_pkt_type() */
472                 RTE_PTYPE_L2_ETHER,
473                 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
474                 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
475                 RTE_PTYPE_L4_NONFRAG,
476                 RTE_PTYPE_L4_FRAG,
477                 RTE_PTYPE_L4_TCP,
478                 RTE_PTYPE_L4_UDP,
479                 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
480                 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
481                 RTE_PTYPE_INNER_L4_NONFRAG,
482                 RTE_PTYPE_INNER_L4_FRAG,
483                 RTE_PTYPE_INNER_L4_TCP,
484                 RTE_PTYPE_INNER_L4_UDP,
485                 RTE_PTYPE_UNKNOWN
486         };
487
488         if (dev->rx_pkt_burst == mlx5_rx_burst ||
489             dev->rx_pkt_burst == mlx5_rx_burst_vec)
490                 return ptypes;
491         return NULL;
492 }
493
494 /**
495  * DPDK callback to retrieve physical link information.
496  *
497  * @param dev
498  *   Pointer to Ethernet device structure.
499  * @param[out] link
500  *   Storage for current link status.
501  *
502  * @return
503  *   0 on success, a negative errno value otherwise and rte_errno is set.
504  */
505 static int
506 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
507                                struct rte_eth_link *link)
508 {
509         struct priv *priv = dev->data->dev_private;
510         struct ethtool_cmd edata = {
511                 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
512         };
513         struct ifreq ifr;
514         struct rte_eth_link dev_link;
515         int link_speed = 0;
516         int ret;
517
518         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
519         if (ret) {
520                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
521                         dev->data->port_id, strerror(rte_errno));
522                 return ret;
523         }
524         memset(&dev_link, 0, sizeof(dev_link));
525         dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
526                                 (ifr.ifr_flags & IFF_RUNNING));
527         ifr.ifr_data = (void *)&edata;
528         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
529         if (ret) {
530                 DRV_LOG(WARNING,
531                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
532                         dev->data->port_id, strerror(rte_errno));
533                 return ret;
534         }
535         link_speed = ethtool_cmd_speed(&edata);
536         if (link_speed == -1)
537                 dev_link.link_speed = 0;
538         else
539                 dev_link.link_speed = link_speed;
540         priv->link_speed_capa = 0;
541         if (edata.supported & SUPPORTED_Autoneg)
542                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
543         if (edata.supported & (SUPPORTED_1000baseT_Full |
544                                SUPPORTED_1000baseKX_Full))
545                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
546         if (edata.supported & SUPPORTED_10000baseKR_Full)
547                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
548         if (edata.supported & (SUPPORTED_40000baseKR4_Full |
549                                SUPPORTED_40000baseCR4_Full |
550                                SUPPORTED_40000baseSR4_Full |
551                                SUPPORTED_40000baseLR4_Full))
552                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
553         dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
554                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
555         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
556                         ETH_LINK_SPEED_FIXED);
557         if ((dev_link.link_speed && !dev_link.link_status) ||
558             (!dev_link.link_speed && dev_link.link_status)) {
559                 rte_errno = EAGAIN;
560                 return -rte_errno;
561         }
562         *link = dev_link;
563         return 0;
564 }
565
566 /**
567  * Retrieve physical link information (unlocked version using new ioctl).
568  *
569  * @param dev
570  *   Pointer to Ethernet device structure.
571  * @param[out] link
572  *   Storage for current link status.
573  *
574  * @return
575  *   0 on success, a negative errno value otherwise and rte_errno is set.
576  */
577 static int
578 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
579                              struct rte_eth_link *link)
580
581 {
582         struct priv *priv = dev->data->dev_private;
583         struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
584         struct ifreq ifr;
585         struct rte_eth_link dev_link;
586         uint64_t sc;
587         int ret;
588
589         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
590         if (ret) {
591                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
592                         dev->data->port_id, strerror(rte_errno));
593                 return ret;
594         }
595         memset(&dev_link, 0, sizeof(dev_link));
596         dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
597                                 (ifr.ifr_flags & IFF_RUNNING));
598         ifr.ifr_data = (void *)&gcmd;
599         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
600         if (ret) {
601                 DRV_LOG(DEBUG,
602                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
603                         " failed: %s",
604                         dev->data->port_id, strerror(rte_errno));
605                 return ret;
606         }
607         gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
608
609         alignas(struct ethtool_link_settings)
610         uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
611                      sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
612         struct ethtool_link_settings *ecmd = (void *)data;
613
614         *ecmd = gcmd;
615         ifr.ifr_data = (void *)ecmd;
616         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
617         if (ret) {
618                 DRV_LOG(DEBUG,
619                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
620                         " failed: %s",
621                         dev->data->port_id, strerror(rte_errno));
622                 return ret;
623         }
624         dev_link.link_speed = ecmd->speed;
625         sc = ecmd->link_mode_masks[0] |
626                 ((uint64_t)ecmd->link_mode_masks[1] << 32);
627         priv->link_speed_capa = 0;
628         if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
629                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
630         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
631                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
632                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
633         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
634                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
635                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
636                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
637         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
638                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
639                 priv->link_speed_capa |= ETH_LINK_SPEED_20G;
640         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
641                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
642                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
643                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
644                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
645         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
646                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
647                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
648                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
649                 priv->link_speed_capa |= ETH_LINK_SPEED_56G;
650         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
651                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
652                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
653                 priv->link_speed_capa |= ETH_LINK_SPEED_25G;
654         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
655                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
656                 priv->link_speed_capa |= ETH_LINK_SPEED_50G;
657         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
658                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
659                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
660                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
661                 priv->link_speed_capa |= ETH_LINK_SPEED_100G;
662         dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
663                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
664         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
665                                   ETH_LINK_SPEED_FIXED);
666         if ((dev_link.link_speed && !dev_link.link_status) ||
667             (!dev_link.link_speed && dev_link.link_status)) {
668                 rte_errno = EAGAIN;
669                 return -rte_errno;
670         }
671         *link = dev_link;
672         return 0;
673 }
674
675 /**
676  * DPDK callback to retrieve physical link information.
677  *
678  * @param dev
679  *   Pointer to Ethernet device structure.
680  * @param wait_to_complete
681  *   Wait for request completion.
682  *
683  * @return
684  *   0 if link status was not updated, positive if it was, a negative errno
685  *   value otherwise and rte_errno is set.
686  */
687 int
688 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
689 {
690         int ret;
691         struct rte_eth_link dev_link;
692         time_t start_time = time(NULL);
693
694         do {
695                 ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
696                 if (ret)
697                         ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
698                 if (ret == 0)
699                         break;
700                 /* Handle wait to complete situation. */
701                 if (wait_to_complete && ret == -EAGAIN) {
702                         if (abs((int)difftime(time(NULL), start_time)) <
703                             MLX5_LINK_STATUS_TIMEOUT) {
704                                 usleep(0);
705                                 continue;
706                         } else {
707                                 rte_errno = EBUSY;
708                                 return -rte_errno;
709                         }
710                 } else if (ret < 0) {
711                         return ret;
712                 }
713         } while (wait_to_complete);
714         ret = !!memcmp(&dev->data->dev_link, &dev_link,
715                        sizeof(struct rte_eth_link));
716         dev->data->dev_link = dev_link;
717         return ret;
718 }
719
720 /**
721  * DPDK callback to change the MTU.
722  *
723  * @param dev
724  *   Pointer to Ethernet device structure.
725  * @param in_mtu
726  *   New MTU.
727  *
728  * @return
729  *   0 on success, a negative errno value otherwise and rte_errno is set.
730  */
731 int
732 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
733 {
734         struct priv *priv = dev->data->dev_private;
735         uint16_t kern_mtu = 0;
736         int ret;
737
738         ret = mlx5_get_mtu(dev, &kern_mtu);
739         if (ret)
740                 return ret;
741         /* Set kernel interface MTU first. */
742         ret = mlx5_set_mtu(dev, mtu);
743         if (ret)
744                 return ret;
745         ret = mlx5_get_mtu(dev, &kern_mtu);
746         if (ret)
747                 return ret;
748         if (kern_mtu == mtu) {
749                 priv->mtu = mtu;
750                 DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
751                         dev->data->port_id, mtu);
752                 return 0;
753         }
754         rte_errno = EAGAIN;
755         return -rte_errno;
756 }
757
758 /**
759  * DPDK callback to get flow control status.
760  *
761  * @param dev
762  *   Pointer to Ethernet device structure.
763  * @param[out] fc_conf
764  *   Flow control output buffer.
765  *
766  * @return
767  *   0 on success, a negative errno value otherwise and rte_errno is set.
768  */
769 int
770 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
771 {
772         struct ifreq ifr;
773         struct ethtool_pauseparam ethpause = {
774                 .cmd = ETHTOOL_GPAUSEPARAM
775         };
776         int ret;
777
778         ifr.ifr_data = (void *)&ethpause;
779         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
780         if (ret) {
781                 DRV_LOG(WARNING,
782                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
783                         " %s",
784                         dev->data->port_id, strerror(rte_errno));
785                 return ret;
786         }
787         fc_conf->autoneg = ethpause.autoneg;
788         if (ethpause.rx_pause && ethpause.tx_pause)
789                 fc_conf->mode = RTE_FC_FULL;
790         else if (ethpause.rx_pause)
791                 fc_conf->mode = RTE_FC_RX_PAUSE;
792         else if (ethpause.tx_pause)
793                 fc_conf->mode = RTE_FC_TX_PAUSE;
794         else
795                 fc_conf->mode = RTE_FC_NONE;
796         return 0;
797 }
798
799 /**
800  * DPDK callback to modify flow control parameters.
801  *
802  * @param dev
803  *   Pointer to Ethernet device structure.
804  * @param[in] fc_conf
805  *   Flow control parameters.
806  *
807  * @return
808  *   0 on success, a negative errno value otherwise and rte_errno is set.
809  */
810 int
811 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
812 {
813         struct ifreq ifr;
814         struct ethtool_pauseparam ethpause = {
815                 .cmd = ETHTOOL_SPAUSEPARAM
816         };
817         int ret;
818
819         ifr.ifr_data = (void *)&ethpause;
820         ethpause.autoneg = fc_conf->autoneg;
821         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
822             (fc_conf->mode & RTE_FC_RX_PAUSE))
823                 ethpause.rx_pause = 1;
824         else
825                 ethpause.rx_pause = 0;
826
827         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
828             (fc_conf->mode & RTE_FC_TX_PAUSE))
829                 ethpause.tx_pause = 1;
830         else
831                 ethpause.tx_pause = 0;
832         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
833         if (ret) {
834                 DRV_LOG(WARNING,
835                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
836                         " failed: %s",
837                         dev->data->port_id, strerror(rte_errno));
838                 return ret;
839         }
840         return 0;
841 }
842
843 /**
844  * Get PCI information from struct ibv_device.
845  *
846  * @param device
847  *   Pointer to Ethernet device structure.
848  * @param[out] pci_addr
849  *   PCI bus address output buffer.
850  *
851  * @return
852  *   0 on success, a negative errno value otherwise and rte_errno is set.
853  */
854 int
855 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
856                             struct rte_pci_addr *pci_addr)
857 {
858         FILE *file;
859         char line[32];
860         MKSTR(path, "%s/device/uevent", device->ibdev_path);
861
862         file = fopen(path, "rb");
863         if (file == NULL) {
864                 rte_errno = errno;
865                 return -rte_errno;
866         }
867         while (fgets(line, sizeof(line), file) == line) {
868                 size_t len = strlen(line);
869                 int ret;
870
871                 /* Truncate long lines. */
872                 if (len == (sizeof(line) - 1))
873                         while (line[(len - 1)] != '\n') {
874                                 ret = fgetc(file);
875                                 if (ret == EOF)
876                                         break;
877                                 line[(len - 1)] = ret;
878                         }
879                 /* Extract information. */
880                 if (sscanf(line,
881                            "PCI_SLOT_NAME="
882                            "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
883                            &pci_addr->domain,
884                            &pci_addr->bus,
885                            &pci_addr->devid,
886                            &pci_addr->function) == 4) {
887                         ret = 0;
888                         break;
889                 }
890         }
891         fclose(file);
892         return 0;
893 }
894
895 /**
896  * Device status handler.
897  *
898  * @param dev
899  *   Pointer to Ethernet device.
900  * @param events
901  *   Pointer to event flags holder.
902  *
903  * @return
904  *   Events bitmap of callback process which can be called immediately.
905  */
906 static uint32_t
907 mlx5_dev_status_handler(struct rte_eth_dev *dev)
908 {
909         struct priv *priv = dev->data->dev_private;
910         struct ibv_async_event event;
911         uint32_t ret = 0;
912
913         if (mlx5_link_update(dev, 0) == -EAGAIN) {
914                 usleep(0);
915                 return 0;
916         }
917         /* Read all message and acknowledge them. */
918         for (;;) {
919                 if (ibv_get_async_event(priv->ctx, &event))
920                         break;
921                 if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
922                         event.event_type == IBV_EVENT_PORT_ERR) &&
923                         (dev->data->dev_conf.intr_conf.lsc == 1))
924                         ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
925                 else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
926                         dev->data->dev_conf.intr_conf.rmv == 1)
927                         ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
928                 else
929                         DRV_LOG(DEBUG,
930                                 "port %u event type %d on not handled",
931                                 dev->data->port_id, event.event_type);
932                 ibv_ack_async_event(&event);
933         }
934         return ret;
935 }
936
937 /**
938  * Handle interrupts from the NIC.
939  *
940  * @param[in] intr_handle
941  *   Interrupt handler.
942  * @param cb_arg
943  *   Callback argument.
944  */
945 void
946 mlx5_dev_interrupt_handler(void *cb_arg)
947 {
948         struct rte_eth_dev *dev = cb_arg;
949         uint32_t events;
950
951         events = mlx5_dev_status_handler(dev);
952         if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
953                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
954                                               NULL);
955         if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
956                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL,
957                                               NULL);
958 }
959
/**
 * Handle interrupts from the socket.
 *
 * Thin adapter matching the interrupt callback prototype: forwards the
 * device to mlx5_socket_handle().
 *
 * @param cb_arg
 *   Callback argument, the Ethernet device owning the socket.
 */
static void
mlx5_dev_handler_socket(void *cb_arg)
{
	mlx5_socket_handle((struct rte_eth_dev *)cb_arg);
}
973
974 /**
975  * Uninstall interrupt handler.
976  *
977  * @param dev
978  *   Pointer to Ethernet device.
979  */
980 void
981 mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev)
982 {
983         struct priv *priv = dev->data->dev_private;
984
985         if (dev->data->dev_conf.intr_conf.lsc ||
986             dev->data->dev_conf.intr_conf.rmv)
987                 rte_intr_callback_unregister(&priv->intr_handle,
988                                              mlx5_dev_interrupt_handler, dev);
989         if (priv->primary_socket)
990                 rte_intr_callback_unregister(&priv->intr_handle_socket,
991                                              mlx5_dev_handler_socket, dev);
992         priv->intr_handle.fd = 0;
993         priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
994         priv->intr_handle_socket.fd = 0;
995         priv->intr_handle_socket.type = RTE_INTR_HANDLE_UNKNOWN;
996 }
997
998 /**
999  * Install interrupt handler.
1000  *
1001  * @param dev
1002  *   Pointer to Ethernet device.
1003  */
1004 void
1005 mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev)
1006 {
1007         struct priv *priv = dev->data->dev_private;
1008         int ret;
1009         int flags;
1010
1011         assert(priv->ctx->async_fd > 0);
1012         flags = fcntl(priv->ctx->async_fd, F_GETFL);
1013         ret = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
1014         if (ret) {
1015                 DRV_LOG(INFO,
1016                         "port %u failed to change file descriptor async event"
1017                         " queue",
1018                         dev->data->port_id);
1019                 dev->data->dev_conf.intr_conf.lsc = 0;
1020                 dev->data->dev_conf.intr_conf.rmv = 0;
1021         }
1022         if (dev->data->dev_conf.intr_conf.lsc ||
1023             dev->data->dev_conf.intr_conf.rmv) {
1024                 priv->intr_handle.fd = priv->ctx->async_fd;
1025                 priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
1026                 rte_intr_callback_register(&priv->intr_handle,
1027                                            mlx5_dev_interrupt_handler, dev);
1028         }
1029         ret = mlx5_socket_init(dev);
1030         if (ret)
1031                 DRV_LOG(ERR, "port %u cannot initialise socket: %s",
1032                         dev->data->port_id, strerror(rte_errno));
1033         else if (priv->primary_socket) {
1034                 priv->intr_handle_socket.fd = priv->primary_socket;
1035                 priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT;
1036                 rte_intr_callback_register(&priv->intr_handle_socket,
1037                                            mlx5_dev_handler_socket, dev);
1038         }
1039 }
1040
1041 /**
1042  * DPDK callback to bring the link DOWN.
1043  *
1044  * @param dev
1045  *   Pointer to Ethernet device structure.
1046  *
1047  * @return
1048  *   0 on success, a negative errno value otherwise and rte_errno is set.
1049  */
1050 int
1051 mlx5_set_link_down(struct rte_eth_dev *dev)
1052 {
1053         return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
1054 }
1055
1056 /**
1057  * DPDK callback to bring the link UP.
1058  *
1059  * @param dev
1060  *   Pointer to Ethernet device structure.
1061  *
1062  * @return
1063  *   0 on success, a negative errno value otherwise and rte_errno is set.
1064  */
1065 int
1066 mlx5_set_link_up(struct rte_eth_dev *dev)
1067 {
1068         return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
1069 }
1070
1071 /**
1072  * Configure the TX function to use.
1073  *
1074  * @param dev
1075  *   Pointer to rte_eth_dev structure.
1076  *
1077  * @return
1078  *   Pointer to selected Tx burst function.
1079  */
1080 eth_tx_burst_t
1081 mlx5_select_tx_function(struct rte_eth_dev *dev)
1082 {
1083         struct priv *priv = dev->data->dev_private;
1084         eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst;
1085
1086         /* Select appropriate TX function. */
1087         if (priv->mps == MLX5_MPW_ENHANCED) {
1088                 if (mlx5_check_vec_tx_support(dev) > 0) {
1089                         if (mlx5_check_raw_vec_tx_support(dev) > 0)
1090                                 tx_pkt_burst = mlx5_tx_burst_raw_vec;
1091                         else
1092                                 tx_pkt_burst = mlx5_tx_burst_vec;
1093                         DRV_LOG(DEBUG,
1094                                 "port %u selected enhanced MPW Tx vectorized"
1095                                 " function",
1096                                 dev->data->port_id);
1097                 } else {
1098                         tx_pkt_burst = mlx5_tx_burst_empw;
1099                         DRV_LOG(DEBUG,
1100                                 "port %u selected enhanced MPW Tx function",
1101                                 dev->data->port_id);
1102                 }
1103         } else if (priv->mps && priv->txq_inline) {
1104                 tx_pkt_burst = mlx5_tx_burst_mpw_inline;
1105                 DRV_LOG(DEBUG, "port %u selected MPW inline Tx function",
1106                         dev->data->port_id);
1107         } else if (priv->mps) {
1108                 tx_pkt_burst = mlx5_tx_burst_mpw;
1109                 DRV_LOG(DEBUG, "port %u selected MPW Tx function",
1110                         dev->data->port_id);
1111         }
1112         return tx_pkt_burst;
1113 }
1114
1115 /**
1116  * Configure the RX function to use.
1117  *
1118  * @param dev
1119  *   Pointer to rte_eth_dev structure.
1120  *
1121  * @return
1122  *   Pointer to selected Rx burst function.
1123  */
1124 eth_rx_burst_t
1125 mlx5_select_rx_function(struct rte_eth_dev *dev)
1126 {
1127         eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
1128
1129         assert(dev != NULL);
1130         if (mlx5_check_vec_rx_support(dev) > 0) {
1131                 rx_pkt_burst = mlx5_rx_burst_vec;
1132                 DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
1133                         dev->data->port_id);
1134         }
1135         return rx_pkt_burst;
1136 }