/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2016-2017 Intel Corporation
 */

#include <rte_atomic.h>
#include <rte_branch_prediction.h>
#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_mbuf.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_vdev.h>
#include <rte_malloc.h>
#include <rte_bus_vdev.h>
#include <rte_kvargs.h>
#include <rte_net.h>
#include <rte_debug.h>
#include <rte_ip.h>
#include <rte_string_fns.h>
#include <rte_ethdev.h>
#include <rte_errno.h>

#include <assert.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/utsname.h>
#include <sys/mman.h>
#include <errno.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <sys/uio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <linux/if_tun.h>
#include <linux/if_ether.h>
#include <fcntl.h>

#include <tap_rss.h>
#include <rte_eth_tap.h>
#include <tap_flow.h>
#include <tap_netlink.h>
#include <tap_tcmsgs.h>

/* Linux based path to the TUN device */
#define TUN_TAP_DEV_PATH        "/dev/net/tun"
#define DEFAULT_TAP_NAME        "dtap"
#define DEFAULT_TUN_NAME        "dtun"

#define ETH_TAP_IFACE_ARG       "iface"
#define ETH_TAP_REMOTE_ARG      "remote"
#define ETH_TAP_MAC_ARG         "mac"
#define ETH_TAP_MAC_FIXED       "fixed"

#define ETH_TAP_USR_MAC_FMT     "xx:xx:xx:xx:xx:xx"
#define ETH_TAP_CMP_MAC_FMT     "0123456789ABCDEFabcdef"
#define ETH_TAP_MAC_ARG_FMT     ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT
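
/*
 * Example vdev arguments matching the kvargs declared above (a sketch;
 * the device and interface names here are illustrative):
 *   --vdev=net_tap0,iface=tap0,mac=fixed
 *   --vdev=net_tap1,iface=tap1,remote=eth0,mac=00:64:74:61:70:01
 */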

#define TAP_GSO_MBUFS_PER_CORE  128
#define TAP_GSO_MBUF_SEG_SIZE   128
#define TAP_GSO_MBUF_CACHE_SIZE 4
#define TAP_GSO_MBUFS_NUM \
        (TAP_GSO_MBUFS_PER_CORE * TAP_GSO_MBUF_CACHE_SIZE)

/* IPC key for queue fds sync */
#define TAP_MP_KEY "tap_mp_sync_queues"

static int tap_devices_count;
static struct rte_vdev_driver pmd_tap_drv;
static struct rte_vdev_driver pmd_tun_drv;

static const char *valid_arguments[] = {
        ETH_TAP_IFACE_ARG,
        ETH_TAP_REMOTE_ARG,
        ETH_TAP_MAC_ARG,
        NULL
};

static unsigned int tap_unit;
static unsigned int tun_unit;

static char tuntap_name[8];

static volatile uint32_t tap_trigger;   /* Rx trigger */

static struct rte_eth_link pmd_link = {
        .link_speed = ETH_SPEED_NUM_10G,
        .link_duplex = ETH_LINK_FULL_DUPLEX,
        .link_status = ETH_LINK_DOWN,
        .link_autoneg = ETH_LINK_FIXED,
};

static void
tap_trigger_cb(int sig __rte_unused)
{
        /* Valid trigger values are nonzero */
        tap_trigger = (tap_trigger + 1) | 0x80000000;
}
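
/*
 * Note on the Rx trigger: tun_alloc() arms SIGIO (O_ASYNC) on each queue
 * fd, the handler above bumps tap_trigger, and pmd_rx_burst() skips its
 * readv() when no new SIGIO has been seen since the queue's last poll.
 */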

/* Specifies on what netdevices the ioctl should be applied */
enum ioctl_mode {
        LOCAL_AND_REMOTE,
        LOCAL_ONLY,
        REMOTE_ONLY,
};

/* Message header to synchronize queues via IPC */
struct ipc_queues {
        char port_name[RTE_DEV_NAME_MAX_LEN];
        int rxq_count;
        int txq_count;
        /*
         * The file descriptors are passed as ancillary data in the
         * dedicated part of the Unix-domain message and are translated
         * by the kernel.
         */
};

static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);

/**
 * Tun/Tap allocation routine
 *
 * @param[in] pmd
 *   Pointer to private structure.
 *
 * @param[in] is_keepalive
 *   Keepalive flag
 *
 * @return
 *   -1 on failure, fd on success
 */
static int
tun_alloc(struct pmd_internals *pmd, int is_keepalive)
{
        struct ifreq ifr;
#ifdef IFF_MULTI_QUEUE
        unsigned int features;
#endif
        int fd;

        memset(&ifr, 0, sizeof(struct ifreq));

        /*
         * Do not set IFF_NO_PI as packet information header will be needed
         * to check if a received packet has been truncated.
         */
        ifr.ifr_flags = (pmd->type == ETH_TUNTAP_TYPE_TAP) ?
                IFF_TAP : IFF_TUN | IFF_POINTOPOINT;
        snprintf(ifr.ifr_name, IFNAMSIZ, "%s", pmd->name);

        TAP_LOG(DEBUG, "ifr_name '%s'", ifr.ifr_name);

        fd = open(TUN_TAP_DEV_PATH, O_RDWR);
        if (fd < 0) {
                TAP_LOG(ERR, "Unable to create %s interface", tuntap_name);
                goto error;
        }

#ifdef IFF_MULTI_QUEUE
        /* Grab the TUN features to verify we can work multi-queue */
        if (ioctl(fd, TUNGETFEATURES, &features) < 0) {
                TAP_LOG(ERR, "%s unable to get TUN/TAP features",
                        tuntap_name);
                goto error;
        }
        TAP_LOG(DEBUG, "%s Features %08x", tuntap_name, features);

        if (features & IFF_MULTI_QUEUE) {
                TAP_LOG(DEBUG, "  Multi-queue support for %d queues",
                        RTE_PMD_TAP_MAX_QUEUES);
                ifr.ifr_flags |= IFF_MULTI_QUEUE;
        } else
#endif
        {
                ifr.ifr_flags |= IFF_ONE_QUEUE;
                TAP_LOG(DEBUG, "  Single queue support only");
        }

        /* Set the TUN/TAP configuration and set the name if needed */
        if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) {
                TAP_LOG(WARNING, "Unable to set TUNSETIFF for %s: %s",
                        ifr.ifr_name, strerror(errno));
                goto error;
        }

        if (is_keepalive) {
                /*
                 * Detach the TUN/TAP keep-alive queue
                 * to avoid traffic through it
                 */
                ifr.ifr_flags = IFF_DETACH_QUEUE;
                if (ioctl(fd, TUNSETQUEUE, (void *)&ifr) < 0) {
                        TAP_LOG(WARNING,
                                "Unable to detach keep-alive queue for %s: %s",
                                ifr.ifr_name, strerror(errno));
                        goto error;
                }
        }

        /* Always set the file descriptor to non-blocking */
        if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
                TAP_LOG(WARNING,
                        "Unable to set %s to nonblocking: %s",
                        ifr.ifr_name, strerror(errno));
                goto error;
        }

        /* Set up trigger to optimize empty Rx bursts */
        errno = 0;
        do {
                struct sigaction sa;
                int flags = fcntl(fd, F_GETFL);

                if (flags == -1 || sigaction(SIGIO, NULL, &sa) == -1)
                        break;
                if (sa.sa_handler != tap_trigger_cb) {
                        /*
                         * Make sure SIGIO is not already taken. This is done
                         * as late as possible to leave the application a
                         * chance to set up its own signal handler first.
                         */
                        if (sa.sa_handler != SIG_IGN &&
                            sa.sa_handler != SIG_DFL) {
                                errno = EBUSY;
                                break;
                        }
                        sa = (struct sigaction){
                                .sa_flags = SA_RESTART,
                                .sa_handler = tap_trigger_cb,
                        };
                        if (sigaction(SIGIO, &sa, NULL) == -1)
                                break;
                }
                /* Enable SIGIO on file descriptor */
                fcntl(fd, F_SETFL, flags | O_ASYNC);
                fcntl(fd, F_SETOWN, getpid());
        } while (0);

        if (errno) {
                /* Disable trigger globally in case of error */
                tap_trigger = 0;
                TAP_LOG(WARNING, "Rx trigger disabled: %s",
                        strerror(errno));
        }

        return fd;

error:
        if (fd > 0)
                close(fd);
        return -1;
}

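/*
 * Validate the L3/L4 checksums of a received packet and set the
 * corresponding PKT_RX_*_CKSUM_GOOD/BAD mbuf offload flags.
 */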
static void
tap_verify_csum(struct rte_mbuf *mbuf)
{
        uint32_t l2 = mbuf->packet_type & RTE_PTYPE_L2_MASK;
        uint32_t l3 = mbuf->packet_type & RTE_PTYPE_L3_MASK;
        uint32_t l4 = mbuf->packet_type & RTE_PTYPE_L4_MASK;
        unsigned int l2_len = sizeof(struct ether_hdr);
        unsigned int l3_len;
        uint16_t cksum = 0;
        void *l3_hdr;
        void *l4_hdr;

        if (l2 == RTE_PTYPE_L2_ETHER_VLAN)
                l2_len += 4;
        else if (l2 == RTE_PTYPE_L2_ETHER_QINQ)
                l2_len += 8;
        /* Don't verify checksum for packets with discontinuous L2 header */
        if (unlikely(l2_len + sizeof(struct ipv4_hdr) >
                     rte_pktmbuf_data_len(mbuf)))
                return;
        l3_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len);
        if (l3 == RTE_PTYPE_L3_IPV4 || l3 == RTE_PTYPE_L3_IPV4_EXT) {
                struct ipv4_hdr *iph = l3_hdr;

                /* ihl contains the number of 4-byte words in the header */
                l3_len = 4 * (iph->version_ihl & 0xf);
                if (unlikely(l2_len + l3_len > rte_pktmbuf_data_len(mbuf)))
                        return;

                cksum = ~rte_raw_cksum(iph, l3_len);
                mbuf->ol_flags |= cksum ?
                        PKT_RX_IP_CKSUM_BAD :
                        PKT_RX_IP_CKSUM_GOOD;
        } else if (l3 == RTE_PTYPE_L3_IPV6) {
                l3_len = sizeof(struct ipv6_hdr);
        } else {
                /* IPv6 extensions are not supported */
                return;
        }
        if (l4 == RTE_PTYPE_L4_UDP || l4 == RTE_PTYPE_L4_TCP) {
                l4_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len + l3_len);
                /* Don't verify checksum for multi-segment packets. */
                if (mbuf->nb_segs > 1)
                        return;
                if (l3 == RTE_PTYPE_L3_IPV4)
                        cksum = ~rte_ipv4_udptcp_cksum(l3_hdr, l4_hdr);
                else if (l3 == RTE_PTYPE_L3_IPV6)
                        cksum = ~rte_ipv6_udptcp_cksum(l3_hdr, l4_hdr);
                mbuf->ol_flags |= cksum ?
                        PKT_RX_L4_CKSUM_BAD :
                        PKT_RX_L4_CKSUM_GOOD;
        }
}

static uint64_t
tap_rx_offload_get_port_capa(void)
{
        /*
         * No specific port Rx offload capabilities.
         */
        return 0;
}

static uint64_t
tap_rx_offload_get_queue_capa(void)
{
        return DEV_RX_OFFLOAD_SCATTER |
               DEV_RX_OFFLOAD_IPV4_CKSUM |
               DEV_RX_OFFLOAD_UDP_CKSUM |
               DEV_RX_OFFLOAD_TCP_CKSUM;
}

/* Callback to handle an Rx burst of packets from the correct interface
 * and file descriptor(s) in a multi-queue setup.
 */
static uint16_t
pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct rx_queue *rxq = queue;
        struct pmd_process_private *process_private;
        uint16_t num_rx;
        unsigned long num_rx_bytes = 0;
        uint32_t trigger = tap_trigger;

        if (trigger == rxq->trigger_seen)
                return 0;
        if (trigger)
                rxq->trigger_seen = trigger;
        process_private = rte_eth_devices[rxq->in_port].process_private;
        rte_compiler_barrier();
        for (num_rx = 0; num_rx < nb_pkts; ) {
                struct rte_mbuf *mbuf = rxq->pool;
                struct rte_mbuf *seg = NULL;
                struct rte_mbuf *new_tail = NULL;
                uint16_t data_off = rte_pktmbuf_headroom(mbuf);
                int len;

                len = readv(process_private->rxq_fds[rxq->queue_id],
                        *rxq->iovecs,
                        1 + (rxq->rxmode->offloads & DEV_RX_OFFLOAD_SCATTER ?
                             rxq->nb_rx_desc : 1));
                if (len < (int)sizeof(struct tun_pi))
                        break;

                /* Packet couldn't fit in the provided mbuf */
                if (unlikely(rxq->pi.flags & TUN_PKT_STRIP)) {
                        rxq->stats.ierrors++;
                        continue;
                }

                len -= sizeof(struct tun_pi);

                mbuf->pkt_len = len;
                mbuf->port = rxq->in_port;
                while (1) {
                        struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);

                        if (unlikely(!buf)) {
                                rxq->stats.rx_nombuf++;
                                /* No new buf has been allocated: do nothing */
                                if (!new_tail || !seg)
                                        goto end;

                                seg->next = NULL;
                                rte_pktmbuf_free(mbuf);

                                goto end;
                        }
                        seg = seg ? seg->next : mbuf;
                        if (rxq->pool == mbuf)
                                rxq->pool = buf;
                        if (new_tail)
                                new_tail->next = buf;
                        new_tail = buf;
                        new_tail->next = seg->next;

                        /* iovecs[0] is reserved for packet info (pi) */
                        (*rxq->iovecs)[mbuf->nb_segs].iov_len =
                                buf->buf_len - data_off;
                        (*rxq->iovecs)[mbuf->nb_segs].iov_base =
                                (char *)buf->buf_addr + data_off;

                        seg->data_len = RTE_MIN(seg->buf_len - data_off, len);
                        seg->data_off = data_off;

                        len -= seg->data_len;
                        if (len <= 0)
                                break;
                        mbuf->nb_segs++;
                        /* First segment has headroom, not the others */
                        data_off = 0;
                }
                seg->next = NULL;
                mbuf->packet_type = rte_net_get_ptype(mbuf, NULL,
                                                      RTE_PTYPE_ALL_MASK);
                if (rxq->rxmode->offloads & DEV_RX_OFFLOAD_CHECKSUM)
                        tap_verify_csum(mbuf);

                /* account for the received frame */
                bufs[num_rx++] = mbuf;
                num_rx_bytes += mbuf->pkt_len;
        }
end:
        rxq->stats.ipackets += num_rx;
        rxq->stats.ibytes += num_rx_bytes;

        return num_rx;
}

static uint64_t
tap_tx_offload_get_port_capa(void)
{
        /*
         * No specific port Tx offload capabilities.
         */
        return 0;
}

static uint64_t
tap_tx_offload_get_queue_capa(void)
{
        return DEV_TX_OFFLOAD_MULTI_SEGS |
               DEV_TX_OFFLOAD_IPV4_CKSUM |
               DEV_TX_OFFLOAD_UDP_CKSUM |
               DEV_TX_OFFLOAD_TCP_CKSUM |
               DEV_TX_OFFLOAD_TCP_TSO;
}

/* Finalize l4 checksum calculation */
static void
tap_tx_l4_cksum(uint16_t *l4_cksum, uint16_t l4_phdr_cksum,
                uint32_t l4_raw_cksum)
{
        if (l4_cksum) {
                uint32_t cksum;

                cksum = __rte_raw_cksum_reduce(l4_raw_cksum);
                cksum += l4_phdr_cksum;

                cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
                cksum = (~cksum) & 0xffff;
                if (cksum == 0)
                        cksum = 0xffff;
                *l4_cksum = cksum;
        }
}

/* Accumulate L4 raw checksums */
static void
tap_tx_l4_add_rcksum(char *l4_data, unsigned int l4_len, uint16_t *l4_cksum,
                        uint32_t *l4_raw_cksum)
{
        if (l4_cksum == NULL)
                return;

        *l4_raw_cksum = __rte_raw_cksum(l4_data, l4_len, *l4_raw_cksum);
}

/* L3 and L4 pseudo headers checksum offloads */
static void
tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned int l2_len,
                unsigned int l3_len, unsigned int l4_len, uint16_t **l4_cksum,
                uint16_t *l4_phdr_cksum, uint32_t *l4_raw_cksum)
{
        void *l3_hdr = packet + l2_len;

        if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4)) {
                struct ipv4_hdr *iph = l3_hdr;
                uint16_t cksum;

                iph->hdr_checksum = 0;
                cksum = rte_raw_cksum(iph, l3_len);
                iph->hdr_checksum = (cksum == 0xffff) ? cksum : ~cksum;
        }
        if (ol_flags & PKT_TX_L4_MASK) {
                void *l4_hdr;

                l4_hdr = packet + l2_len + l3_len;
                if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM)
                        *l4_cksum = &((struct udp_hdr *)l4_hdr)->dgram_cksum;
                else if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM)
                        *l4_cksum = &((struct tcp_hdr *)l4_hdr)->cksum;
                else
                        return;
                **l4_cksum = 0;
                if (ol_flags & PKT_TX_IPV4)
                        *l4_phdr_cksum = rte_ipv4_phdr_cksum(l3_hdr, 0);
                else
                        *l4_phdr_cksum = rte_ipv6_phdr_cksum(l3_hdr, 0);
                *l4_raw_cksum = __rte_raw_cksum(l4_hdr, l4_len, 0);
        }
}

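/*
 * Build one iovec array per mbuf chain (struct tun_pi first, then a
 * checksum-patched copy of the l2/l3/l4 headers when offload is
 * requested, then the remaining data) and writev() it to the tap fd.
 */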
static inline void
tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
                        struct rte_mbuf **pmbufs,
                        uint16_t *num_packets, unsigned long *num_tx_bytes)
{
        int i;
        uint16_t l234_hlen;
        struct pmd_process_private *process_private;

        process_private = rte_eth_devices[txq->out_port].process_private;

        for (i = 0; i < num_mbufs; i++) {
                struct rte_mbuf *mbuf = pmbufs[i];
                struct iovec iovecs[mbuf->nb_segs + 2];
                struct tun_pi pi = { .flags = 0, .proto = 0x00 };
                struct rte_mbuf *seg = mbuf;
                char m_copy[mbuf->data_len];
                int proto;
                int n;
                int j;
                int k; /* current index in iovecs for copying segments */
                uint16_t seg_len; /* length of first segment */
                uint16_t nb_segs;
                uint16_t *l4_cksum; /* l4 checksum (pseudo header + payload) */
                uint32_t l4_raw_cksum = 0; /* TCP/UDP payload raw checksum */
                uint16_t l4_phdr_cksum = 0; /* TCP/UDP pseudo header checksum */
                uint16_t is_cksum = 0; /* in case cksum should be offloaded */

                l4_cksum = NULL;
                if (txq->type == ETH_TUNTAP_TYPE_TUN) {
                        /*
                         * TUN and TAP are created without IFF_NO_PI, so a
                         * struct tun_pi prefixes each packet. For the TUN
                         * PMD this is mandatory, as the kernel's tun.c uses
                         * the proto field to tell IP from non-IP packets.
                         *
                         * Fetch the first nibble of the payload: 4 means
                         * IPv4 and 6 means IPv6; set proto accordingly.
                         */
                        char *buff_data = rte_pktmbuf_mtod(seg, void *);
                        proto = (*buff_data & 0xf0);
                        pi.proto = (proto == 0x40) ?
                                rte_cpu_to_be_16(ETHER_TYPE_IPv4) :
                                ((proto == 0x60) ?
                                        rte_cpu_to_be_16(ETHER_TYPE_IPv6) :
                                        0x00);
                }

                k = 0;
                iovecs[k].iov_base = &pi;
                iovecs[k].iov_len = sizeof(pi);
                k++;

                nb_segs = mbuf->nb_segs;
                if (txq->csum &&
                    ((mbuf->ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4) ||
                     (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM ||
                     (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM))) {
                        is_cksum = 1;

                        /* Support only packets with at least layer 4
                         * header included in the first segment
                         */
                        seg_len = rte_pktmbuf_data_len(mbuf);
                        l234_hlen = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
                        if (seg_len < l234_hlen)
                                break;

                        /* To change checksums, work on a copy of the l2, l3
                         * headers + l4 pseudo header
                         */
                        rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
                                        l234_hlen);
                        tap_tx_l3_cksum(m_copy, mbuf->ol_flags,
                                       mbuf->l2_len, mbuf->l3_len, mbuf->l4_len,
                                       &l4_cksum, &l4_phdr_cksum,
                                       &l4_raw_cksum);
                        iovecs[k].iov_base = m_copy;
                        iovecs[k].iov_len = l234_hlen;
                        k++;

                        /* Update next iovecs[] beyond l2, l3, l4 headers */
                        if (seg_len > l234_hlen) {
                                iovecs[k].iov_len = seg_len - l234_hlen;
                                iovecs[k].iov_base =
                                        rte_pktmbuf_mtod(seg, char *) +
                                                l234_hlen;
                                tap_tx_l4_add_rcksum(iovecs[k].iov_base,
                                        iovecs[k].iov_len, l4_cksum,
                                        &l4_raw_cksum);
                                k++;
                                nb_segs++;
                        }
                        seg = seg->next;
                }

                for (j = k; j <= nb_segs; j++) {
                        iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
                        iovecs[j].iov_base = rte_pktmbuf_mtod(seg, void *);
                        if (is_cksum)
                                tap_tx_l4_add_rcksum(iovecs[j].iov_base,
                                        iovecs[j].iov_len, l4_cksum,
                                        &l4_raw_cksum);
                        seg = seg->next;
                }

                if (is_cksum)
                        tap_tx_l4_cksum(l4_cksum, l4_phdr_cksum, l4_raw_cksum);

                /* copy the tx frame data */
                n = writev(process_private->txq_fds[txq->queue_id], iovecs, j);
                if (n <= 0)
                        break;
                (*num_packets)++;
                (*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
        }
}

/* Callback to handle sending a burst of packets to the tap interface's
 * file descriptor(s).
 */
static uint16_t
pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct tx_queue *txq = queue;
        uint16_t num_tx = 0;
        uint16_t num_packets = 0;
        unsigned long num_tx_bytes = 0;
        uint32_t max_size;
        int i;

        if (unlikely(nb_pkts == 0))
                return 0;

        struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
        max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
        for (i = 0; i < nb_pkts; i++) {
                struct rte_mbuf *mbuf_in = bufs[num_tx];
                struct rte_mbuf **mbuf;
                uint16_t num_mbufs = 0;
                uint16_t tso_segsz = 0;
                int ret;
                uint16_t hdrs_len;
                int j;
                uint64_t tso;

                tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
                if (tso) {
                        struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;

                        assert(gso_ctx != NULL);

                        /* TCP segmentation implies TCP checksum offload */
                        mbuf_in->ol_flags |= PKT_TX_TCP_CKSUM;

                        /* gso size is calculated without ETHER_CRC_LEN */
                        hdrs_len = mbuf_in->l2_len + mbuf_in->l3_len +
                                        mbuf_in->l4_len;
                        tso_segsz = mbuf_in->tso_segsz + hdrs_len;
                        if (unlikely(tso_segsz == hdrs_len) ||
                                tso_segsz > *txq->mtu) {
                                txq->stats.errs++;
                                break;
                        }
                        gso_ctx->gso_size = tso_segsz;
                        ret = rte_gso_segment(mbuf_in, /* packet to segment */
                                gso_ctx, /* gso control block */
                                (struct rte_mbuf **)&gso_mbufs, /* out mbufs */
                                RTE_DIM(gso_mbufs)); /* max tso mbufs */

                        /* ret contains the number of newly created mbufs */
                        if (ret < 0)
                                break;

                        mbuf = gso_mbufs;
                        num_mbufs = ret;
                } else {
                        /* stats.errs will be incremented */
                        if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
                                break;

                        /* ret 0 indicates no new mbufs were created */
                        ret = 0;
                        mbuf = &mbuf_in;
                        num_mbufs = 1;
                }

                tap_write_mbufs(txq, num_mbufs, mbuf,
                                &num_packets, &num_tx_bytes);
                num_tx++;
                /* free original mbuf */
                rte_pktmbuf_free(mbuf_in);
                /* free tso mbufs */
                for (j = 0; j < ret; j++)
                        rte_pktmbuf_free(mbuf[j]);
        }

        txq->stats.opackets += num_packets;
        txq->stats.errs += nb_pkts - num_tx;
        txq->stats.obytes += num_tx_bytes;

        return num_packets;
}

static const char *
tap_ioctl_req2str(unsigned long request)
{
        switch (request) {
        case SIOCSIFFLAGS:
                return "SIOCSIFFLAGS";
        case SIOCGIFFLAGS:
                return "SIOCGIFFLAGS";
        case SIOCGIFHWADDR:
                return "SIOCGIFHWADDR";
        case SIOCSIFHWADDR:
                return "SIOCSIFHWADDR";
        case SIOCSIFMTU:
                return "SIOCSIFMTU";
        }
        return "UNKNOWN";
}

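/*
 * Run an ioctl on the tap netdevice, the remote netdevice, or both,
 * depending on `mode`. For SIOCSIFFLAGS, the requested flags are OR-ed
 * into (set != 0) or cleared from (set == 0) the current flags.
 */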
static int
tap_ioctl(struct pmd_internals *pmd, unsigned long request,
          struct ifreq *ifr, int set, enum ioctl_mode mode)
{
        short req_flags = ifr->ifr_flags;
        int remote = pmd->remote_if_index &&
                (mode == REMOTE_ONLY || mode == LOCAL_AND_REMOTE);

        if (!pmd->remote_if_index && mode == REMOTE_ONLY)
                return 0;
        /*
         * If there is a remote netdevice, apply ioctl on it, then apply it on
         * the tap netdevice.
         */
apply:
        if (remote)
                snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->remote_iface);
        else if (mode == LOCAL_ONLY || mode == LOCAL_AND_REMOTE)
                snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->name);
        switch (request) {
        case SIOCSIFFLAGS:
                /* fetch current flags to leave other flags untouched */
                if (ioctl(pmd->ioctl_sock, SIOCGIFFLAGS, ifr) < 0)
                        goto error;
                if (set)
                        ifr->ifr_flags |= req_flags;
                else
                        ifr->ifr_flags &= ~req_flags;
                break;
        case SIOCGIFFLAGS:
        case SIOCGIFHWADDR:
        case SIOCSIFHWADDR:
        case SIOCSIFMTU:
                break;
        default:
                RTE_LOG(WARNING, PMD, "%s: ioctl() called with wrong arg\n",
                        pmd->name);
                return -EINVAL;
        }
        if (ioctl(pmd->ioctl_sock, request, ifr) < 0)
                goto error;
        /* The remote was handled first; loop once more for the tap netdevice */
        if (remote-- && mode == LOCAL_AND_REMOTE)
                goto apply;
        return 0;

error:
        TAP_LOG(DEBUG, "%s(%s) failed: %s(%d)", ifr->ifr_name,
                tap_ioctl_req2str(request), strerror(errno), errno);
        return -errno;
}

static int
tap_link_set_down(struct rte_eth_dev *dev)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        struct ifreq ifr = { .ifr_flags = IFF_UP };

        dev->data->dev_link.link_status = ETH_LINK_DOWN;
        return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_ONLY);
}

static int
tap_link_set_up(struct rte_eth_dev *dev)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        struct ifreq ifr = { .ifr_flags = IFF_UP };

        dev->data->dev_link.link_status = ETH_LINK_UP;
        return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
}

static int
tap_dev_start(struct rte_eth_dev *dev)
{
        int err, i;

        err = tap_intr_handle_set(dev, 1);
        if (err)
                return err;

        err = tap_link_set_up(dev);
        if (err)
                return err;

        for (i = 0; i < dev->data->nb_tx_queues; i++)
                dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
        for (i = 0; i < dev->data->nb_rx_queues; i++)
                dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;

        return err;
}

/* This function gets called when the current port gets stopped.
 */
static void
tap_dev_stop(struct rte_eth_dev *dev)
{
        int i;

        for (i = 0; i < dev->data->nb_tx_queues; i++)
                dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
        for (i = 0; i < dev->data->nb_rx_queues; i++)
                dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;

        tap_intr_handle_set(dev, 0);
        tap_link_set_down(dev);
}

static int
tap_dev_configure(struct rte_eth_dev *dev)
{
        if (dev->data->nb_rx_queues > RTE_PMD_TAP_MAX_QUEUES) {
                TAP_LOG(ERR,
                        "%s: number of rx queues %d exceeds max num of queues %d",
                        dev->device->name,
                        dev->data->nb_rx_queues,
                        RTE_PMD_TAP_MAX_QUEUES);
                return -1;
        }
        if (dev->data->nb_tx_queues > RTE_PMD_TAP_MAX_QUEUES) {
                TAP_LOG(ERR,
                        "%s: number of tx queues %d exceeds max num of queues %d",
                        dev->device->name,
                        dev->data->nb_tx_queues,
                        RTE_PMD_TAP_MAX_QUEUES);
                return -1;
        }

        TAP_LOG(INFO, "%s: %p: TX configured queues number: %u",
                dev->device->name, (void *)dev, dev->data->nb_tx_queues);

        TAP_LOG(INFO, "%s: %p: RX configured queues number: %u",
                dev->device->name, (void *)dev, dev->data->nb_rx_queues);

        return 0;
}

static uint32_t
tap_dev_speed_capa(void)
{
        uint32_t speed = pmd_link.link_speed;
        uint32_t capa = 0;

        if (speed >= ETH_SPEED_NUM_10M)
                capa |= ETH_LINK_SPEED_10M;
        if (speed >= ETH_SPEED_NUM_100M)
                capa |= ETH_LINK_SPEED_100M;
        if (speed >= ETH_SPEED_NUM_1G)
                capa |= ETH_LINK_SPEED_1G;
        if (speed >= ETH_SPEED_NUM_2_5G)
                capa |= ETH_LINK_SPEED_2_5G;
        if (speed >= ETH_SPEED_NUM_5G)
                capa |= ETH_LINK_SPEED_5G;
        if (speed >= ETH_SPEED_NUM_10G)
                capa |= ETH_LINK_SPEED_10G;
        if (speed >= ETH_SPEED_NUM_20G)
                capa |= ETH_LINK_SPEED_20G;
        if (speed >= ETH_SPEED_NUM_25G)
                capa |= ETH_LINK_SPEED_25G;
        if (speed >= ETH_SPEED_NUM_40G)
                capa |= ETH_LINK_SPEED_40G;
        if (speed >= ETH_SPEED_NUM_50G)
                capa |= ETH_LINK_SPEED_50G;
        if (speed >= ETH_SPEED_NUM_56G)
                capa |= ETH_LINK_SPEED_56G;
        if (speed >= ETH_SPEED_NUM_100G)
                capa |= ETH_LINK_SPEED_100G;

        return capa;
}

static void
tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
{
        struct pmd_internals *internals = dev->data->dev_private;

        dev_info->if_index = internals->if_index;
        dev_info->max_mac_addrs = 1;
        dev_info->max_rx_pktlen = (uint32_t)ETHER_MAX_VLAN_FRAME_LEN;
        dev_info->max_rx_queues = RTE_PMD_TAP_MAX_QUEUES;
        dev_info->max_tx_queues = RTE_PMD_TAP_MAX_QUEUES;
        dev_info->min_rx_bufsize = 0;
        dev_info->speed_capa = tap_dev_speed_capa();
        dev_info->rx_queue_offload_capa = tap_rx_offload_get_queue_capa();
        dev_info->rx_offload_capa = tap_rx_offload_get_port_capa() |
                                    dev_info->rx_queue_offload_capa;
        dev_info->tx_queue_offload_capa = tap_tx_offload_get_queue_capa();
        dev_info->tx_offload_capa = tap_tx_offload_get_port_capa() |
                                    dev_info->tx_queue_offload_capa;
        dev_info->hash_key_size = TAP_RSS_HASH_KEY_SIZE;
        /*
         * limitation: TAP supports all of IP, UDP and TCP hash
         * functions together and not in partial combinations
         */
        dev_info->flow_type_rss_offloads = ~TAP_RSS_HF_MASK;
}

static int
tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *tap_stats)
{
        unsigned int i, imax;
        unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
        unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
        unsigned long rx_nombuf = 0, ierrors = 0;
        const struct pmd_internals *pmd = dev->data->dev_private;

        /* rx queue statistics */
        imax = (dev->data->nb_rx_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
                dev->data->nb_rx_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
        for (i = 0; i < imax; i++) {
                tap_stats->q_ipackets[i] = pmd->rxq[i].stats.ipackets;
                tap_stats->q_ibytes[i] = pmd->rxq[i].stats.ibytes;
                rx_total += tap_stats->q_ipackets[i];
                rx_bytes_total += tap_stats->q_ibytes[i];
                rx_nombuf += pmd->rxq[i].stats.rx_nombuf;
                ierrors += pmd->rxq[i].stats.ierrors;
        }

        /* tx queue statistics */
        imax = (dev->data->nb_tx_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
                dev->data->nb_tx_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;

        for (i = 0; i < imax; i++) {
                tap_stats->q_opackets[i] = pmd->txq[i].stats.opackets;
                tap_stats->q_errors[i] = pmd->txq[i].stats.errs;
                tap_stats->q_obytes[i] = pmd->txq[i].stats.obytes;
                tx_total += tap_stats->q_opackets[i];
                tx_err_total += tap_stats->q_errors[i];
                tx_bytes_total += tap_stats->q_obytes[i];
        }

        tap_stats->ipackets = rx_total;
        tap_stats->ibytes = rx_bytes_total;
        tap_stats->ierrors = ierrors;
        tap_stats->rx_nombuf = rx_nombuf;
        tap_stats->opackets = tx_total;
        tap_stats->oerrors = tx_err_total;
        tap_stats->obytes = tx_bytes_total;
        return 0;
}

static void
tap_stats_reset(struct rte_eth_dev *dev)
{
        int i;
        struct pmd_internals *pmd = dev->data->dev_private;

        for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
                pmd->rxq[i].stats.ipackets = 0;
                pmd->rxq[i].stats.ibytes = 0;
                pmd->rxq[i].stats.ierrors = 0;
                pmd->rxq[i].stats.rx_nombuf = 0;

                pmd->txq[i].stats.opackets = 0;
                pmd->txq[i].stats.errs = 0;
                pmd->txq[i].stats.obytes = 0;
        }
}

static void
tap_dev_close(struct rte_eth_dev *dev)
{
        int i;
        struct pmd_internals *internals = dev->data->dev_private;
        struct pmd_process_private *process_private = dev->process_private;

        tap_link_set_down(dev);
        tap_flow_flush(dev, NULL);
        tap_flow_implicit_flush(internals, NULL);

        for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
                if (process_private->rxq_fds[i] != -1) {
                        close(process_private->rxq_fds[i]);
                        process_private->rxq_fds[i] = -1;
                }
                if (process_private->txq_fds[i] != -1) {
                        close(process_private->txq_fds[i]);
                        process_private->txq_fds[i] = -1;
                }
        }

        if (internals->remote_if_index) {
                /* Restore initial remote state */
                ioctl(internals->ioctl_sock, SIOCSIFFLAGS,
                                &internals->remote_initial_flags);
        }

        if (internals->ka_fd != -1) {
                close(internals->ka_fd);
                internals->ka_fd = -1;
        }
        /*
         * Since the TUN device has no more open file descriptors,
         * the kernel will remove it.
         */
}

static void
tap_rx_queue_release(void *queue)
{
        struct rx_queue *rxq = queue;
        struct pmd_process_private *process_private;

        if (!rxq)
                return;
        process_private = rte_eth_devices[rxq->in_port].process_private;
        if (process_private->rxq_fds[rxq->queue_id] > 0) {
                close(process_private->rxq_fds[rxq->queue_id]);
                process_private->rxq_fds[rxq->queue_id] = -1;
                rte_pktmbuf_free(rxq->pool);
                rte_free(rxq->iovecs);
                rxq->pool = NULL;
                rxq->iovecs = NULL;
        }
}

static void
tap_tx_queue_release(void *queue)
{
        struct tx_queue *txq = queue;
        struct pmd_process_private *process_private;

        if (!txq)
                return;
        process_private = rte_eth_devices[txq->out_port].process_private;

        if (process_private->txq_fds[txq->queue_id] > 0) {
                close(process_private->txq_fds[txq->queue_id]);
                process_private->txq_fds[txq->queue_id] = -1;
        }
}

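/*
 * Report the link as up only when both IFF_UP and IFF_RUNNING are set on
 * the local (and, when configured, the remote) netdevice.
 */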
static int
tap_link_update(struct rte_eth_dev *dev, int wait_to_complete __rte_unused)
{
        struct rte_eth_link *dev_link = &dev->data->dev_link;
        struct pmd_internals *pmd = dev->data->dev_private;
        struct ifreq ifr = { .ifr_flags = 0 };

        if (pmd->remote_if_index) {
                tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, REMOTE_ONLY);
                if (!(ifr.ifr_flags & IFF_UP) ||
                    !(ifr.ifr_flags & IFF_RUNNING)) {
                        dev_link->link_status = ETH_LINK_DOWN;
                        return 0;
                }
        }
        tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, LOCAL_ONLY);
        dev_link->link_status =
                ((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING) ?
                 ETH_LINK_UP :
                 ETH_LINK_DOWN);
        return 0;
}

static void
tap_promisc_enable(struct rte_eth_dev *dev)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        struct ifreq ifr = { .ifr_flags = IFF_PROMISC };

        dev->data->promiscuous = 1;
        tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
        if (pmd->remote_if_index && !pmd->flow_isolate)
                tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC);
}

static void
tap_promisc_disable(struct rte_eth_dev *dev)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        struct ifreq ifr = { .ifr_flags = IFF_PROMISC };

        dev->data->promiscuous = 0;
        tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
        if (pmd->remote_if_index && !pmd->flow_isolate)
                tap_flow_implicit_destroy(pmd, TAP_REMOTE_PROMISC);
}

static void
tap_allmulti_enable(struct rte_eth_dev *dev)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };

        dev->data->all_multicast = 1;
        tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
        if (pmd->remote_if_index && !pmd->flow_isolate)
                tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI);
}

static void
tap_allmulti_disable(struct rte_eth_dev *dev)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };

        dev->data->all_multicast = 0;
        tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
        if (pmd->remote_if_index && !pmd->flow_isolate)
                tap_flow_implicit_destroy(pmd, TAP_REMOTE_ALLMULTI);
}

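/*
 * Set the MAC address on the tap netdevice and, when it differs, on the
 * remote netdevice too; then refresh the implicit MAC redirection flow
 * rule if a remote is configured.
 */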
static int
tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        enum ioctl_mode mode = LOCAL_ONLY;
        struct ifreq ifr;
        int ret;

        if (pmd->type == ETH_TUNTAP_TYPE_TUN) {
                TAP_LOG(ERR, "%s: can't set MAC address for TUN",
                        dev->device->name);
                return -ENOTSUP;
        }

        if (is_zero_ether_addr(mac_addr)) {
                TAP_LOG(ERR, "%s: can't set an empty MAC address",
                        dev->device->name);
                return -EINVAL;
        }
        /* Check the actual current MAC address on the tap netdevice */
        ret = tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, LOCAL_ONLY);
        if (ret < 0)
                return ret;
        if (is_same_ether_addr((struct ether_addr *)&ifr.ifr_hwaddr.sa_data,
                               mac_addr))
                return 0;
        /* Check the current MAC address on the remote */
        ret = tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY);
        if (ret < 0)
                return ret;
        if (!is_same_ether_addr((struct ether_addr *)&ifr.ifr_hwaddr.sa_data,
                               mac_addr))
                mode = LOCAL_AND_REMOTE;
        ifr.ifr_hwaddr.sa_family = AF_LOCAL;
        rte_memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, ETHER_ADDR_LEN);
        ret = tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 1, mode);
        if (ret < 0)
                return ret;
        rte_memcpy(&pmd->eth_addr, mac_addr, ETHER_ADDR_LEN);
        if (pmd->remote_if_index && !pmd->flow_isolate) {
                /* Replace MAC redirection rule after a MAC change */
                ret = tap_flow_implicit_destroy(pmd, TAP_REMOTE_LOCAL_MAC);
                if (ret < 0) {
                        TAP_LOG(ERR,
                                "%s: Couldn't delete MAC redirection rule",
                                dev->device->name);
                        return ret;
                }
                ret = tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC);
                if (ret < 0) {
                        TAP_LOG(ERR,
                                "%s: Couldn't add MAC redirection rule",
                                dev->device->name);
                        return ret;
                }
        }

        return 0;
}

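/*
 * Lazily create (or look up) the per-device mempool used by the GSO
 * library and initialize the TSO context for this Tx queue.
 */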
static int
tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev *dev)
{
        uint32_t gso_types;
        char pool_name[64];

        /*
         * Create a private mbuf pool with TAP_GSO_MBUF_SEG_SIZE bytes
         * per mbuf; use this pool for both direct and indirect mbufs
         */

        struct rte_mempool *mp;      /* Mempool for GSO packets */

        /* initialize GSO context */
        gso_types = DEV_TX_OFFLOAD_TCP_TSO;
        snprintf(pool_name, sizeof(pool_name), "mp_%s", dev->device->name);
        mp = rte_mempool_lookup((const char *)pool_name);
        if (!mp) {
                mp = rte_pktmbuf_pool_create(pool_name, TAP_GSO_MBUFS_NUM,
                        TAP_GSO_MBUF_CACHE_SIZE, 0,
                        RTE_PKTMBUF_HEADROOM + TAP_GSO_MBUF_SEG_SIZE,
                        SOCKET_ID_ANY);
                if (!mp) {
                        struct pmd_internals *pmd = dev->data->dev_private;
                        RTE_LOG(DEBUG, PMD, "%s: failed to create mbuf pool for device %s\n",
                                pmd->name, dev->device->name);
                        return -1;
                }
        }

        gso_ctx->direct_pool = mp;
        gso_ctx->indirect_pool = mp;
        gso_ctx->gso_types = gso_types;
        gso_ctx->gso_size = 0; /* gso_size is set in tx_burst() per packet */
        gso_ctx->flag = 0;

        return 0;
}

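/*
 * Get a tap fd for the given queue: reuse the fd if this queue was
 * already set up, dup() the peer (Rx/Tx) queue's fd when only that one
 * exists, or allocate a brand new fd with tun_alloc() otherwise.
 */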
static int
tap_setup_queue(struct rte_eth_dev *dev,
                struct pmd_internals *internals,
                uint16_t qid,
                int is_rx)
{
        int ret;
        int *fd;
        int *other_fd;
        const char *dir;
        struct pmd_internals *pmd = dev->data->dev_private;
        struct pmd_process_private *process_private = dev->process_private;
        struct rx_queue *rx = &internals->rxq[qid];
        struct tx_queue *tx = &internals->txq[qid];
        struct rte_gso_ctx *gso_ctx;

        if (is_rx) {
                fd = &process_private->rxq_fds[qid];
                other_fd = &process_private->txq_fds[qid];
                dir = "rx";
                gso_ctx = NULL;
        } else {
                fd = &process_private->txq_fds[qid];
                other_fd = &process_private->rxq_fds[qid];
                dir = "tx";
                gso_ctx = &tx->gso_ctx;
        }
        if (*fd != -1) {
                /* fd for this queue already exists */
                TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists",
                        pmd->name, *fd, dir, qid);
                gso_ctx = NULL;
        } else if (*other_fd != -1) {
                /* Only other_fd exists. dup it */
                *fd = dup(*other_fd);
                if (*fd < 0) {
                        *fd = -1;
                        TAP_LOG(ERR, "%s: dup() failed.", pmd->name);
                        return -1;
                }
                TAP_LOG(DEBUG, "%s: dup fd %d for %s queue qid %d (%d)",
                        pmd->name, *other_fd, dir, qid, *fd);
        } else {
                /* Neither the Rx nor the Tx fd exists (both are -1); create one */
                *fd = tun_alloc(pmd, 0);
                if (*fd < 0) {
                        *fd = -1; /* restore original value */
                        TAP_LOG(ERR, "%s: tun_alloc() failed.", pmd->name);
                        return -1;
                }
                TAP_LOG(DEBUG, "%s: add %s queue for qid %d fd %d",
                        pmd->name, dir, qid, *fd);
        }

        tx->mtu = &dev->data->mtu;
        rx->rxmode = &dev->data->dev_conf.rxmode;
        if (gso_ctx) {
                ret = tap_gso_ctx_setup(gso_ctx, dev);
                if (ret)
                        return -1;
        }

        tx->type = pmd->type;

        return *fd;
}

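/*
 * Set up an Rx queue: pre-allocate one mbuf per descriptor and build the
 * persistent iovec array used by readv() (slot 0 holds struct tun_pi).
 */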
static int
tap_rx_queue_setup(struct rte_eth_dev *dev,
                   uint16_t rx_queue_id,
                   uint16_t nb_rx_desc,
                   unsigned int socket_id,
                   const struct rte_eth_rxconf *rx_conf __rte_unused,
                   struct rte_mempool *mp)
{
        struct pmd_internals *internals = dev->data->dev_private;
        struct pmd_process_private *process_private = dev->process_private;
        struct rx_queue *rxq = &internals->rxq[rx_queue_id];
        struct rte_mbuf **tmp = &rxq->pool;
        long iov_max = sysconf(_SC_IOV_MAX);
        uint16_t nb_desc = RTE_MIN(nb_rx_desc, iov_max - 1);
        struct iovec (*iovecs)[nb_desc + 1];
        int data_off = RTE_PKTMBUF_HEADROOM;
        int ret = 0;
        int fd;
        int i;

        if (rx_queue_id >= dev->data->nb_rx_queues || !mp) {
                TAP_LOG(WARNING,
                        "nb_rx_queues %d too small or mempool NULL",
                        dev->data->nb_rx_queues);
                return -1;
        }

        rxq->mp = mp;
        rxq->trigger_seen = 1; /* force initial burst */
        rxq->in_port = dev->data->port_id;
        rxq->queue_id = rx_queue_id;
        rxq->nb_rx_desc = nb_desc;
        iovecs = rte_zmalloc_socket(dev->device->name, sizeof(*iovecs), 0,
                                    socket_id);
        if (!iovecs) {
                TAP_LOG(WARNING,
                        "%s: Couldn't allocate %d RX descriptors",
                        dev->device->name, nb_desc);
                return -ENOMEM;
        }
        rxq->iovecs = iovecs;

        dev->data->rx_queues[rx_queue_id] = rxq;
        fd = tap_setup_queue(dev, internals, rx_queue_id, 1);
        if (fd == -1) {
                ret = fd;
                goto error;
        }

        (*rxq->iovecs)[0].iov_len = sizeof(struct tun_pi);
        (*rxq->iovecs)[0].iov_base = &rxq->pi;

        for (i = 1; i <= nb_desc; i++) {
                *tmp = rte_pktmbuf_alloc(rxq->mp);
                if (!*tmp) {
                        TAP_LOG(WARNING,
                                "%s: couldn't allocate memory for queue %d",
                                dev->device->name, rx_queue_id);
                        ret = -ENOMEM;
                        goto error;
                }
                (*rxq->iovecs)[i].iov_len = (*tmp)->buf_len - data_off;
                (*rxq->iovecs)[i].iov_base =
                        (char *)(*tmp)->buf_addr + data_off;
                data_off = 0;
                tmp = &(*tmp)->next;
        }

        TAP_LOG(DEBUG, "  RX TUNTAP device name %s, qid %d on fd %d",
                internals->name, rx_queue_id,
                process_private->rxq_fds[rx_queue_id]);

        return 0;

error:
        rte_pktmbuf_free(rxq->pool);
        rxq->pool = NULL;
        rte_free(rxq->iovecs);
        rxq->iovecs = NULL;
        return ret;
}

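/* Set up a Tx queue and record whether checksum offload was requested. */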
static int
tap_tx_queue_setup(struct rte_eth_dev *dev,
                   uint16_t tx_queue_id,
                   uint16_t nb_tx_desc __rte_unused,
                   unsigned int socket_id __rte_unused,
                   const struct rte_eth_txconf *tx_conf)
{
        struct pmd_internals *internals = dev->data->dev_private;
        struct pmd_process_private *process_private = dev->process_private;
        struct tx_queue *txq;
        int ret;
        uint64_t offloads;

        if (tx_queue_id >= dev->data->nb_tx_queues)
                return -1;
        dev->data->tx_queues[tx_queue_id] = &internals->txq[tx_queue_id];
        txq = dev->data->tx_queues[tx_queue_id];
        txq->out_port = dev->data->port_id;
        txq->queue_id = tx_queue_id;

        offloads = tx_conf->offloads | dev->data->dev_conf.txmode.offloads;
        txq->csum = !!(offloads &
                        (DEV_TX_OFFLOAD_IPV4_CKSUM |
                         DEV_TX_OFFLOAD_UDP_CKSUM |
                         DEV_TX_OFFLOAD_TCP_CKSUM));

        ret = tap_setup_queue(dev, internals, tx_queue_id, 0);
        if (ret == -1)
                return -1;
        TAP_LOG(DEBUG,
                "  TX TUNTAP device name %s, qid %d on fd %d csum %s",
                internals->name, tx_queue_id,
                process_private->txq_fds[tx_queue_id],
                txq->csum ? "on" : "off");

        return 0;
}

1420 static int
1421 tap_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
1422 {
1423         struct pmd_internals *pmd = dev->data->dev_private;
1424         struct ifreq ifr = { .ifr_mtu = mtu };
1425         int err = 0;
1426
1427         err = tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE);
1428         if (!err)
1429                 dev->data->mtu = mtu;
1430
1431         return err;
1432 }
1433
1434 static int
1435 tap_set_mc_addr_list(struct rte_eth_dev *dev __rte_unused,
1436                      struct ether_addr *mc_addr_set __rte_unused,
1437                      uint32_t nb_mc_addr __rte_unused)
1438 {
        /*
         * Nothing to do: the tap device performs no MAC filtering,
         * so every packet is received anyway.
         */
1443         return 0;
1444 }
1445
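/*
 * Netlink callback: refresh the link status whenever an RTM_NEWLINK
 * message concerns the tap netdevice or its remote counterpart.
 */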
1446 static int
1447 tap_nl_msg_handler(struct nlmsghdr *nh, void *arg)
1448 {
1449         struct rte_eth_dev *dev = arg;
1450         struct pmd_internals *pmd = dev->data->dev_private;
1451         struct ifinfomsg *info = NLMSG_DATA(nh);
1452
1453         if (nh->nlmsg_type != RTM_NEWLINK ||
1454             (info->ifi_index != pmd->if_index &&
1455              info->ifi_index != pmd->remote_if_index))
1456                 return 0;
1457         return tap_link_update(dev, 0);
1458 }
1459
1460 static void
1461 tap_dev_intr_handler(void *cb_arg)
1462 {
1463         struct rte_eth_dev *dev = cb_arg;
1464         struct pmd_internals *pmd = dev->data->dev_private;
1465
1466         tap_nl_recv(pmd->intr_handle.fd, tap_nl_msg_handler, dev);
1467 }
1468
1469 static int
1470 tap_lsc_intr_handle_set(struct rte_eth_dev *dev, int set)
1471 {
1472         struct pmd_internals *pmd = dev->data->dev_private;
1473
        /* Always disable the interrupt if LSC is no longer requested. */
1475         if (!dev->data->dev_conf.intr_conf.lsc) {
1476                 if (pmd->intr_handle.fd != -1) {
1477                         tap_nl_final(pmd->intr_handle.fd);
1478                         rte_intr_callback_unregister(&pmd->intr_handle,
1479                                 tap_dev_intr_handler, dev);
1480                 }
1481                 return 0;
1482         }
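        /* On set, open an RTMGRP_LINK netlink socket and register the
         * handler; otherwise tear both down.
         */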
1483         if (set) {
1484                 pmd->intr_handle.fd = tap_nl_init(RTMGRP_LINK);
1485                 if (unlikely(pmd->intr_handle.fd == -1))
1486                         return -EBADF;
1487                 return rte_intr_callback_register(
1488                         &pmd->intr_handle, tap_dev_intr_handler, dev);
1489         }
1490         tap_nl_final(pmd->intr_handle.fd);
1491         return rte_intr_callback_unregister(&pmd->intr_handle,
1492                                             tap_dev_intr_handler, dev);
1493 }
1494
1495 static int
1496 tap_intr_handle_set(struct rte_eth_dev *dev, int set)
1497 {
1498         int err;
1499
1500         err = tap_lsc_intr_handle_set(dev, set);
1501         if (err)
1502                 return err;
1503         err = tap_rx_intr_vec_set(dev, set);
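        /* Roll back the LSC handler if Rx interrupt setup failed */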
1504         if (err && set)
1505                 tap_lsc_intr_handle_set(dev, 0);
1506         return err;
1507 }
1508
1509 static const uint32_t*
1510 tap_dev_supported_ptypes_get(struct rte_eth_dev *dev __rte_unused)
1511 {
1512         static const uint32_t ptypes[] = {
1513                 RTE_PTYPE_INNER_L2_ETHER,
1514                 RTE_PTYPE_INNER_L2_ETHER_VLAN,
1515                 RTE_PTYPE_INNER_L2_ETHER_QINQ,
1516                 RTE_PTYPE_INNER_L3_IPV4,
1517                 RTE_PTYPE_INNER_L3_IPV4_EXT,
1518                 RTE_PTYPE_INNER_L3_IPV6,
1519                 RTE_PTYPE_INNER_L3_IPV6_EXT,
1520                 RTE_PTYPE_INNER_L4_FRAG,
1521                 RTE_PTYPE_INNER_L4_UDP,
1522                 RTE_PTYPE_INNER_L4_TCP,
1523                 RTE_PTYPE_INNER_L4_SCTP,
1524                 RTE_PTYPE_L2_ETHER,
1525                 RTE_PTYPE_L2_ETHER_VLAN,
1526                 RTE_PTYPE_L2_ETHER_QINQ,
1527                 RTE_PTYPE_L3_IPV4,
1528                 RTE_PTYPE_L3_IPV4_EXT,
1529                 RTE_PTYPE_L3_IPV6_EXT,
1530                 RTE_PTYPE_L3_IPV6,
1531                 RTE_PTYPE_L4_FRAG,
1532                 RTE_PTYPE_L4_UDP,
1533                 RTE_PTYPE_L4_TCP,
                RTE_PTYPE_L4_SCTP,
                /* The array must end with RTE_PTYPE_UNKNOWN:
                 * ethdev iterates until it meets this sentinel.
                 */
                RTE_PTYPE_UNKNOWN,
        };
1536
1537         return ptypes;
1538 }
1539
1540 static int
1541 tap_flow_ctrl_get(struct rte_eth_dev *dev __rte_unused,
1542                   struct rte_eth_fc_conf *fc_conf)
1543 {
1544         fc_conf->mode = RTE_FC_NONE;
1545         return 0;
1546 }
1547
1548 static int
1549 tap_flow_ctrl_set(struct rte_eth_dev *dev __rte_unused,
1550                   struct rte_eth_fc_conf *fc_conf)
1551 {
1552         if (fc_conf->mode != RTE_FC_NONE)
1553                 return -ENOTSUP;
1554         return 0;
1555 }
1556
1557 /**
1558  * DPDK callback to update the RSS hash configuration.
1559  *
1560  * @param dev
1561  *   Pointer to Ethernet device structure.
1562  * @param[in] rss_conf
1563  *   RSS configuration data.
1564  *
1565  * @return
1566  *   0 on success, a negative errno value otherwise and rte_errno is set.
1567  */
1568 static int
1569 tap_rss_hash_update(struct rte_eth_dev *dev,
1570                 struct rte_eth_rss_conf *rss_conf)
1571 {
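        /* TAP_RSS_HF_MASK covers the hash types the eBPF-based RSS
         * implementation cannot compute; requesting any of them is invalid.
         */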
1572         if (rss_conf->rss_hf & TAP_RSS_HF_MASK) {
1573                 rte_errno = EINVAL;
1574                 return -rte_errno;
1575         }
1576         if (rss_conf->rss_key && rss_conf->rss_key_len) {
1577                 /*
1578                  * Currently TAP RSS key is hard coded
1579                  * and cannot be updated
1580                  */
1581                 TAP_LOG(ERR,
1582                         "port %u RSS key cannot be updated",
1583                         dev->data->port_id);
1584                 rte_errno = EINVAL;
1585                 return -rte_errno;
1586         }
1587         return 0;
1588 }
1589
1590 static int
1591 tap_rx_queue_start(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1592 {
1593         dev->data->rx_queue_state[rx_queue_id] = RTE_ETH_QUEUE_STATE_STARTED;
1594
1595         return 0;
1596 }
1597
1598 static int
1599 tap_tx_queue_start(struct rte_eth_dev *dev, uint16_t tx_queue_id)
1600 {
1601         dev->data->tx_queue_state[tx_queue_id] = RTE_ETH_QUEUE_STATE_STARTED;
1602
1603         return 0;
1604 }
1605
1606 static int
1607 tap_rx_queue_stop(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1608 {
1609         dev->data->rx_queue_state[rx_queue_id] = RTE_ETH_QUEUE_STATE_STOPPED;
1610
1611         return 0;
1612 }
1613
1614 static int
1615 tap_tx_queue_stop(struct rte_eth_dev *dev, uint16_t tx_queue_id)
1616 {
1617         dev->data->tx_queue_state[tx_queue_id] = RTE_ETH_QUEUE_STATE_STOPPED;
1618
1619         return 0;
1620 }

static const struct eth_dev_ops ops = {
1622         .dev_start              = tap_dev_start,
1623         .dev_stop               = tap_dev_stop,
1624         .dev_close              = tap_dev_close,
1625         .dev_configure          = tap_dev_configure,
1626         .dev_infos_get          = tap_dev_info,
1627         .rx_queue_setup         = tap_rx_queue_setup,
1628         .tx_queue_setup         = tap_tx_queue_setup,
1629         .rx_queue_start         = tap_rx_queue_start,
1630         .tx_queue_start         = tap_tx_queue_start,
1631         .rx_queue_stop          = tap_rx_queue_stop,
1632         .tx_queue_stop          = tap_tx_queue_stop,
1633         .rx_queue_release       = tap_rx_queue_release,
1634         .tx_queue_release       = tap_tx_queue_release,
1635         .flow_ctrl_get          = tap_flow_ctrl_get,
1636         .flow_ctrl_set          = tap_flow_ctrl_set,
1637         .link_update            = tap_link_update,
1638         .dev_set_link_up        = tap_link_set_up,
1639         .dev_set_link_down      = tap_link_set_down,
1640         .promiscuous_enable     = tap_promisc_enable,
1641         .promiscuous_disable    = tap_promisc_disable,
1642         .allmulticast_enable    = tap_allmulti_enable,
1643         .allmulticast_disable   = tap_allmulti_disable,
1644         .mac_addr_set           = tap_mac_set,
1645         .mtu_set                = tap_mtu_set,
1646         .set_mc_addr_list       = tap_set_mc_addr_list,
1647         .stats_get              = tap_stats_get,
1648         .stats_reset            = tap_stats_reset,
1649         .dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
1650         .rss_hash_update        = tap_rss_hash_update,
1651         .filter_ctrl            = tap_dev_filter_ctrl,
1652 };
1653
1654 static int
1655 eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
1656                    char *remote_iface, struct ether_addr *mac_addr,
1657                    enum rte_tuntap_type type)
1658 {
1659         int numa_node = rte_socket_id();
1660         struct rte_eth_dev *dev;
1661         struct pmd_internals *pmd;
1662         struct pmd_process_private *process_private;
1663         struct rte_eth_dev_data *data;
1664         struct ifreq ifr;
1665         int i;
1666
1667         TAP_LOG(DEBUG, "%s device on numa %u",
1668                         tuntap_name, rte_socket_id());
1669
1670         dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
1671         if (!dev) {
1672                 TAP_LOG(ERR, "%s Unable to allocate device struct",
1673                                 tuntap_name);
1674                 goto error_exit_nodev;
1675         }
1676
1677         process_private = (struct pmd_process_private *)
1678                 rte_zmalloc_socket(tap_name, sizeof(struct pmd_process_private),
1679                         RTE_CACHE_LINE_SIZE, dev->device->numa_node);
1680
        if (process_private == NULL) {
                TAP_LOG(ERR, "Failed to alloc memory for process private");
                /* Release the port: returning here would leak the ethdev */
                rte_eth_dev_release_port(dev);
                goto error_exit_nodev;
        }
1685         pmd = dev->data->dev_private;
1686         dev->process_private = process_private;
1687         pmd->dev = dev;
1688         snprintf(pmd->name, sizeof(pmd->name), "%s", tap_name);
1689         pmd->type = type;
1690
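        /*
         * Any socket would do for the SIOCxxx interface ioctls; an
         * AF_INET datagram socket is simply the conventional choice.
         */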
1691         pmd->ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0);
1692         if (pmd->ioctl_sock == -1) {
1693                 TAP_LOG(ERR,
1694                         "%s Unable to get a socket for management: %s",
1695                         tuntap_name, strerror(errno));
1696                 goto error_exit;
1697         }
1698
1699         /* Setup some default values */
1700         data = dev->data;
1701         data->dev_private = pmd;
1702         data->dev_flags = RTE_ETH_DEV_INTR_LSC;
1703         data->numa_node = numa_node;
1704
1705         data->dev_link = pmd_link;
1706         data->mac_addrs = &pmd->eth_addr;
1707         /* Set the number of RX and TX queues */
1708         data->nb_rx_queues = 0;
1709         data->nb_tx_queues = 0;
1710
1711         dev->dev_ops = &ops;
1712         dev->rx_pkt_burst = pmd_rx_burst;
1713         dev->tx_pkt_burst = pmd_tx_burst;
1714
1715         pmd->intr_handle.type = RTE_INTR_HANDLE_EXT;
1716         pmd->intr_handle.fd = -1;
1717         dev->intr_handle = &pmd->intr_handle;
1718
        /* Pre-set all queue file descriptors to -1 (not valid) */
1720         pmd->ka_fd = -1;
1721         for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
1722                 process_private->rxq_fds[i] = -1;
1723                 process_private->txq_fds[i] = -1;
1724         }
1725
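        /* TUN devices carry no Ethernet header, so only TAP needs a MAC */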
1726         if (pmd->type == ETH_TUNTAP_TYPE_TAP) {
1727                 if (is_zero_ether_addr(mac_addr))
1728                         eth_random_addr((uint8_t *)&pmd->eth_addr);
1729                 else
1730                         rte_memcpy(&pmd->eth_addr, mac_addr, sizeof(*mac_addr));
1731         }
1732
        /*
         * Allocate a TUN device keep-alive file descriptor that will only be
         * closed when the TUN device itself is closed or removed.
         * This keep-alive file descriptor guarantees that the TUN device
         * exists even when all of its queues are closed.
         */
1739         pmd->ka_fd = tun_alloc(pmd, 1);
1740         if (pmd->ka_fd == -1) {
1741                 TAP_LOG(ERR, "Unable to create %s interface", tuntap_name);
1742                 goto error_exit;
1743         }
1744
1745         ifr.ifr_mtu = dev->data->mtu;
1746         if (tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE) < 0)
1747                 goto error_exit;
1748
1749         if (pmd->type == ETH_TUNTAP_TYPE_TAP) {
1750                 memset(&ifr, 0, sizeof(struct ifreq));
1751                 ifr.ifr_hwaddr.sa_family = AF_LOCAL;
1752                 rte_memcpy(ifr.ifr_hwaddr.sa_data, &pmd->eth_addr,
1753                                 ETHER_ADDR_LEN);
1754                 if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0)
1755                         goto error_exit;
1756         }
1757
1758         /*
1759          * Set up everything related to rte_flow:
1760          * - netlink socket
1761          * - tap / remote if_index
1762          * - mandatory QDISCs
1763          * - rte_flow actual/implicit lists
1764          * - implicit rules
1765          */
1766         pmd->nlsk_fd = tap_nl_init(0);
1767         if (pmd->nlsk_fd == -1) {
1768                 TAP_LOG(WARNING, "%s: failed to create netlink socket.",
1769                         pmd->name);
1770                 goto disable_rte_flow;
1771         }
1772         pmd->if_index = if_nametoindex(pmd->name);
1773         if (!pmd->if_index) {
1774                 TAP_LOG(ERR, "%s: failed to get if_index.", pmd->name);
1775                 goto disable_rte_flow;
1776         }
1777         if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
1778                 TAP_LOG(ERR, "%s: failed to create multiq qdisc.",
1779                         pmd->name);
1780                 goto disable_rte_flow;
1781         }
1782         if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
1783                 TAP_LOG(ERR, "%s: failed to create ingress qdisc.",
1784                         pmd->name);
1785                 goto disable_rte_flow;
1786         }
1787         LIST_INIT(&pmd->flows);
1788
1789         if (strlen(remote_iface)) {
1790                 pmd->remote_if_index = if_nametoindex(remote_iface);
1791                 if (!pmd->remote_if_index) {
1792                         TAP_LOG(ERR, "%s: failed to get %s if_index.",
1793                                 pmd->name, remote_iface);
1794                         goto error_remote;
1795                 }
1796                 snprintf(pmd->remote_iface, RTE_ETH_NAME_MAX_LEN,
1797                          "%s", remote_iface);
1798
1799                 /* Save state of remote device */
1800                 tap_ioctl(pmd, SIOCGIFFLAGS, &pmd->remote_initial_flags, 0, REMOTE_ONLY);
1801
1802                 /* Replicate remote MAC address */
1803                 if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY) < 0) {
1804                         TAP_LOG(ERR, "%s: failed to get %s MAC address.",
1805                                 pmd->name, pmd->remote_iface);
1806                         goto error_remote;
1807                 }
1808                 rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data,
1809                            ETHER_ADDR_LEN);
1810                 /* The desired MAC is already in ifreq after SIOCGIFHWADDR. */
1811                 if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0) {
                        TAP_LOG(ERR, "%s: failed to set MAC address copied from %s.",
1813                                 pmd->name, remote_iface);
1814                         goto error_remote;
1815                 }
1816
1817                 /*
1818                  * Flush usually returns negative value because it tries to
1819                  * delete every QDISC (and on a running device, one QDISC at
1820                  * least is needed). Ignore negative return value.
1821                  */
1822                 qdisc_flush(pmd->nlsk_fd, pmd->remote_if_index);
1823                 if (qdisc_create_ingress(pmd->nlsk_fd,
1824                                          pmd->remote_if_index) < 0) {
1825                         TAP_LOG(ERR, "%s: failed to create ingress qdisc.",
1826                                 pmd->remote_iface);
1827                         goto error_remote;
1828                 }
1829                 LIST_INIT(&pmd->implicit_flows);
1830                 if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0 ||
1831                     tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0 ||
1832                     tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0 ||
1833                     tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0) {
1834                         TAP_LOG(ERR,
1835                                 "%s: failed to create implicit rules.",
1836                                 pmd->name);
1837                         goto error_remote;
1838                 }
1839         }
1840
1841         rte_eth_dev_probing_finish(dev);
1842         return 0;
1843
1844 disable_rte_flow:
1845         TAP_LOG(ERR, " Disabling rte flow support: %s(%d)",
1846                 strerror(errno), errno);
1847         if (strlen(remote_iface)) {
1848                 TAP_LOG(ERR, "Remote feature requires flow support.");
1849                 goto error_exit;
1850         }
1851         return 0;
1852
1853 error_remote:
1854         TAP_LOG(ERR, " Can't set up remote feature: %s(%d)",
1855                 strerror(errno), errno);
1856         tap_flow_implicit_flush(pmd, NULL);
1857
1858 error_exit:
        if (pmd->ioctl_sock >= 0)
1860                 close(pmd->ioctl_sock);
1861         /* mac_addrs must not be freed alone because part of dev_private */
1862         dev->data->mac_addrs = NULL;
1863         rte_eth_dev_release_port(dev);
1864
1865 error_exit_nodev:
1866         TAP_LOG(ERR, "%s Unable to initialize %s",
1867                 tuntap_name, rte_vdev_device_name(vdev));
1868
1869         return -EINVAL;
1870 }
1871
1872 static int
1873 set_interface_name(const char *key __rte_unused,
1874                    const char *value,
1875                    void *extra_args)
1876 {
1877         char *name = (char *)extra_args;
1878
        if (value)
                /* strlcpy() already reserves room for the terminating NUL */
                strlcpy(name, value, RTE_ETH_NAME_MAX_LEN);
        else
                snprintf(name, RTE_ETH_NAME_MAX_LEN, "%s%d",
                         DEFAULT_TAP_NAME, (tap_unit - 1));
1884
1885         return 0;
1886 }
1887
1888 static int
1889 set_remote_iface(const char *key __rte_unused,
1890                  const char *value,
1891                  void *extra_args)
1892 {
1893         char *name = (char *)extra_args;
1894
1895         if (value)
1896                 strlcpy(name, value, RTE_ETH_NAME_MAX_LEN);
1897
1898         return 0;
1899 }
1900
1901 static int parse_user_mac(struct ether_addr *user_mac,
1902                 const char *value)
1903 {
1904         unsigned int index = 0;
1905         char mac_temp[strlen(ETH_TAP_USR_MAC_FMT) + 1], *mac_byte = NULL;
1906
1907         if (user_mac == NULL || value == NULL)
1908                 return 0;
1909
1910         strlcpy(mac_temp, value, sizeof(mac_temp));
1911         mac_byte = strtok(mac_temp, ":");
1912
        while ((mac_byte != NULL) &&
                        (strlen(mac_byte) <= 2) &&
                        (strlen(mac_byte) == strspn(mac_byte,
                                        ETH_TAP_CMP_MAC_FMT))) {
                /*
                 * Guard the 6-byte addr_bytes array against over-long
                 * input; keep counting so the caller still rejects it.
                 */
                if (index < ETHER_ADDR_LEN)
                        user_mac->addr_bytes[index] =
                                strtoul(mac_byte, NULL, 16);
                index++;
                mac_byte = strtok(NULL, ":");
        }
1920
1921         return index;
1922 }
1923
1924 static int
1925 set_mac_type(const char *key __rte_unused,
1926              const char *value,
1927              void *extra_args)
1928 {
1929         struct ether_addr *user_mac = extra_args;
1930
1931         if (!value)
1932                 return 0;
1933
1934         if (!strncasecmp(ETH_TAP_MAC_FIXED, value, strlen(ETH_TAP_MAC_FIXED))) {
1935                 static int iface_idx;
1936
                /* fixed mac = 00:64:74:61:70:<ASCII digit of iface_idx> */
1938                 memcpy((char *)user_mac->addr_bytes, "\0dtap", ETHER_ADDR_LEN);
1939                 user_mac->addr_bytes[ETHER_ADDR_LEN - 1] = iface_idx++ + '0';
1940                 goto success;
1941         }
1942
1943         if (parse_user_mac(user_mac, value) != 6)
1944                 goto error;
1945 success:
1946         TAP_LOG(DEBUG, "TAP user MAC param (%s)", value);
1947         return 0;
1948
1949 error:
1950         TAP_LOG(ERR, "TAP user MAC (%s) is not in format (%s|%s)",
1951                 value, ETH_TAP_MAC_FIXED, ETH_TAP_USR_MAC_FMT);
1952         return -1;
1953 }
1954
/*
 * Open a TUN interface device. The TUN PMD:
 * 1) sets the device type to ETH_TUNTAP_TYPE_TUN (no Ethernet layer);
 * 2) takes the interface name from the "iface" argument;
 * 3) reports a fixed 10G link speed, as the interface is virtual.
 */
1961 static int
1962 rte_pmd_tun_probe(struct rte_vdev_device *dev)
1963 {
1964         const char *name, *params;
1965         int ret;
1966         struct rte_kvargs *kvlist = NULL;
1967         char tun_name[RTE_ETH_NAME_MAX_LEN];
1968         char remote_iface[RTE_ETH_NAME_MAX_LEN];
1969         struct rte_eth_dev *eth_dev;
1970
1971         strcpy(tuntap_name, "TUN");
1972
1973         name = rte_vdev_device_name(dev);
1974         params = rte_vdev_device_args(dev);
1975         memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);
1976
1977         if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
1978             strlen(params) == 0) {
1979                 eth_dev = rte_eth_dev_attach_secondary(name);
1980                 if (!eth_dev) {
1981                         TAP_LOG(ERR, "Failed to probe %s", name);
1982                         return -1;
1983                 }
1984                 eth_dev->dev_ops = &ops;
1985                 eth_dev->device = &dev->device;
1986                 rte_eth_dev_probing_finish(eth_dev);
1987                 return 0;
1988         }
1989
1990         snprintf(tun_name, sizeof(tun_name), "%s%u",
1991                  DEFAULT_TUN_NAME, tun_unit++);
1992
1993         if (params && (params[0] != '\0')) {
1994                 TAP_LOG(DEBUG, "parameters (%s)", params);
1995
1996                 kvlist = rte_kvargs_parse(params, valid_arguments);
1997                 if (kvlist) {
1998                         if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
1999                                 ret = rte_kvargs_process(kvlist,
2000                                         ETH_TAP_IFACE_ARG,
2001                                         &set_interface_name,
2002                                         tun_name);
2003
2004                                 if (ret == -1)
2005                                         goto leave;
2006                         }
2007                 }
2008         }
2009         pmd_link.link_speed = ETH_SPEED_NUM_10G;
2010
2011         TAP_LOG(NOTICE, "Initializing pmd_tun for %s as %s",
2012                 name, tun_name);
2013
        ret = eth_dev_tap_create(dev, tun_name, remote_iface, NULL,
                ETH_TUNTAP_TYPE_TUN);
2016
2017 leave:
        if (ret < 0) {
2019                 TAP_LOG(ERR, "Failed to create pmd for %s as %s",
2020                         name, tun_name);
2021                 tun_unit--; /* Restore the unit number */
2022         }
2023         rte_kvargs_free(kvlist);
2024
2025         return ret;
2026 }
2027
/* Request the queue file descriptors from the primary process. */
2029 static int
2030 tap_mp_attach_queues(const char *port_name, struct rte_eth_dev *dev)
2031 {
2032         int ret;
2033         struct timespec timeout = {.tv_sec = 1, .tv_nsec = 0};
2034         struct rte_mp_msg request, *reply;
2035         struct rte_mp_reply replies;
2036         struct ipc_queues *request_param = (struct ipc_queues *)request.param;
2037         struct ipc_queues *reply_param;
2038         struct pmd_process_private *process_private = dev->process_private;
2039         int queue, fd_iterator;
2040
2041         /* Prepare the request */
2042         strlcpy(request.name, TAP_MP_KEY, sizeof(request.name));
2043         strlcpy(request_param->port_name, port_name,
2044                 sizeof(request_param->port_name));
2045         request.len_param = sizeof(*request_param);
2046         /* Send request and receive reply */
2047         ret = rte_mp_request_sync(&request, &replies, &timeout);
2048         if (ret < 0) {
2049                 TAP_LOG(ERR, "Failed to request queues from primary: %d",
2050                         rte_errno);
2051                 return -1;
2052         }
2053         reply = &replies.msgs[0];
2054         reply_param = (struct ipc_queues *)reply->param;
2055         TAP_LOG(DEBUG, "Received IPC reply for %s", reply_param->port_name);
2056
2057         /* Attach the queues from received file descriptors */
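        /* The fds were passed as SCM_RIGHTS ancillary data over the rte_mp
         * Unix socket, so the kernel has already translated them into
         * descriptors valid in this (secondary) process.
         */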
2058         dev->data->nb_rx_queues = reply_param->rxq_count;
2059         dev->data->nb_tx_queues = reply_param->txq_count;
2060         fd_iterator = 0;
2061         for (queue = 0; queue < reply_param->rxq_count; queue++)
2062                 process_private->rxq_fds[queue] = reply->fds[fd_iterator++];
2063         for (queue = 0; queue < reply_param->txq_count; queue++)
2064                 process_private->txq_fds[queue] = reply->fds[fd_iterator++];
2065
2066         return 0;
2067 }
2068
/* Send the queue file descriptors from the primary to a secondary process. */
2070 static int
2071 tap_mp_sync_queues(const struct rte_mp_msg *request, const void *peer)
2072 {
2073         struct rte_eth_dev *dev;
2074         struct pmd_process_private *process_private;
2075         struct rte_mp_msg reply;
2076         const struct ipc_queues *request_param =
2077                 (const struct ipc_queues *)request->param;
2078         struct ipc_queues *reply_param =
2079                 (struct ipc_queues *)reply.param;
2080         uint16_t port_id;
2081         int queue;
2082         int ret;
2083
2084         /* Get requested port */
2085         TAP_LOG(DEBUG, "Received IPC request for %s", request_param->port_name);
2086         ret = rte_eth_dev_get_port_by_name(request_param->port_name, &port_id);
2087         if (ret) {
2088                 TAP_LOG(ERR, "Failed to get port id for %s",
2089                         request_param->port_name);
2090                 return -1;
2091         }
2092         dev = &rte_eth_devices[port_id];
2093         process_private = dev->process_private;
2094
        /* Fill file descriptors for all queues */
        reply.num_fds = 0;
        reply_param->rxq_count = 0;
        for (queue = 0; queue < dev->data->nb_rx_queues; queue++) {
                reply.fds[reply.num_fds++] = process_private->rxq_fds[queue];
                reply_param->rxq_count++;
        }
        reply_param->txq_count = 0;
        for (queue = 0; queue < dev->data->nb_tx_queues; queue++) {
                reply.fds[reply.num_fds++] = process_private->txq_fds[queue];
                reply_param->txq_count++;
        }
        /* Both counts are known only once the two loops have run */
        RTE_ASSERT(reply_param->rxq_count == dev->data->nb_rx_queues);
        RTE_ASSERT(reply_param->txq_count == dev->data->nb_tx_queues);
        RTE_ASSERT(reply.num_fds <= RTE_MP_MAX_FD_NUM);
2111
2112         /* Send reply */
2113         strlcpy(reply.name, request->name, sizeof(reply.name));
2114         strlcpy(reply_param->port_name, request_param->port_name,
2115                 sizeof(reply_param->port_name));
2116         reply.len_param = sizeof(*reply_param);
2117         if (rte_mp_reply(&reply, peer) < 0) {
                TAP_LOG(ERR, "Failed to reply to IPC request to sync queues");
2119                 return -1;
2120         }
2121         return 0;
2122 }
2123
/* Open a TAP interface device. */
2126 static int
2127 rte_pmd_tap_probe(struct rte_vdev_device *dev)
2128 {
2129         const char *name, *params;
2130         int ret;
2131         struct rte_kvargs *kvlist = NULL;
2132         int speed;
2133         char tap_name[RTE_ETH_NAME_MAX_LEN];
2134         char remote_iface[RTE_ETH_NAME_MAX_LEN];
2135         struct ether_addr user_mac = { .addr_bytes = {0} };
2136         struct rte_eth_dev *eth_dev;
2137         int tap_devices_count_increased = 0;
2138
2139         strcpy(tuntap_name, "TAP");
2140
2141         name = rte_vdev_device_name(dev);
2142         params = rte_vdev_device_args(dev);
2143
2144         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
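                /* A secondary process attaches to the primary's ethdev and
                 * then asks it for the queue fds over the IPC channel.
                 */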
2145                 eth_dev = rte_eth_dev_attach_secondary(name);
2146                 if (!eth_dev) {
2147                         TAP_LOG(ERR, "Failed to probe %s", name);
2148                         return -1;
2149                 }
2150                 eth_dev->dev_ops = &ops;
2151                 eth_dev->device = &dev->device;
2152                 eth_dev->rx_pkt_burst = pmd_rx_burst;
2153                 eth_dev->tx_pkt_burst = pmd_tx_burst;
2154                 if (!rte_eal_primary_proc_alive(NULL)) {
2155                         TAP_LOG(ERR, "Primary process is missing");
2156                         return -1;
2157                 }
2158                 eth_dev->process_private = (struct pmd_process_private *)
2159                         rte_zmalloc_socket(name,
2160                                 sizeof(struct pmd_process_private),
2161                                 RTE_CACHE_LINE_SIZE,
2162                                 eth_dev->device->numa_node);
2163                 if (eth_dev->process_private == NULL) {
2164                         TAP_LOG(ERR,
2165                                 "Failed to alloc memory for process private");
2166                         return -1;
2167                 }
2168
2169                 ret = tap_mp_attach_queues(name, eth_dev);
2170                 if (ret != 0)
2171                         return -1;
2172                 rte_eth_dev_probing_finish(eth_dev);
2173                 return 0;
2174         }
2175
2176         speed = ETH_SPEED_NUM_10G;
2177         snprintf(tap_name, sizeof(tap_name), "%s%u",
2178                  DEFAULT_TAP_NAME, tap_unit++);
2179         memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);
2180
2181         if (params && (params[0] != '\0')) {
2182                 TAP_LOG(DEBUG, "parameters (%s)", params);
2183
2184                 kvlist = rte_kvargs_parse(params, valid_arguments);
2185                 if (kvlist) {
2186                         if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
2187                                 ret = rte_kvargs_process(kvlist,
2188                                                          ETH_TAP_IFACE_ARG,
2189                                                          &set_interface_name,
2190                                                          tap_name);
2191                                 if (ret == -1)
2192                                         goto leave;
2193                         }
2194
2195                         if (rte_kvargs_count(kvlist, ETH_TAP_REMOTE_ARG) == 1) {
2196                                 ret = rte_kvargs_process(kvlist,
2197                                                          ETH_TAP_REMOTE_ARG,
2198                                                          &set_remote_iface,
2199                                                          remote_iface);
2200                                 if (ret == -1)
2201                                         goto leave;
2202                         }
2203
2204                         if (rte_kvargs_count(kvlist, ETH_TAP_MAC_ARG) == 1) {
2205                                 ret = rte_kvargs_process(kvlist,
2206                                                          ETH_TAP_MAC_ARG,
2207                                                          &set_mac_type,
2208                                                          &user_mac);
2209                                 if (ret == -1)
2210                                         goto leave;
2211                         }
2212                 }
2213         }
2214         pmd_link.link_speed = speed;
2215
2216         TAP_LOG(NOTICE, "Initializing pmd_tap for %s as %s",
2217                 name, tap_name);
2218
        /* Register the IPC queue-sync callback once, for the first device */
2220         if (!tap_devices_count) {
2221                 ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
2222                 if (ret < 0) {
2223                         TAP_LOG(ERR, "%s: Failed to register IPC callback: %s",
2224                                 tuntap_name, strerror(rte_errno));
2225                         goto leave;
2226                 }
2227         }
2228         tap_devices_count++;
2229         tap_devices_count_increased = 1;
2230         ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
2231                 ETH_TUNTAP_TYPE_TAP);
2232
2233 leave:
        if (ret < 0) {
2235                 TAP_LOG(ERR, "Failed to create pmd for %s as %s",
2236                         name, tap_name);
2237                 if (tap_devices_count_increased == 1) {
2238                         if (tap_devices_count == 1)
2239                                 rte_mp_action_unregister(TAP_MP_KEY);
2240                         tap_devices_count--;
2241                 }
2242                 tap_unit--;             /* Restore the unit number */
2243         }
2244         rte_kvargs_free(kvlist);
2245
2246         return ret;
2247 }
2248
/* Detach a TUN/TAP device. */
2251 static int
2252 rte_pmd_tap_remove(struct rte_vdev_device *dev)
2253 {
2254         struct rte_eth_dev *eth_dev = NULL;
2255         struct pmd_internals *internals;
2256         struct pmd_process_private *process_private;
2257         int i;
2258
2259         /* find the ethdev entry */
2260         eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
2261         if (!eth_dev)
2262                 return -ENODEV;
2263
2264         /* mac_addrs must not be freed alone because part of dev_private */
2265         eth_dev->data->mac_addrs = NULL;
2266
2267         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
2268                 return rte_eth_dev_release_port(eth_dev);
2269
2270         internals = eth_dev->data->dev_private;
2271         process_private = eth_dev->process_private;
2272
2273         TAP_LOG(DEBUG, "Closing %s Ethernet device on numa %u",
2274                 (internals->type == ETH_TUNTAP_TYPE_TAP) ? "TAP" : "TUN",
2275                 rte_socket_id());
2276
2277         if (internals->nlsk_fd) {
2278                 tap_flow_flush(eth_dev, NULL);
2279                 tap_flow_implicit_flush(internals, NULL);
2280                 tap_nl_final(internals->nlsk_fd);
2281         }
2282         for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
2283                 if (process_private->rxq_fds[i] != -1) {
2284                         close(process_private->rxq_fds[i]);
2285                         process_private->rxq_fds[i] = -1;
2286                 }
2287                 if (process_private->txq_fds[i] != -1) {
2288                         close(process_private->txq_fds[i]);
2289                         process_private->txq_fds[i] = -1;
2290                 }
2291         }
2292
        close(internals->ioctl_sock);
        /*
         * Close the keep-alive fd while internals is still valid:
         * rte_eth_dev_release_port() frees dev_private in the primary
         * process, so touching internals afterwards is a use-after-free.
         */
        if (internals->ka_fd != -1) {
                close(internals->ka_fd);
                internals->ka_fd = -1;
        }
        rte_free(eth_dev->process_private);
        if (tap_devices_count == 1)
                rte_mp_action_unregister(TAP_MP_KEY);
        tap_devices_count--;
        rte_eth_dev_release_port(eth_dev);

        return 0;
2305 }
2306
2307 static struct rte_vdev_driver pmd_tun_drv = {
2308         .probe = rte_pmd_tun_probe,
2309         .remove = rte_pmd_tap_remove,
2310 };
2311
2312 static struct rte_vdev_driver pmd_tap_drv = {
2313         .probe = rte_pmd_tap_probe,
2314         .remove = rte_pmd_tap_remove,
2315 };
2316
2317 RTE_PMD_REGISTER_VDEV(net_tap, pmd_tap_drv);
2318 RTE_PMD_REGISTER_VDEV(net_tun, pmd_tun_drv);
2319 RTE_PMD_REGISTER_ALIAS(net_tap, eth_tap);
2320 RTE_PMD_REGISTER_PARAM_STRING(net_tun,
2321                               ETH_TAP_IFACE_ARG "=<string> ");
2322 RTE_PMD_REGISTER_PARAM_STRING(net_tap,
2323                               ETH_TAP_IFACE_ARG "=<string> "
2324                               ETH_TAP_MAC_ARG "=" ETH_TAP_MAC_ARG_FMT " "
2325                               ETH_TAP_REMOTE_ARG "=<string>");
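
/*
 * Example (hypothetical interface names) of instantiating these drivers
 * from an application command line:
 *   --vdev=net_tap0,iface=dtap0,mac=fixed
 *   --vdev=net_tap1,iface=dtap1,remote=eth1
 *   --vdev=net_tun0,iface=dtun0
 */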

int tap_logtype;
2327
2328 RTE_INIT(tap_init_log)
2329 {
2330         tap_logtype = rte_log_register("pmd.net.tap");
2331         if (tap_logtype >= 0)
2332                 rte_log_set_level(tap_logtype, RTE_LOG_NOTICE);
2333 }