/*
 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 * The full GNU General Public License is included in this distribution
 * in the file called LICENSE.GPL.
 *
 * Contact Information:
 * Intel Corporation
 */

#include <linux/module.h>
#include <linux/net.h>
#include <net/sock.h>
#include <linux/virtio_net.h>
#include <linux/wait.h>
#include <linux/nsproxy.h>
#include <linux/sched.h>
#include <linux/if_tun.h>
#include <linux/version.h>
#include <linux/file.h>

/* KNI-local definitions: struct kni_dev, struct kni_vhost_queue and the mbuf FIFO helpers */
#include "kni_dev.h"
#include "kni_fifo.h"
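
/*
 * Experimental vhost backend for KNI: each KNI device is exposed to user
 * space as a raw socket with a tun/tap-style ioctl interface, so packets
 * can be sent and received directly through the KNI FIFOs (tx_q/alloc_q
 * on transmit, rx_q/free_q on receive).
 */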
#ifdef HAVE_STATIC_SOCK_MAP_FD
static int
kni_sock_map_fd(struct socket *sock)
{
        struct file *file;
        int fd = get_unused_fd_flags(0);

        if (fd < 0)
                return fd;

        file = sock_alloc_file(sock, 0, NULL);
        if (IS_ERR(file)) {
                put_unused_fd(fd);
                return PTR_ERR(file);
        }
        fd_install(fd, file);
        return fd;
}
#endif

static struct proto kni_raw_proto = {
        .name = "kni_vhost",
        .owner = THIS_MODULE,
        .obj_size = sizeof(struct kni_vhost_queue),
};
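
/*
 * Transmit path: copy one packet from the user-space msghdr into an mbuf
 * taken from alloc_q and hand it to the DPDK application through tx_q.
 */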
static int
kni_vhost_net_tx(struct kni_dev *kni, struct msghdr *m,
                 uint32_t offset, uint32_t len)
{
        int ret;
        struct rte_kni_mbuf *pkt_kva = NULL;
        struct rte_kni_mbuf *pkt_va = NULL;

        pr_debug("tx offset=%d, len=%d, iovlen=%d\n",
#ifdef HAVE_IOV_ITER_MSGHDR
                 offset, len, (int)m->msg_iter.iov->iov_len);
#else
                 offset, len, (int)m->msg_iov->iov_len);
#endif

        /*
         * Check if it has at least one free entry in tx_q and
         * one entry in alloc_q.
         */
        if (kni_fifo_free_count(kni->tx_q) == 0 ||
            kni_fifo_count(kni->alloc_q) == 0) {
                /*
                 * If there is no free entry in tx_q or no entry in alloc_q,
                 * drop the packet and go out.
                 */
                goto drop;
        }

        /* dequeue a mbuf from alloc_q */
        ret = kni_fifo_get(kni->alloc_q, (void **)&pkt_va, 1);
        if (likely(ret == 1)) {
                void *data_kva;

                /* translate the mbuf and its data buffer to kernel addresses */
                pkt_kva = (void *)pkt_va - kni->mbuf_va + kni->mbuf_kva;
                data_kva = pkt_kva->buf_addr + pkt_kva->data_off
                           - kni->mbuf_va + kni->mbuf_kva;

#ifdef HAVE_IOV_ITER_MSGHDR
                copy_from_iter(data_kva, len, &m->msg_iter);
#else
                memcpy_fromiovecend(data_kva, m->msg_iov, offset, len);
#endif

                /* pad short frames to the minimum Ethernet length */
                if (unlikely(len < ETH_ZLEN)) {
                        memset(data_kva + len, 0, ETH_ZLEN - len);
                        len = ETH_ZLEN;
                }
                pkt_kva->pkt_len = len;
                pkt_kva->data_len = len;

                /* enqueue mbuf into tx_q */
                ret = kni_fifo_put(kni->tx_q, (void **)&pkt_va, 1);
                if (unlikely(ret != 1)) {
                        /* Failing should not happen */
                        pr_err("Fail to enqueue mbuf into tx_q\n");
                        goto drop;
                }
        } else {
                /* Failing should not happen */
                pr_err("Fail to dequeue mbuf from alloc_q\n");
                goto drop;
        }

        /* update statistics */
        kni->stats.tx_bytes += len;
        kni->stats.tx_packets++;

        return 0;

drop:
        /* update statistics */
        kni->stats.tx_dropped++;

        return 0;
}
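
/*
 * Receive path: pull one sk_buff (wrapping an rte_kni_mbuf) off the socket
 * receive queue, copy its payload to the user-space msghdr and return the
 * mbuf to the DPDK application through free_q.
 */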
static int
kni_vhost_net_rx(struct kni_dev *kni, struct msghdr *m,
                 uint32_t offset, uint32_t len)
{
        uint32_t pkt_len;
        struct rte_kni_mbuf *kva;
        struct rte_kni_mbuf *va;
        void *data_kva;
        struct sk_buff *skb;
        struct kni_vhost_queue *q = kni->vhost_queue;

        if (unlikely(q == NULL))
                return 0;

        /* ensure at least one entry in free_q */
        if (unlikely(kni_fifo_free_count(kni->free_q) == 0))
                return 0;

        skb = skb_dequeue(&q->sk.sk_receive_queue);
        if (unlikely(skb == NULL))
                return 0;

        kva = (struct rte_kni_mbuf *)skb->data;

        /* free skb to cache */
        if (unlikely(kni_fifo_put(q->fifo, (void **)&skb, 1) != 1))
                /* Failing should not happen */
                pr_err("Fail to enqueue entries into rx cache fifo\n");

        pkt_len = kva->data_len;
        if (unlikely(pkt_len > len))
                goto drop;

        pr_debug("rx offset=%d, len=%d, pkt_len=%d, iovlen=%d\n",
#ifdef HAVE_IOV_ITER_MSGHDR
                 offset, len, pkt_len, (int)m->msg_iter.iov->iov_len);
#else
                 offset, len, pkt_len, (int)m->msg_iov->iov_len);
#endif

        /* copy the packet data to user space */
        data_kva = kva->buf_addr + kva->data_off - kni->mbuf_va + kni->mbuf_kva;
#ifdef HAVE_IOV_ITER_MSGHDR
        if (unlikely(copy_to_iter(data_kva, pkt_len, &m->msg_iter) != pkt_len))
                goto drop;
#else
        if (unlikely(memcpy_toiovecend(m->msg_iov, data_kva, offset, pkt_len)))
                goto drop;
#endif

        /* Update statistics */
        kni->stats.rx_bytes += pkt_len;
        kni->stats.rx_packets++;

        /* enqueue mbufs into free_q */
        va = (void *)kva - kni->mbuf_kva + kni->mbuf_va;
        if (unlikely(kni_fifo_put(kni->free_q, (void **)&va, 1) != 1))
                /* Failing should not happen */
                pr_err("Fail to enqueue entries into free_q\n");

        pr_debug("receive done %d\n", pkt_len);

        return pkt_len;

drop:
        /* Update drop statistics */
        kni->stats.rx_dropped++;

        return 0;
}
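
/* Poll: readable when rx_q has mbufs pending, writable when the socket is. */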
static uint32_t
kni_sock_poll(struct file *file, struct socket *sock, poll_table *wait)
{
        struct kni_vhost_queue *q =
                container_of(sock->sk, struct kni_vhost_queue, sk);
        struct kni_dev *kni;
        uint32_t mask = 0;

        if (unlikely(q == NULL || q->kni == NULL))
                return POLLERR;

        kni = q->kni;
#ifdef HAVE_SOCKET_WQ
        pr_debug("start kni_poll on group %d, wq 0x%16llx\n",
                 kni->group_id, (uint64_t)sock->wq);
        poll_wait(file, &sock->wq->wait, wait);
#else
        pr_debug("start kni_poll on group %d, wait at 0x%16llx\n",
                 kni->group_id, (uint64_t)&sock->wait);
        poll_wait(file, &sock->wait, wait);
#endif

        if (kni_fifo_count(kni->rx_q) > 0)
                mask |= POLLIN | POLLRDNORM;

        if (sock_writeable(&q->sk) ||
#ifdef SOCKWQ_ASYNC_NOSPACE
            (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock->flags) &&
             sock_writeable(&q->sk)))
#else
            (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock->flags) &&
             sock_writeable(&q->sk)))
#endif
                mask |= POLLOUT | POLLWRNORM;

        return mask;
}
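
/*
 * Wrap one mbuf (translated to its kernel virtual address) in a cached
 * sk_buff and queue it on the socket receive queue, where
 * kni_vhost_net_rx() will pick it up.
 */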
static inline void
kni_vhost_enqueue(struct kni_dev *kni, struct kni_vhost_queue *q,
                  struct sk_buff *skb, struct rte_kni_mbuf *va)
{
        struct rte_kni_mbuf *kva;

        kva = (void *)(va) - kni->mbuf_va + kni->mbuf_kva;
        skb->data = (unsigned char *)kva;
        skb->len = kva->data_len;
        skb_queue_tail(&q->sk.sk_receive_queue, skb);
}

static inline void
kni_vhost_enqueue_burst(struct kni_dev *kni, struct kni_vhost_queue *q,
                        struct sk_buff **skb, struct rte_kni_mbuf **va)
{
        uint32_t i;

        for (i = 0; i < RX_BURST_SZ; skb++, va++, i++)
                kni_vhost_enqueue(kni, q, *skb, *va);
}
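
/*
 * Called from the KNI receive path: move pending mbufs from rx_q onto the
 * socket receive queue in bursts of RX_BURST_SZ, then wake any poller.
 */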
static int
kni_chk_vhost_rx(struct kni_dev *kni)
{
        struct kni_vhost_queue *q = kni->vhost_queue;
        uint32_t nb_in, nb_mbuf, nb_skb;
        const uint32_t BURST_MASK = RX_BURST_SZ - 1;
        uint32_t nb_burst, nb_backlog, i;
        struct sk_buff *skb[RX_BURST_SZ];
        struct rte_kni_mbuf *va[RX_BURST_SZ];

        /* check whether the backend is being stopped */
        if (unlikely(BE_STOP & kni->vq_status)) {
                kni->vq_status |= BE_FINISH;
                return 0;
        }

        if (unlikely(q == NULL))
                return 0;

        nb_skb = kni_fifo_count(q->fifo);
        nb_mbuf = kni_fifo_count(kni->rx_q);

        nb_in = min(nb_mbuf, nb_skb);
        nb_in = min_t(uint32_t, nb_in, RX_BURST_SZ);
        nb_burst = (nb_in & ~BURST_MASK);
        nb_backlog = (nb_in & BURST_MASK);

        /* enqueue skb_queue per RX_BURST_SZ bulk */
        if (nb_burst != 0) {
                if (unlikely(kni_fifo_get(kni->rx_q, (void **)&va, RX_BURST_SZ)
                                != RX_BURST_SZ))
                        goto except;

                if (unlikely(kni_fifo_get(q->fifo, (void **)&skb, RX_BURST_SZ)
                                != RX_BURST_SZ))
                        goto except;

                kni_vhost_enqueue_burst(kni, q, skb, va);
        }

        /* all leftover, do one by one */
        for (i = 0; i < nb_backlog; ++i) {
                if (unlikely(kni_fifo_get(kni->rx_q, (void **)&va, 1) != 1))
                        goto except;

                if (unlikely(kni_fifo_get(q->fifo, (void **)&skb, 1) != 1))
                        goto except;

                kni_vhost_enqueue(kni, q, *skb, *va);
        }

        /* On-demand wake up */
        if ((nb_in == RX_BURST_SZ) || (nb_skb == 0) ||
            ((nb_mbuf < RX_BURST_SZ) && (nb_mbuf != 0))) {
                wake_up_interruptible_poll(sk_sleep(&q->sk),
                                POLLIN | POLLRDNORM | POLLRDBAND);
                pr_debug("RX CHK KICK nb_mbuf %d, nb_skb %d, nb_in %d\n",
                         nb_mbuf, nb_skb, nb_in);
        }

        return 0;

except:
        /* Failing should not happen */
        pr_err("Fail to enqueue fifo, it shouldn't happen\n");

        return 0;
}
static int
#ifdef HAVE_KIOCB_MSG_PARAM
kni_sock_sndmsg(struct kiocb *iocb, struct socket *sock,
                struct msghdr *m, size_t total_len)
#else
kni_sock_sndmsg(struct socket *sock,
                struct msghdr *m, size_t total_len)
#endif /* HAVE_KIOCB_MSG_PARAM */
{
        struct kni_vhost_queue *q =
                container_of(sock->sk, struct kni_vhost_queue, sk);
        int vnet_hdr_len = 0;
        unsigned long len = total_len;

        if (unlikely(q == NULL || q->kni == NULL))
                return 0;

        pr_debug("kni_sndmsg len %ld, flags 0x%08x, nb_iov %d\n",
#ifdef HAVE_IOV_ITER_MSGHDR
                 len, q->flags, (int)m->msg_iter.iov->iov_len);
#else
                 len, q->flags, (int)m->msg_iovlen);
#endif

#ifdef RTE_KNI_VHOST_VNET_HDR_EN
        /* account for the virtio-net header prepended by user space */
        if (likely(q->flags & IFF_VNET_HDR)) {
                vnet_hdr_len = q->vnet_hdr_sz;
                if (unlikely(len < vnet_hdr_len))
                        return 0;
                len -= vnet_hdr_len;
        }
#endif

        if (unlikely(len < ETH_HLEN + q->vnet_hdr_sz))
                return 0;

        return kni_vhost_net_tx(q->kni, m, vnet_hdr_len, len);
}
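
/* recvmsg(): receive one packet, prepending a zeroed virtio-net header when enabled. */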
static int
#ifdef HAVE_KIOCB_MSG_PARAM
kni_sock_rcvmsg(struct kiocb *iocb, struct socket *sock,
                struct msghdr *m, size_t len, int flags)
#else
kni_sock_rcvmsg(struct socket *sock,
                struct msghdr *m, size_t len, int flags)
#endif /* HAVE_KIOCB_MSG_PARAM */
{
        int vnet_hdr_len = 0;
        int pkt_len = 0;
        struct kni_vhost_queue *q =
                container_of(sock->sk, struct kni_vhost_queue, sk);
        static struct virtio_net_hdr
                __attribute__ ((unused)) vnet_hdr = {
                .gso_type = VIRTIO_NET_HDR_GSO_NONE
        };

        if (unlikely(q == NULL || q->kni == NULL))
                return 0;

#ifdef RTE_KNI_VHOST_VNET_HDR_EN
        /* reserve room for the virtio-net header copied to user space below */
        if (likely(q->flags & IFF_VNET_HDR)) {
                vnet_hdr_len = q->vnet_hdr_sz;
                if (unlikely(len < vnet_hdr_len))
                        return -EINVAL;
                len -= vnet_hdr_len;
        }
#endif

        pkt_len = kni_vhost_net_rx(q->kni, m, vnet_hdr_len, len);
        if (unlikely(pkt_len == 0))
                return 0;

#ifdef RTE_KNI_VHOST_VNET_HDR_EN
        /* no need to copy hdr when no pkt received */
#ifdef HAVE_IOV_ITER_MSGHDR
        if (unlikely(copy_to_iter((void *)&vnet_hdr, vnet_hdr_len,
                                  &m->msg_iter) != vnet_hdr_len))
                return -EFAULT;
#else
        if (unlikely(memcpy_toiovecend(m->msg_iov,
                        (void *)&vnet_hdr, 0, vnet_hdr_len)))
                return -EFAULT;
#endif /* HAVE_IOV_ITER_MSGHDR */
#endif /* RTE_KNI_VHOST_VNET_HDR_EN */
        pr_debug("kni_rcvmsg expect_len %ld, flags 0x%08x, pkt_len %d\n",
                 (unsigned long)len, q->flags, pkt_len);

        return pkt_len + vnet_hdr_len;
}

/* dummy tap-like ioctl, so user space can drive the socket like a tun/tap fd */
static int
kni_sock_ioctl(struct socket *sock, uint32_t cmd, unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        struct ifreq __user *ifr = argp;
        uint32_t __user *up = argp;
        struct kni_vhost_queue *q =
                container_of(sock->sk, struct kni_vhost_queue, sk);
        struct kni_dev *kni;
        uint32_t u;
        int __user *sp = argp;
        int s;
        int ret;

        pr_debug("tap ioctl cmd 0x%08x\n", cmd);

        switch (cmd) {
        case TUNSETIFF:
                pr_debug("TUNSETIFF\n");
                /* ignore the name, just look at flags */
                if (get_user(u, &ifr->ifr_flags))
                        return -EFAULT;
                if ((u & ~IFF_VNET_HDR) != (IFF_NO_PI | IFF_TAP))
                        return -EINVAL;
                q->flags = u;
                return 0;

        case TUNGETIFF:
                pr_debug("TUNGETIFF\n");
                rcu_read_lock_bh();
                kni = rcu_dereference_bh(q->kni);
                if (kni)
                        dev_hold(kni->net_dev);
                rcu_read_unlock_bh();
                if (!kni)
                        return -ENOLINK;
                ret = 0;
                if (copy_to_user(&ifr->ifr_name, kni->net_dev->name, IFNAMSIZ)
                    || put_user(q->flags, &ifr->ifr_flags))
                        ret = -EFAULT;
                dev_put(kni->net_dev);
                return ret;

        case TUNGETFEATURES:
                pr_debug("TUNGETFEATURES\n");
                u = IFF_TAP | IFF_NO_PI;
#ifdef RTE_KNI_VHOST_VNET_HDR_EN
                u |= IFF_VNET_HDR;
#endif
                return put_user(u, up) ? -EFAULT : 0;

        case TUNSETSNDBUF:
                pr_debug("TUNSETSNDBUF\n");
                if (get_user(u, up))
                        return -EFAULT;
                q->sk.sk_sndbuf = u;
                return 0;

        case TUNGETVNETHDRSZ:
                s = q->vnet_hdr_sz;
                pr_debug("TUNGETVNETHDRSZ %d\n", s);
                return put_user(s, sp) ? -EFAULT : 0;

        case TUNSETVNETHDRSZ:
                if (get_user(s, sp))
                        return -EFAULT;
                if (s < (int)sizeof(struct virtio_net_hdr))
                        return -EINVAL;
                pr_debug("TUNSETVNETHDRSZ %d\n", s);
                q->vnet_hdr_sz = s;
                return 0;

        case TUNSETOFFLOAD:
                pr_debug("TUNSETOFFLOAD %lx\n", arg);
#ifdef RTE_KNI_VHOST_VNET_HDR_EN
                /* no offloads are supported yet */
                if (!(q->flags & IFF_VNET_HDR))
                        return -EINVAL;
                return 0;
#else
                return -EINVAL;
#endif

        default:
                pr_debug("NOT SUPPORT\n");
                return -EINVAL;
        }
}

static int
kni_sock_compat_ioctl(struct socket *sock, uint32_t cmd,
                      unsigned long arg)
{
        /* 32-bit app on 64-bit OS to be supported later */
        pr_debug("Not implemented.\n");

        return -EINVAL;
}
#define KNI_VHOST_WAIT_WQ_SAFE()                                \
do {                                                            \
        while ((BE_FINISH | BE_STOP) == kni->vq_status)         \
                msleep(1);                                      \
} while (0)

static int
kni_sock_release(struct socket *sock)
{
        struct kni_vhost_queue *q =
                container_of(sock->sk, struct kni_vhost_queue, sk);
        struct kni_dev *kni = q->kni;

        if (kni != NULL) {
                /* stop the RX side and detach from the KNI device */
                kni->vq_status = BE_STOP;
                KNI_VHOST_WAIT_WQ_SAFE();
                kni->vhost_queue = NULL;
                q->kni = NULL;
        }

        /* detach the struct sock from the socket and drop our reference */
        sk_set_socket(&q->sk, NULL);
        sock->sk = NULL;
        sock_put(&q->sk);

        pr_debug("dummy sock release done\n");
        return 0;
}

static int
kni_sock_getname(struct socket *sock, struct sockaddr *addr,
                 int *sockaddr_len, int peer)
{
        pr_debug("dummy sock getname\n");
        ((struct sockaddr_ll *)addr)->sll_family = AF_PACKET;
        return 0;
}

static const struct proto_ops kni_socket_ops = {
        .getname = kni_sock_getname,
        .sendmsg = kni_sock_sndmsg,
        .recvmsg = kni_sock_rcvmsg,
        .release = kni_sock_release,
        .poll = kni_sock_poll,
        .ioctl = kni_sock_ioctl,
        .compat_ioctl = kni_sock_compat_ioctl,
};
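
/* sk write-space callback: wake up pollers waiting for POLLOUT once the socket is writable again. */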
static void
kni_sk_write_space(struct sock *sk)
{
        wait_queue_head_t *wqueue;

        if (!sock_writeable(sk) ||
#ifdef SOCKWQ_ASYNC_NOSPACE
            !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
#else
            !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
#endif
                return;

        wqueue = sk_sleep(sk);
        if (wqueue && waitqueue_active(wqueue))
                wake_up_interruptible_poll(wqueue,
                        POLLOUT | POLLWRNORM | POLLWRBAND);
}
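
/* sk destructor: drain the receive queue and free the sk_buff cache and its fifo. */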
static void
kni_sk_destruct(struct sock *sk)
{
        struct kni_vhost_queue *q =
                container_of(sk, struct kni_vhost_queue, sk);

        if (!q)
                return;

        /* make sure there's no packet in buffer */
        while (skb_dequeue(&sk->sk_receive_queue) != NULL)
                ;

        if (q->fifo != NULL) {
                kfree(q->fifo);
                q->fifo = NULL;
        }
        if (q->cache != NULL) {
                kfree(q->cache);
                q->cache = NULL;
        }
}
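
/*
 * Create the backend: allocate the queue and its raw socket, map a file
 * descriptor for user space, pre-fill the sk_buff cache fifo and attach
 * everything to the KNI device.
 */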
static int
kni_vhost_backend_init(struct kni_dev *kni)
{
        struct kni_vhost_queue *q;
        struct net *net = current->nsproxy->net_ns;
        int err, i, sockfd;
        struct rte_kni_fifo *fifo;
        struct sk_buff *elem;

        if (kni->vhost_queue != NULL)
                return -1;

#ifdef HAVE_SK_ALLOC_KERN_PARAM
        q = (struct kni_vhost_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
                        &kni_raw_proto, 0);
#else
        q = (struct kni_vhost_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
                        &kni_raw_proto);
#endif
        if (!q)
                return -ENOMEM;

        err = sock_create_lite(AF_UNSPEC, SOCK_RAW, IPPROTO_RAW, &q->sock);
        if (err)
                goto free_sk;

        sockfd = kni_sock_map_fd(q->sock);
        if (sockfd < 0) {
                err = sockfd;
                goto free_sock;
        }

        /* cache init: pre-allocated sk_buff shells recycled through q->fifo */
        q->cache = kzalloc(
                RTE_KNI_VHOST_MAX_CACHE_SIZE * sizeof(struct sk_buff),
                GFP_KERNEL);
        if (!q->cache) {
                err = -ENOMEM;
                goto free_fd;
        }

        fifo = kzalloc(RTE_KNI_VHOST_MAX_CACHE_SIZE * sizeof(void *)
                + sizeof(struct rte_kni_fifo), GFP_KERNEL);
        if (!fifo) {
                err = -ENOMEM;
                goto free_cache;
        }

        kni_fifo_init(fifo, RTE_KNI_VHOST_MAX_CACHE_SIZE);

        for (i = 0; i < RTE_KNI_VHOST_MAX_CACHE_SIZE; i++) {
                elem = &q->cache[i];
                kni_fifo_put(fifo, (void **)&elem, 1);
        }
        q->fifo = fifo;

        /* store sockfd in vhost_queue */
        q->sockfd = sockfd;

        /* init socket */
        q->sock->type = SOCK_RAW;
        q->sock->state = SS_CONNECTED;
        q->sock->ops = &kni_socket_ops;
        sock_init_data(q->sock, &q->sk);

        /* init sock data */
        q->sk.sk_write_space = kni_sk_write_space;
        q->sk.sk_destruct = kni_sk_destruct;
        q->flags = IFF_NO_PI | IFF_TAP;
        q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
#ifdef RTE_KNI_VHOST_VNET_HDR_EN
        q->flags |= IFF_VNET_HDR;
#endif

        /* bind kni_dev with vhost_queue */
        q->kni = kni;
        kni->vhost_queue = q;

        kni->vq_status = BE_START;

#ifdef HAVE_SOCKET_WQ
        pr_debug("backend init sockfd=%d, sock->wq=0x%16llx,sk->sk_wq=0x%16llx",
                 q->sockfd, (uint64_t)q->sock->wq,
                 (uint64_t)q->sk.sk_wq);
#else
        pr_debug("backend init sockfd=%d, sock->wait at 0x%16llx,sk->sk_sleep=0x%16llx",
                 q->sockfd, (uint64_t)&q->sock->wait,
                 (uint64_t)q->sk.sk_sleep);
#endif

        return 0;

free_cache:
        kfree(q->cache);
        q->cache = NULL;
free_fd:
        put_unused_fd(sockfd);
free_sock:
        q->kni = NULL;
        kni->vhost_queue = NULL;
        kni->vq_status |= BE_FINISH;
        sock_release(q->sock);
        q->sock = NULL;
free_sk:
        sk_free((struct sock *)q);

        return err;
}

/* kni vhost sock sysfs */
static ssize_t
show_sock_fd(struct device *dev, struct device_attribute *attr,
             char *buf)
{
        struct net_device *net_dev = container_of(dev, struct net_device, dev);
        struct kni_dev *kni = netdev_priv(net_dev);
        int sockfd = -1;

        if (kni->vhost_queue != NULL)
                sockfd = kni->vhost_queue->sockfd;
        return snprintf(buf, 10, "%d\n", sockfd);
}

static ssize_t
show_sock_en(struct device *dev, struct device_attribute *attr,
             char *buf)
{
        struct net_device *net_dev = container_of(dev, struct net_device, dev);
        struct kni_dev *kni = netdev_priv(net_dev);

        return snprintf(buf, 10, "%u\n", (kni->vhost_queue == NULL ? 0 : 1));
}

static ssize_t
set_sock_en(struct device *dev, struct device_attribute *attr,
            const char *buf, size_t count)
{
        struct net_device *net_dev = container_of(dev, struct net_device, dev);
        struct kni_dev *kni = netdev_priv(net_dev);
        unsigned long en;
        int err = 0;

        if (kstrtoul(buf, 0, &en) != 0)
                return -EINVAL;

        if (en)
                err = kni_vhost_backend_init(kni);

        return err ? err : count;
}
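
/* sysfs attributes per KNI netdev: writing 1 to sock_en creates the backend, sock_fd reports its fd. */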
static DEVICE_ATTR(sock_fd, S_IRUGO | S_IRUSR, show_sock_fd, NULL);
static DEVICE_ATTR(sock_en, S_IRUGO | S_IWUSR, show_sock_en, set_sock_en);

static struct attribute *dev_attrs[] = {
        &dev_attr_sock_fd.attr,
        &dev_attr_sock_en.attr,
        NULL,
};

static const struct attribute_group dev_attr_grp = {
        .attrs = dev_attrs,
};
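
/* Detach the vhost queue from the KNI device; the socket itself is torn down in kni_sock_release(). */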
void
kni_vhost_backend_release(struct kni_dev *kni)
{
        struct kni_vhost_queue *q = kni->vhost_queue;

        if (q == NULL)
                return;

        /* detach from kni */
        q->kni = NULL;

        pr_debug("release backend done\n");
}
int
kni_vhost_init(struct kni_dev *kni)
{
        struct net_device *dev = kni->net_dev;

        if (sysfs_create_group(&dev->dev.kobj, &dev_attr_grp))
                sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp);

        kni->vq_status = BE_STOP;

        pr_debug("kni_vhost_init done\n");

        return 0;
}