lib/librte_vhost/vhost_user.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <stdbool.h>
#include <assert.h>
#ifdef RTE_LIBRTE_VHOST_NUMA
#include <numaif.h>
#endif

#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_log.h>

#include "vhost.h"
#include "vhost_user.h"

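/*
 * Human-readable names for the vhost-user requests, indexed by request
 * type; used only for logging in vhost_user_msg_handler().
 */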
static const char *vhost_message_str[VHOST_USER_MAX] = {
        [VHOST_USER_NONE] = "VHOST_USER_NONE",
        [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
        [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
        [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
        [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
        [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
        [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
        [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
        [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
        [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
        [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
        [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
        [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
        [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
        [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR",
        [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
        [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
        [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
        [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE",
        [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP",
};

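/*
 * Return the block size of the file behind fd, or (uint64_t)-1 if
 * fstat() fails.  For a hugetlbfs-backed fd this is the hugepage size,
 * which is used below as the mmap() length alignment.
 */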
static uint64_t
get_blk_size(int fd)
{
        struct stat stat;
        int ret;

        ret = fstat(fd, &stat);
        return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}

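/*
 * Unmap and close every guest memory region currently held by the
 * device.
 */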
static void
free_mem_region(struct virtio_net *dev)
{
        uint32_t i;
        struct virtio_memory_region *reg;

        if (!dev || !dev->mem)
                return;

        for (i = 0; i < dev->mem->nregions; i++) {
                reg = &dev->mem->regions[i];
                if (reg->host_user_addr) {
                        munmap(reg->mmap_addr, reg->mmap_size);
                        close(reg->fd);
                }
        }
}

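/*
 * Release everything set up through vhost-user messages: the guest
 * memory regions, the guest page table built for zero copy, and the
 * dirty-logging area.
 */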
void
vhost_backend_cleanup(struct virtio_net *dev)
{
        if (dev->mem) {
                free_mem_region(dev);
                rte_free(dev->mem);
                dev->mem = NULL;
        }

        free(dev->guest_pages);
        dev->guest_pages = NULL;

        if (dev->log_addr) {
                munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
                dev->log_addr = 0;
        }
}

/*
 * Taking ownership currently requires no action, so just report
 * success.
 */
static int
vhost_user_set_owner(void)
{
        return 0;
}

static int
vhost_user_reset_owner(struct virtio_net *dev)
{
        if (dev->flags & VIRTIO_DEV_RUNNING) {
                dev->flags &= ~VIRTIO_DEV_RUNNING;
                notify_ops->destroy_device(dev->vid);
        }

        cleanup_device(dev, 0);
        reset_device(dev);
        return 0;
}

/*
 * The frontend asks for the set of features we support.
 */
static uint64_t
vhost_user_get_features(void)
{
        return VHOST_FEATURES;
}

/*
 * We receive the features negotiated between us and the virtio device,
 * i.e. the subset of our features that both sides support.
 */
static int
vhost_user_set_features(struct virtio_net *dev, uint64_t features)
{
        if (features & ~VHOST_FEATURES)
                return -1;

        dev->features = features;
        if (dev->features &
                ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
                dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
        } else {
                dev->vhost_hlen = sizeof(struct virtio_net_hdr);
        }
        LOG_DEBUG(VHOST_CONFIG,
                "(%d) mergeable RX buffers %s, virtio 1 %s\n",
                dev->vid,
                (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
                (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");

        return 0;
}

/*
 * The virtio device sends us the size of the descriptor ring.
 */
static int
vhost_user_set_vring_num(struct virtio_net *dev,
                         struct vhost_vring_state *state)
{
        struct vhost_virtqueue *vq = dev->virtqueue[state->index];

        vq->size = state->num;

        if (dev->dequeue_zero_copy) {
                vq->nr_zmbuf = 0;
                vq->last_zmbuf_idx = 0;
                vq->zmbuf_size = vq->size;
                vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size *
                                         sizeof(struct zcopy_mbuf), 0);
                if (vq->zmbufs == NULL) {
                        RTE_LOG(WARNING, VHOST_CONFIG,
                                "failed to allocate memory for zero copy; "
                                "zero copy is forcibly disabled\n");
                        dev->dequeue_zero_copy = 0;
                }
        }

        vq->shadow_used_ring = rte_malloc(NULL,
                                vq->size * sizeof(struct vring_used_elem),
                                RTE_CACHE_LINE_SIZE);
        if (!vq->shadow_used_ring) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "failed to allocate memory for shadow used ring.\n");
                return -1;
        }

        return 0;
}

/*
 * Reallocate the virtio_net and vhost_virtqueue structures so that they
 * live on the same NUMA node as the memory backing the vring
 * descriptors.
 */
#ifdef RTE_LIBRTE_VHOST_NUMA
static struct virtio_net*
numa_realloc(struct virtio_net *dev, int index)
{
        int oldnode, newnode;
        struct virtio_net *old_dev;
        struct vhost_virtqueue *old_vq, *vq;
        int ret;

        /*
         * Virtqueues are allocated in pairs, so only attempt the
         * reallocation on the first queue of a pair.
         */
        if (index % VIRTIO_QNUM != 0)
                return dev;

        old_dev = dev;
        vq = old_vq = dev->virtqueue[index];

        ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
                            MPOL_F_NODE | MPOL_F_ADDR);

        /* check if we need to reallocate vq */
        ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
                             MPOL_F_NODE | MPOL_F_ADDR);
        if (ret) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "Unable to get vq numa information.\n");
                return dev;
        }
        if (oldnode != newnode) {
                RTE_LOG(INFO, VHOST_CONFIG,
                        "reallocate vq from node %d to node %d\n",
                        oldnode, newnode);
                vq = rte_malloc_socket(NULL, sizeof(*vq) * VIRTIO_QNUM, 0,
                                       newnode);
                if (!vq)
                        return dev;

                memcpy(vq, old_vq, sizeof(*vq) * VIRTIO_QNUM);
                rte_free(old_vq);
        }

        /* check if we need to reallocate dev */
        ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
                            MPOL_F_NODE | MPOL_F_ADDR);
        if (ret) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "Unable to get dev numa information.\n");
                goto out;
        }
        if (oldnode != newnode) {
                RTE_LOG(INFO, VHOST_CONFIG,
                        "reallocate dev from node %d to node %d\n",
                        oldnode, newnode);
                dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
                if (!dev) {
                        dev = old_dev;
                        goto out;
                }

                memcpy(dev, old_dev, sizeof(*dev));
                rte_free(old_dev);
        }

out:
        dev->virtqueue[index] = vq;
        dev->virtqueue[index + 1] = vq + 1;
        vhost_devices[dev->vid] = dev;

        return dev;
}
#else
static struct virtio_net*
numa_realloc(struct virtio_net *dev, int index __rte_unused)
{
        return dev;
}
#endif

/*
 * Converts a QEMU virtual address to a Vhost virtual address.  This
 * function is used to convert the ring addresses to our address space.
 */
static uint64_t
qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len)
{
        struct virtio_memory_region *r;
        uint32_t i;

        /* Find the region where the address lives. */
        for (i = 0; i < dev->mem->nregions; i++) {
                r = &dev->mem->regions[i];

                if (qva >= r->guest_user_addr &&
                    qva <  r->guest_user_addr + r->size) {

                        if (unlikely(*len > r->guest_user_addr + r->size - qva))
                                *len = r->guest_user_addr + r->size - qva;

                        return qva - r->guest_user_addr +
                               r->host_user_addr;
                }
        }
        *len = 0;

        return 0;
}

/*
 * The virtio device sends us the desc, used and avail ring addresses.
 * This function then converts these to our address space.
 */
static int
vhost_user_set_vring_addr(struct virtio_net **pdev,
                          struct vhost_vring_addr *addr)
{
        struct vhost_virtqueue *vq;
        struct virtio_net *dev = *pdev;
        uint64_t size, req_size;

        if (dev->mem == NULL)
                return -1;

        /* addr->index is the queue index: rxq is 0, txq is 1. */
        vq = dev->virtqueue[addr->index];

        /* The addresses are converted from QEMU virtual to Vhost virtual. */
        req_size = sizeof(struct vring_desc) * vq->size;
        size = req_size;
        vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev,
                        addr->desc_user_addr, &size);
        if (vq->desc == 0 || size != req_size) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "(%d) failed to map desc ring address.\n",
                        dev->vid);
                return -1;
        }

        dev = numa_realloc(dev, addr->index);
        *pdev = dev;

        vq = dev->virtqueue[addr->index];

        req_size = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size;
        size = req_size;
        vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev,
                        addr->avail_user_addr, &size);
        if (vq->avail == 0 || size != req_size) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "(%d) failed to map avail ring address.\n",
                        dev->vid);
                return -1;
        }

        req_size = sizeof(struct vring_used);
        req_size += sizeof(struct vring_used_elem) * vq->size;
        size = req_size;
        vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev,
                        addr->used_user_addr, &size);
        if (vq->used == 0 || size != req_size) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "(%d) failed to map used ring address.\n",
                        dev->vid);
                return -1;
        }

        if (vq->last_used_idx != vq->used->idx) {
                RTE_LOG(WARNING, VHOST_CONFIG,
                        "last_used_idx (%u) and vq->used->idx (%u) mismatch; "
                        "some packets may be resent for Tx and dropped for Rx\n",
                        vq->last_used_idx, vq->used->idx);
                vq->last_used_idx  = vq->used->idx;
                vq->last_avail_idx = vq->used->idx;
        }

        vq->log_guest_addr = addr->log_guest_addr;

        LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
                        dev->vid, vq->desc);
        LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
                        dev->vid, vq->avail);
        LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
                        dev->vid, vq->used);
        LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
                        dev->vid, vq->log_guest_addr);

        return 0;
}

/*
 * The virtio device sends us the index from which the vring should be
 * resumed (its last used index).
 */
static int
vhost_user_set_vring_base(struct virtio_net *dev,
                          struct vhost_vring_state *state)
{
        dev->virtqueue[state->index]->last_used_idx  = state->num;
        dev->virtqueue[state->index]->last_avail_idx = state->num;

        return 0;
}

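/*
 * Record one guest-physical to host-physical page mapping, merging it
 * with the previous entry when the two host ranges are contiguous.
 * The table doubles in size whenever it runs out of room.
 */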
static void
add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
                   uint64_t host_phys_addr, uint64_t size)
{
        struct guest_page *page, *last_page;

        if (dev->nr_guest_pages == dev->max_guest_pages) {
                dev->max_guest_pages *= 2;
                dev->guest_pages = realloc(dev->guest_pages,
                                        dev->max_guest_pages * sizeof(*page));
        }

        if (dev->nr_guest_pages > 0) {
                last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
                /* merge if the two pages are contiguous */
                if (host_phys_addr == last_page->host_phys_addr +
                                      last_page->size) {
                        last_page->size += size;
                        return;
                }
        }

        page = &dev->guest_pages[dev->nr_guest_pages++];
        page->guest_phys_addr = guest_phys_addr;
        page->host_phys_addr  = host_phys_addr;
        page->size = size;
}

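/*
 * Walk a guest memory region page by page and record the
 * guest-physical to host-physical mapping of each page.  The resulting
 * table is what allows the dequeue zero-copy path to translate guest
 * buffer addresses to physical addresses.
 */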
static void
add_guest_pages(struct virtio_net *dev, struct virtio_memory_region *reg,
                uint64_t page_size)
{
        uint64_t reg_size = reg->size;
        uint64_t host_user_addr  = reg->host_user_addr;
        uint64_t guest_phys_addr = reg->guest_phys_addr;
        uint64_t host_phys_addr;
        uint64_t size;

        host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
        size = page_size - (guest_phys_addr & (page_size - 1));
        size = RTE_MIN(size, reg_size);

        add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
        host_user_addr  += size;
        guest_phys_addr += size;
        reg_size -= size;

        while (reg_size > 0) {
                size = RTE_MIN(reg_size, page_size);
                host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)
                                                  host_user_addr);
                add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);

                host_user_addr  += size;
                guest_phys_addr += size;
                reg_size -= size;
        }
}

#ifdef RTE_LIBRTE_VHOST_DEBUG
static void
dump_guest_pages(struct virtio_net *dev)
{
        uint32_t i;
        struct guest_page *page;

        for (i = 0; i < dev->nr_guest_pages; i++) {
                page = &dev->guest_pages[i];

                RTE_LOG(INFO, VHOST_CONFIG,
                        "guest physical page region %u\n"
                        "\t guest_phys_addr: %" PRIx64 "\n"
                        "\t host_phys_addr : %" PRIx64 "\n"
                        "\t size           : %" PRIx64 "\n",
                        i,
                        page->guest_phys_addr,
                        page->host_phys_addr,
                        page->size);
        }
}
#else
#define dump_guest_pages(dev)
#endif

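/*
 * Compare the incoming memory table with the one we already have; if
 * every region matches, the SET_MEM_TABLE message can be handled
 * without tearing down and remapping the regions.
 */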
static bool
vhost_memory_changed(struct VhostUserMemory *new,
                     struct virtio_memory *old)
{
        uint32_t i;

        if (new->nregions != old->nregions)
                return true;

        for (i = 0; i < new->nregions; ++i) {
                VhostUserMemoryRegion *new_r = &new->regions[i];
                struct virtio_memory_region *old_r = &old->regions[i];

                if (new_r->guest_phys_addr != old_r->guest_phys_addr)
                        return true;
                if (new_r->memory_size != old_r->size)
                        return true;
                if (new_r->userspace_addr != old_r->guest_user_addr)
                        return true;
        }

        return false;
}

static int
vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
        struct VhostUserMemory memory = pmsg->payload.memory;
        struct virtio_memory_region *reg;
        void *mmap_addr;
        uint64_t mmap_size;
        uint64_t mmap_offset;
        uint64_t alignment;
        uint32_t i;
        int fd;

        if (dev->mem && !vhost_memory_changed(&memory, dev->mem)) {
                RTE_LOG(INFO, VHOST_CONFIG,
                        "(%d) memory regions not changed\n", dev->vid);

                for (i = 0; i < memory.nregions; i++)
                        close(pmsg->fds[i]);

                return 0;
        }

        /* Remove from the data plane. */
        if (dev->flags & VIRTIO_DEV_RUNNING) {
                dev->flags &= ~VIRTIO_DEV_RUNNING;
                notify_ops->destroy_device(dev->vid);
        }

        if (dev->mem) {
                free_mem_region(dev);
                rte_free(dev->mem);
                dev->mem = NULL;
        }

        dev->nr_guest_pages = 0;
        if (!dev->guest_pages) {
                dev->max_guest_pages = 8;
                dev->guest_pages = malloc(dev->max_guest_pages *
                                                sizeof(struct guest_page));
        }

        dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct virtio_memory) +
                sizeof(struct virtio_memory_region) * memory.nregions, 0);
        if (dev->mem == NULL) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "(%d) failed to allocate memory for dev->mem\n",
                        dev->vid);
                return -1;
        }
        dev->mem->nregions = memory.nregions;

        for (i = 0; i < memory.nregions; i++) {
                fd  = pmsg->fds[i];
                reg = &dev->mem->regions[i];

                reg->guest_phys_addr = memory.regions[i].guest_phys_addr;
                reg->guest_user_addr = memory.regions[i].userspace_addr;
                reg->size            = memory.regions[i].memory_size;
                reg->fd              = fd;

                mmap_offset = memory.regions[i].mmap_offset;
                mmap_size   = reg->size + mmap_offset;

                /*
                 * On older long-term Linux kernels (e.g. 2.6.32 and
                 * 3.2.72), mmap() without MAP_ANONYMOUS must be given a
                 * length aligned to the hugepage size, otherwise it
                 * fails with EINVAL.  Align the length here to avoid
                 * that failure.
                 */
                alignment = get_blk_size(fd);
                if (alignment == (uint64_t)-1) {
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "couldn't get hugepage size through fstat\n");
                        goto err_mmap;
                }
                mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);

                mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
                                 MAP_SHARED | MAP_POPULATE, fd, 0);

                if (mmap_addr == MAP_FAILED) {
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "mmap region %u failed.\n", i);
                        goto err_mmap;
                }

                reg->mmap_addr = mmap_addr;
                reg->mmap_size = mmap_size;
                reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
                                      mmap_offset;

                if (dev->dequeue_zero_copy)
                        add_guest_pages(dev, reg, alignment);

                RTE_LOG(INFO, VHOST_CONFIG,
                        "guest memory region %u, size: 0x%" PRIx64 "\n"
                        "\t guest physical addr: 0x%" PRIx64 "\n"
                        "\t guest virtual  addr: 0x%" PRIx64 "\n"
                        "\t host  virtual  addr: 0x%" PRIx64 "\n"
                        "\t mmap addr : 0x%" PRIx64 "\n"
                        "\t mmap size : 0x%" PRIx64 "\n"
                        "\t mmap align: 0x%" PRIx64 "\n"
                        "\t mmap off  : 0x%" PRIx64 "\n",
                        i, reg->size,
                        reg->guest_phys_addr,
                        reg->guest_user_addr,
                        reg->host_user_addr,
                        (uint64_t)(uintptr_t)mmap_addr,
                        mmap_size,
                        alignment,
                        mmap_offset);
        }

        dump_guest_pages(dev);

        return 0;

err_mmap:
        free_mem_region(dev);
        rte_free(dev->mem);
        dev->mem = NULL;
        return -1;
}

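/*
 * A vring is ready once its descriptor ring has been mapped and both
 * its kick and call eventfds have been received.
 */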
static int
vq_is_ready(struct vhost_virtqueue *vq)
{
        return vq && vq->desc &&
               vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
               vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD;
}

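/*
 * The device as a whole is ready once both vrings of every allocated
 * queue pair are ready.
 */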
static int
virtio_is_ready(struct virtio_net *dev)
{
        struct vhost_virtqueue *rvq, *tvq;
        uint32_t i;

        for (i = 0; i < dev->virt_qp_nb; i++) {
                rvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ];
                tvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ];

                if (!vq_is_ready(rvq) || !vq_is_ready(tvq)) {
                        RTE_LOG(INFO, VHOST_CONFIG,
                                "virtio is not ready for processing.\n");
                        return 0;
                }
        }

        RTE_LOG(INFO, VHOST_CONFIG,
                "virtio is now ready for processing.\n");
        return 1;
}

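/*
 * Register the eventfd through which the guest is notified (the "call"
 * fd) for the given vring.
 */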
static void
vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
        struct vhost_vring_file file;
        struct vhost_virtqueue *vq;
        uint32_t cur_qp_idx;

        file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
        if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
                file.fd = VIRTIO_INVALID_EVENTFD;
        else
                file.fd = pmsg->fds[0];
        RTE_LOG(INFO, VHOST_CONFIG,
                "vring call idx:%d file:%d\n", file.index, file.fd);

        /*
         * FIXME: VHOST_SET_VRING_CALL is the first per-vring message
         * we get, so we do vring queue pair allocation here.
         */
        cur_qp_idx = file.index / VIRTIO_QNUM;
        if (cur_qp_idx + 1 > dev->virt_qp_nb) {
                if (alloc_vring_queue_pair(dev, cur_qp_idx) < 0)
                        return;
        }

        vq = dev->virtqueue[file.index];
        assert(vq != NULL);

        if (vq->callfd >= 0)
                close(vq->callfd);

        vq->callfd = file.fd;
}

/*
 * When we receive a kick message, test whether the virtio device has
 * become ready for packet processing.
 */
static void
vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
        struct vhost_vring_file file;
        struct vhost_virtqueue *vq;

        file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
        if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
                file.fd = VIRTIO_INVALID_EVENTFD;
        else
                file.fd = pmsg->fds[0];
        RTE_LOG(INFO, VHOST_CONFIG,
                "vring kick idx:%d file:%d\n", file.index, file.fd);

        vq = dev->virtqueue[file.index];
        if (vq->kickfd >= 0)
                close(vq->kickfd);
        vq->kickfd = file.fd;

        if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) {
                if (dev->dequeue_zero_copy) {
                        RTE_LOG(INFO, VHOST_CONFIG,
                                "dequeue zero copy is enabled\n");
                }

                if (notify_ops->new_device(dev->vid) == 0)
                        dev->flags |= VIRTIO_DEV_RUNNING;
        }
}

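/*
 * Free every outstanding zero-copy mbuf and release the zmbuf array
 * itself.
 */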
static void
free_zmbufs(struct vhost_virtqueue *vq)
{
        struct zcopy_mbuf *zmbuf, *next;

        for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
             zmbuf != NULL; zmbuf = next) {
                next = TAILQ_NEXT(zmbuf, next);

                rte_pktmbuf_free(zmbuf->mbuf);
                TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
        }

        rte_free(vq->zmbufs);
}

/*
 * When virtio is stopped, QEMU sends us the GET_VRING_BASE message.
 */
static int
vhost_user_get_vring_base(struct virtio_net *dev,
                          struct vhost_vring_state *state)
{
        struct vhost_virtqueue *vq = dev->virtqueue[state->index];

        /* We have to stop the queue (virtio) if it is running. */
        if (dev->flags & VIRTIO_DEV_RUNNING) {
                dev->flags &= ~VIRTIO_DEV_RUNNING;
                notify_ops->destroy_device(dev->vid);
        }

        /* At this point it is safe to read the last used index. */
        state->num = vq->last_used_idx;

        RTE_LOG(INFO, VHOST_CONFIG,
                "vring base idx:%d file:%d\n", state->index, state->num);
        /*
         * In the current QEMU vhost-user implementation, this message
         * is sent from vhost_vring_stop, and only from there.
         * TODO: clean up the vring; it isn't usable from this point on.
         */
        if (vq->kickfd >= 0)
                close(vq->kickfd);

        vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;

        if (dev->dequeue_zero_copy)
                free_zmbufs(vq);
        rte_free(vq->shadow_used_ring);
        vq->shadow_used_ring = NULL;

        return 0;
}

/*
 * Once the virtio queues are ready, QEMU sends us a message to enable
 * (or disable) a virtio queue pair.
 */
static int
vhost_user_set_vring_enable(struct virtio_net *dev,
                            struct vhost_vring_state *state)
{
        int enable = (int)state->num;

        RTE_LOG(INFO, VHOST_CONFIG,
                "set queue enable: %d to qp idx: %d\n",
                enable, state->index);

        if (notify_ops->vring_state_changed)
                notify_ops->vring_state_changed(dev->vid, state->index, enable);

        dev->virtqueue[state->index]->enabled = enable;

        return 0;
}

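/*
 * Record the protocol features acked by the frontend; if any bit
 * outside the set we advertise is requested, the whole message is
 * silently dropped.
 */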
static void
vhost_user_set_protocol_features(struct virtio_net *dev,
                                 uint64_t protocol_features)
{
        if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
                return;

        dev->protocol_features = protocol_features;
}

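/*
 * Map the dirty-page log area used during live migration.  The
 * frontend passes an fd plus the size and offset of the log within the
 * file it refers to.
 */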
static int
vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg)
{
        int fd = msg->fds[0];
        uint64_t size, off;
        void *addr;

        if (fd < 0) {
                RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
                return -1;
        }

        if (msg->size != sizeof(VhostUserLog)) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "invalid log base msg size: %"PRId32" != %d\n",
                        msg->size, (int)sizeof(VhostUserLog));
                return -1;
        }

        size = msg->payload.log.mmap_size;
        off  = msg->payload.log.mmap_offset;
        RTE_LOG(INFO, VHOST_CONFIG,
                "log mmap size: %"PRId64", offset: %"PRId64"\n",
                size, off);

        /*
         * mmap from offset 0 to work around a hugepage mmap bug: mmap
         * fails when the offset is not page-size aligned.
         */
        addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        close(fd);
        if (addr == MAP_FAILED) {
                RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
                return -1;
        }

        /*
         * Free any previously mapped log memory; VHOST_USER_SET_LOG_BASE
         * may occasionally be sent more than once.
         */
        if (dev->log_addr) {
                munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
        }
        dev->log_addr = (uint64_t)(uintptr_t)addr;
        dev->log_base = dev->log_addr + off;
        dev->log_size = size;

        return 0;
}

/*
 * An RARP packet is constructed and broadcast to notify switches of the
 * new location of the migrated VM, so that packets sent from outside
 * will not be lost after migration.
 *
 * However, we don't actually "send" an RARP packet here; instead, we
 * set the 'broadcast_rarp' flag to let rte_vhost_dequeue_burst() inject
 * it.
 */
static int
vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg)
{
        uint8_t *mac = (uint8_t *)&msg->payload.u64;

        RTE_LOG(DEBUG, VHOST_CONFIG,
                ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
                mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
        memcpy(dev->mac.addr_bytes, mac, 6);

        /*
         * Set the flag to inject a RARP broadcast packet at
         * rte_vhost_dequeue_burst().
         *
         * rte_smp_wmb() makes sure the mac is copied before the flag
         * is set.
         */
        rte_smp_wmb();
        rte_atomic16_set(&dev->broadcast_rarp, 1);

        return 0;
}

/*
 * Return the number of bytes read on success, or a negative value on
 * failure.
 */
static int
read_vhost_message(int sockfd, struct VhostUserMsg *msg)
{
        int ret;

        ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
                msg->fds, VHOST_MEMORY_MAX_NREGIONS);
        if (ret <= 0)
                return ret;

        if (msg && msg->size) {
                if (msg->size > sizeof(msg->payload)) {
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "invalid msg size: %d\n", msg->size);
                        return -1;
                }
                ret = read(sockfd, &msg->payload, msg->size);
                if (ret <= 0)
                        return ret;
                if (ret != (int)msg->size) {
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "read control message failed\n");
                        return -1;
                }
        }

        return ret;
}

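/*
 * Stamp the header with our protocol version and the reply flag, then
 * send the message (header plus payload) back on the socket.
 */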
static int
send_vhost_message(int sockfd, struct VhostUserMsg *msg)
{
        int ret;

        if (!msg)
                return 0;

        msg->flags &= ~VHOST_USER_VERSION_MASK;
        msg->flags |= VHOST_USER_VERSION;
        msg->flags |= VHOST_USER_REPLY_MASK;

        ret = send_fd_message(sockfd, (char *)msg,
                VHOST_USER_HDR_SIZE + msg->size, NULL, 0);

        return ret;
}

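/*
 * Take the access_lock of every allocated virtqueue so that the data
 * path cannot run while a configuration message is being handled;
 * vhost_user_unlock_all_queue_pairs() releases them again.
 */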
static void
vhost_user_lock_all_queue_pairs(struct virtio_net *dev)
{
        unsigned int i = 0;
        unsigned int vq_num = 0;

        while (vq_num < dev->virt_qp_nb * 2) {
                struct vhost_virtqueue *vq = dev->virtqueue[i];

                if (vq) {
                        rte_spinlock_lock(&vq->access_lock);
                        vq_num++;
                }
                i++;
        }
}

static void
vhost_user_unlock_all_queue_pairs(struct virtio_net *dev)
{
        unsigned int i = 0;
        unsigned int vq_num = 0;

        while (vq_num < dev->virt_qp_nb * 2) {
                struct vhost_virtqueue *vq = dev->virtqueue[i];

                if (vq) {
                        rte_spinlock_unlock(&vq->access_lock);
                        vq_num++;
                }
                i++;
        }
}

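/*
 * Read one vhost-user message from the connection fd and dispatch it
 * to the matching handler above.  Returns 0 on success and -1 when
 * reading or validating the message fails.
 */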
int
vhost_user_msg_handler(int vid, int fd)
{
        struct virtio_net *dev;
        struct VhostUserMsg msg;
        int ret;
        int unlock_required = 0;

        dev = get_device(vid);
        if (dev == NULL)
                return -1;

        ret = read_vhost_message(fd, &msg);
        if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
                if (ret < 0)
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "vhost read message failed\n");
                else if (ret == 0)
                        RTE_LOG(INFO, VHOST_CONFIG,
                                "vhost peer closed\n");
                else
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "vhost read incorrect message\n");

                return -1;
        }

        RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
                vhost_message_str[msg.request]);

        /*
         * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE
         * and VHOST_USER_RESET_OWNER, since those are sent when virtio
         * stops and the device is destroyed.  destroy_device waits for
         * the queues to become inactive, so it is safe; taking the
         * access_lock there would instead cause a deadlock.
         */
        switch (msg.request) {
        case VHOST_USER_SET_FEATURES:
        case VHOST_USER_SET_PROTOCOL_FEATURES:
        case VHOST_USER_SET_OWNER:
        case VHOST_USER_SET_MEM_TABLE:
        case VHOST_USER_SET_LOG_BASE:
        case VHOST_USER_SET_LOG_FD:
        case VHOST_USER_SET_VRING_NUM:
        case VHOST_USER_SET_VRING_ADDR:
        case VHOST_USER_SET_VRING_BASE:
        case VHOST_USER_SET_VRING_KICK:
        case VHOST_USER_SET_VRING_CALL:
        case VHOST_USER_SET_VRING_ERR:
        case VHOST_USER_SET_VRING_ENABLE:
        case VHOST_USER_SEND_RARP:
                vhost_user_lock_all_queue_pairs(dev);
                unlock_required = 1;
                break;
        default:
                break;
        }

        switch (msg.request) {
        case VHOST_USER_GET_FEATURES:
                msg.payload.u64 = vhost_user_get_features();
                msg.size = sizeof(msg.payload.u64);
                send_vhost_message(fd, &msg);
                break;
        case VHOST_USER_SET_FEATURES:
                vhost_user_set_features(dev, msg.payload.u64);
                break;

        case VHOST_USER_GET_PROTOCOL_FEATURES:
                msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES;
                msg.size = sizeof(msg.payload.u64);
                send_vhost_message(fd, &msg);
                break;
        case VHOST_USER_SET_PROTOCOL_FEATURES:
                vhost_user_set_protocol_features(dev, msg.payload.u64);
                break;

        case VHOST_USER_SET_OWNER:
                vhost_user_set_owner();
                break;
        case VHOST_USER_RESET_OWNER:
                vhost_user_reset_owner(dev);
                break;

        case VHOST_USER_SET_MEM_TABLE:
                vhost_user_set_mem_table(dev, &msg);
                break;

        case VHOST_USER_SET_LOG_BASE:
                vhost_user_set_log_base(dev, &msg);

                /* it needs a reply */
                msg.size = sizeof(msg.payload.u64);
                send_vhost_message(fd, &msg);
                break;
        case VHOST_USER_SET_LOG_FD:
                close(msg.fds[0]);
                RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
                break;

        case VHOST_USER_SET_VRING_NUM:
                vhost_user_set_vring_num(dev, &msg.payload.state);
                break;
        case VHOST_USER_SET_VRING_ADDR:
                vhost_user_set_vring_addr(&dev, &msg.payload.addr);
                break;
        case VHOST_USER_SET_VRING_BASE:
                vhost_user_set_vring_base(dev, &msg.payload.state);
                break;

        case VHOST_USER_GET_VRING_BASE:
                ret = vhost_user_get_vring_base(dev, &msg.payload.state);
                msg.size = sizeof(msg.payload.state);
                send_vhost_message(fd, &msg);
                break;

        case VHOST_USER_SET_VRING_KICK:
                vhost_user_set_vring_kick(dev, &msg);
                break;
        case VHOST_USER_SET_VRING_CALL:
                vhost_user_set_vring_call(dev, &msg);
                break;

        case VHOST_USER_SET_VRING_ERR:
                if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
                        close(msg.fds[0]);
                RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
                break;

        case VHOST_USER_GET_QUEUE_NUM:
                msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
                msg.size = sizeof(msg.payload.u64);
                send_vhost_message(fd, &msg);
                break;

        case VHOST_USER_SET_VRING_ENABLE:
                vhost_user_set_vring_enable(dev, &msg.payload.state);
                break;
        case VHOST_USER_SEND_RARP:
                vhost_user_send_rarp(dev, &msg);
                break;

        default:
                break;
        }

        if (unlock_required)
                vhost_user_unlock_all_queue_pairs(dev);

        return 0;
}