New upstream version 16.11.9
[deb_dpdk.git] / lib / librte_vhost / virtio_net.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <stdint.h>
35 #include <stdbool.h>
36 #include <linux/virtio_net.h>
37
38 #include <rte_mbuf.h>
39 #include <rte_memcpy.h>
40 #include <rte_ether.h>
41 #include <rte_ip.h>
42 #include <rte_virtio_net.h>
43 #include <rte_tcp.h>
44 #include <rte_udp.h>
45 #include <rte_sctp.h>
46 #include <rte_arp.h>
47 #include <rte_spinlock.h>
48 #include <rte_malloc.h>
49
50 #include "vhost.h"
51
52 #define MAX_PKT_BURST 32
53 #define VHOST_LOG_PAGE  4096
54
55 /*
56  * Atomically set a bit in memory.
57  */
58 static inline void __attribute__((always_inline))
59 vhost_set_bit(unsigned int nr, volatile uint8_t *addr)
60 {
61         __sync_fetch_and_or_1(addr, (1U << nr));
62 }
63
64 static inline void __attribute__((always_inline))
65 vhost_log_page(uint8_t *log_base, uint64_t page)
66 {
67         vhost_set_bit(page % 8, &log_base[page / 8]);
68 }
69
70 static inline void __attribute__((always_inline))
71 vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
72 {
73         uint64_t page;
74
75         if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
76                    !dev->log_base || !len))
77                 return;
78
79         if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
80                 return;
81
82         /* To make sure guest memory updates are committed before logging */
83         rte_smp_wmb();
84
85         page = addr / VHOST_LOG_PAGE;
86         while (page * VHOST_LOG_PAGE < addr + len) {
87                 vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
88                 page += 1;
89         }
90 }
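
/*
 * Worked example of the mapping above: with VHOST_LOG_PAGE == 4096, logging
 * a write of length 0x2000 at guest physical address 0x2010 covers pages
 * 2, 3 and 4, so bits 2..4 of log_base[0] are set:
 *
 *	vhost_log_write(dev, 0x2010, 0x2000);
 */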
91
92 static inline void __attribute__((always_inline))
93 vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq)
94 {
95         unsigned long *log_base;
96         int i;
97
98         if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
99                                 !dev->log_base))
100                 return;
101
102         log_base = (unsigned long *)(uintptr_t)dev->log_base;
103
104         /*
105          * The caller is expected to have issued a write memory barrier
106          * before calling this function.
107          */
108
109         for (i = 0; i < vq->log_cache_nb_elem; i++) {
110                 struct log_cache_entry *elem = vq->log_cache + i;
111
112                 __sync_fetch_and_or(log_base + elem->offset, elem->val);
113         }
114
115         rte_smp_wmb();
116
117         vq->log_cache_nb_elem = 0;
118 }
119
120 static inline void __attribute__((always_inline))
121 vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq,
122                 uint64_t page)
123 {
124         uint32_t bit_nr = page % (sizeof(unsigned long) << 3);
125         uint32_t offset = page / (sizeof(unsigned long) << 3);
126         int i;
127
128         for (i = 0; i < vq->log_cache_nb_elem; i++) {
129                 struct log_cache_entry *elem = vq->log_cache + i;
130
131                 if (elem->offset == offset) {
132                         elem->val |= (1UL << bit_nr);
133                         return;
134                 }
135         }
136
137         if (unlikely(i >= VHOST_LOG_CACHE_NR)) {
138                 /*
139                  * No more room for a new log cache entry,
140                  * so write the dirty log map directly.
141                  */
142                 rte_smp_wmb();
143                 vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
144
145                 return;
146         }
147
148         vq->log_cache[i].offset = offset;
149         vq->log_cache[i].val = (1UL << bit_nr);
150         vq->log_cache_nb_elem++;
151 }
152
153 static inline void __attribute__((always_inline))
154 vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
155                 uint64_t addr, uint64_t len)
156 {
157         uint64_t page;
158
159         if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
160                                 !dev->log_base || !len))
161                 return;
162
163         if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
164                 return;
165
166         page = addr / VHOST_LOG_PAGE;
167         while (page * VHOST_LOG_PAGE < addr + len) {
168                 vhost_log_cache_page(dev, vq, page);
169                 page += 1;
170         }
171 }
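
/*
 * Typical call pattern for the log cache (a sketch of how the enqueue paths
 * below use it): dirty pages are accumulated per virtqueue while a burst is
 * processed, then flushed to the shared log in one pass of atomic ORs:
 *
 *	vhost_log_cache_write(dev, vq, guest_addr, len);   (once per copy)
 *	...
 *	rte_smp_wmb();                  (order data writes before log update)
 *	vhost_log_cache_sync(dev, vq);  (write cached bits to dev->log_base)
 */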
172
173 static inline void __attribute__((always_inline))
174 vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
175                 uint64_t offset, uint64_t len)
176 {
177         vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len);
178 }
179
180 static inline void __attribute__((always_inline))
181 vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
182                      uint64_t offset, uint64_t len)
183 {
184         vhost_log_write(dev, vq->log_guest_addr + offset, len);
185 }
186
187 static bool
188 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
189 {
190         return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
191 }
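
/*
 * Example: with one queue pair (qp_nb == 1), the enqueue path (is_tx == 0)
 * accepts only the even index 0 and the dequeue path (is_tx == 1) only the
 * odd index 1:
 *
 *	is_valid_virt_queue_idx(0, 0, 1)   -> true  (RX ring)
 *	is_valid_virt_queue_idx(1, 1, 1)   -> true  (TX ring)
 *	is_valid_virt_queue_idx(2, 0, 1)   -> false (out of range)
 */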
192
193 static inline struct vring_desc *__attribute__((always_inline))
194 alloc_copy_ind_table(struct virtio_net *dev, struct vring_desc *desc)
195 {
196         struct vring_desc *idesc;
197         uint64_t src, dst;
198         uint64_t len, remain = desc->len;
199         uint64_t desc_addr = desc->addr;
200
201         idesc = rte_malloc(__func__, desc->len, 0);
202         if (unlikely(!idesc))
203                 return 0;
204
205         dst = (uint64_t)(uintptr_t)idesc;
206
207         while (remain) {
208                 len = remain;
209                 src = gpa_to_vva(dev, desc_addr, &len);
210                 if (unlikely(!src || !len)) {
211                         rte_free(idesc);
212                         return 0;
213                 }
214
215                 rte_memcpy((void *)(uintptr_t)dst, (void *)(uintptr_t)src, len);
216
217                 remain -= len;
218                 dst += len;
219                 desc_addr += len;
220         }
221
222         return idesc;
223 }
224
225 static inline void __attribute__((always_inline))
226 free_ind_table(struct vring_desc *idesc)
227 {
228         rte_free(idesc);
229 }
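
/*
 * Usage sketch, mirroring the callers below: when gpa_to_vva() cannot map
 * the whole indirect table contiguously (the returned length is smaller
 * than desc->len), take a private copy of the table and release it once
 * the descriptor chain has been walked:
 *
 *	dlen = vq->desc[idx].len;
 *	descs = (struct vring_desc *)(uintptr_t)
 *			gpa_to_vva(dev, vq->desc[idx].addr, &dlen);
 *	if (unlikely(dlen < vq->desc[idx].len))
 *		descs = idesc = alloc_copy_ind_table(dev, &vq->desc[idx]);
 *	...
 *	if (unlikely(!!idesc))
 *		free_ind_table(idesc);
 */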
230
231 static inline void __attribute__((always_inline))
232 do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
233                           uint16_t to, uint16_t from, uint16_t size)
234 {
235         rte_memcpy(&vq->used->ring[to],
236                         &vq->shadow_used_ring[from],
237                         size * sizeof(struct vring_used_elem));
238         vhost_log_cache_used_vring(dev, vq,
239                         offsetof(struct vring_used, ring[to]),
240                         size * sizeof(struct vring_used_elem));
241 }
242
243 static inline void __attribute__((always_inline))
244 flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
245 {
246         uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
247
248         if (used_idx + vq->shadow_used_idx <= vq->size) {
249                 do_flush_shadow_used_ring(dev, vq, used_idx, 0,
250                                           vq->shadow_used_idx);
251         } else {
252                 uint16_t size;
253
254                 /* update used ring interval [used_idx, vq->size) */
255                 size = vq->size - used_idx;
256                 do_flush_shadow_used_ring(dev, vq, used_idx, 0, size);
257
258                 /* update the remaining used ring interval [0, shadow_used_idx - size) */
259                 do_flush_shadow_used_ring(dev, vq, 0, size,
260                                           vq->shadow_used_idx - size);
261         }
262         vq->last_used_idx += vq->shadow_used_idx;
263
264         rte_smp_wmb();
265
266         vhost_log_cache_sync(dev, vq);
267
268         *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
269         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
270                 sizeof(vq->used->idx));
271 }
272
273 static inline void __attribute__((always_inline))
274 update_shadow_used_ring(struct vhost_virtqueue *vq,
275                          uint16_t desc_idx, uint32_t len)
276 {
277         uint16_t i = vq->shadow_used_idx++;
278
279         vq->shadow_used_ring[i].id  = desc_idx;
280         vq->shadow_used_ring[i].len = len;
281 }
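
/*
 * The shadow used ring batches used-ring updates: each buffer consumed by
 * the mergeable enqueue path is recorded with update_shadow_used_ring(),
 * and a single flush copies the whole batch to the shared ring and bumps
 * used->idx (see virtio_dev_merge_rx below):
 *
 *	vq->shadow_used_idx = 0;
 *	update_shadow_used_ring(vq, head_idx, len);   (once per buffer)
 *	...
 *	if (likely(vq->shadow_used_idx))
 *		flush_shadow_used_ring(dev, vq);
 */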
282
283 static void
284 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
285 {
286         uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
287
288         if (m_buf->ol_flags & PKT_TX_TCP_SEG)
289                 csum_l4 |= PKT_TX_TCP_CKSUM;
290
291         if (csum_l4) {
292                 net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
293                 net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
294
295                 switch (csum_l4) {
296                 case PKT_TX_TCP_CKSUM:
297                         net_hdr->csum_offset = (offsetof(struct tcp_hdr,
298                                                 cksum));
299                         break;
300                 case PKT_TX_UDP_CKSUM:
301                         net_hdr->csum_offset = (offsetof(struct udp_hdr,
302                                                 dgram_cksum));
303                         break;
304                 case PKT_TX_SCTP_CKSUM:
305                         net_hdr->csum_offset = (offsetof(struct sctp_hdr,
306                                                 cksum));
307                         break;
308                 }
309         }
310
311         /* IP cksum verification cannot be bypassed, so calculate it here */
312         if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
313                 struct ipv4_hdr *ipv4_hdr;
314
315                 ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct ipv4_hdr *,
316                                                    m_buf->l2_len);
317                 ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
318         }
319
320         if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
321                 if (m_buf->ol_flags & PKT_TX_IPV4)
322                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
323                 else
324                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
325                 net_hdr->gso_size = m_buf->tso_segsz;
326                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
327                                         + m_buf->l4_len;
328         }
329 }
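
/*
 * Example of the translation above: an mbuf carrying
 * PKT_TX_IPV4 | PKT_TX_TCP_SEG with l2_len 14, l3_len 20 and l4_len 20
 * yields flags = VIRTIO_NET_HDR_F_NEEDS_CSUM, csum_start = 34,
 * csum_offset = offsetof(struct tcp_hdr, cksum),
 * gso_type = VIRTIO_NET_HDR_GSO_TCPV4, gso_size = tso_segsz and
 * hdr_len = 54.
 */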
330
331 static inline void
332 copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
333                     struct virtio_net_hdr_mrg_rxbuf hdr)
334 {
335         if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
336                 *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
337         else
338                 *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
339 }
340
341 static inline int __attribute__((always_inline))
342 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
343                 struct vring_desc *descs, struct rte_mbuf *m,
344                 uint16_t desc_idx, uint32_t size)
345 {
346         uint32_t desc_avail, desc_offset;
347         uint32_t mbuf_avail, mbuf_offset;
348         uint32_t cpy_len;
349         uint64_t desc_chunck_len;
350         struct vring_desc *desc;
351         uint64_t desc_addr, desc_gaddr;
352         struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
353         /* A counter to avoid a dead loop in the desc chain */
354         uint16_t nr_desc = 1;
355
356         desc = &descs[desc_idx];
357         desc_chunck_len = desc->len;
358         desc_gaddr = desc->addr;
359         desc_addr = gpa_to_vva(dev, desc_gaddr, &desc_chunck_len);
360         /*
361          * The 'desc_addr' check is placed outside the 'unlikely' macro to avoid
362          * a performance issue with some gcc versions (4.8.4 and 5.3.0), which
363          * otherwise store the offset on the stack instead of in a register.
364          */
365         if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
366                 return -1;
367
368         rte_prefetch0((void *)(uintptr_t)desc_addr);
369
370         virtio_enqueue_offload(m, &virtio_hdr.hdr);
371         if (likely(desc_chunck_len >= dev->vhost_hlen)) {
372                 copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
373
374                 virtio_enqueue_offload(m,
375                                 (struct virtio_net_hdr *)(uintptr_t)desc_addr);
376                 PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
377         } else {
378                 uint64_t remain = dev->vhost_hlen;
379                 uint64_t len;
380                 uint64_t src = (uint64_t)(uintptr_t)&virtio_hdr, dst;
381                 uint64_t guest_addr = desc_gaddr;
382
383                 while (remain) {
384                         len = remain;
385                         dst = gpa_to_vva(dev, guest_addr, &len);
386                         if (unlikely(!dst || !len))
387                                 return -1;
388
389                         rte_memcpy((void *)(uintptr_t)dst,
390                                         (void *)(uintptr_t)src, len);
391
392                         PRINT_PACKET(dev, (uintptr_t)dst, (uint32_t)len, 0);
393                         remain -= len;
394                         guest_addr += len;
395                         dst += len;
396                 }
397         }
398
399         vhost_log_cache_write(dev, vq, desc_gaddr, dev->vhost_hlen);
400
401         desc_avail  = desc->len - dev->vhost_hlen;
402         if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
403                 desc_chunck_len = desc_avail;
404                 desc_gaddr += dev->vhost_hlen;
405                 desc_addr = gpa_to_vva(dev,
406                                 desc_gaddr,
407                                 &desc_chunck_len);
408                 if (unlikely(!desc_addr))
409                         return -1;
410
411                 desc_offset = 0;
412         } else {
413                 desc_offset = dev->vhost_hlen;
414                 desc_chunck_len -= dev->vhost_hlen;
415         }
416
417         mbuf_avail  = rte_pktmbuf_data_len(m);
418         mbuf_offset = 0;
419         while (mbuf_avail != 0 || m->next != NULL) {
420                 /* done with current mbuf, fetch next */
421                 if (mbuf_avail == 0) {
422                         m = m->next;
423
424                         mbuf_offset = 0;
425                         mbuf_avail  = rte_pktmbuf_data_len(m);
426                 }
427
428                 /* done with current desc buf, fetch next */
429                 if (desc_avail == 0) {
430                         if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
431                                 /* Not enough room in the vring buffer */
432                                 return -1;
433                         }
434                         if (unlikely(desc->next >= size || ++nr_desc > size))
435                                 return -1;
436
437                         desc = &descs[desc->next];
438                         desc_chunck_len = desc->len;
439                         desc_gaddr = desc->addr;
440                         desc_addr = gpa_to_vva(dev,
441                                         desc_gaddr, &desc_chunck_len);
442                         if (unlikely(!desc_addr))
443                                 return -1;
444
445                         desc_offset = 0;
446                         desc_avail  = desc->len;
447                 } else if (unlikely(desc_chunck_len == 0)) {
448                         desc_chunck_len = desc_avail;
449                         desc_gaddr += desc_offset;
450                         desc_addr = gpa_to_vva(dev,
451                                         desc_gaddr, &desc_chunck_len);
452                         if (unlikely(!desc_addr))
453                                 return -1;
454
455                         desc_offset = 0;
456                 }
457
458                 cpy_len = RTE_MIN(desc_avail, mbuf_avail);
459                 rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
460                         rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
461                         cpy_len);
462                 vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset,
463                                 cpy_len);
464                 PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
465                              cpy_len, 0);
466
467                 mbuf_avail  -= cpy_len;
468                 mbuf_offset += cpy_len;
469                 desc_avail  -= cpy_len;
470                 desc_offset += cpy_len;
471                 desc_chunck_len -= cpy_len;
472         }
473
474         return 0;
475 }
476
477 /**
478  * This function adds buffers to the virtio device's RX virtqueue. Buffers can
479  * be received from the physical port or from another virtio device. A packet
480  * count is returned to indicate the number of packets that are successfully
481  * added to the RX queue. This function works when the mbuf is scattered, but
482  * it doesn't support the mergeable feature.
483  */
484 static inline uint32_t __attribute__((always_inline))
485 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
486               struct rte_mbuf **pkts, uint32_t count)
487 {
488         struct vhost_virtqueue *vq;
489         uint16_t avail_idx, free_entries, start_idx;
490         uint16_t desc_indexes[MAX_PKT_BURST];
491         struct vring_desc *descs;
492         uint16_t used_idx;
493         uint32_t i, sz;
494         uint64_t dlen;
495
496         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
497         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
498                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
499                         dev->vid, __func__, queue_id);
500                 return 0;
501         }
502
503         vq = dev->virtqueue[queue_id];
504
505         rte_spinlock_lock(&vq->access_lock);
506
507         if (unlikely(vq->enabled == 0))
508                 goto out_access_unlock;
509
510         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
511         start_idx = vq->last_used_idx;
512         free_entries = avail_idx - start_idx;
513         count = RTE_MIN(count, free_entries);
514         count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
515         if (count == 0)
516                 goto out_access_unlock;
517
518         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
519                 dev->vid, start_idx, start_idx + count);
520
521         /* Retrieve all of the desc indexes first to avoid caching issues. */
522         rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
523         for (i = 0; i < count; i++) {
524                 used_idx = (start_idx + i) & (vq->size - 1);
525                 desc_indexes[i] = vq->avail->ring[used_idx];
526                 vq->used->ring[used_idx].id = desc_indexes[i];
527                 vq->used->ring[used_idx].len = pkts[i]->pkt_len +
528                                                dev->vhost_hlen;
529                 vhost_log_cache_used_vring(dev, vq,
530                         offsetof(struct vring_used, ring[used_idx]),
531                         sizeof(vq->used->ring[used_idx]));
532         }
533
534         rte_prefetch0(&vq->desc[desc_indexes[0]]);
535         for (i = 0; i < count; i++) {
536                 struct vring_desc *idesc = NULL;
537                 uint16_t desc_idx = desc_indexes[i];
538                 int err;
539
540                 if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) {
541                         dlen = vq->desc[desc_idx].len;
542                         descs = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev,
543                                         vq->desc[desc_idx].addr, &dlen);
544                         if (unlikely(!descs)) {
545                                 count = i;
546                                 break;
547                         }
548
549                         if (unlikely(dlen < vq->desc[desc_idx].len)) {
550                                 /*
551                                  * The indirect desc table is not contiguous
552                                  * in process VA space, so we have to copy it.
553                                  */
554                                 idesc = alloc_copy_ind_table(dev,
555                                                         &vq->desc[desc_idx]);
556                                 if (unlikely(!idesc))
557                                         break;
558
559                                 descs = idesc;
560                         }
561
562                         desc_idx = 0;
563                         sz = vq->desc[desc_idx].len / sizeof(*descs);
564                 } else {
565                         descs = vq->desc;
566                         sz = vq->size;
567                 }
568
569                 err = copy_mbuf_to_desc(dev, vq, descs, pkts[i], desc_idx, sz);
570                 if (unlikely(err)) {
571                         used_idx = (start_idx + i) & (vq->size - 1);
572                         vq->used->ring[used_idx].len = dev->vhost_hlen;
573                         vhost_log_cache_used_vring(dev, vq,
574                                 offsetof(struct vring_used, ring[used_idx]),
575                                 sizeof(vq->used->ring[used_idx]));
576                 }
577
578                 if (i + 1 < count)
579                         rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
580
581                 if (unlikely(!!idesc))
582                         free_ind_table(idesc);
583         }
584
585         rte_smp_wmb();
586
587         vhost_log_cache_sync(dev, vq);
588
589         *(volatile uint16_t *)&vq->used->idx += count;
590         vq->last_used_idx += count;
591         vhost_log_used_vring(dev, vq,
592                 offsetof(struct vring_used, idx),
593                 sizeof(vq->used->idx));
594
595         /* flush used->idx update before we read avail->flags. */
596         rte_mb();
597
598         /* Kick the guest if necessary. */
599         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
600                         && (vq->callfd >= 0))
601                 eventfd_write(vq->callfd, (eventfd_t)1);
602
603 out_access_unlock:
604         rte_spinlock_unlock(&vq->access_lock);
605
606         return count;
607 }
608
609 static inline int __attribute__((always_inline))
610 fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
611                          uint32_t avail_idx, uint32_t *vec_idx,
612                          struct buf_vector *buf_vec, uint16_t *desc_chain_head,
613                          uint32_t *desc_chain_len)
614 {
615         uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
616         uint32_t vec_id = *vec_idx;
617         uint32_t len    = 0;
618         uint64_t dlen;
619         struct vring_desc *descs = vq->desc;
620         struct vring_desc *idesc = NULL;
621
622         *desc_chain_head = idx;
623
624         if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
625                 dlen = vq->desc[idx].len;
626                 descs = (struct vring_desc *)(uintptr_t)
627                                         gpa_to_vva(dev, vq->desc[idx].addr,
628                                                            &dlen);
629                 if (unlikely(!descs))
630                         return -1;
631
632                 if (unlikely(dlen < vq->desc[idx].len)) {
633                         /*
634                          * The indirect desc table is not contiguous
635                          * in process VA space, so we have to copy it.
636                          */
637                         idesc = alloc_copy_ind_table(dev, &vq->desc[idx]);
638                         if (unlikely(!idesc))
639                                 return -1;
640
641                         descs = idesc;
642                 }
643
644                 idx = 0;
645         }
646
647         while (1) {
648                 if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size)) {
649                         free_ind_table(idesc);
650                         return -1;
651                 }
652
653                 len += descs[idx].len;
654                 buf_vec[vec_id].buf_addr = descs[idx].addr;
655                 buf_vec[vec_id].buf_len  = descs[idx].len;
656                 buf_vec[vec_id].desc_idx = idx;
657                 vec_id++;
658
659                 if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
660                         break;
661
662                 idx = descs[idx].next;
663         }
664
665         *desc_chain_len = len;
666         *vec_idx = vec_id;
667
668         if (unlikely(!!idesc))
669                 free_ind_table(idesc);
670
671         return 0;
672 }
673
674 /*
675  * Returns -1 on failure, 0 on success
676  */
677 static inline int
678 reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
679                                 uint32_t size, struct buf_vector *buf_vec,
680                                 uint16_t *num_buffers, uint16_t avail_head)
681 {
682         uint16_t cur_idx;
683         uint32_t vec_idx = 0;
684         uint16_t tries = 0;
685
686         uint16_t head_idx = 0;
687         uint32_t len = 0;
688
689         *num_buffers = 0;
690         cur_idx  = vq->last_avail_idx;
691
692         while (size > 0) {
693                 if (unlikely(cur_idx == avail_head))
694                         return -1;
695
696                 if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec,
697                                                 &head_idx, &len) < 0))
698                         return -1;
699                 len = RTE_MIN(len, size);
700                 update_shadow_used_ring(vq, head_idx, len);
701                 size -= len;
702
703                 cur_idx++;
704                 tries++;
705                 *num_buffers += 1;
706
707                 /*
708                  * If we have tried all available ring items and still
709                  * cannot get enough buffers, something abnormal has
710                  * happened.
711                  */
712                 if (unlikely(tries >= vq->size))
713                         return -1;
714         }
715
716         return 0;
717 }
718
719 static inline int __attribute__((always_inline))
720 copy_mbuf_to_desc_mergeable(struct virtio_net *dev,
721                         struct vhost_virtqueue *vq, struct rte_mbuf *m,
722                         struct buf_vector *buf_vec, uint16_t num_buffers)
723 {
724         struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
725         struct virtio_net_hdr_mrg_rxbuf *hdr;
726         uint32_t vec_idx = 0;
727         uint64_t desc_addr, desc_gaddr;
728         uint64_t desc_chunck_len;
729         uint32_t mbuf_offset, mbuf_avail;
730         uint32_t desc_offset, desc_avail;
731         uint32_t cpy_len;
732         uint64_t hdr_addr, hdr_phys_addr;
733         struct rte_mbuf *hdr_mbuf;
734
735         if (unlikely(m == NULL))
736                 return -1;
737
738         desc_chunck_len = buf_vec[vec_idx].buf_len;
739         desc_gaddr = buf_vec[vec_idx].buf_addr;
740         desc_addr = gpa_to_vva(dev, desc_gaddr, &desc_chunck_len);
741         if (buf_vec[vec_idx].buf_len < dev->vhost_hlen ||
742                         !desc_addr)
743                 return -1;
744
745         hdr_mbuf = m;
746         hdr_addr = desc_addr;
747         if (unlikely(desc_chunck_len < dev->vhost_hlen))
748                 hdr = &virtio_hdr;
749         else
750                 hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
751         hdr_phys_addr = buf_vec[vec_idx].buf_addr;
752         rte_prefetch0((void *)(uintptr_t)hdr_addr);
753
754         virtio_hdr.num_buffers = num_buffers;
755         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
756                 dev->vid, num_buffers);
757
758         desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
759         if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
760                 desc_chunck_len = desc_avail;
761                 desc_gaddr += dev->vhost_hlen;
762                 desc_addr = gpa_to_vva(dev,
763                                 desc_gaddr,
764                                 &desc_chunck_len);
765                 if (unlikely(!desc_addr))
766                         return -1;
767
768                 desc_offset = 0;
769         } else {
770                 desc_offset = dev->vhost_hlen;
771                 desc_chunck_len -= dev->vhost_hlen;
772         }
773
774
775         mbuf_avail  = rte_pktmbuf_data_len(m);
776         mbuf_offset = 0;
777         while (mbuf_avail != 0 || m->next != NULL) {
778                 /* done with current desc buf, get the next one */
779                 if (desc_avail == 0) {
780                         vec_idx++;
781                         desc_gaddr = buf_vec[vec_idx].buf_addr;
782                         desc_chunck_len = buf_vec[vec_idx].buf_len;
783                         desc_addr = gpa_to_vva(dev, desc_gaddr,
784                                         &desc_chunck_len);
785                         if (unlikely(!desc_addr))
786                                 return -1;
787
788                         /* Prefetch buffer address. */
789                         rte_prefetch0((void *)(uintptr_t)desc_addr);
790                         desc_offset = 0;
791                         desc_avail  = buf_vec[vec_idx].buf_len;
792                 } else if (unlikely(desc_chunck_len == 0)) {
793                         desc_chunck_len = desc_avail;
794                         desc_gaddr += desc_offset;
795                         desc_addr = gpa_to_vva(dev,
796                                         desc_gaddr,
797                                         &desc_chunck_len);
798                         if (unlikely(!desc_addr))
799                                 return -1;
800
801                         desc_offset = 0;
802                 }
803
804                 /* done with current mbuf, get the next one */
805                 if (mbuf_avail == 0) {
806                         m = m->next;
807
808                         mbuf_offset = 0;
809                         mbuf_avail  = rte_pktmbuf_data_len(m);
810                 }
811
812                 if (hdr_addr) {
813                         virtio_enqueue_offload(hdr_mbuf, &virtio_hdr.hdr);
814                         if (likely(hdr != &virtio_hdr)) {
815                                 copy_virtio_net_hdr(dev, hdr_addr, virtio_hdr);
816                         } else {
817                                 uint64_t len;
818                                 uint64_t remain = dev->vhost_hlen;
819                                 uint64_t src = (uint64_t)(uintptr_t)&virtio_hdr;
820                                 uint64_t dst;
821                                 uint64_t guest_addr = hdr_phys_addr;
822
823                                 while (remain) {
824                                         len = remain;
825                                         dst = gpa_to_vva(dev, guest_addr, &len);
826                                         if (unlikely(!dst || !len))
827                                                 return -1;
828
829                                         rte_memcpy((void *)(uintptr_t)dst,
830                                                         (void *)(uintptr_t)src,
831                                                         len);
832
833                                         PRINT_PACKET(dev, (uintptr_t)dst,
834                                                         (uint32_t)len, 0);
835
836                                         remain -= len;
837                                         guest_addr += len;
838                                         dst += len;
839                                 }
840                         }
841                         vhost_log_cache_write(dev, vq, hdr_phys_addr,
842                                         dev->vhost_hlen);
843                         PRINT_PACKET(dev, (uintptr_t)hdr_addr,
844                                      dev->vhost_hlen, 0);
845
846                         hdr_addr = 0;
847                 }
848
849                 cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
850                 rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
851                         rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
852                         cpy_len);
853                 vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset,
854                                 cpy_len);
855                 PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
856                         cpy_len, 0);
857
858                 mbuf_avail  -= cpy_len;
859                 mbuf_offset += cpy_len;
860                 desc_avail  -= cpy_len;
861                 desc_offset += cpy_len;
862                 desc_chunck_len -= cpy_len;
863         }
864
865         return 0;
866 }
867
868 static inline uint32_t __attribute__((always_inline))
869 virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
870         struct rte_mbuf **pkts, uint32_t count)
871 {
872         struct vhost_virtqueue *vq;
873         uint32_t pkt_idx = 0;
874         uint16_t num_buffers;
875         struct buf_vector buf_vec[BUF_VECTOR_MAX];
876         uint16_t avail_head;
877
878         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
879         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
880                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
881                         dev->vid, __func__, queue_id);
882                 return 0;
883         }
884
885         vq = dev->virtqueue[queue_id];
886
887         rte_spinlock_lock(&vq->access_lock);
888
889         if (unlikely(vq->enabled == 0))
890                 goto out_access_unlock;
891
892         count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
893         if (count == 0)
894                 goto out_access_unlock;
895
896         rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
897
898         vq->shadow_used_idx = 0;
899         avail_head = *((volatile uint16_t *)&vq->avail->idx);
900         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
901                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
902
903                 if (unlikely(reserve_avail_buf_mergeable(dev, vq,
904                                                 pkt_len, buf_vec, &num_buffers,
905                                                 avail_head) < 0)) {
906                         VHOST_LOG_DEBUG(VHOST_DATA,
907                                 "(%d) failed to get enough desc from vring\n",
908                                 dev->vid);
909                         vq->shadow_used_idx -= num_buffers;
910                         break;
911                 }
912
913                 VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
914                         dev->vid, vq->last_avail_idx,
915                         vq->last_avail_idx + num_buffers);
916
917                 if (copy_mbuf_to_desc_mergeable(dev, vq, pkts[pkt_idx],
918                                                 buf_vec, num_buffers) < 0) {
919                         vq->shadow_used_idx -= num_buffers;
920                         break;
921                 }
922
923                 vq->last_avail_idx += num_buffers;
924         }
925
926         if (likely(vq->shadow_used_idx)) {
927                 flush_shadow_used_ring(dev, vq);
928
929                 /* flush used->idx update before we read avail->flags. */
930                 rte_mb();
931
932                 /* Kick the guest if necessary. */
933                 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
934                                 && (vq->callfd >= 0))
935                         eventfd_write(vq->callfd, (eventfd_t)1);
936         }
937
938 out_access_unlock:
939         rte_spinlock_unlock(&vq->access_lock);
940
941         return pkt_idx;
942 }
943
944 uint16_t
945 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
946         struct rte_mbuf **pkts, uint16_t count)
947 {
948         struct virtio_net *dev = get_device(vid);
949
950         if (!dev)
951                 return 0;
952
953         if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
954                 return virtio_dev_merge_rx(dev, queue_id, pkts, count);
955         else
956                 return virtio_dev_rx(dev, queue_id, pkts, count);
957 }
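
/*
 * Minimal usage sketch (illustrative only; assumes "vid" refers to an
 * already attached vhost device and "pkts"/"nb_pkts" are owned by the
 * application): enqueue a burst to the first RX ring and then free the
 * mbufs, since the data is copied into guest memory and mbuf ownership
 * stays with the caller; the return value is the number of packets
 * actually placed in the ring:
 *
 *	uint16_t sent = rte_vhost_enqueue_burst(vid, VIRTIO_RXQ,
 *						pkts, nb_pkts);
 *	while (nb_pkts)
 *		rte_pktmbuf_free(pkts[--nb_pkts]);
 */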
958
959 static inline bool
960 virtio_net_with_host_offload(struct virtio_net *dev)
961 {
962         if (dev->features &
963                         ((1ULL << VIRTIO_NET_F_CSUM) |
964                          (1ULL << VIRTIO_NET_F_HOST_ECN) |
965                          (1ULL << VIRTIO_NET_F_HOST_TSO4) |
966                          (1ULL << VIRTIO_NET_F_HOST_TSO6) |
967                          (1ULL << VIRTIO_NET_F_HOST_UFO)))
968                 return true;
969
970         return false;
971 }
972
973 static void
974 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
975 {
976         struct ipv4_hdr *ipv4_hdr;
977         struct ipv6_hdr *ipv6_hdr;
978         void *l3_hdr = NULL;
979         struct ether_hdr *eth_hdr;
980         uint16_t ethertype;
981
982         eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
983
984         m->l2_len = sizeof(struct ether_hdr);
985         ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
986
987         if (ethertype == ETHER_TYPE_VLAN) {
988                 struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
989
990                 m->l2_len += sizeof(struct vlan_hdr);
991                 ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
992         }
993
994         l3_hdr = (char *)eth_hdr + m->l2_len;
995
996         switch (ethertype) {
997         case ETHER_TYPE_IPv4:
998                 ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
999                 *l4_proto = ipv4_hdr->next_proto_id;
1000                 m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
1001                 *l4_hdr = (char *)l3_hdr + m->l3_len;
1002                 m->ol_flags |= PKT_TX_IPV4;
1003                 break;
1004         case ETHER_TYPE_IPv6:
1005                 ipv6_hdr = (struct ipv6_hdr *)l3_hdr;
1006                 *l4_proto = ipv6_hdr->proto;
1007                 m->l3_len = sizeof(struct ipv6_hdr);
1008                 *l4_hdr = (char *)l3_hdr + m->l3_len;
1009                 m->ol_flags |= PKT_TX_IPV6;
1010                 break;
1011         default:
1012                 m->l3_len = 0;
1013                 *l4_proto = 0;
1014                 *l4_hdr = NULL;
1015                 break;
1016         }
1017 }
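
/*
 * Example of the parsing above: for a VLAN-tagged IPv4/TCP frame,
 * m->l2_len becomes 18 (Ethernet + VLAN header), m->l3_len is taken from
 * the IPv4 IHL field (20 bytes without options), *l4_proto is set to
 * IPPROTO_TCP, *l4_hdr points at the TCP header and PKT_TX_IPV4 is added
 * to m->ol_flags.
 */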
1018
1019 static inline void __attribute__((always_inline))
1020 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
1021 {
1022         uint16_t l4_proto = 0;
1023         void *l4_hdr = NULL;
1024         struct tcp_hdr *tcp_hdr = NULL;
1025
1026         if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
1027                 return;
1028
1029         parse_ethernet(m, &l4_proto, &l4_hdr);
1030         if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1031                 if (hdr->csum_start == (m->l2_len + m->l3_len)) {
1032                         switch (hdr->csum_offset) {
1033                         case (offsetof(struct tcp_hdr, cksum)):
1034                                 if (l4_proto == IPPROTO_TCP)
1035                                         m->ol_flags |= PKT_TX_TCP_CKSUM;
1036                                 break;
1037                         case (offsetof(struct udp_hdr, dgram_cksum)):
1038                                 if (l4_proto == IPPROTO_UDP)
1039                                         m->ol_flags |= PKT_TX_UDP_CKSUM;
1040                                 break;
1041                         case (offsetof(struct sctp_hdr, cksum)):
1042                                 if (l4_proto == IPPROTO_SCTP)
1043                                         m->ol_flags |= PKT_TX_SCTP_CKSUM;
1044                                 break;
1045                         default:
1046                                 break;
1047                         }
1048                 }
1049         }
1050
1051         if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1052                 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1053                 case VIRTIO_NET_HDR_GSO_TCPV4:
1054                 case VIRTIO_NET_HDR_GSO_TCPV6:
1055                         tcp_hdr = (struct tcp_hdr *)l4_hdr;
1056                         m->ol_flags |= PKT_TX_TCP_SEG;
1057                         m->tso_segsz = hdr->gso_size;
1058                         m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
1059                         break;
1060                 default:
1061                         RTE_LOG(WARNING, VHOST_DATA,
1062                                 "unsupported gso type %u.\n", hdr->gso_type);
1063                         break;
1064                 }
1065         }
1066 }
1067
1068 #define RARP_PKT_SIZE   64
1069
1070 static int
1071 make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
1072 {
1073         struct ether_hdr *eth_hdr;
1074         struct arp_hdr  *rarp;
1075
1076         if (rarp_mbuf->buf_len < 64) {
1077                 RTE_LOG(WARNING, VHOST_DATA,
1078                         "failed to make RARP; mbuf size too small %u (< %d)\n",
1079                         rarp_mbuf->buf_len, RARP_PKT_SIZE);
1080                 return -1;
1081         }
1082
1083         /* Ethernet header. */
1084         eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
1085         memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
1086         ether_addr_copy(mac, &eth_hdr->s_addr);
1087         eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
1088
1089         /* RARP header. */
1090         rarp = (struct arp_hdr *)(eth_hdr + 1);
1091         rarp->arp_hrd = htons(ARP_HRD_ETHER);
1092         rarp->arp_pro = htons(ETHER_TYPE_IPv4);
1093         rarp->arp_hln = ETHER_ADDR_LEN;
1094         rarp->arp_pln = 4;
1095         rarp->arp_op  = htons(ARP_OP_REVREQUEST);
1096
1097         ether_addr_copy(mac, &rarp->arp_data.arp_sha);
1098         ether_addr_copy(mac, &rarp->arp_data.arp_tha);
1099         memset(&rarp->arp_data.arp_sip, 0x00, 4);
1100         memset(&rarp->arp_data.arp_tip, 0x00, 4);
1101
1102         rarp_mbuf->pkt_len  = rarp_mbuf->data_len = RARP_PKT_SIZE;
1103
1104         return 0;
1105 }
1106
1107 static inline void __attribute__((always_inline))
1108 put_zmbuf(struct zcopy_mbuf *zmbuf)
1109 {
1110         zmbuf->in_use = 0;
1111 }
1112
1113 static inline int __attribute__((always_inline))
1114 copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
1115                   uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
1116                   struct rte_mempool *mbuf_pool)
1117 {
1118         struct vring_desc *desc;
1119         uint64_t desc_addr, desc_gaddr;
1120         uint32_t desc_avail, desc_offset;
1121         uint32_t mbuf_avail, mbuf_offset;
1122         uint32_t cpy_len;
1123         uint64_t desc_chunck_len;
1124         struct rte_mbuf *cur = m, *prev = m;
1125         struct virtio_net_hdr tmp_hdr;
1126         struct virtio_net_hdr *hdr = NULL;
1127         /* A counter to avoid a dead loop in the desc chain */
1128         uint32_t nr_desc = 1;
1129
1130         desc = &descs[desc_idx];
1131         if (unlikely((desc->len < dev->vhost_hlen)) ||
1132                         (desc->flags & VRING_DESC_F_INDIRECT))
1133                 return -1;
1134
1135         desc_chunck_len = desc->len;
1136         desc_gaddr = desc->addr;
1137         desc_addr = gpa_to_vva(dev, desc_gaddr, &desc_chunck_len);
1138         if (unlikely(!desc_addr))
1139                 return -1;
1140
1141         if (virtio_net_with_host_offload(dev)) {
1142                 if (unlikely(desc_chunck_len < sizeof(struct virtio_net_hdr))) {
1143                         uint64_t len = desc_chunck_len;
1144                         uint64_t remain = sizeof(struct virtio_net_hdr);
1145                         uint64_t src = desc_addr;
1146                         uint64_t dst = (uint64_t)(uintptr_t)&tmp_hdr;
1147                         uint64_t guest_addr = desc_gaddr;
1148
1149                         /*
1150                          * No luck, the virtio-net header doesn't fit
1151                          * in a contiguous virtual area.
1152                          */
1153                         while (remain) {
1154                                 len = remain;
1155                                 src = gpa_to_vva(dev, guest_addr, &len);
1156                                 if (unlikely(!src || !len))
1157                                         return -1;
1158
1159                                 rte_memcpy((void *)(uintptr_t)dst,
1160                                                    (void *)(uintptr_t)src, len);
1161
1162                                 guest_addr += len;
1163                                 remain -= len;
1164                                 dst += len;
1165                         }
1166
1167                         hdr = &tmp_hdr;
1168                 } else {
1169                         hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
1170                         rte_prefetch0(hdr);
1171                 }
1172         }
1173
1174         /*
1175          * A virtio driver normally uses at least 2 desc buffers
1176          * for Tx: the first for storing the header, and the others
1177          * for storing the data.
1178          */
1179         if (likely((desc->len == dev->vhost_hlen) &&
1180                    (desc->flags & VRING_DESC_F_NEXT) != 0)) {
1181                 desc = &descs[desc->next];
1182                 if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
1183                         return -1;
1184
1185                 desc_chunck_len = desc->len;
1186                 desc_gaddr = desc->addr;
1187                 desc_addr = gpa_to_vva(dev, desc_gaddr, &desc_chunck_len);
1188                 if (unlikely(!desc_addr))
1189                         return -1;
1190
1191                 desc_offset = 0;
1192                 desc_avail  = desc->len;
1193                 nr_desc    += 1;
1194         } else {
1195                 desc_avail  = desc->len - dev->vhost_hlen;
1196
1197                 if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
1198                         desc_chunck_len = desc_avail;
1199                         desc_gaddr += dev->vhost_hlen;
1200                         desc_addr = gpa_to_vva(dev,
1201                                         desc_gaddr,
1202                                         &desc_chunck_len);
1203                         if (unlikely(!desc_addr))
1204                                 return -1;
1205
1206                         desc_offset = 0;
1207                 } else {
1208                         desc_offset = dev->vhost_hlen;
1209                         desc_chunck_len -= dev->vhost_hlen;
1210                 }
1211         }
1212
1213         rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset));
1214
1215         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
1216                         (uint32_t)desc_chunck_len, 0);
1217
1218         mbuf_offset = 0;
1219         mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
1220         while (1) {
1221                 uint64_t hpa;
1222
1223                 cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
1224
1225                 /*
1226                  * A desc buf might span two host physical pages that are
1227                  * not contiguous. In that case (gpa_to_hpa returns 0), the
1228                  * data will be copied even though zero copy is enabled.
1229                  */
1230                 if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
1231                                         desc_gaddr + desc_offset, cpy_len)))) {
1232                         cur->data_len = cpy_len;
1233                         cur->data_off = 0;
1234                         cur->buf_addr = (void *)(uintptr_t)(desc_gaddr
1235                                         + desc_offset);
1236                         cur->buf_physaddr = hpa;
1237
1238                         /*
1239                          * In zero copy mode, one mbuf can only reference data
1240                          * for one desc buf, or part of one.
1241                          */
1242                         mbuf_avail = cpy_len;
1243                 } else {
1244                         rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
1245                                                            mbuf_offset),
1246                                 (void *)((uintptr_t)(desc_addr + desc_offset)),
1247                                 cpy_len);
1248                 }
1249
1250                 mbuf_avail  -= cpy_len;
1251                 mbuf_offset += cpy_len;
1252                 desc_avail  -= cpy_len;
1253                 desc_chunck_len -= cpy_len;
1254                 desc_offset += cpy_len;
1255
1256                 /* This desc has reached its end, get the next one */
1257                 if (desc_avail == 0) {
1258                         if ((desc->flags & VRING_DESC_F_NEXT) == 0)
1259                                 break;
1260
1261                         if (unlikely(desc->next >= max_desc ||
1262                                      ++nr_desc > max_desc))
1263                                 return -1;
1264                         desc = &descs[desc->next];
1265                         if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
1266                                 return -1;
1267
1268                         desc_chunck_len = desc->len;
1269                         desc_gaddr = desc->addr;
1270                         desc_addr = gpa_to_vva(dev, desc_gaddr,
1271                                         &desc_chunck_len);
1272                         if (unlikely(!desc_addr))
1273                                 return -1;
1274
1275                         rte_prefetch0((void *)(uintptr_t)desc_addr);
1276
1277                         desc_offset = 0;
1278                         desc_avail  = desc->len;
1279
1280                         PRINT_PACKET(dev, (uintptr_t)desc_addr,
1281                                         (uint32_t)desc_chunck_len, 0);
1282                 } else if (unlikely(desc_chunck_len == 0)) {
1283                         desc_chunck_len = desc_avail;
1284                         desc_gaddr += desc_offset;
1285                         desc_addr = gpa_to_vva(dev,
1286                                         desc_gaddr,
1287                                         &desc_chunck_len);
1288                         if (unlikely(!desc_addr))
1289                                 return -1;
1290
1291                         desc_offset = 0;
1292
1293                         PRINT_PACKET(dev, (uintptr_t)desc_addr,
1294                                         (uint32_t)desc_chunck_len, 0);
1295                 }
1296
1297                 /*
1298                  * This mbuf has reached its end, get a new one
1299                  * to hold more data.
1300                  */
1301                 if (mbuf_avail == 0) {
1302                         cur = rte_pktmbuf_alloc(mbuf_pool);
1303                         if (unlikely(cur == NULL)) {
1304                                 RTE_LOG(ERR, VHOST_DATA, "Failed to "
1305                                         "allocate memory for mbuf.\n");
1306                                 return -1;
1307                         }
1308                         if (unlikely(dev->dequeue_zero_copy))
1309                                 rte_mbuf_refcnt_update(cur, 1);
1310
1311                         prev->next = cur;
1312                         prev->data_len = mbuf_offset;
1313                         m->nb_segs += 1;
1314                         m->pkt_len += mbuf_offset;
1315                         prev = cur;
1316
1317                         mbuf_offset = 0;
1318                         mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
1319                 }
1320         }
1321
1322         prev->data_len = mbuf_offset;
1323         m->pkt_len    += mbuf_offset;
1324
1325         if (hdr)
1326                 vhost_dequeue_offload(hdr, m);
1327
1328         return 0;
1329 }
1330
1331 static inline void __attribute__((always_inline))
1332 update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
1333                  uint32_t used_idx, uint32_t desc_idx)
1334 {
1335         vq->used->ring[used_idx].id  = desc_idx;
1336         vq->used->ring[used_idx].len = 0;
1337         vhost_log_cache_used_vring(dev, vq,
1338                         offsetof(struct vring_used, ring[used_idx]),
1339                         sizeof(vq->used->ring[used_idx]));
1340 }
1341
1342 static inline void __attribute__((always_inline))
1343 update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
1344                 uint32_t count)
1345 {
1346         if (unlikely(count == 0))
1347                 return;
1348
1349         rte_smp_wmb();
1350         rte_smp_rmb();
1351
1352         vhost_log_cache_sync(dev, vq);
1353
1354         vq->used->idx += count;
1355         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
1356                         sizeof(vq->used->idx));
1357
1358         /* Kick guest if required. */
1359         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
1360                         && (vq->callfd >= 0))
1361                 eventfd_write(vq->callfd, (eventfd_t)1);
1362 }
1363
1364 static inline struct zcopy_mbuf *__attribute__((always_inline))
1365 get_zmbuf(struct vhost_virtqueue *vq)
1366 {
1367         uint16_t i;
1368         uint16_t last;
1369         int tries = 0;
1370
1371         /* search [last_zmbuf_idx, zmbuf_size) */
1372         i = vq->last_zmbuf_idx;
1373         last = vq->zmbuf_size;
1374
1375 again:
1376         for (; i < last; i++) {
1377                 if (vq->zmbufs[i].in_use == 0) {
1378                         vq->last_zmbuf_idx = i + 1;
1379                         vq->zmbufs[i].in_use = 1;
1380                         return &vq->zmbufs[i];
1381                 }
1382         }
1383
1384         tries++;
1385         if (tries == 1) {
1386                 /* search [0, last_zmbuf_idx) */
1387                 i = 0;
1388                 last = vq->last_zmbuf_idx;
1389                 goto again;
1390         }
1391
1392         return NULL;
1393 }
1394
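/*
 * A zero-copy mbuf chain is considered consumed once the application has
 * dropped its reference on every segment, leaving only the extra "pin"
 * reference taken at dequeue time (refcnt == 1).
 */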
1395 static inline bool __attribute__((always_inline))
1396 mbuf_is_consumed(struct rte_mbuf *m)
1397 {
1398         while (m) {
1399                 if (rte_mbuf_refcnt_read(m) > 1)
1400                         return false;
1401                 m = m->next;
1402         }
1403
1404         return true;
1405 }
1406
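/*
 * Zero-copy dequeue points buf_addr/buf_physaddr into guest memory;
 * restore them to their default location (right after the mbuf structure
 * and private area) before the mbuf goes back to its mempool.
 */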
1407 static inline void __attribute__((always_inline))
1408 restore_mbuf(struct rte_mbuf *m)
1409 {
1410         uint32_t mbuf_size, priv_size;
1411
1412         while (m) {
1413                 priv_size = rte_pktmbuf_priv_size(m->pool);
1414                 mbuf_size = sizeof(struct rte_mbuf) + priv_size;
1415                 /* start of buffer is after mbuf structure and priv data */
1416
1417                 m->buf_addr = (char *)m + mbuf_size;
1418                 m->buf_physaddr = rte_mempool_virt2phy(NULL, m) + mbuf_size;
1419                 m = m->next;
1420         }
1421 }
1422
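/*
 * Dequeue up to "count" packets sent by the guest on virtqueue "queue_id",
 * copying (or zero-copy attaching) them into mbufs allocated from
 * "mbuf_pool".  Returns the number of packets placed in "pkts"; a broadcast
 * RARP packet may be injected at the head of the burst after live migration.
 *
 * Illustrative caller loop -- a sketch only; "vid", "mbuf_pool" and the
 * choice to free rather than forward are placeholders, not part of this
 * file:
 *
 *	struct rte_mbuf *pkts[MAX_PKT_BURST];
 *	uint16_t nb, k;
 *
 *	nb = rte_vhost_dequeue_burst(vid, VIRTIO_TXQ, mbuf_pool,
 *				     pkts, MAX_PKT_BURST);
 *	for (k = 0; k < nb; k++)
 *		rte_pktmbuf_free(pkts[k]);
 */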
1423 uint16_t
1424 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
1425         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
1426 {
1427         struct virtio_net *dev;
1428         struct rte_mbuf *rarp_mbuf = NULL;
1429         struct vhost_virtqueue *vq;
1430         uint32_t desc_indexes[MAX_PKT_BURST];
1431         uint32_t used_idx;
1432         uint32_t i = 0;
1433         uint16_t free_entries;
1434         uint16_t avail_idx;
1435
1436         dev = get_device(vid);
1437         if (!dev)
1438                 return 0;
1439
1440         if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
1441                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
1442                         dev->vid, __func__, queue_id);
1443                 return 0;
1444         }
1445
1446         vq = dev->virtqueue[queue_id];
1447
1448         if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
1449                 return 0;
1450
1451         if (unlikely(vq->enabled == 0))
1452                 goto out_access_unlock;
1453
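        /*
         * For zero-copy, first reclaim the mbufs the application has
         * finished with: return their descriptors to the used ring,
         * restore and free the mbufs, and release the zmbuf slots.
         */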
1454         if (unlikely(dev->dequeue_zero_copy)) {
1455                 struct zcopy_mbuf *zmbuf, *next;
1456                 int nr_updated = 0;
1457
1458                 for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
1459                      zmbuf != NULL; zmbuf = next) {
1460                         next = TAILQ_NEXT(zmbuf, next);
1461
1462                         if (mbuf_is_consumed(zmbuf->mbuf)) {
1463                                 used_idx = vq->last_used_idx++ & (vq->size - 1);
1464                                 update_used_ring(dev, vq, used_idx,
1465                                                  zmbuf->desc_idx);
1466                                 nr_updated += 1;
1467
1468                                 TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
1469                                 restore_mbuf(zmbuf->mbuf);
1470                                 rte_pktmbuf_free(zmbuf->mbuf);
1471                                 put_zmbuf(zmbuf);
1472                                 vq->nr_zmbuf -= 1;
1473                         }
1474                 }
1475
1476                 update_used_idx(dev, vq, nr_updated);
1477         }
1478
1479         /*
1480          * Construct a RARP broadcast packet and inject it into the "pkts"
1481          * array, so that it looks like the guest actually sent it.
1482          *
1483          * Check user_send_rarp() for more information.
1484          *
1485          * broadcast_rarp shares a cacheline in the virtio_net structure
1486          * with some fields that are accessed during enqueue, and
1487          * rte_atomic16_cmpset() causes a write when using cmpxchg.  This
1488          * could result in false sharing between enqueue and dequeue.
1489          *
1490          * Prevent unnecessary false sharing by reading broadcast_rarp first
1491          * and only performing cmpset if the read indicates it is likely to
1492          * be set.
1493          */
1494
1495         if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
1496                         rte_atomic16_cmpset((volatile uint16_t *)
1497                                 &dev->broadcast_rarp.cnt, 1, 0))) {
1498
1499                 rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
1500                 if (rarp_mbuf == NULL) {
1501                         RTE_LOG(ERR, VHOST_DATA,
1502                                 "Failed to allocate memory for mbuf.\n");
1503                         goto out_access_unlock;
1504                 }
1505
1506                 if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
1507                         rte_pktmbuf_free(rarp_mbuf);
1508                         rarp_mbuf = NULL;
1509                 } else {
1510                         count -= 1;
1511                 }
1512         }
1513
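        /*
         * Read the guest-updated avail index through a volatile pointer so
         * the compiler does not cache a stale value across polls.
         */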
1514         free_entries = *((volatile uint16_t *)&vq->avail->idx) -
1515                         vq->last_avail_idx;
1516         if (free_entries == 0)
1517                 goto out_access_unlock;
1518
1519         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
1520
1521         /* Prefetch available and used ring */
1522         avail_idx = vq->last_avail_idx & (vq->size - 1);
1523         used_idx  = vq->last_used_idx  & (vq->size - 1);
1524         rte_prefetch0(&vq->avail->ring[avail_idx]);
1525         rte_prefetch0(&vq->used->ring[used_idx]);
1526
1527         count = RTE_MIN(count, MAX_PKT_BURST);
1528         count = RTE_MIN(count, free_entries);
1529         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
1530                         dev->vid, count);
1531
1532         /* Retrieve all of the head indexes first to avoid caching issues. */
1533         for (i = 0; i < count; i++) {
1534                 avail_idx = (vq->last_avail_idx + i) & (vq->size - 1);
1535                 used_idx  = (vq->last_used_idx  + i) & (vq->size - 1);
1536                 desc_indexes[i] = vq->avail->ring[avail_idx];
1537
1538                 if (likely(dev->dequeue_zero_copy == 0))
1539                         update_used_ring(dev, vq, used_idx, desc_indexes[i]);
1540         }
1541
1542                 /* Prefetch the first descriptor in the batch. */
1543         rte_prefetch0(&vq->desc[desc_indexes[0]]);
1544         for (i = 0; i < count; i++) {
1545                 struct vring_desc *desc, *idesc = NULL;
1546                 uint16_t sz, idx;
1547                 uint64_t dlen;
1548                 int err;
1549
1550                 if (likely(i + 1 < count))
1551                         rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
1552
1553                 if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
1554                         dlen = vq->desc[desc_indexes[i]].len;
1555                         desc = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev,
1556                                         vq->desc[desc_indexes[i]].addr,
1557                                         &dlen);
1558                         if (unlikely(!desc))
1559                                 break;
1560
1561                         if (unlikely(dlen < vq->desc[desc_indexes[i]].len)) {
1562                                 /*
1563                                  * The indirect desc table is not contiguous
1564                                  * in process VA space, so we have to copy it.
1565                                  */
1566                                 idesc = alloc_copy_ind_table(dev,
1567                                                 &vq->desc[desc_indexes[i]]);
1568                                 if (unlikely(!idesc))
1569                                         break;
1570
1571                                 desc = idesc;
1572                         }
1573
1574                         rte_prefetch0(desc);
1575                         sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
1576                         idx = 0;
1577                 } else {
1578                         desc = vq->desc;
1579                         sz = vq->size;
1580                         idx = desc_indexes[i];
1581                 }
1582
1583                 pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
1584                 if (unlikely(pkts[i] == NULL)) {
1585                         RTE_LOG(ERR, VHOST_DATA,
1586                                 "Failed to allocate memory for mbuf.\n");
1587                         free_ind_table(idesc);
1588                         break;
1589                 }
1590
1591                 err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
1592                 if (unlikely(err)) {
1593                         rte_pktmbuf_free(pkts[i]);
1594                         free_ind_table(idesc);
1595                         break;
1596                 }
1597
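                /*
                 * For zero-copy, remember which descriptor backs this mbuf
                 * so it can be returned to the used ring once the
                 * application releases the packet.
                 */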
1598                 if (unlikely(dev->dequeue_zero_copy)) {
1599                         struct zcopy_mbuf *zmbuf;
1600
1601                         zmbuf = get_zmbuf(vq);
1602                         if (!zmbuf) {
1603                                 rte_pktmbuf_free(pkts[i]);
1604                                 free_ind_table(idesc);
1605                                 break;
1606                         }
1607                         zmbuf->mbuf = pkts[i];
1608                         zmbuf->desc_idx = desc_indexes[i];
1609
1610                         /*
1611                          * Pin the mbuf with an extra reference; we check
1612                          * later whether we are its last user (i.e. whether
1613                          * the application has freed it), and only then can
1614                          * the used ring be updated safely.
1615                          */
1616                         rte_mbuf_refcnt_update(pkts[i], 1);
1617
1618                         vq->nr_zmbuf += 1;
1619                         TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
1620                 }
1621
1622                 if (unlikely(!!idesc))
1623                         free_ind_table(idesc);
1624         }
1625         vq->last_avail_idx += i;
1626
1627         if (likely(dev->dequeue_zero_copy == 0)) {
1628                 vq->last_used_idx += i;
1629                 update_used_idx(dev, vq, i);
1630         }
1631
1632 out_access_unlock:
1633         rte_spinlock_unlock(&vq->access_lock);
1634
1635         if (unlikely(rarp_mbuf != NULL)) {
1636                 /*
1637                  * Inject it at the head of the "pkts" array, so that the
1638                  * switch's MAC learning table gets updated first.
1639                  */
1640                 memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
1641                 pkts[0] = rarp_mbuf;
1642                 i += 1;
1643         }
1644
1645         return i;
1646 }