New upstream version 16.11.7
[deb_dpdk.git] / lib / librte_vhost / virtio_net.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <stdint.h>
35 #include <stdbool.h>
36 #include <linux/virtio_net.h>
37
38 #include <rte_mbuf.h>
39 #include <rte_memcpy.h>
40 #include <rte_ether.h>
41 #include <rte_ip.h>
42 #include <rte_virtio_net.h>
43 #include <rte_tcp.h>
44 #include <rte_udp.h>
45 #include <rte_sctp.h>
46 #include <rte_arp.h>
47 #include <rte_spinlock.h>
48 #include <rte_malloc.h>
49
50 #include "vhost.h"
51
52 #define MAX_PKT_BURST 32
53 #define VHOST_LOG_PAGE  4096
54
55 /*
56  * Atomically set a bit in memory.
57  */
58 static inline void __attribute__((always_inline))
59 vhost_set_bit(unsigned int nr, volatile uint8_t *addr)
60 {
61         __sync_fetch_and_or_1(addr, (1U << nr));
62 }
63
64 static inline void __attribute__((always_inline))
65 vhost_log_page(uint8_t *log_base, uint64_t page)
66 {
67         vhost_set_bit(page % 8, &log_base[page / 8]);
68 }
69
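/*
 * Mark the pages covering guest physical range [addr, addr + len) as
 * dirty in the log bitmap when dirty page logging (VHOST_F_LOG_ALL)
 * is enabled; the bitmap is typically consumed for live migration.
 */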
70 static inline void __attribute__((always_inline))
71 vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
72 {
73         uint64_t page;
74
75         if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
76                    !dev->log_base || !len))
77                 return;
78
79         if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
80                 return;
81
82         /* To make sure guest memory updates are committed before logging */
83         rte_smp_wmb();
84
85         page = addr / VHOST_LOG_PAGE;
86         while (page * VHOST_LOG_PAGE < addr + len) {
87                 vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
88                 page += 1;
89         }
90 }
91
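/*
 * Flush the per-virtqueue log cache into the shared dirty log bitmap
 * using atomic OR, then reset the cache.
 */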
92 static inline void __attribute__((always_inline))
93 vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq)
94 {
95         unsigned long *log_base;
96         int i;
97
98         if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
99                                 !dev->log_base))
100                 return;
101
102         log_base = (unsigned long *)(uintptr_t)dev->log_base;
103
104         /*
105          * It is expected a write memory barrier has been issued
106          * before this function is called.
107          */
108
109         for (i = 0; i < vq->log_cache_nb_elem; i++) {
110                 struct log_cache_entry *elem = vq->log_cache + i;
111
112                 __sync_fetch_and_or(log_base + elem->offset, elem->val);
113         }
114
115         rte_smp_wmb();
116
117         vq->log_cache_nb_elem = 0;
118 }
119
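/*
 * Record one dirty page in the per-virtqueue log cache. If the cache
 * is full, fall back to writing the dirty log bitmap directly.
 */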
120 static inline void __attribute__((always_inline))
121 vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq,
122                 uint64_t page)
123 {
124         uint32_t bit_nr = page % (sizeof(unsigned long) << 3);
125         uint32_t offset = page / (sizeof(unsigned long) << 3);
126         int i;
127
128         for (i = 0; i < vq->log_cache_nb_elem; i++) {
129                 struct log_cache_entry *elem = vq->log_cache + i;
130
131                 if (elem->offset == offset) {
132                         elem->val |= (1UL << bit_nr);
133                         return;
134                 }
135         }
136
137         if (unlikely(i >= VHOST_LOG_CACHE_NR)) {
138                 /*
139                  * No more room for a new log cache entry,
140                  * so write the dirty log map directly.
141                  */
142                 rte_smp_wmb();
143                 vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
144
145                 return;
146         }
147
148         vq->log_cache[i].offset = offset;
149         vq->log_cache[i].val = (1UL << bit_nr);
150 }
151
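/*
 * Cached counterpart of vhost_log_write(): log the guest physical
 * range [addr, addr + len) through the per-virtqueue log cache.
 */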
152 static inline void __attribute__((always_inline))
153 vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
154                 uint64_t addr, uint64_t len)
155 {
156         uint64_t page;
157
158         if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
159                                 !dev->log_base || !len))
160                 return;
161
162         if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
163                 return;
164
165         page = addr / VHOST_LOG_PAGE;
166         while (page * VHOST_LOG_PAGE < addr + len) {
167                 vhost_log_cache_page(dev, vq, page);
168                 page += 1;
169         }
170 }
171
172 static inline void __attribute__((always_inline))
173 vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
174                 uint64_t offset, uint64_t len)
175 {
176         vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len);
177 }
178
179 static inline void __attribute__((always_inline))
180 vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
181                      uint64_t offset, uint64_t len)
182 {
183         vhost_log_write(dev, vq->log_guest_addr + offset, len);
184 }
185
186 static bool
187 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
188 {
189         return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
190 }
191
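/*
 * Allocate a flat copy of a guest indirect descriptor table whose
 * guest physical range is not contiguous in the host virtual address
 * space. The copy must be released with free_ind_table().
 */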
192 static inline struct vring_desc *__attribute__((always_inline))
193 alloc_copy_ind_table(struct virtio_net *dev, struct vring_desc *desc)
194 {
195         struct vring_desc *idesc;
196         uint64_t src, dst;
197         uint64_t len, remain = desc->len;
198         uint64_t desc_addr = desc->addr;
199
200         idesc = rte_malloc(__func__, desc->len, 0);
201         if (unlikely(!idesc))
202                 return 0;
203
204         dst = (uint64_t)(uintptr_t)idesc;
205
206         while (remain) {
207                 len = remain;
208                 src = gpa_to_vva(dev, desc_addr, &len);
209                 if (unlikely(!src || !len)) {
210                         rte_free(idesc);
211                         return 0;
212                 }
213
214                 rte_memcpy((void *)(uintptr_t)dst, (void *)(uintptr_t)src, len);
215
216                 remain -= len;
217                 dst += len;
218                 desc_addr += len;
219         }
220
221         return idesc;
222 }
223
224 static inline void __attribute__((always_inline))
225 free_ind_table(struct vring_desc *idesc)
226 {
227         rte_free(idesc);
228 }
229
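/*
 * Used ring updates are first staged in vq->shadow_used_ring and then
 * flushed to the guest-visible used ring in at most two batched copies.
 */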
230 static inline void __attribute__((always_inline))
231 do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
232                           uint16_t to, uint16_t from, uint16_t size)
233 {
234         rte_memcpy(&vq->used->ring[to],
235                         &vq->shadow_used_ring[from],
236                         size * sizeof(struct vring_used_elem));
237         vhost_log_cache_used_vring(dev, vq,
238                         offsetof(struct vring_used, ring[to]),
239                         size * sizeof(struct vring_used_elem));
240 }
241
242 static inline void __attribute__((always_inline))
243 flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
244 {
245         uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
246
247         if (used_idx + vq->shadow_used_idx <= vq->size) {
248                 do_flush_shadow_used_ring(dev, vq, used_idx, 0,
249                                           vq->shadow_used_idx);
250         } else {
251                 uint16_t size;
252
253                 /* update used ring interval [used_idx, vq->size] */
254                 size = vq->size - used_idx;
255                 do_flush_shadow_used_ring(dev, vq, used_idx, 0, size);
256
257                 /* then update the remaining used ring interval [0, left_size] */
258                 do_flush_shadow_used_ring(dev, vq, 0, size,
259                                           vq->shadow_used_idx - size);
260         }
261         vq->last_used_idx += vq->shadow_used_idx;
262
263         rte_smp_wmb();
264
265         vhost_log_cache_sync(dev, vq);
266
267         *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
268         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
269                 sizeof(vq->used->idx));
270 }
271
272 static inline void __attribute__((always_inline))
273 update_shadow_used_ring(struct vhost_virtqueue *vq,
274                          uint16_t desc_idx, uint16_t len)
275 {
276         uint16_t i = vq->shadow_used_idx++;
277
278         vq->shadow_used_ring[i].id  = desc_idx;
279         vq->shadow_used_ring[i].len = len;
280 }
281
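/*
 * Translate the offload requests carried by the mbuf (L4 checksum,
 * TSO) into virtio net header fields. The IPv4 header checksum, which
 * has no virtio-net equivalent, is computed in place.
 */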
282 static void
283 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
284 {
285         uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
286
287         if (m_buf->ol_flags & PKT_TX_TCP_SEG)
288                 csum_l4 |= PKT_TX_TCP_CKSUM;
289
290         if (csum_l4) {
291                 net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
292                 net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
293
294                 switch (csum_l4) {
295                 case PKT_TX_TCP_CKSUM:
296                         net_hdr->csum_offset = (offsetof(struct tcp_hdr,
297                                                 cksum));
298                         break;
299                 case PKT_TX_UDP_CKSUM:
300                         net_hdr->csum_offset = (offsetof(struct udp_hdr,
301                                                 dgram_cksum));
302                         break;
303                 case PKT_TX_SCTP_CKSUM:
304                         net_hdr->csum_offset = (offsetof(struct sctp_hdr,
305                                                 cksum));
306                         break;
307                 }
308         }
309
310         /* IP cksum offload has no virtio-net equivalent, so calculate it here */
311         if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
312                 struct ipv4_hdr *ipv4_hdr;
313
314                 ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct ipv4_hdr *,
315                                                    m_buf->l2_len);
316                 ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
317         }
318
319         if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
320                 if (m_buf->ol_flags & PKT_TX_IPV4)
321                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
322                 else
323                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
324                 net_hdr->gso_size = m_buf->tso_segsz;
325                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
326                                         + m_buf->l4_len;
327         }
328 }
329
330 static inline void
331 copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
332                     struct virtio_net_hdr_mrg_rxbuf hdr)
333 {
334         if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
335                 *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
336         else
337                 *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
338 }
339
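/*
 * Copy one mbuf chain into the descriptor chain starting at desc_idx,
 * prepending the virtio net header. Returns 0 on success, -1 on failure.
 */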
340 static inline int __attribute__((always_inline))
341 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
342                 struct vring_desc *descs, struct rte_mbuf *m,
343                 uint16_t desc_idx, uint32_t size)
344 {
345         uint32_t desc_avail, desc_offset;
346         uint32_t mbuf_avail, mbuf_offset;
347         uint32_t cpy_len;
348         uint64_t desc_chunck_len;
349         struct vring_desc *desc;
350         uint64_t desc_addr, desc_gaddr;
351         struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
352         /* A counter to guard against a dead loop in the desc chain */
353         uint16_t nr_desc = 1;
354
355         desc = &descs[desc_idx];
356         desc_chunck_len = desc->len;
357         desc_gaddr = desc->addr;
358         desc_addr = gpa_to_vva(dev, desc_gaddr, &desc_chunck_len);
359         /*
360          * The check of 'desc_addr' is placed outside of the 'unlikely' macro to
361          * avoid a performance issue with some versions of gcc (4.8.4 and 5.3.0),
362          * which otherwise store the offset on the stack instead of in a register.
363          */
364         if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
365                 return -1;
366
367         rte_prefetch0((void *)(uintptr_t)desc_addr);
368
369         virtio_enqueue_offload(m, &virtio_hdr.hdr);
370         if (likely(desc_chunck_len >= dev->vhost_hlen)) {
371                 copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
372
373                 virtio_enqueue_offload(m,
374                                 (struct virtio_net_hdr *)(uintptr_t)desc_addr);
375                 PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
376         } else {
377                 uint64_t remain = dev->vhost_hlen;
378                 uint64_t len;
379                 uint64_t src = (uint64_t)(uintptr_t)&virtio_hdr, dst;
380                 uint64_t guest_addr = desc_gaddr;
381
382                 while (remain) {
383                         len = remain;
384                         dst = gpa_to_vva(dev, guest_addr, &len);
385                         if (unlikely(!dst || !len))
386                                 return -1;
387
388                         rte_memcpy((void *)(uintptr_t)dst,
389                                         (void *)(uintptr_t)src, len);
390
391                         PRINT_PACKET(dev, (uintptr_t)dst, (uint32_t)len, 0);
392                         remain -= len;
393                         guest_addr += len;
394                         dst += len;
395                 }
396         }
397
398         vhost_log_cache_write(dev, vq, desc_gaddr, dev->vhost_hlen);
399
400         desc_avail  = desc->len - dev->vhost_hlen;
401         if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
402                 desc_chunck_len = desc_avail;
403                 desc_gaddr += dev->vhost_hlen;
404                 desc_addr = gpa_to_vva(dev,
405                                 desc_gaddr,
406                                 &desc_chunck_len);
407                 if (unlikely(!desc_addr))
408                         return -1;
409
410                 desc_offset = 0;
411         } else {
412                 desc_offset = dev->vhost_hlen;
413                 desc_chunck_len -= dev->vhost_hlen;
414         }
415
416         mbuf_avail  = rte_pktmbuf_data_len(m);
417         mbuf_offset = 0;
418         while (mbuf_avail != 0 || m->next != NULL) {
419                 /* done with current mbuf, fetch next */
420                 if (mbuf_avail == 0) {
421                         m = m->next;
422
423                         mbuf_offset = 0;
424                         mbuf_avail  = rte_pktmbuf_data_len(m);
425                 }
426
427                 /* done with current desc buf, fetch next */
428                 if (desc_avail == 0) {
429                         if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
430                                 /* Not enough room in the vring buffer */
431                                 return -1;
432                         }
433                         if (unlikely(desc->next >= size || ++nr_desc > size))
434                                 return -1;
435
436                         desc = &descs[desc->next];
437                         desc_chunck_len = desc->len;
438                         desc_gaddr = desc->addr;
439                         desc_addr = gpa_to_vva(dev,
440                                         desc_gaddr, &desc_chunck_len);
441                         if (unlikely(!desc_addr))
442                                 return -1;
443
444                         desc_offset = 0;
445                         desc_avail  = desc->len;
446                 } else if (unlikely(desc_chunck_len == 0)) {
447                         desc_chunck_len = desc_avail;
448                         desc_gaddr += desc_offset;
449                         desc_addr = gpa_to_vva(dev,
450                                         desc_gaddr, &desc_chunck_len);
451                         if (unlikely(!desc_addr))
452                                 return -1;
453
454                         desc_offset = 0;
455                 }
456
457                 cpy_len = RTE_MIN(desc_avail, mbuf_avail);
458                 rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
459                         rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
460                         cpy_len);
461                 vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset,
462                                 cpy_len);
463                 PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
464                              cpy_len, 0);
465
466                 mbuf_avail  -= cpy_len;
467                 mbuf_offset += cpy_len;
468                 desc_avail  -= cpy_len;
469                 desc_offset += cpy_len;
470                 desc_chunck_len -= cpy_len;
471         }
472
473         return 0;
474 }
475
476 /**
477  * This function adds buffers to the virtio device's RX virtqueue. Buffers can
478  * be received from the physical port or from another virtio device. A packet
479  * count is returned to indicate the number of packets that were successfully
480  * added to the RX queue. This function works with scattered (multi-segment)
481  * mbufs, but it doesn't support the mergeable feature.
482  */
483 static inline uint32_t __attribute__((always_inline))
484 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
485               struct rte_mbuf **pkts, uint32_t count)
486 {
487         struct vhost_virtqueue *vq;
488         uint16_t avail_idx, free_entries, start_idx;
489         uint16_t desc_indexes[MAX_PKT_BURST];
490         struct vring_desc *descs;
491         uint16_t used_idx;
492         uint32_t i, sz;
493         uint64_t dlen;
494
495         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
496         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
497                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
498                         dev->vid, __func__, queue_id);
499                 return 0;
500         }
501
502         vq = dev->virtqueue[queue_id];
503
504         rte_spinlock_lock(&vq->access_lock);
505
506         if (unlikely(vq->enabled == 0))
507                 goto out_access_unlock;
508
509         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
510         start_idx = vq->last_used_idx;
511         free_entries = avail_idx - start_idx;
512         count = RTE_MIN(count, free_entries);
513         count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
514         if (count == 0)
515                 goto out_access_unlock;
516
517         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
518                 dev->vid, start_idx, start_idx + count);
519
520         /* Retrieve all of the desc indexes first to avoid caching issues. */
521         rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
522         for (i = 0; i < count; i++) {
523                 used_idx = (start_idx + i) & (vq->size - 1);
524                 desc_indexes[i] = vq->avail->ring[used_idx];
525                 vq->used->ring[used_idx].id = desc_indexes[i];
526                 vq->used->ring[used_idx].len = pkts[i]->pkt_len +
527                                                dev->vhost_hlen;
528                 vhost_log_cache_used_vring(dev, vq,
529                         offsetof(struct vring_used, ring[used_idx]),
530                         sizeof(vq->used->ring[used_idx]));
531         }
532
533         rte_prefetch0(&vq->desc[desc_indexes[0]]);
534         for (i = 0; i < count; i++) {
535                 struct vring_desc *idesc = NULL;
536                 uint16_t desc_idx = desc_indexes[i];
537                 int err;
538
539                 if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) {
540                         dlen = vq->desc[desc_idx].len;
541                         descs = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev,
542                                         vq->desc[desc_idx].addr, &dlen);
543                         if (unlikely(!descs)) {
544                                 count = i;
545                                 break;
546                         }
547
548                         if (unlikely(dlen < vq->desc[desc_idx].len)) {
549                                 /*
550                                  * The indirect desc table is not contiguous
551                                  * in process VA space, we have to copy it.
552                                  * in process VA space, so we have to copy it.
553                                 idesc = alloc_copy_ind_table(dev,
554                                                         &vq->desc[desc_idx]);
555                                 if (unlikely(!idesc))
556                                         break;
557
558                                 descs = idesc;
559                         }
560
561                         sz = vq->desc[desc_idx].len / sizeof(*descs);
562                         desc_idx = 0;
563                 } else {
564                         descs = vq->desc;
565                         sz = vq->size;
566                 }
567
568                 err = copy_mbuf_to_desc(dev, vq, descs, pkts[i], desc_idx, sz);
569                 if (unlikely(err)) {
570                         used_idx = (start_idx + i) & (vq->size - 1);
571                         vq->used->ring[used_idx].len = dev->vhost_hlen;
572                         vhost_log_cache_used_vring(dev, vq,
573                                 offsetof(struct vring_used, ring[used_idx]),
574                                 sizeof(vq->used->ring[used_idx]));
575                 }
576
577                 if (i + 1 < count)
578                         rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
579
580                 if (unlikely(!!idesc))
581                         free_ind_table(idesc);
582         }
583
584         rte_smp_wmb();
585
586         vhost_log_cache_sync(dev, vq);
587
588         *(volatile uint16_t *)&vq->used->idx += count;
589         vq->last_used_idx += count;
590         vhost_log_used_vring(dev, vq,
591                 offsetof(struct vring_used, idx),
592                 sizeof(vq->used->idx));
593
594         /* flush used->idx update before we read avail->flags. */
595         rte_mb();
596
597         /* Kick the guest if necessary. */
598         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
599                         && (vq->callfd >= 0))
600                 eventfd_write(vq->callfd, (eventfd_t)1);
601
602 out_access_unlock:
603         rte_spinlock_unlock(&vq->access_lock);
604
605         return count;
606 }
607
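/*
 * Walk one descriptor chain (dereferencing an indirect table when
 * present) and append its buffers to buf_vec, reporting the chain head
 * index and total length. Returns 0 on success, -1 on a malformed chain.
 */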
608 static inline int __attribute__((always_inline))
609 fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
610                          uint32_t avail_idx, uint32_t *vec_idx,
611                          struct buf_vector *buf_vec, uint16_t *desc_chain_head,
612                          uint16_t *desc_chain_len)
613 {
614         uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
615         uint32_t vec_id = *vec_idx;
616         uint32_t len    = 0;
617         uint64_t dlen;
618         struct vring_desc *descs = vq->desc;
619         struct vring_desc *idesc = NULL;
620
621         *desc_chain_head = idx;
622
623         if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
624                 dlen = vq->desc[idx].len;
625                 descs = (struct vring_desc *)(uintptr_t)
626                                         gpa_to_vva(dev, vq->desc[idx].addr,
627                                                            &dlen);
628                 if (unlikely(!descs))
629                         return -1;
630
631                 if (unlikely(dlen < vq->desc[idx].len)) {
632                         /*
633                          * The indirect desc table is not contiguous
634                          * in process VA space, so we have to copy it.
635                          */
636                         idesc = alloc_copy_ind_table(dev, &vq->desc[idx]);
637                         if (unlikely(!idesc))
638                                 return -1;
639
640                         descs = idesc;
641                 }
642
643                 idx = 0;
644         }
645
646         while (1) {
647                 if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size)) {
648                         free_ind_table(idesc);
649                         return -1;
650                 }
651
652                 len += descs[idx].len;
653                 buf_vec[vec_id].buf_addr = descs[idx].addr;
654                 buf_vec[vec_id].buf_len  = descs[idx].len;
655                 buf_vec[vec_id].desc_idx = idx;
656                 vec_id++;
657
658                 if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
659                         break;
660
661                 idx = descs[idx].next;
662         }
663
664         *desc_chain_len = len;
665         *vec_idx = vec_id;
666
667         if (unlikely(!!idesc))
668                 free_ind_table(idesc);
669
670         return 0;
671 }
672
673 /*
674  * Returns -1 on failure, 0 on success.
675  */
676 static inline int
677 reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
678                                 uint32_t size, struct buf_vector *buf_vec,
679                                 uint16_t *num_buffers, uint16_t avail_head)
680 {
681         uint16_t cur_idx;
682         uint32_t vec_idx = 0;
683         uint16_t tries = 0;
684
685         uint16_t head_idx = 0;
686         uint16_t len = 0;
687
688         *num_buffers = 0;
689         cur_idx  = vq->last_avail_idx;
690
691         while (size > 0) {
692                 if (unlikely(cur_idx == avail_head))
693                         return -1;
694
695                 if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec,
696                                                 &head_idx, &len) < 0))
697                         return -1;
698                 len = RTE_MIN(len, size);
699                 update_shadow_used_ring(vq, head_idx, len);
700                 size -= len;
701
702                 cur_idx++;
703                 tries++;
704                 *num_buffers += 1;
705
706                 /*
707                  * If we have tried all available ring items and still
708                  * can't get enough buffers, it means something abnormal
709                  * has happened.
710                  */
711                 if (unlikely(tries >= vq->size))
712                         return -1;
713         }
714
715         return 0;
716 }
717
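/*
 * Copy one mbuf chain into the guest buffers collected in buf_vec by
 * reserve_avail_buf_mergeable(), writing a mergeable Rx header (with
 * num_buffers set) into the first buffer.
 */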
718 static inline int __attribute__((always_inline))
719 copy_mbuf_to_desc_mergeable(struct virtio_net *dev,
720                         struct vhost_virtqueue *vq, struct rte_mbuf *m,
721                         struct buf_vector *buf_vec, uint16_t num_buffers)
722 {
723         struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
724         struct virtio_net_hdr_mrg_rxbuf *hdr;
725         uint32_t vec_idx = 0;
726         uint64_t desc_addr, desc_gaddr;
727         uint64_t desc_chunck_len;
728         uint32_t mbuf_offset, mbuf_avail;
729         uint32_t desc_offset, desc_avail;
730         uint32_t cpy_len;
731         uint64_t hdr_addr, hdr_phys_addr;
732         struct rte_mbuf *hdr_mbuf;
733
734         if (unlikely(m == NULL))
735                 return -1;
736
737         desc_chunck_len = buf_vec[vec_idx].buf_len;
738         desc_gaddr = buf_vec[vec_idx].buf_addr;
739         desc_addr = gpa_to_vva(dev, desc_gaddr, &desc_chunck_len);
740         if (buf_vec[vec_idx].buf_len < dev->vhost_hlen ||
741                         !desc_addr)
742                 return -1;
743
744         hdr_mbuf = m;
745         hdr_addr = desc_addr;
746         if (unlikely(desc_chunck_len < dev->vhost_hlen))
747                 hdr = &virtio_hdr;
748         else
749                 hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
750         hdr_phys_addr = buf_vec[vec_idx].buf_addr;
751         rte_prefetch0((void *)(uintptr_t)hdr_addr);
752
753         virtio_hdr.num_buffers = num_buffers;
754         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
755                 dev->vid, num_buffers);
756
757         desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
758         if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
759                 desc_chunck_len = desc_avail;
760                 desc_gaddr += dev->vhost_hlen;
761                 desc_addr = gpa_to_vva(dev,
762                                 desc_gaddr,
763                                 &desc_chunck_len);
764                 if (unlikely(!desc_addr))
765                         return -1;
766
767                 desc_offset = 0;
768         } else {
769                 desc_offset = dev->vhost_hlen;
770                 desc_chunck_len -= dev->vhost_hlen;
771         }
772
773
774         mbuf_avail  = rte_pktmbuf_data_len(m);
775         mbuf_offset = 0;
776         while (mbuf_avail != 0 || m->next != NULL) {
777                 /* done with current desc buf, get the next one */
778                 if (desc_avail == 0) {
779                         vec_idx++;
780                         desc_gaddr = buf_vec[vec_idx].buf_addr;
781                         desc_chunck_len = buf_vec[vec_idx].buf_len;
782                         desc_addr = gpa_to_vva(dev, desc_gaddr,
783                                         &desc_chunck_len);
784                         if (unlikely(!desc_addr))
785                                 return -1;
786
787                         /* Prefetch buffer address. */
788                         rte_prefetch0((void *)(uintptr_t)desc_addr);
789                         desc_offset = 0;
790                         desc_avail  = buf_vec[vec_idx].buf_len;
791                 } else if (unlikely(desc_chunck_len == 0)) {
792                         desc_chunck_len = desc_avail;
793                         desc_gaddr += desc_offset;
794                         desc_addr = gpa_to_vva(dev,
795                                         desc_gaddr,
796                                         &desc_chunck_len);
797                         if (unlikely(!desc_addr))
798                                 return -1;
799
800                         desc_offset = 0;
801                 }
802
803                 /* done with current mbuf, get the next one */
804                 if (mbuf_avail == 0) {
805                         m = m->next;
806
807                         mbuf_offset = 0;
808                         mbuf_avail  = rte_pktmbuf_data_len(m);
809                 }
810
811                 if (hdr_addr) {
812                         virtio_enqueue_offload(hdr_mbuf, &virtio_hdr.hdr);
813                         if (likely(hdr != &virtio_hdr)) {
814                                 copy_virtio_net_hdr(dev, hdr_addr, virtio_hdr);
815                         } else {
816                                 uint64_t len;
817                                 uint64_t remain = dev->vhost_hlen;
818                                 uint64_t src = (uint64_t)(uintptr_t)&virtio_hdr;
819                                 uint64_t dst;
820                                 uint64_t guest_addr = hdr_phys_addr;
821
822                                 while (remain) {
823                                         len = remain;
824                                         dst = gpa_to_vva(dev, guest_addr, &len);
825                                         if (unlikely(!dst || !len))
826                                                 return -1;
827
828                                         rte_memcpy((void *)(uintptr_t)dst,
829                                                         (void *)(uintptr_t)src,
830                                                         len);
831
832                                         PRINT_PACKET(dev, (uintptr_t)dst,
833                                                         (uint32_t)len, 0);
834
835                                         remain -= len;
836                                         guest_addr += len;
837                                         dst += len;
838                                 }
839                         }
840                         vhost_log_cache_write(dev, vq, hdr_phys_addr,
841                                         dev->vhost_hlen);
842                         PRINT_PACKET(dev, (uintptr_t)hdr_addr,
843                                      dev->vhost_hlen, 0);
844
845                         hdr_addr = 0;
846                 }
847
848                 cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
849                 rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
850                         rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
851                         cpy_len);
852                 vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset,
853                                 cpy_len);
854                 PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
855                         cpy_len, 0);
856
857                 mbuf_avail  -= cpy_len;
858                 mbuf_offset += cpy_len;
859                 desc_avail  -= cpy_len;
860                 desc_offset += cpy_len;
861                 desc_chunck_len -= cpy_len;
862         }
863
864         return 0;
865 }
866
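/*
 * Enqueue path used when VIRTIO_NET_F_MRG_RXBUF has been negotiated:
 * a single packet may be scattered across several descriptor chains.
 */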
867 static inline uint32_t __attribute__((always_inline))
868 virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
869         struct rte_mbuf **pkts, uint32_t count)
870 {
871         struct vhost_virtqueue *vq;
872         uint32_t pkt_idx = 0;
873         uint16_t num_buffers;
874         struct buf_vector buf_vec[BUF_VECTOR_MAX];
875         uint16_t avail_head;
876
877         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
878         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
879                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
880                         dev->vid, __func__, queue_id);
881                 return 0;
882         }
883
884         vq = dev->virtqueue[queue_id];
885
886         rte_spinlock_lock(&vq->access_lock);
887
888         if (unlikely(vq->enabled == 0))
889                 goto out_access_unlock;
890
891         count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
892         if (count == 0)
893                 goto out_access_unlock;
894
895         rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
896
897         vq->shadow_used_idx = 0;
898         avail_head = *((volatile uint16_t *)&vq->avail->idx);
899         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
900                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
901
902                 if (unlikely(reserve_avail_buf_mergeable(dev, vq,
903                                                 pkt_len, buf_vec, &num_buffers,
904                                                 avail_head) < 0)) {
905                         VHOST_LOG_DEBUG(VHOST_DATA,
906                                 "(%d) failed to get enough desc from vring\n",
907                                 dev->vid);
908                         vq->shadow_used_idx -= num_buffers;
909                         break;
910                 }
911
912                 VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
913                         dev->vid, vq->last_avail_idx,
914                         vq->last_avail_idx + num_buffers);
915
916                 if (copy_mbuf_to_desc_mergeable(dev, vq, pkts[pkt_idx],
917                                                 buf_vec, num_buffers) < 0) {
918                         vq->shadow_used_idx -= num_buffers;
919                         break;
920                 }
921
922                 vq->last_avail_idx += num_buffers;
923         }
924
925         if (likely(vq->shadow_used_idx)) {
926                 flush_shadow_used_ring(dev, vq);
927
928                 /* flush used->idx update before we read avail->flags. */
929                 rte_mb();
930
931                 /* Kick the guest if necessary. */
932                 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
933                                 && (vq->callfd >= 0))
934                         eventfd_write(vq->callfd, (eventfd_t)1);
935         }
936
937 out_access_unlock:
938         rte_spinlock_unlock(&vq->access_lock);
939
940         return pkt_idx;
941 }
942
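/*
 * Public enqueue API: dispatch to the mergeable or non-mergeable Rx
 * path depending on the negotiated features.
 */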
943 uint16_t
944 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
945         struct rte_mbuf **pkts, uint16_t count)
946 {
947         struct virtio_net *dev = get_device(vid);
948
949         if (!dev)
950                 return 0;
951
952         if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
953                 return virtio_dev_merge_rx(dev, queue_id, pkts, count);
954         else
955                 return virtio_dev_rx(dev, queue_id, pkts, count);
956 }
957
958 static inline bool
959 virtio_net_with_host_offload(struct virtio_net *dev)
960 {
961         if (dev->features &
962                         ((1ULL << VIRTIO_NET_F_CSUM) |
963                          (1ULL << VIRTIO_NET_F_HOST_ECN) |
964                          (1ULL << VIRTIO_NET_F_HOST_TSO4) |
965                          (1ULL << VIRTIO_NET_F_HOST_TSO6) |
966                          (1ULL << VIRTIO_NET_F_HOST_UFO)))
967                 return true;
968
969         return false;
970 }
971
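/*
 * Parse the Ethernet (and optional VLAN) and IP headers to locate the
 * L4 header, filling m->l2_len, m->l3_len and the PKT_TX_IPV4/IPV6 flags.
 */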
972 static void
973 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
974 {
975         struct ipv4_hdr *ipv4_hdr;
976         struct ipv6_hdr *ipv6_hdr;
977         void *l3_hdr = NULL;
978         struct ether_hdr *eth_hdr;
979         uint16_t ethertype;
980
981         eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
982
983         m->l2_len = sizeof(struct ether_hdr);
984         ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
985
986         if (ethertype == ETHER_TYPE_VLAN) {
987                 struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
988
989                 m->l2_len += sizeof(struct vlan_hdr);
990                 ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
991         }
992
993         l3_hdr = (char *)eth_hdr + m->l2_len;
994
995         switch (ethertype) {
996         case ETHER_TYPE_IPv4:
997                 ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
998                 *l4_proto = ipv4_hdr->next_proto_id;
999                 m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
1000                 *l4_hdr = (char *)l3_hdr + m->l3_len;
1001                 m->ol_flags |= PKT_TX_IPV4;
1002                 break;
1003         case ETHER_TYPE_IPv6:
1004                 ipv6_hdr = (struct ipv6_hdr *)l3_hdr;
1005                 *l4_proto = ipv6_hdr->proto;
1006                 m->l3_len = sizeof(struct ipv6_hdr);
1007                 *l4_hdr = (char *)l3_hdr + m->l3_len;
1008                 m->ol_flags |= PKT_TX_IPV6;
1009                 break;
1010         default:
1011                 m->l3_len = 0;
1012                 *l4_proto = 0;
1013                 *l4_hdr = NULL;
1014                 break;
1015         }
1016 }
1017
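/*
 * Convert the offload information found in the virtio net header of a
 * guest-transmitted packet into mbuf offload flags and TSO fields.
 */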
1018 static inline void __attribute__((always_inline))
1019 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
1020 {
1021         uint16_t l4_proto = 0;
1022         void *l4_hdr = NULL;
1023         struct tcp_hdr *tcp_hdr = NULL;
1024
1025         if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
1026                 return;
1027
1028         parse_ethernet(m, &l4_proto, &l4_hdr);
1029         if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1030                 if (hdr->csum_start == (m->l2_len + m->l3_len)) {
1031                         switch (hdr->csum_offset) {
1032                         case (offsetof(struct tcp_hdr, cksum)):
1033                                 if (l4_proto == IPPROTO_TCP)
1034                                         m->ol_flags |= PKT_TX_TCP_CKSUM;
1035                                 break;
1036                         case (offsetof(struct udp_hdr, dgram_cksum)):
1037                                 if (l4_proto == IPPROTO_UDP)
1038                                         m->ol_flags |= PKT_TX_UDP_CKSUM;
1039                                 break;
1040                         case (offsetof(struct sctp_hdr, cksum)):
1041                                 if (l4_proto == IPPROTO_SCTP)
1042                                         m->ol_flags |= PKT_TX_SCTP_CKSUM;
1043                                 break;
1044                         default:
1045                                 break;
1046                         }
1047                 }
1048         }
1049
1050         if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1051                 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1052                 case VIRTIO_NET_HDR_GSO_TCPV4:
1053                 case VIRTIO_NET_HDR_GSO_TCPV6:
1054                         tcp_hdr = (struct tcp_hdr *)l4_hdr;
1055                         m->ol_flags |= PKT_TX_TCP_SEG;
1056                         m->tso_segsz = hdr->gso_size;
1057                         m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
1058                         break;
1059                 default:
1060                         RTE_LOG(WARNING, VHOST_DATA,
1061                                 "unsupported gso type %u.\n", hdr->gso_type);
1062                         break;
1063                 }
1064         }
1065 }
1066
1067 #define RARP_PKT_SIZE   64
1068
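/*
 * Build a broadcast RARP request advertising the given MAC address,
 * as injected on behalf of the guest (e.g. after live migration).
 */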
1069 static int
1070 make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
1071 {
1072         struct ether_hdr *eth_hdr;
1073         struct arp_hdr  *rarp;
1074
1075         if (rarp_mbuf->buf_len < RARP_PKT_SIZE) {
1076                 RTE_LOG(WARNING, VHOST_DATA,
1077                         "failed to make RARP; mbuf size too small %u (< %d)\n",
1078                         rarp_mbuf->buf_len, RARP_PKT_SIZE);
1079                 return -1;
1080         }
1081
1082         /* Ethernet header. */
1083         eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
1084         memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
1085         ether_addr_copy(mac, &eth_hdr->s_addr);
1086         eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
1087
1088         /* RARP header. */
1089         rarp = (struct arp_hdr *)(eth_hdr + 1);
1090         rarp->arp_hrd = htons(ARP_HRD_ETHER);
1091         rarp->arp_pro = htons(ETHER_TYPE_IPv4);
1092         rarp->arp_hln = ETHER_ADDR_LEN;
1093         rarp->arp_pln = 4;
1094         rarp->arp_op  = htons(ARP_OP_REVREQUEST);
1095
1096         ether_addr_copy(mac, &rarp->arp_data.arp_sha);
1097         ether_addr_copy(mac, &rarp->arp_data.arp_tha);
1098         memset(&rarp->arp_data.arp_sip, 0x00, 4);
1099         memset(&rarp->arp_data.arp_tip, 0x00, 4);
1100
1101         rarp_mbuf->pkt_len  = rarp_mbuf->data_len = RARP_PKT_SIZE;
1102
1103         return 0;
1104 }
1105
1106 static inline void __attribute__((always_inline))
1107 put_zmbuf(struct zcopy_mbuf *zmbuf)
1108 {
1109         zmbuf->in_use = 0;
1110 }
1111
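/*
 * Copy one guest descriptor chain into an mbuf chain, allocating extra
 * segments from mbuf_pool as needed. With dequeue zero-copy, segments
 * reference guest memory directly when it is physically contiguous.
 * Offload info from the virtio net header is applied to the head mbuf.
 */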
1112 static inline int __attribute__((always_inline))
1113 copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
1114                   uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
1115                   struct rte_mempool *mbuf_pool)
1116 {
1117         struct vring_desc *desc;
1118         uint64_t desc_addr, desc_gaddr;
1119         uint32_t desc_avail, desc_offset;
1120         uint32_t mbuf_avail, mbuf_offset;
1121         uint32_t cpy_len;
1122         uint64_t desc_chunck_len;
1123         struct rte_mbuf *cur = m, *prev = m;
1124         struct virtio_net_hdr tmp_hdr;
1125         struct virtio_net_hdr *hdr = NULL;
1126         /* A counter to guard against a dead loop in the desc chain */
1127         uint32_t nr_desc = 1;
1128
1129         desc = &descs[desc_idx];
1130         if (unlikely((desc->len < dev->vhost_hlen)) ||
1131                         (desc->flags & VRING_DESC_F_INDIRECT))
1132                 return -1;
1133
1134         desc_chunck_len = desc->len;
1135         desc_gaddr = desc->addr;
1136         desc_addr = gpa_to_vva(dev, desc_gaddr, &desc_chunck_len);
1137         if (unlikely(!desc_addr))
1138                 return -1;
1139
1140         if (virtio_net_with_host_offload(dev)) {
1141                 if (unlikely(desc_chunck_len < sizeof(struct virtio_net_hdr))) {
1142                         uint64_t len = desc_chunck_len;
1143                         uint64_t remain = sizeof(struct virtio_net_hdr);
1144                         uint64_t src = desc_addr;
1145                         uint64_t dst = (uint64_t)(uintptr_t)&tmp_hdr;
1146                         uint64_t guest_addr = desc_gaddr;
1147
1148                         /*
1149                          * No luck, the virtio-net header doesn't fit
1150                          * in a contiguous virtual area.
1151                          */
1152                         while (remain) {
1153                                 len = remain;
1154                                 src = gpa_to_vva(dev, guest_addr, &len);
1155                                 if (unlikely(!src || !len))
1156                                         return -1;
1157
1158                                 rte_memcpy((void *)(uintptr_t)dst,
1159                                                    (void *)(uintptr_t)src, len);
1160
1161                                 guest_addr += len;
1162                                 remain -= len;
1163                                 dst += len;
1164                         }
1165
1166                         hdr = &tmp_hdr;
1167                 } else {
1168                         hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
1169                         rte_prefetch0(hdr);
1170                 }
1171         }
1172
1173         /*
1174          * A virtio driver normally uses at least 2 desc buffers
1175          * for Tx: the first for storing the header, and others
1176          * for storing the data.
1177          */
1178         if (likely((desc->len == dev->vhost_hlen) &&
1179                    (desc->flags & VRING_DESC_F_NEXT) != 0)) {
1180                 desc = &descs[desc->next];
1181                 if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
1182                         return -1;
1183
1184                 desc_chunck_len = desc->len;
1185                 desc_gaddr = desc->addr;
1186                 desc_addr = gpa_to_vva(dev, desc_gaddr, &desc_chunck_len);
1187                 if (unlikely(!desc_addr))
1188                         return -1;
1189
1190                 desc_offset = 0;
1191                 desc_avail  = desc->len;
1192                 nr_desc    += 1;
1193         } else {
1194                 desc_avail  = desc->len - dev->vhost_hlen;
1195
1196                 if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
1197                         desc_chunck_len = desc_avail;
1198                         desc_gaddr += dev->vhost_hlen;
1199                         desc_addr = gpa_to_vva(dev,
1200                                         desc_gaddr,
1201                                         &desc_chunck_len);
1202                         if (unlikely(!desc_addr))
1203                                 return -1;
1204
1205                         desc_offset = 0;
1206                 } else {
1207                         desc_offset = dev->vhost_hlen;
1208                         desc_chunck_len -= dev->vhost_hlen;
1209                 }
1210         }
1211
1212         rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset));
1213
1214         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
1215                         (uint32_t)desc_chunck_len, 0);
1216
1217         mbuf_offset = 0;
1218         mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
1219         while (1) {
1220                 uint64_t hpa;
1221
1222                 cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
1223
1224                 /*
1225                  * A desc buf might span two host physical pages that are
1226                  * not contiguous. In that case (gpa_to_hpa returns 0), data
1227                  * will be copied even though zero copy is enabled.
1228                  */
1229                 if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
1230                                         desc_gaddr + desc_offset, cpy_len)))) {
1231                         cur->data_len = cpy_len;
1232                         cur->data_off = 0;
1233                         cur->buf_addr = (void *)(uintptr_t)(desc_gaddr
1234                                         + desc_offset);
1235                         cur->buf_physaddr = hpa;
1236
1237                         /*
1238                          * In zero copy mode, one mbuf can only reference data
1239                          * for one desc buf, or for part of one.
1240                          */
1241                         mbuf_avail = cpy_len;
1242                 } else {
1243                         rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
1244                                                            mbuf_offset),
1245                                 (void *)((uintptr_t)(desc_addr + desc_offset)),
1246                                 cpy_len);
1247                 }
1248
1249                 mbuf_avail  -= cpy_len;
1250                 mbuf_offset += cpy_len;
1251                 desc_avail  -= cpy_len;
1252                 desc_chunck_len -= cpy_len;
1253                 desc_offset += cpy_len;
1254
1255                 /* This desc buf has been fully consumed, get the next one */
1256                 if (desc_avail == 0) {
1257                         if ((desc->flags & VRING_DESC_F_NEXT) == 0)
1258                                 break;
1259
1260                         if (unlikely(desc->next >= max_desc ||
1261                                      ++nr_desc > max_desc))
1262                                 return -1;
1263                         desc = &descs[desc->next];
1264                         if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
1265                                 return -1;
1266
1267                         desc_chunck_len = desc->len;
1268                         desc_gaddr = desc->addr;
1269                         desc_addr = gpa_to_vva(dev, desc_gaddr,
1270                                         &desc_chunck_len);
1271                         if (unlikely(!desc_addr))
1272                                 return -1;
1273
1274                         rte_prefetch0((void *)(uintptr_t)desc_addr);
1275
1276                         desc_offset = 0;
1277                         desc_avail  = desc->len;
1278
1279                         PRINT_PACKET(dev, (uintptr_t)desc_addr,
1280                                         (uint32_t)desc_chunck_len, 0);
1281                 } else if (unlikely(desc_chunck_len == 0)) {
1282                         desc_chunck_len = desc_avail;
1283                         desc_gaddr += desc_offset;
1284                         desc_addr = gpa_to_vva(dev,
1285                                         desc_gaddr,
1286                                         &desc_chunck_len);
1287                         if (unlikely(!desc_addr))
1288                                 return -1;
1289
1290                         desc_offset = 0;
1291
1292                         PRINT_PACKET(dev, (uintptr_t)desc_addr,
1293                                         (uint32_t)desc_chunck_len, 0);
1294                 }
1295
1296                 /*
1297                  * This mbuf is full, allocate a new one
1298                  * to hold more data.
1299                  */
1300                 if (mbuf_avail == 0) {
1301                         cur = rte_pktmbuf_alloc(mbuf_pool);
1302                         if (unlikely(cur == NULL)) {
1303                                 RTE_LOG(ERR, VHOST_DATA, "Failed to "
1304                                         "allocate memory for mbuf.\n");
1305                                 return -1;
1306                         }
1307                         if (unlikely(dev->dequeue_zero_copy))
1308                                 rte_mbuf_refcnt_update(cur, 1);
1309
1310                         prev->next = cur;
1311                         prev->data_len = mbuf_offset;
1312                         m->nb_segs += 1;
1313                         m->pkt_len += mbuf_offset;
1314                         prev = cur;
1315
1316                         mbuf_offset = 0;
1317                         mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
1318                 }
1319         }
1320
1321         prev->data_len = mbuf_offset;
1322         m->pkt_len    += mbuf_offset;
1323
1324         if (hdr)
1325                 vhost_dequeue_offload(hdr, m);
1326
1327         return 0;
1328 }
1329
1330 static inline void __attribute__((always_inline))
1331 update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
1332                  uint32_t used_idx, uint32_t desc_idx)
1333 {
1334         vq->used->ring[used_idx].id  = desc_idx;
1335         vq->used->ring[used_idx].len = 0;
1336         vhost_log_cache_used_vring(dev, vq,
1337                         offsetof(struct vring_used, ring[used_idx]),
1338                         sizeof(vq->used->ring[used_idx]));
1339 }
1340
1341 static inline void __attribute__((always_inline))
1342 update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
1343                 uint32_t count)
1344 {
1345         if (unlikely(count == 0))
1346                 return;
1347
1348         rte_smp_wmb();
1349         rte_smp_rmb();
1350
1351         vhost_log_cache_sync(dev, vq);
1352
1353         vq->used->idx += count;
1354         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
1355                         sizeof(vq->used->idx));
1356
1357         /* Kick guest if required. */
1358         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
1359                         && (vq->callfd >= 0))
1360                 eventfd_write(vq->callfd, (eventfd_t)1);
1361 }
1362
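/* Find a free zero-copy mbuf slot, scanning circularly from last_zmbuf_idx. */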
1363 static inline struct zcopy_mbuf *__attribute__((always_inline))
1364 get_zmbuf(struct vhost_virtqueue *vq)
1365 {
1366         uint16_t i;
1367         uint16_t last;
1368         int tries = 0;
1369
1370         /* search [last_zmbuf_idx, zmbuf_size) */
1371         i = vq->last_zmbuf_idx;
1372         last = vq->zmbuf_size;
1373
1374 again:
1375         for (; i < last; i++) {
1376                 if (vq->zmbufs[i].in_use == 0) {
1377                         vq->last_zmbuf_idx = i + 1;
1378                         vq->zmbufs[i].in_use = 1;
1379                         return &vq->zmbufs[i];
1380                 }
1381         }
1382
1383         tries++;
1384         if (tries == 1) {
1385                 /* search [0, last_zmbuf_idx) */
1386                 i = 0;
1387                 last = vq->last_zmbuf_idx;
1388                 goto again;
1389         }
1390
1391         return NULL;
1392 }
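
/*
 * Illustrative sketch (not part of the library): the two-pass scan above is
 * equivalent to one circular scan over all zmbuf slots starting at
 * last_zmbuf_idx (bookkeeping of in_use and last_zmbuf_idx omitted):
 *
 *	for (n = 0; n < vq->zmbuf_size; n++) {
 *		i = (vq->last_zmbuf_idx + n) % vq->zmbuf_size;
 *		if (vq->zmbufs[i].in_use == 0)
 *			return &vq->zmbufs[i];
 *	}
 *
 * The split formulation presumably avoids the modulo on the hot path.
 */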
1393
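/*
 * In dequeue zero-copy mode every mbuf segment handed to the application
 * carries one extra reference (see the rte_mbuf_refcnt_update() calls in
 * copy_desc_to_mbuf() and rte_vhost_dequeue_burst()), so a refcount of 1 on
 * every segment means the application has freed its copy and the descriptors
 * can be returned to the guest.
 */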
1394 static inline bool __attribute__((always_inline))
1395 mbuf_is_consumed(struct rte_mbuf *m)
1396 {
1397         while (m) {
1398                 if (rte_mbuf_refcnt_read(m) > 1)
1399                         return false;
1400                 m = m->next;
1401         }
1402
1403         return true;
1404 }
1405
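/*
 * The zero-copy dequeue path redirects buf_addr/buf_physaddr at the guest's
 * buffers; restore them to their default location (right after the mbuf
 * header and private data) before the mbuf is returned to its mempool.
 */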
1406 static inline void __attribute__((always_inline))
1407 restore_mbuf(struct rte_mbuf *m)
1408 {
1409         uint32_t mbuf_size, priv_size;
1410
1411         while (m) {
1412                 priv_size = rte_pktmbuf_priv_size(m->pool);
1413                 mbuf_size = sizeof(struct rte_mbuf) + priv_size;
1414                 /* start of buffer is after mbuf structure and priv data */
1415
1416                 m->buf_addr = (char *)m + mbuf_size;
1417                 m->buf_physaddr = rte_mempool_virt2phy(NULL, m) + mbuf_size;
1418                 m = m->next;
1419         }
1420 }
1421
1422 uint16_t
1423 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
1424         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
1425 {
1426         struct virtio_net *dev;
1427         struct rte_mbuf *rarp_mbuf = NULL;
1428         struct vhost_virtqueue *vq;
1429         uint32_t desc_indexes[MAX_PKT_BURST];
1430         uint32_t used_idx;
1431         uint32_t i = 0;
1432         uint16_t free_entries;
1433         uint16_t avail_idx;
1434
1435         dev = get_device(vid);
1436         if (!dev)
1437                 return 0;
1438
1439         if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
1440                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
1441                         dev->vid, __func__, queue_id);
1442                 return 0;
1443         }
1444
1445         vq = dev->virtqueue[queue_id];
1446
1447         if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
1448                 return 0;
1449
1450         if (unlikely(vq->enabled == 0))
1451                 goto out_access_unlock;
1452
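        /*
         * In zero-copy mode descriptors can only be handed back to the guest
         * once the application has freed the mbufs that still reference the
         * guest buffers, so reclaim any such buffers first.
         */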
1453         if (unlikely(dev->dequeue_zero_copy)) {
1454                 struct zcopy_mbuf *zmbuf, *next;
1455                 int nr_updated = 0;
1456
1457                 for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
1458                      zmbuf != NULL; zmbuf = next) {
1459                         next = TAILQ_NEXT(zmbuf, next);
1460
1461                         if (mbuf_is_consumed(zmbuf->mbuf)) {
1462                                 used_idx = vq->last_used_idx++ & (vq->size - 1);
1463                                 update_used_ring(dev, vq, used_idx,
1464                                                  zmbuf->desc_idx);
1465                                 nr_updated += 1;
1466
1467                                 TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
1468                                 restore_mbuf(zmbuf->mbuf);
1469                                 rte_pktmbuf_free(zmbuf->mbuf);
1470                                 put_zmbuf(zmbuf);
1471                                 vq->nr_zmbuf -= 1;
1472                         }
1473                 }
1474
1475                 update_used_idx(dev, vq, nr_updated);
1476         }
1477
1478         /*
1479          * Construct a RARP broadcast packet and inject it into the "pkts"
1480          * array, so that it looks like the guest actually sent such a packet.
1481          *
1482          * Check user_send_rarp() for more information.
1483          *
1484          * broadcast_rarp shares a cacheline in the virtio_net structure with
1485          * some fields that are accessed during enqueue, and
1486          * rte_atomic16_cmpset() causes a write when implemented with cmpxchg.
1487          * This could result in false sharing between enqueue and dequeue.
1488          *
1489          * Prevent unnecessary false sharing by reading broadcast_rarp first
1490          * and only performing cmpset if the read indicates it is likely to
1491          * be set.
1492          */
1493
1494         if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
1495                         rte_atomic16_cmpset((volatile uint16_t *)
1496                                 &dev->broadcast_rarp.cnt, 1, 0))) {
1497
1498                 rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
1499                 if (rarp_mbuf == NULL) {
1500                         RTE_LOG(ERR, VHOST_DATA,
1501                                 "Failed to allocate memory for mbuf.\n");
1502                         goto out_access_unlock;
1503                 }
1504
1505                 if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
1506                         rte_pktmbuf_free(rarp_mbuf);
1507                         rarp_mbuf = NULL;
1508                 } else {
1509                         count -= 1;
1510                 }
1511         }
1512
1513         free_entries = *((volatile uint16_t *)&vq->avail->idx) -
1514                         vq->last_avail_idx;
1515         if (free_entries == 0)
1516                 goto out_access_unlock;
1517
1518         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
1519
1520         /* Prefetch available and used ring */
1521         avail_idx = vq->last_avail_idx & (vq->size - 1);
1522         used_idx  = vq->last_used_idx  & (vq->size - 1);
1523         rte_prefetch0(&vq->avail->ring[avail_idx]);
1524         rte_prefetch0(&vq->used->ring[used_idx]);
1525
1526         count = RTE_MIN(count, MAX_PKT_BURST);
1527         count = RTE_MIN(count, free_entries);
1528         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
1529                         dev->vid, count);
1530
1531         /* Retrieve all of the head indexes first to avoid caching issues. */
1532         for (i = 0; i < count; i++) {
1533                 avail_idx = (vq->last_avail_idx + i) & (vq->size - 1);
1534                 used_idx  = (vq->last_used_idx  + i) & (vq->size - 1);
1535                 desc_indexes[i] = vq->avail->ring[avail_idx];
1536
1537                 if (likely(dev->dequeue_zero_copy == 0))
1538                         update_used_ring(dev, vq, used_idx, desc_indexes[i]);
1539         }
1540
1541         /* Prefetch the first descriptor. */
1542         rte_prefetch0(&vq->desc[desc_indexes[0]]);
1543         for (i = 0; i < count; i++) {
1544                 struct vring_desc *desc, *idesc = NULL;
1545                 uint16_t sz, idx;
1546                 uint64_t dlen;
1547                 int err;
1548
1549                 if (likely(i + 1 < count))
1550                         rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
1551
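                /*
                 * An INDIRECT descriptor points at a separate table of
                 * descriptors placed in guest memory; its "len" field gives
                 * the size of that table in bytes.
                 */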
1552                 if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
1553                         dlen = vq->desc[desc_indexes[i]].len;
1554                         desc = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev,
1555                                         vq->desc[desc_indexes[i]].addr,
1556                                         &dlen);
1557                         if (unlikely(!desc))
1558                                 break;
1559
1560                         if (unlikely(dlen < vq->desc[desc_indexes[i]].len)) {
1561                                 /*
1562                                  * The indirect desc table is not contiguous
1563                                  * in process VA space, so we have to copy it.
1564                                  */
1565                                 idesc = alloc_copy_ind_table(dev,
1566                                                 &vq->desc[desc_indexes[i]]);
1567                                 if (unlikely(!idesc))
1568                                         break;
1569
1570                                 desc = idesc;
1571                         }
1572
1573                         rte_prefetch0(desc);
1574                         sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
1575                         idx = 0;
1576                 } else {
1577                         desc = vq->desc;
1578                         sz = vq->size;
1579                         idx = desc_indexes[i];
1580                 }
1581
1582                 pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
1583                 if (unlikely(pkts[i] == NULL)) {
1584                         RTE_LOG(ERR, VHOST_DATA,
1585                                 "Failed to allocate memory for mbuf.\n");
1586                         free_ind_table(idesc);
1587                         break;
1588                 }
1589
1590                 err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
1591                 if (unlikely(err)) {
1592                         rte_pktmbuf_free(pkts[i]);
1593                         free_ind_table(idesc);
1594                         break;
1595                 }
1596
1597                 if (unlikely(dev->dequeue_zero_copy)) {
1598                         struct zcopy_mbuf *zmbuf;
1599
1600                         zmbuf = get_zmbuf(vq);
1601                         if (!zmbuf) {
1602                                 rte_pktmbuf_free(pkts[i]);
1603                                 free_ind_table(idesc);
1604                                 break;
1605                         }
1606                         zmbuf->mbuf = pkts[i];
1607                         zmbuf->desc_idx = desc_indexes[i];
1608
1609                         /*
1610                          * Pin the mbuf by taking an extra reference; we
1611                          * check later whether we are the last user, i.e.
1612                          * the application has freed its copy. Only then
1613                          * can the used ring be updated safely.
1614                          */
1615                         rte_mbuf_refcnt_update(pkts[i], 1);
1616
1617                         vq->nr_zmbuf += 1;
1618                         TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
1619                 }
1620
1621                 if (unlikely(!!idesc))
1622                         free_ind_table(idesc);
1623         }
1624         vq->last_avail_idx += i;
1625
1626         if (likely(dev->dequeue_zero_copy == 0)) {
1627                 vq->last_used_idx += i;
1628                 update_used_idx(dev, vq, i);
1629         }
1630
1631 out_access_unlock:
1632         rte_spinlock_unlock(&vq->access_lock);
1633
1634         if (unlikely(rarp_mbuf != NULL)) {
1635                 /*
1636                  * Inject it at the head of the "pkts" array, so that the
1637                  * switch's MAC learning table gets updated first.
1638                  */
1639                 memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
1640                 pkts[0] = rarp_mbuf;
1641                 i += 1;
1642         }
1643
1644         return i;
1645 }
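
/*
 * Illustrative usage sketch (not part of this file), assuming an application
 * that forwards guest traffic to a physical port; "vid", "mp" and "port_id"
 * are placeholders the application would define:
 *
 *	struct rte_mbuf *pkts[MAX_PKT_BURST];
 *	uint16_t nb_rx, nb_tx;
 *
 *	// The guest's TX queue (VIRTIO_TXQ) is the host's receive direction.
 *	nb_rx = rte_vhost_dequeue_burst(vid, VIRTIO_TXQ, mp,
 *					pkts, MAX_PKT_BURST);
 *	nb_tx = rte_eth_tx_burst(port_id, 0, pkts, nb_rx);
 *	while (unlikely(nb_tx < nb_rx))
 *		rte_pktmbuf_free(pkts[nb_tx++]);
 */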