New upstream version 17.11.3
[deb_dpdk.git] / lib / librte_vhost / virtio_net.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <stdint.h>
35 #include <stdbool.h>
36 #include <linux/virtio_net.h>
37
38 #include <rte_mbuf.h>
39 #include <rte_memcpy.h>
40 #include <rte_ether.h>
41 #include <rte_ip.h>
42 #include <rte_vhost.h>
43 #include <rte_tcp.h>
44 #include <rte_udp.h>
45 #include <rte_sctp.h>
46 #include <rte_arp.h>
47 #include <rte_spinlock.h>
48 #include <rte_malloc.h>
49
50 #include "iotlb.h"
51 #include "vhost.h"
52
53 #define MAX_PKT_BURST 32
54
55 #define MAX_BATCH_LEN 256
56
57 static bool
58 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
59 {
60         return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
61 }
62
63 static __rte_always_inline struct vring_desc *
64 alloc_copy_ind_table(struct virtio_net *dev, struct vhost_virtqueue *vq,
65                                          struct vring_desc *desc)
66 {
67         struct vring_desc *idesc;
68         uint64_t src, dst;
69         uint64_t len, remain = desc->len;
70         uint64_t desc_addr = desc->addr;
71
72         idesc = rte_malloc(__func__, desc->len, 0);
73         if (unlikely(!idesc))
74                 return 0;
75
76         dst = (uint64_t)(uintptr_t)idesc;
77
78         while (remain) {
79                 len = remain;
80                 src = vhost_iova_to_vva(dev, vq, desc_addr, &len,
81                                 VHOST_ACCESS_RO);
82                 if (unlikely(!src || !len)) {
83                         rte_free(idesc);
84                         return 0;
85                 }
86
87                 rte_memcpy((void *)(uintptr_t)dst, (void *)(uintptr_t)src, len);
88
89                 remain -= len;
90                 dst += len;
91                 desc_addr += len;
92         }
93
94         return idesc;
95 }
96
97 static __rte_always_inline void
98 free_ind_table(struct vring_desc *idesc)
99 {
100         rte_free(idesc);
101 }
102
103 static __rte_always_inline void
104 do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
105                           uint16_t to, uint16_t from, uint16_t size)
106 {
107         rte_memcpy(&vq->used->ring[to],
108                         &vq->shadow_used_ring[from],
109                         size * sizeof(struct vring_used_elem));
110         vhost_log_used_vring(dev, vq,
111                         offsetof(struct vring_used, ring[to]),
112                         size * sizeof(struct vring_used_elem));
113 }
114
115 static __rte_always_inline void
116 flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
117 {
118         uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
119
120         if (used_idx + vq->shadow_used_idx <= vq->size) {
121                 do_flush_shadow_used_ring(dev, vq, used_idx, 0,
122                                           vq->shadow_used_idx);
123         } else {
124                 uint16_t size;
125
126                 /* update used ring interval [used_idx, vq->size] */
127                 size = vq->size - used_idx;
128                 do_flush_shadow_used_ring(dev, vq, used_idx, 0, size);
129
130                 /* update the left half used ring interval [0, left_size] */
131                 do_flush_shadow_used_ring(dev, vq, 0, size,
132                                           vq->shadow_used_idx - size);
133         }
134         vq->last_used_idx += vq->shadow_used_idx;
135
136         rte_smp_wmb();
137
138         *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
139         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
140                 sizeof(vq->used->idx));
141 }
142
143 static __rte_always_inline void
144 update_shadow_used_ring(struct vhost_virtqueue *vq,
145                          uint16_t desc_idx, uint16_t len)
146 {
147         uint16_t i = vq->shadow_used_idx++;
148
149         vq->shadow_used_ring[i].id  = desc_idx;
150         vq->shadow_used_ring[i].len = len;
151 }
152
153 static inline void
154 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
155 {
156         struct batch_copy_elem *elem = vq->batch_copy_elems;
157         uint16_t count = vq->batch_copy_nb_elems;
158         int i;
159
160         for (i = 0; i < count; i++) {
161                 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
162                 vhost_log_write(dev, elem[i].log_addr, elem[i].len);
163                 PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
164         }
165 }
166
167 static inline void
168 do_data_copy_dequeue(struct vhost_virtqueue *vq)
169 {
170         struct batch_copy_elem *elem = vq->batch_copy_elems;
171         uint16_t count = vq->batch_copy_nb_elems;
172         int i;
173
174         for (i = 0; i < count; i++)
175                 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
176 }
177
178 /* avoid write operation when necessary, to lessen cache issues */
179 #define ASSIGN_UNLESS_EQUAL(var, val) do {      \
180         if ((var) != (val))                     \
181                 (var) = (val);                  \
182 } while (0)
183
184 static void
185 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
186 {
187         uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
188
189         if (m_buf->ol_flags & PKT_TX_TCP_SEG)
190                 csum_l4 |= PKT_TX_TCP_CKSUM;
191
192         if (csum_l4) {
193                 net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
194                 net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
195
196                 switch (csum_l4) {
197                 case PKT_TX_TCP_CKSUM:
198                         net_hdr->csum_offset = (offsetof(struct tcp_hdr,
199                                                 cksum));
200                         break;
201                 case PKT_TX_UDP_CKSUM:
202                         net_hdr->csum_offset = (offsetof(struct udp_hdr,
203                                                 dgram_cksum));
204                         break;
205                 case PKT_TX_SCTP_CKSUM:
206                         net_hdr->csum_offset = (offsetof(struct sctp_hdr,
207                                                 cksum));
208                         break;
209                 }
210         } else {
211                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
212                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
213                 ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
214         }
215
216         /* IP cksum verification cannot be bypassed, then calculate here */
217         if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
218                 struct ipv4_hdr *ipv4_hdr;
219
220                 ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct ipv4_hdr *,
221                                                    m_buf->l2_len);
222                 ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
223         }
224
225         if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
226                 if (m_buf->ol_flags & PKT_TX_IPV4)
227                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
228                 else
229                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
230                 net_hdr->gso_size = m_buf->tso_segsz;
231                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
232                                         + m_buf->l4_len;
233         } else {
234                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
235                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
236                 ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
237         }
238 }
239
240 static __rte_always_inline int
241 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
242                   struct vring_desc *descs, struct rte_mbuf *m,
243                   uint16_t desc_idx, uint32_t size)
244 {
245         uint32_t desc_avail, desc_offset;
246         uint32_t mbuf_avail, mbuf_offset;
247         uint32_t cpy_len;
248         uint64_t desc_chunck_len;
249         struct vring_desc *desc;
250         uint64_t desc_addr, desc_gaddr;
251         /* A counter to avoid desc dead loop chain */
252         uint16_t nr_desc = 1;
253         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
254         uint16_t copy_nb = vq->batch_copy_nb_elems;
255         int error = 0;
256
257         desc = &descs[desc_idx];
258         desc_chunck_len = desc->len;
259         desc_gaddr = desc->addr;
260         desc_addr = vhost_iova_to_vva(dev, vq, desc_gaddr,
261                                         &desc_chunck_len, VHOST_ACCESS_RW);
262         /*
263          * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
264          * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
265          * otherwise stores offset on the stack instead of in a register.
266          */
267         if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr) {
268                 error = -1;
269                 goto out;
270         }
271
272         rte_prefetch0((void *)(uintptr_t)desc_addr);
273
274         if (likely(desc_chunck_len >= dev->vhost_hlen)) {
275                 virtio_enqueue_offload(m,
276                                 (struct virtio_net_hdr *)(uintptr_t)desc_addr);
277                 PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
278                 vhost_log_write(dev, desc_gaddr, dev->vhost_hlen);
279         } else {
280                 struct virtio_net_hdr vnet_hdr;
281                 uint64_t remain = dev->vhost_hlen;
282                 uint64_t len;
283                 uint64_t src = (uint64_t)(uintptr_t)&vnet_hdr, dst;
284                 uint64_t guest_addr = desc_gaddr;
285
286                 virtio_enqueue_offload(m, &vnet_hdr);
287
288                 while (remain) {
289                         len = remain;
290                         dst = vhost_iova_to_vva(dev, vq, guest_addr,
291                                         &len, VHOST_ACCESS_RW);
292                         if (unlikely(!dst || !len)) {
293                                 error = -1;
294                                 goto out;
295                         }
296
297                         rte_memcpy((void *)(uintptr_t)dst,
298                                         (void *)(uintptr_t)src, len);
299
300                         PRINT_PACKET(dev, (uintptr_t)dst, (uint32_t)len, 0);
301                         vhost_log_write(dev, guest_addr, len);
302                         remain -= len;
303                         guest_addr += len;
304                         dst += len;
305                 }
306         }
307
308         desc_avail  = desc->len - dev->vhost_hlen;
309         if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
310                 desc_chunck_len = desc_avail;
311                 desc_gaddr = desc->addr + dev->vhost_hlen;
312                 desc_addr = vhost_iova_to_vva(dev,
313                                 vq, desc_gaddr,
314                                 &desc_chunck_len,
315                                 VHOST_ACCESS_RW);
316                 if (unlikely(!desc_addr)) {
317                         error = -1;
318                         goto out;
319                 }
320
321                 desc_offset = 0;
322         } else {
323                 desc_offset = dev->vhost_hlen;
324                 desc_chunck_len -= dev->vhost_hlen;
325         }
326
327         mbuf_avail  = rte_pktmbuf_data_len(m);
328         mbuf_offset = 0;
329         while (mbuf_avail != 0 || m->next != NULL) {
330                 /* done with current mbuf, fetch next */
331                 if (mbuf_avail == 0) {
332                         m = m->next;
333
334                         mbuf_offset = 0;
335                         mbuf_avail  = rte_pktmbuf_data_len(m);
336                 }
337
338                 /* done with current desc buf, fetch next */
339                 if (desc_avail == 0) {
340                         if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
341                                 /* Room in vring buffer is not enough */
342                                 error = -1;
343                                 goto out;
344                         }
345                         if (unlikely(desc->next >= size || ++nr_desc > size)) {
346                                 error = -1;
347                                 goto out;
348                         }
349
350                         desc = &descs[desc->next];
351                         desc_chunck_len = desc->len;
352                         desc_gaddr = desc->addr;
353                         desc_addr = vhost_iova_to_vva(dev, vq, desc_gaddr,
354                                                         &desc_chunck_len,
355                                                         VHOST_ACCESS_RW);
356                         if (unlikely(!desc_addr)) {
357                                 error = -1;
358                                 goto out;
359                         }
360
361                         desc_offset = 0;
362                         desc_avail  = desc->len;
363                 } else if (unlikely(desc_chunck_len == 0)) {
364                         desc_chunck_len = desc_avail;
365                         desc_gaddr += desc_offset;
366                         desc_addr = vhost_iova_to_vva(dev,
367                                         vq, desc_gaddr,
368                                         &desc_chunck_len, VHOST_ACCESS_RW);
369                         if (unlikely(!desc_addr)) {
370                                 error = -1;
371                                 goto out;
372                         }
373                         desc_offset = 0;
374                 }
375
376                 cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
377                 if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) {
378                         rte_memcpy((void *)((uintptr_t)(desc_addr +
379                                                         desc_offset)),
380                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
381                                 cpy_len);
382                         vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
383                         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
384                                      cpy_len, 0);
385                 } else {
386                         batch_copy[copy_nb].dst =
387                                 (void *)((uintptr_t)(desc_addr + desc_offset));
388                         batch_copy[copy_nb].src =
389                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
390                         batch_copy[copy_nb].log_addr = desc_gaddr + desc_offset;
391                         batch_copy[copy_nb].len = cpy_len;
392                         copy_nb++;
393                 }
394
395                 mbuf_avail  -= cpy_len;
396                 mbuf_offset += cpy_len;
397                 desc_avail  -= cpy_len;
398                 desc_offset += cpy_len;
399                 desc_chunck_len -= cpy_len;
400         }
401
402 out:
403         vq->batch_copy_nb_elems = copy_nb;
404
405         return error;
406 }
407
408 /**
409  * This function adds buffers to the virtio devices RX virtqueue. Buffers can
410  * be received from the physical port or from another virtio device. A packet
411  * count is returned to indicate the number of packets that are successfully
412  * added to the RX queue. This function works when the mbuf is scattered, but
413  * it doesn't support the mergeable feature.
414  */
415 static __rte_always_inline uint32_t
416 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
417               struct rte_mbuf **pkts, uint32_t count)
418 {
419         struct vhost_virtqueue *vq;
420         uint16_t avail_idx, free_entries, start_idx;
421         uint16_t desc_indexes[MAX_PKT_BURST];
422         struct vring_desc *descs;
423         uint16_t used_idx;
424         uint32_t i, sz;
425
426         LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
427         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
428                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
429                         dev->vid, __func__, queue_id);
430                 return 0;
431         }
432
433         vq = dev->virtqueue[queue_id];
434
435         rte_spinlock_lock(&vq->access_lock);
436
437         if (unlikely(vq->enabled == 0))
438                 goto out_access_unlock;
439
440         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
441                 vhost_user_iotlb_rd_lock(vq);
442
443         if (unlikely(vq->access_ok == 0)) {
444                 if (unlikely(vring_translate(dev, vq) < 0)) {
445                         count = 0;
446                         goto out;
447                 }
448         }
449
450         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
451         start_idx = vq->last_used_idx;
452         free_entries = avail_idx - start_idx;
453         count = RTE_MIN(count, free_entries);
454         count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
455         if (count == 0)
456                 goto out;
457
458         LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
459                 dev->vid, start_idx, start_idx + count);
460
461         vq->batch_copy_nb_elems = 0;
462
463         /* Retrieve all of the desc indexes first to avoid caching issues. */
464         rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
465         for (i = 0; i < count; i++) {
466                 used_idx = (start_idx + i) & (vq->size - 1);
467                 desc_indexes[i] = vq->avail->ring[used_idx];
468                 vq->used->ring[used_idx].id = desc_indexes[i];
469                 vq->used->ring[used_idx].len = pkts[i]->pkt_len +
470                                                dev->vhost_hlen;
471                 vhost_log_used_vring(dev, vq,
472                         offsetof(struct vring_used, ring[used_idx]),
473                         sizeof(vq->used->ring[used_idx]));
474         }
475
476         rte_prefetch0(&vq->desc[desc_indexes[0]]);
477         for (i = 0; i < count; i++) {
478                 struct vring_desc *idesc = NULL;
479                 uint16_t desc_idx = desc_indexes[i];
480                 int err;
481
482                 if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) {
483                         uint64_t dlen = vq->desc[desc_idx].len;
484                         descs = (struct vring_desc *)(uintptr_t)
485                                 vhost_iova_to_vva(dev,
486                                                 vq, vq->desc[desc_idx].addr,
487                                                 &dlen, VHOST_ACCESS_RO);
488                         if (unlikely(!descs)) {
489                                 count = i;
490                                 break;
491                         }
492
493                         if (unlikely(dlen < vq->desc[desc_idx].len)) {
494                                 /*
495                                  * The indirect desc table is not contiguous
496                                  * in process VA space, we have to copy it.
497                                  */
498                                 idesc = alloc_copy_ind_table(dev, vq,
499                                                         &vq->desc[desc_idx]);
500                                 if (unlikely(!idesc))
501                                         break;
502
503                                 descs = idesc;
504                         }
505
506                         desc_idx = 0;
507                         sz = vq->desc[desc_idx].len / sizeof(*descs);
508                 } else {
509                         descs = vq->desc;
510                         sz = vq->size;
511                 }
512
513                 err = copy_mbuf_to_desc(dev, vq, descs, pkts[i], desc_idx, sz);
514                 if (unlikely(err)) {
515                         count = i;
516                         free_ind_table(idesc);
517                         break;
518                 }
519
520                 if (i + 1 < count)
521                         rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
522
523                 if (unlikely(!!idesc))
524                         free_ind_table(idesc);
525         }
526
527         do_data_copy_enqueue(dev, vq);
528
529         rte_smp_wmb();
530
531         *(volatile uint16_t *)&vq->used->idx += count;
532         vq->last_used_idx += count;
533         vhost_log_used_vring(dev, vq,
534                 offsetof(struct vring_used, idx),
535                 sizeof(vq->used->idx));
536
537         /* flush used->idx update before we read avail->flags. */
538         rte_mb();
539
540         /* Kick the guest if necessary. */
541         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
542                         && (vq->callfd >= 0))
543                 eventfd_write(vq->callfd, (eventfd_t)1);
544 out:
545         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
546                 vhost_user_iotlb_rd_unlock(vq);
547
548 out_access_unlock:
549         rte_spinlock_unlock(&vq->access_lock);
550
551         return count;
552 }
553
554 static __rte_always_inline int
555 fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
556                          uint32_t avail_idx, uint32_t *vec_idx,
557                          struct buf_vector *buf_vec, uint16_t *desc_chain_head,
558                          uint16_t *desc_chain_len)
559 {
560         uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
561         uint32_t vec_id = *vec_idx;
562         uint32_t len    = 0;
563         uint64_t dlen;
564         struct vring_desc *descs = vq->desc;
565         struct vring_desc *idesc = NULL;
566
567         *desc_chain_head = idx;
568
569         if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
570                 dlen = vq->desc[idx].len;
571                 descs = (struct vring_desc *)(uintptr_t)
572                         vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
573                                                 &dlen,
574                                                 VHOST_ACCESS_RO);
575                 if (unlikely(!descs))
576                         return -1;
577
578                 if (unlikely(dlen < vq->desc[idx].len)) {
579                         /*
580                          * The indirect desc table is not contiguous
581                          * in process VA space, we have to copy it.
582                          */
583                         idesc = alloc_copy_ind_table(dev, vq, &vq->desc[idx]);
584                         if (unlikely(!idesc))
585                                 return -1;
586
587                         descs = idesc;
588                 }
589
590                 idx = 0;
591         }
592
593         while (1) {
594                 if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size)) {
595                         free_ind_table(idesc);
596                         return -1;
597                 }
598
599                 len += descs[idx].len;
600                 buf_vec[vec_id].buf_addr = descs[idx].addr;
601                 buf_vec[vec_id].buf_len  = descs[idx].len;
602                 buf_vec[vec_id].desc_idx = idx;
603                 vec_id++;
604
605                 if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
606                         break;
607
608                 idx = descs[idx].next;
609         }
610
611         *desc_chain_len = len;
612         *vec_idx = vec_id;
613
614         if (unlikely(!!idesc))
615                 free_ind_table(idesc);
616
617         return 0;
618 }
619
620 /*
621  * Returns -1 on fail, 0 on success
622  */
623 static inline int
624 reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
625                                 uint32_t size, struct buf_vector *buf_vec,
626                                 uint16_t *num_buffers, uint16_t avail_head)
627 {
628         uint16_t cur_idx;
629         uint32_t vec_idx = 0;
630         uint16_t tries = 0;
631
632         uint16_t head_idx = 0;
633         uint16_t len = 0;
634
635         *num_buffers = 0;
636         cur_idx  = vq->last_avail_idx;
637
638         while (size > 0) {
639                 if (unlikely(cur_idx == avail_head))
640                         return -1;
641
642                 if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec,
643                                                 &head_idx, &len) < 0))
644                         return -1;
645                 len = RTE_MIN(len, size);
646                 update_shadow_used_ring(vq, head_idx, len);
647                 size -= len;
648
649                 cur_idx++;
650                 tries++;
651                 *num_buffers += 1;
652
653                 /*
654                  * if we tried all available ring items, and still
655                  * can't get enough buf, it means something abnormal
656                  * happened.
657                  */
658                 if (unlikely(tries >= vq->size))
659                         return -1;
660         }
661
662         return 0;
663 }
664
665 static __rte_always_inline int
666 copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
667                             struct rte_mbuf *m, struct buf_vector *buf_vec,
668                             uint16_t num_buffers)
669 {
670         uint32_t vec_idx = 0;
671         uint64_t desc_addr, desc_gaddr;
672         uint32_t mbuf_offset, mbuf_avail;
673         uint32_t desc_offset, desc_avail;
674         uint32_t cpy_len;
675         uint64_t desc_chunck_len;
676         uint64_t hdr_addr, hdr_phys_addr;
677         struct rte_mbuf *hdr_mbuf;
678         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
679         struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
680         uint16_t copy_nb = vq->batch_copy_nb_elems;
681         int error = 0;
682
683         if (unlikely(m == NULL)) {
684                 error = -1;
685                 goto out;
686         }
687
688         desc_chunck_len = buf_vec[vec_idx].buf_len;
689         desc_gaddr = buf_vec[vec_idx].buf_addr;
690         desc_addr = vhost_iova_to_vva(dev, vq,
691                                         desc_gaddr,
692                                         &desc_chunck_len,
693                                         VHOST_ACCESS_RW);
694         if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr) {
695                 error = -1;
696                 goto out;
697         }
698
699         hdr_mbuf = m;
700         hdr_addr = desc_addr;
701         if (unlikely(desc_chunck_len < dev->vhost_hlen))
702                 hdr = &tmp_hdr;
703         else
704                 hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
705         hdr_phys_addr = desc_gaddr;
706         rte_prefetch0((void *)(uintptr_t)hdr_addr);
707
708         LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
709                 dev->vid, num_buffers);
710
711         desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
712         if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
713                 desc_chunck_len = desc_avail;
714                 desc_gaddr += dev->vhost_hlen;
715                 desc_addr = vhost_iova_to_vva(dev, vq,
716                                 desc_gaddr,
717                                 &desc_chunck_len,
718                                 VHOST_ACCESS_RW);
719                 if (unlikely(!desc_addr)) {
720                         error = -1;
721                         goto out;
722                 }
723
724                 desc_offset = 0;
725         } else {
726                 desc_offset = dev->vhost_hlen;
727                 desc_chunck_len -= dev->vhost_hlen;
728         }
729
730
731         mbuf_avail  = rte_pktmbuf_data_len(m);
732         mbuf_offset = 0;
733         while (mbuf_avail != 0 || m->next != NULL) {
734                 /* done with current desc buf, get the next one */
735                 if (desc_avail == 0) {
736                         vec_idx++;
737                         desc_chunck_len = buf_vec[vec_idx].buf_len;
738                         desc_gaddr = buf_vec[vec_idx].buf_addr;
739                         desc_addr =
740                                 vhost_iova_to_vva(dev, vq,
741                                         desc_gaddr,
742                                         &desc_chunck_len,
743                                         VHOST_ACCESS_RW);
744                         if (unlikely(!desc_addr)) {
745                                 error = -1;
746                                 goto out;
747                         }
748
749                         /* Prefetch buffer address. */
750                         rte_prefetch0((void *)(uintptr_t)desc_addr);
751                         desc_offset = 0;
752                         desc_avail  = buf_vec[vec_idx].buf_len;
753                 } else if (unlikely(desc_chunck_len == 0)) {
754                         desc_chunck_len = desc_avail;
755                         desc_gaddr += desc_offset;
756                         desc_addr = vhost_iova_to_vva(dev, vq,
757                                         desc_gaddr,
758                                         &desc_chunck_len, VHOST_ACCESS_RW);
759                         if (unlikely(!desc_addr)) {
760                                 error = -1;
761                                 goto out;
762                         }
763                         desc_offset = 0;
764                 }
765
766                 /* done with current mbuf, get the next one */
767                 if (mbuf_avail == 0) {
768                         m = m->next;
769
770                         mbuf_offset = 0;
771                         mbuf_avail  = rte_pktmbuf_data_len(m);
772                 }
773
774                 if (hdr_addr) {
775                         virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
776                         ASSIGN_UNLESS_EQUAL(hdr->num_buffers, num_buffers);
777
778                         if (unlikely(hdr == &tmp_hdr)) {
779                                 uint64_t len;
780                                 uint64_t remain = dev->vhost_hlen;
781                                 uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
782                                 uint64_t guest_addr = hdr_phys_addr;
783
784                                 while (remain) {
785                                         len = remain;
786                                         dst = vhost_iova_to_vva(dev, vq,
787                                                         guest_addr, &len,
788                                                         VHOST_ACCESS_RW);
789                                         if (unlikely(!dst || !len)) {
790                                                 error = -1;
791                                                 goto out;
792                                         }
793
794                                         rte_memcpy((void *)(uintptr_t)dst,
795                                                         (void *)(uintptr_t)src,
796                                                         len);
797
798                                         PRINT_PACKET(dev, (uintptr_t)dst,
799                                                         (uint32_t)len, 0);
800                                         vhost_log_write(dev, guest_addr, len);
801
802                                         remain -= len;
803                                         guest_addr += len;
804                                         dst += len;
805                                 }
806                         } else {
807                                 PRINT_PACKET(dev, (uintptr_t)hdr_addr,
808                                                 dev->vhost_hlen, 0);
809                                 vhost_log_write(dev, hdr_phys_addr,
810                                                 dev->vhost_hlen);
811                         }
812
813                         hdr_addr = 0;
814                 }
815
816                 cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
817
818                 if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) {
819                         rte_memcpy((void *)((uintptr_t)(desc_addr +
820                                                         desc_offset)),
821                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
822                                 cpy_len);
823                         vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
824                         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
825                                 cpy_len, 0);
826                 } else {
827                         batch_copy[copy_nb].dst =
828                                 (void *)((uintptr_t)(desc_addr + desc_offset));
829                         batch_copy[copy_nb].src =
830                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
831                         batch_copy[copy_nb].log_addr = desc_gaddr + desc_offset;
832                         batch_copy[copy_nb].len = cpy_len;
833                         copy_nb++;
834                 }
835
836                 mbuf_avail  -= cpy_len;
837                 mbuf_offset += cpy_len;
838                 desc_avail  -= cpy_len;
839                 desc_offset += cpy_len;
840                 desc_chunck_len -= cpy_len;
841         }
842
843 out:
844         vq->batch_copy_nb_elems = copy_nb;
845
846         return error;
847 }
848
849 static __rte_always_inline uint32_t
850 virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
851         struct rte_mbuf **pkts, uint32_t count)
852 {
853         struct vhost_virtqueue *vq;
854         uint32_t pkt_idx = 0;
855         uint16_t num_buffers;
856         struct buf_vector buf_vec[BUF_VECTOR_MAX];
857         uint16_t avail_head;
858
859         LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
860         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
861                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
862                         dev->vid, __func__, queue_id);
863                 return 0;
864         }
865
866         vq = dev->virtqueue[queue_id];
867
868         rte_spinlock_lock(&vq->access_lock);
869
870         if (unlikely(vq->enabled == 0))
871                 goto out_access_unlock;
872
873         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
874                 vhost_user_iotlb_rd_lock(vq);
875
876         if (unlikely(vq->access_ok == 0))
877                 if (unlikely(vring_translate(dev, vq) < 0))
878                         goto out;
879
880         count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
881         if (count == 0)
882                 goto out;
883
884         vq->batch_copy_nb_elems = 0;
885
886         rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
887
888         vq->shadow_used_idx = 0;
889         avail_head = *((volatile uint16_t *)&vq->avail->idx);
890         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
891                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
892
893                 if (unlikely(reserve_avail_buf_mergeable(dev, vq,
894                                                 pkt_len, buf_vec, &num_buffers,
895                                                 avail_head) < 0)) {
896                         LOG_DEBUG(VHOST_DATA,
897                                 "(%d) failed to get enough desc from vring\n",
898                                 dev->vid);
899                         vq->shadow_used_idx -= num_buffers;
900                         break;
901                 }
902
903                 LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
904                         dev->vid, vq->last_avail_idx,
905                         vq->last_avail_idx + num_buffers);
906
907                 if (copy_mbuf_to_desc_mergeable(dev, vq, pkts[pkt_idx],
908                                                 buf_vec, num_buffers) < 0) {
909                         vq->shadow_used_idx -= num_buffers;
910                         break;
911                 }
912
913                 vq->last_avail_idx += num_buffers;
914         }
915
916         do_data_copy_enqueue(dev, vq);
917
918         if (likely(vq->shadow_used_idx)) {
919                 flush_shadow_used_ring(dev, vq);
920
921                 /* flush used->idx update before we read avail->flags. */
922                 rte_mb();
923
924                 /* Kick the guest if necessary. */
925                 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
926                                 && (vq->callfd >= 0))
927                         eventfd_write(vq->callfd, (eventfd_t)1);
928         }
929
930 out:
931         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
932                 vhost_user_iotlb_rd_unlock(vq);
933
934 out_access_unlock:
935         rte_spinlock_unlock(&vq->access_lock);
936
937         return pkt_idx;
938 }
939
940 uint16_t
941 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
942         struct rte_mbuf **pkts, uint16_t count)
943 {
944         struct virtio_net *dev = get_device(vid);
945
946         if (!dev)
947                 return 0;
948
949         if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
950                 return virtio_dev_merge_rx(dev, queue_id, pkts, count);
951         else
952                 return virtio_dev_rx(dev, queue_id, pkts, count);
953 }
954
955 static inline bool
956 virtio_net_with_host_offload(struct virtio_net *dev)
957 {
958         if (dev->features &
959                         ((1ULL << VIRTIO_NET_F_CSUM) |
960                          (1ULL << VIRTIO_NET_F_HOST_ECN) |
961                          (1ULL << VIRTIO_NET_F_HOST_TSO4) |
962                          (1ULL << VIRTIO_NET_F_HOST_TSO6) |
963                          (1ULL << VIRTIO_NET_F_HOST_UFO)))
964                 return true;
965
966         return false;
967 }
968
969 static void
970 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
971 {
972         struct ipv4_hdr *ipv4_hdr;
973         struct ipv6_hdr *ipv6_hdr;
974         void *l3_hdr = NULL;
975         struct ether_hdr *eth_hdr;
976         uint16_t ethertype;
977
978         eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
979
980         m->l2_len = sizeof(struct ether_hdr);
981         ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
982
983         if (ethertype == ETHER_TYPE_VLAN) {
984                 struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
985
986                 m->l2_len += sizeof(struct vlan_hdr);
987                 ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
988         }
989
990         l3_hdr = (char *)eth_hdr + m->l2_len;
991
992         switch (ethertype) {
993         case ETHER_TYPE_IPv4:
994                 ipv4_hdr = l3_hdr;
995                 *l4_proto = ipv4_hdr->next_proto_id;
996                 m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
997                 *l4_hdr = (char *)l3_hdr + m->l3_len;
998                 m->ol_flags |= PKT_TX_IPV4;
999                 break;
1000         case ETHER_TYPE_IPv6:
1001                 ipv6_hdr = l3_hdr;
1002                 *l4_proto = ipv6_hdr->proto;
1003                 m->l3_len = sizeof(struct ipv6_hdr);
1004                 *l4_hdr = (char *)l3_hdr + m->l3_len;
1005                 m->ol_flags |= PKT_TX_IPV6;
1006                 break;
1007         default:
1008                 m->l3_len = 0;
1009                 *l4_proto = 0;
1010                 *l4_hdr = NULL;
1011                 break;
1012         }
1013 }
1014
1015 static __rte_always_inline void
1016 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
1017 {
1018         uint16_t l4_proto = 0;
1019         void *l4_hdr = NULL;
1020         struct tcp_hdr *tcp_hdr = NULL;
1021
1022         if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
1023                 return;
1024
1025         parse_ethernet(m, &l4_proto, &l4_hdr);
1026         if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1027                 if (hdr->csum_start == (m->l2_len + m->l3_len)) {
1028                         switch (hdr->csum_offset) {
1029                         case (offsetof(struct tcp_hdr, cksum)):
1030                                 if (l4_proto == IPPROTO_TCP)
1031                                         m->ol_flags |= PKT_TX_TCP_CKSUM;
1032                                 break;
1033                         case (offsetof(struct udp_hdr, dgram_cksum)):
1034                                 if (l4_proto == IPPROTO_UDP)
1035                                         m->ol_flags |= PKT_TX_UDP_CKSUM;
1036                                 break;
1037                         case (offsetof(struct sctp_hdr, cksum)):
1038                                 if (l4_proto == IPPROTO_SCTP)
1039                                         m->ol_flags |= PKT_TX_SCTP_CKSUM;
1040                                 break;
1041                         default:
1042                                 break;
1043                         }
1044                 }
1045         }
1046
1047         if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1048                 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1049                 case VIRTIO_NET_HDR_GSO_TCPV4:
1050                 case VIRTIO_NET_HDR_GSO_TCPV6:
1051                         tcp_hdr = l4_hdr;
1052                         m->ol_flags |= PKT_TX_TCP_SEG;
1053                         m->tso_segsz = hdr->gso_size;
1054                         m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
1055                         break;
1056                 default:
1057                         RTE_LOG(WARNING, VHOST_DATA,
1058                                 "unsupported gso type %u.\n", hdr->gso_type);
1059                         break;
1060                 }
1061         }
1062 }
1063
1064 #define RARP_PKT_SIZE   64
1065
1066 static int
1067 make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
1068 {
1069         struct ether_hdr *eth_hdr;
1070         struct arp_hdr  *rarp;
1071
1072         if (rarp_mbuf->buf_len < 64) {
1073                 RTE_LOG(WARNING, VHOST_DATA,
1074                         "failed to make RARP; mbuf size too small %u (< %d)\n",
1075                         rarp_mbuf->buf_len, RARP_PKT_SIZE);
1076                 return -1;
1077         }
1078
1079         /* Ethernet header. */
1080         eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
1081         memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
1082         ether_addr_copy(mac, &eth_hdr->s_addr);
1083         eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
1084
1085         /* RARP header. */
1086         rarp = (struct arp_hdr *)(eth_hdr + 1);
1087         rarp->arp_hrd = htons(ARP_HRD_ETHER);
1088         rarp->arp_pro = htons(ETHER_TYPE_IPv4);
1089         rarp->arp_hln = ETHER_ADDR_LEN;
1090         rarp->arp_pln = 4;
1091         rarp->arp_op  = htons(ARP_OP_REVREQUEST);
1092
1093         ether_addr_copy(mac, &rarp->arp_data.arp_sha);
1094         ether_addr_copy(mac, &rarp->arp_data.arp_tha);
1095         memset(&rarp->arp_data.arp_sip, 0x00, 4);
1096         memset(&rarp->arp_data.arp_tip, 0x00, 4);
1097
1098         rarp_mbuf->pkt_len  = rarp_mbuf->data_len = RARP_PKT_SIZE;
1099
1100         return 0;
1101 }
1102
1103 static __rte_always_inline void
1104 put_zmbuf(struct zcopy_mbuf *zmbuf)
1105 {
1106         zmbuf->in_use = 0;
1107 }
1108
1109 static __rte_always_inline int
1110 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
1111                   struct vring_desc *descs, uint16_t max_desc,
1112                   struct rte_mbuf *m, uint16_t desc_idx,
1113                   struct rte_mempool *mbuf_pool)
1114 {
1115         struct vring_desc *desc;
1116         uint64_t desc_addr, desc_gaddr;
1117         uint32_t desc_avail, desc_offset;
1118         uint32_t mbuf_avail, mbuf_offset;
1119         uint32_t cpy_len;
1120         uint64_t desc_chunck_len;
1121         struct rte_mbuf *cur = m, *prev = m;
1122         struct virtio_net_hdr tmp_hdr;
1123         struct virtio_net_hdr *hdr = NULL;
1124         /* A counter to avoid desc dead loop chain */
1125         uint32_t nr_desc = 1;
1126         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
1127         uint16_t copy_nb = vq->batch_copy_nb_elems;
1128         int error = 0;
1129
1130         desc = &descs[desc_idx];
1131         if (unlikely((desc->len < dev->vhost_hlen)) ||
1132                         (desc->flags & VRING_DESC_F_INDIRECT)) {
1133                 error = -1;
1134                 goto out;
1135         }
1136
1137         desc_chunck_len = desc->len;
1138         desc_gaddr = desc->addr;
1139         desc_addr = vhost_iova_to_vva(dev,
1140                                         vq, desc_gaddr,
1141                                         &desc_chunck_len,
1142                                         VHOST_ACCESS_RO);
1143         if (unlikely(!desc_addr)) {
1144                 error = -1;
1145                 goto out;
1146         }
1147
1148         if (virtio_net_with_host_offload(dev)) {
1149                 if (unlikely(desc_chunck_len < sizeof(struct virtio_net_hdr))) {
1150                         uint64_t len = desc_chunck_len;
1151                         uint64_t remain = sizeof(struct virtio_net_hdr);
1152                         uint64_t src = desc_addr;
1153                         uint64_t dst = (uint64_t)(uintptr_t)&tmp_hdr;
1154                         uint64_t guest_addr = desc_gaddr;
1155
1156                         /*
1157                          * No luck, the virtio-net header doesn't fit
1158                          * in a contiguous virtual area.
1159                          */
1160                         while (remain) {
1161                                 len = remain;
1162                                 src = vhost_iova_to_vva(dev, vq,
1163                                                 guest_addr, &len,
1164                                                 VHOST_ACCESS_RO);
1165                                 if (unlikely(!src || !len)) {
1166                                         error = -1;
1167                                         goto out;
1168                                 }
1169
1170                                 rte_memcpy((void *)(uintptr_t)dst,
1171                                                    (void *)(uintptr_t)src, len);
1172
1173                                 guest_addr += len;
1174                                 remain -= len;
1175                                 dst += len;
1176                         }
1177
1178                         hdr = &tmp_hdr;
1179                 } else {
1180                         hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
1181                         rte_prefetch0(hdr);
1182                 }
1183         }
1184
1185         /*
1186          * A virtio driver normally uses at least 2 desc buffers
1187          * for Tx: the first for storing the header, and others
1188          * for storing the data.
1189          */
1190         if (likely((desc->len == dev->vhost_hlen) &&
1191                    (desc->flags & VRING_DESC_F_NEXT) != 0)) {
1192                 desc = &descs[desc->next];
1193                 if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) {
1194                         error = -1;
1195                         goto out;
1196                 }
1197
1198                 desc_chunck_len = desc->len;
1199                 desc_gaddr = desc->addr;
1200                 desc_addr = vhost_iova_to_vva(dev,
1201                                                         vq, desc_gaddr,
1202                                                         &desc_chunck_len,
1203                                                         VHOST_ACCESS_RO);
1204                 if (unlikely(!desc_addr)) {
1205                         error = -1;
1206                         goto out;
1207                 }
1208
1209                 desc_offset = 0;
1210                 desc_avail  = desc->len;
1211                 nr_desc    += 1;
1212         } else {
1213                 desc_avail  = desc->len - dev->vhost_hlen;
1214
1215                 if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
1216                         desc_chunck_len = desc_avail;
1217                         desc_gaddr += dev->vhost_hlen;
1218                         desc_addr = vhost_iova_to_vva(dev,
1219                                         vq, desc_gaddr,
1220                                         &desc_chunck_len,
1221                                         VHOST_ACCESS_RO);
1222                         if (unlikely(!desc_addr)) {
1223                                 error = -1;
1224                                 goto out;
1225                         }
1226
1227                         desc_offset = 0;
1228                 } else {
1229                         desc_offset = dev->vhost_hlen;
1230                         desc_chunck_len -= dev->vhost_hlen;
1231                 }
1232         }
1233
1234         rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset));
1235
1236         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
1237                         (uint32_t)desc_chunck_len, 0);
1238
1239         mbuf_offset = 0;
1240         mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
1241         while (1) {
1242                 uint64_t hpa;
1243
1244                 cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
1245
1246                 /*
1247                  * A desc buf might across two host physical pages that are
1248                  * not continuous. In such case (gpa_to_hpa returns 0), data
1249                  * will be copied even though zero copy is enabled.
1250                  */
1251                 if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
1252                                         desc_gaddr + desc_offset, cpy_len)))) {
1253                         cur->data_len = cpy_len;
1254                         cur->data_off = 0;
1255                         cur->buf_addr = (void *)(uintptr_t)(desc_addr
1256                                 + desc_offset);
1257                         cur->buf_iova = hpa;
1258
1259                         /*
1260                          * In zero copy mode, one mbuf can only reference data
1261                          * for one or partial of one desc buff.
1262                          */
1263                         mbuf_avail = cpy_len;
1264                 } else {
1265                         if (likely(cpy_len > MAX_BATCH_LEN ||
1266                                    copy_nb >= vq->size ||
1267                                    (hdr && cur == m) ||
1268                                    desc->len != desc_chunck_len)) {
1269                                 rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
1270                                                                    mbuf_offset),
1271                                            (void *)((uintptr_t)(desc_addr +
1272                                                                 desc_offset)),
1273                                            cpy_len);
1274                         } else {
1275                                 batch_copy[copy_nb].dst =
1276                                         rte_pktmbuf_mtod_offset(cur, void *,
1277                                                                 mbuf_offset);
1278                                 batch_copy[copy_nb].src =
1279                                         (void *)((uintptr_t)(desc_addr +
1280                                                              desc_offset));
1281                                 batch_copy[copy_nb].len = cpy_len;
1282                                 copy_nb++;
1283                         }
1284                 }
1285
1286                 mbuf_avail  -= cpy_len;
1287                 mbuf_offset += cpy_len;
1288                 desc_avail  -= cpy_len;
1289                 desc_chunck_len -= cpy_len;
1290                 desc_offset += cpy_len;
1291
1292                 /* This desc reaches to its end, get the next one */
1293                 if (desc_avail == 0) {
1294                         if ((desc->flags & VRING_DESC_F_NEXT) == 0)
1295                                 break;
1296
1297                         if (unlikely(desc->next >= max_desc ||
1298                                      ++nr_desc > max_desc)) {
1299                                 error = -1;
1300                                 goto out;
1301                         }
1302                         desc = &descs[desc->next];
1303                         if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) {
1304                                 error = -1;
1305                                 goto out;
1306                         }
1307
1308                         desc_chunck_len = desc->len;
1309                         desc_gaddr = desc->addr;
1310                         desc_addr = vhost_iova_to_vva(dev,
1311                                                         vq, desc_gaddr,
1312                                                         &desc_chunck_len,
1313                                                         VHOST_ACCESS_RO);
1314                         if (unlikely(!desc_addr)) {
1315                                 error = -1;
1316                                 goto out;
1317                         }
1318
1319                         rte_prefetch0((void *)(uintptr_t)desc_addr);
1320
1321                         desc_offset = 0;
1322                         desc_avail  = desc->len;
1323
1324                         PRINT_PACKET(dev, (uintptr_t)desc_addr,
1325                                         (uint32_t)desc_chunck_len, 0);
1326                 } else if (unlikely(desc_chunck_len == 0)) {
1327                         desc_chunck_len = desc_avail;
1328                         desc_gaddr += desc_offset;
1329                         desc_addr = vhost_iova_to_vva(dev, vq,
1330                                         desc_gaddr,
1331                                         &desc_chunck_len,
1332                                         VHOST_ACCESS_RO);
1333                         if (unlikely(!desc_addr)) {
1334                                 error = -1;
1335                                 goto out;
1336                         }
1337                         desc_offset = 0;
1338
1339                         PRINT_PACKET(dev, (uintptr_t)desc_addr,
1340                                         (uint32_t)desc_chunck_len, 0);
1341                 }
1342
1343                 /*
1344                  * This mbuf reaches to its end, get a new one
1345                  * to hold more data.
1346                  */
1347                 if (mbuf_avail == 0) {
1348                         cur = rte_pktmbuf_alloc(mbuf_pool);
1349                         if (unlikely(cur == NULL)) {
1350                                 RTE_LOG(ERR, VHOST_DATA, "Failed to "
1351                                         "allocate memory for mbuf.\n");
1352                                 error = -1;
1353                                 goto out;
1354                         }
1355                         if (unlikely(dev->dequeue_zero_copy))
1356                                 rte_mbuf_refcnt_update(cur, 1);
1357
1358                         prev->next = cur;
1359                         prev->data_len = mbuf_offset;
1360                         m->nb_segs += 1;
1361                         m->pkt_len += mbuf_offset;
1362                         prev = cur;
1363
1364                         mbuf_offset = 0;
1365                         mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
1366                 }
1367         }
1368
1369         prev->data_len = mbuf_offset;
1370         m->pkt_len    += mbuf_offset;
1371
1372         if (hdr)
1373                 vhost_dequeue_offload(hdr, m);
1374
1375 out:
1376         vq->batch_copy_nb_elems = copy_nb;
1377
1378         return error;
1379 }
1380
1381 static __rte_always_inline void
1382 update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
1383                  uint32_t used_idx, uint32_t desc_idx)
1384 {
1385         vq->used->ring[used_idx].id  = desc_idx;
1386         vq->used->ring[used_idx].len = 0;
1387         vhost_log_used_vring(dev, vq,
1388                         offsetof(struct vring_used, ring[used_idx]),
1389                         sizeof(vq->used->ring[used_idx]));
1390 }
1391
1392 static __rte_always_inline void
1393 update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
1394                 uint32_t count)
1395 {
1396         if (unlikely(count == 0))
1397                 return;
1398
1399         rte_smp_wmb();
1400         rte_smp_rmb();
1401
1402         vq->used->idx += count;
1403         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
1404                         sizeof(vq->used->idx));
1405
1406         /* Kick guest if required. */
1407         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
1408                         && (vq->callfd >= 0))
1409                 eventfd_write(vq->callfd, (eventfd_t)1);
1410 }
1411
1412 static __rte_always_inline struct zcopy_mbuf *
1413 get_zmbuf(struct vhost_virtqueue *vq)
1414 {
1415         uint16_t i;
1416         uint16_t last;
1417         int tries = 0;
1418
1419         /* search [last_zmbuf_idx, zmbuf_size) */
1420         i = vq->last_zmbuf_idx;
1421         last = vq->zmbuf_size;
1422
1423 again:
1424         for (; i < last; i++) {
1425                 if (vq->zmbufs[i].in_use == 0) {
1426                         vq->last_zmbuf_idx = i + 1;
1427                         vq->zmbufs[i].in_use = 1;
1428                         return &vq->zmbufs[i];
1429                 }
1430         }
1431
1432         tries++;
1433         if (tries == 1) {
1434                 /* search [0, last_zmbuf_idx) */
1435                 i = 0;
1436                 last = vq->last_zmbuf_idx;
1437                 goto again;
1438         }
1439
1440         return NULL;
1441 }
1442
1443 static __rte_always_inline bool
1444 mbuf_is_consumed(struct rte_mbuf *m)
1445 {
1446         while (m) {
1447                 if (rte_mbuf_refcnt_read(m) > 1)
1448                         return false;
1449                 m = m->next;
1450         }
1451
1452         return true;
1453 }
1454
1455 static __rte_always_inline void
1456 restore_mbuf(struct rte_mbuf *m)
1457 {
1458         uint32_t mbuf_size, priv_size;
1459
1460         while (m) {
1461                 priv_size = rte_pktmbuf_priv_size(m->pool);
1462                 mbuf_size = sizeof(struct rte_mbuf) + priv_size;
1463                 /* start of buffer is after mbuf structure and priv data */
1464
1465                 m->buf_addr = (char *)m + mbuf_size;
1466                 m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
1467                 m = m->next;
1468         }
1469 }
1470
1471 uint16_t
1472 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
1473         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
1474 {
1475         struct virtio_net *dev;
1476         struct rte_mbuf *rarp_mbuf = NULL;
1477         struct vhost_virtqueue *vq;
1478         uint32_t desc_indexes[MAX_PKT_BURST];
1479         uint32_t used_idx;
1480         uint32_t i = 0;
1481         uint16_t free_entries;
1482         uint16_t avail_idx;
1483
1484         dev = get_device(vid);
1485         if (!dev)
1486                 return 0;
1487
1488         if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
1489                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
1490                         dev->vid, __func__, queue_id);
1491                 return 0;
1492         }
1493
1494         vq = dev->virtqueue[queue_id];
1495
1496         if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
1497                 return 0;
1498
1499         if (unlikely(vq->enabled == 0))
1500                 goto out_access_unlock;
1501
1502         vq->batch_copy_nb_elems = 0;
1503
1504         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1505                 vhost_user_iotlb_rd_lock(vq);
1506
1507         if (unlikely(vq->access_ok == 0))
1508                 if (unlikely(vring_translate(dev, vq) < 0))
1509                         goto out;
1510
1511         if (unlikely(dev->dequeue_zero_copy)) {
1512                 struct zcopy_mbuf *zmbuf, *next;
1513                 int nr_updated = 0;
1514
1515                 for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
1516                      zmbuf != NULL; zmbuf = next) {
1517                         next = TAILQ_NEXT(zmbuf, next);
1518
1519                         if (mbuf_is_consumed(zmbuf->mbuf)) {
1520                                 used_idx = vq->last_used_idx++ & (vq->size - 1);
1521                                 update_used_ring(dev, vq, used_idx,
1522                                                  zmbuf->desc_idx);
1523                                 nr_updated += 1;
1524
1525                                 TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
1526                                 restore_mbuf(zmbuf->mbuf);
1527                                 rte_pktmbuf_free(zmbuf->mbuf);
1528                                 put_zmbuf(zmbuf);
1529                                 vq->nr_zmbuf -= 1;
1530                         }
1531                 }
1532
1533                 update_used_idx(dev, vq, nr_updated);
1534         }
1535
1536         /*
1537          * Construct a RARP broadcast packet, and inject it to the "pkts"
1538          * array, to looks like that guest actually send such packet.
1539          *
1540          * Check user_send_rarp() for more information.
1541          *
1542          * broadcast_rarp shares a cacheline in the virtio_net structure
1543          * with some fields that are accessed during enqueue and
1544          * rte_atomic16_cmpset() causes a write if using cmpxchg. This could
1545          * result in false sharing between enqueue and dequeue.
1546          *
1547          * Prevent unnecessary false sharing by reading broadcast_rarp first
1548          * and only performing cmpset if the read indicates it is likely to
1549          * be set.
1550          */
1551
1552         if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
1553                         rte_atomic16_cmpset((volatile uint16_t *)
1554                                 &dev->broadcast_rarp.cnt, 1, 0))) {
1555
1556                 rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
1557                 if (rarp_mbuf == NULL) {
1558                         RTE_LOG(ERR, VHOST_DATA,
1559                                 "Failed to allocate memory for mbuf.\n");
1560                         return 0;
1561                 }
1562
1563                 if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
1564                         rte_pktmbuf_free(rarp_mbuf);
1565                         rarp_mbuf = NULL;
1566                 } else {
1567                         count -= 1;
1568                 }
1569         }
1570
1571         free_entries = *((volatile uint16_t *)&vq->avail->idx) -
1572                         vq->last_avail_idx;
1573         if (free_entries == 0)
1574                 goto out;
1575
1576         LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
1577
1578         /* Prefetch available and used ring */
1579         avail_idx = vq->last_avail_idx & (vq->size - 1);
1580         used_idx  = vq->last_used_idx  & (vq->size - 1);
1581         rte_prefetch0(&vq->avail->ring[avail_idx]);
1582         rte_prefetch0(&vq->used->ring[used_idx]);
1583
1584         count = RTE_MIN(count, MAX_PKT_BURST);
1585         count = RTE_MIN(count, free_entries);
1586         LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
1587                         dev->vid, count);
1588
1589         /* Retrieve all of the head indexes first to avoid caching issues. */
1590         for (i = 0; i < count; i++) {
1591                 avail_idx = (vq->last_avail_idx + i) & (vq->size - 1);
1592                 used_idx  = (vq->last_used_idx  + i) & (vq->size - 1);
1593                 desc_indexes[i] = vq->avail->ring[avail_idx];
1594
1595                 if (likely(dev->dequeue_zero_copy == 0))
1596                         update_used_ring(dev, vq, used_idx, desc_indexes[i]);
1597         }
1598
1599         /* Prefetch descriptor index. */
1600         rte_prefetch0(&vq->desc[desc_indexes[0]]);
1601         for (i = 0; i < count; i++) {
1602                 struct vring_desc *desc, *idesc = NULL;
1603                 uint16_t sz, idx;
1604                 uint64_t dlen;
1605                 int err;
1606
1607                 if (likely(i + 1 < count))
1608                         rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
1609
1610                 if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
1611                         dlen = vq->desc[desc_indexes[i]].len;
1612                         desc = (struct vring_desc *)(uintptr_t)
1613                                 vhost_iova_to_vva(dev, vq,
1614                                                 vq->desc[desc_indexes[i]].addr,
1615                                                 &dlen,
1616                                                 VHOST_ACCESS_RO);
1617                         if (unlikely(!desc))
1618                                 break;
1619
1620                         if (unlikely(dlen < vq->desc[desc_indexes[i]].len)) {
1621                                 /*
1622                                  * The indirect desc table is not contiguous
1623                                  * in process VA space, we have to copy it.
1624                                  */
1625                                 idesc = alloc_copy_ind_table(dev, vq,
1626                                                 &vq->desc[desc_indexes[i]]);
1627                                 if (unlikely(!idesc))
1628                                         break;
1629
1630                                 desc = idesc;
1631                         }
1632
1633                         rte_prefetch0(desc);
1634                         sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
1635                         idx = 0;
1636                 } else {
1637                         desc = vq->desc;
1638                         sz = vq->size;
1639                         idx = desc_indexes[i];
1640                 }
1641
1642                 pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
1643                 if (unlikely(pkts[i] == NULL)) {
1644                         RTE_LOG(ERR, VHOST_DATA,
1645                                 "Failed to allocate memory for mbuf.\n");
1646                         free_ind_table(idesc);
1647                         break;
1648                 }
1649
1650                 err = copy_desc_to_mbuf(dev, vq, desc, sz, pkts[i], idx,
1651                                         mbuf_pool);
1652                 if (unlikely(err)) {
1653                         rte_pktmbuf_free(pkts[i]);
1654                         free_ind_table(idesc);
1655                         break;
1656                 }
1657
1658                 if (unlikely(dev->dequeue_zero_copy)) {
1659                         struct zcopy_mbuf *zmbuf;
1660
1661                         zmbuf = get_zmbuf(vq);
1662                         if (!zmbuf) {
1663                                 rte_pktmbuf_free(pkts[i]);
1664                                 free_ind_table(idesc);
1665                                 break;
1666                         }
1667                         zmbuf->mbuf = pkts[i];
1668                         zmbuf->desc_idx = desc_indexes[i];
1669
1670                         /*
1671                          * Pin lock the mbuf; we will check later to see
1672                          * whether the mbuf is freed (when we are the last
1673                          * user) or not. If that's the case, we then could
1674                          * update the used ring safely.
1675                          */
1676                         rte_mbuf_refcnt_update(pkts[i], 1);
1677
1678                         vq->nr_zmbuf += 1;
1679                         TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
1680                 }
1681
1682                 if (unlikely(!!idesc))
1683                         free_ind_table(idesc);
1684         }
1685         vq->last_avail_idx += i;
1686
1687         if (likely(dev->dequeue_zero_copy == 0)) {
1688                 do_data_copy_dequeue(vq);
1689                 vq->last_used_idx += i;
1690                 update_used_idx(dev, vq, i);
1691         }
1692
1693 out:
1694         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1695                 vhost_user_iotlb_rd_unlock(vq);
1696
1697 out_access_unlock:
1698         rte_spinlock_unlock(&vq->access_lock);
1699
1700         if (unlikely(rarp_mbuf != NULL)) {
1701                 /*
1702                  * Inject it to the head of "pkts" array, so that switch's mac
1703                  * learning table will get updated first.
1704                  */
1705                 memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
1706                 pkts[0] = rarp_mbuf;
1707                 i += 1;
1708         }
1709
1710         return i;
1711 }