1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <stdint.h>
35 #include <stdbool.h>
36 #include <linux/virtio_net.h>
37
38 #include <rte_mbuf.h>
39 #include <rte_memcpy.h>
40 #include <rte_ether.h>
41 #include <rte_ip.h>
42 #include <rte_vhost.h>
43 #include <rte_tcp.h>
44 #include <rte_udp.h>
45 #include <rte_sctp.h>
46 #include <rte_arp.h>
47 #include <rte_spinlock.h>
48
49 #include "iotlb.h"
50 #include "vhost.h"
51
52 #define MAX_PKT_BURST 32
53
54 #define MAX_BATCH_LEN 256
55
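/*
 * A virtqueue index is valid when it is below the number of vrings and its
 * parity matches the requested direction: even indexes are guest RX rings
 * (the enqueue side below), odd indexes are guest TX rings (the dequeue
 * side).
 */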
56 static bool
57 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
58 {
59         return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
60 }
61
62 static __rte_always_inline void
63 do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
64                           uint16_t to, uint16_t from, uint16_t size)
65 {
66         rte_memcpy(&vq->used->ring[to],
67                         &vq->shadow_used_ring[from],
68                         size * sizeof(struct vring_used_elem));
69         vhost_log_used_vring(dev, vq,
70                         offsetof(struct vring_used, ring[to]),
71                         size * sizeof(struct vring_used_elem));
72 }
73
74 static __rte_always_inline void
75 flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
76 {
77         uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
78
79         if (used_idx + vq->shadow_used_idx <= vq->size) {
80                 do_flush_shadow_used_ring(dev, vq, used_idx, 0,
81                                           vq->shadow_used_idx);
82         } else {
83                 uint16_t size;
84
85                 /* flush the used ring interval [used_idx, vq->size) */
86                 size = vq->size - used_idx;
87                 do_flush_shadow_used_ring(dev, vq, used_idx, 0, size);
88
89                 /* flush the remaining interval [0, shadow_used_idx - size) */
90                 do_flush_shadow_used_ring(dev, vq, 0, size,
91                                           vq->shadow_used_idx - size);
92         }
93         vq->last_used_idx += vq->shadow_used_idx;
94
95         rte_smp_wmb();
96
97         *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
98         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
99                 sizeof(vq->used->idx));
100 }
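/*
 * Worked example (assumed numbers): with vq->size = 256, last_used_idx = 250
 * and shadow_used_idx = 10, the first flush above copies shadow entries
 * [0, 6) into used ring slots [250, 256) and the second copies entries
 * [6, 10) into slots [0, 4).
 */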
101
102 static __rte_always_inline void
103 update_shadow_used_ring(struct vhost_virtqueue *vq,
104                          uint16_t desc_idx, uint16_t len)
105 {
106         uint16_t i = vq->shadow_used_idx++;
107
108         vq->shadow_used_ring[i].id  = desc_idx;
109         vq->shadow_used_ring[i].len = len;
110 }
111
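/*
 * Copies of at most MAX_BATCH_LEN bytes are not performed immediately in the
 * enqueue/dequeue loops; they are queued in vq->batch_copy_elems and replayed
 * in one pass by the two helpers below.  do_data_copy_enqueue() also takes
 * care of dirty-page logging for the queued copies.
 */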
112 static inline void
113 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
114 {
115         struct batch_copy_elem *elem = vq->batch_copy_elems;
116         uint16_t count = vq->batch_copy_nb_elems;
117         int i;
118
119         for (i = 0; i < count; i++) {
120                 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
121                 vhost_log_write(dev, elem[i].log_addr, elem[i].len);
122                 PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
123         }
124 }
125
126 static inline void
127 do_data_copy_dequeue(struct vhost_virtqueue *vq)
128 {
129         struct batch_copy_elem *elem = vq->batch_copy_elems;
130         uint16_t count = vq->batch_copy_nb_elems;
131         int i;
132
133         for (i = 0; i < count; i++)
134                 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
135 }
136
137 /* avoid the write operation when it is not needed, to lessen cache issues */
138 #define ASSIGN_UNLESS_EQUAL(var, val) do {      \
139         if ((var) != (val))                     \
140                 (var) = (val);                  \
141 } while (0)
142
143 static void
144 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
145 {
146         uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
147
148         if (m_buf->ol_flags & PKT_TX_TCP_SEG)
149                 csum_l4 |= PKT_TX_TCP_CKSUM;
150
151         if (csum_l4) {
152                 net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
153                 net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
154
155                 switch (csum_l4) {
156                 case PKT_TX_TCP_CKSUM:
157                         net_hdr->csum_offset = (offsetof(struct tcp_hdr,
158                                                 cksum));
159                         break;
160                 case PKT_TX_UDP_CKSUM:
161                         net_hdr->csum_offset = (offsetof(struct udp_hdr,
162                                                 dgram_cksum));
163                         break;
164                 case PKT_TX_SCTP_CKSUM:
165                         net_hdr->csum_offset = (offsetof(struct sctp_hdr,
166                                                 cksum));
167                         break;
168                 }
169         } else {
170                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
171                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
172                 ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
173         }
174
175         /* The virtio-net header cannot request IP cksum offload, so compute it here */
176         if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
177                 struct ipv4_hdr *ipv4_hdr;
178
179                 ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct ipv4_hdr *,
180                                                    m_buf->l2_len);
181                 ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
182         }
183
184         if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
185                 if (m_buf->ol_flags & PKT_TX_IPV4)
186                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
187                 else
188                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
189                 net_hdr->gso_size = m_buf->tso_segsz;
190                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
191                                         + m_buf->l4_len;
192         } else {
193                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
194                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
195                 ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
196         }
197 }
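/*
 * Minimal sketch of the header produced above for a TCP checksum offload
 * request; the helper name and the fixed header lengths are illustrative
 * assumptions.
 */
static __rte_unused void
example_tcp_csum_offload(struct rte_mbuf *m, struct virtio_net_hdr *net_hdr)
{
	m->l2_len = sizeof(struct ether_hdr);
	m->l3_len = sizeof(struct ipv4_hdr);
	m->ol_flags |= PKT_TX_IPV4 | PKT_TX_TCP_CKSUM;

	virtio_enqueue_offload(m, net_hdr);
	/*
	 * Resulting header: flags == VIRTIO_NET_HDR_F_NEEDS_CSUM,
	 * csum_start == l2_len + l3_len,
	 * csum_offset == offsetof(struct tcp_hdr, cksum).
	 */
}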
198
199 static __rte_always_inline int
200 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
201                   struct vring_desc *descs, struct rte_mbuf *m,
202                   uint16_t desc_idx, uint32_t size)
203 {
204         uint32_t desc_avail, desc_offset;
205         uint32_t mbuf_avail, mbuf_offset;
206         uint32_t cpy_len;
207         struct vring_desc *desc;
208         uint64_t desc_addr;
209         /* A counter to guard against an endless (looping) desc chain */
210         uint16_t nr_desc = 1;
211         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
212         uint16_t copy_nb = vq->batch_copy_nb_elems;
213         int error = 0;
214
215         desc = &descs[desc_idx];
216         desc_addr = vhost_iova_to_vva(dev, vq, desc->addr,
217                                         desc->len, VHOST_ACCESS_RW);
218         /*
219          * The check on 'desc_addr' is placed outside the 'unlikely' macro to avoid
220          * a performance issue with some versions of gcc (4.8.4 and 5.3.0), which
221          * otherwise store the offset on the stack instead of in a register.
222          */
223         if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr) {
224                 error = -1;
225                 goto out;
226         }
227
228         rte_prefetch0((void *)(uintptr_t)desc_addr);
229
230         virtio_enqueue_offload(m, (struct virtio_net_hdr *)(uintptr_t)desc_addr);
231         vhost_log_write(dev, desc->addr, dev->vhost_hlen);
232         PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
233
234         desc_offset = dev->vhost_hlen;
235         desc_avail  = desc->len - dev->vhost_hlen;
236
237         mbuf_avail  = rte_pktmbuf_data_len(m);
238         mbuf_offset = 0;
239         while (mbuf_avail != 0 || m->next != NULL) {
240                 /* done with current mbuf, fetch next */
241                 if (mbuf_avail == 0) {
242                         m = m->next;
243
244                         mbuf_offset = 0;
245                         mbuf_avail  = rte_pktmbuf_data_len(m);
246                 }
247
248                 /* done with current desc buf, fetch next */
249                 if (desc_avail == 0) {
250                         if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
251                         /* Not enough room left in the vring buffer */
252                                 error = -1;
253                                 goto out;
254                         }
255                         if (unlikely(desc->next >= size || ++nr_desc > size)) {
256                                 error = -1;
257                                 goto out;
258                         }
259
260                         desc = &descs[desc->next];
261                         desc_addr = vhost_iova_to_vva(dev, vq, desc->addr,
262                                                         desc->len,
263                                                         VHOST_ACCESS_RW);
264                         if (unlikely(!desc_addr)) {
265                                 error = -1;
266                                 goto out;
267                         }
268
269                         desc_offset = 0;
270                         desc_avail  = desc->len;
271                 }
272
273                 cpy_len = RTE_MIN(desc_avail, mbuf_avail);
274                 if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) {
275                         rte_memcpy((void *)((uintptr_t)(desc_addr +
276                                                         desc_offset)),
277                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
278                                 cpy_len);
279                         vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
280                         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
281                                      cpy_len, 0);
282                 } else {
283                         batch_copy[copy_nb].dst =
284                                 (void *)((uintptr_t)(desc_addr + desc_offset));
285                         batch_copy[copy_nb].src =
286                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
287                         batch_copy[copy_nb].log_addr = desc->addr + desc_offset;
288                         batch_copy[copy_nb].len = cpy_len;
289                         copy_nb++;
290                 }
291
292                 mbuf_avail  -= cpy_len;
293                 mbuf_offset += cpy_len;
294                 desc_avail  -= cpy_len;
295                 desc_offset += cpy_len;
296         }
297
298 out:
299         vq->batch_copy_nb_elems = copy_nb;
300
301         return error;
302 }
303
304 /**
305  * This function adds buffers to the virtio device's RX virtqueue. Buffers can
306  * be received from the physical port or from another virtio device. A packet
307  * count is returned to indicate the number of packets that are successfully
308  * added to the RX queue. This function works when the mbuf is scattered, but
309  * it doesn't support the mergeable feature.
310  */
311 static __rte_always_inline uint32_t
312 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
313               struct rte_mbuf **pkts, uint32_t count)
314 {
315         struct vhost_virtqueue *vq;
316         uint16_t avail_idx, free_entries, start_idx;
317         uint16_t desc_indexes[MAX_PKT_BURST];
318         struct vring_desc *descs;
319         uint16_t used_idx;
320         uint32_t i, sz;
321
322         LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
323         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
324                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
325                         dev->vid, __func__, queue_id);
326                 return 0;
327         }
328
329         vq = dev->virtqueue[queue_id];
330
331         rte_spinlock_lock(&vq->access_lock);
332
333         if (unlikely(vq->enabled == 0))
334                 goto out_access_unlock;
335
336         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
337                 vhost_user_iotlb_rd_lock(vq);
338
339         if (unlikely(vq->access_ok == 0)) {
340                 if (unlikely(vring_translate(dev, vq) < 0)) {
341                         count = 0;
342                         goto out;
343                 }
344         }
345
346         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
347         start_idx = vq->last_used_idx;
348         free_entries = avail_idx - start_idx;
349         count = RTE_MIN(count, free_entries);
350         count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
351         if (count == 0)
352                 goto out;
353
354         LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
355                 dev->vid, start_idx, start_idx + count);
356
357         vq->batch_copy_nb_elems = 0;
358
359         /* Retrieve all of the desc indexes first to avoid caching issues. */
360         rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
361         for (i = 0; i < count; i++) {
362                 used_idx = (start_idx + i) & (vq->size - 1);
363                 desc_indexes[i] = vq->avail->ring[used_idx];
364                 vq->used->ring[used_idx].id = desc_indexes[i];
365                 vq->used->ring[used_idx].len = pkts[i]->pkt_len +
366                                                dev->vhost_hlen;
367                 vhost_log_used_vring(dev, vq,
368                         offsetof(struct vring_used, ring[used_idx]),
369                         sizeof(vq->used->ring[used_idx]));
370         }
371
372         rte_prefetch0(&vq->desc[desc_indexes[0]]);
373         for (i = 0; i < count; i++) {
374                 uint16_t desc_idx = desc_indexes[i];
375                 int err;
376
377                 if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) {
378                         descs = (struct vring_desc *)(uintptr_t)
379                                 vhost_iova_to_vva(dev,
380                                                 vq, vq->desc[desc_idx].addr,
381                                                 vq->desc[desc_idx].len,
382                                                 VHOST_ACCESS_RO);
383                         if (unlikely(!descs)) {
384                                 count = i;
385                                 break;
386                         }
387
388                         sz = vq->desc[desc_idx].len / sizeof(*descs);
389                         desc_idx = 0;
390                 } else {
391                         descs = vq->desc;
392                         sz = vq->size;
393                 }
394
395                 err = copy_mbuf_to_desc(dev, vq, descs, pkts[i], desc_idx, sz);
396                 if (unlikely(err)) {
397                         count = i;
398                         break;
399                 }
400
401                 if (i + 1 < count)
402                         rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
403         }
404
405         do_data_copy_enqueue(dev, vq);
406
407         rte_smp_wmb();
408
409         *(volatile uint16_t *)&vq->used->idx += count;
410         vq->last_used_idx += count;
411         vhost_log_used_vring(dev, vq,
412                 offsetof(struct vring_used, idx),
413                 sizeof(vq->used->idx));
414
415         /* flush used->idx update before we read avail->flags. */
416         rte_mb();
417
418         /* Kick the guest if necessary. */
419         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
420                         && (vq->callfd >= 0))
421                 eventfd_write(vq->callfd, (eventfd_t)1);
422 out:
423         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
424                 vhost_user_iotlb_rd_unlock(vq);
425
426 out_access_unlock:
427         rte_spinlock_unlock(&vq->access_lock);
428
429         return count;
430 }
431
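/*
 * Gather the descriptor chain that the avail ring entry at 'avail_idx'
 * points to into 'buf_vec', switching to the indirect table when
 * VRING_DESC_F_INDIRECT is set and following VRING_DESC_F_NEXT links, and
 * report the chain head index and its total length in bytes.
 */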
432 static __rte_always_inline int
433 fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
434                          uint32_t avail_idx, uint32_t *vec_idx,
435                          struct buf_vector *buf_vec, uint16_t *desc_chain_head,
436                          uint16_t *desc_chain_len)
437 {
438         uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
439         uint32_t vec_id = *vec_idx;
440         uint32_t len    = 0;
441         struct vring_desc *descs = vq->desc;
442
443         *desc_chain_head = idx;
444
445         if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
446                 descs = (struct vring_desc *)(uintptr_t)
447                         vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
448                                                 vq->desc[idx].len,
449                                                 VHOST_ACCESS_RO);
450                 if (unlikely(!descs))
451                         return -1;
452
453                 idx = 0;
454         }
455
456         while (1) {
457                 if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
458                         return -1;
459
460                 len += descs[idx].len;
461                 buf_vec[vec_id].buf_addr = descs[idx].addr;
462                 buf_vec[vec_id].buf_len  = descs[idx].len;
463                 buf_vec[vec_id].desc_idx = idx;
464                 vec_id++;
465
466                 if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
467                         break;
468
469                 idx = descs[idx].next;
470         }
471
472         *desc_chain_len = len;
473         *vec_idx = vec_id;
474
475         return 0;
476 }
477
478 /*
479  * Returns -1 on failure, 0 on success
480  */
481 static inline int
482 reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
483                                 uint32_t size, struct buf_vector *buf_vec,
484                                 uint16_t *num_buffers, uint16_t avail_head)
485 {
486         uint16_t cur_idx;
487         uint32_t vec_idx = 0;
488         uint16_t tries = 0;
489
490         uint16_t head_idx = 0;
491         uint16_t len = 0;
492
493         *num_buffers = 0;
494         cur_idx  = vq->last_avail_idx;
495
496         while (size > 0) {
497                 if (unlikely(cur_idx == avail_head))
498                         return -1;
499
500                 if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec,
501                                                 &head_idx, &len) < 0))
502                         return -1;
503                 len = RTE_MIN(len, size);
504                 update_shadow_used_ring(vq, head_idx, len);
505                 size -= len;
506
507                 cur_idx++;
508                 tries++;
509                 *num_buffers += 1;
510
511                 /*
512                  * If we have tried every available ring entry and still
513                  * cannot get enough buffers, something abnormal has
514                  * happened.
515                  */
516                 if (unlikely(tries >= vq->size))
517                         return -1;
518         }
519
520         return 0;
521 }
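/*
 * Worked example (assumed numbers): with 2048-byte guest buffers and a
 * request of size 3000 (packet length plus virtio-net header), the loop
 * above reserves two descriptor chains, records lengths 2048 and 952 in the
 * shadow used ring, and returns with *num_buffers == 2.
 */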
522
523 static __rte_always_inline int
524 copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
525                             struct rte_mbuf *m, struct buf_vector *buf_vec,
526                             uint16_t num_buffers)
527 {
528         uint32_t vec_idx = 0;
529         uint64_t desc_addr;
530         uint32_t mbuf_offset, mbuf_avail;
531         uint32_t desc_offset, desc_avail;
532         uint32_t cpy_len;
533         uint64_t hdr_addr, hdr_phys_addr;
534         struct rte_mbuf *hdr_mbuf;
535         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
536         uint16_t copy_nb = vq->batch_copy_nb_elems;
537         int error = 0;
538
539         if (unlikely(m == NULL)) {
540                 error = -1;
541                 goto out;
542         }
543
544         desc_addr = vhost_iova_to_vva(dev, vq, buf_vec[vec_idx].buf_addr,
545                                                 buf_vec[vec_idx].buf_len,
546                                                 VHOST_ACCESS_RW);
547         if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr) {
548                 error = -1;
549                 goto out;
550         }
551
552         hdr_mbuf = m;
553         hdr_addr = desc_addr;
554         hdr_phys_addr = buf_vec[vec_idx].buf_addr;
555         rte_prefetch0((void *)(uintptr_t)hdr_addr);
556
557         LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
558                 dev->vid, num_buffers);
559
560         desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
561         desc_offset = dev->vhost_hlen;
562
563         mbuf_avail  = rte_pktmbuf_data_len(m);
564         mbuf_offset = 0;
565         while (mbuf_avail != 0 || m->next != NULL) {
566                 /* done with current desc buf, get the next one */
567                 if (desc_avail == 0) {
568                         vec_idx++;
569                         desc_addr =
570                                 vhost_iova_to_vva(dev, vq,
571                                         buf_vec[vec_idx].buf_addr,
572                                         buf_vec[vec_idx].buf_len,
573                                         VHOST_ACCESS_RW);
574                         if (unlikely(!desc_addr)) {
575                                 error = -1;
576                                 goto out;
577                         }
578
579                         /* Prefetch buffer address. */
580                         rte_prefetch0((void *)(uintptr_t)desc_addr);
581                         desc_offset = 0;
582                         desc_avail  = buf_vec[vec_idx].buf_len;
583                 }
584
585                 /* done with current mbuf, get the next one */
586                 if (mbuf_avail == 0) {
587                         m = m->next;
588
589                         mbuf_offset = 0;
590                         mbuf_avail  = rte_pktmbuf_data_len(m);
591                 }
592
593                 if (hdr_addr) {
594                         struct virtio_net_hdr_mrg_rxbuf *hdr;
595
596                         hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)
597                                 hdr_addr;
598                         virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
599                         ASSIGN_UNLESS_EQUAL(hdr->num_buffers, num_buffers);
600
601                         vhost_log_write(dev, hdr_phys_addr, dev->vhost_hlen);
602                         PRINT_PACKET(dev, (uintptr_t)hdr_addr,
603                                      dev->vhost_hlen, 0);
604
605                         hdr_addr = 0;
606                 }
607
608                 cpy_len = RTE_MIN(desc_avail, mbuf_avail);
609
610                 if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) {
611                         rte_memcpy((void *)((uintptr_t)(desc_addr +
612                                                         desc_offset)),
613                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
614                                 cpy_len);
615                         vhost_log_write(dev,
616                                 buf_vec[vec_idx].buf_addr + desc_offset,
617                                 cpy_len);
618                         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
619                                 cpy_len, 0);
620                 } else {
621                         batch_copy[copy_nb].dst =
622                                 (void *)((uintptr_t)(desc_addr + desc_offset));
623                         batch_copy[copy_nb].src =
624                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
625                         batch_copy[copy_nb].log_addr =
626                                 buf_vec[vec_idx].buf_addr + desc_offset;
627                         batch_copy[copy_nb].len = cpy_len;
628                         copy_nb++;
629                 }
630
631                 mbuf_avail  -= cpy_len;
632                 mbuf_offset += cpy_len;
633                 desc_avail  -= cpy_len;
634                 desc_offset += cpy_len;
635         }
636
637 out:
638         vq->batch_copy_nb_elems = copy_nb;
639
640         return error;
641 }
642
643 static __rte_always_inline uint32_t
644 virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
645         struct rte_mbuf **pkts, uint32_t count)
646 {
647         struct vhost_virtqueue *vq;
648         uint32_t pkt_idx = 0;
649         uint16_t num_buffers;
650         struct buf_vector buf_vec[BUF_VECTOR_MAX];
651         uint16_t avail_head;
652
653         LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
654         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
655                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
656                         dev->vid, __func__, queue_id);
657                 return 0;
658         }
659
660         vq = dev->virtqueue[queue_id];
661
662         rte_spinlock_lock(&vq->access_lock);
663
664         if (unlikely(vq->enabled == 0))
665                 goto out_access_unlock;
666
667         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
668                 vhost_user_iotlb_rd_lock(vq);
669
670         if (unlikely(vq->access_ok == 0))
671                 if (unlikely(vring_translate(dev, vq) < 0))
672                         goto out;
673
674         count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
675         if (count == 0)
676                 goto out;
677
678         vq->batch_copy_nb_elems = 0;
679
680         rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
681
682         vq->shadow_used_idx = 0;
683         avail_head = *((volatile uint16_t *)&vq->avail->idx);
684         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
685                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
686
687                 if (unlikely(reserve_avail_buf_mergeable(dev, vq,
688                                                 pkt_len, buf_vec, &num_buffers,
689                                                 avail_head) < 0)) {
690                         LOG_DEBUG(VHOST_DATA,
691                                 "(%d) failed to get enough desc from vring\n",
692                                 dev->vid);
693                         vq->shadow_used_idx -= num_buffers;
694                         break;
695                 }
696
697                 LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
698                         dev->vid, vq->last_avail_idx,
699                         vq->last_avail_idx + num_buffers);
700
701                 if (copy_mbuf_to_desc_mergeable(dev, vq, pkts[pkt_idx],
702                                                 buf_vec, num_buffers) < 0) {
703                         vq->shadow_used_idx -= num_buffers;
704                         break;
705                 }
706
707                 vq->last_avail_idx += num_buffers;
708         }
709
710         do_data_copy_enqueue(dev, vq);
711
712         if (likely(vq->shadow_used_idx)) {
713                 flush_shadow_used_ring(dev, vq);
714
715                 /* flush used->idx update before we read avail->flags. */
716                 rte_mb();
717
718                 /* Kick the guest if necessary. */
719                 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
720                                 && (vq->callfd >= 0))
721                         eventfd_write(vq->callfd, (eventfd_t)1);
722         }
723
724 out:
725         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
726                 vhost_user_iotlb_rd_unlock(vq);
727
728 out_access_unlock:
729         rte_spinlock_unlock(&vq->access_lock);
730
731         return pkt_idx;
732 }
733
734 uint16_t
735 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
736         struct rte_mbuf **pkts, uint16_t count)
737 {
738         struct virtio_net *dev = get_device(vid);
739
740         if (!dev)
741                 return 0;
742
743         if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
744                 return virtio_dev_merge_rx(dev, queue_id, pkts, count);
745         else
746                 return virtio_dev_rx(dev, queue_id, pkts, count);
747 }
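/*
 * Minimal usage sketch for the enqueue API above.  The enqueue path copies
 * packet data into guest buffers, so the caller keeps ownership of every
 * mbuf and frees them once the call returns; the helper name is an
 * illustrative assumption, and queue_id must refer to a guest RX ring
 * (even index).
 */
static __rte_unused uint16_t
example_vhost_tx_burst(int vid, uint16_t queue_id,
		       struct rte_mbuf **pkts, uint16_t nb_pkts)
{
	uint16_t sent = rte_vhost_enqueue_burst(vid, queue_id, pkts, nb_pkts);
	uint16_t i;

	for (i = 0; i < nb_pkts; i++)
		rte_pktmbuf_free(pkts[i]);

	return sent;
}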
748
749 static inline bool
750 virtio_net_with_host_offload(struct virtio_net *dev)
751 {
752         if (dev->features &
753                         ((1ULL << VIRTIO_NET_F_CSUM) |
754                          (1ULL << VIRTIO_NET_F_HOST_ECN) |
755                          (1ULL << VIRTIO_NET_F_HOST_TSO4) |
756                          (1ULL << VIRTIO_NET_F_HOST_TSO6) |
757                          (1ULL << VIRTIO_NET_F_HOST_UFO)))
758                 return true;
759
760         return false;
761 }
762
763 static void
764 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
765 {
766         struct ipv4_hdr *ipv4_hdr;
767         struct ipv6_hdr *ipv6_hdr;
768         void *l3_hdr = NULL;
769         struct ether_hdr *eth_hdr;
770         uint16_t ethertype;
771
772         eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
773
774         m->l2_len = sizeof(struct ether_hdr);
775         ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
776
777         if (ethertype == ETHER_TYPE_VLAN) {
778                 struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
779
780                 m->l2_len += sizeof(struct vlan_hdr);
781                 ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
782         }
783
784         l3_hdr = (char *)eth_hdr + m->l2_len;
785
786         switch (ethertype) {
787         case ETHER_TYPE_IPv4:
788                 ipv4_hdr = l3_hdr;
789                 *l4_proto = ipv4_hdr->next_proto_id;
790                 m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
791                 *l4_hdr = (char *)l3_hdr + m->l3_len;
792                 m->ol_flags |= PKT_TX_IPV4;
793                 break;
794         case ETHER_TYPE_IPv6:
795                 ipv6_hdr = l3_hdr;
796                 *l4_proto = ipv6_hdr->proto;
797                 m->l3_len = sizeof(struct ipv6_hdr);
798                 *l4_hdr = (char *)l3_hdr + m->l3_len;
799                 m->ol_flags |= PKT_TX_IPV6;
800                 break;
801         default:
802                 m->l3_len = 0;
803                 *l4_proto = 0;
804                 *l4_hdr = NULL;
805                 break;
806         }
807 }
808
809 static __rte_always_inline void
810 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
811 {
812         uint16_t l4_proto = 0;
813         void *l4_hdr = NULL;
814         struct tcp_hdr *tcp_hdr = NULL;
815
816         if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
817                 return;
818
819         parse_ethernet(m, &l4_proto, &l4_hdr);
820         if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
821                 if (hdr->csum_start == (m->l2_len + m->l3_len)) {
822                         switch (hdr->csum_offset) {
823                         case (offsetof(struct tcp_hdr, cksum)):
824                                 if (l4_proto == IPPROTO_TCP)
825                                         m->ol_flags |= PKT_TX_TCP_CKSUM;
826                                 break;
827                         case (offsetof(struct udp_hdr, dgram_cksum)):
828                                 if (l4_proto == IPPROTO_UDP)
829                                         m->ol_flags |= PKT_TX_UDP_CKSUM;
830                                 break;
831                         case (offsetof(struct sctp_hdr, cksum)):
832                                 if (l4_proto == IPPROTO_SCTP)
833                                         m->ol_flags |= PKT_TX_SCTP_CKSUM;
834                                 break;
835                         default:
836                                 break;
837                         }
838                 }
839         }
840
841         if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
842                 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
843                 case VIRTIO_NET_HDR_GSO_TCPV4:
844                 case VIRTIO_NET_HDR_GSO_TCPV6:
845                         tcp_hdr = l4_hdr;
846                         m->ol_flags |= PKT_TX_TCP_SEG;
847                         m->tso_segsz = hdr->gso_size;
848                         m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
849                         break;
850                 default:
851                         RTE_LOG(WARNING, VHOST_DATA,
852                                 "unsupported gso type %u.\n", hdr->gso_type);
853                         break;
854                 }
855         }
856 }
857
858 #define RARP_PKT_SIZE   64
859
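/*
 * Build a gratuitous RARP announcement: broadcast destination, the guest
 * MAC as source and as both sender/target hardware address.  It is injected
 * into the dequeue path after live migration (see the broadcast_rarp
 * handling in rte_vhost_dequeue_burst() below) so that switches relearn
 * where the guest MAC now lives.
 */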
860 static int
861 make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
862 {
863         struct ether_hdr *eth_hdr;
864         struct arp_hdr  *rarp;
865
866         if (rarp_mbuf->buf_len < RARP_PKT_SIZE) {
867                 RTE_LOG(WARNING, VHOST_DATA,
868                         "failed to make RARP; mbuf size too small %u (< %d)\n",
869                         rarp_mbuf->buf_len, RARP_PKT_SIZE);
870                 return -1;
871         }
872
873         /* Ethernet header. */
874         eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
875         memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
876         ether_addr_copy(mac, &eth_hdr->s_addr);
877         eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
878
879         /* RARP header. */
880         rarp = (struct arp_hdr *)(eth_hdr + 1);
881         rarp->arp_hrd = htons(ARP_HRD_ETHER);
882         rarp->arp_pro = htons(ETHER_TYPE_IPv4);
883         rarp->arp_hln = ETHER_ADDR_LEN;
884         rarp->arp_pln = 4;
885         rarp->arp_op  = htons(ARP_OP_REVREQUEST);
886
887         ether_addr_copy(mac, &rarp->arp_data.arp_sha);
888         ether_addr_copy(mac, &rarp->arp_data.arp_tha);
889         memset(&rarp->arp_data.arp_sip, 0x00, 4);
890         memset(&rarp->arp_data.arp_tip, 0x00, 4);
891
892         rarp_mbuf->pkt_len  = rarp_mbuf->data_len = RARP_PKT_SIZE;
893
894         return 0;
895 }
896
897 static __rte_always_inline void
898 put_zmbuf(struct zcopy_mbuf *zmbuf)
899 {
900         zmbuf->in_use = 0;
901 }
902
903 static __rte_always_inline int
904 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
905                   struct vring_desc *descs, uint16_t max_desc,
906                   struct rte_mbuf *m, uint16_t desc_idx,
907                   struct rte_mempool *mbuf_pool)
908 {
909         struct vring_desc *desc;
910         uint64_t desc_addr;
911         uint32_t desc_avail, desc_offset;
912         uint32_t mbuf_avail, mbuf_offset;
913         uint32_t cpy_len;
914         struct rte_mbuf *cur = m, *prev = m;
915         struct virtio_net_hdr *hdr = NULL;
916         /* A counter to guard against an endless (looping) desc chain */
917         uint32_t nr_desc = 1;
918         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
919         uint16_t copy_nb = vq->batch_copy_nb_elems;
920         int error = 0;
921
922         desc = &descs[desc_idx];
923         if (unlikely((desc->len < dev->vhost_hlen)) ||
924                         (desc->flags & VRING_DESC_F_INDIRECT)) {
925                 error = -1;
926                 goto out;
927         }
928
929         desc_addr = vhost_iova_to_vva(dev,
930                                         vq, desc->addr,
931                                         desc->len,
932                                         VHOST_ACCESS_RO);
933         if (unlikely(!desc_addr)) {
934                 error = -1;
935                 goto out;
936         }
937
938         if (virtio_net_with_host_offload(dev)) {
939                 hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
940                 rte_prefetch0(hdr);
941         }
942
943         /*
944          * A virtio driver normally uses at least 2 desc buffers
945          * for Tx: the first for storing the header, and others
946          * for storing the data.
947          */
948         if (likely((desc->len == dev->vhost_hlen) &&
949                    (desc->flags & VRING_DESC_F_NEXT) != 0)) {
950                 desc = &descs[desc->next];
951                 if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) {
952                         error = -1;
953                         goto out;
954                 }
955
956                 desc_addr = vhost_iova_to_vva(dev,
957                                                         vq, desc->addr,
958                                                         desc->len,
959                                                         VHOST_ACCESS_RO);
960                 if (unlikely(!desc_addr)) {
961                         error = -1;
962                         goto out;
963                 }
964
965                 desc_offset = 0;
966                 desc_avail  = desc->len;
967                 nr_desc    += 1;
968         } else {
969                 desc_avail  = desc->len - dev->vhost_hlen;
970                 desc_offset = dev->vhost_hlen;
971         }
972
973         rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset));
974
975         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), desc_avail, 0);
976
977         mbuf_offset = 0;
978         mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
979         while (1) {
980                 uint64_t hpa;
981
982                 cpy_len = RTE_MIN(desc_avail, mbuf_avail);
983
984                 /*
985                  * A desc buf might span two host physical pages that are not
986                  * contiguous. In that case (gpa_to_hpa returns 0), the data
987                  * will be copied even though zero copy is enabled.
988                  */
989                 if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
990                                         desc->addr + desc_offset, cpy_len)))) {
991                         cur->data_len = cpy_len;
992                         cur->data_off = 0;
993                         cur->buf_addr = (void *)(uintptr_t)(desc_addr
994                                 + desc_offset);
995                         cur->buf_iova = hpa;
996
997                         /*
998                          * In zero copy mode, one mbuf can only reference data
999                          * for one desc buf, or part of one desc buf.
1000                          */
1001                         mbuf_avail = cpy_len;
1002                 } else {
1003                         if (likely(cpy_len > MAX_BATCH_LEN ||
1004                                    copy_nb >= vq->size ||
1005                                    (hdr && cur == m))) {
1006                                 rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
1007                                                                    mbuf_offset),
1008                                            (void *)((uintptr_t)(desc_addr +
1009                                                                 desc_offset)),
1010                                            cpy_len);
1011                         } else {
1012                                 batch_copy[copy_nb].dst =
1013                                         rte_pktmbuf_mtod_offset(cur, void *,
1014                                                                 mbuf_offset);
1015                                 batch_copy[copy_nb].src =
1016                                         (void *)((uintptr_t)(desc_addr +
1017                                                              desc_offset));
1018                                 batch_copy[copy_nb].len = cpy_len;
1019                                 copy_nb++;
1020                         }
1021                 }
1022
1023                 mbuf_avail  -= cpy_len;
1024                 mbuf_offset += cpy_len;
1025                 desc_avail  -= cpy_len;
1026                 desc_offset += cpy_len;
1027
1028                 /* This desc has been fully consumed, get the next one */
1029                 if (desc_avail == 0) {
1030                         if ((desc->flags & VRING_DESC_F_NEXT) == 0)
1031                                 break;
1032
1033                         if (unlikely(desc->next >= max_desc ||
1034                                      ++nr_desc > max_desc)) {
1035                                 error = -1;
1036                                 goto out;
1037                         }
1038                         desc = &descs[desc->next];
1039                         if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) {
1040                                 error = -1;
1041                                 goto out;
1042                         }
1043
1044                         desc_addr = vhost_iova_to_vva(dev,
1045                                                         vq, desc->addr,
1046                                                         desc->len,
1047                                                         VHOST_ACCESS_RO);
1048                         if (unlikely(!desc_addr)) {
1049                                 error = -1;
1050                                 goto out;
1051                         }
1052
1053                         rte_prefetch0((void *)(uintptr_t)desc_addr);
1054
1055                         desc_offset = 0;
1056                         desc_avail  = desc->len;
1057
1058                         PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
1059                 }
1060
1061                 /*
1062                  * This mbuf is full, allocate a new one
1063                  * to hold more data.
1064                  */
1065                 if (mbuf_avail == 0) {
1066                         cur = rte_pktmbuf_alloc(mbuf_pool);
1067                         if (unlikely(cur == NULL)) {
1068                                 RTE_LOG(ERR, VHOST_DATA, "Failed to "
1069                                         "allocate memory for mbuf.\n");
1070                                 error = -1;
1071                                 goto out;
1072                         }
1073                         if (unlikely(dev->dequeue_zero_copy))
1074                                 rte_mbuf_refcnt_update(cur, 1);
1075
1076                         prev->next = cur;
1077                         prev->data_len = mbuf_offset;
1078                         m->nb_segs += 1;
1079                         m->pkt_len += mbuf_offset;
1080                         prev = cur;
1081
1082                         mbuf_offset = 0;
1083                         mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
1084                 }
1085         }
1086
1087         prev->data_len = mbuf_offset;
1088         m->pkt_len    += mbuf_offset;
1089
1090         if (hdr)
1091                 vhost_dequeue_offload(hdr, m);
1092
1093 out:
1094         vq->batch_copy_nb_elems = copy_nb;
1095
1096         return error;
1097 }
1098
1099 static __rte_always_inline void
1100 update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
1101                  uint32_t used_idx, uint32_t desc_idx)
1102 {
1103         vq->used->ring[used_idx].id  = desc_idx;
1104         vq->used->ring[used_idx].len = 0;
1105         vhost_log_used_vring(dev, vq,
1106                         offsetof(struct vring_used, ring[used_idx]),
1107                         sizeof(vq->used->ring[used_idx]));
1108 }
1109
1110 static __rte_always_inline void
1111 update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
1112                 uint32_t count)
1113 {
1114         if (unlikely(count == 0))
1115                 return;
1116
1117         rte_smp_wmb();
1118         rte_smp_rmb();
1119
1120         vq->used->idx += count;
1121         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
1122                         sizeof(vq->used->idx));
1123
1124         /* Kick guest if required. */
1125         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
1126                         && (vq->callfd >= 0))
1127                 eventfd_write(vq->callfd, (eventfd_t)1);
1128 }
1129
1130 static __rte_always_inline struct zcopy_mbuf *
1131 get_zmbuf(struct vhost_virtqueue *vq)
1132 {
1133         uint16_t i;
1134         uint16_t last;
1135         int tries = 0;
1136
1137         /* search [last_zmbuf_idx, zmbuf_size) */
1138         i = vq->last_zmbuf_idx;
1139         last = vq->zmbuf_size;
1140
1141 again:
1142         for (; i < last; i++) {
1143                 if (vq->zmbufs[i].in_use == 0) {
1144                         vq->last_zmbuf_idx = i + 1;
1145                         vq->zmbufs[i].in_use = 1;
1146                         return &vq->zmbufs[i];
1147                 }
1148         }
1149
1150         tries++;
1151         if (tries == 1) {
1152                 /* search [0, last_zmbuf_idx) */
1153                 i = 0;
1154                 last = vq->last_zmbuf_idx;
1155                 goto again;
1156         }
1157
1158         return NULL;
1159 }
1160
1161 static __rte_always_inline bool
1162 mbuf_is_consumed(struct rte_mbuf *m)
1163 {
1164         while (m) {
1165                 if (rte_mbuf_refcnt_read(m) > 1)
1166                         return false;
1167                 m = m->next;
1168         }
1169
1170         return true;
1171 }
1172
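/*
 * Zero-copy dequeue pointed buf_addr/buf_iova at guest memory; before such
 * an mbuf goes back to its pool, point them back at the mbuf's own data
 * room, which starts right after the rte_mbuf structure and its private
 * area.
 */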
1173 static __rte_always_inline void
1174 restore_mbuf(struct rte_mbuf *m)
1175 {
1176         uint32_t mbuf_size, priv_size;
1177
1178         while (m) {
1179                 priv_size = rte_pktmbuf_priv_size(m->pool);
1180                 mbuf_size = sizeof(struct rte_mbuf) + priv_size;
1181                 /* start of buffer is after mbuf structure and priv data */
1182
1183                 m->buf_addr = (char *)m + mbuf_size;
1184                 m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
1185                 m = m->next;
1186         }
1187 }
1188
1189 uint16_t
1190 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
1191         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
1192 {
1193         struct virtio_net *dev;
1194         struct rte_mbuf *rarp_mbuf = NULL;
1195         struct vhost_virtqueue *vq;
1196         uint32_t desc_indexes[MAX_PKT_BURST];
1197         uint32_t used_idx;
1198         uint32_t i = 0;
1199         uint16_t free_entries;
1200         uint16_t avail_idx;
1201
1202         dev = get_device(vid);
1203         if (!dev)
1204                 return 0;
1205
1206         if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
1207                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
1208                         dev->vid, __func__, queue_id);
1209                 return 0;
1210         }
1211
1212         vq = dev->virtqueue[queue_id];
1213
1214         if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
1215                 return 0;
1216
1217         if (unlikely(vq->enabled == 0))
1218                 goto out_access_unlock;
1219
1220         vq->batch_copy_nb_elems = 0;
1221
1222         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1223                 vhost_user_iotlb_rd_lock(vq);
1224
1225         if (unlikely(vq->access_ok == 0))
1226                 if (unlikely(vring_translate(dev, vq) < 0))
1227                         goto out;
1228
1229         if (unlikely(dev->dequeue_zero_copy)) {
1230                 struct zcopy_mbuf *zmbuf, *next;
1231                 int nr_updated = 0;
1232
1233                 for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
1234                      zmbuf != NULL; zmbuf = next) {
1235                         next = TAILQ_NEXT(zmbuf, next);
1236
1237                         if (mbuf_is_consumed(zmbuf->mbuf)) {
1238                                 used_idx = vq->last_used_idx++ & (vq->size - 1);
1239                                 update_used_ring(dev, vq, used_idx,
1240                                                  zmbuf->desc_idx);
1241                                 nr_updated += 1;
1242
1243                                 TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
1244                                 restore_mbuf(zmbuf->mbuf);
1245                                 rte_pktmbuf_free(zmbuf->mbuf);
1246                                 put_zmbuf(zmbuf);
1247                                 vq->nr_zmbuf -= 1;
1248                         }
1249                 }
1250
1251                 update_used_idx(dev, vq, nr_updated);
1252         }
1253
1254         /*
1255          * Construct a RARP broadcast packet and inject it into the "pkts"
1256          * array, so that it looks as if the guest actually sent such a packet.
1257          *
1258          * Check user_send_rarp() for more information.
1259          *
1260          * broadcast_rarp shares a cacheline in the virtio_net structure
1261          * with some fields that are accessed during enqueue, and
1262          * rte_atomic16_cmpset() causes a write if using cmpxchg. This could
1263          * result in false sharing between enqueue and dequeue.
1264          *
1265          * Prevent unnecessary false sharing by reading broadcast_rarp first
1266          * and only performing cmpset if the read indicates it is likely to
1267          * be set.
1268          */
1269
1270         if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
1271                         rte_atomic16_cmpset((volatile uint16_t *)
1272                                 &dev->broadcast_rarp.cnt, 1, 0))) {
1273
1274                 rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
1275                 if (rarp_mbuf == NULL) {
1276                         RTE_LOG(ERR, VHOST_DATA,
1277                                 "Failed to allocate memory for mbuf.\n");
1278                         return 0;
1279                 }
1280
1281                 if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
1282                         rte_pktmbuf_free(rarp_mbuf);
1283                         rarp_mbuf = NULL;
1284                 } else {
1285                         count -= 1;
1286                 }
1287         }
1288
1289         free_entries = *((volatile uint16_t *)&vq->avail->idx) -
1290                         vq->last_avail_idx;
1291         if (free_entries == 0)
1292                 goto out;
1293
1294         LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
1295
1296         /* Prefetch available and used ring */
1297         avail_idx = vq->last_avail_idx & (vq->size - 1);
1298         used_idx  = vq->last_used_idx  & (vq->size - 1);
1299         rte_prefetch0(&vq->avail->ring[avail_idx]);
1300         rte_prefetch0(&vq->used->ring[used_idx]);
1301
1302         count = RTE_MIN(count, MAX_PKT_BURST);
1303         count = RTE_MIN(count, free_entries);
1304         LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
1305                         dev->vid, count);
1306
1307         /* Retrieve all of the head indexes first to avoid caching issues. */
1308         for (i = 0; i < count; i++) {
1309                 avail_idx = (vq->last_avail_idx + i) & (vq->size - 1);
1310                 used_idx  = (vq->last_used_idx  + i) & (vq->size - 1);
1311                 desc_indexes[i] = vq->avail->ring[avail_idx];
1312
1313                 if (likely(dev->dequeue_zero_copy == 0))
1314                         update_used_ring(dev, vq, used_idx, desc_indexes[i]);
1315         }
1316
1317         /* Prefetch the first descriptor. */
1318         rte_prefetch0(&vq->desc[desc_indexes[0]]);
1319         for (i = 0; i < count; i++) {
1320                 struct vring_desc *desc;
1321                 uint16_t sz, idx;
1322                 int err;
1323
1324                 if (likely(i + 1 < count))
1325                         rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
1326
1327                 if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
1328                         desc = (struct vring_desc *)(uintptr_t)
1329                                 vhost_iova_to_vva(dev, vq,
1330                                                 vq->desc[desc_indexes[i]].addr,
1331                                                 sizeof(*desc),
1332                                                 VHOST_ACCESS_RO);
1333                         if (unlikely(!desc))
1334                                 break;
1335
1336                         rte_prefetch0(desc);
1337                         sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
1338                         idx = 0;
1339                 } else {
1340                         desc = vq->desc;
1341                         sz = vq->size;
1342                         idx = desc_indexes[i];
1343                 }
1344
1345                 pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
1346                 if (unlikely(pkts[i] == NULL)) {
1347                         RTE_LOG(ERR, VHOST_DATA,
1348                                 "Failed to allocate memory for mbuf.\n");
1349                         break;
1350                 }
1351
1352                 err = copy_desc_to_mbuf(dev, vq, desc, sz, pkts[i], idx,
1353                                         mbuf_pool);
1354                 if (unlikely(err)) {
1355                         rte_pktmbuf_free(pkts[i]);
1356                         break;
1357                 }
1358
1359                 if (unlikely(dev->dequeue_zero_copy)) {
1360                         struct zcopy_mbuf *zmbuf;
1361
1362                         zmbuf = get_zmbuf(vq);
1363                         if (!zmbuf) {
1364                                 rte_pktmbuf_free(pkts[i]);
1365                                 break;
1366                         }
1367                         zmbuf->mbuf = pkts[i];
1368                         zmbuf->desc_idx = desc_indexes[i];
1369
1370                         /*
1371                          * Pin the mbuf by taking an extra reference; we check
1372                          * later whether it has been consumed (we are the last
1373                          * user), and only then can the used ring be updated
1374                          * safely.
1375                          */
1376                         rte_mbuf_refcnt_update(pkts[i], 1);
1377
1378                         vq->nr_zmbuf += 1;
1379                         TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
1380                 }
1381         }
1382         vq->last_avail_idx += i;
1383
1384         if (likely(dev->dequeue_zero_copy == 0)) {
1385                 do_data_copy_dequeue(vq);
1386                 vq->last_used_idx += i;
1387                 update_used_idx(dev, vq, i);
1388         }
1389
1390 out:
1391         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1392                 vhost_user_iotlb_rd_unlock(vq);
1393
1394 out_access_unlock:
1395         rte_spinlock_unlock(&vq->access_lock);
1396
1397         if (unlikely(rarp_mbuf != NULL)) {
1398                 /*
1399                  * Inject it at the head of the "pkts" array, so that the
1400                  * switch's MAC learning table gets updated first.
1401                  */
1402                 memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
1403                 pkts[0] = rarp_mbuf;
1404                 i += 1;
1405         }
1406
1407         return i;
1408 }
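/*
 * Minimal usage sketch for the dequeue API above.  The mbufs are allocated
 * from 'mbuf_pool' on behalf of the caller, who owns them and frees them
 * after processing; the helper name is an illustrative assumption, and
 * queue_id must refer to a guest TX ring (odd index).
 */
static __rte_unused void
example_vhost_rx_burst(int vid, uint16_t queue_id,
		       struct rte_mempool *mbuf_pool)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t nb_rx, i;

	nb_rx = rte_vhost_dequeue_burst(vid, queue_id, mbuf_pool,
					pkts, MAX_PKT_BURST);
	for (i = 0; i < nb_rx; i++) {
		/* ... hand pkts[i] to the application here ... */
		rte_pktmbuf_free(pkts[i]);
	}
}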