[deb_dpdk.git] lib/librte_vhost/virtio_net.c @ commit 48219e0509c381d8b7754cdc9e130bdfe132e2d7
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <stdint.h>
35 #include <stdbool.h>
36 #include <linux/virtio_net.h>
37
38 #include <rte_mbuf.h>
39 #include <rte_memcpy.h>
40 #include <rte_ether.h>
41 #include <rte_ip.h>
42 #include <rte_vhost.h>
43 #include <rte_tcp.h>
44 #include <rte_udp.h>
45 #include <rte_sctp.h>
46 #include <rte_arp.h>
47
48 #include "vhost.h"
49
50 #define MAX_PKT_BURST 32
51
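/*
 * Even vring indexes are the guest's Rx queues (the host enqueues into them);
 * odd indexes are the guest's Tx queues (the host dequeues from them).
 */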
52 static bool
53 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
54 {
55         return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
56 }
57
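/*
 * Copy a batch of shadow used-ring entries into the guest-visible used ring
 * and log the written region for live-migration dirty tracking.
 */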
58 static inline void __attribute__((always_inline))
59 do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
60                           uint16_t to, uint16_t from, uint16_t size)
61 {
62         rte_memcpy(&vq->used->ring[to],
63                         &vq->shadow_used_ring[from],
64                         size * sizeof(struct vring_used_elem));
65         vhost_log_used_vring(dev, vq,
66                         offsetof(struct vring_used, ring[to]),
67                         size * sizeof(struct vring_used_elem));
68 }
69
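/*
 * Flush all batched shadow used entries into the used ring, handling
 * wrap-around, then publish the new used index to the guest.
 */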
70 static inline void __attribute__((always_inline))
71 flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
72 {
73         uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
74
75         if (used_idx + vq->shadow_used_idx <= vq->size) {
76                 do_flush_shadow_used_ring(dev, vq, used_idx, 0,
77                                           vq->shadow_used_idx);
78         } else {
79                 uint16_t size;
80
 81                 /* update the used ring interval [used_idx, vq->size) */
82                 size = vq->size - used_idx;
83                 do_flush_shadow_used_ring(dev, vq, used_idx, 0, size);
84
 85                 /* update the remaining interval [0, shadow_used_idx - size) */
86                 do_flush_shadow_used_ring(dev, vq, 0, size,
87                                           vq->shadow_used_idx - size);
88         }
89         vq->last_used_idx += vq->shadow_used_idx;
90
91         rte_smp_wmb();
92
93         *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
94         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
95                 sizeof(vq->used->idx));
96 }
97
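/*
 * Record a completed descriptor chain (head index and written length) in the
 * shadow used ring; it becomes visible to the guest on the next flush.
 */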
98 static inline void __attribute__((always_inline))
99 update_shadow_used_ring(struct vhost_virtqueue *vq,
100                          uint16_t desc_idx, uint16_t len)
101 {
102         uint16_t i = vq->shadow_used_idx++;
103
104         vq->shadow_used_ring[i].id  = desc_idx;
105         vq->shadow_used_ring[i].len = len;
106 }
107
 108 /* avoid the write when it is not necessary, to lessen cache issues */
109 #define ASSIGN_UNLESS_EQUAL(var, val) do {      \
110         if ((var) != (val))                     \
111                 (var) = (val);                  \
112 } while (0)
113
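/*
 * Translate the mbuf Tx offload requests (L4 checksum, TSO) into the virtio
 * net header fields, so the guest can complete or interpret the offloads.
 */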
114 static void
115 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
116 {
117         if (m_buf->ol_flags & PKT_TX_L4_MASK) {
118                 net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
119                 net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
120
121                 switch (m_buf->ol_flags & PKT_TX_L4_MASK) {
122                 case PKT_TX_TCP_CKSUM:
123                         net_hdr->csum_offset = (offsetof(struct tcp_hdr,
124                                                 cksum));
125                         break;
126                 case PKT_TX_UDP_CKSUM:
127                         net_hdr->csum_offset = (offsetof(struct udp_hdr,
128                                                 dgram_cksum));
129                         break;
130                 case PKT_TX_SCTP_CKSUM:
131                         net_hdr->csum_offset = (offsetof(struct sctp_hdr,
132                                                 cksum));
133                         break;
134                 }
135         } else {
136                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
137                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
138                 ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
139         }
140
141         if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
142                 if (m_buf->ol_flags & PKT_TX_IPV4)
143                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
144                 else
145                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
146                 net_hdr->gso_size = m_buf->tso_segsz;
147                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
148                                         + m_buf->l4_len;
149         } else {
150                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
151                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
152                 ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
153         }
154 }
155
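/*
 * Copy a single mbuf chain into one descriptor chain, writing the virtio net
 * header into the first buffer. Used by the non-mergeable Rx path. Returns -1
 * if the chain is too short or a buffer address cannot be translated.
 */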
156 static inline int __attribute__((always_inline))
157 copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
158                   struct rte_mbuf *m, uint16_t desc_idx, uint32_t size)
159 {
160         uint32_t desc_avail, desc_offset;
161         uint32_t mbuf_avail, mbuf_offset;
162         uint32_t cpy_len;
163         struct vring_desc *desc;
164         uint64_t desc_addr;
 165         /* A counter to avoid an endless loop in the desc chain */
166         uint16_t nr_desc = 1;
167
168         desc = &descs[desc_idx];
169         desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
170         /*
 171          * The check on 'desc_addr' is placed outside the 'unlikely' macro to
 172          * avoid a performance issue with some versions of gcc (4.8.4 and 5.3.0),
 173          * which otherwise store the offset on the stack instead of in a register.
174          */
175         if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
176                 return -1;
177
178         rte_prefetch0((void *)(uintptr_t)desc_addr);
179
180         virtio_enqueue_offload(m, (struct virtio_net_hdr *)(uintptr_t)desc_addr);
181         vhost_log_write(dev, desc->addr, dev->vhost_hlen);
182         PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
183
184         desc_offset = dev->vhost_hlen;
185         desc_avail  = desc->len - dev->vhost_hlen;
186
187         mbuf_avail  = rte_pktmbuf_data_len(m);
188         mbuf_offset = 0;
189         while (mbuf_avail != 0 || m->next != NULL) {
190                 /* done with current mbuf, fetch next */
191                 if (mbuf_avail == 0) {
192                         m = m->next;
193
194                         mbuf_offset = 0;
195                         mbuf_avail  = rte_pktmbuf_data_len(m);
196                 }
197
198                 /* done with current desc buf, fetch next */
199                 if (desc_avail == 0) {
200                         if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
 201                                 /* Not enough room in the vring buffer */
202                                 return -1;
203                         }
204                         if (unlikely(desc->next >= size || ++nr_desc > size))
205                                 return -1;
206
207                         desc = &descs[desc->next];
208                         desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
209                         if (unlikely(!desc_addr))
210                                 return -1;
211
212                         desc_offset = 0;
213                         desc_avail  = desc->len;
214                 }
215
216                 cpy_len = RTE_MIN(desc_avail, mbuf_avail);
217                 rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
218                         rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
219                         cpy_len);
220                 vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
221                 PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
222                              cpy_len, 0);
223
224                 mbuf_avail  -= cpy_len;
225                 mbuf_offset += cpy_len;
226                 desc_avail  -= cpy_len;
227                 desc_offset += cpy_len;
228         }
229
230         return 0;
231 }
232
233 /**
 234  * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 235  * be received from the physical port or from another virtio device. A packet
 236  * count is returned to indicate the number of packets that are successfully
 237  * added to the RX queue. This function works with scattered mbufs, but it
 238  * doesn't support the mergeable buffers feature.
239  */
240 static inline uint32_t __attribute__((always_inline))
241 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
242               struct rte_mbuf **pkts, uint32_t count)
243 {
244         struct vhost_virtqueue *vq;
245         uint16_t avail_idx, free_entries, start_idx;
246         uint16_t desc_indexes[MAX_PKT_BURST];
247         struct vring_desc *descs;
248         uint16_t used_idx;
249         uint32_t i, sz;
250
251         LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
252         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
253                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
254                         dev->vid, __func__, queue_id);
255                 return 0;
256         }
257
258         vq = dev->virtqueue[queue_id];
259         if (unlikely(vq->enabled == 0))
260                 return 0;
261
262         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
263         start_idx = vq->last_used_idx;
264         free_entries = avail_idx - start_idx;
265         count = RTE_MIN(count, free_entries);
266         count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
267         if (count == 0)
268                 return 0;
269
270         LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
271                 dev->vid, start_idx, start_idx + count);
272
273         /* Retrieve all of the desc indexes first to avoid caching issues. */
274         rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
275         for (i = 0; i < count; i++) {
276                 used_idx = (start_idx + i) & (vq->size - 1);
277                 desc_indexes[i] = vq->avail->ring[used_idx];
278                 vq->used->ring[used_idx].id = desc_indexes[i];
279                 vq->used->ring[used_idx].len = pkts[i]->pkt_len +
280                                                dev->vhost_hlen;
281                 vhost_log_used_vring(dev, vq,
282                         offsetof(struct vring_used, ring[used_idx]),
283                         sizeof(vq->used->ring[used_idx]));
284         }
285
286         rte_prefetch0(&vq->desc[desc_indexes[0]]);
287         for (i = 0; i < count; i++) {
288                 uint16_t desc_idx = desc_indexes[i];
289                 int err;
290
291                 if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) {
292                         descs = (struct vring_desc *)(uintptr_t)
293                                 rte_vhost_gpa_to_vva(dev->mem,
294                                         vq->desc[desc_idx].addr);
295                         if (unlikely(!descs)) {
296                                 count = i;
297                                 break;
298                         }
299
 300                         sz = vq->desc[desc_idx].len / sizeof(*descs);
 301                         desc_idx = 0;
302                 } else {
303                         descs = vq->desc;
304                         sz = vq->size;
305                 }
306
307                 err = copy_mbuf_to_desc(dev, descs, pkts[i], desc_idx, sz);
308                 if (unlikely(err)) {
309                         used_idx = (start_idx + i) & (vq->size - 1);
310                         vq->used->ring[used_idx].len = dev->vhost_hlen;
311                         vhost_log_used_vring(dev, vq,
312                                 offsetof(struct vring_used, ring[used_idx]),
313                                 sizeof(vq->used->ring[used_idx]));
314                 }
315
316                 if (i + 1 < count)
317                         rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
318         }
319
320         rte_smp_wmb();
321
322         *(volatile uint16_t *)&vq->used->idx += count;
323         vq->last_used_idx += count;
324         vhost_log_used_vring(dev, vq,
325                 offsetof(struct vring_used, idx),
326                 sizeof(vq->used->idx));
327
328         /* flush used->idx update before we read avail->flags. */
329         rte_mb();
330
331         /* Kick the guest if necessary. */
332         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
333                         && (vq->callfd >= 0))
334                 eventfd_write(vq->callfd, (eventfd_t)1);
335         return count;
336 }
337
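/*
 * Walk the descriptor chain referenced by the given avail ring entry
 * (following an indirect table if present) and collect each buffer into
 * buf_vec[]. Reports the chain head index and the total buffer length.
 */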
338 static inline int __attribute__((always_inline))
339 fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
340                          uint32_t avail_idx, uint32_t *vec_idx,
341                          struct buf_vector *buf_vec, uint16_t *desc_chain_head,
342                          uint16_t *desc_chain_len)
343 {
344         uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
345         uint32_t vec_id = *vec_idx;
346         uint32_t len    = 0;
347         struct vring_desc *descs = vq->desc;
348
349         *desc_chain_head = idx;
350
351         if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
352                 descs = (struct vring_desc *)(uintptr_t)
353                         rte_vhost_gpa_to_vva(dev->mem, vq->desc[idx].addr);
354                 if (unlikely(!descs))
355                         return -1;
356
357                 idx = 0;
358         }
359
360         while (1) {
361                 if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
362                         return -1;
363
364                 len += descs[idx].len;
365                 buf_vec[vec_id].buf_addr = descs[idx].addr;
366                 buf_vec[vec_id].buf_len  = descs[idx].len;
367                 buf_vec[vec_id].desc_idx = idx;
368                 vec_id++;
369
370                 if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
371                         break;
372
373                 idx = descs[idx].next;
374         }
375
376         *desc_chain_len = len;
377         *vec_idx = vec_id;
378
379         return 0;
380 }
381
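/*
 * Reserve enough available descriptor chains to hold 'size' bytes, recording
 * the buffers in buf_vec[] and the consumed chains in the shadow used ring.
 */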
382 /*
383  * Returns -1 on fail, 0 on success
384  */
385 static inline int
386 reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
387                                 uint32_t size, struct buf_vector *buf_vec,
388                                 uint16_t *num_buffers, uint16_t avail_head)
389 {
390         uint16_t cur_idx;
391         uint32_t vec_idx = 0;
392         uint16_t tries = 0;
393
394         uint16_t head_idx = 0;
395         uint16_t len = 0;
396
397         *num_buffers = 0;
398         cur_idx  = vq->last_avail_idx;
399
400         while (size > 0) {
401                 if (unlikely(cur_idx == avail_head))
402                         return -1;
403
404                 if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec,
405                                                 &head_idx, &len) < 0))
406                         return -1;
407                 len = RTE_MIN(len, size);
408                 update_shadow_used_ring(vq, head_idx, len);
409                 size -= len;
410
411                 cur_idx++;
412                 tries++;
413                 *num_buffers += 1;
414
415                 /*
 416                  * if we have tried all available ring items and still
 417                  * can't get enough buffers, something abnormal has
 418                  * happened.
419                  */
420                 if (unlikely(tries >= vq->size))
421                         return -1;
422         }
423
424         return 0;
425 }
426
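/*
 * Copy one mbuf chain into the guest buffers previously collected in
 * buf_vec[], filling the mergeable Rx header (including num_buffers) in the
 * first buffer.
 */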
427 static inline int __attribute__((always_inline))
428 copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
429                             struct buf_vector *buf_vec, uint16_t num_buffers)
430 {
431         uint32_t vec_idx = 0;
432         uint64_t desc_addr;
433         uint32_t mbuf_offset, mbuf_avail;
434         uint32_t desc_offset, desc_avail;
435         uint32_t cpy_len;
436         uint64_t hdr_addr, hdr_phys_addr;
437         struct rte_mbuf *hdr_mbuf;
438
439         if (unlikely(m == NULL))
440                 return -1;
441
442         desc_addr = rte_vhost_gpa_to_vva(dev->mem, buf_vec[vec_idx].buf_addr);
443         if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
444                 return -1;
445
446         hdr_mbuf = m;
447         hdr_addr = desc_addr;
448         hdr_phys_addr = buf_vec[vec_idx].buf_addr;
449         rte_prefetch0((void *)(uintptr_t)hdr_addr);
450
451         LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
452                 dev->vid, num_buffers);
453
454         desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
455         desc_offset = dev->vhost_hlen;
456
457         mbuf_avail  = rte_pktmbuf_data_len(m);
458         mbuf_offset = 0;
459         while (mbuf_avail != 0 || m->next != NULL) {
460                 /* done with current desc buf, get the next one */
461                 if (desc_avail == 0) {
462                         vec_idx++;
463                         desc_addr = rte_vhost_gpa_to_vva(dev->mem,
464                                         buf_vec[vec_idx].buf_addr);
465                         if (unlikely(!desc_addr))
466                                 return -1;
467
468                         /* Prefetch buffer address. */
469                         rte_prefetch0((void *)(uintptr_t)desc_addr);
470                         desc_offset = 0;
471                         desc_avail  = buf_vec[vec_idx].buf_len;
472                 }
473
474                 /* done with current mbuf, get the next one */
475                 if (mbuf_avail == 0) {
476                         m = m->next;
477
478                         mbuf_offset = 0;
479                         mbuf_avail  = rte_pktmbuf_data_len(m);
480                 }
481
482                 if (hdr_addr) {
483                         struct virtio_net_hdr_mrg_rxbuf *hdr;
484
485                         hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)
486                                 hdr_addr;
487                         virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
488                         ASSIGN_UNLESS_EQUAL(hdr->num_buffers, num_buffers);
489
490                         vhost_log_write(dev, hdr_phys_addr, dev->vhost_hlen);
491                         PRINT_PACKET(dev, (uintptr_t)hdr_addr,
492                                      dev->vhost_hlen, 0);
493
494                         hdr_addr = 0;
495                 }
496
497                 cpy_len = RTE_MIN(desc_avail, mbuf_avail);
498                 rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
499                         rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
500                         cpy_len);
501                 vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
502                         cpy_len);
503                 PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
504                         cpy_len, 0);
505
506                 mbuf_avail  -= cpy_len;
507                 mbuf_offset += cpy_len;
508                 desc_avail  -= cpy_len;
509                 desc_offset += cpy_len;
510         }
511
512         return 0;
513 }
514
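/*
 * Enqueue path used when VIRTIO_NET_F_MRG_RXBUF is negotiated: reserve enough
 * buffers for each packet, copy the data, then flush the shadow used ring and
 * kick the guest if needed.
 */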
515 static inline uint32_t __attribute__((always_inline))
516 virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
517         struct rte_mbuf **pkts, uint32_t count)
518 {
519         struct vhost_virtqueue *vq;
520         uint32_t pkt_idx = 0;
521         uint16_t num_buffers;
522         struct buf_vector buf_vec[BUF_VECTOR_MAX];
523         uint16_t avail_head;
524
525         LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
526         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
527                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
528                         dev->vid, __func__, queue_id);
529                 return 0;
530         }
531
532         vq = dev->virtqueue[queue_id];
533         if (unlikely(vq->enabled == 0))
534                 return 0;
535
536         count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
537         if (count == 0)
538                 return 0;
539
540         rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
541
542         vq->shadow_used_idx = 0;
543         avail_head = *((volatile uint16_t *)&vq->avail->idx);
544         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
545                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
546
547                 if (unlikely(reserve_avail_buf_mergeable(dev, vq,
548                                                 pkt_len, buf_vec, &num_buffers,
549                                                 avail_head) < 0)) {
550                         LOG_DEBUG(VHOST_DATA,
551                                 "(%d) failed to get enough desc from vring\n",
552                                 dev->vid);
553                         vq->shadow_used_idx -= num_buffers;
554                         break;
555                 }
556
557                 LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
558                         dev->vid, vq->last_avail_idx,
559                         vq->last_avail_idx + num_buffers);
560
561                 if (copy_mbuf_to_desc_mergeable(dev, pkts[pkt_idx],
562                                                 buf_vec, num_buffers) < 0) {
563                         vq->shadow_used_idx -= num_buffers;
564                         break;
565                 }
566
567                 vq->last_avail_idx += num_buffers;
568         }
569
570         if (likely(vq->shadow_used_idx)) {
571                 flush_shadow_used_ring(dev, vq);
572
573                 /* flush used->idx update before we read avail->flags. */
574                 rte_mb();
575
576                 /* Kick the guest if necessary. */
577                 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
578                                 && (vq->callfd >= 0))
579                         eventfd_write(vq->callfd, (eventfd_t)1);
580         }
581
582         return pkt_idx;
583 }
584
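/*
 * Public enqueue API: dispatches to the mergeable or non-mergeable Rx path
 * depending on the negotiated VIRTIO_NET_F_MRG_RXBUF feature.
 *
 * Illustrative usage sketch only (vid, pkts and nb_rx come from the
 * application; queue 0 is the guest's first Rx queue). The data is copied
 * into guest buffers, so the caller keeps ownership of the mbufs:
 *
 *	enqueued = rte_vhost_enqueue_burst(vid, 0, pkts, nb_rx);
 *	for (i = 0; i < nb_rx; i++)
 *		rte_pktmbuf_free(pkts[i]);
 */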
585 uint16_t
586 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
587         struct rte_mbuf **pkts, uint16_t count)
588 {
589         struct virtio_net *dev = get_device(vid);
590
591         if (!dev)
592                 return 0;
593
594         if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
595                 return virtio_dev_merge_rx(dev, queue_id, pkts, count);
596         else
597                 return virtio_dev_rx(dev, queue_id, pkts, count);
598 }
599
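/*
 * True if any host offload feature (checksum or GSO related) was negotiated,
 * i.e. the virtio net header may carry offload requests on the dequeue path.
 */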
600 static inline bool
601 virtio_net_with_host_offload(struct virtio_net *dev)
602 {
603         if (dev->features &
 604                         ((1ULL << VIRTIO_NET_F_CSUM) | (1ULL << VIRTIO_NET_F_HOST_ECN) |
 605                          (1ULL << VIRTIO_NET_F_HOST_TSO4) | (1ULL << VIRTIO_NET_F_HOST_TSO6) |
 606                          (1ULL << VIRTIO_NET_F_HOST_UFO)))
607                 return true;
608
609         return false;
610 }
611
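/*
 * Parse the Ethernet and IP headers of the mbuf to find the L4 protocol and
 * header, setting l2_len/l3_len and the PKT_TX_IPV4/PKT_TX_IPV6 flags.
 */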
612 static void
613 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
614 {
615         struct ipv4_hdr *ipv4_hdr;
616         struct ipv6_hdr *ipv6_hdr;
617         void *l3_hdr = NULL;
618         struct ether_hdr *eth_hdr;
619         uint16_t ethertype;
620
621         eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
622
623         m->l2_len = sizeof(struct ether_hdr);
624         ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
625
626         if (ethertype == ETHER_TYPE_VLAN) {
627                 struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
628
629                 m->l2_len += sizeof(struct vlan_hdr);
630                 ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
631         }
632
633         l3_hdr = (char *)eth_hdr + m->l2_len;
634
635         switch (ethertype) {
636         case ETHER_TYPE_IPv4:
637                 ipv4_hdr = l3_hdr;
638                 *l4_proto = ipv4_hdr->next_proto_id;
639                 m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
640                 *l4_hdr = (char *)l3_hdr + m->l3_len;
641                 m->ol_flags |= PKT_TX_IPV4;
642                 break;
643         case ETHER_TYPE_IPv6:
644                 ipv6_hdr = l3_hdr;
645                 *l4_proto = ipv6_hdr->proto;
646                 m->l3_len = sizeof(struct ipv6_hdr);
647                 *l4_hdr = (char *)l3_hdr + m->l3_len;
648                 m->ol_flags |= PKT_TX_IPV6;
649                 break;
650         default:
651                 m->l3_len = 0;
652                 *l4_proto = 0;
653                 *l4_hdr = NULL;
654                 break;
655         }
656 }
657
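/*
 * Translate checksum and GSO requests found in the virtio net header into the
 * corresponding mbuf offload flags on the dequeue path.
 */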
658 static inline void __attribute__((always_inline))
659 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
660 {
661         uint16_t l4_proto = 0;
662         void *l4_hdr = NULL;
663         struct tcp_hdr *tcp_hdr = NULL;
664
665         if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
666                 return;
667
668         parse_ethernet(m, &l4_proto, &l4_hdr);
669         if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
670                 if (hdr->csum_start == (m->l2_len + m->l3_len)) {
671                         switch (hdr->csum_offset) {
672                         case (offsetof(struct tcp_hdr, cksum)):
673                                 if (l4_proto == IPPROTO_TCP)
674                                         m->ol_flags |= PKT_TX_TCP_CKSUM;
675                                 break;
676                         case (offsetof(struct udp_hdr, dgram_cksum)):
677                                 if (l4_proto == IPPROTO_UDP)
678                                         m->ol_flags |= PKT_TX_UDP_CKSUM;
679                                 break;
680                         case (offsetof(struct sctp_hdr, cksum)):
681                                 if (l4_proto == IPPROTO_SCTP)
682                                         m->ol_flags |= PKT_TX_SCTP_CKSUM;
683                                 break;
684                         default:
685                                 break;
686                         }
687                 }
688         }
689
690         if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
691                 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
692                 case VIRTIO_NET_HDR_GSO_TCPV4:
693                 case VIRTIO_NET_HDR_GSO_TCPV6:
694                         tcp_hdr = l4_hdr;
695                         m->ol_flags |= PKT_TX_TCP_SEG;
696                         m->tso_segsz = hdr->gso_size;
697                         m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
698                         break;
699                 default:
700                         RTE_LOG(WARNING, VHOST_DATA,
701                                 "unsupported gso type %u.\n", hdr->gso_type);
702                         break;
703                 }
704         }
705 }
706
707 #define RARP_PKT_SIZE   64
708
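/*
 * Build a broadcast RARP request announcing the given MAC address; typically
 * sent after live migration so that switches relearn the guest's location.
 */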
709 static int
710 make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
711 {
712         struct ether_hdr *eth_hdr;
713         struct arp_hdr  *rarp;
714
 715         if (rarp_mbuf->buf_len < RARP_PKT_SIZE) {
716                 RTE_LOG(WARNING, VHOST_DATA,
717                         "failed to make RARP; mbuf size too small %u (< %d)\n",
718                         rarp_mbuf->buf_len, RARP_PKT_SIZE);
719                 return -1;
720         }
721
722         /* Ethernet header. */
723         eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
724         memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
725         ether_addr_copy(mac, &eth_hdr->s_addr);
726         eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
727
728         /* RARP header. */
729         rarp = (struct arp_hdr *)(eth_hdr + 1);
730         rarp->arp_hrd = htons(ARP_HRD_ETHER);
731         rarp->arp_pro = htons(ETHER_TYPE_IPv4);
732         rarp->arp_hln = ETHER_ADDR_LEN;
733         rarp->arp_pln = 4;
734         rarp->arp_op  = htons(ARP_OP_REVREQUEST);
735
736         ether_addr_copy(mac, &rarp->arp_data.arp_sha);
737         ether_addr_copy(mac, &rarp->arp_data.arp_tha);
738         memset(&rarp->arp_data.arp_sip, 0x00, 4);
739         memset(&rarp->arp_data.arp_tip, 0x00, 4);
740
741         rarp_mbuf->pkt_len  = rarp_mbuf->data_len = RARP_PKT_SIZE;
742
743         return 0;
744 }
745
746 static inline void __attribute__((always_inline))
747 put_zmbuf(struct zcopy_mbuf *zmbuf)
748 {
749         zmbuf->in_use = 0;
750 }
751
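/*
 * Copy (or, in zero-copy mode, attach) one descriptor chain into an mbuf
 * chain, allocating extra mbufs from mbuf_pool as needed, and apply any
 * offload information found in the virtio net header.
 */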
752 static inline int __attribute__((always_inline))
753 copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
754                   uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
755                   struct rte_mempool *mbuf_pool)
756 {
757         struct vring_desc *desc;
758         uint64_t desc_addr;
759         uint32_t desc_avail, desc_offset;
760         uint32_t mbuf_avail, mbuf_offset;
761         uint32_t cpy_len;
762         struct rte_mbuf *cur = m, *prev = m;
763         struct virtio_net_hdr *hdr = NULL;
 764         /* A counter to avoid an endless loop in the desc chain */
765         uint32_t nr_desc = 1;
766
767         desc = &descs[desc_idx];
768         if (unlikely((desc->len < dev->vhost_hlen)) ||
769                         (desc->flags & VRING_DESC_F_INDIRECT))
770                 return -1;
771
772         desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
773         if (unlikely(!desc_addr))
774                 return -1;
775
776         if (virtio_net_with_host_offload(dev)) {
777                 hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
778                 rte_prefetch0(hdr);
779         }
780
781         /*
782          * A virtio driver normally uses at least 2 desc buffers
783          * for Tx: the first for storing the header, and others
784          * for storing the data.
785          */
786         if (likely((desc->len == dev->vhost_hlen) &&
787                    (desc->flags & VRING_DESC_F_NEXT) != 0)) {
788                 desc = &descs[desc->next];
789                 if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
790                         return -1;
791
792                 desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
793                 if (unlikely(!desc_addr))
794                         return -1;
795
796                 desc_offset = 0;
797                 desc_avail  = desc->len;
798                 nr_desc    += 1;
799         } else {
800                 desc_avail  = desc->len - dev->vhost_hlen;
801                 desc_offset = dev->vhost_hlen;
802         }
803
804         rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset));
805
806         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), desc_avail, 0);
807
808         mbuf_offset = 0;
809         mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
810         while (1) {
811                 uint64_t hpa;
812
813                 cpy_len = RTE_MIN(desc_avail, mbuf_avail);
814
815                 /*
 816                  * A desc buf might span two host physical pages that are
 817                  * not contiguous. In that case (gpa_to_hpa returns 0), the
 818                  * data will be copied even though zero copy is enabled.
819                  */
820                 if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
821                                         desc->addr + desc_offset, cpy_len)))) {
822                         cur->data_len = cpy_len;
823                         cur->data_off = 0;
824                         cur->buf_addr = (void *)(uintptr_t)desc_addr;
825                         cur->buf_physaddr = hpa;
826
827                         /*
 828                          * In zero copy mode, one mbuf can only reference one
 829                          * desc buf, or part of one.
830                          */
831                         mbuf_avail = cpy_len;
832                 } else {
833                         rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
834                                                            mbuf_offset),
835                                 (void *)((uintptr_t)(desc_addr + desc_offset)),
836                                 cpy_len);
837                 }
838
839                 mbuf_avail  -= cpy_len;
840                 mbuf_offset += cpy_len;
841                 desc_avail  -= cpy_len;
842                 desc_offset += cpy_len;
843
 844                 /* This desc buf is exhausted, get the next one */
845                 if (desc_avail == 0) {
846                         if ((desc->flags & VRING_DESC_F_NEXT) == 0)
847                                 break;
848
849                         if (unlikely(desc->next >= max_desc ||
850                                      ++nr_desc > max_desc))
851                                 return -1;
852                         desc = &descs[desc->next];
853                         if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
854                                 return -1;
855
856                         desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
857                         if (unlikely(!desc_addr))
858                                 return -1;
859
860                         rte_prefetch0((void *)(uintptr_t)desc_addr);
861
862                         desc_offset = 0;
863                         desc_avail  = desc->len;
864
865                         PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
866                 }
867
868                 /*
 869                  * This mbuf is full, allocate a new one
 870                  * to hold more data.
871                  */
872                 if (mbuf_avail == 0) {
873                         cur = rte_pktmbuf_alloc(mbuf_pool);
874                         if (unlikely(cur == NULL)) {
875                                 RTE_LOG(ERR, VHOST_DATA, "Failed to "
876                                         "allocate memory for mbuf.\n");
877                                 return -1;
878                         }
879                         if (unlikely(dev->dequeue_zero_copy))
880                                 rte_mbuf_refcnt_update(cur, 1);
881
882                         prev->next = cur;
883                         prev->data_len = mbuf_offset;
884                         m->nb_segs += 1;
885                         m->pkt_len += mbuf_offset;
886                         prev = cur;
887
888                         mbuf_offset = 0;
889                         mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
890                 }
891         }
892
893         prev->data_len = mbuf_offset;
894         m->pkt_len    += mbuf_offset;
895
896         if (hdr)
897                 vhost_dequeue_offload(hdr, m);
898
899         return 0;
900 }
901
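/*
 * Write back one used-ring entry. The dequeue path writes a zero length since
 * nothing is copied into the guest buffer.
 */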
902 static inline void __attribute__((always_inline))
903 update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
904                  uint32_t used_idx, uint32_t desc_idx)
905 {
906         vq->used->ring[used_idx].id  = desc_idx;
907         vq->used->ring[used_idx].len = 0;
908         vhost_log_used_vring(dev, vq,
909                         offsetof(struct vring_used, ring[used_idx]),
910                         sizeof(vq->used->ring[used_idx]));
911 }
912
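/*
 * Publish 'count' new used entries to the guest and kick it through the
 * callfd eventfd unless interrupts are suppressed.
 */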
913 static inline void __attribute__((always_inline))
914 update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
915                 uint32_t count)
916 {
917         if (unlikely(count == 0))
918                 return;
919
920         rte_smp_wmb();
921         rte_smp_rmb();
922
923         vq->used->idx += count;
924         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
925                         sizeof(vq->used->idx));
926
927         /* Kick guest if required. */
928         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
929                         && (vq->callfd >= 0))
930                 eventfd_write(vq->callfd, (eventfd_t)1);
931 }
932
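/*
 * Find a free zero-copy mbuf slot, scanning circularly from the last
 * allocation point. Returns NULL when every slot is in use.
 */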
933 static inline struct zcopy_mbuf *__attribute__((always_inline))
934 get_zmbuf(struct vhost_virtqueue *vq)
935 {
936         uint16_t i;
937         uint16_t last;
938         int tries = 0;
939
940         /* search [last_zmbuf_idx, zmbuf_size) */
941         i = vq->last_zmbuf_idx;
942         last = vq->zmbuf_size;
943
944 again:
945         for (; i < last; i++) {
946                 if (vq->zmbufs[i].in_use == 0) {
947                         vq->last_zmbuf_idx = i + 1;
948                         vq->zmbufs[i].in_use = 1;
949                         return &vq->zmbufs[i];
950                 }
951         }
952
953         tries++;
954         if (tries == 1) {
955                 /* search [0, last_zmbuf_idx) */
956                 i = 0;
957                 last = vq->last_zmbuf_idx;
958                 goto again;
959         }
960
961         return NULL;
962 }
963
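/*
 * A zero-copy mbuf chain is consumed once no segment carries an extra
 * reference, i.e. the application has freed its copy.
 */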
964 static inline bool __attribute__((always_inline))
965 mbuf_is_consumed(struct rte_mbuf *m)
966 {
967         while (m) {
968                 if (rte_mbuf_refcnt_read(m) > 1)
969                         return false;
970                 m = m->next;
971         }
972
973         return true;
974 }
975
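/*
 * Public dequeue API: receive packets sent by the guest into 'pkts'. A RARP
 * packet may be injected at pkts[0] after live migration. The caller owns the
 * returned mbufs and must free them.
 *
 * Illustrative usage sketch only (vid, port_id and mbuf_pool come from the
 * application; queue 1 is the guest's first Tx queue):
 *
 *	n = rte_vhost_dequeue_burst(vid, 1, mbuf_pool, pkts, MAX_PKT_BURST);
 *	sent = rte_eth_tx_burst(port_id, 0, pkts, n);
 *	while (sent < n)
 *		rte_pktmbuf_free(pkts[sent++]);
 */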
976 uint16_t
977 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
978         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
979 {
980         struct virtio_net *dev;
981         struct rte_mbuf *rarp_mbuf = NULL;
982         struct vhost_virtqueue *vq;
983         uint32_t desc_indexes[MAX_PKT_BURST];
984         uint32_t used_idx;
985         uint32_t i = 0;
986         uint16_t free_entries;
987         uint16_t avail_idx;
988
989         dev = get_device(vid);
990         if (!dev)
991                 return 0;
992
993         if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
994                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
995                         dev->vid, __func__, queue_id);
996                 return 0;
997         }
998
999         vq = dev->virtqueue[queue_id];
1000         if (unlikely(vq->enabled == 0))
1001                 return 0;
1002
1003         if (unlikely(dev->dequeue_zero_copy)) {
1004                 struct zcopy_mbuf *zmbuf, *next;
1005                 int nr_updated = 0;
1006
1007                 for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
1008                      zmbuf != NULL; zmbuf = next) {
1009                         next = TAILQ_NEXT(zmbuf, next);
1010
1011                         if (mbuf_is_consumed(zmbuf->mbuf)) {
1012                                 used_idx = vq->last_used_idx++ & (vq->size - 1);
1013                                 update_used_ring(dev, vq, used_idx,
1014                                                  zmbuf->desc_idx);
1015                                 nr_updated += 1;
1016
1017                                 TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
1018                                 rte_pktmbuf_free(zmbuf->mbuf);
1019                                 put_zmbuf(zmbuf);
1020                                 vq->nr_zmbuf -= 1;
1021                         }
1022                 }
1023
1024                 update_used_idx(dev, vq, nr_updated);
1025         }
1026
1027         /*
 1028          * Construct a RARP broadcast packet, and inject it into the "pkts"
 1029          * array, so that it looks like the guest actually sent such a packet.
1030          *
1031          * Check user_send_rarp() for more information.
1032          *
1033          * broadcast_rarp shares a cacheline in the virtio_net structure
1034          * with some fields that are accessed during enqueue and
1035          * rte_atomic16_cmpset() causes a write if using cmpxchg. This could
1036          * result in false sharing between enqueue and dequeue.
1037          *
1038          * Prevent unnecessary false sharing by reading broadcast_rarp first
1039          * and only performing cmpset if the read indicates it is likely to
1040          * be set.
1041          */
1042
1043         if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
1044                         rte_atomic16_cmpset((volatile uint16_t *)
1045                                 &dev->broadcast_rarp.cnt, 1, 0))) {
1046
1047                 rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
1048                 if (rarp_mbuf == NULL) {
1049                         RTE_LOG(ERR, VHOST_DATA,
1050                                 "Failed to allocate memory for mbuf.\n");
1051                         return 0;
1052                 }
1053
1054                 if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
1055                         rte_pktmbuf_free(rarp_mbuf);
1056                         rarp_mbuf = NULL;
1057                 } else {
1058                         count -= 1;
1059                 }
1060         }
1061
1062         free_entries = *((volatile uint16_t *)&vq->avail->idx) -
1063                         vq->last_avail_idx;
1064         if (free_entries == 0)
1065                 goto out;
1066
1067         LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
1068
1069         /* Prefetch available and used ring */
1070         avail_idx = vq->last_avail_idx & (vq->size - 1);
1071         used_idx  = vq->last_used_idx  & (vq->size - 1);
1072         rte_prefetch0(&vq->avail->ring[avail_idx]);
1073         rte_prefetch0(&vq->used->ring[used_idx]);
1074
1075         count = RTE_MIN(count, MAX_PKT_BURST);
1076         count = RTE_MIN(count, free_entries);
1077         LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
1078                         dev->vid, count);
1079
1080         /* Retrieve all of the head indexes first to avoid caching issues. */
1081         for (i = 0; i < count; i++) {
1082                 avail_idx = (vq->last_avail_idx + i) & (vq->size - 1);
1083                 used_idx  = (vq->last_used_idx  + i) & (vq->size - 1);
1084                 desc_indexes[i] = vq->avail->ring[avail_idx];
1085
1086                 if (likely(dev->dequeue_zero_copy == 0))
1087                         update_used_ring(dev, vq, used_idx, desc_indexes[i]);
1088         }
1089
 1090         /* Prefetch the first descriptor. */
1091         rte_prefetch0(&vq->desc[desc_indexes[0]]);
1092         for (i = 0; i < count; i++) {
1093                 struct vring_desc *desc;
1094                 uint16_t sz, idx;
1095                 int err;
1096
1097                 if (likely(i + 1 < count))
1098                         rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
1099
1100                 if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
1101                         desc = (struct vring_desc *)(uintptr_t)
1102                                 rte_vhost_gpa_to_vva(dev->mem,
1103                                         vq->desc[desc_indexes[i]].addr);
1104                         if (unlikely(!desc))
1105                                 break;
1106
1107                         rte_prefetch0(desc);
1108                         sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
1109                         idx = 0;
1110                 } else {
1111                         desc = vq->desc;
1112                         sz = vq->size;
1113                         idx = desc_indexes[i];
1114                 }
1115
1116                 pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
1117                 if (unlikely(pkts[i] == NULL)) {
1118                         RTE_LOG(ERR, VHOST_DATA,
1119                                 "Failed to allocate memory for mbuf.\n");
1120                         break;
1121                 }
1122
1123                 err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
1124                 if (unlikely(err)) {
1125                         rte_pktmbuf_free(pkts[i]);
1126                         break;
1127                 }
1128
1129                 if (unlikely(dev->dequeue_zero_copy)) {
1130                         struct zcopy_mbuf *zmbuf;
1131
1132                         zmbuf = get_zmbuf(vq);
1133                         if (!zmbuf) {
1134                                 rte_pktmbuf_free(pkts[i]);
1135                                 break;
1136                         }
1137                         zmbuf->mbuf = pkts[i];
1138                         zmbuf->desc_idx = desc_indexes[i];
1139
1140                         /*
 1141                          * Pin the mbuf by taking an extra reference; we will
 1142                          * check later whether we are the last user (i.e. the
 1143                          * mbuf has been freed by the application). If so, we
 1144                          * can then update the used ring safely.
1145                          */
1146                         rte_mbuf_refcnt_update(pkts[i], 1);
1147
1148                         vq->nr_zmbuf += 1;
1149                         TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
1150                 }
1151         }
1152         vq->last_avail_idx += i;
1153
1154         if (likely(dev->dequeue_zero_copy == 0)) {
1155                 vq->last_used_idx += i;
1156                 update_used_idx(dev, vq, i);
1157         }
1158
1159 out:
1160         if (unlikely(rarp_mbuf != NULL)) {
1161                 /*
 1162                  * Inject it at the head of the "pkts" array, so that the
 1163                  * switch's MAC learning table gets updated first.
1164                  */
1165                 memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
1166                 pkts[0] = rarp_mbuf;
1167                 i += 1;
1168         }
1169
1170         return i;
1171 }