/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdint.h>
#include <stdbool.h>
#include <linux/virtio_net.h>

#include <rte_mbuf.h>
#include <rte_memcpy.h>
#include <rte_vhost.h>

#include "main.h"

/*
 * A very simple vhost-user net driver implementation, without any
 * extra features enabled, such as TSO and mergeable Rx buffers (mrg-Rx).
 */

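/*
 * Prepare the builtin net driver for a newly attached vhost device:
 * pick the virtio-net header size from the negotiated features, cache
 * the guest memory table, and snapshot every vring the device exposes.
 */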
void
vs_vhost_net_setup(struct vhost_dev *dev)
{
        uint16_t i;
        int vid = dev->vid;
        struct vhost_queue *queue;

        RTE_LOG(INFO, VHOST_CONFIG,
                "setting builtin vhost-user net driver\n");

        rte_vhost_get_negotiated_features(vid, &dev->features);
        if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
                dev->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
        else
                dev->hdr_len = sizeof(struct virtio_net_hdr);

        rte_vhost_get_mem_table(vid, &dev->mem);

        dev->nr_vrings = rte_vhost_get_vring_num(vid);
        for (i = 0; i < dev->nr_vrings; i++) {
                queue = &dev->queues[i];

                queue->last_used_idx  = 0;
                queue->last_avail_idx = 0;
                rte_vhost_get_vhost_vring(vid, i, &queue->vr);
        }
}

void
vs_vhost_net_remove(struct vhost_dev *dev)
{
        free(dev->mem);
}

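/*
 * Copy one mbuf chain from the host into the guest Rx descriptor chain
 * starting at desc_idx: a zeroed virtio-net header is written first,
 * then the packet data, re-translating guest physical addresses chunk
 * by chunk. Returns -1 on a malformed chain or a failed translation.
 */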
static __rte_always_inline int
enqueue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
            struct rte_mbuf *m, uint16_t desc_idx)
{
        uint32_t desc_avail, desc_offset;
        uint64_t desc_chunck_len;
        uint32_t mbuf_avail, mbuf_offset;
        uint32_t cpy_len;
        struct vring_desc *desc;
        uint64_t desc_addr, desc_gaddr;
        struct virtio_net_hdr virtio_hdr = {0, 0, 0, 0, 0, 0};
        /* A counter to avoid an endless loop on a malformed desc chain */
        uint16_t nr_desc = 1;

        desc = &vr->desc[desc_idx];
        desc_chunck_len = desc->len;
        desc_gaddr = desc->addr;
        desc_addr = rte_vhost_va_from_guest_pa(
                        dev->mem, desc_gaddr, &desc_chunck_len);
        /*
         * The check of 'desc_addr' is placed outside the 'unlikely' macro
         * to avoid a performance issue with some versions of gcc (4.8.4
         * and 5.3.0), which otherwise store the offset on the stack
         * instead of in a register.
         */
        if (unlikely(desc->len < dev->hdr_len) || !desc_addr)
                return -1;

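        /*
         * Note: rte_vhost_va_from_guest_pa() may shrink desc_chunck_len
         * when the descriptor spans more than one host-mapped region;
         * whenever the current chunk runs out below, the remainder of
         * the buffer is re-translated from the advanced guest address.
         */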
        rte_prefetch0((void *)(uintptr_t)desc_addr);

        /* write virtio-net header */
        if (likely(desc_chunck_len >= dev->hdr_len)) {
                *(struct virtio_net_hdr *)(uintptr_t)desc_addr = virtio_hdr;
                desc_offset = dev->hdr_len;
        } else {
                uint64_t len;
                uint64_t remain = dev->hdr_len;
                uint64_t src = (uint64_t)(uintptr_t)&virtio_hdr, dst;
                uint64_t guest_addr = desc_gaddr;

                while (remain) {
                        len = remain;
                        dst = rte_vhost_va_from_guest_pa(dev->mem,
                                        guest_addr, &len);
                        if (unlikely(!dst || !len))
                                return -1;

                        rte_memcpy((void *)(uintptr_t)dst,
                                        (void *)(uintptr_t)src,
                                        len);

                        remain -= len;
                        guest_addr += len;
                        /* advance the source, not the re-translated dst */
                        src += len;
                }

                desc_chunck_len = desc->len - dev->hdr_len;
                desc_gaddr += dev->hdr_len;
                desc_addr = rte_vhost_va_from_guest_pa(
                                dev->mem, desc_gaddr,
                                &desc_chunck_len);
                if (unlikely(!desc_addr))
                        return -1;

                desc_offset = 0;
        }

        desc_avail  = desc->len - dev->hdr_len;

        mbuf_avail  = rte_pktmbuf_data_len(m);
        mbuf_offset = 0;
        while (mbuf_avail != 0 || m->next != NULL) {
                /* done with current mbuf, fetch next */
                if (mbuf_avail == 0) {
                        m = m->next;

                        mbuf_offset = 0;
                        mbuf_avail  = rte_pktmbuf_data_len(m);
                }

                /* done with current desc buf, fetch next */
                if (desc_avail == 0) {
                        if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
                                /* not enough room in the vring buffer */
                                return -1;
                        }
                        if (unlikely(desc->next >= vr->size ||
                                     ++nr_desc > vr->size))
                                return -1;

                        desc = &vr->desc[desc->next];
                        desc_chunck_len = desc->len;
                        desc_gaddr = desc->addr;
                        desc_addr = rte_vhost_va_from_guest_pa(
                                        dev->mem, desc_gaddr, &desc_chunck_len);
                        if (unlikely(!desc_addr))
                                return -1;

                        desc_offset = 0;
                        desc_avail  = desc->len;
                } else if (unlikely(desc_chunck_len == 0)) {
                        desc_chunck_len = desc_avail;
                        desc_gaddr += desc_offset;
                        desc_addr = rte_vhost_va_from_guest_pa(dev->mem,
                                        desc_gaddr,
                                        &desc_chunck_len);
                        if (unlikely(!desc_addr))
                                return -1;

                        desc_offset = 0;
                }

                cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
                rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
                        rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
                        cpy_len);

                mbuf_avail  -= cpy_len;
                mbuf_offset += cpy_len;
                desc_avail  -= cpy_len;
                desc_offset += cpy_len;
                desc_chunck_len -= cpy_len;
        }

        return 0;
}

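/*
 * Burst-enqueue host packets onto the guest Rx virtqueue 'queue_id'.
 * Used-ring entries are pre-filled while the available indexes are
 * gathered, then each packet is copied in and the used index is
 * published. Returns the number of packets placed on the ring.
 */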
uint16_t
vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
                struct rte_mbuf **pkts, uint32_t count)
{
        struct vhost_queue *queue;
        struct rte_vhost_vring *vr;
        uint16_t avail_idx, free_entries, start_idx;
        uint16_t desc_indexes[MAX_PKT_BURST];
        uint16_t used_idx;
        uint32_t i;

        queue = &dev->queues[queue_id];
        vr    = &queue->vr;

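        /*
         * avail->idx and last_used_idx are free-running uint16_t counters,
         * so the subtraction below wraps correctly; ring slots are selected
         * with a '& (vr->size - 1)' mask, which relies on the virtio
         * requirement that ring sizes are powers of two.
         */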
        avail_idx = *((volatile uint16_t *)&vr->avail->idx);
        start_idx = queue->last_used_idx;
        free_entries = avail_idx - start_idx;
        count = RTE_MIN(count, free_entries);
        count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
        if (count == 0)
                return 0;

        /* Retrieve all of the desc indexes first to avoid caching issues. */
        rte_prefetch0(&vr->avail->ring[start_idx & (vr->size - 1)]);
        for (i = 0; i < count; i++) {
                used_idx = (start_idx + i) & (vr->size - 1);
                desc_indexes[i] = vr->avail->ring[used_idx];
                vr->used->ring[used_idx].id = desc_indexes[i];
                vr->used->ring[used_idx].len = pkts[i]->pkt_len +
                                               dev->hdr_len;
        }

        rte_prefetch0(&vr->desc[desc_indexes[0]]);
        for (i = 0; i < count; i++) {
                uint16_t desc_idx = desc_indexes[i];
                int err;

                err = enqueue_pkt(dev, vr, pkts[i], desc_idx);
                if (unlikely(err)) {
                        used_idx = (start_idx + i) & (vr->size - 1);
                        vr->used->ring[used_idx].len = dev->hdr_len;
                }

                if (i + 1 < count)
                        rte_prefetch0(&vr->desc[desc_indexes[i + 1]]);
        }

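        /*
         * Write barrier: the used-ring entries filled above must be
         * visible to the guest before used->idx is advanced below.
         */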
        rte_smp_wmb();

        *(volatile uint16_t *)&vr->used->idx += count;
        queue->last_used_idx += count;

        /* Flush the used->idx update before we read avail->flags. */
        rte_mb();

        /* Kick the guest if necessary. */
        if (!(vr->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
                        && (vr->callfd >= 0))
                eventfd_write(vr->callfd, (eventfd_t)1);
        return count;
}

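/*
 * Copy one guest Tx descriptor chain into the mbuf 'm', skipping the
 * virtio-net header held by the first descriptor; extra mbufs are
 * allocated from 'mbuf_pool' and chained when the packet does not fit
 * in one segment. Returns -1 on a malformed chain, a failed address
 * translation or mbuf exhaustion.
 */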
static __rte_always_inline int
dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
            struct rte_mbuf *m, uint16_t desc_idx,
            struct rte_mempool *mbuf_pool)
{
        struct vring_desc *desc;
        uint64_t desc_addr, desc_gaddr;
        uint32_t desc_avail, desc_offset;
        uint64_t desc_chunck_len;
        uint32_t mbuf_avail, mbuf_offset;
        uint32_t cpy_len;
        struct rte_mbuf *cur = m, *prev = m;
        /* A counter to avoid an endless loop on a malformed desc chain */
        uint32_t nr_desc = 1;

        desc = &vr->desc[desc_idx];
        if (unlikely((desc->len < dev->hdr_len)) ||
                        (desc->flags & VRING_DESC_F_INDIRECT))
                return -1;

        desc_chunck_len = desc->len;
        desc_gaddr = desc->addr;
        desc_addr = rte_vhost_va_from_guest_pa(
                        dev->mem, desc_gaddr, &desc_chunck_len);
        if (unlikely(!desc_addr))
                return -1;

        /*
         * We support neither ANY_LAYOUT nor VERSION_1, meaning a Tx
         * packet from the guest must have at least 2 desc buffers:
         * the first stores the header and the others store the data.
         *
         * And since we don't support TSO, we can simply skip the header.
         */
        desc = &vr->desc[desc->next];
        desc_chunck_len = desc->len;
        desc_gaddr = desc->addr;
        desc_addr = rte_vhost_va_from_guest_pa(
                        dev->mem, desc_gaddr, &desc_chunck_len);
        if (unlikely(!desc_addr))
                return -1;
        rte_prefetch0((void *)(uintptr_t)desc_addr);

        desc_offset = 0;
        desc_avail  = desc->len;
        nr_desc    += 1;

        mbuf_offset = 0;
        mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
        while (1) {
                cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
                rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
                                                   mbuf_offset),
                        (void *)((uintptr_t)(desc_addr + desc_offset)),
                        cpy_len);

                mbuf_avail  -= cpy_len;
                mbuf_offset += cpy_len;
                desc_avail  -= cpy_len;
                desc_offset += cpy_len;
                desc_chunck_len -= cpy_len;

                /* This desc has reached its end; fetch the next one */
                if (desc_avail == 0) {
                        if ((desc->flags & VRING_DESC_F_NEXT) == 0)
                                break;

                        if (unlikely(desc->next >= vr->size ||
                                     ++nr_desc > vr->size))
                                return -1;
                        desc = &vr->desc[desc->next];

                        desc_chunck_len = desc->len;
                        desc_gaddr = desc->addr;
                        desc_addr = rte_vhost_va_from_guest_pa(
                                        dev->mem, desc_gaddr, &desc_chunck_len);
                        if (unlikely(!desc_addr))
                                return -1;
                        rte_prefetch0((void *)(uintptr_t)desc_addr);

                        desc_offset = 0;
                        desc_avail  = desc->len;
                } else if (unlikely(desc_chunck_len == 0)) {
                        desc_chunck_len = desc_avail;
                        desc_gaddr += desc_offset;
                        desc_addr = rte_vhost_va_from_guest_pa(dev->mem,
                                        desc_gaddr,
                                        &desc_chunck_len);
                        if (unlikely(!desc_addr))
                                return -1;

                        desc_offset = 0;
                }

                /*
                 * This mbuf has reached its end; get a new one to hold
                 * more data.
                 */
                if (mbuf_avail == 0) {
                        cur = rte_pktmbuf_alloc(mbuf_pool);
                        if (unlikely(cur == NULL)) {
                                RTE_LOG(ERR, VHOST_DATA,
                                        "Failed to allocate memory for mbuf.\n");
                                return -1;
                        }

                        prev->next = cur;
                        prev->data_len = mbuf_offset;
                        m->nb_segs += 1;
                        m->pkt_len += mbuf_offset;
                        prev = cur;

                        mbuf_offset = 0;
                        mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
                }
        }

        prev->data_len = mbuf_offset;
        m->pkt_len    += mbuf_offset;

        return 0;
}

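/*
 * Burst-dequeue guest Tx packets from virtqueue 'queue_id' into 'pkts'.
 * Head indexes are gathered and used-ring entries pre-filled up front,
 * then each chain is copied into a freshly allocated mbuf. Returns the
 * number of packets actually received.
 */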
uint16_t
vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
        struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
{
        struct vhost_queue *queue;
        struct rte_vhost_vring *vr;
        uint32_t desc_indexes[MAX_PKT_BURST];
        uint32_t used_idx;
        uint32_t i = 0;
        uint16_t free_entries;
        uint16_t avail_idx;

        queue = &dev->queues[queue_id];
        vr    = &queue->vr;

        free_entries = *((volatile uint16_t *)&vr->avail->idx) -
                        queue->last_avail_idx;
        if (free_entries == 0)
                return 0;

        /* Prefetch the available and used rings */
        avail_idx = queue->last_avail_idx & (vr->size - 1);
        used_idx  = queue->last_used_idx  & (vr->size - 1);
        rte_prefetch0(&vr->avail->ring[avail_idx]);
        rte_prefetch0(&vr->used->ring[used_idx]);

        count = RTE_MIN(count, MAX_PKT_BURST);
        count = RTE_MIN(count, free_entries);

        if (unlikely(count == 0))
                return 0;

        /*
         * Retrieve all of the head indexes first and pre-update used entries
         * to avoid caching issues.
         */
        for (i = 0; i < count; i++) {
                avail_idx = (queue->last_avail_idx + i) & (vr->size - 1);
                used_idx  = (queue->last_used_idx  + i) & (vr->size - 1);
                desc_indexes[i] = vr->avail->ring[avail_idx];

                vr->used->ring[used_idx].id  = desc_indexes[i];
                vr->used->ring[used_idx].len = 0;
        }

        /* Prefetch the first descriptor. */
        rte_prefetch0(&vr->desc[desc_indexes[0]]);
        for (i = 0; i < count; i++) {
                int err;

                if (likely(i + 1 < count))
                        rte_prefetch0(&vr->desc[desc_indexes[i + 1]]);

                pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
                if (unlikely(pkts[i] == NULL)) {
                        RTE_LOG(ERR, VHOST_DATA,
                                "Failed to allocate memory for mbuf.\n");
                        break;
                }

                err = dequeue_pkt(dev, vr, pkts[i], desc_indexes[i], mbuf_pool);
                if (unlikely(err)) {
                        rte_pktmbuf_free(pkts[i]);
                        break;
                }
        }

        queue->last_avail_idx += i;
        queue->last_used_idx += i;
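        /*
         * All reads from the guest buffers (rmb) and writes to the used
         * ring (wmb) must complete before used->idx is published below,
         * as the guest may recycle the descriptors right after.
         */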
        rte_smp_wmb();
        rte_smp_rmb();

        vr->used->idx += i;

        /* Kick the guest if necessary. */
        if (!(vr->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
                        && (vr->callfd >= 0))
                eventfd_write(vr->callfd, (eventfd_t)1);

        return i;
}