deb_dpdk.git: drivers/net/virtio/virtio_rxtx.c (new upstream version 16.11.5)
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#include <rte_cycles.h>
#include <rte_memory.h>
#include <rte_memzone.h>
#include <rte_branch_prediction.h>
#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_ether.h>
#include <rte_ethdev.h>
#include <rte_prefetch.h>
#include <rte_string_fns.h>
#include <rte_errno.h>
#include <rte_byteorder.h>
#include <rte_cpuflags.h>
#include <rte_net.h>
#include <rte_ip.h>
#include <rte_udp.h>
#include <rte_tcp.h>

#include "virtio_logs.h"
#include "virtio_ethdev.h"
#include "virtio_pci.h"
#include "virtqueue.h"
#include "virtio_rxtx.h"
#include "virtio_rxtx_simple.h"

#ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP
#define VIRTIO_DUMP_PACKET(m, len) rte_pktmbuf_dump(stdout, m, len)
#else
#define VIRTIO_DUMP_PACKET(m, len) do { } while (0)
#endif


#define VIRTIO_SIMPLE_FLAGS ((uint32_t)ETH_TXQ_FLAGS_NOMULTSEGS | \
        ETH_TXQ_FLAGS_NOOFFLOADS)

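/*
 * Return the descriptor chain starting at desc_idx to the free list:
 * credit vq_free_cnt with the number of descriptors recorded in vq_descx,
 * walk the chain to find its tail (indirect descriptors occupy a single
 * slot and are not walked), then link the chain onto the existing free
 * list, or make it the new head if the free list was empty.
 */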
void
vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
{
        struct vring_desc *dp, *dp_tail;
        struct vq_desc_extra *dxp;
        uint16_t desc_idx_last = desc_idx;

        dp  = &vq->vq_ring.desc[desc_idx];
        dxp = &vq->vq_descx[desc_idx];
        vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt + dxp->ndescs);
        if ((dp->flags & VRING_DESC_F_INDIRECT) == 0) {
                while (dp->flags & VRING_DESC_F_NEXT) {
                        desc_idx_last = dp->next;
                        dp = &vq->vq_ring.desc[dp->next];
                }
        }
        dxp->ndescs = 0;

        /*
         * We must append the existing free chain, if any, to the end of
         * the newly freed chain. If the virtqueue was completely used,
         * the head is VQ_RING_DESC_CHAIN_END.
         */
        if (vq->vq_desc_tail_idx == VQ_RING_DESC_CHAIN_END) {
                vq->vq_desc_head_idx = desc_idx;
        } else {
                dp_tail = &vq->vq_ring.desc[vq->vq_desc_tail_idx];
                dp_tail->next = desc_idx;
        }

        vq->vq_desc_tail_idx = desc_idx_last;
        dp->next = VQ_RING_DESC_CHAIN_END;
}

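/*
 * Dequeue up to 'num' completed receive buffers from the used ring.
 * For each used element the attached mbuf (the descriptor cookie) is
 * returned in rx_pkts[] together with its length in len[], and the
 * descriptor chain is handed back to the free list. Returns the number
 * of buffers actually dequeued; the caller checks availability.
 */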
static uint16_t
virtqueue_dequeue_burst_rx(struct virtqueue *vq, struct rte_mbuf **rx_pkts,
                           uint32_t *len, uint16_t num)
{
        struct vring_used_elem *uep;
        struct rte_mbuf *cookie;
        uint16_t used_idx, desc_idx;
        uint16_t i;

        /* Caller does the check */
        for (i = 0; i < num ; i++) {
                used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
                uep = &vq->vq_ring.used->ring[used_idx];
                desc_idx = (uint16_t) uep->id;
                len[i] = uep->len;
                cookie = (struct rte_mbuf *)vq->vq_descx[desc_idx].cookie;

                if (unlikely(cookie == NULL)) {
                        PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u\n",
                                vq->vq_used_cons_idx);
                        break;
                }

                rte_prefetch0(cookie);
                rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *));
                rx_pkts[i]  = cookie;
                vq->vq_used_cons_idx++;
                vq_ring_free_chain(vq, desc_idx);
                vq->vq_descx[desc_idx].cookie = NULL;
        }

        return i;
}

#ifndef DEFAULT_TX_FREE_THRESH
#define DEFAULT_TX_FREE_THRESH 32
#endif

/* Cleanup from completed transmits. */
static void
virtio_xmit_cleanup(struct virtqueue *vq, uint16_t num)
{
        uint16_t i, used_idx, desc_idx;
        for (i = 0; i < num; i++) {
                struct vring_used_elem *uep;
                struct vq_desc_extra *dxp;

                used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
                uep = &vq->vq_ring.used->ring[used_idx];

                desc_idx = (uint16_t) uep->id;
                dxp = &vq->vq_descx[desc_idx];
                vq->vq_used_cons_idx++;
                vq_ring_free_chain(vq, desc_idx);

                if (dxp->cookie != NULL) {
                        rte_pktmbuf_free(dxp->cookie);
                        dxp->cookie = NULL;
                }
        }
}

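/*
 * Post a single mbuf as a receive buffer. The descriptor covers the
 * virtio-net header area just in front of the mbuf headroom plus the
 * remaining buffer space, is marked device-writable, and the avail ring
 * is updated. Returns -ENOSPC/-EMSGSIZE when no descriptor is free and
 * -EFAULT on a corrupted free list.
 */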
static inline int
virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie)
{
        struct vq_desc_extra *dxp;
        struct virtio_hw *hw = vq->hw;
        struct vring_desc *start_dp;
        uint16_t needed = 1;
        uint16_t head_idx, idx;

        if (unlikely(vq->vq_free_cnt == 0))
                return -ENOSPC;
        if (unlikely(vq->vq_free_cnt < needed))
                return -EMSGSIZE;

        head_idx = vq->vq_desc_head_idx;
        if (unlikely(head_idx >= vq->vq_nentries))
                return -EFAULT;

        idx = head_idx;
        dxp = &vq->vq_descx[idx];
        dxp->cookie = (void *)cookie;
        dxp->ndescs = needed;

        start_dp = vq->vq_ring.desc;
        start_dp[idx].addr =
                VIRTIO_MBUF_ADDR(cookie, vq) +
                RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size;
        start_dp[idx].len =
                cookie->buf_len - RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
        start_dp[idx].flags = VRING_DESC_F_WRITE;
        idx = start_dp[idx].next;
        vq->vq_desc_head_idx = idx;
        if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
                vq->vq_desc_tail_idx = idx;
        vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - needed);
        vq_update_avail_ring(vq, head_idx);

        return 0;
}

/* When doing TSO, the IP length is not included in the pseudo header
 * checksum of the packet given to the PMD, but for virtio it is
 * expected.
 */
static void
virtio_tso_fix_cksum(struct rte_mbuf *m)
{
        /* common case: header is not fragmented */
        if (likely(rte_pktmbuf_data_len(m) >= m->l2_len + m->l3_len +
                        m->l4_len)) {
                struct ipv4_hdr *iph;
                struct ipv6_hdr *ip6h;
                struct tcp_hdr *th;
                uint16_t prev_cksum, new_cksum, ip_len, ip_paylen;
                uint32_t tmp;

                iph = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, m->l2_len);
                th = RTE_PTR_ADD(iph, m->l3_len);
                if ((iph->version_ihl >> 4) == 4) {
                        iph->hdr_checksum = 0;
                        iph->hdr_checksum = rte_ipv4_cksum(iph);
                        ip_len = iph->total_length;
                        ip_paylen = rte_cpu_to_be_16(rte_be_to_cpu_16(ip_len) -
                                m->l3_len);
                } else {
                        ip6h = (struct ipv6_hdr *)iph;
                        ip_paylen = ip6h->payload_len;
                }

                /* calculate the new phdr checksum not including ip_paylen */
                prev_cksum = th->cksum;
                tmp = prev_cksum;
                tmp += ip_paylen;
                tmp = (tmp & 0xffff) + (tmp >> 16);
                new_cksum = tmp;

                /* replace it in the packet */
                th->cksum = new_cksum;
        }
}

static inline int
tx_offload_enabled(struct virtio_hw *hw)
{
        return vtpci_with_feature(hw, VIRTIO_NET_F_CSUM) ||
                vtpci_with_feature(hw, VIRTIO_NET_F_HOST_TSO4) ||
                vtpci_with_feature(hw, VIRTIO_NET_F_HOST_TSO6);
}

/* avoid the write when the value is already set, to lessen cache issues */
#define ASSIGN_UNLESS_EQUAL(var, val) do {      \
        if ((var) != (val))                     \
                (var) = (val);                  \
} while (0)

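/*
 * Enqueue a single packet for transmission. The virtio-net header is
 * placed in one of three ways: pushed into the mbuf headroom when the
 * device accepts any descriptor layout (can_push), carried in the
 * per-slot indirect descriptor table (use_indirect), or described by a
 * separate first descriptor pointing into the reserved header region.
 * Checksum and TSO requests from ol_flags are then translated into the
 * header, one descriptor is filled per mbuf segment, and the avail ring
 * is updated. The caller guarantees enough free descriptors.
 */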
static inline void
virtqueue_enqueue_xmit(struct virtnet_tx *txvq, struct rte_mbuf *cookie,
                       uint16_t needed, int use_indirect, int can_push)
{
        struct virtio_tx_region *txr = txvq->virtio_net_hdr_mz->addr;
        struct vq_desc_extra *dxp;
        struct virtqueue *vq = txvq->vq;
        struct vring_desc *start_dp;
        uint16_t seg_num = cookie->nb_segs;
        uint16_t head_idx, idx;
        uint16_t head_size = vq->hw->vtnet_hdr_size;
        struct virtio_net_hdr *hdr;
        int offload;

        offload = tx_offload_enabled(vq->hw);
        head_idx = vq->vq_desc_head_idx;
        idx = head_idx;
        dxp = &vq->vq_descx[idx];
        dxp->cookie = (void *)cookie;
        dxp->ndescs = needed;

        start_dp = vq->vq_ring.desc;

        if (can_push) {
                /* prepend cannot fail, checked by caller */
                hdr = (struct virtio_net_hdr *)
                        rte_pktmbuf_prepend(cookie, head_size);
                /* rte_pktmbuf_prepend() adds the header size to the packet
                 * length, which we do not want here; the subtraction below
                 * restores the correct packet size.
                 */
                cookie->pkt_len -= head_size;
                /* if offload disabled, it is not zeroed below, do it now */
                if (offload == 0) {
                        ASSIGN_UNLESS_EQUAL(hdr->csum_start, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->csum_offset, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->flags, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->gso_type, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->gso_size, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->hdr_len, 0);
                }
        } else if (use_indirect) {
                /* setup tx ring slot to point to indirect
                 * descriptor list stored in reserved region.
                 *
                 * the first slot in indirect ring is already preset
                 * to point to the header in reserved region
                 */
                start_dp[idx].addr  = txvq->virtio_net_hdr_mem +
                        RTE_PTR_DIFF(&txr[idx].tx_indir, txr);
                start_dp[idx].len   = (seg_num + 1) * sizeof(struct vring_desc);
                start_dp[idx].flags = VRING_DESC_F_INDIRECT;
                hdr = (struct virtio_net_hdr *)&txr[idx].tx_hdr;

                /* loop below will fill in rest of the indirect elements */
                start_dp = txr[idx].tx_indir;
                idx = 1;
        } else {
                /* setup first tx ring slot to point to header
                 * stored in reserved region.
                 */
                start_dp[idx].addr  = txvq->virtio_net_hdr_mem +
                        RTE_PTR_DIFF(&txr[idx].tx_hdr, txr);
                start_dp[idx].len   = vq->hw->vtnet_hdr_size;
                start_dp[idx].flags = VRING_DESC_F_NEXT;
                hdr = (struct virtio_net_hdr *)&txr[idx].tx_hdr;

                idx = start_dp[idx].next;
        }

        /* Checksum Offload / TSO */
        if (offload) {
                if (cookie->ol_flags & PKT_TX_TCP_SEG)
                        cookie->ol_flags |= PKT_TX_TCP_CKSUM;

                switch (cookie->ol_flags & PKT_TX_L4_MASK) {
                case PKT_TX_UDP_CKSUM:
                        hdr->csum_start = cookie->l2_len + cookie->l3_len;
                        hdr->csum_offset = offsetof(struct udp_hdr,
                                dgram_cksum);
                        hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
                        break;

                case PKT_TX_TCP_CKSUM:
                        hdr->csum_start = cookie->l2_len + cookie->l3_len;
                        hdr->csum_offset = offsetof(struct tcp_hdr, cksum);
                        hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
                        break;

                default:
                        ASSIGN_UNLESS_EQUAL(hdr->csum_start, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->csum_offset, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->flags, 0);
                        break;
                }

                /* TCP Segmentation Offload */
                if (cookie->ol_flags & PKT_TX_TCP_SEG) {
                        virtio_tso_fix_cksum(cookie);
                        hdr->gso_type = (cookie->ol_flags & PKT_TX_IPV6) ?
                                VIRTIO_NET_HDR_GSO_TCPV6 :
                                VIRTIO_NET_HDR_GSO_TCPV4;
                        hdr->gso_size = cookie->tso_segsz;
                        hdr->hdr_len =
                                cookie->l2_len +
                                cookie->l3_len +
                                cookie->l4_len;
                } else {
                        ASSIGN_UNLESS_EQUAL(hdr->gso_type, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->gso_size, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->hdr_len, 0);
                }
        }

        do {
                start_dp[idx].addr  = VIRTIO_MBUF_DATA_DMA_ADDR(cookie, vq);
                start_dp[idx].len   = cookie->data_len;
                start_dp[idx].flags = cookie->next ? VRING_DESC_F_NEXT : 0;
                idx = start_dp[idx].next;
        } while ((cookie = cookie->next) != NULL);

        if (use_indirect)
                idx = vq->vq_ring.desc[head_idx].next;

        vq->vq_desc_head_idx = idx;
        if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
                vq->vq_desc_tail_idx = idx;
        vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - needed);
        vq_update_avail_ring(vq, head_idx);
}

void
virtio_dev_cq_start(struct rte_eth_dev *dev)
{
        struct virtio_hw *hw = dev->data->dev_private;

        if (hw->cvq && hw->cvq->vq) {
                VIRTQUEUE_DUMP((struct virtqueue *)hw->cvq->vq);
        }
}

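/*
 * Ethdev rx_queue_setup callback: bind the already allocated virtqueue
 * to the requested queue index and mempool. Ring population is done
 * later in virtio_dev_rx_queue_setup_finish().
 */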
int
virtio_dev_rx_queue_setup(struct rte_eth_dev *dev,
                        uint16_t queue_idx,
                        uint16_t nb_desc,
                        unsigned int socket_id __rte_unused,
                        __rte_unused const struct rte_eth_rxconf *rx_conf,
                        struct rte_mempool *mp)
{
        uint16_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_RQ_QUEUE_IDX;
        struct virtio_hw *hw = dev->data->dev_private;
        struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
        struct virtnet_rx *rxvq;

        PMD_INIT_FUNC_TRACE();

        if (nb_desc == 0 || nb_desc > vq->vq_nentries)
                nb_desc = vq->vq_nentries;
        vq->vq_free_cnt = RTE_MIN(vq->vq_free_cnt, nb_desc);

        rxvq = &vq->rxq;
        rxvq->queue_id = queue_idx;
        rxvq->mpool = mp;
        if (rxvq->mpool == NULL) {
                rte_exit(EXIT_FAILURE,
                        "Cannot allocate mbufs for rx virtqueue");
        }
        dev->data->rx_queues[queue_idx] = rxvq;

        return 0;
}

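/*
 * Second stage of rx queue setup, run when the device is about to
 * start: prepare the ring for the simple (vectorized) path if enabled,
 * install the fake mbuf entries used by the vector code, and fill the
 * receive ring with mbufs from the queue's mempool.
 */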
int
virtio_dev_rx_queue_setup_finish(struct rte_eth_dev *dev, uint16_t queue_idx)
{
        uint16_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_RQ_QUEUE_IDX;
        struct virtio_hw *hw = dev->data->dev_private;
        struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
        struct virtnet_rx *rxvq = &vq->rxq;
        struct rte_mbuf *m;
        uint16_t desc_idx;
        int error, nbufs;

        PMD_INIT_FUNC_TRACE();

        /* Allocate blank mbufs for each rx descriptor */
        nbufs = 0;

        if (hw->use_simple_rxtx) {
                for (desc_idx = 0; desc_idx < vq->vq_nentries;
                     desc_idx++) {
                        vq->vq_ring.avail->ring[desc_idx] = desc_idx;
                        vq->vq_ring.desc[desc_idx].flags =
                                VRING_DESC_F_WRITE;
                }

                virtio_rxq_vec_setup(rxvq);
        }

        memset(&rxvq->fake_mbuf, 0, sizeof(rxvq->fake_mbuf));
        for (desc_idx = 0; desc_idx < RTE_PMD_VIRTIO_RX_MAX_BURST;
             desc_idx++) {
                vq->sw_ring[vq->vq_nentries + desc_idx] =
                        &rxvq->fake_mbuf;
        }

        if (hw->use_simple_rxtx) {
                while (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
                        virtio_rxq_rearm_vec(rxvq);
                        nbufs += RTE_VIRTIO_VPMD_RX_REARM_THRESH;
                }
        } else {
                while (!virtqueue_full(vq)) {
                        m = rte_mbuf_raw_alloc(rxvq->mpool);
                        if (m == NULL)
                                break;

                        /* Enqueue allocated buffers */
                        error = virtqueue_enqueue_recv_refill(vq, m);
                        if (error) {
                                rte_pktmbuf_free(m);
                                break;
                        }
                        nbufs++;
                }

                vq_update_avail_idx(vq);
        }

        PMD_INIT_LOG(DEBUG, "Allocated %d bufs", nbufs);

        VIRTQUEUE_DUMP(vq);

        return 0;
}

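/*
 * Choose between the default and the simple (vectorized) rx/tx
 * handlers. The simple path requires SSE3 on x86 or NEON on ARM,
 * single-segment transmits with no offloads, and a device without
 * mergeable receive buffers.
 */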
static void
virtio_update_rxtx_handler(struct rte_eth_dev *dev,
                           const struct rte_eth_txconf *tx_conf)
{
        uint8_t use_simple_rxtx = 0;
        struct virtio_hw *hw = dev->data->dev_private;

#if defined RTE_ARCH_X86
        if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE3))
                use_simple_rxtx = 1;
#elif defined RTE_ARCH_ARM64 || defined RTE_ARCH_ARM
        if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
                use_simple_rxtx = 1;
#endif
        /* Use simple rx/tx func if single segment and no offloads */
        if (use_simple_rxtx &&
            (tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
            !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
                PMD_INIT_LOG(INFO, "Using simple rx/tx path");
                dev->tx_pkt_burst = virtio_xmit_pkts_simple;
                dev->rx_pkt_burst = virtio_recv_pkts_vec;
                hw->use_simple_rxtx = use_simple_rxtx;
        }
}

/*
 * struct rte_eth_dev *dev: Used to update dev
 * uint16_t nb_desc: Defaults to values read from config space
 * unsigned int socket_id: Used to allocate memzone
 * const struct rte_eth_txconf *tx_conf: Used to setup tx engine
 * uint16_t queue_idx: Just used as an index in dev txq list
 */
int
virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
                        uint16_t queue_idx,
                        uint16_t nb_desc,
                        unsigned int socket_id __rte_unused,
                        const struct rte_eth_txconf *tx_conf)
{
        uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
        struct virtio_hw *hw = dev->data->dev_private;
        struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
        struct virtnet_tx *txvq;
        uint16_t tx_free_thresh;

        PMD_INIT_FUNC_TRACE();

        virtio_update_rxtx_handler(dev, tx_conf);

        if (nb_desc == 0 || nb_desc > vq->vq_nentries)
                nb_desc = vq->vq_nentries;
        vq->vq_free_cnt = RTE_MIN(vq->vq_free_cnt, nb_desc);

        txvq = &vq->txq;
        txvq->queue_id = queue_idx;

        tx_free_thresh = tx_conf->tx_free_thresh;
        if (tx_free_thresh == 0)
                tx_free_thresh =
                        RTE_MIN(vq->vq_nentries / 4, DEFAULT_TX_FREE_THRESH);

        if (tx_free_thresh >= (vq->vq_nentries - 3)) {
                RTE_LOG(ERR, PMD, "tx_free_thresh must be less than the "
                        "number of TX entries minus 3 (%u)."
                        " (tx_free_thresh=%u port=%u queue=%u)\n",
                        vq->vq_nentries - 3,
                        tx_free_thresh, dev->data->port_id, queue_idx);
                return -EINVAL;
        }

        vq->vq_free_thresh = tx_free_thresh;

        dev->data->tx_queues[queue_idx] = txvq;
        return 0;
}

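/*
 * Second stage of tx queue setup: when the simple path is in use,
 * pre-link each data descriptor in the first half of the ring with a
 * fixed header descriptor in the second half, so that transmission
 * only has to fill in the data descriptor.
 */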
int
virtio_dev_tx_queue_setup_finish(struct rte_eth_dev *dev,
                                uint16_t queue_idx)
{
        uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
        struct virtio_hw *hw = dev->data->dev_private;
        struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
        uint16_t mid_idx = vq->vq_nentries >> 1;
        struct virtnet_tx *txvq = &vq->txq;
        uint16_t desc_idx;

        PMD_INIT_FUNC_TRACE();

        if (hw->use_simple_rxtx) {
                for (desc_idx = 0; desc_idx < mid_idx; desc_idx++) {
                        vq->vq_ring.avail->ring[desc_idx] =
                                desc_idx + mid_idx;
                        vq->vq_ring.desc[desc_idx + mid_idx].next =
                                desc_idx;
                        vq->vq_ring.desc[desc_idx + mid_idx].addr =
                                txvq->virtio_net_hdr_mem +
                                offsetof(struct virtio_tx_region, tx_hdr);
                        vq->vq_ring.desc[desc_idx + mid_idx].len =
                                vq->hw->vtnet_hdr_size;
                        vq->vq_ring.desc[desc_idx + mid_idx].flags =
                                VRING_DESC_F_NEXT;
                        vq->vq_ring.desc[desc_idx].flags = 0;
                }
                for (desc_idx = mid_idx; desc_idx < vq->vq_nentries;
                     desc_idx++)
                        vq->vq_ring.avail->ring[desc_idx] = desc_idx;
        }

        VIRTQUEUE_DUMP(vq);

        return 0;
}

static void
virtio_discard_rxbuf(struct virtqueue *vq, struct rte_mbuf *m)
{
        int error;
        /*
         * Requeue the discarded mbuf. This should always be
         * successful since it was just dequeued.
         */
        error = virtqueue_enqueue_recv_refill(vq, m);
        if (unlikely(error)) {
                RTE_LOG(ERR, PMD, "cannot requeue discarded mbuf");
                rte_pktmbuf_free(m);
        }
}

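/*
 * Per-packet statistics: account the packet into the size histogram
 * (size_bins) and bump the multicast/broadcast counters based on the
 * destination MAC address.
 */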
static void
virtio_update_packet_stats(struct virtnet_stats *stats, struct rte_mbuf *mbuf)
{
        uint32_t s = mbuf->pkt_len;
        struct ether_addr *ea;

        if (s == 64) {
                stats->size_bins[1]++;
        } else if (s > 64 && s < 1024) {
                uint32_t bin;

                /* count zeros, and offset into correct bin */
                bin = (sizeof(s) * 8) - __builtin_clz(s) - 5;
                stats->size_bins[bin]++;
        } else {
                if (s < 64)
                        stats->size_bins[0]++;
                else if (s < 1519)
                        stats->size_bins[6]++;
                else if (s >= 1519)
                        stats->size_bins[7]++;
        }

        ea = rte_pktmbuf_mtod(mbuf, struct ether_addr *);
        if (is_multicast_ether_addr(ea)) {
                if (is_broadcast_ether_addr(ea))
                        stats->broadcast++;
                else
                        stats->multicast++;
        }
}

/* Optionally fill offload information in structure */
static int
virtio_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
{
        struct rte_net_hdr_lens hdr_lens;
        uint32_t hdrlen, ptype;
        int l4_supported = 0;

        /* nothing to do */
        if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
                return 0;

        m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;

        ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
        m->packet_type = ptype;
        if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
            (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
            (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
                l4_supported = 1;

        if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
                hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
                if (hdr->csum_start <= hdrlen && l4_supported) {
                        m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
                } else {
                        /* Unknown proto or tunnel, do sw cksum. We can assume
                         * the cksum field is in the first segment since the
                         * buffers we provided to the host are large enough.
                         * In case of SCTP, this will be wrong since it's a CRC
                         * but there's nothing we can do.
                         */
                        uint16_t csum = 0, off;

                        rte_raw_cksum_mbuf(m, hdr->csum_start,
                                rte_pktmbuf_pkt_len(m) - hdr->csum_start,
                                &csum);
                        if (likely(csum != 0xffff))
                                csum = ~csum;
                        off = hdr->csum_offset + hdr->csum_start;
                        if (rte_pktmbuf_data_len(m) >= off + 1)
                                *rte_pktmbuf_mtod_offset(m, uint16_t *,
                                        off) = csum;
                }
        } else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) {
                m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
        }

        /* GSO request, save required information in mbuf */
        if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                /* Check unsupported modes */
                if ((hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN) ||
                    (hdr->gso_size == 0)) {
                        return -EINVAL;
                }

                /* Update mss length in mbuf */
                m->tso_segsz = hdr->gso_size;
                switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
                        case VIRTIO_NET_HDR_GSO_TCPV4:
                        case VIRTIO_NET_HDR_GSO_TCPV6:
                                m->ol_flags |= PKT_RX_LRO | \
                                        PKT_RX_L4_CKSUM_NONE;
                                break;
                        default:
                                return -EINVAL;
                }
        }

        return 0;
}

static inline int
rx_offload_enabled(struct virtio_hw *hw)
{
        return vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_CSUM) ||
                vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_TSO4) ||
                vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_TSO6);
}

#define VIRTIO_MBUF_BURST_SZ 64
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
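/*
 * Receive burst for devices without mergeable buffers: each used ring
 * entry carries one full packet. Dequeued mbufs are validated,
 * optionally VLAN-stripped and offload-annotated, and the ring is
 * refilled with fresh mbufs before notifying the host if needed.
 */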
uint16_t
virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
        struct virtnet_rx *rxvq = rx_queue;
        struct virtqueue *vq = rxvq->vq;
        struct virtio_hw *hw;
        struct rte_mbuf *rxm, *new_mbuf;
        uint16_t nb_used, num, nb_rx;
        uint32_t len[VIRTIO_MBUF_BURST_SZ];
        struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
        int error;
        uint32_t i, nb_enqueued;
        uint32_t hdr_size;
        int offload;
        struct virtio_net_hdr *hdr;

        nb_used = VIRTQUEUE_NUSED(vq);

        virtio_rmb();

        num = (uint16_t)(likely(nb_used <= nb_pkts) ? nb_used : nb_pkts);
        num = (uint16_t)(likely(num <= VIRTIO_MBUF_BURST_SZ) ? num : VIRTIO_MBUF_BURST_SZ);
        if (likely(num > DESC_PER_CACHELINE))
                num = num - ((vq->vq_used_cons_idx + num) % DESC_PER_CACHELINE);

        num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, num);
        PMD_RX_LOG(DEBUG, "used:%d dequeue:%d", nb_used, num);

        hw = vq->hw;
        nb_rx = 0;
        nb_enqueued = 0;
        hdr_size = hw->vtnet_hdr_size;
        offload = rx_offload_enabled(hw);

        for (i = 0; i < num ; i++) {
                rxm = rcv_pkts[i];

                PMD_RX_LOG(DEBUG, "packet len:%d", len[i]);

                if (unlikely(len[i] < hdr_size + ETHER_HDR_LEN)) {
                        PMD_RX_LOG(ERR, "Packet drop");
                        nb_enqueued++;
                        virtio_discard_rxbuf(vq, rxm);
                        rxvq->stats.errors++;
                        continue;
                }

                rxm->port = rxvq->port_id;
                rxm->data_off = RTE_PKTMBUF_HEADROOM;
                rxm->ol_flags = 0;
                rxm->vlan_tci = 0;

                rxm->nb_segs = 1;
                rxm->next = NULL;
                rxm->pkt_len = (uint32_t)(len[i] - hdr_size);
                rxm->data_len = (uint16_t)(len[i] - hdr_size);

                hdr = (struct virtio_net_hdr *)((char *)rxm->buf_addr +
                        RTE_PKTMBUF_HEADROOM - hdr_size);

                if (hw->vlan_strip)
                        rte_vlan_strip(rxm);

                if (offload && virtio_rx_offload(rxm, hdr) < 0) {
                        virtio_discard_rxbuf(vq, rxm);
                        rxvq->stats.errors++;
                        continue;
                }

                VIRTIO_DUMP_PACKET(rxm, rxm->data_len);

                rx_pkts[nb_rx++] = rxm;

                rxvq->stats.bytes += rx_pkts[nb_rx - 1]->pkt_len;
                virtio_update_packet_stats(&rxvq->stats, rxm);
        }

        rxvq->stats.packets += nb_rx;

        /* Allocate new mbuf for the used descriptor */
        error = ENOSPC;
        while (likely(!virtqueue_full(vq))) {
                new_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
                if (unlikely(new_mbuf == NULL)) {
                        struct rte_eth_dev *dev
                                = &rte_eth_devices[rxvq->port_id];
                        dev->data->rx_mbuf_alloc_failed++;
                        break;
                }
                error = virtqueue_enqueue_recv_refill(vq, new_mbuf);
                if (unlikely(error)) {
                        rte_pktmbuf_free(new_mbuf);
                        break;
                }
                nb_enqueued++;
        }

        if (likely(nb_enqueued)) {
                vq_update_avail_idx(vq);

                if (unlikely(virtqueue_kick_prepare(vq))) {
                        virtqueue_notify(vq);
                        PMD_RX_LOG(DEBUG, "Notified");
                }
        }

        return nb_rx;
}

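/*
 * Receive burst for devices with VIRTIO_NET_F_MRG_RXBUF: a packet may
 * span several descriptors, with num_buffers in the first header giving
 * the count. Extra segments are dequeued and chained onto the head mbuf
 * before the packet is returned, then the ring is refilled.
 */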
uint16_t
virtio_recv_mergeable_pkts(void *rx_queue,
                        struct rte_mbuf **rx_pkts,
                        uint16_t nb_pkts)
{
        struct virtnet_rx *rxvq = rx_queue;
        struct virtqueue *vq = rxvq->vq;
        struct virtio_hw *hw;
        struct rte_mbuf *rxm, *new_mbuf;
        uint16_t nb_used, num, nb_rx;
        uint32_t len[VIRTIO_MBUF_BURST_SZ];
        struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
        struct rte_mbuf *prev;
        int error;
        uint32_t i, nb_enqueued;
        uint32_t seg_num;
        uint16_t extra_idx;
        uint32_t seg_res;
        uint32_t hdr_size;
        int offload;

        nb_used = VIRTQUEUE_NUSED(vq);

        virtio_rmb();

        PMD_RX_LOG(DEBUG, "used:%d", nb_used);

        hw = vq->hw;
        nb_rx = 0;
        i = 0;
        nb_enqueued = 0;
        seg_num = 0;
        extra_idx = 0;
        seg_res = 0;
        hdr_size = hw->vtnet_hdr_size;
        offload = rx_offload_enabled(hw);

        while (i < nb_used) {
                struct virtio_net_hdr_mrg_rxbuf *header;

                if (nb_rx == nb_pkts)
                        break;

                num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, 1);
                if (num != 1)
                        continue;

                i++;

                PMD_RX_LOG(DEBUG, "dequeue:%d", num);
                PMD_RX_LOG(DEBUG, "packet len:%d", len[0]);

                rxm = rcv_pkts[0];

                if (unlikely(len[0] < hdr_size + ETHER_HDR_LEN)) {
                        PMD_RX_LOG(ERR, "Packet drop");
                        nb_enqueued++;
                        virtio_discard_rxbuf(vq, rxm);
                        rxvq->stats.errors++;
                        continue;
                }

                header = (struct virtio_net_hdr_mrg_rxbuf *)((char *)rxm->buf_addr +
                        RTE_PKTMBUF_HEADROOM - hdr_size);
                seg_num = header->num_buffers;

                if (seg_num == 0)
                        seg_num = 1;

                rxm->data_off = RTE_PKTMBUF_HEADROOM;
                rxm->nb_segs = seg_num;
                rxm->next = NULL;
                rxm->ol_flags = 0;
                rxm->vlan_tci = 0;
                rxm->pkt_len = (uint32_t)(len[0] - hdr_size);
                rxm->data_len = (uint16_t)(len[0] - hdr_size);

                rxm->port = rxvq->port_id;
                rx_pkts[nb_rx] = rxm;
                prev = rxm;

                if (offload && virtio_rx_offload(rxm, &header->hdr) < 0) {
                        virtio_discard_rxbuf(vq, rxm);
                        rxvq->stats.errors++;
                        continue;
                }

                seg_res = seg_num - 1;

                while (seg_res != 0) {
                        /*
                         * Get extra segments for current uncompleted packet.
                         */
                        uint16_t  rcv_cnt =
                                RTE_MIN(seg_res, RTE_DIM(rcv_pkts));
                        if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) {
                                uint32_t rx_num =
                                        virtqueue_dequeue_burst_rx(vq,
                                        rcv_pkts, len, rcv_cnt);
                                i += rx_num;
                                rcv_cnt = rx_num;
                        } else {
                                PMD_RX_LOG(ERR,
                                           "Not enough segments for packet.");
                                nb_enqueued++;
                                virtio_discard_rxbuf(vq, rxm);
                                rxvq->stats.errors++;
                                break;
                        }

                        extra_idx = 0;

                        while (extra_idx < rcv_cnt) {
                                rxm = rcv_pkts[extra_idx];

                                rxm->data_off = RTE_PKTMBUF_HEADROOM - hdr_size;
                                rxm->next = NULL;
                                rxm->pkt_len = (uint32_t)(len[extra_idx]);
                                rxm->data_len = (uint16_t)(len[extra_idx]);

                                if (prev)
                                        prev->next = rxm;

                                prev = rxm;
                                rx_pkts[nb_rx]->pkt_len += rxm->pkt_len;
                                extra_idx++;
                        };
                        seg_res -= rcv_cnt;
                }

                if (hw->vlan_strip)
                        rte_vlan_strip(rx_pkts[nb_rx]);

                VIRTIO_DUMP_PACKET(rx_pkts[nb_rx],
                        rx_pkts[nb_rx]->data_len);

                rxvq->stats.bytes += rx_pkts[nb_rx]->pkt_len;
                virtio_update_packet_stats(&rxvq->stats, rx_pkts[nb_rx]);
                nb_rx++;
        }

        rxvq->stats.packets += nb_rx;

        /* Allocate new mbuf for the used descriptor */
        error = ENOSPC;
        while (likely(!virtqueue_full(vq))) {
                new_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
                if (unlikely(new_mbuf == NULL)) {
                        struct rte_eth_dev *dev
                                = &rte_eth_devices[rxvq->port_id];
                        dev->data->rx_mbuf_alloc_failed++;
                        break;
                }
                error = virtqueue_enqueue_recv_refill(vq, new_mbuf);
                if (unlikely(error)) {
                        rte_pktmbuf_free(new_mbuf);
                        break;
                }
                nb_enqueued++;
        }

        if (likely(nb_enqueued)) {
                vq_update_avail_idx(vq);

                if (unlikely(virtqueue_kick_prepare(vq))) {
                        virtqueue_notify(vq);
                        PMD_RX_LOG(DEBUG, "Notified");
                }
        }

        return nb_rx;
}

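/*
 * Transmit burst: reclaim completed descriptors when the free count
 * drops below the queue threshold, insert VLAN tags in software when
 * requested, pick the cheapest descriptor layout per packet (header
 * push, indirect, or chained), enqueue, and kick the host if required.
 */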
uint16_t
virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
        struct virtnet_tx *txvq = tx_queue;
        struct virtqueue *vq = txvq->vq;
        struct virtio_hw *hw = vq->hw;
        uint16_t hdr_size = hw->vtnet_hdr_size;
        uint16_t nb_used, nb_tx;
        int error;

        if (unlikely(nb_pkts < 1))
                return nb_pkts;

        PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
        nb_used = VIRTQUEUE_NUSED(vq);

        virtio_rmb();
        if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
                virtio_xmit_cleanup(vq, nb_used);

        for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
                struct rte_mbuf *txm = tx_pkts[nb_tx];
                int can_push = 0, use_indirect = 0, slots, need;

                /* Do VLAN tag insertion */
                if (unlikely(txm->ol_flags & PKT_TX_VLAN_PKT)) {
                        error = rte_vlan_insert(&txm);
                        if (unlikely(error)) {
                                rte_pktmbuf_free(txm);
                                continue;
                        }
                }

                /* optimize ring usage */
                if (vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) &&
                    rte_mbuf_refcnt_read(txm) == 1 &&
                    RTE_MBUF_DIRECT(txm) &&
                    txm->nb_segs == 1 &&
                    rte_pktmbuf_headroom(txm) >= hdr_size &&
                    rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
                                   __alignof__(struct virtio_net_hdr_mrg_rxbuf)))
                        can_push = 1;
                else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
                         txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
                        use_indirect = 1;

                /* How many main ring entries are needed for this Tx?
                 * any_layout => number of segments
                 * indirect   => 1
                 * default    => number of segments + 1
                 */
                slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
                need = slots - vq->vq_free_cnt;

                /* A positive value means more free vring descriptors are needed */
                if (unlikely(need > 0)) {
                        nb_used = VIRTQUEUE_NUSED(vq);
                        virtio_rmb();
                        need = RTE_MIN(need, (int)nb_used);

                        virtio_xmit_cleanup(vq, need);
                        need = slots - vq->vq_free_cnt;
                        if (unlikely(need > 0)) {
                                PMD_TX_LOG(ERR,
                                           "No free tx descriptors to transmit");
                                break;
                        }
                }

                /* Enqueue Packet buffers */
                virtqueue_enqueue_xmit(txvq, txm, slots, use_indirect, can_push);

                txvq->stats.bytes += txm->pkt_len;
                virtio_update_packet_stats(&txvq->stats, txm);
        }

        txvq->stats.packets += nb_tx;

        if (likely(nb_tx)) {
                vq_update_avail_idx(vq);

                if (unlikely(virtqueue_kick_prepare(vq))) {
                        virtqueue_notify(vq);
                        PMD_TX_LOG(DEBUG, "Notified backend after xmit");
                }
        }

        return nb_tx;
}