/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>
#include <rte_ip.h>
#include <rte_tcp.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* The maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port.
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +      \
                        (num_switching_cores * MAX_PKT_BURST) +            \
                        (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
                        ((num_switching_cores + 1) * MBUF_CACHE_SIZE))
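/*
 * Note: although written like a constant, this expands to a runtime
 * expression because it references the num_switching_cores variable.
 * As a purely illustrative example, with MAX_QUEUES = 128,
 * RTE_TEST_RX_DESC_DEFAULT = 1024 and two switching cores this comes to
 * 128 * 1024 + 2 * 32 + 2 * 512 + 3 * 128 = 132,544 mbufs per port.
 */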

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE

/*
 * For the zero copy implementation no frame data buffers are allocated
 * on the host: the guest allocates them, and vhost uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP       RTE_MBUF_DEFAULT_DATAROOM
#define MBUF_DATA_SIZE_ZCP              RTE_MBUF_DEFAULT_BUF_SIZE
#define MBUF_CACHE_SIZE_ZCP 0

#define MAX_PKT_BURST 32        /* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4      /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX           1
#define DEVICE_SAFE_REMOVE  2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need refining for the legacy and DPDK based front ends:
 * take the max vring avail descriptors/entries from the guest, subtract
 * MAX_PKT_BURST, then round to a power of 2.
 */
/*
 * For the legacy front end, 128 descriptors:
 * half for the virtio header, the other half for mbufs.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
                + sizeof(struct rte_mbuf)))
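/*
 * Illustrative note: the first 4 bytes of headroom sit immediately after
 * the rte_mbuf struct itself, so this aliases the start of the headroom
 * as a uint32_t that can serve as per-mbuf scratch space, e.g.:
 *
 *     MBUF_HEADROOM_UINT32(mbuf) = desc_idx;      // stash a value
 *     uint32_t idx = MBUF_HEADROOM_UINT32(mbuf);  // recover it later
 *
 * (Sketch only; desc_idx is a hypothetical name.)
 */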

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)
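/*
 * Example: POWEROF2(64) -> (63 & 64) == 0, true; POWEROF2(48) ->
 * (47 & 48) == 0x20, false. Note POWEROF2(0) also evaluates true, so
 * callers should reject 0 separately if it is not acceptable.
 */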

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by VMDQ. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
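/*
 * Example: with a 64-byte cache line and the 16-byte struct vring_desc
 * (u64 addr, u32 len, u16 flags, u16 next), this evaluates to 4.
 */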

#define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
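/*
 * rte_mbuf_from_indirect() yields the direct mbuf owning the data buffer;
 * if that differs from mb itself, mb is an indirect (attached) mbuf whose
 * data lives in memory it does not own.
 */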

/* Mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* Number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

/*
 * Enable zero copy: packet buffers are DMA'd directly to/from the HW
 * descriptors. Disabled by default.
 */
static uint32_t zero_copy;
static int mergeable;

/* Do VLAN strip on host; enabled by default. */
static uint32_t vlan_strip = 1;

/* Number of descriptors to use */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* Max ring descriptors: ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
        struct rte_mempool *pool;
        struct rte_ring *ring;
        uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];

/*
 * Enable VM2VM communications. If this is disabled then the MAC address
 * compare is skipped.
 */
typedef enum {
        VM2VM_DISABLED = 0,
        VM2VM_SOFTWARE = 1,
        VM2VM_HARDWARE = 2,
        VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
        PHYS_ADDR_CONTINUOUS = 0,
        PHYS_ADDR_CROSS_SUBREG = 1,
        PHYS_ADDR_INVALID = 2,
        PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Enable TX checksum offload; disabled by default. */
static uint32_t enable_tx_csum;

/* Enable TSO offload; disabled by default. */
static uint32_t enable_tso;

/* Timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
        .rxmode = {
                .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
                .split_hdr_size = 0,
                .header_split   = 0, /**< Header Split disabled */
                .hw_ip_checksum = 0, /**< IP checksum offload disabled */
                .hw_vlan_filter = 0, /**< VLAN filtering disabled */
                /*
                 * Required for 1G NICs such as the I350: fixes a bug where
                 * IPv4 forwarding in the guest could not forward packets
                 * from one virtio device to another.
                 */
                .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
                .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
                .hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
        },

        .txmode = {
                .mq_mode = ETH_MQ_TX_NONE,
        },
        .rx_adv_conf = {
                /*
                 * Overridden separately in code with appropriate values.
                 */
                .vmdq_rx_conf = {
                        .nb_queue_pools = ETH_8_POOLS,
                        .enable_default_pool = 0,
                        .default_pool = 0,
                        .nb_pool_maps = 0,
                        .pool_map = {{0, 0},},
                },
        },
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified on the command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
        1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
        1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
        1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
        1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
        1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
        1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
        1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* Ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* Heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/*
 * Array of data core structures containing information on individual core
 * linked lists.
 */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
        unsigned len;
        unsigned txq_id;
        struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* VLAN header struct used to insert VLAN tags on TX. */
struct vlan_ethhdr {
        unsigned char   h_dest[ETH_ALEN];
        unsigned char   h_source[ETH_ALEN];
        __be16          h_vlan_proto;
        __be16          h_vlan_TCI;
        __be16          h_vlan_encapsulated_proto;
};

/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18
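/*
 * VLAN_HLEN is the 4-byte 802.1Q tag (2-byte TPID + 2-byte TCI);
 * VLAN_ETH_HLEN is the 14-byte Ethernet header plus that tag.
 */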

/* Per-device statistics struct */
struct device_statistics {
        uint64_t tx_total;
        rte_atomic64_t rx_total_atomic;
        uint64_t rx_total;
        uint64_t tx;
        rte_atomic64_t rx_atomic;
        uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
        struct rte_eth_vmdq_rx_conf conf;
        struct rte_eth_vmdq_rx_conf *def_conf =
                &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
        unsigned i;

        memset(&conf, 0, sizeof(conf));
        conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
        conf.nb_pool_maps = num_devices;
        conf.enable_loop_back = def_conf->enable_loop_back;
        conf.rx_mode = def_conf->rx_mode;

        for (i = 0; i < conf.nb_pool_maps; i++) {
                conf.pool_map[i].vlan_id = vlan_tags[i];
                conf.pool_map[i].pools = (1UL << i);
        }

        (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
        (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
                   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
        return 0;
}
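/*
 * Illustrative mapping: with num_devices == 8, pool_map[0] binds VLAN id
 * vlan_tags[0] (1000) to pool 0 (pools mask 1 << 0), pool_map[1] binds
 * 1001 to pool 1, and so on, giving each virtio device a private VMDQ
 * pool selected purely by VLAN id.
 */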

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
        if (num_devices > max_nb_devices) {
                RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
                return -1;
        }
        return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf;
        struct rte_eth_rxconf *rxconf;
        struct rte_eth_txconf *txconf;
        int16_t rx_rings, tx_rings;
        uint16_t rx_ring_size, tx_ring_size;
        int retval;
        uint16_t q;

        /*
         * The max pool number from dev_info is used to validate the pool
         * number specified on the command line.
         */
        rte_eth_dev_info_get(port, &dev_info);

        if (dev_info.max_rx_queues > MAX_QUEUES) {
                rte_exit(EXIT_FAILURE,
                        "please define MAX_QUEUES no less than %u in %s\n",
                        dev_info.max_rx_queues, __FILE__);
        }

        rxconf = &dev_info.default_rxconf;
        txconf = &dev_info.default_txconf;
        rxconf->rx_drop_en = 1;

        /* Enable VLAN offload */
        txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

        /*
         * Zero copy defers queue RX/TX start to the time when the guest
         * finishes its startup and packet buffers from that guest are
         * available.
         */
        if (zero_copy) {
                rxconf->rx_deferred_start = 1;
                rxconf->rx_drop_en = 0;
                txconf->tx_deferred_start = 1;
        }

        /* Configure the number of supported virtio devices based on VMDQ limits */
        num_devices = dev_info.max_vmdq_pools;

        if (zero_copy) {
                rx_ring_size = num_rx_descriptor;
                tx_ring_size = num_tx_descriptor;
                tx_rings = dev_info.max_tx_queues;
        } else {
                rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
                tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
                tx_rings = (uint16_t)rte_lcore_count();
        }

        retval = validate_num_devices(MAX_DEVICES);
        if (retval < 0)
                return retval;

        /* Get port configuration. */
        retval = get_eth_conf(&port_conf, num_devices);
        if (retval < 0)
                return retval;
        /* NIC queues are divided into pf queues and vmdq queues. */
        num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
        queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
        num_vmdq_queues = num_devices * queues_per_pool;
        num_queues = num_pf_queues + num_vmdq_queues;
        vmdq_queue_base = dev_info.vmdq_queue_base;
        vmdq_pool_base  = dev_info.vmdq_pool_base;
        printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
                num_pf_queues, num_devices, queues_per_pool);

        if (port >= rte_eth_dev_count())
                return -1;

        if (enable_tx_csum == 0)
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);

        if (enable_tso == 0) {
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
        }

        rx_rings = (uint16_t)dev_info.max_rx_queues;
        /* Configure the Ethernet device. */
        retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
        if (retval != 0)
                return retval;

        /* Setup the queues. */
        for (q = 0; q < rx_rings; q++) {
                retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
                                                rte_eth_dev_socket_id(port),
                                                rxconf,
                                                vpool_array[q].pool);
                if (retval < 0)
                        return retval;
        }
        for (q = 0; q < tx_rings; q++) {
                retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
                                                rte_eth_dev_socket_id(port),
                                                txconf);
                if (retval < 0)
                        return retval;
        }

        /* Start the device. */
        retval = rte_eth_dev_start(port);
        if (retval < 0) {
                RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
                return retval;
        }

        if (promiscuous)
                rte_eth_promiscuous_enable(port);

        rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
        RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
        RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
                        " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
                        (unsigned)port,
                        vmdq_ports_eth_addr[port].addr_bytes[0],
                        vmdq_ports_eth_addr[port].addr_bytes[1],
                        vmdq_ports_eth_addr[port].addr_bytes[2],
                        vmdq_ports_eth_addr[port].addr_bytes[3],
                        vmdq_ports_eth_addr[port].addr_bytes[4],
                        vmdq_ports_eth_addr[port].addr_bytes[5]);

        return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
        /* The name must fit in dev_basename, including the NUL terminator. */
        if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
                return -1;

        snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

        return 0;
}

/*
 * Parse the portmask provided at run time.
 * Returns 0 if the mask is empty or invalid; the caller treats 0 as an
 * error. (The result is assigned to the unsigned enabled_port_mask, so
 * returning a negative value here would not be caught.)
 */
static int
parse_portmask(const char *portmask)
{
        char *end = NULL;
        unsigned long pm;

        errno = 0;

        /* Parse hexadecimal string */
        pm = strtoul(portmask, &end, 16);
        if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return 0;

        return pm;
}
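/*
 * Example: "-p 0x1" selects port 0 only, "-p 0x3" ports 0 and 1. With
 * MAX_SUP_PORTS == 1 the second mask would later be rejected in
 * us_vhost_parse_args().
 */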

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
        char *end = NULL;
        unsigned long num;

        errno = 0;

        /* Parse unsigned int string */
        num = strtoul(q_arg, &end, 10);
        if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (num > max_valid_value)
                return -1;

        return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
        RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
        "               --vm2vm [0|1|2]\n"
        "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
        "               --dev-basename <name>\n"
        "               --nb-devices ND\n"
        "               -p PORTMASK: Set mask for ports to be used by application\n"
        "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
        "               --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
        "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
        "               --rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
        "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
        "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
        "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
        "               --dev-basename: The basename to be used for the character device.\n"
        "               --zero-copy [0|1]: disable(default)/enable RX/TX "
                        "zero copy\n"
        "               --rx-desc-num [0-N]: the number of descriptors on RX, "
                        "used only when zero copy is enabled.\n"
        "               --tx-desc-num [0-N]: the number of descriptors on TX, "
                        "used only when zero copy is enabled.\n"
        "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
        "               --tso [0|1] disable/enable TCP segmentation offload.\n",
               prgname);
}

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
        int opt, ret;
        int option_index;
        unsigned i;
        const char *prgname = argv[0];
        static struct option long_option[] = {
                {"vm2vm", required_argument, NULL, 0},
                {"rx-retry", required_argument, NULL, 0},
                {"rx-retry-delay", required_argument, NULL, 0},
                {"rx-retry-num", required_argument, NULL, 0},
                {"mergeable", required_argument, NULL, 0},
                {"vlan-strip", required_argument, NULL, 0},
                {"stats", required_argument, NULL, 0},
                {"dev-basename", required_argument, NULL, 0},
                {"zero-copy", required_argument, NULL, 0},
                {"rx-desc-num", required_argument, NULL, 0},
                {"tx-desc-num", required_argument, NULL, 0},
                {"tx-csum", required_argument, NULL, 0},
                {"tso", required_argument, NULL, 0},
                {NULL, 0, 0, 0},
        };

        /* Parse command line */
        while ((opt = getopt_long(argc, argv, "p:P",
                        long_option, &option_index)) != EOF) {
                switch (opt) {
                /* Portmask */
                case 'p':
                        enabled_port_mask = parse_portmask(optarg);
                        if (enabled_port_mask == 0) {
                                RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
                                us_vhost_usage(prgname);
                                return -1;
                        }
                        break;

                case 'P':
                        promiscuous = 1;
                        vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
                                ETH_VMDQ_ACCEPT_BROADCAST |
                                ETH_VMDQ_ACCEPT_MULTICAST;
                        rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

                        break;

                case 0:
                        /* Enable/disable vm2vm comms. */
                        if (!strncmp(long_option[option_index].name, "vm2vm",
                                MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for "
                                                "vm2vm [0|1|2]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vm2vm_mode = (vm2vm_type)ret;
                                }
                        }

                        /* Enable/disable retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_retry = ret;
                                }
                        }

                        /* Enable/disable TX checksum offload. */
                        if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        enable_tx_csum = ret;
                        }

                        /* Enable/disable TSO offload. */
                        if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        enable_tso = ret;
                        }

                        /* Specify the retry delay time (in microseconds) on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_delay_time = ret;
                                }
                        }

                        /* Specify the number of retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_retry_num = ret;
                                }
                        }

                        /* Enable/disable RX mergeable buffers. */
                        if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        mergeable = !!ret;
                                        if (ret) {
                                                vmdq_conf_default.rxmode.jumbo_frame = 1;
                                                vmdq_conf_default.rxmode.max_rx_pkt_len
                                                        = JUMBO_FRAME_MAX_SIZE;
                                        }
                                }
                        }

                        /* Enable/disable RX VLAN strip on host. */
                        if (!strncmp(long_option[option_index].name,
                                "vlan-strip", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for VLAN strip [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vlan_strip = !!ret;
                                        vmdq_conf_default.rxmode.hw_vlan_strip =
                                                vlan_strip;
                                }
                        }

                        /* Enable/disable stats. */
                        if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_stats = ret;
                                }
                        }

                        /* Set character device basename. */
                        if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
                                if (us_vhost_parse_basename(optarg) == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
                                        us_vhost_usage(prgname);
                                        return -1;
                                }
                        }

                        /* Enable/disable RX/TX zero copy. */
                        if (!strncmp(long_option[option_index].name,
                                "zero-copy", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument"
                                                " for zero-copy [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        zero_copy = ret;
                        }

                        /* Specify the descriptor number on RX. */
                        if (!strncmp(long_option[option_index].name,
                                "rx-desc-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, MAX_RING_DESC);
                                if ((ret == -1) || (!POWEROF2(ret))) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for rx-desc-num [0-N], "
                                                "power of 2 required.\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        num_rx_descriptor = ret;
                                }
                        }

                        /* Specify the descriptor number on TX. */
                        if (!strncmp(long_option[option_index].name,
                                "tx-desc-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, MAX_RING_DESC);
                                if ((ret == -1) || (!POWEROF2(ret))) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for tx-desc-num [0-N], "
                                                "power of 2 required.\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        num_tx_descriptor = ret;
                                }
                        }

                        break;

                        /* Invalid option - print options. */
                default:
                        us_vhost_usage(prgname);
                        return -1;
                }
        }

        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (enabled_port_mask & (1 << i))
                        ports[num_ports++] = (uint8_t)i;
        }

        if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
                RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
                        "but only %u port(s) can be enabled\n", num_ports,
                        MAX_SUP_PORTS);
                return -1;
        }

        if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
                RTE_LOG(INFO, VHOST_PORT,
                        "Vhost zero copy doesn't support software vm2vm, "
                        "please specify 'vm2vm 2' to use hardware vm2vm.\n");
                return -1;
        }

        if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
                RTE_LOG(INFO, VHOST_PORT,
                        "Vhost zero copy doesn't support jumbo frame, "
                        "please specify '--mergeable 0' to disable the "
                        "mergeable feature.\n");
                return -1;
        }

        return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the number
 * of system ports, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
        unsigned valid_num_ports = num_ports;
        unsigned portid;

        if (num_ports > nb_ports) {
                RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
                        num_ports, nb_ports);
                num_ports = nb_ports;
        }

        for (portid = 0; portid < num_ports; portid++) {
                if (ports[portid] >= nb_ports) {
                        RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
                                ports[portid], (nb_ports - 1));
                        ports[portid] = INVALID_PORT_ID;
                        valid_num_ports--;
                }
        }
        return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
        char *pkt_addr = (char *)(addr); \
        unsigned int index; \
        char packet[MAX_PRINT_BUFF]; \
 \
        if ((header)) \
                snprintf(packet, MAX_PRINT_BUFF, \
                        "(%"PRIu64") Header size %d: ", \
                        (device->device_fh), (size)); \
        else \
                snprintf(packet, MAX_PRINT_BUFF, \
                        "(%"PRIu64") Packet size %d: ", \
                        (device->device_fh), (size)); \
        for (index = 0; index < (size); index++) { \
                snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), \
                        MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
                        "%02hhx ", pkt_addr[index]); \
        } \
        snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), \
                MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
 \
        LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif

/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
        uint32_t buf_len, hpa_type *addr_type)
{
        struct virtio_memory_regions_hpa *region;
        uint32_t regionidx;
        uint64_t vhost_pa = 0;

        *addr_type = PHYS_ADDR_INVALID;

        for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
                region = &vdev->regions_hpa[regionidx];
                if ((guest_pa >= region->guest_phys_address) &&
                        (guest_pa <= region->guest_phys_address_end)) {
                        vhost_pa = region->host_phys_addr_offset + guest_pa;
                        if (likely((guest_pa + buf_len - 1)
                                <= region->guest_phys_address_end))
                                *addr_type = PHYS_ADDR_CONTINUOUS;
                        else
                                *addr_type = PHYS_ADDR_CROSS_SUBREG;
                        break;
                }
        }

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
                vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
                (void *)(uintptr_t)vhost_pa);

        return vhost_pa;
}
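/*
 * Illustrative example (hypothetical region layout): for a region covering
 * guest physical 0x1000-0x1FFF, a request for guest_pa 0x1F00 with buf_len
 * 0x80 ends at 0x1F7F and is PHYS_ADDR_CONTINUOUS, while buf_len 0x200 runs
 * past 0x1FFF and is flagged PHYS_ADDR_CROSS_SUBREG so the caller can split
 * the copy.
 */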

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
        return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
}
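/*
 * This loads 8 bytes from each 6-byte address and masks off the top two
 * bytes (MAC_ADDR_CMP covers the low 48 bits on a little-endian host).
 * It therefore reads 2 bytes past each ether_addr; that is safe here only
 * because the addresses compared live inside larger structures.
 */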

/*
 * This function learns the MAC address of the device and registers this
 * along with a VLAN tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct ether_hdr *pkt_hdr;
        struct virtio_net_data_ll *dev_ll;
        struct virtio_net *dev = vdev->dev;
        int i, ret;

        /* Learn MAC address of guest device from packet */
        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        dev_ll = ll_root_used;

        while (dev_ll != NULL) {
                if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
                        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
                        return -1;
                }
                dev_ll = dev_ll->next;
        }

        for (i = 0; i < ETHER_ADDR_LEN; i++)
                vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

        /* vlan_tag currently uses the device_id. */
        vdev->vlan_tag = vlan_tags[dev->device_fh];
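        /*
         * Note: this lookup assumes device_fh stays below RTE_DIM(vlan_tags)
         * (64 entries, matching MAX_DEVICES); larger handles would index out
         * of bounds.
         */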

        /* Print out VMDQ registration info. */
        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
                dev->device_fh,
                vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
                vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
                vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
                vdev->vlan_tag);

        /* Register the MAC address. */
        ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
                                (uint32_t)dev->device_fh + vmdq_pool_base);
        if (ret)
                RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
                                        dev->device_fh);

        /* Enable stripping of the VLAN tag as we handle routing. */
        if (vlan_strip)
                rte_eth_dev_set_vlan_strip_on_queue(ports[0],
                        (uint16_t)vdev->vmdq_rx_q, 1);

        /* Set device as ready for RX. */
        vdev->ready = DEVICE_RX;

        return 0;
}

/*
 * Removes MAC address and VLAN tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
        unsigned i = 0;
        unsigned rx_count;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

        if (vdev->ready == DEVICE_RX) {
                /* Clear MAC and VLAN settings */
                rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
                for (i = 0; i < ETHER_ADDR_LEN; i++)
                        vdev->mac_address.addr_bytes[i] = 0;

                vdev->vlan_tag = 0;

                /* Clear out the receive buffers */
                rx_count = rte_eth_rx_burst(ports[0],
                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                while (rx_count) {
                        for (i = 0; i < rx_count; i++)
                                rte_pktmbuf_free(pkts_burst[i]);

                        rx_count = rte_eth_rx_burst(ports[0],
                                (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
                }

                vdev->ready = DEVICE_MAC_LEARNING;
        }
}

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct virtio_net_data_ll *dev_ll;
        struct ether_hdr *pkt_hdr;
        uint64_t ret = 0;
        struct virtio_net *dev = vdev->dev;
        struct virtio_net *tdev; /* destination virtio device */

        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        /* Get the used devices list */
        dev_ll = ll_root_used;

        while (dev_ll != NULL) {
                if ((dev_ll->vdev->ready == DEVICE_RX) &&
                        ether_addr_cmp(&(pkt_hdr->d_addr),
                                &dev_ll->vdev->mac_address)) {

                        /* Drop the packet if the TX packet is destined for the TX device. */
                        if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
                                        dev->device_fh);
                                return 0;
                        }
                        tdev = dev_ll->vdev->dev;

                        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);

                        if (unlikely(dev_ll->vdev->remove)) {
                                /* Drop the packet if the device is marked for removal */
                                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
                        } else {
                                /* Send the packet to the local virtio device */
                                ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
                                if (enable_stats) {
                                        rte_atomic64_add(
                                        &dev_statistics[tdev->device_fh].rx_total_atomic,
                                        1);
                                        rte_atomic64_add(
                                        &dev_statistics[tdev->device_fh].rx_atomic,
                                        ret);
                                        dev_statistics[dev->device_fh].tx_total++;
                                        dev_statistics[dev->device_fh].tx += ret;
                                }
                        }

                        return 0;
                }
                dev_ll = dev_ll->next;
        }

        return -1;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM, and if so
 * return its VLAN tag and the length offset needed to restore it.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
        uint32_t *offset, uint16_t *vlan_tag)
{
        struct virtio_net_data_ll *dev_ll = ll_root_used;
        struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        while (dev_ll != NULL) {
                if ((dev_ll->vdev->ready == DEVICE_RX)
                        && ether_addr_cmp(&(pkt_hdr->d_addr),
                                &dev_ll->vdev->mac_address)) {
                        /*
                         * Drop the packet if the TX packet is
                         * destined for the TX device.
                         */
                        if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                                LOG_DEBUG(VHOST_DATA,
                                        "(%"PRIu64") TX: Source and destination"
                                        " MAC addresses are the same. Dropping "
                                        "packet.\n",
                                        dev_ll->vdev->dev->device_fh);
                                return -1;
                        }

                        /*
                         * HW VLAN strip shortens the packet by the length of
                         * the VLAN tag, so restore the packet length by
                         * adding it back.
                         */
                        *offset = VLAN_HLEN;
                        *vlan_tag = (uint16_t)
                                vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

                        LOG_DEBUG(VHOST_DATA,
                                "(%"PRIu64") TX: pkt to local VM device id:"
                                "(%"PRIu64") vlan tag: %d.\n",
                                dev->device_fh, dev_ll->vdev->dev->device_fh,
                                (int)*vlan_tag);

                        break;
                }
                dev_ll = dev_ll->next;
        }
        return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
        if (ol_flags & PKT_TX_IPV4)
                return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
        else /* assume ethertype == ETHER_TYPE_IPv6 */
                return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
        void *l3_hdr;
        struct ipv4_hdr *ipv4_hdr = NULL;
        struct tcp_hdr *tcp_hdr = NULL;
        struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        l3_hdr = (char *)eth_hdr + m->l2_len;

        if (m->ol_flags & PKT_TX_IPV4) {
                ipv4_hdr = l3_hdr;
                ipv4_hdr->hdr_checksum = 0;
                m->ol_flags |= PKT_TX_IP_CKSUM;
        }

        tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
        tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}
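/*
 * The pseudo-header checksum written here is the seed value NIC TX
 * checksum/TSO engines expect in the TCP checksum field; the hardware then
 * completes the checksum over the payload and, for TSO, recomputes it for
 * each segment it emits.
 */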
1175
1176 /*
1177  * This function routes the TX packet to the correct interface. This may be a local device
1178  * or the physical port.
1179  */
1180 static inline void __attribute__((always_inline))
1181 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1182 {
1183         struct mbuf_table *tx_q;
1184         struct rte_mbuf **m_table;
1185         unsigned len, ret, offset = 0;
1186         const uint16_t lcore_id = rte_lcore_id();
1187         struct virtio_net *dev = vdev->dev;
1188         struct ether_hdr *nh;
1189
1190         /*check if destination is local VM*/
1191         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1192                 rte_pktmbuf_free(m);
1193                 return;
1194         }
1195
1196         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1197                 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1198                         rte_pktmbuf_free(m);
1199                         return;
1200                 }
1201         }
1202
1203         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1204
1205         /*Add packet to the port tx queue*/
1206         tx_q = &lcore_tx_queue[lcore_id];
1207         len = tx_q->len;
1208
1209         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1210         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1211                 /* Guest has inserted the vlan tag. */
1212                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1213                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1214                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1215                         (vh->vlan_tci != vlan_tag_be))
1216                         vh->vlan_tci = vlan_tag_be;
1217         } else {
1218                 m->ol_flags |= PKT_TX_VLAN_PKT;
1219
1220                 /*
1221                  * Find the right seg to adjust the data len when offset is
1222                  * bigger than tail room size.
1223                  */
1224                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1225                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1226                                 m->data_len += offset;
1227                         else {
1228                                 struct rte_mbuf *seg = m;
1229
1230                                 while ((seg->next != NULL) &&
1231                                         (offset > rte_pktmbuf_tailroom(seg)))
1232                                         seg = seg->next;
1233
1234                                 seg->data_len += offset;
1235                         }
1236                         m->pkt_len += offset;
1237                 }
1238
1239                 m->vlan_tci = vlan_tag;
1240         }
1241
1242         if (m->ol_flags & PKT_TX_TCP_SEG)
1243                 virtio_tx_offload(m);
1244
1245         tx_q->m_table[len] = m;
1246         len++;
1247         if (enable_stats) {
1248                 dev_statistics[dev->device_fh].tx_total++;
1249                 dev_statistics[dev->device_fh].tx++;
1250         }
1251
1252         if (unlikely(len == MAX_PKT_BURST)) {
1253                 m_table = (struct rte_mbuf **)tx_q->m_table;
1254                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1255                 /* Free any buffers not handled by TX and update the port stats. */
1256                 if (unlikely(ret < len)) {
1257                         do {
1258                                 rte_pktmbuf_free(m_table[ret]);
1259                         } while (++ret < len);
1260                 }
1261
1262                 len = 0;
1263         }
1264
1265         tx_q->len = len;
1266         return;
1267 }
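
/*
 * Editor's sketch: the two VLAN paths taken by virtio_tx_route() above,
 * using only the mbuf fields that the function itself sets.
 *
 *     // Guest already inserted an 802.1Q header: patch the TCI in place.
 *     vh->vlan_tci = rte_cpu_to_be_16(vlan_tag);
 *
 *     // Untagged frame: ask the NIC to insert the tag on transmit.
 *     m->ol_flags |= PKT_TX_VLAN_PKT;
 *     m->vlan_tci  = vlan_tag;
 *
 * Note that m->vlan_tci stays in CPU byte order; only the in-packet TCI
 * is big-endian.
 */
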
1268 /*
1269  * This function is called by each data core. It handles all RX/TX registered with the
1270  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1271  * with all devices in the main linked list.
1272  */
1273 static int
1274 switch_worker(void *arg)
1275 {
1276         struct rte_mempool *mbuf_pool = arg;
1277         struct virtio_net *dev = NULL;
1278         struct vhost_dev *vdev = NULL;
1279         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1280         struct virtio_net_data_ll *dev_ll;
1281         struct mbuf_table *tx_q;
1282         volatile struct lcore_ll_info *lcore_ll;
1283         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1284         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1285         unsigned ret, i;
1286         const uint16_t lcore_id = rte_lcore_id();
1287         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1288         uint16_t rx_count = 0;
1289         uint16_t tx_count;
1290         uint32_t retry = 0;
1291
1292         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1293         lcore_ll = lcore_info[lcore_id].lcore_ll;
1294         prev_tsc = 0;
1295
1296         tx_q = &lcore_tx_queue[lcore_id];
1297         for (i = 0; i < num_cores; i ++) {
1298                 if (lcore_ids[i] == lcore_id) {
1299                         tx_q->txq_id = i;
1300                         break;
1301                 }
1302         }
1303
1304         while(1) {
1305                 cur_tsc = rte_rdtsc();
1306                 /*
1307                  * TX burst queue drain
1308                  */
1309                 diff_tsc = cur_tsc - prev_tsc;
1310                 if (unlikely(diff_tsc > drain_tsc)) {
1311
1312                         if (tx_q->len) {
1313                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);
1314
1315                                 /*Tx any packets in the queue*/
1316                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1317                                                                            (struct rte_mbuf **)tx_q->m_table,
1318                                                                            (uint16_t)tx_q->len);
1319                                 if (unlikely(ret < tx_q->len)) {
1320                                         do {
1321                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1322                                         } while (++ret < tx_q->len);
1323                                 }
1324
1325                                 tx_q->len = 0;
1326                         }
1327
1328                         prev_tsc = cur_tsc;
1329
1330                 }
1331
1332                 rte_prefetch0(lcore_ll->ll_root_used);
1333                 /*
1334                  * Inform the configuration core that we have exited the linked list and that no devices are
1335                  * in use if requested.
1336                  */
1337                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1338                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1339
1340                 /*
1341                  * Process devices
1342                  */
1343                 dev_ll = lcore_ll->ll_root_used;
1344
1345                 while (dev_ll != NULL) {
1346                         /*get virtio device ID*/
1347                         vdev = dev_ll->vdev;
1348                         dev = vdev->dev;
1349
1350                         if (unlikely(vdev->remove)) {
1351                                 dev_ll = dev_ll->next;
1352                                 unlink_vmdq(vdev);
1353                                 vdev->ready = DEVICE_SAFE_REMOVE;
1354                                 continue;
1355                         }
1356                         if (likely(vdev->ready == DEVICE_RX)) {
1357                                 /*Handle guest RX*/
1358                                 rx_count = rte_eth_rx_burst(ports[0],
1359                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1360
1361                                 if (rx_count) {
1362                                         /*
1363                                         * If retry is enabled and the queue is full, wait and retry to avoid packet loss.
1364                                         * Note that MAX_PKT_BURST must be less than the virtio queue size.
1365                                         */
1366                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1367                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1368                                                         rte_delay_us(burst_rx_delay_time);
1369                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1370                                                                 break;
1371                                                 }
1372                                         }
1373                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1374                                         if (enable_stats) {
1375                                                 rte_atomic64_add(
1376                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1377                                                 rx_count);
1378                                                 rte_atomic64_add(
1379                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1380                                         }
1381                                         while (likely(rx_count)) {
1382                                                 rx_count--;
1383                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1384                                         }
1385
1386                                 }
1387                         }
1388
1389                         if (likely(!vdev->remove)) {
1390                                 /* Handle guest TX*/
1391                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1392                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1393                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1394                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1395                                                 while (tx_count)
1396                                                         rte_pktmbuf_free(pkts_burst[--tx_count]);
1397                                         }
1398                                 }
1399                                 for (i = 0; i < tx_count; ++i) {
1400                                         virtio_tx_route(vdev, pkts_burst[i],
1401                                                 vlan_tags[(uint16_t)dev->device_fh]);
1402                                 }
1403                         }
1404
1405                         /*move to the next device in the list*/
1406                         dev_ll = dev_ll->next;
1407                 }
1408         }
1409
1410         return 0;
1411 }
1412
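/*
 * Editor's note: a sketch of the drain-interval arithmetic used at the top of
 * switch_worker(), assuming BURST_TX_DRAIN_US is 100 and a 2.4 GHz TSC (both
 * example values).
 *
 *     uint64_t hz = rte_get_tsc_hz();                            // e.g. 2400000000
 *     uint64_t ticks_per_us = (hz + US_PER_S - 1) / US_PER_S;    // ceil -> 2400
 *     uint64_t drain = ticks_per_us * BURST_TX_DRAIN_US;         // 100 us -> 240000 ticks
 *
 * The worker flushes a partially filled TX burst whenever rte_rdtsc() has
 * advanced by more than 'drain' ticks since the last flush.
 */
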
1413 /*
1414  * This function gets the number of available ring entries for zero copy RX.
1415  * Only one thread will call this function for a particular virtio device,
1416  * so it is designed as a non-thread-safe function.
1417  */
1418 static inline uint32_t __attribute__((always_inline))
1419 get_available_ring_num_zcp(struct virtio_net *dev)
1420 {
1421         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1422         uint16_t avail_idx;
1423
1424         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1425         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1426 }
1427
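/*
 * Editor's note: avail->idx and last_used_idx_res are free-running 16-bit
 * counters, so their difference must be taken modulo 2^16. A minimal sketch
 * with made-up index values:
 *
 *     uint16_t avail_idx = 3;       // avail->idx wrapped past 65535
 *     uint16_t last_res = 65533;    // reserved index before the wrap
 *     uint16_t free_entries = avail_idx - last_res;   // 6, not negative
 *
 * Only the difference between the two counters is ever meaningful, never
 * their raw values.
 */
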
1428 /*
1429  * This function gets available ring indexes for zero copy RX,
1430  * retrying 'burst_rx_retry_num' times until it gets enough of them.
1431  * Only one thread will call this function for a particular virtio device,
1432  * so it is designed as a non-thread-safe function.
1433  */
1434 static inline uint32_t __attribute__((always_inline))
1435 get_available_ring_index_zcp(struct virtio_net *dev,
1436         uint16_t *res_base_idx, uint32_t count)
1437 {
1438         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1439         uint16_t avail_idx;
1440         uint32_t retry = 0;
1441         uint16_t free_entries;
1442
1443         *res_base_idx = vq->last_used_idx_res;
1444         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1445         free_entries = (avail_idx - *res_base_idx);
1446
1447         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1448                         "avail idx: %d, "
1449                         "res base idx:%d, free entries:%d\n",
1450                         dev->device_fh, avail_idx, *res_base_idx,
1451                         free_entries);
1452
1453         /*
1454          * If retry is enabled and the queue is full then we wait
1455          * and retry to avoid packet loss.
1456          */
1457         if (enable_retry && unlikely(count > free_entries)) {
1458                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1459                         rte_delay_us(burst_rx_delay_time);
1460                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1461                         free_entries = (avail_idx - *res_base_idx);
1462                         if (count <= free_entries)
1463                                 break;
1464                 }
1465         }
1466
1467         /*check that we have enough buffers*/
1468         if (unlikely(count > free_entries))
1469                 count = free_entries;
1470
1471         if (unlikely(count == 0)) {
1472                 LOG_DEBUG(VHOST_DATA,
1473                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1474                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1475                         dev->device_fh, avail_idx,
1476                         *res_base_idx, free_entries);
1477                 return 0;
1478         }
1479
1480         vq->last_used_idx_res = *res_base_idx + count;
1481
1482         return count;
1483 }
1484
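/*
 * Editor's note: get_available_ring_index_zcp() reserves the returned
 * entries by bumping last_used_idx_res before any descriptor is consumed.
 * With assumed values res_base_idx = 100 and count = 4, the caller owns
 * avail ring slots 100..103 and later reads them as
 *
 *     desc_idx = vq->avail->ring[(res_base_idx + n) & (vq->size - 1)];
 *
 * for n in [0, count).
 */
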
1485 /*
1486  * This function puts a descriptor back on the used list.
1487  */
1488 static inline void __attribute__((always_inline))
1489 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1490 {
1491         uint16_t res_cur_idx = vq->last_used_idx;
1492         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1493         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1494         rte_compiler_barrier();
1495         *(volatile uint16_t *)&vq->used->idx += 1;
1496         vq->last_used_idx += 1;
1497
1498         /* Kick the guest if necessary. */
1499         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1500                 eventfd_write(vq->callfd, (eventfd_t)1);
1501 }
1502
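/*
 * Editor's note: virtio ring sizes are powers of two, so the masking above
 * is a cheap modulo. A sketch with an assumed ring size of 256:
 *
 *     uint16_t size = 256;
 *     uint16_t last_used_idx = 260;                 // free-running counter
 *     uint16_t slot = last_used_idx & (size - 1);   // 260 % 256 == 4
 *
 * The rte_compiler_barrier() keeps the ring-entry stores ahead of the store
 * to used->idx, so the guest never observes an index that points at an
 * unwritten entry.
 */
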
1503 /*
1504  * This function gets an available descriptor from the virtio vring and an
1505  * unattached mbuf from vpool->ring, and then attaches them together. It must
1506  * adjust the offsets of buff_addr and phys_addr according to the PMD
1507  * implementation, otherwise the frame data may land at the wrong mbuf offset.
1508  */
1509 static inline void __attribute__((always_inline))
1510 attach_rxmbuf_zcp(struct virtio_net *dev)
1511 {
1512         uint16_t res_base_idx, desc_idx;
1513         uint64_t buff_addr, phys_addr;
1514         struct vhost_virtqueue *vq;
1515         struct vring_desc *desc;
1516         void *obj = NULL;
1517         struct rte_mbuf *mbuf;
1518         struct vpool *vpool;
1519         hpa_type addr_type;
1520         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1521
1522         vpool = &vpool_array[vdev->vmdq_rx_q];
1523         vq = dev->virtqueue[VIRTIO_RXQ];
1524
1525         do {
1526                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1527                                 1) != 1))
1528                         return;
1529                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1530
1531                 desc = &vq->desc[desc_idx];
1532                 if (desc->flags & VRING_DESC_F_NEXT) {
1533                         desc = &vq->desc[desc->next];
1534                         buff_addr = gpa_to_vva(dev, desc->addr);
1535                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1536                                         &addr_type);
1537                 } else {
1538                         buff_addr = gpa_to_vva(dev,
1539                                         desc->addr + vq->vhost_hlen);
1540                         phys_addr = gpa_to_hpa(vdev,
1541                                         desc->addr + vq->vhost_hlen,
1542                                         desc->len, &addr_type);
1543                 }
1544
1545                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1546                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1547                                 " address found when attaching RX frame buffer"
1548                                 " address!\n", dev->device_fh);
1549                         put_desc_to_used_list_zcp(vq, desc_idx);
1550                         continue;
1551                 }
1552
1553                 /*
1554                  * Check if the frame buffer address from guest crosses
1555                  * sub-region or not.
1556                  */
1557                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1558                         RTE_LOG(ERR, VHOST_DATA,
1559                                 "(%"PRIu64") Frame buffer address crossing "
1560                                 "sub-region found when attaching RX frame "
1561                                 "buffer address!\n",
1562                                 dev->device_fh);
1563                         put_desc_to_used_list_zcp(vq, desc_idx);
1564                         continue;
1565                 }
1566         } while (unlikely(phys_addr == 0));
1567
1568         rte_ring_sc_dequeue(vpool->ring, &obj);
1569         mbuf = obj;
1570         if (unlikely(mbuf == NULL)) {
1571                 LOG_DEBUG(VHOST_DATA,
1572                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1573                         "ring_sc_dequeue fail.\n",
1574                         dev->device_fh);
1575                 put_desc_to_used_list_zcp(vq, desc_idx);
1576                 return;
1577         }
1578
1579         if (unlikely(vpool->buf_size > desc->len)) {
1580                 LOG_DEBUG(VHOST_DATA,
1581                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1582                         "length(%d) of descriptor idx: %d less than room "
1583                         "size required: %d\n",
1584                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1585                 put_desc_to_used_list_zcp(vq, desc_idx);
1586                 rte_ring_sp_enqueue(vpool->ring, obj);
1587                 return;
1588         }
1589
1590         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1591         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1592         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1593         mbuf->data_len = desc->len;
1594         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1595
1596         LOG_DEBUG(VHOST_DATA,
1597                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1598                 "descriptor idx:%d\n",
1599                 dev->device_fh, res_base_idx, desc_idx);
1600
1601         __rte_mbuf_raw_free(mbuf);
1602
1603         return;
1604 }
1605
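/*
 * Editor's note on the pointer arithmetic at the end of attach_rxmbuf_zcp():
 * the guest buffer reserves no mbuf headroom, so buf_addr and buf_physaddr
 * are rewound by RTE_PKTMBUF_HEADROOM while data_off is set to the same
 * value. A sketch with a hypothetical guest virtual address:
 *
 *     uint64_t guest_va = 0x7f0000001000;   // assumed, for illustration
 *     mbuf->buf_addr = (void *)(uintptr_t)(guest_va - RTE_PKTMBUF_HEADROOM);
 *     mbuf->data_off = RTE_PKTMBUF_HEADROOM;
 *     // rte_pktmbuf_mtod() == buf_addr + data_off == guest_va again
 *
 * so the PMD writes the frame exactly at the guest-supplied buffer.
 */
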
1606 /*
1607  * Detach an attached packet mbuf -
1608  *  - restore original mbuf address and length values.
1609  *  - reset pktmbuf data and data_len to their default values.
1610  *  All other fields of the given packet mbuf will be left intact.
1611  *
1612  * @param m
1613  *   The attached packet mbuf.
1614  */
1615 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1616 {
1617         const struct rte_mempool *mp = m->pool;
1618         void *buf = rte_mbuf_to_baddr(m);
1619         uint32_t buf_ofs;
1620         uint32_t buf_len = mp->elt_size - sizeof(*m);
1621         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1622
1623         m->buf_addr = buf;
1624         m->buf_len = (uint16_t)buf_len;
1625
1626         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1627                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1628         m->data_off = buf_ofs;
1629
1630         m->data_len = 0;
1631 }
1632
1633 /*
1634  * This function is called after packets have been transmitted. It fetches
1635  * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
1636  * also updates the used index and kicks the guest if necessary.
1637  */
1638 static inline uint32_t __attribute__((always_inline))
1639 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1640 {
1641         struct rte_mbuf *mbuf;
1642         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1643         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1644         uint32_t index = 0;
1645         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1646
1647         LOG_DEBUG(VHOST_DATA,
1648                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1649                 "clean is: %d\n",
1650                 dev->device_fh, mbuf_count);
1651         LOG_DEBUG(VHOST_DATA,
1652                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1653                 "clean  is : %d\n",
1654                 dev->device_fh, rte_ring_count(vpool->ring));
1655
1656         for (index = 0; index < mbuf_count; index++) {
1657                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1658                 if (likely(MBUF_EXT_MEM(mbuf)))
1659                         pktmbuf_detach_zcp(mbuf);
1660                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1661
1662                 /* Update used index buffer information. */
1663                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1664                 vq->used->ring[used_idx].len = 0;
1665
1666                 used_idx = (used_idx + 1) & (vq->size - 1);
1667         }
1668
1669         LOG_DEBUG(VHOST_DATA,
1670                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1671                 "clean is: %d\n",
1672                 dev->device_fh, rte_mempool_count(vpool->pool));
1673         LOG_DEBUG(VHOST_DATA,
1674                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1675                 "clean  is : %d\n",
1676                 dev->device_fh, rte_ring_count(vpool->ring));
1677         LOG_DEBUG(VHOST_DATA,
1678                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1679                 "vq->last_used_idx:%d\n",
1680                 dev->device_fh, vq->last_used_idx);
1681
1682         vq->last_used_idx += mbuf_count;
1683
1684         LOG_DEBUG(VHOST_DATA,
1685                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1686                 "vq->last_used_idx:%d\n",
1687                 dev->device_fh, vq->last_used_idx);
1688
1689         rte_compiler_barrier();
1690
1691         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1692
1693         /* Kick guest if required. */
1694         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1695                 eventfd_write(vq->callfd, (eventfd_t)1);
1696
1697         return 0;
1698 }
1699
1700 /*
1701  * This function is called when a virtio device is destroyed.
1702  * It fetches mbufs from vpool->pool, detaches them and puts them into vpool->ring.
1703  */
1704 static void mbuf_destroy_zcp(struct vpool *vpool)
1705 {
1706         struct rte_mbuf *mbuf = NULL;
1707         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1708
1709         LOG_DEBUG(VHOST_CONFIG,
1710                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1711                 "mbuf_destroy_zcp is: %d\n",
1712                 mbuf_count);
1713         LOG_DEBUG(VHOST_CONFIG,
1714                 "in mbuf_destroy_zcp: mbuf count in  ring before "
1715                 "mbuf_destroy_zcp  is : %d\n",
1716                 rte_ring_count(vpool->ring));
1717
1718         for (index = 0; index < mbuf_count; index++) {
1719                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1720                 if (likely(mbuf != NULL)) {
1721                         if (likely(MBUF_EXT_MEM(mbuf)))
1722                                 pktmbuf_detach_zcp(mbuf);
1723                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1724                 }
1725         }
1726
1727         LOG_DEBUG(VHOST_CONFIG,
1728                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1729                 "mbuf_destroy_zcp is: %d\n",
1730                 rte_mempool_count(vpool->pool));
1731         LOG_DEBUG(VHOST_CONFIG,
1732                 "in mbuf_destroy_zcp: mbuf count in ring after "
1733                 "mbuf_destroy_zcp is : %d\n",
1734                 rte_ring_count(vpool->ring));
1735 }
1736
1737 /*
1738  * This function fills in the virtio headers and updates the used ring for zero copy RX.
1739  */
1740 static inline uint32_t __attribute__((always_inline))
1741 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1742         uint32_t count)
1743 {
1744         struct vhost_virtqueue *vq;
1745         struct vring_desc *desc;
1746         struct rte_mbuf *buff;
1747         /* The virtio_hdr is initialised to 0. */
1748         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1749                 = {{0, 0, 0, 0, 0, 0}, 0};
1750         uint64_t buff_hdr_addr = 0;
1751         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1752         uint32_t head_idx, packet_success = 0;
1753         uint16_t res_cur_idx;
1754
1755         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1756
1757         if (count == 0)
1758                 return 0;
1759
1760         vq = dev->virtqueue[VIRTIO_RXQ];
1761         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1762
1763         res_cur_idx = vq->last_used_idx;
1764         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1765                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1766
1767         /* Retrieve all of the head indexes first to avoid caching issues. */
1768         for (head_idx = 0; head_idx < count; head_idx++)
1769                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1770
1771         /*Prefetch descriptor index. */
1772         rte_prefetch0(&vq->desc[head[packet_success]]);
1773
1774         while (packet_success != count) {
1775                 /* Get descriptor from available ring */
1776                 desc = &vq->desc[head[packet_success]];
1777
1778                 buff = pkts[packet_success];
1779                 LOG_DEBUG(VHOST_DATA,
1780                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1781                         "pkt[%d] descriptor idx: %d\n",
1782                         dev->device_fh, packet_success,
1783                         MBUF_HEADROOM_UINT32(buff));
1784
1785                 PRINT_PACKET(dev,
1786                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1787                         + RTE_PKTMBUF_HEADROOM),
1788                         rte_pktmbuf_data_len(buff), 0);
1789
1790                 /* Buffer address translation for virtio header. */
1791                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1792                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1793
1794                 /*
1795                  * If the descriptors are chained the header and data are
1796                  * placed in separate buffers.
1797                  */
1798                 if (desc->flags & VRING_DESC_F_NEXT) {
1799                         desc->len = vq->vhost_hlen;
1800                         desc = &vq->desc[desc->next];
1801                         desc->len = rte_pktmbuf_data_len(buff);
1802                 } else {
1803                         desc->len = packet_len;
1804                 }
1805
1806                 /* Update used ring with desc information */
1807                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1808                         = head[packet_success];
1809                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1810                         = packet_len;
1811                 res_cur_idx++;
1812                 packet_success++;
1813
1814                 /* A header is required per buffer. */
1815                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1816                         (const void *)&virtio_hdr, vq->vhost_hlen);
1817
1818                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1819
1820                 if (likely(packet_success < count)) {
1821                         /* Prefetch descriptor index. */
1822                         rte_prefetch0(&vq->desc[head[packet_success]]);
1823                 }
1824         }
1825
1826         rte_compiler_barrier();
1827
1828         LOG_DEBUG(VHOST_DATA,
1829                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1830                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1831                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1832
1833         *(volatile uint16_t *)&vq->used->idx += count;
1834         vq->last_used_idx += count;
1835
1836         LOG_DEBUG(VHOST_DATA,
1837                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1838                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1839                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1840
1841         /* Kick the guest if necessary. */
1842         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1843                 eventfd_write(vq->callfd, (eventfd_t)1);
1844
1845         return count;
1846 }
1847
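/*
 * Editor's sketch of the two descriptor layouts handled by
 * virtio_dev_rx_zcp() above, assuming a virtio header of vq->vhost_hlen
 * bytes:
 *
 *     // chained: header and data in separate buffers
 *     desc[0].len = vq->vhost_hlen;
 *     desc[1].len = data_len;
 *
 *     // single: one buffer holds header plus data
 *     desc[0].len = vq->vhost_hlen + data_len;
 *
 * Either way exactly one used-ring entry is produced per packet, with len
 * covering header plus data.
 */
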
1848 /*
1849  * This function routes the TX packet to the correct interface.
1850  * This may be a local device or the physical port.
1851  */
1852 static inline void __attribute__((always_inline))
1853 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1854         uint32_t desc_idx, uint8_t need_copy)
1855 {
1856         struct mbuf_table *tx_q;
1857         struct rte_mbuf **m_table;
1858         void *obj = NULL;
1859         struct rte_mbuf *mbuf;
1860         unsigned len, ret, offset = 0;
1861         struct vpool *vpool;
1862         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1863         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1864
1865         /*Add packet to the port tx queue*/
1866         tx_q = &tx_queue_zcp[vmdq_rx_q];
1867         len = tx_q->len;
1868
1869         /* Allocate an mbuf and populate the structure. */
1870         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1871         rte_ring_sc_dequeue(vpool->ring, &obj);
1872         mbuf = obj;
1873         if (unlikely(mbuf == NULL)) {
1874                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1875                 RTE_LOG(ERR, VHOST_DATA,
1876                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1877                         dev->device_fh);
1878                 put_desc_to_used_list_zcp(vq, desc_idx);
1879                 return;
1880         }
1881
1882         if (vm2vm_mode == VM2VM_HARDWARE) {
1883                 /* Avoid using a VLAN tag from any VM for an external packet,
1884                  * such as vlan_tags[dev->device_fh]; otherwise it conflicts
1885                  * with pool selection. The MAC address marks the packet as an
1886                  * external one that should go to the network, while the VLAN
1887                  * tag marks it as a VM-to-VM packet to forward to another VM.
1888                  * The hardware cannot resolve this ambiguity, so the packet is lost.
1889                  */
1890                 vlan_tag = external_pkt_default_vlan_tag;
1891                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1892                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1893                         __rte_mbuf_raw_free(mbuf);
1894                         return;
1895                 }
1896         }
1897
1898         mbuf->nb_segs = m->nb_segs;
1899         mbuf->next = m->next;
1900         mbuf->data_len = m->data_len + offset;
1901         mbuf->pkt_len = mbuf->data_len;
1902         if (unlikely(need_copy)) {
1903                 /* Copy the packet contents to the mbuf. */
1904                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1905                         rte_pktmbuf_mtod(m, void *),
1906                         m->data_len);
1907         } else {
1908                 mbuf->data_off = m->data_off;
1909                 mbuf->buf_physaddr = m->buf_physaddr;
1910                 mbuf->buf_addr = m->buf_addr;
1911         }
1912         mbuf->ol_flags |= PKT_TX_VLAN_PKT;
1913         mbuf->vlan_tci = vlan_tag;
1914         mbuf->l2_len = sizeof(struct ether_hdr);
1915         mbuf->l3_len = sizeof(struct ipv4_hdr);
1916         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1917
1918         tx_q->m_table[len] = mbuf;
1919         len++;
1920
1921         LOG_DEBUG(VHOST_DATA,
1922                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1923                 dev->device_fh,
1924                 mbuf->nb_segs,
1925                 (mbuf->next == NULL) ? "null" : "non-null");
1926
1927         if (enable_stats) {
1928                 dev_statistics[dev->device_fh].tx_total++;
1929                 dev_statistics[dev->device_fh].tx++;
1930         }
1931
1932         if (unlikely(len == MAX_PKT_BURST)) {
1933                 m_table = (struct rte_mbuf **)tx_q->m_table;
1934                 ret = rte_eth_tx_burst(ports[0],
1935                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1936
1937                 /*
1938                  * Free any buffers not handled by TX and update
1939                  * the port stats.
1940                  */
1941                 if (unlikely(ret < len)) {
1942                         do {
1943                                 rte_pktmbuf_free(m_table[ret]);
1944                         } while (++ret < len);
1945                 }
1946
1947                 len = 0;
1948                 txmbuf_clean_zcp(dev, vpool);
1949         }
1950
1951         tx_q->len = len;
1952
1953         return;
1954 }
1955
1956 /*
1957  * This function transmits all available packets in the virtio TX queue of
1958  * one virtio-net device. On the first packet it learns the MAC address and
1959  * sets up the VMDQ queue.
1960  */
1961 static inline void __attribute__((always_inline))
1962 virtio_dev_tx_zcp(struct virtio_net *dev)
1963 {
1964         struct rte_mbuf m;
1965         struct vhost_virtqueue *vq;
1966         struct vring_desc *desc;
1967         uint64_t buff_addr = 0, phys_addr;
1968         uint32_t head[MAX_PKT_BURST];
1969         uint32_t i;
1970         uint16_t free_entries, packet_success = 0;
1971         uint16_t avail_idx;
1972         uint8_t need_copy = 0;
1973         hpa_type addr_type;
1974         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1975
1976         vq = dev->virtqueue[VIRTIO_TXQ];
1977         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1978
1979         /* If there are no available buffers then return. */
1980         if (vq->last_used_idx_res == avail_idx)
1981                 return;
1982
1983         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1984
1985         /* Prefetch available ring to retrieve head indexes. */
1986         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1987
1988         /* Get the number of free entries in the ring */
1989         free_entries = (avail_idx - vq->last_used_idx_res);
1990
1991         /* Limit to MAX_PKT_BURST. */
1992         free_entries
1993                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1994
1995         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1996                 dev->device_fh, free_entries);
1997
1998         /* Retrieve all of the head indexes first to avoid caching issues. */
1999         for (i = 0; i < free_entries; i++)
2000                 head[i]
2001                         = vq->avail->ring[(vq->last_used_idx_res + i)
2002                         & (vq->size - 1)];
2003
2004         vq->last_used_idx_res += free_entries;
2005
2006         /* Prefetch descriptor index. */
2007         rte_prefetch0(&vq->desc[head[packet_success]]);
2008         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
2009
2010         while (packet_success < free_entries) {
2011                 desc = &vq->desc[head[packet_success]];
2012
2013                 /* Discard first buffer as it is the virtio header */
2014                 desc = &vq->desc[desc->next];
2015
2016                 /* Buffer address translation. */
2017                 buff_addr = gpa_to_vva(dev, desc->addr);
2018                 /* Check an extra VLAN_HLEN bytes, needed when inserting a VLAN tag. */
2019                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
2020                         &addr_type);
2021
2022                 if (likely(packet_success < (free_entries - 1)))
2023                         /* Prefetch descriptor index. */
2024                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
2025
2026                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2027                         RTE_LOG(ERR, VHOST_DATA,
2028                                 "(%"PRIu64") Invalid frame buffer address found "
2029                                 "when transmitting packets!\n",
2030                                 dev->device_fh);
2031                         packet_success++;
2032                         continue;
2033                 }
2034
2035                 /* Prefetch buffer address. */
2036                 rte_prefetch0((void *)(uintptr_t)buff_addr);
2037
2038                 /*
2039                  * Setup dummy mbuf. This is copied to a real mbuf if
2040                  * transmitted out the physical port.
2041                  */
2042                 m.data_len = desc->len;
2043                 m.nb_segs = 1;
2044                 m.next = NULL;
2045                 m.data_off = 0;
2046                 m.buf_addr = (void *)(uintptr_t)buff_addr;
2047                 m.buf_physaddr = phys_addr;
2048
2049                 /*
2050                  * Check if the frame buffer address from guest crosses
2051                  * sub-region or not.
2052                  */
2053                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2054                         RTE_LOG(ERR, VHOST_DATA,
2055                                 "(%"PRIu64") Frame buffer address crossing "
2056                                 "sub-region found when attaching TX frame "
2057                                 "buffer address!\n",
2058                                 dev->device_fh);
2059                         need_copy = 1;
2060                 } else
2061                         need_copy = 0;
2062
2063                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2064
2065                 /*
2066                  * If this is the first received packet we need to learn
2067                  * the MAC and setup VMDQ
2068                  */
2069                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2070                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2071                                 /*
2072                                  * Discard frame if device is scheduled for
2073                                  * removal or a duplicate MAC address is found.
2074                                  */
2075                                 packet_success += free_entries;
2076                                 vq->last_used_idx += packet_success;
2077                                 break;
2078                         }
2079                 }
2080
2081                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2082                 packet_success++;
2083         }
2084 }
2085
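/*
 * Editor's note: the struct rte_mbuf 'm' above lives on the stack and only
 * borrows the guest buffer; virtio_tx_route_zcp() either copies out of it
 * (need_copy) or takes over buf_addr/buf_physaddr into a pool mbuf, so the
 * dummy mbuf itself is never handed to rte_eth_tx_burst().
 */
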
2086 /*
2087  * This function is called by each data core. It handles all RX/TX registered
2088  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2089  * addresses are compared with all devices in the main linked list.
2090  */
2091 static int
2092 switch_worker_zcp(__attribute__((unused)) void *arg)
2093 {
2094         struct virtio_net *dev = NULL;
2095         struct vhost_dev  *vdev = NULL;
2096         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2097         struct virtio_net_data_ll *dev_ll;
2098         struct mbuf_table *tx_q;
2099         volatile struct lcore_ll_info *lcore_ll;
2100         const uint64_t drain_tsc
2101                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2102                 * BURST_TX_DRAIN_US;
2103         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2104         unsigned ret;
2105         const uint16_t lcore_id = rte_lcore_id();
2106         uint16_t count_in_ring, rx_count = 0;
2107
2108         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2109
2110         lcore_ll = lcore_info[lcore_id].lcore_ll;
2111         prev_tsc = 0;
2112
2113         while (1) {
2114                 cur_tsc = rte_rdtsc();
2115
2116                 /* TX burst queue drain */
2117                 diff_tsc = cur_tsc - prev_tsc;
2118                 if (unlikely(diff_tsc > drain_tsc)) {
2119                         /*
2120                          * Get mbufs from vpool.pool, detach them and
2121                          * put them back into vpool.ring.
2122                          */
2123                         dev_ll = lcore_ll->ll_root_used;
2124                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2125                                 /* Get virtio device ID */
2126                                 vdev = dev_ll->vdev;
2127                                 dev = vdev->dev;
2128
2129                                 if (likely(!vdev->remove)) {
2130                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2131                                         if (tx_q->len) {
2132                                                 LOG_DEBUG(VHOST_DATA,
2133                                                 "TX queue drained after timeout"
2134                                                 " with burst size %u\n",
2135                                                 tx_q->len);
2136
2137                                                 /*
2138                                                  * Tx any packets in the queue
2139                                                  */
2140                                                 ret = rte_eth_tx_burst(
2141                                                         ports[0],
2142                                                         (uint16_t)tx_q->txq_id,
2143                                                         (struct rte_mbuf **)
2144                                                         tx_q->m_table,
2145                                                         (uint16_t)tx_q->len);
2146                                                 if (unlikely(ret < tx_q->len)) {
2147                                                         do {
2148                                                                 rte_pktmbuf_free(
2149                                                                         tx_q->m_table[ret]);
2150                                                         } while (++ret < tx_q->len);
2151                                                 }
2152                                                 tx_q->len = 0;
2153
2154                                                 txmbuf_clean_zcp(dev,
2155                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2156                                         }
2157                                 }
2158                                 dev_ll = dev_ll->next;
2159                         }
2160                         prev_tsc = cur_tsc;
2161                 }
2162
2163                 rte_prefetch0(lcore_ll->ll_root_used);
2164
2165                 /*
2166                  * Inform the configuration core that we have exited the linked
2167                  * list and that no devices are in use if requested.
2168                  */
2169                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2170                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2171
2172                 /* Process devices */
2173                 dev_ll = lcore_ll->ll_root_used;
2174
2175                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2176                         vdev = dev_ll->vdev;
2177                         dev  = vdev->dev;
2178                         if (unlikely(vdev->remove)) {
2179                                 dev_ll = dev_ll->next;
2180                                 unlink_vmdq(vdev);
2181                                 vdev->ready = DEVICE_SAFE_REMOVE;
2182                                 continue;
2183                         }
2184
2185                         if (likely(vdev->ready == DEVICE_RX)) {
2186                                 uint32_t index = vdev->vmdq_rx_q;
2187                                 uint16_t i;
2188                                 count_in_ring =
2189                                         rte_ring_count(vpool_array[index].ring);
2190                                 uint16_t free_entries =
2191                                         (uint16_t)get_available_ring_num_zcp(dev);
2192
2193                                 /*
2194                                  * Attach all mbufs in vpool.ring and put back
2195                                  * into vpool.pool.
2196                                  */
2197                                 for (i = 0;
2198                                         i < RTE_MIN(free_entries,
2199                                         RTE_MIN(count_in_ring, MAX_PKT_BURST));
2200                                         i++)
2201                                         attach_rxmbuf_zcp(dev);
2202
2203                                 /* Handle guest RX */
2204                                 rx_count = rte_eth_rx_burst(ports[0],
2205                                         vdev->vmdq_rx_q, pkts_burst,
2206                                         MAX_PKT_BURST);
2207
2208                                 if (rx_count) {
2209                                         ret_count = virtio_dev_rx_zcp(dev,
2210                                                         pkts_burst, rx_count);
2211                                         if (enable_stats) {
2212                                                 dev_statistics[dev->device_fh].rx_total
2213                                                         += rx_count;
2214                                                 dev_statistics[dev->device_fh].rx
2215                                                         += ret_count;
2216                                         }
2217                                         while (likely(rx_count)) {
2218                                                 rx_count--;
2219                                                 pktmbuf_detach_zcp(
2220                                                         pkts_burst[rx_count]);
2221                                                 rte_ring_sp_enqueue(
2222                                                         vpool_array[index].ring,
2223                                                         (void *)pkts_burst[rx_count]);
2224                                         }
2225                                 }
2226                         }
2227
2228                         if (likely(!vdev->remove))
2229                                 /* Handle guest TX */
2230                                 virtio_dev_tx_zcp(dev);
2231
2232                         /* Move to the next device in the list */
2233                         dev_ll = dev_ll->next;
2234                 }
2235         }
2236
2237         return 0;
2238 }
2239
2240
2241 /*
2242  * Add an entry to a used linked list. A free entry must first be found
2243  * in the free linked list using get_data_ll_free_entry().
2244  */
2245 static void
2246 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2247         struct virtio_net_data_ll *ll_dev)
2248 {
2249         struct virtio_net_data_ll *ll = *ll_root_addr;
2250
2251         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2252         ll_dev->next = NULL;
2253         rte_compiler_barrier();
2254
2255         /* If ll == NULL then this is the first device. */
2256         if (ll) {
2257                 /* Increment to the tail of the linked list. */
2258                 while (ll->next != NULL)
2259                         ll = ll->next;
2260
2261                 ll->next = ll_dev;
2262         } else {
2263                 *ll_root_addr = ll_dev;
2264         }
2265 }
2266
2267 /*
2268  * Remove an entry from a used linked list. The entry must then be added to
2269  * the free linked list using put_data_ll_free_entry().
2270  */
2271 static void
2272 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2273         struct virtio_net_data_ll *ll_dev,
2274         struct virtio_net_data_ll *ll_dev_last)
2275 {
2276         struct virtio_net_data_ll *ll = *ll_root_addr;
2277
2278         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2279                 return;
2280
2281         if (ll_dev == ll)
2282                 *ll_root_addr = ll_dev->next;
2283         else
2284                 if (likely(ll_dev_last != NULL))
2285                         ll_dev_last->next = ll_dev->next;
2286                 else
2287                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2288 }
2289
2290 /*
2291  * Find and return an entry from the free linked list.
2292  */
2293 static struct virtio_net_data_ll *
2294 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2295 {
2296         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2297         struct virtio_net_data_ll *ll_dev;
2298
2299         if (ll_free == NULL)
2300                 return NULL;
2301
2302         ll_dev = ll_free;
2303         *ll_root_addr = ll_free->next;
2304
2305         return ll_dev;
2306 }
2307
2308 /*
2309  * Place an entry back on to the free linked list.
2310  */
2311 static void
2312 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2313         struct virtio_net_data_ll *ll_dev)
2314 {
2315         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2316
2317         if (ll_dev == NULL)
2318                 return;
2319
2320         ll_dev->next = ll_free;
2321         *ll_root_addr = ll_dev;
2322 }
2323
2324 /*
2325  * Creates a linked list of a given size.
2326  */
2327 static struct virtio_net_data_ll *
2328 alloc_data_ll(uint32_t size)
2329 {
2330         struct virtio_net_data_ll *ll_new;
2331         uint32_t i;
2332
2333         /* Malloc and then chain the linked list. */
2334         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2335         if (ll_new == NULL) {
2336                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2337                 return NULL;
2338         }
2339
2340         for (i = 0; i < size - 1; i++) {
2341                 ll_new[i].vdev = NULL;
2342                 ll_new[i].next = &ll_new[i+1];
2343         }
2344         ll_new[i].next = NULL;
2345
2346         return ll_new;
2347 }
2348
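/*
 * Editor's sketch of the free/used list discipline implemented by the
 * helpers above (the names 'free_root', 'used_root' and 'some_vdev' are
 * hypothetical):
 *
 *     struct virtio_net_data_ll *free_root = alloc_data_ll(8);
 *     struct virtio_net_data_ll *used_root = NULL;
 *
 *     // A device arrives: move one entry from the free to the used list.
 *     struct virtio_net_data_ll *e = get_data_ll_free_entry(&free_root);
 *     if (e != NULL) {
 *             e->vdev = some_vdev;
 *             add_data_ll_entry(&used_root, e);
 *     }
 *
 *     // The device leaves: unlink from used, return to free.
 *     rm_data_ll_entry(&used_root, e, NULL);    // NULL: e is the head
 *     put_data_ll_free_entry(&free_root, e);
 */
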
2349 /*
2350  * Create the main linked list along with each individual cores linked list. A used and a free list
2351  * Create the main linked list along with each individual core's linked list. A used and a free list
2352  */
2353 static int
2354 init_data_ll (void)
2355 {
2356         int lcore;
2357
2358         RTE_LCORE_FOREACH_SLAVE(lcore) {
2359                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2360                 if (lcore_info[lcore].lcore_ll == NULL) {
2361                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2362                         return -1;
2363                 }
2364
2365                 lcore_info[lcore].lcore_ll->device_num = 0;
2366                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2367                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2368                 if (num_devices % num_switching_cores)
2369                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2370                 else
2371                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2372         }
2373
2374         /* Allocate devices up to a maximum of MAX_DEVICES. */
2375         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2376
2377         return 0;
2378 }
2379
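/*
 * Editor's note on the sizing in init_data_ll(): each worker core gets
 * ceil(num_devices / num_switching_cores) free entries. With assumed values
 * num_devices = 10 and num_switching_cores = 4:
 *
 *     10 % 4 != 0  ->  per-core list size = 10 / 4 + 1 = 3
 *
 * so any distribution of the 10 devices across the 4 cores fits.
 */
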
2380 /*
2381  * Remove a device from the specific data core linked list and from the main linked list. Synchonization
2382  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2383  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2384  */
2385 static void
2386 destroy_device (volatile struct virtio_net *dev)
2387 {
2388         struct virtio_net_data_ll *ll_lcore_dev_cur;
2389         struct virtio_net_data_ll *ll_main_dev_cur;
2390         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2391         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2392         struct vhost_dev *vdev;
2393         int lcore;
2394
2395         dev->flags &= ~VIRTIO_DEV_RUNNING;
2396
2397         vdev = (struct vhost_dev *)dev->priv;
2398         /* Set the remove flag. */
2399         vdev->remove = 1;
2400         while(vdev->ready != DEVICE_SAFE_REMOVE) {
2401                 rte_pause();
2402         }
2403
2404         /* Search for entry to be removed from lcore ll */
2405         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2406         while (ll_lcore_dev_cur != NULL) {
2407                 if (ll_lcore_dev_cur->vdev == vdev) {
2408                         break;
2409                 } else {
2410                         ll_lcore_dev_last = ll_lcore_dev_cur;
2411                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2412                 }
2413         }
2414
2415         if (ll_lcore_dev_cur == NULL) {
2416                 RTE_LOG(ERR, VHOST_CONFIG,
2417                         "(%"PRIu64") Failed to find the dev to be destroyed.\n",
2418                         dev->device_fh);
2419                 return;
2420         }
2421
2422         /* Search for entry to be removed from main ll */
2423         ll_main_dev_cur = ll_root_used;
2424         ll_main_dev_last = NULL;
2425         while (ll_main_dev_cur != NULL) {
2426                 if (ll_main_dev_cur->vdev == vdev) {
2427                         break;
2428                 } else {
2429                         ll_main_dev_last = ll_main_dev_cur;
2430                         ll_main_dev_cur = ll_main_dev_cur->next;
2431                 }
2432         }
2433
2434         /* Remove entries from the lcore and main ll. */
2435         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2436         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2437
2438         /* Set the dev_removal_flag on each lcore. */
2439         RTE_LCORE_FOREACH_SLAVE(lcore) {
2440                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2441         }
2442
2443         /*
2444          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2445          * they can no longer access the device removed from the linked lists and that the devices
2446          * are no longer in use.
2447          */
2448         RTE_LCORE_FOREACH_SLAVE(lcore) {
2449                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2450                         rte_pause();
2451                 }
2452         }
2453
2454         /* Add the entries back to the lcore and main free ll.*/
2455         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2456         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2457
2458         /* Decrement number of device on the lcore. */
2459         lcore_info[vdev->coreid].lcore_ll->device_num--;
2460
2461         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2462
2463         if (zero_copy) {
2464                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2465
2466                 /* Stop the RX queue. */
2467                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2468                         LOG_DEBUG(VHOST_CONFIG,
2469                                 "(%"PRIu64") In destroy_device: Failed to stop "
2470                                 "rx queue:%d\n",
2471                                 dev->device_fh,
2472                                 vdev->vmdq_rx_q);
2473                 }
2474
2475                 LOG_DEBUG(VHOST_CONFIG,
2476                         "(%"PRIu64") in destroy_device: putting mbufs in "
2477                         "mempool back to the ring for RX queue: %d\n",
2478                         dev->device_fh, vdev->vmdq_rx_q);
2479
2480                 mbuf_destroy_zcp(vpool);
2481
2482                 /* Stop the TX queue. */
2483                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2484                         LOG_DEBUG(VHOST_CONFIG,
2485                                 "(%"PRIu64") In destroy_device: Failed to "
2486                                 "stop tx queue:%d\n",
2487                                 dev->device_fh, vdev->vmdq_rx_q);
2488                 }
2489
2490                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2491
2492                 LOG_DEBUG(VHOST_CONFIG,
2493                         "(%"PRIu64") destroy_device: putting mbufs in mempool "
2494                         "back to the ring for TX queue: %d, dev:(%"PRIu64")\n",
2495                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2496                         dev->device_fh);
2497
2498                 mbuf_destroy_zcp(vpool);
2499                 rte_free(vdev->regions_hpa);
2500         }
2501         rte_free(vdev);
2502
2503 }
2504
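/*
 * Editor's sketch of the removal handshake driven by destroy_device(): the
 * config core unlinks the entry, then raises the flag on every worker; each
 * worker acknowledges from its polling loop with
 *
 *     if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
 *             lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
 *
 * A worker only sets ACK at the top of an iteration, before it takes a
 * fresh snapshot of ll_root_used, so once every core has acknowledged, none
 * of them can still hold a pointer into the unlinked entry.
 */
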
2505 /*
2506  * Calculate the number of physically contiguous sub-regions within one
2507  * particular region whose vhost virtual address is contiguous. The region
2508  * starts at vva_start and is 'size' bytes long.
2509  */
2510 static uint32_t
2511 check_hpa_regions(uint64_t vva_start, uint64_t size)
2512 {
2513         uint32_t i, nregions = 0, page_size = getpagesize();
2514         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2515         if (vva_start % page_size) {
2516                 LOG_DEBUG(VHOST_CONFIG,
2517                         "in check_hpa_regions: vva start(%p) mod page_size(%d) "
2518                         "has remainder\n",
2519                         (void *)(uintptr_t)vva_start, page_size);
2520                 return 0;
2521         }
2522         if (size % page_size) {
2523                 LOG_DEBUG(VHOST_CONFIG,
2524                         "in check_hpa_regions: "
2525                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2526                         size, page_size);
2527                 return 0;
2528         }
2529         for (i = 0; i < size - page_size; i = i + page_size) {
2530                 cur_phys_addr
2531                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2532                 next_phys_addr = rte_mem_virt2phy(
2533                         (void *)(uintptr_t)(vva_start + i + page_size));
2534                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2535                         ++nregions;
2536                         LOG_DEBUG(VHOST_CONFIG,
2537                                 "in check_hpa_regions: hva addr:(%p) is not "
2538                                 "continuous with hva addr:(%p), diff:%d\n",
2539                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2540                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2541                                 + page_size), page_size);
2542                         LOG_DEBUG(VHOST_CONFIG,
2543                                 "in check_hpa_regions: hpa addr:(%p) is not "
2544                                 "continuous with hpa addr:(%p), "
2545                                 "diff:(%"PRIu64")\n",
2546                                 (void *)(uintptr_t)cur_phys_addr,
2547                                 (void *)(uintptr_t)next_phys_addr,
2548                                 (next_phys_addr-cur_phys_addr));
2549                 }
2550         }
2551         return nregions;
2552 }
2553
2554 /*
2555  * Divide each region whose vhost virtual address range is contiguous into
2556  * sub-regions such that the host physical addresses within each sub-region
2557  * are contiguous, and fill the offset (relative to the GPA), size and other
2558  * information of each sub-region into regions_hpa.
2559  */
2560 static uint32_t
2561 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2562 {
2563         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2564         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2565
2566         if (mem_region_hpa == NULL)
2567                 return 0;
2568
2569         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2570                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2571                         virtio_memory->regions[regionidx].address_offset;
2572                 mem_region_hpa[regionidx_hpa].guest_phys_address
2573                         = virtio_memory->regions[regionidx].guest_phys_address;
2574                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2575                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2576                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2577                 LOG_DEBUG(VHOST_CONFIG,
2578                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2579                         regionidx_hpa,
2580                         (void *)(uintptr_t)
2581                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2582                 LOG_DEBUG(VHOST_CONFIG,
2583                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2584                         regionidx_hpa,
2585                         (void *)(uintptr_t)
2586                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2587                 for (i = 0, k = 0;
2588                         i < virtio_memory->regions[regionidx].memory_size -
2589                                 page_size;
2590                         i += page_size) {
2591                         cur_phys_addr = rte_mem_virt2phy(
2592                                         (void *)(uintptr_t)(vva_start + i));
2593                         next_phys_addr = rte_mem_virt2phy(
2594                                         (void *)(uintptr_t)(vva_start +
2595                                         i + page_size));
2596                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2597                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2598                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2599                                         k + page_size;
2600                                 mem_region_hpa[regionidx_hpa].memory_size
2601                                         = k + page_size;
2602                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2603                                         "phys addr end  [%d]:(%p)\n",
2604                                         regionidx_hpa,
2605                                         (void *)(uintptr_t)
2606                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2607                                 LOG_DEBUG(VHOST_CONFIG,
2608                                         "in fill_hpa_regions: guest phys addr "
2609                                         "size [%d]:(%p)\n",
2610                                         regionidx_hpa,
2611                                         (void *)(uintptr_t)
2612                                         (mem_region_hpa[regionidx_hpa].memory_size));
2613                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2614                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2615                                 ++regionidx_hpa;
2616                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2617                                         next_phys_addr -
2618                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2619                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2620                                         " phys addr start[%d]:(%p)\n",
2621                                         regionidx_hpa,
2622                                         (void *)(uintptr_t)
2623                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2624                                 LOG_DEBUG(VHOST_CONFIG,
2625                                         "in fill_hpa_regions: host  phys addr "
2626                                         "start[%d]:(%p)\n",
2627                                         regionidx_hpa,
2628                                         (void *)(uintptr_t)
2629                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2630                                 k = 0;
2631                         } else {
2632                                 k += page_size;
2633                         }
2634                 }
2635                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2636                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2637                         + k + page_size;
2638                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2639                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2640                         "[%d]:(%p)\n", regionidx_hpa,
2641                         (void *)(uintptr_t)
2642                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2643                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2644                         "[%d]:(%p)\n", regionidx_hpa,
2645                         (void *)(uintptr_t)
2646                         (mem_region_hpa[regionidx_hpa].memory_size));
2647                 ++regionidx_hpa;
2648         }
2649         return regionidx_hpa;
2650 }
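/*
 * Illustrative sketch (not part of the original datapath): once the table
 * built by fill_hpa_memory_regions() is in place, a guest physical address
 * can be translated by a linear scan over the sub-regions, since within each
 * sub-region HPA = GPA + host_phys_addr_offset. The helper name below is
 * hypothetical; the lookup the zero-copy path actually performs is
 * equivalent.
 */
static inline uint64_t
gpa_to_hpa_sketch(struct vhost_dev *vdev, uint64_t guest_pa)
{
        uint32_t i;
        const struct virtio_memory_regions_hpa *region;

        for (i = 0; i < vdev->nregions_hpa; i++) {
                region = &vdev->regions_hpa[i];
                if ((guest_pa >= region->guest_phys_address) &&
                        (guest_pa < region->guest_phys_address_end))
                        /* Within a sub-region the offset is constant. */
                        return guest_pa + region->host_phys_addr_offset;
        }
        /* Address not covered by any sub-region. */
        return 0;
}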
2651
2652 /*
2653  * A new device is added to a data core. First the device is added to the
2654  * main linked list and then allocated to a specific data core.
2655  */
2656 static int
2657 new_device(struct virtio_net *dev)
2658 {
2659         struct virtio_net_data_ll *ll_dev;
2660         int lcore, core_add = 0;
2661         uint32_t device_num_min = num_devices;
2662         struct vhost_dev *vdev;
2663         uint32_t regionidx;
2664
2665         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2666         if (vdev == NULL) {
2667                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2668                         dev->device_fh);
2669                 return -1;
2670         }
2671         vdev->dev = dev;
2672         dev->priv = vdev;
2673
2674         if (zero_copy) {
2675                 vdev->nregions_hpa = dev->mem->nregions;
2676                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2677                         vdev->nregions_hpa
2678                                 += check_hpa_regions(
2679                                         dev->mem->regions[regionidx].guest_phys_address
2680                                         + dev->mem->regions[regionidx].address_offset,
2681                                         dev->mem->regions[regionidx].memory_size);
2682
2683                 }
2684
2685                 vdev->regions_hpa = rte_calloc("vhost hpa region",
2686                                                vdev->nregions_hpa,
2687                                                sizeof(struct virtio_memory_regions_hpa),
2688                                                RTE_CACHE_LINE_SIZE);
2689                 if (vdev->regions_hpa == NULL) {
2690                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2691                         rte_free(vdev);
2692                         return -1;
2693                 }
2694
2695
2696                 if (fill_hpa_memory_regions(
2697                         vdev->regions_hpa, dev->mem
2698                         ) != vdev->nregions_hpa) {
2699
2700                         RTE_LOG(ERR, VHOST_CONFIG,
2701                                 "hpa memory regions number mismatch: "
2702                                 "[%d]\n", vdev->nregions_hpa);
2703                         rte_free(vdev->regions_hpa);
2704                         rte_free(vdev);
2705                         return -1;
2706                 }
2707         }
2708
2709
2710         /* Add device to main ll */
2711         ll_dev = get_data_ll_free_entry(&ll_root_free);
2712         if (ll_dev == NULL) {
2713                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2714                         "of %d devices per core has been reached\n",
2715                         dev->device_fh, num_devices);
2716                 if (vdev->regions_hpa)
2717                         rte_free(vdev->regions_hpa);
2718                 rte_free(vdev);
2719                 return -1;
2720         }
2721         ll_dev->vdev = vdev;
2722         add_data_ll_entry(&ll_root_used, ll_dev);
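        /*
         * Map the device to its dedicated VMDq RX queue: the device file
         * handle scaled by queues_per_pool, offset by the port's first VMDq
         * queue (vmdq_queue_base).
         */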
2723         vdev->vmdq_rx_q
2724                 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2725
2726         if (zero_copy) {
2727                 uint32_t index = vdev->vmdq_rx_q;
2728                 uint32_t count_in_ring, i;
2729                 struct mbuf_table *tx_q;
2730
2731                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2732
2733                 LOG_DEBUG(VHOST_CONFIG,
2734                         "(%"PRIu64") in new_device: mbuf count in mempool "
2735                         "before attach is: %d\n",
2736                         dev->device_fh,
2737                         rte_mempool_count(vpool_array[index].pool));
2738                 LOG_DEBUG(VHOST_CONFIG,
2739                         "(%"PRIu64") in new_device: mbuf count in ring "
2740                         "before attach is: %d\n",
2741                         dev->device_fh, count_in_ring);
2742
2743                 /*
2744                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2745                  */
2746                 for (i = 0; i < count_in_ring; i++)
2747                         attach_rxmbuf_zcp(dev);
2748
2749                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2750                         "mempool after attach is: %d\n",
2751                         dev->device_fh,
2752                         rte_mempool_count(vpool_array[index].pool));
2753                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2754                         "ring after attach is: %d\n",
2755                         dev->device_fh,
2756                         rte_ring_count(vpool_array[index].ring));
2757
2758                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2759                 tx_q->txq_id = vdev->vmdq_rx_q;
2760
2761                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2762                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2763
2764                         LOG_DEBUG(VHOST_CONFIG,
2765                                 "(%"PRIu64") In new_device: Failed to start "
2766                                 "tx queue:%d\n",
2767                                 dev->device_fh, vdev->vmdq_rx_q);
2768
2769                         mbuf_destroy_zcp(vpool);
2770                         rte_free(vdev->regions_hpa);
2771                         rte_free(vdev);
2772                         return -1;
2773                 }
2774
2775                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2776                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2777
2778                         LOG_DEBUG(VHOST_CONFIG,
2779                                 "(%"PRIu64") In new_device: Failed to start "
2780                                 "rx queue:%d\n",
2781                                 dev->device_fh, vdev->vmdq_rx_q);
2782
2783                         /* Stop the TX queue. */
2784                         if (rte_eth_dev_tx_queue_stop(ports[0],
2785                                 vdev->vmdq_rx_q) != 0) {
2786                                 LOG_DEBUG(VHOST_CONFIG,
2787                                         "(%"PRIu64") In new_device: Failed to "
2788                                         "stop tx queue:%d\n",
2789                                         dev->device_fh, vdev->vmdq_rx_q);
2790                         }
2791
2792                         mbuf_destroy_zcp(vpool);
2793                         rte_free(vdev->regions_hpa);
2794                         rte_free(vdev);
2795                         return -1;
2796                 }
2797
2798         }
2799
2800         /* Reset the ready flag. */
2801         vdev->ready = DEVICE_MAC_LEARNING;
2802         vdev->remove = 0;
2803
2804         /* Find a suitable lcore to add the device. */
2805         RTE_LCORE_FOREACH_SLAVE(lcore) {
2806                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2807                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2808                         core_add = lcore;
2809                 }
2810         }
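        /*
         * core_add now holds the slave lcore with the fewest devices attached,
         * giving a simple least-loaded placement policy.
         */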
2811         /* Add device to lcore ll */
2812         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2813         if (ll_dev == NULL) {
2814                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2815                 vdev->ready = DEVICE_SAFE_REMOVE;
2816                 destroy_device(dev);
2817                 rte_free(vdev->regions_hpa);
2818                 rte_free(vdev);
2819                 return -1;
2820         }
2821         ll_dev->vdev = vdev;
2822         vdev->coreid = core_add;
2823
2824         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2825
2826         /* Initialize device stats */
2827         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2828
2829         /* Disable notifications. */
2830         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2831         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2832         lcore_info[vdev->coreid].lcore_ll->device_num++;
2833         dev->flags |= VIRTIO_DEV_RUNNING;
2834
2835         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2836
2837         return 0;
2838 }
2839
2840 /*
2841  * These callbacks allow devices to be added to the data core when
2842  * configuration has been fully completed.
2843  */
2844 static const struct virtio_net_device_ops virtio_net_device_ops =
2845 {
2846         .new_device =  new_device,
2847         .destroy_device = destroy_device,
2848 };
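/*
 * These ops are registered with the vhost library in main() via
 * rte_vhost_driver_callback_register(); the library then invokes
 * new_device()/destroy_device() as guests connect and disconnect.
 */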
2849
2850 /*
2851  * This thread wakes up periodically to print statistics, if the user has
2852  * enabled them.
2853  */
2854 static void
2855 print_stats(void)
2856 {
2857         struct virtio_net_data_ll *dev_ll;
2858         uint64_t tx_dropped, rx_dropped;
2859         uint64_t tx, tx_total, rx, rx_total;
2860         uint32_t device_fh;
2861         const char clr[] = { 27, '[', '2', 'J', '\0' };
2862         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
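        /*
         * ANSI escape sequences: ESC[2J clears the screen and ESC[1;1H moves
         * the cursor to the top-left corner.
         */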
2863
2864         while(1) {
2865                 sleep(enable_stats);
2866
2867                 /* Clear screen and move to top left */
2868                 printf("%s%s", clr, top_left);
2869
2870                 printf("\nDevice statistics ====================================");
2871
2872                 dev_ll = ll_root_used;
2873                 while (dev_ll != NULL) {
2874                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2875                         tx_total = dev_statistics[device_fh].tx_total;
2876                         tx = dev_statistics[device_fh].tx;
2877                         tx_dropped = tx_total - tx;
2878                         if (zero_copy == 0) {
2879                                 rx_total = rte_atomic64_read(
2880                                         &dev_statistics[device_fh].rx_total_atomic);
2881                                 rx = rte_atomic64_read(
2882                                         &dev_statistics[device_fh].rx_atomic);
2883                         } else {
2884                                 rx_total = dev_statistics[device_fh].rx_total;
2885                                 rx = dev_statistics[device_fh].rx;
2886                         }
2887                         rx_dropped = rx_total - rx;
2888
2889                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2890                                         "\nTX total:            %"PRIu64""
2891                                         "\nTX dropped:          %"PRIu64""
2892                                         "\nTX successful:               %"PRIu64""
2893                                         "\nRX total:            %"PRIu64""
2894                                         "\nRX dropped:          %"PRIu64""
2895                                         "\nRX successful:               %"PRIu64"",
2896                                         device_fh,
2897                                         tx_total,
2898                                         tx_dropped,
2899                                         tx,
2900                                         rx_total,
2901                                         rx_dropped,
2902                                         rx);
2903
2904                         dev_ll = dev_ll->next;
2905                 }
2906                 printf("\n======================================================\n");
2907         }
2908 }
2909
2910 static void
2911 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2912         char *ring_name, uint32_t nb_mbuf)
2913 {
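        /*
         * Create a per-queue mbuf pool plus a companion ring that tracks the
         * free (un-attached) mbufs. The ring size is rounded up to a power of
         * two, as rte_ring requires, with one extra slot because a ring holds
         * at most size - 1 entries; SP/SC flags are used since each queue is
         * presumably touched by a single producer and a single consumer.
         */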
2914         vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2915                 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2916         if (vpool_array[index].pool != NULL) {
2917                 vpool_array[index].ring
2918                         = rte_ring_create(ring_name,
2919                                 rte_align32pow2(nb_mbuf + 1),
2920                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2921                 if (likely(vpool_array[index].ring != NULL)) {
2922                         LOG_DEBUG(VHOST_CONFIG,
2923                                 "in setup_mempool_tbl: mbuf count in "
2924                                 "mempool is: %d\n",
2925                                 rte_mempool_count(vpool_array[index].pool));
2926                         LOG_DEBUG(VHOST_CONFIG,
2927                                 "in setup_mempool_tbl: mbuf count in "
2928                                 "ring   is: %d\n",
2929                                 rte_ring_count(vpool_array[index].ring));
2930                 } else {
2931                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2932                                 ring_name);
2933                 }
2934
2935                 /* Account for mbuf headroom when sizing the descriptor buffer. */
2936                 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2937         } else {
2938                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2939         }
2940 }
2941
2942 /* When we receive a SIGINT, unregister the vhost driver. */
2943 static void
2944 sigint_handler(__rte_unused int signum)
2945 {
2946         /* Unregister vhost driver. */
2947         int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2948         if (ret != 0)
2949                 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2950         exit(0);
2951 }
2952
2953 /*
2954  * Main function: performs initialisation and launches the per-lcore
2955  * functions. The CUSE device is also registered here to handle the IOCTLs.
2956  */
2957 int
2958 main(int argc, char *argv[])
2959 {
2960         struct rte_mempool *mbuf_pool = NULL;
2961         unsigned lcore_id, core_id = 0;
2962         unsigned nb_ports, valid_num_ports;
2963         int ret;
2964         uint8_t portid;
2965         uint16_t queue_id;
2966         static pthread_t tid;
2967         char thread_name[RTE_MAX_THREAD_NAME_LEN];
2968
2969         signal(SIGINT, sigint_handler);
2970
2971         /* init EAL */
2972         ret = rte_eal_init(argc, argv);
2973         if (ret < 0)
2974                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2975         argc -= ret;
2976         argv += ret;
2977
2978         /* parse app arguments */
2979         ret = us_vhost_parse_args(argc, argv);
2980         if (ret < 0)
2981                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2982
2983         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2984                 if (rte_lcore_is_enabled(lcore_id))
2985                         lcore_ids[core_id ++] = lcore_id;
2986
2987         if (rte_lcore_count() > RTE_MAX_LCORE)
2988                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2989
2990         /* Set the number of switching cores available. */
2991         num_switching_cores = rte_lcore_count()-1;
2992
2993         /* Get the number of physical ports. */
2994         nb_ports = rte_eth_dev_count();
2995         if (nb_ports > RTE_MAX_ETHPORTS)
2996                 nb_ports = RTE_MAX_ETHPORTS;
2997
2998         /*
2999          * Update the global variable num_ports and the global array ports[],
3000          * and derive valid_num_ports from the number of ports in the system.
3001          */
3002         valid_num_ports = check_ports_num(nb_ports);
3003
3004         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
3005                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
3006                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
3007                 return -1;
3008         }
3009
3010         if (zero_copy == 0) {
3011                 /* Create the mbuf pool. */
3012                 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
3013                         NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
3014                         0, MBUF_DATA_SIZE, rte_socket_id());
3015                 if (mbuf_pool == NULL)
3016                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
3017
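                /*
                 * In copy mode every per-queue vpool slot shares this single
                 * mempool, so code paths indexed by queue number keep working.
                 */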
3018                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
3019                         vpool_array[queue_id].pool = mbuf_pool;
3020
3021                 if (vm2vm_mode == VM2VM_HARDWARE) {
3022                         /* Enable VT loop back to let L2 switch to do it. */
3023                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3024                         LOG_DEBUG(VHOST_CONFIG,
3025                                 "Enable loop back for L2 switch in vmdq.\n");
3026                 }
3027         } else {
3028                 uint32_t nb_mbuf;
3029                 char pool_name[RTE_MEMPOOL_NAMESIZE];
3030                 char ring_name[RTE_MEMPOOL_NAMESIZE];
3031
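                /*
                 * Size each zero-copy pool for the descriptor ring plus the
                 * per-core mempool caches and one packet burst per switching
                 * core.
                 */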
3032                 nb_mbuf = num_rx_descriptor
3033                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3034                         + num_switching_cores * MAX_PKT_BURST;
3035
3036                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3037                         snprintf(pool_name, sizeof(pool_name),
3038                                 "rxmbuf_pool_%u", queue_id);
3039                         snprintf(ring_name, sizeof(ring_name),
3040                                 "rxmbuf_ring_%u", queue_id);
3041                         setup_mempool_tbl(rte_socket_id(), queue_id,
3042                                 pool_name, ring_name, nb_mbuf);
3043                 }
3044
3045                 nb_mbuf = num_tx_descriptor
3046                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3047                                 + num_switching_cores * MAX_PKT_BURST;
3048
3049                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3050                         snprintf(pool_name, sizeof(pool_name),
3051                                 "txmbuf_pool_%u", queue_id);
3052                         snprintf(ring_name, sizeof(ring_name),
3053                                 "txmbuf_ring_%u", queue_id);
3054                         setup_mempool_tbl(rte_socket_id(),
3055                                 (queue_id + MAX_QUEUES),
3056                                 pool_name, ring_name, nb_mbuf);
3057                 }
3058
3059                 if (vm2vm_mode == VM2VM_HARDWARE) {
3060                         /* Enable VT loop back to let L2 switch to do it. */
3061                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3062                         LOG_DEBUG(VHOST_CONFIG,
3063                                 "Enable loop back for L2 switch in vmdq.\n");
3064                 }
3065         }
3066         /* Set log level. */
3067         rte_set_log_level(LOG_LEVEL);
3068
3069         /* initialize all ports */
3070         for (portid = 0; portid < nb_ports; portid++) {
3071                 /* skip ports that are not enabled */
3072                 if ((enabled_port_mask & (1 << portid)) == 0) {
3073                         RTE_LOG(INFO, VHOST_PORT,
3074                                 "Skipping disabled port %d\n", portid);
3075                         continue;
3076                 }
3077                 if (port_init(portid) != 0)
3078                         rte_exit(EXIT_FAILURE,
3079                                 "Cannot initialize network ports\n");
3080         }
3081
3082         /* Initialise all linked lists. */
3083         if (init_data_ll() == -1)
3084                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3085
3086         /* Initialize device stats */
3087         memset(&dev_statistics, 0, sizeof(dev_statistics));
3088
3089         /* Enable stats if the user option is set. */
3090         if (enable_stats) {
3091                 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3092                 if (ret != 0)
3093                         rte_exit(EXIT_FAILURE,
3094                                 "Cannot create print-stats thread\n");
3095
3096                 /* Set thread_name for aid in debugging.  */
3097                 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3098                 ret = rte_thread_setname(tid, thread_name);
3099                 if (ret != 0)
3100                         RTE_LOG(ERR, VHOST_CONFIG,
3101                                 "Cannot set print-stats name\n");
3102         }
3103
3104         /* Launch all data cores. */
3105         if (zero_copy == 0) {
3106                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3107                         rte_eal_remote_launch(switch_worker,
3108                                 mbuf_pool, lcore_id);
3109                 }
3110         } else {
3111                 uint32_t count_in_mempool, index, i;
3112                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3113                         /* For all RX and TX queues. */
3114                         count_in_mempool
3115                                 = rte_mempool_count(vpool_array[index].pool);
3116
3117                         /*
3118                          * Transfer all unattached mbufs from vpool.pool
3119                          * to vpool.ring.
3120                          */
3121                         for (i = 0; i < count_in_mempool; i++) {
3122                                 struct rte_mbuf *mbuf
3123                                         = __rte_mbuf_raw_alloc(
3124                                                 vpool_array[index].pool);
3125                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3126                                                 (void *)mbuf);
3127                         }
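                        /*
                         * The ring now holds every free mbuf; the zero-copy
                         * datapath draws on it when attaching buffers to guest
                         * descriptors (see attach_rxmbuf_zcp() in new_device()).
                         */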
3128
3129                         LOG_DEBUG(VHOST_CONFIG,
3130                                 "in main: mbuf count in mempool at initial "
3131                                 "is: %d\n", count_in_mempool);
3132                         LOG_DEBUG(VHOST_CONFIG,
3133                                 "in main: mbuf count in  ring at initial  is :"
3134                                 " %d\n",
3135                                 rte_ring_count(vpool_array[index].ring));
3136                 }
3137
3138                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3139                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3140                                 lcore_id);
3141         }
3142
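        /*
         * Unless mergeable RX buffers were requested, mask the feature bit so
         * guests never negotiate VIRTIO_NET_F_MRG_RXBUF.
         */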
3143         if (mergeable == 0)
3144                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3145
3146         /* Register the vhost (CUSE or user) driver to handle vhost messages. */
3147         ret = rte_vhost_driver_register((char *)&dev_basename);
3148         if (ret != 0)
3149                 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3150
3151         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3152
3153         /* Start the vhost driver session loop (CUSE or vhost-user). */
3154         rte_vhost_driver_session_start();
3155         return 0;
3156
3157 }