New upstream version 18.11-rc1
[deb_dpdk.git] / examples / vhost / main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "main.h"
29
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36
37 #define MBUF_CACHE_SIZE 128
38 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
39
40 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
41
42 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
44
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX                       1
50 #define DEVICE_SAFE_REMOVE      2
51
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55
56 #define INVALID_PORT_ID 0xFF
57
58 /* Max number of devices. Limited by vmdq. */
59 #define MAX_DEVICES 64
60
61 /* Maximum long option length for option parsing. */
62 #define MAX_LONG_OPT_SZ 64
63
64 /* mask of enabled ports */
65 static uint32_t enabled_port_mask = 0;
66
67 /* Promiscuous mode */
68 static uint32_t promiscuous;
69
70 /* number of devices/queues to support*/
71 static uint32_t num_queues = 0;
72 static uint32_t num_devices;
73
74 static struct rte_mempool *mbuf_pool;
75 static int mergeable;
76
77 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
78 typedef enum {
79         VM2VM_DISABLED = 0,
80         VM2VM_SOFTWARE = 1,
81         VM2VM_HARDWARE = 2,
82         VM2VM_LAST
83 } vm2vm_type;
84 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
85
86 /* Enable stats. */
87 static uint32_t enable_stats = 0;
88 /* Enable retries on RX. */
89 static uint32_t enable_retry = 1;
90
91 /* Disable TX checksum offload */
92 static uint32_t enable_tx_csum;
93
94 /* Disable TSO offload */
95 static uint32_t enable_tso;
96
97 static int client_mode;
98 static int dequeue_zero_copy;
99
100 static int builtin_net_driver;
101
102 /* Specify timeout (in useconds) between retries on RX. */
103 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
104 /* Specify the number of retries on RX. */
105 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
106
107 /* Socket file paths. Can be set by user */
108 static char *socket_files;
109 static int nb_sockets;
110
111 /* empty vmdq configuration structure. Filled in programatically */
112 static struct rte_eth_conf vmdq_conf_default = {
113         .rxmode = {
114                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
115                 .split_hdr_size = 0,
116                 /*
117                  * VLAN strip is necessary for 1G NIC such as I350,
118                  * this fixes bug of ipv4 forwarding in guest can't
119                  * forward pakets from one virtio dev to another virtio dev.
120                  */
121                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
122         },
123
124         .txmode = {
125                 .mq_mode = ETH_MQ_TX_NONE,
126                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
127                              DEV_TX_OFFLOAD_TCP_CKSUM |
128                              DEV_TX_OFFLOAD_VLAN_INSERT |
129                              DEV_TX_OFFLOAD_MULTI_SEGS |
130                              DEV_TX_OFFLOAD_TCP_TSO),
131         },
132         .rx_adv_conf = {
133                 /*
134                  * should be overridden separately in code with
135                  * appropriate values
136                  */
137                 .vmdq_rx_conf = {
138                         .nb_queue_pools = ETH_8_POOLS,
139                         .enable_default_pool = 0,
140                         .default_pool = 0,
141                         .nb_pool_maps = 0,
142                         .pool_map = {{0, 0},},
143                 },
144         },
145 };
146
147
148 static unsigned lcore_ids[RTE_MAX_LCORE];
149 static uint16_t ports[RTE_MAX_ETHPORTS];
150 static unsigned num_ports = 0; /**< The number of ports specified in command line */
151 static uint16_t num_pf_queues, num_vmdq_queues;
152 static uint16_t vmdq_pool_base, vmdq_queue_base;
153 static uint16_t queues_per_pool;
154
155 const uint16_t vlan_tags[] = {
156         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
157         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
158         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
159         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
160         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
161         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
162         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
163         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
164 };
165
166 /* ethernet addresses of ports */
167 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
168
169 static struct vhost_dev_tailq_list vhost_dev_list =
170         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
171
172 static struct lcore_info lcore_info[RTE_MAX_LCORE];
173
174 /* Used for queueing bursts of TX packets. */
175 struct mbuf_table {
176         unsigned len;
177         unsigned txq_id;
178         struct rte_mbuf *m_table[MAX_PKT_BURST];
179 };
180
181 /* TX queue for each data core. */
182 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
183
184 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
185                                  / US_PER_S * BURST_TX_DRAIN_US)
186 #define VLAN_HLEN       4
187
188 /*
189  * Builds up the correct configuration for VMDQ VLAN pool map
190  * according to the pool & queue limits.
191  */
192 static inline int
193 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
194 {
195         struct rte_eth_vmdq_rx_conf conf;
196         struct rte_eth_vmdq_rx_conf *def_conf =
197                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
198         unsigned i;
199
200         memset(&conf, 0, sizeof(conf));
201         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
202         conf.nb_pool_maps = num_devices;
203         conf.enable_loop_back = def_conf->enable_loop_back;
204         conf.rx_mode = def_conf->rx_mode;
205
206         for (i = 0; i < conf.nb_pool_maps; i++) {
207                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
208                 conf.pool_map[i].pools = (1UL << i);
209         }
210
211         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
212         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
213                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
214         return 0;
215 }
216
217 /*
218  * Validate the device number according to the max pool number gotten form
219  * dev_info. If the device number is invalid, give the error message and
220  * return -1. Each device must have its own pool.
221  */
222 static inline int
223 validate_num_devices(uint32_t max_nb_devices)
224 {
225         if (num_devices > max_nb_devices) {
226                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
227                 return -1;
228         }
229         return 0;
230 }
231
232 /*
233  * Initialises a given port using global settings and with the rx buffers
234  * coming from the mbuf_pool passed as parameter
235  */
236 static inline int
237 port_init(uint16_t port)
238 {
239         struct rte_eth_dev_info dev_info;
240         struct rte_eth_conf port_conf;
241         struct rte_eth_rxconf *rxconf;
242         struct rte_eth_txconf *txconf;
243         int16_t rx_rings, tx_rings;
244         uint16_t rx_ring_size, tx_ring_size;
245         int retval;
246         uint16_t q;
247
248         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
249         rte_eth_dev_info_get (port, &dev_info);
250
251         rxconf = &dev_info.default_rxconf;
252         txconf = &dev_info.default_txconf;
253         rxconf->rx_drop_en = 1;
254
255         /*configure the number of supported virtio devices based on VMDQ limits */
256         num_devices = dev_info.max_vmdq_pools;
257
258         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
259         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
260
261         /*
262          * When dequeue zero copy is enabled, guest Tx used vring will be
263          * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
264          * (tx_ring_size here) must be small enough so that the driver will
265          * hit the free threshold easily and free mbufs timely. Otherwise,
266          * guest Tx vring would be starved.
267          */
268         if (dequeue_zero_copy)
269                 tx_ring_size = 64;
270
271         tx_rings = (uint16_t)rte_lcore_count();
272
273         retval = validate_num_devices(MAX_DEVICES);
274         if (retval < 0)
275                 return retval;
276
277         /* Get port configuration. */
278         retval = get_eth_conf(&port_conf, num_devices);
279         if (retval < 0)
280                 return retval;
281         /* NIC queues are divided into pf queues and vmdq queues.  */
282         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
283         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
284         num_vmdq_queues = num_devices * queues_per_pool;
285         num_queues = num_pf_queues + num_vmdq_queues;
286         vmdq_queue_base = dev_info.vmdq_queue_base;
287         vmdq_pool_base  = dev_info.vmdq_pool_base;
288         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
289                 num_pf_queues, num_devices, queues_per_pool);
290
291         if (!rte_eth_dev_is_valid_port(port))
292                 return -1;
293
294         rx_rings = (uint16_t)dev_info.max_rx_queues;
295         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
296                 port_conf.txmode.offloads |=
297                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
298         /* Configure ethernet device. */
299         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
300         if (retval != 0) {
301                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
302                         port, strerror(-retval));
303                 return retval;
304         }
305
306         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
307                 &tx_ring_size);
308         if (retval != 0) {
309                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
310                         "for port %u: %s.\n", port, strerror(-retval));
311                 return retval;
312         }
313         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
314                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
315                         "for Rx queues on port %u.\n", port);
316                 return -1;
317         }
318
319         /* Setup the queues. */
320         rxconf->offloads = port_conf.rxmode.offloads;
321         for (q = 0; q < rx_rings; q ++) {
322                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
323                                                 rte_eth_dev_socket_id(port),
324                                                 rxconf,
325                                                 mbuf_pool);
326                 if (retval < 0) {
327                         RTE_LOG(ERR, VHOST_PORT,
328                                 "Failed to setup rx queue %u of port %u: %s.\n",
329                                 q, port, strerror(-retval));
330                         return retval;
331                 }
332         }
333         txconf->offloads = port_conf.txmode.offloads;
334         for (q = 0; q < tx_rings; q ++) {
335                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
336                                                 rte_eth_dev_socket_id(port),
337                                                 txconf);
338                 if (retval < 0) {
339                         RTE_LOG(ERR, VHOST_PORT,
340                                 "Failed to setup tx queue %u of port %u: %s.\n",
341                                 q, port, strerror(-retval));
342                         return retval;
343                 }
344         }
345
346         /* Start the device. */
347         retval  = rte_eth_dev_start(port);
348         if (retval < 0) {
349                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
350                         port, strerror(-retval));
351                 return retval;
352         }
353
354         if (promiscuous)
355                 rte_eth_promiscuous_enable(port);
356
357         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
358         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
359         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
360                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
361                         port,
362                         vmdq_ports_eth_addr[port].addr_bytes[0],
363                         vmdq_ports_eth_addr[port].addr_bytes[1],
364                         vmdq_ports_eth_addr[port].addr_bytes[2],
365                         vmdq_ports_eth_addr[port].addr_bytes[3],
366                         vmdq_ports_eth_addr[port].addr_bytes[4],
367                         vmdq_ports_eth_addr[port].addr_bytes[5]);
368
369         return 0;
370 }
371
372 /*
373  * Set socket file path.
374  */
375 static int
376 us_vhost_parse_socket_path(const char *q_arg)
377 {
378         /* parse number string */
379         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
380                 return -1;
381
382         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
383         snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
384         nb_sockets++;
385
386         return 0;
387 }
388
389 /*
390  * Parse the portmask provided at run time.
391  */
392 static int
393 parse_portmask(const char *portmask)
394 {
395         char *end = NULL;
396         unsigned long pm;
397
398         errno = 0;
399
400         /* parse hexadecimal string */
401         pm = strtoul(portmask, &end, 16);
402         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
403                 return -1;
404
405         if (pm == 0)
406                 return -1;
407
408         return pm;
409
410 }
411
412 /*
413  * Parse num options at run time.
414  */
415 static int
416 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
417 {
418         char *end = NULL;
419         unsigned long num;
420
421         errno = 0;
422
423         /* parse unsigned int string */
424         num = strtoul(q_arg, &end, 10);
425         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
426                 return -1;
427
428         if (num > max_valid_value)
429                 return -1;
430
431         return num;
432
433 }
434
435 /*
436  * Display usage
437  */
438 static void
439 us_vhost_usage(const char *prgname)
440 {
441         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
442         "               --vm2vm [0|1|2]\n"
443         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
444         "               --socket-file <path>\n"
445         "               --nb-devices ND\n"
446         "               -p PORTMASK: Set mask for ports to be used by application\n"
447         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
448         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
449         "               --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
450         "               --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
451         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
452         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
453         "               --socket-file: The path of the socket file.\n"
454         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
455         "               --tso [0|1] disable/enable TCP segment offload.\n"
456         "               --client register a vhost-user socket as client mode.\n"
457         "               --dequeue-zero-copy enables dequeue zero copy\n",
458                prgname);
459 }
460
461 /*
462  * Parse the arguments given in the command line of the application.
463  */
464 static int
465 us_vhost_parse_args(int argc, char **argv)
466 {
467         int opt, ret;
468         int option_index;
469         unsigned i;
470         const char *prgname = argv[0];
471         static struct option long_option[] = {
472                 {"vm2vm", required_argument, NULL, 0},
473                 {"rx-retry", required_argument, NULL, 0},
474                 {"rx-retry-delay", required_argument, NULL, 0},
475                 {"rx-retry-num", required_argument, NULL, 0},
476                 {"mergeable", required_argument, NULL, 0},
477                 {"stats", required_argument, NULL, 0},
478                 {"socket-file", required_argument, NULL, 0},
479                 {"tx-csum", required_argument, NULL, 0},
480                 {"tso", required_argument, NULL, 0},
481                 {"client", no_argument, &client_mode, 1},
482                 {"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
483                 {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
484                 {NULL, 0, 0, 0},
485         };
486
487         /* Parse command line */
488         while ((opt = getopt_long(argc, argv, "p:P",
489                         long_option, &option_index)) != EOF) {
490                 switch (opt) {
491                 /* Portmask */
492                 case 'p':
493                         enabled_port_mask = parse_portmask(optarg);
494                         if (enabled_port_mask == 0) {
495                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
496                                 us_vhost_usage(prgname);
497                                 return -1;
498                         }
499                         break;
500
501                 case 'P':
502                         promiscuous = 1;
503                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
504                                 ETH_VMDQ_ACCEPT_BROADCAST |
505                                 ETH_VMDQ_ACCEPT_MULTICAST;
506
507                         break;
508
509                 case 0:
510                         /* Enable/disable vm2vm comms. */
511                         if (!strncmp(long_option[option_index].name, "vm2vm",
512                                 MAX_LONG_OPT_SZ)) {
513                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
514                                 if (ret == -1) {
515                                         RTE_LOG(INFO, VHOST_CONFIG,
516                                                 "Invalid argument for "
517                                                 "vm2vm [0|1|2]\n");
518                                         us_vhost_usage(prgname);
519                                         return -1;
520                                 } else {
521                                         vm2vm_mode = (vm2vm_type)ret;
522                                 }
523                         }
524
525                         /* Enable/disable retries on RX. */
526                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
527                                 ret = parse_num_opt(optarg, 1);
528                                 if (ret == -1) {
529                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
530                                         us_vhost_usage(prgname);
531                                         return -1;
532                                 } else {
533                                         enable_retry = ret;
534                                 }
535                         }
536
537                         /* Enable/disable TX checksum offload. */
538                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
539                                 ret = parse_num_opt(optarg, 1);
540                                 if (ret == -1) {
541                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
542                                         us_vhost_usage(prgname);
543                                         return -1;
544                                 } else
545                                         enable_tx_csum = ret;
546                         }
547
548                         /* Enable/disable TSO offload. */
549                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
550                                 ret = parse_num_opt(optarg, 1);
551                                 if (ret == -1) {
552                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
553                                         us_vhost_usage(prgname);
554                                         return -1;
555                                 } else
556                                         enable_tso = ret;
557                         }
558
559                         /* Specify the retries delay time (in useconds) on RX. */
560                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
561                                 ret = parse_num_opt(optarg, INT32_MAX);
562                                 if (ret == -1) {
563                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
564                                         us_vhost_usage(prgname);
565                                         return -1;
566                                 } else {
567                                         burst_rx_delay_time = ret;
568                                 }
569                         }
570
571                         /* Specify the retries number on RX. */
572                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
573                                 ret = parse_num_opt(optarg, INT32_MAX);
574                                 if (ret == -1) {
575                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
576                                         us_vhost_usage(prgname);
577                                         return -1;
578                                 } else {
579                                         burst_rx_retry_num = ret;
580                                 }
581                         }
582
583                         /* Enable/disable RX mergeable buffers. */
584                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
585                                 ret = parse_num_opt(optarg, 1);
586                                 if (ret == -1) {
587                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
588                                         us_vhost_usage(prgname);
589                                         return -1;
590                                 } else {
591                                         mergeable = !!ret;
592                                         if (ret) {
593                                                 vmdq_conf_default.rxmode.offloads |=
594                                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
595                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
596                                                         = JUMBO_FRAME_MAX_SIZE;
597                                         }
598                                 }
599                         }
600
601                         /* Enable/disable stats. */
602                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
603                                 ret = parse_num_opt(optarg, INT32_MAX);
604                                 if (ret == -1) {
605                                         RTE_LOG(INFO, VHOST_CONFIG,
606                                                 "Invalid argument for stats [0..N]\n");
607                                         us_vhost_usage(prgname);
608                                         return -1;
609                                 } else {
610                                         enable_stats = ret;
611                                 }
612                         }
613
614                         /* Set socket file path. */
615                         if (!strncmp(long_option[option_index].name,
616                                                 "socket-file", MAX_LONG_OPT_SZ)) {
617                                 if (us_vhost_parse_socket_path(optarg) == -1) {
618                                         RTE_LOG(INFO, VHOST_CONFIG,
619                                         "Invalid argument for socket name (Max %d characters)\n",
620                                         PATH_MAX);
621                                         us_vhost_usage(prgname);
622                                         return -1;
623                                 }
624                         }
625
626                         break;
627
628                         /* Invalid option - print options. */
629                 default:
630                         us_vhost_usage(prgname);
631                         return -1;
632                 }
633         }
634
635         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
636                 if (enabled_port_mask & (1 << i))
637                         ports[num_ports++] = i;
638         }
639
640         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
641                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
642                         "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
643                 return -1;
644         }
645
646         return 0;
647 }
648
649 /*
650  * Update the global var NUM_PORTS and array PORTS according to system ports number
651  * and return valid ports number
652  */
653 static unsigned check_ports_num(unsigned nb_ports)
654 {
655         unsigned valid_num_ports = num_ports;
656         unsigned portid;
657
658         if (num_ports > nb_ports) {
659                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
660                         num_ports, nb_ports);
661                 num_ports = nb_ports;
662         }
663
664         for (portid = 0; portid < num_ports; portid ++) {
665                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
666                         RTE_LOG(INFO, VHOST_PORT,
667                                 "\nSpecified port ID(%u) is not valid\n",
668                                 ports[portid]);
669                         ports[portid] = INVALID_PORT_ID;
670                         valid_num_ports--;
671                 }
672         }
673         return valid_num_ports;
674 }
675
676 static __rte_always_inline struct vhost_dev *
677 find_vhost_dev(struct ether_addr *mac)
678 {
679         struct vhost_dev *vdev;
680
681         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
682                 if (vdev->ready == DEVICE_RX &&
683                     is_same_ether_addr(mac, &vdev->mac_address))
684                         return vdev;
685         }
686
687         return NULL;
688 }
689
690 /*
691  * This function learns the MAC address of the device and registers this along with a
692  * vlan tag to a VMDQ.
693  */
694 static int
695 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
696 {
697         struct ether_hdr *pkt_hdr;
698         int i, ret;
699
700         /* Learn MAC address of guest device from packet */
701         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
702
703         if (find_vhost_dev(&pkt_hdr->s_addr)) {
704                 RTE_LOG(ERR, VHOST_DATA,
705                         "(%d) device is using a registered MAC!\n",
706                         vdev->vid);
707                 return -1;
708         }
709
710         for (i = 0; i < ETHER_ADDR_LEN; i++)
711                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
712
713         /* vlan_tag currently uses the device_id. */
714         vdev->vlan_tag = vlan_tags[vdev->vid];
715
716         /* Print out VMDQ registration info. */
717         RTE_LOG(INFO, VHOST_DATA,
718                 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
719                 vdev->vid,
720                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
721                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
722                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
723                 vdev->vlan_tag);
724
725         /* Register the MAC address. */
726         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
727                                 (uint32_t)vdev->vid + vmdq_pool_base);
728         if (ret)
729                 RTE_LOG(ERR, VHOST_DATA,
730                         "(%d) failed to add device MAC address to VMDQ\n",
731                         vdev->vid);
732
733         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
734
735         /* Set device as ready for RX. */
736         vdev->ready = DEVICE_RX;
737
738         return 0;
739 }
740
741 /*
742  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
743  * queue before disabling RX on the device.
744  */
745 static inline void
746 unlink_vmdq(struct vhost_dev *vdev)
747 {
748         unsigned i = 0;
749         unsigned rx_count;
750         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
751
752         if (vdev->ready == DEVICE_RX) {
753                 /*clear MAC and VLAN settings*/
754                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
755                 for (i = 0; i < 6; i++)
756                         vdev->mac_address.addr_bytes[i] = 0;
757
758                 vdev->vlan_tag = 0;
759
760                 /*Clear out the receive buffers*/
761                 rx_count = rte_eth_rx_burst(ports[0],
762                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
763
764                 while (rx_count) {
765                         for (i = 0; i < rx_count; i++)
766                                 rte_pktmbuf_free(pkts_burst[i]);
767
768                         rx_count = rte_eth_rx_burst(ports[0],
769                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
770                 }
771
772                 vdev->ready = DEVICE_MAC_LEARNING;
773         }
774 }
775
776 static __rte_always_inline void
777 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
778             struct rte_mbuf *m)
779 {
780         uint16_t ret;
781
782         if (builtin_net_driver) {
783                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
784         } else {
785                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
786         }
787
788         if (enable_stats) {
789                 rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
790                 rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
791                 src_vdev->stats.tx_total++;
792                 src_vdev->stats.tx += ret;
793         }
794 }
795
796 /*
797  * Check if the packet destination MAC address is for a local device. If so then put
798  * the packet on that devices RX queue. If not then return.
799  */
800 static __rte_always_inline int
801 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
802 {
803         struct ether_hdr *pkt_hdr;
804         struct vhost_dev *dst_vdev;
805
806         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
807
808         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
809         if (!dst_vdev)
810                 return -1;
811
812         if (vdev->vid == dst_vdev->vid) {
813                 RTE_LOG_DP(DEBUG, VHOST_DATA,
814                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
815                         vdev->vid);
816                 return 0;
817         }
818
819         RTE_LOG_DP(DEBUG, VHOST_DATA,
820                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
821
822         if (unlikely(dst_vdev->remove)) {
823                 RTE_LOG_DP(DEBUG, VHOST_DATA,
824                         "(%d) device is marked for removal\n", dst_vdev->vid);
825                 return 0;
826         }
827
828         virtio_xmit(dst_vdev, vdev, m);
829         return 0;
830 }
831
832 /*
833  * Check if the destination MAC of a packet is one local VM,
834  * and get its vlan tag, and offset if it is.
835  */
836 static __rte_always_inline int
837 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
838         uint32_t *offset, uint16_t *vlan_tag)
839 {
840         struct vhost_dev *dst_vdev;
841         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
842
843         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
844         if (!dst_vdev)
845                 return 0;
846
847         if (vdev->vid == dst_vdev->vid) {
848                 RTE_LOG_DP(DEBUG, VHOST_DATA,
849                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
850                         vdev->vid);
851                 return -1;
852         }
853
854         /*
855          * HW vlan strip will reduce the packet length
856          * by minus length of vlan tag, so need restore
857          * the packet length by plus it.
858          */
859         *offset  = VLAN_HLEN;
860         *vlan_tag = vlan_tags[vdev->vid];
861
862         RTE_LOG_DP(DEBUG, VHOST_DATA,
863                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
864                 vdev->vid, dst_vdev->vid, *vlan_tag);
865
866         return 0;
867 }
868
869 static uint16_t
870 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
871 {
872         if (ol_flags & PKT_TX_IPV4)
873                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
874         else /* assume ethertype == ETHER_TYPE_IPv6 */
875                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
876 }
877
878 static void virtio_tx_offload(struct rte_mbuf *m)
879 {
880         void *l3_hdr;
881         struct ipv4_hdr *ipv4_hdr = NULL;
882         struct tcp_hdr *tcp_hdr = NULL;
883         struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
884
885         l3_hdr = (char *)eth_hdr + m->l2_len;
886
887         if (m->ol_flags & PKT_TX_IPV4) {
888                 ipv4_hdr = l3_hdr;
889                 ipv4_hdr->hdr_checksum = 0;
890                 m->ol_flags |= PKT_TX_IP_CKSUM;
891         }
892
893         tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
894         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
895 }
896
897 static inline void
898 free_pkts(struct rte_mbuf **pkts, uint16_t n)
899 {
900         while (n--)
901                 rte_pktmbuf_free(pkts[n]);
902 }
903
904 static __rte_always_inline void
905 do_drain_mbuf_table(struct mbuf_table *tx_q)
906 {
907         uint16_t count;
908
909         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
910                                  tx_q->m_table, tx_q->len);
911         if (unlikely(count < tx_q->len))
912                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
913
914         tx_q->len = 0;
915 }
916
917 /*
918  * This function routes the TX packet to the correct interface. This
919  * may be a local device or the physical port.
920  */
921 static __rte_always_inline void
922 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
923 {
924         struct mbuf_table *tx_q;
925         unsigned offset = 0;
926         const uint16_t lcore_id = rte_lcore_id();
927         struct ether_hdr *nh;
928
929
930         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
931         if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
932                 struct vhost_dev *vdev2;
933
934                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
935                         if (vdev2 != vdev)
936                                 virtio_xmit(vdev2, vdev, m);
937                 }
938                 goto queue2nic;
939         }
940
941         /*check if destination is local VM*/
942         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
943                 rte_pktmbuf_free(m);
944                 return;
945         }
946
947         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
948                 if (unlikely(find_local_dest(vdev, m, &offset,
949                                              &vlan_tag) != 0)) {
950                         rte_pktmbuf_free(m);
951                         return;
952                 }
953         }
954
955         RTE_LOG_DP(DEBUG, VHOST_DATA,
956                 "(%d) TX: MAC address is external\n", vdev->vid);
957
958 queue2nic:
959
960         /*Add packet to the port tx queue*/
961         tx_q = &lcore_tx_queue[lcore_id];
962
963         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
964         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
965                 /* Guest has inserted the vlan tag. */
966                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
967                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
968                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
969                         (vh->vlan_tci != vlan_tag_be))
970                         vh->vlan_tci = vlan_tag_be;
971         } else {
972                 m->ol_flags |= PKT_TX_VLAN_PKT;
973
974                 /*
975                  * Find the right seg to adjust the data len when offset is
976                  * bigger than tail room size.
977                  */
978                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
979                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
980                                 m->data_len += offset;
981                         else {
982                                 struct rte_mbuf *seg = m;
983
984                                 while ((seg->next != NULL) &&
985                                         (offset > rte_pktmbuf_tailroom(seg)))
986                                         seg = seg->next;
987
988                                 seg->data_len += offset;
989                         }
990                         m->pkt_len += offset;
991                 }
992
993                 m->vlan_tci = vlan_tag;
994         }
995
996         if (m->ol_flags & PKT_TX_TCP_SEG)
997                 virtio_tx_offload(m);
998
999         tx_q->m_table[tx_q->len++] = m;
1000         if (enable_stats) {
1001                 vdev->stats.tx_total++;
1002                 vdev->stats.tx++;
1003         }
1004
1005         if (unlikely(tx_q->len == MAX_PKT_BURST))
1006                 do_drain_mbuf_table(tx_q);
1007 }
1008
1009
1010 static __rte_always_inline void
1011 drain_mbuf_table(struct mbuf_table *tx_q)
1012 {
1013         static uint64_t prev_tsc;
1014         uint64_t cur_tsc;
1015
1016         if (tx_q->len == 0)
1017                 return;
1018
1019         cur_tsc = rte_rdtsc();
1020         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1021                 prev_tsc = cur_tsc;
1022
1023                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1024                         "TX queue drained after timeout with burst size %u\n",
1025                         tx_q->len);
1026                 do_drain_mbuf_table(tx_q);
1027         }
1028 }
1029
1030 static __rte_always_inline void
1031 drain_eth_rx(struct vhost_dev *vdev)
1032 {
1033         uint16_t rx_count, enqueue_count;
1034         struct rte_mbuf *pkts[MAX_PKT_BURST];
1035
1036         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1037                                     pkts, MAX_PKT_BURST);
1038         if (!rx_count)
1039                 return;
1040
1041         /*
1042          * When "enable_retry" is set, here we wait and retry when there
1043          * is no enough free slots in the queue to hold @rx_count packets,
1044          * to diminish packet loss.
1045          */
1046         if (enable_retry &&
1047             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1048                         VIRTIO_RXQ))) {
1049                 uint32_t retry;
1050
1051                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1052                         rte_delay_us(burst_rx_delay_time);
1053                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1054                                         VIRTIO_RXQ))
1055                                 break;
1056                 }
1057         }
1058
1059         if (builtin_net_driver) {
1060                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1061                                                 pkts, rx_count);
1062         } else {
1063                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1064                                                 pkts, rx_count);
1065         }
1066         if (enable_stats) {
1067                 rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1068                 rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1069         }
1070
1071         free_pkts(pkts, rx_count);
1072 }
1073
1074 static __rte_always_inline void
1075 drain_virtio_tx(struct vhost_dev *vdev)
1076 {
1077         struct rte_mbuf *pkts[MAX_PKT_BURST];
1078         uint16_t count;
1079         uint16_t i;
1080
1081         if (builtin_net_driver) {
1082                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1083                                         pkts, MAX_PKT_BURST);
1084         } else {
1085                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1086                                         mbuf_pool, pkts, MAX_PKT_BURST);
1087         }
1088
1089         /* setup VMDq for the first packet */
1090         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1091                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1092                         free_pkts(pkts, count);
1093         }
1094
1095         for (i = 0; i < count; ++i)
1096                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1097 }
1098
1099 /*
1100  * Main function of vhost-switch. It basically does:
1101  *
1102  * for each vhost device {
1103  *    - drain_eth_rx()
1104  *
1105  *      Which drains the host eth Rx queue linked to the vhost device,
1106  *      and deliver all of them to guest virito Rx ring associated with
1107  *      this vhost device.
1108  *
1109  *    - drain_virtio_tx()
1110  *
1111  *      Which drains the guest virtio Tx queue and deliver all of them
1112  *      to the target, which could be another vhost device, or the
1113  *      physical eth dev. The route is done in function "virtio_tx_route".
1114  * }
1115  */
1116 static int
1117 switch_worker(void *arg __rte_unused)
1118 {
1119         unsigned i;
1120         unsigned lcore_id = rte_lcore_id();
1121         struct vhost_dev *vdev;
1122         struct mbuf_table *tx_q;
1123
1124         RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1125
1126         tx_q = &lcore_tx_queue[lcore_id];
1127         for (i = 0; i < rte_lcore_count(); i++) {
1128                 if (lcore_ids[i] == lcore_id) {
1129                         tx_q->txq_id = i;
1130                         break;
1131                 }
1132         }
1133
1134         while(1) {
1135                 drain_mbuf_table(tx_q);
1136
1137                 /*
1138                  * Inform the configuration core that we have exited the
1139                  * linked list and that no devices are in use if requested.
1140                  */
1141                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1142                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1143
1144                 /*
1145                  * Process vhost devices
1146                  */
1147                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1148                               lcore_vdev_entry) {
1149                         if (unlikely(vdev->remove)) {
1150                                 unlink_vmdq(vdev);
1151                                 vdev->ready = DEVICE_SAFE_REMOVE;
1152                                 continue;
1153                         }
1154
1155                         if (likely(vdev->ready == DEVICE_RX))
1156                                 drain_eth_rx(vdev);
1157
1158                         if (likely(!vdev->remove))
1159                                 drain_virtio_tx(vdev);
1160                 }
1161         }
1162
1163         return 0;
1164 }
1165
1166 /*
1167  * Remove a device from the specific data core linked list and from the
1168  * main linked list. Synchonization  occurs through the use of the
1169  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1170  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1171  */
1172 static void
1173 destroy_device(int vid)
1174 {
1175         struct vhost_dev *vdev = NULL;
1176         int lcore;
1177
1178         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1179                 if (vdev->vid == vid)
1180                         break;
1181         }
1182         if (!vdev)
1183                 return;
1184         /*set the remove flag. */
1185         vdev->remove = 1;
1186         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1187                 rte_pause();
1188         }
1189
1190         if (builtin_net_driver)
1191                 vs_vhost_net_remove(vdev);
1192
1193         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1194                      lcore_vdev_entry);
1195         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1196
1197
1198         /* Set the dev_removal_flag on each lcore. */
1199         RTE_LCORE_FOREACH_SLAVE(lcore)
1200                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1201
1202         /*
1203          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1204          * we can be sure that they can no longer access the device removed
1205          * from the linked lists and that the devices are no longer in use.
1206          */
1207         RTE_LCORE_FOREACH_SLAVE(lcore) {
1208                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1209                         rte_pause();
1210         }
1211
1212         lcore_info[vdev->coreid].device_num--;
1213
1214         RTE_LOG(INFO, VHOST_DATA,
1215                 "(%d) device has been removed from data core\n",
1216                 vdev->vid);
1217
1218         rte_free(vdev);
1219 }
1220
1221 /*
1222  * A new device is added to a data core. First the device is added to the main linked list
1223  * and the allocated to a specific data core.
1224  */
1225 static int
1226 new_device(int vid)
1227 {
1228         int lcore, core_add = 0;
1229         uint32_t device_num_min = num_devices;
1230         struct vhost_dev *vdev;
1231
1232         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1233         if (vdev == NULL) {
1234                 RTE_LOG(INFO, VHOST_DATA,
1235                         "(%d) couldn't allocate memory for vhost dev\n",
1236                         vid);
1237                 return -1;
1238         }
1239         vdev->vid = vid;
1240
1241         if (builtin_net_driver)
1242                 vs_vhost_net_setup(vdev);
1243
1244         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1245         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1246
1247         /*reset ready flag*/
1248         vdev->ready = DEVICE_MAC_LEARNING;
1249         vdev->remove = 0;
1250
1251         /* Find a suitable lcore to add the device. */
1252         RTE_LCORE_FOREACH_SLAVE(lcore) {
1253                 if (lcore_info[lcore].device_num < device_num_min) {
1254                         device_num_min = lcore_info[lcore].device_num;
1255                         core_add = lcore;
1256                 }
1257         }
1258         vdev->coreid = core_add;
1259
1260         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1261                           lcore_vdev_entry);
1262         lcore_info[vdev->coreid].device_num++;
1263
1264         /* Disable notifications. */
1265         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1266         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1267
1268         RTE_LOG(INFO, VHOST_DATA,
1269                 "(%d) device has been added to data core %d\n",
1270                 vid, vdev->coreid);
1271
1272         return 0;
1273 }
1274
1275 /*
1276  * These callback allow devices to be added to the data core when configuration
1277  * has been fully complete.
1278  */
1279 static const struct vhost_device_ops virtio_net_device_ops =
1280 {
1281         .new_device =  new_device,
1282         .destroy_device = destroy_device,
1283 };
1284
1285 /*
1286  * This is a thread will wake up after a period to print stats if the user has
1287  * enabled them.
1288  */
1289 static void *
1290 print_stats(__rte_unused void *arg)
1291 {
1292         struct vhost_dev *vdev;
1293         uint64_t tx_dropped, rx_dropped;
1294         uint64_t tx, tx_total, rx, rx_total;
1295         const char clr[] = { 27, '[', '2', 'J', '\0' };
1296         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1297
1298         while(1) {
1299                 sleep(enable_stats);
1300
1301                 /* Clear screen and move to top left */
1302                 printf("%s%s\n", clr, top_left);
1303                 printf("Device statistics =================================\n");
1304
1305                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1306                         tx_total   = vdev->stats.tx_total;
1307                         tx         = vdev->stats.tx;
1308                         tx_dropped = tx_total - tx;
1309
1310                         rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1311                         rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1312                         rx_dropped = rx_total - rx;
1313
1314                         printf("Statistics for device %d\n"
1315                                 "-----------------------\n"
1316                                 "TX total:              %" PRIu64 "\n"
1317                                 "TX dropped:            %" PRIu64 "\n"
1318                                 "TX successful:         %" PRIu64 "\n"
1319                                 "RX total:              %" PRIu64 "\n"
1320                                 "RX dropped:            %" PRIu64 "\n"
1321                                 "RX successful:         %" PRIu64 "\n",
1322                                 vdev->vid,
1323                                 tx_total, tx_dropped, tx,
1324                                 rx_total, rx_dropped, rx);
1325                 }
1326
1327                 printf("===================================================\n");
1328         }
1329
1330         return NULL;
1331 }
1332
1333 static void
1334 unregister_drivers(int socket_num)
1335 {
1336         int i, ret;
1337
1338         for (i = 0; i < socket_num; i++) {
1339                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1340                 if (ret != 0)
1341                         RTE_LOG(ERR, VHOST_CONFIG,
1342                                 "Fail to unregister vhost driver for %s.\n",
1343                                 socket_files + i * PATH_MAX);
1344         }
1345 }
1346
1347 /* When we receive a INT signal, unregister vhost driver */
1348 static void
1349 sigint_handler(__rte_unused int signum)
1350 {
1351         /* Unregister vhost driver. */
1352         unregister_drivers(nb_sockets);
1353
1354         exit(0);
1355 }
1356
1357 /*
1358  * While creating an mbuf pool, one key thing is to figure out how
1359  * many mbuf entries is enough for our use. FYI, here are some
1360  * guidelines:
1361  *
1362  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1363  *
1364  * - For each switch core (A CPU core does the packet switch), we need
1365  *   also make some reservation for receiving the packets from virtio
1366  *   Tx queue. How many is enough depends on the usage. It's normally
1367  *   a simple calculation like following:
1368  *
1369  *       MAX_PKT_BURST * max packet size / mbuf size
1370  *
1371  *   So, we definitely need allocate more mbufs when TSO is enabled.
1372  *
1373  * - Similarly, for each switching core, we should serve @nr_rx_desc
1374  *   mbufs for receiving the packets from physical NIC device.
1375  *
1376  * - We also need make sure, for each switch core, we have allocated
1377  *   enough mbufs to fill up the mbuf cache.
1378  */
1379 static void
1380 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1381         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1382 {
1383         uint32_t nr_mbufs;
1384         uint32_t nr_mbufs_per_core;
1385         uint32_t mtu = 1500;
1386
1387         if (mergeable)
1388                 mtu = 9000;
1389         if (enable_tso)
1390                 mtu = 64 * 1024;
1391
1392         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1393                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1394         nr_mbufs_per_core += nr_rx_desc;
1395         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1396
1397         nr_mbufs  = nr_queues * nr_rx_desc;
1398         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1399         nr_mbufs *= nr_port;
1400
1401         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1402                                             nr_mbuf_cache, 0, mbuf_size,
1403                                             rte_socket_id());
1404         if (mbuf_pool == NULL)
1405                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1406 }
1407
1408 /*
1409  * Main function, does initialisation and calls the per-lcore functions.
1410  */
1411 int
1412 main(int argc, char *argv[])
1413 {
1414         unsigned lcore_id, core_id = 0;
1415         unsigned nb_ports, valid_num_ports;
1416         int ret, i;
1417         uint16_t portid;
1418         static pthread_t tid;
1419         uint64_t flags = 0;
1420
1421         signal(SIGINT, sigint_handler);
1422
1423         /* init EAL */
1424         ret = rte_eal_init(argc, argv);
1425         if (ret < 0)
1426                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1427         argc -= ret;
1428         argv += ret;
1429
1430         /* parse app arguments */
1431         ret = us_vhost_parse_args(argc, argv);
1432         if (ret < 0)
1433                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1434
1435         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1436                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1437
1438                 if (rte_lcore_is_enabled(lcore_id))
1439                         lcore_ids[core_id++] = lcore_id;
1440         }
1441
1442         if (rte_lcore_count() > RTE_MAX_LCORE)
1443                 rte_exit(EXIT_FAILURE,"Not enough cores\n");
1444
1445         /* Get the number of physical ports. */
1446         nb_ports = rte_eth_dev_count_avail();
1447
1448         /*
1449          * Update the global var NUM_PORTS and global array PORTS
1450          * and get value of var VALID_NUM_PORTS according to system ports number
1451          */
1452         valid_num_ports = check_ports_num(nb_ports);
1453
1454         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1455                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
1456                         "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
1457                 return -1;
1458         }
1459
1460         /*
1461          * FIXME: here we are trying to allocate mbufs big enough for
1462          * @MAX_QUEUES, but the truth is we're never going to use that
1463          * many queues here. We probably should only do allocation for
1464          * those queues we are going to use.
1465          */
1466         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1467                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1468
1469         if (vm2vm_mode == VM2VM_HARDWARE) {
1470                 /* Enable VT loop back to let L2 switch to do it. */
1471                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1472                 RTE_LOG(DEBUG, VHOST_CONFIG,
1473                         "Enable loop back for L2 switch in vmdq.\n");
1474         }
1475
1476         /* initialize all ports */
1477         RTE_ETH_FOREACH_DEV(portid) {
1478                 /* skip ports that are not enabled */
1479                 if ((enabled_port_mask & (1 << portid)) == 0) {
1480                         RTE_LOG(INFO, VHOST_PORT,
1481                                 "Skipping disabled port %d\n", portid);
1482                         continue;
1483                 }
1484                 if (port_init(portid) != 0)
1485                         rte_exit(EXIT_FAILURE,
1486                                 "Cannot initialize network ports\n");
1487         }
1488
1489         /* Enable stats if the user option is set. */
1490         if (enable_stats) {
1491                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1492                                         print_stats, NULL);
1493                 if (ret < 0)
1494                         rte_exit(EXIT_FAILURE,
1495                                 "Cannot create print-stats thread\n");
1496         }
1497
1498         /* Launch all data cores. */
1499         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1500                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1501
1502         if (client_mode)
1503                 flags |= RTE_VHOST_USER_CLIENT;
1504
1505         if (dequeue_zero_copy)
1506                 flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1507
1508         /* Register vhost user driver to handle vhost messages. */
1509         for (i = 0; i < nb_sockets; i++) {
1510                 char *file = socket_files + i * PATH_MAX;
1511                 ret = rte_vhost_driver_register(file, flags);
1512                 if (ret != 0) {
1513                         unregister_drivers(i);
1514                         rte_exit(EXIT_FAILURE,
1515                                 "vhost driver register failure.\n");
1516                 }
1517
1518                 if (builtin_net_driver)
1519                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1520
1521                 if (mergeable == 0) {
1522                         rte_vhost_driver_disable_features(file,
1523                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1524                 }
1525
1526                 if (enable_tx_csum == 0) {
1527                         rte_vhost_driver_disable_features(file,
1528                                 1ULL << VIRTIO_NET_F_CSUM);
1529                 }
1530
1531                 if (enable_tso == 0) {
1532                         rte_vhost_driver_disable_features(file,
1533                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1534                         rte_vhost_driver_disable_features(file,
1535                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1536                         rte_vhost_driver_disable_features(file,
1537                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1538                         rte_vhost_driver_disable_features(file,
1539                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1540                 }
1541
1542                 if (promiscuous) {
1543                         rte_vhost_driver_enable_features(file,
1544                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1545                 }
1546
1547                 ret = rte_vhost_driver_callback_register(file,
1548                         &virtio_net_device_ops);
1549                 if (ret != 0) {
1550                         rte_exit(EXIT_FAILURE,
1551                                 "failed to register vhost driver callbacks.\n");
1552                 }
1553
1554                 if (rte_vhost_driver_start(file) < 0) {
1555                         rte_exit(EXIT_FAILURE,
1556                                 "failed to start vhost driver.\n");
1557                 }
1558         }
1559
1560         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1561                 rte_eal_wait_lcore(lcore_id);
1562
1563         return 0;
1564
1565 }