New upstream version 18.11.1
[deb_dpdk.git] / examples / vhost / main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "main.h"
29
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36
37 #define MBUF_CACHE_SIZE 128
38 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
39
40 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
41
42 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
44
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX                       1
50 #define DEVICE_SAFE_REMOVE      2
51
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55
56 #define INVALID_PORT_ID 0xFF
57
58 /* Max number of devices. Limited by vmdq. */
59 #define MAX_DEVICES 64
60
61 /* Maximum long option length for option parsing. */
62 #define MAX_LONG_OPT_SZ 64
63
64 /* mask of enabled ports */
65 static uint32_t enabled_port_mask = 0;
66
67 /* Promiscuous mode */
68 static uint32_t promiscuous;
69
70 /* number of devices/queues to support */
71 static uint32_t num_queues = 0;
72 static uint32_t num_devices;
73
74 static struct rte_mempool *mbuf_pool;
75 static int mergeable;
76
77 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
78 typedef enum {
79         VM2VM_DISABLED = 0,
80         VM2VM_SOFTWARE = 1,
81         VM2VM_HARDWARE = 2,
82         VM2VM_LAST
83 } vm2vm_type;
84 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
85
86 /* Enable stats. */
87 static uint32_t enable_stats = 0;
88 /* Enable retries on RX. */
89 static uint32_t enable_retry = 1;
90
91 /* Disable TX checksum offload */
92 static uint32_t enable_tx_csum;
93
94 /* Disable TSO offload */
95 static uint32_t enable_tso;
96
97 static int client_mode;
98 static int dequeue_zero_copy;
99
100 static int builtin_net_driver;
101
102 /* Specify timeout (in microseconds) between retries on RX. */
103 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
104 /* Specify the number of retries on RX. */
105 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
106
107 /* Socket file paths. Can be set by user */
108 static char *socket_files;
109 static int nb_sockets;
110
111 /* empty vmdq configuration structure. Filled in programmatically */
112 static struct rte_eth_conf vmdq_conf_default = {
113         .rxmode = {
114                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
115                 .split_hdr_size = 0,
116                 /*
117                  * VLAN strip is necessary for 1G NICs such as I350;
118                  * it fixes a bug where ipv4 forwarding in the guest can't
119                  * forward packets from one virtio dev to another virtio dev.
120                  */
121                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
122         },
123
124         .txmode = {
125                 .mq_mode = ETH_MQ_TX_NONE,
126                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
127                              DEV_TX_OFFLOAD_TCP_CKSUM |
128                              DEV_TX_OFFLOAD_VLAN_INSERT |
129                              DEV_TX_OFFLOAD_MULTI_SEGS |
130                              DEV_TX_OFFLOAD_TCP_TSO),
131         },
132         .rx_adv_conf = {
133                 /*
134                  * should be overridden separately in code with
135                  * appropriate values
136                  */
137                 .vmdq_rx_conf = {
138                         .nb_queue_pools = ETH_8_POOLS,
139                         .enable_default_pool = 0,
140                         .default_pool = 0,
141                         .nb_pool_maps = 0,
142                         .pool_map = {{0, 0},},
143                 },
144         },
145 };
146
147
148 static unsigned lcore_ids[RTE_MAX_LCORE];
149 static uint16_t ports[RTE_MAX_ETHPORTS];
150 static unsigned num_ports = 0; /**< The number of ports specified in command line */
151 static uint16_t num_pf_queues, num_vmdq_queues;
152 static uint16_t vmdq_pool_base, vmdq_queue_base;
153 static uint16_t queues_per_pool;
154
155 const uint16_t vlan_tags[] = {
156         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
157         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
158         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
159         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
160         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
161         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
162         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
163         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
164 };
165
166 /* ethernet addresses of ports */
167 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
168
169 static struct vhost_dev_tailq_list vhost_dev_list =
170         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
171
172 static struct lcore_info lcore_info[RTE_MAX_LCORE];
173
174 /* Used for queueing bursts of TX packets. */
175 struct mbuf_table {
176         unsigned len;
177         unsigned txq_id;
178         struct rte_mbuf *m_table[MAX_PKT_BURST];
179 };
180
181 /* TX queue for each data core. */
182 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
183
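/* TX drain period (BURST_TX_DRAIN_US) expressed in TSC cycles. */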
184 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
185                                  / US_PER_S * BURST_TX_DRAIN_US)
186 #define VLAN_HLEN       4
187
188 /*
189  * Builds up the correct configuration for VMDQ VLAN pool map
190  * according to the pool & queue limits.
191  */
192 static inline int
193 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
194 {
195         struct rte_eth_vmdq_rx_conf conf;
196         struct rte_eth_vmdq_rx_conf *def_conf =
197                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
198         unsigned i;
199
200         memset(&conf, 0, sizeof(conf));
201         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
202         conf.nb_pool_maps = num_devices;
203         conf.enable_loop_back = def_conf->enable_loop_back;
204         conf.rx_mode = def_conf->rx_mode;
205
206         for (i = 0; i < conf.nb_pool_maps; i++) {
207                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
208                 conf.pool_map[i].pools = (1UL << i);
209         }
210
211         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
212         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
213                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
214         return 0;
215 }
216
217 /*
218  * Validate the device number against the max pool number obtained from
219  * dev_info. If the device number is invalid, log an error and
220  * return -1. Each device must have its own pool.
221  */
222 static inline int
223 validate_num_devices(uint32_t max_nb_devices)
224 {
225         if (num_devices > max_nb_devices) {
226                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
227                 return -1;
228         }
229         return 0;
230 }
231
232 /*
233  * Initialises a given port using global settings and with the rx buffers
234  * coming from the mbuf_pool passed as a parameter
235  */
236 static inline int
237 port_init(uint16_t port)
238 {
239         struct rte_eth_dev_info dev_info;
240         struct rte_eth_conf port_conf;
241         struct rte_eth_rxconf *rxconf;
242         struct rte_eth_txconf *txconf;
243         int16_t rx_rings, tx_rings;
244         uint16_t rx_ring_size, tx_ring_size;
245         int retval;
246         uint16_t q;
247
248         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
249         rte_eth_dev_info_get (port, &dev_info);
250
251         rxconf = &dev_info.default_rxconf;
252         txconf = &dev_info.default_txconf;
253         rxconf->rx_drop_en = 1;
254
255         /* Configure the number of supported virtio devices based on VMDQ limits */
256         num_devices = dev_info.max_vmdq_pools;
257
258         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
259         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
260
261         /*
262          * When dequeue zero copy is enabled, guest Tx used vring will be
263          * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
264          * (tx_ring_size here) must be small enough so that the driver will
265          * hit the free threshold easily and free mbufs timely. Otherwise,
266          * guest Tx vring would be starved.
267          */
268         if (dequeue_zero_copy)
269                 tx_ring_size = 64;
270
271         tx_rings = (uint16_t)rte_lcore_count();
272
273         retval = validate_num_devices(MAX_DEVICES);
274         if (retval < 0)
275                 return retval;
276
277         /* Get port configuration. */
278         retval = get_eth_conf(&port_conf, num_devices);
279         if (retval < 0)
280                 return retval;
281         /* NIC queues are divided into pf queues and vmdq queues.  */
282         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
283         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
284         num_vmdq_queues = num_devices * queues_per_pool;
285         num_queues = num_pf_queues + num_vmdq_queues;
286         vmdq_queue_base = dev_info.vmdq_queue_base;
287         vmdq_pool_base  = dev_info.vmdq_pool_base;
288         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
289                 num_pf_queues, num_devices, queues_per_pool);
290
291         if (!rte_eth_dev_is_valid_port(port))
292                 return -1;
293
294         rx_rings = (uint16_t)dev_info.max_rx_queues;
295         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
296                 port_conf.txmode.offloads |=
297                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
298         /* Configure ethernet device. */
299         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
300         if (retval != 0) {
301                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
302                         port, strerror(-retval));
303                 return retval;
304         }
305
306         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
307                 &tx_ring_size);
308         if (retval != 0) {
309                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
310                         "for port %u: %s.\n", port, strerror(-retval));
311                 return retval;
312         }
313         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
314                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
315                         "for Rx queues on port %u.\n", port);
316                 return -1;
317         }
318
319         /* Setup the queues. */
320         rxconf->offloads = port_conf.rxmode.offloads;
321         for (q = 0; q < rx_rings; q ++) {
322                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
323                                                 rte_eth_dev_socket_id(port),
324                                                 rxconf,
325                                                 mbuf_pool);
326                 if (retval < 0) {
327                         RTE_LOG(ERR, VHOST_PORT,
328                                 "Failed to setup rx queue %u of port %u: %s.\n",
329                                 q, port, strerror(-retval));
330                         return retval;
331                 }
332         }
333         txconf->offloads = port_conf.txmode.offloads;
334         for (q = 0; q < tx_rings; q ++) {
335                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
336                                                 rte_eth_dev_socket_id(port),
337                                                 txconf);
338                 if (retval < 0) {
339                         RTE_LOG(ERR, VHOST_PORT,
340                                 "Failed to setup tx queue %u of port %u: %s.\n",
341                                 q, port, strerror(-retval));
342                         return retval;
343                 }
344         }
345
346         /* Start the device. */
347         retval  = rte_eth_dev_start(port);
348         if (retval < 0) {
349                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
350                         port, strerror(-retval));
351                 return retval;
352         }
353
354         if (promiscuous)
355                 rte_eth_promiscuous_enable(port);
356
357         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
358         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
359         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
360                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
361                         port,
362                         vmdq_ports_eth_addr[port].addr_bytes[0],
363                         vmdq_ports_eth_addr[port].addr_bytes[1],
364                         vmdq_ports_eth_addr[port].addr_bytes[2],
365                         vmdq_ports_eth_addr[port].addr_bytes[3],
366                         vmdq_ports_eth_addr[port].addr_bytes[4],
367                         vmdq_ports_eth_addr[port].addr_bytes[5]);
368
369         return 0;
370 }
371
372 /*
373  * Set socket file path.
374  */
375 static int
376 us_vhost_parse_socket_path(const char *q_arg)
377 {
378         char *old;
379
380         /* reject socket paths that are too long */
381         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
382                 return -1;
383
384         old = socket_files;
385         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
386         if (socket_files == NULL) {
387                 free(old);
388                 return -1;
389         }
390
391         snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
392         nb_sockets++;
393
394         return 0;
395 }
396
397 /*
398  * Parse the portmask provided at run time.
399  */
400 static int
401 parse_portmask(const char *portmask)
402 {
403         char *end = NULL;
404         unsigned long pm;
405
406         errno = 0;
407
408         /* parse hexadecimal string */
409         pm = strtoul(portmask, &end, 16);
410         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
411                 return -1;
412
413         if (pm == 0)
414                 return -1;
415
416         return pm;
417
418 }
419
420 /*
421  * Parse num options at run time.
422  */
423 static int
424 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
425 {
426         char *end = NULL;
427         unsigned long num;
428
429         errno = 0;
430
431         /* parse unsigned int string */
432         num = strtoul(q_arg, &end, 10);
433         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
434                 return -1;
435
436         if (num > max_valid_value)
437                 return -1;
438
439         return num;
440
441 }
442
443 /*
444  * Display usage
445  */
446 static void
447 us_vhost_usage(const char *prgname)
448 {
449         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
450         "               --vm2vm [0|1|2]\n"
451         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
452         "               --socket-file <path>\n"
453         "               --nb-devices ND\n"
454         "               -p PORTMASK: Set mask for ports to be used by application\n"
455         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
456         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
457         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This takes effect only if retries on rx are enabled\n"
458         "               --rx-retry-num [0-N]: the number of retries on rx. This takes effect only if retries on rx are enabled\n"
459         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
460         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
461         "               --socket-file: The path of the socket file.\n"
462         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
463         "               --tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
464         "               --client register a vhost-user socket as client mode.\n"
465         "               --dequeue-zero-copy enables dequeue zero copy\n",
466                prgname);
467 }
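/*
 * Illustrative invocation (the binary name, core list, portmask and socket
 * path below are examples only, not mandated by this file):
 *
 *   ./vhost-switch -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 --stats 2
 */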
468
469 /*
470  * Parse the arguments given in the command line of the application.
471  */
472 static int
473 us_vhost_parse_args(int argc, char **argv)
474 {
475         int opt, ret;
476         int option_index;
477         unsigned i;
478         const char *prgname = argv[0];
479         static struct option long_option[] = {
480                 {"vm2vm", required_argument, NULL, 0},
481                 {"rx-retry", required_argument, NULL, 0},
482                 {"rx-retry-delay", required_argument, NULL, 0},
483                 {"rx-retry-num", required_argument, NULL, 0},
484                 {"mergeable", required_argument, NULL, 0},
485                 {"stats", required_argument, NULL, 0},
486                 {"socket-file", required_argument, NULL, 0},
487                 {"tx-csum", required_argument, NULL, 0},
488                 {"tso", required_argument, NULL, 0},
489                 {"client", no_argument, &client_mode, 1},
490                 {"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
491                 {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
492                 {NULL, 0, 0, 0},
493         };
494
495         /* Parse command line */
496         while ((opt = getopt_long(argc, argv, "p:P",
497                         long_option, &option_index)) != EOF) {
498                 switch (opt) {
499                 /* Portmask */
500                 case 'p':
501                         enabled_port_mask = parse_portmask(optarg);
502                         if (enabled_port_mask == 0) {
503                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
504                                 us_vhost_usage(prgname);
505                                 return -1;
506                         }
507                         break;
508
509                 case 'P':
510                         promiscuous = 1;
511                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
512                                 ETH_VMDQ_ACCEPT_BROADCAST |
513                                 ETH_VMDQ_ACCEPT_MULTICAST;
514
515                         break;
516
517                 case 0:
518                         /* Enable/disable vm2vm comms. */
519                         if (!strncmp(long_option[option_index].name, "vm2vm",
520                                 MAX_LONG_OPT_SZ)) {
521                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
522                                 if (ret == -1) {
523                                         RTE_LOG(INFO, VHOST_CONFIG,
524                                                 "Invalid argument for "
525                                                 "vm2vm [0|1|2]\n");
526                                         us_vhost_usage(prgname);
527                                         return -1;
528                                 } else {
529                                         vm2vm_mode = (vm2vm_type)ret;
530                                 }
531                         }
532
533                         /* Enable/disable retries on RX. */
534                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
535                                 ret = parse_num_opt(optarg, 1);
536                                 if (ret == -1) {
537                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
538                                         us_vhost_usage(prgname);
539                                         return -1;
540                                 } else {
541                                         enable_retry = ret;
542                                 }
543                         }
544
545                         /* Enable/disable TX checksum offload. */
546                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
547                                 ret = parse_num_opt(optarg, 1);
548                                 if (ret == -1) {
549                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
550                                         us_vhost_usage(prgname);
551                                         return -1;
552                                 } else
553                                         enable_tx_csum = ret;
554                         }
555
556                         /* Enable/disable TSO offload. */
557                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
558                                 ret = parse_num_opt(optarg, 1);
559                                 if (ret == -1) {
560                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
561                                         us_vhost_usage(prgname);
562                                         return -1;
563                                 } else
564                                         enable_tso = ret;
565                         }
566
567                         /* Specify the retry delay time (in microseconds) on RX. */
568                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
569                                 ret = parse_num_opt(optarg, INT32_MAX);
570                                 if (ret == -1) {
571                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
572                                         us_vhost_usage(prgname);
573                                         return -1;
574                                 } else {
575                                         burst_rx_delay_time = ret;
576                                 }
577                         }
578
579                         /* Specify the number of retries on RX. */
580                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
581                                 ret = parse_num_opt(optarg, INT32_MAX);
582                                 if (ret == -1) {
583                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
584                                         us_vhost_usage(prgname);
585                                         return -1;
586                                 } else {
587                                         burst_rx_retry_num = ret;
588                                 }
589                         }
590
591                         /* Enable/disable RX mergeable buffers. */
592                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
593                                 ret = parse_num_opt(optarg, 1);
594                                 if (ret == -1) {
595                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
596                                         us_vhost_usage(prgname);
597                                         return -1;
598                                 } else {
599                                         mergeable = !!ret;
600                                         if (ret) {
601                                                 vmdq_conf_default.rxmode.offloads |=
602                                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
603                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
604                                                         = JUMBO_FRAME_MAX_SIZE;
605                                         }
606                                 }
607                         }
608
609                         /* Enable/disable stats. */
610                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
611                                 ret = parse_num_opt(optarg, INT32_MAX);
612                                 if (ret == -1) {
613                                         RTE_LOG(INFO, VHOST_CONFIG,
614                                                 "Invalid argument for stats [0..N]\n");
615                                         us_vhost_usage(prgname);
616                                         return -1;
617                                 } else {
618                                         enable_stats = ret;
619                                 }
620                         }
621
622                         /* Set socket file path. */
623                         if (!strncmp(long_option[option_index].name,
624                                                 "socket-file", MAX_LONG_OPT_SZ)) {
625                                 if (us_vhost_parse_socket_path(optarg) == -1) {
626                                         RTE_LOG(INFO, VHOST_CONFIG,
627                                         "Invalid argument for socket name (Max %d characters)\n",
628                                         PATH_MAX);
629                                         us_vhost_usage(prgname);
630                                         return -1;
631                                 }
632                         }
633
634                         break;
635
636                         /* Invalid option - print options. */
637                 default:
638                         us_vhost_usage(prgname);
639                         return -1;
640                 }
641         }
642
643         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
644                 if (enabled_port_mask & (1 << i))
645                         ports[num_ports++] = i;
646         }
647
648         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
649                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
650                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
651                 return -1;
652         }
653
654         return 0;
655 }
656
657 /*
658  * Update the global var NUM_PORTS and array PORTS according to the system port count
659  * and return the number of valid ports
660  */
661 static unsigned check_ports_num(unsigned nb_ports)
662 {
663         unsigned valid_num_ports = num_ports;
664         unsigned portid;
665
666         if (num_ports > nb_ports) {
667                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
668                         num_ports, nb_ports);
669                 num_ports = nb_ports;
670         }
671
672         for (portid = 0; portid < num_ports; portid ++) {
673                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
674                         RTE_LOG(INFO, VHOST_PORT,
675                                 "\nSpecified port ID(%u) is not valid\n",
676                                 ports[portid]);
677                         ports[portid] = INVALID_PORT_ID;
678                         valid_num_ports--;
679                 }
680         }
681         return valid_num_ports;
682 }
683
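/* Find the vhost device that has registered the given MAC address, if any. */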
684 static __rte_always_inline struct vhost_dev *
685 find_vhost_dev(struct ether_addr *mac)
686 {
687         struct vhost_dev *vdev;
688
689         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
690                 if (vdev->ready == DEVICE_RX &&
691                     is_same_ether_addr(mac, &vdev->mac_address))
692                         return vdev;
693         }
694
695         return NULL;
696 }
697
698 /*
699  * This function learns the MAC address of the device and registers it, along with a
700  * vlan tag, with the VMDQ.
701  */
702 static int
703 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
704 {
705         struct ether_hdr *pkt_hdr;
706         int i, ret;
707
708         /* Learn MAC address of guest device from packet */
709         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
710
711         if (find_vhost_dev(&pkt_hdr->s_addr)) {
712                 RTE_LOG(ERR, VHOST_DATA,
713                         "(%d) device is using a registered MAC!\n",
714                         vdev->vid);
715                 return -1;
716         }
717
718         for (i = 0; i < ETHER_ADDR_LEN; i++)
719                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
720
721         /* vlan_tag currently uses the device_id. */
722         vdev->vlan_tag = vlan_tags[vdev->vid];
723
724         /* Print out VMDQ registration info. */
725         RTE_LOG(INFO, VHOST_DATA,
726                 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
727                 vdev->vid,
728                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
729                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
730                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
731                 vdev->vlan_tag);
732
733         /* Register the MAC address. */
734         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
735                                 (uint32_t)vdev->vid + vmdq_pool_base);
736         if (ret)
737                 RTE_LOG(ERR, VHOST_DATA,
738                         "(%d) failed to add device MAC address to VMDQ\n",
739                         vdev->vid);
740
741         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
742
743         /* Set device as ready for RX. */
744         vdev->ready = DEVICE_RX;
745
746         return 0;
747 }
748
749 /*
750  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
751  * queue before disabling RX on the device.
752  */
753 static inline void
754 unlink_vmdq(struct vhost_dev *vdev)
755 {
756         unsigned i = 0;
757         unsigned rx_count;
758         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
759
760         if (vdev->ready == DEVICE_RX) {
761                 /*clear MAC and VLAN settings*/
762                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
763                 for (i = 0; i < 6; i++)
764                         vdev->mac_address.addr_bytes[i] = 0;
765
766                 vdev->vlan_tag = 0;
767
768                 /*Clear out the receive buffers*/
769                 rx_count = rte_eth_rx_burst(ports[0],
770                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
771
772                 while (rx_count) {
773                         for (i = 0; i < rx_count; i++)
774                                 rte_pktmbuf_free(pkts_burst[i]);
775
776                         rx_count = rte_eth_rx_burst(ports[0],
777                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
778                 }
779
780                 vdev->ready = DEVICE_MAC_LEARNING;
781         }
782 }
783
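/*
 * Enqueue a single packet into the destination vhost device's Rx virtqueue
 * (the VM-to-VM path) and update Tx/Rx statistics when enabled.
 */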
784 static __rte_always_inline void
785 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
786             struct rte_mbuf *m)
787 {
788         uint16_t ret;
789
790         if (builtin_net_driver) {
791                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
792         } else {
793                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
794         }
795
796         if (enable_stats) {
797                 rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
798                 rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
799                 src_vdev->stats.tx_total++;
800                 src_vdev->stats.tx += ret;
801         }
802 }
803
804 /*
805  * Check if the packet destination MAC address is for a local device. If so then put
806  * the packet on that device's RX queue. If not then return.
807  */
808 static __rte_always_inline int
809 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
810 {
811         struct ether_hdr *pkt_hdr;
812         struct vhost_dev *dst_vdev;
813
814         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
815
816         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
817         if (!dst_vdev)
818                 return -1;
819
820         if (vdev->vid == dst_vdev->vid) {
821                 RTE_LOG_DP(DEBUG, VHOST_DATA,
822                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
823                         vdev->vid);
824                 return 0;
825         }
826
827         RTE_LOG_DP(DEBUG, VHOST_DATA,
828                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
829
830         if (unlikely(dst_vdev->remove)) {
831                 RTE_LOG_DP(DEBUG, VHOST_DATA,
832                         "(%d) device is marked for removal\n", dst_vdev->vid);
833                 return 0;
834         }
835
836         virtio_xmit(dst_vdev, vdev, m);
837         return 0;
838 }
839
840 /*
841  * Check if the destination MAC of a packet belongs to a local VM,
842  * and if so get its vlan tag and offset.
843  */
844 static __rte_always_inline int
845 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
846         uint32_t *offset, uint16_t *vlan_tag)
847 {
848         struct vhost_dev *dst_vdev;
849         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
850
851         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
852         if (!dst_vdev)
853                 return 0;
854
855         if (vdev->vid == dst_vdev->vid) {
856                 RTE_LOG_DP(DEBUG, VHOST_DATA,
857                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
858                         vdev->vid);
859                 return -1;
860         }
861
862         /*
863          * HW vlan strip will reduce the packet length
864          * by the length of the vlan tag, so we need to restore
865          * the packet length by adding it back.
866          */
867         *offset  = VLAN_HLEN;
868         *vlan_tag = vlan_tags[vdev->vid];
869
870         RTE_LOG_DP(DEBUG, VHOST_DATA,
871                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
872                 vdev->vid, dst_vdev->vid, *vlan_tag);
873
874         return 0;
875 }
876
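/* Return the IPv4/IPv6 pseudo-header checksum used to seed TCP checksum offload. */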
877 static uint16_t
878 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
879 {
880         if (ol_flags & PKT_TX_IPV4)
881                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
882         else /* assume ethertype == ETHER_TYPE_IPv6 */
883                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
884 }
885
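/*
 * Prepare a TSO packet for the NIC: request IP checksum offload for IPv4 and
 * seed the TCP checksum field with the pseudo-header sum so that hardware can
 * complete it for each segment.
 */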
886 static void virtio_tx_offload(struct rte_mbuf *m)
887 {
888         void *l3_hdr;
889         struct ipv4_hdr *ipv4_hdr = NULL;
890         struct tcp_hdr *tcp_hdr = NULL;
891         struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
892
893         l3_hdr = (char *)eth_hdr + m->l2_len;
894
895         if (m->ol_flags & PKT_TX_IPV4) {
896                 ipv4_hdr = l3_hdr;
897                 ipv4_hdr->hdr_checksum = 0;
898                 m->ol_flags |= PKT_TX_IP_CKSUM;
899         }
900
901         tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
902         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
903 }
904
905 static inline void
906 free_pkts(struct rte_mbuf **pkts, uint16_t n)
907 {
908         while (n--)
909                 rte_pktmbuf_free(pkts[n]);
910 }
911
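/*
 * Flush the queued mbufs to the physical port and free any packets that the
 * NIC did not accept.
 */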
912 static __rte_always_inline void
913 do_drain_mbuf_table(struct mbuf_table *tx_q)
914 {
915         uint16_t count;
916
917         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
918                                  tx_q->m_table, tx_q->len);
919         if (unlikely(count < tx_q->len))
920                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
921
922         tx_q->len = 0;
923 }
924
925 /*
926  * This function routes the TX packet to the correct interface. This
927  * may be a local device or the physical port.
928  */
929 static __rte_always_inline void
930 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
931 {
932         struct mbuf_table *tx_q;
933         unsigned offset = 0;
934         const uint16_t lcore_id = rte_lcore_id();
935         struct ether_hdr *nh;
936
937
938         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
939         if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
940                 struct vhost_dev *vdev2;
941
942                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
943                         if (vdev2 != vdev)
944                                 virtio_xmit(vdev2, vdev, m);
945                 }
946                 goto queue2nic;
947         }
948
949         /*check if destination is local VM*/
950         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
951                 rte_pktmbuf_free(m);
952                 return;
953         }
954
955         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
956                 if (unlikely(find_local_dest(vdev, m, &offset,
957                                              &vlan_tag) != 0)) {
958                         rte_pktmbuf_free(m);
959                         return;
960                 }
961         }
962
963         RTE_LOG_DP(DEBUG, VHOST_DATA,
964                 "(%d) TX: MAC address is external\n", vdev->vid);
965
966 queue2nic:
967
968         /*Add packet to the port tx queue*/
969         tx_q = &lcore_tx_queue[lcore_id];
970
971         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
972         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
973                 /* Guest has inserted the vlan tag. */
974                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
975                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
976                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
977                         (vh->vlan_tci != vlan_tag_be))
978                         vh->vlan_tci = vlan_tag_be;
979         } else {
980                 m->ol_flags |= PKT_TX_VLAN_PKT;
981
982                 /*
983                  * Find the right seg to adjust the data len when offset is
984                  * bigger than tail room size.
985                  */
986                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
987                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
988                                 m->data_len += offset;
989                         else {
990                                 struct rte_mbuf *seg = m;
991
992                                 while ((seg->next != NULL) &&
993                                         (offset > rte_pktmbuf_tailroom(seg)))
994                                         seg = seg->next;
995
996                                 seg->data_len += offset;
997                         }
998                         m->pkt_len += offset;
999                 }
1000
1001                 m->vlan_tci = vlan_tag;
1002         }
1003
1004         if (m->ol_flags & PKT_TX_TCP_SEG)
1005                 virtio_tx_offload(m);
1006
1007         tx_q->m_table[tx_q->len++] = m;
1008         if (enable_stats) {
1009                 vdev->stats.tx_total++;
1010                 vdev->stats.tx++;
1011         }
1012
1013         if (unlikely(tx_q->len == MAX_PKT_BURST))
1014                 do_drain_mbuf_table(tx_q);
1015 }
1016
1017
1018 static __rte_always_inline void
1019 drain_mbuf_table(struct mbuf_table *tx_q)
1020 {
1021         static uint64_t prev_tsc;
1022         uint64_t cur_tsc;
1023
1024         if (tx_q->len == 0)
1025                 return;
1026
1027         cur_tsc = rte_rdtsc();
1028         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1029                 prev_tsc = cur_tsc;
1030
1031                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1032                         "TX queue drained after timeout with burst size %u\n",
1033                         tx_q->len);
1034                 do_drain_mbuf_table(tx_q);
1035         }
1036 }
1037
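/*
 * Receive a burst from the VMDQ Rx queue bound to this vhost device and
 * enqueue the packets into the guest's Rx virtqueue, optionally retrying
 * while the virtqueue is short on free slots.
 */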
1038 static __rte_always_inline void
1039 drain_eth_rx(struct vhost_dev *vdev)
1040 {
1041         uint16_t rx_count, enqueue_count;
1042         struct rte_mbuf *pkts[MAX_PKT_BURST];
1043
1044         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1045                                     pkts, MAX_PKT_BURST);
1046         if (!rx_count)
1047                 return;
1048
1049         /*
1050          * When "enable_retry" is set, we wait and retry when there
1051          * are not enough free slots in the queue to hold @rx_count packets,
1052          * to reduce packet loss.
1053          */
1054         if (enable_retry &&
1055             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1056                         VIRTIO_RXQ))) {
1057                 uint32_t retry;
1058
1059                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1060                         rte_delay_us(burst_rx_delay_time);
1061                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1062                                         VIRTIO_RXQ))
1063                                 break;
1064                 }
1065         }
1066
1067         if (builtin_net_driver) {
1068                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1069                                                 pkts, rx_count);
1070         } else {
1071                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1072                                                 pkts, rx_count);
1073         }
1074         if (enable_stats) {
1075                 rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1076                 rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1077         }
1078
1079         free_pkts(pkts, rx_count);
1080 }
1081
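/*
 * Dequeue packets from the guest's Tx virtqueue and route each one to its
 * destination: another vhost device or the physical port.
 */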
1082 static __rte_always_inline void
1083 drain_virtio_tx(struct vhost_dev *vdev)
1084 {
1085         struct rte_mbuf *pkts[MAX_PKT_BURST];
1086         uint16_t count;
1087         uint16_t i;
1088
1089         if (builtin_net_driver) {
1090                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1091                                         pkts, MAX_PKT_BURST);
1092         } else {
1093                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1094                                         mbuf_pool, pkts, MAX_PKT_BURST);
1095         }
1096
1097         /* setup VMDq for the first packet */
1098         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1099                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1100                         free_pkts(pkts, count);
1101         }
1102
1103         for (i = 0; i < count; ++i)
1104                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1105 }
1106
1107 /*
1108  * Main function of vhost-switch. It basically does:
1109  *
1110  * for each vhost device {
1111  *    - drain_eth_rx()
1112  *
1113  *      Which drains the host eth Rx queue linked to the vhost device,
1114  *      and delivers the packets to the guest virtio Rx ring associated with
1115  *      this vhost device.
1116  *
1117  *    - drain_virtio_tx()
1118  *
1119  *      Which drains the guest virtio Tx queue and delivers the packets
1120  *      to the target, which could be another vhost device, or the
1121  *      physical eth dev. The route is done in function "virtio_tx_route".
1122  * }
1123  */
1124 static int
1125 switch_worker(void *arg __rte_unused)
1126 {
1127         unsigned i;
1128         unsigned lcore_id = rte_lcore_id();
1129         struct vhost_dev *vdev;
1130         struct mbuf_table *tx_q;
1131
1132         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1133
1134         tx_q = &lcore_tx_queue[lcore_id];
1135         for (i = 0; i < rte_lcore_count(); i++) {
1136                 if (lcore_ids[i] == lcore_id) {
1137                         tx_q->txq_id = i;
1138                         break;
1139                 }
1140         }
1141
1142         while(1) {
1143                 drain_mbuf_table(tx_q);
1144
1145                 /*
1146                  * Inform the configuration core that we have exited the
1147                  * linked list and that no devices are in use if requested.
1148                  */
1149                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1150                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1151
1152                 /*
1153                  * Process vhost devices
1154                  */
1155                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1156                               lcore_vdev_entry) {
1157                         if (unlikely(vdev->remove)) {
1158                                 unlink_vmdq(vdev);
1159                                 vdev->ready = DEVICE_SAFE_REMOVE;
1160                                 continue;
1161                         }
1162
1163                         if (likely(vdev->ready == DEVICE_RX))
1164                                 drain_eth_rx(vdev);
1165
1166                         if (likely(!vdev->remove))
1167                                 drain_virtio_tx(vdev);
1168                 }
1169         }
1170
1171         return 0;
1172 }
1173
1174 /*
1175  * Remove a device from the specific data core linked list and from the
1176  * main linked list. Synchronization occurs through the use of the
1177  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1178  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1179  */
1180 static void
1181 destroy_device(int vid)
1182 {
1183         struct vhost_dev *vdev = NULL;
1184         int lcore;
1185
1186         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1187                 if (vdev->vid == vid)
1188                         break;
1189         }
1190         if (!vdev)
1191                 return;
1192         /*set the remove flag. */
1193         vdev->remove = 1;
1194         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1195                 rte_pause();
1196         }
1197
1198         if (builtin_net_driver)
1199                 vs_vhost_net_remove(vdev);
1200
1201         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1202                      lcore_vdev_entry);
1203         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1204
1205
1206         /* Set the dev_removal_flag on each lcore. */
1207         RTE_LCORE_FOREACH_SLAVE(lcore)
1208                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1209
1210         /*
1211          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1212          * we can be sure that they can no longer access the device removed
1213          * from the linked lists and that the devices are no longer in use.
1214          */
1215         RTE_LCORE_FOREACH_SLAVE(lcore) {
1216                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1217                         rte_pause();
1218         }
1219
1220         lcore_info[vdev->coreid].device_num--;
1221
1222         RTE_LOG(INFO, VHOST_DATA,
1223                 "(%d) device has been removed from data core\n",
1224                 vdev->vid);
1225
1226         rte_free(vdev);
1227 }
1228
1229 /*
1230  * A new device is added to a data core. First the device is added to the main linked list
1231  * and then allocated to a specific data core.
1232  */
1233 static int
1234 new_device(int vid)
1235 {
1236         int lcore, core_add = 0;
1237         uint32_t device_num_min = num_devices;
1238         struct vhost_dev *vdev;
1239
1240         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1241         if (vdev == NULL) {
1242                 RTE_LOG(INFO, VHOST_DATA,
1243                         "(%d) couldn't allocate memory for vhost dev\n",
1244                         vid);
1245                 return -1;
1246         }
1247         vdev->vid = vid;
1248
1249         if (builtin_net_driver)
1250                 vs_vhost_net_setup(vdev);
1251
1252         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1253         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1254
1255         /*reset ready flag*/
1256         vdev->ready = DEVICE_MAC_LEARNING;
1257         vdev->remove = 0;
1258
1259         /* Find a suitable lcore to add the device. */
1260         RTE_LCORE_FOREACH_SLAVE(lcore) {
1261                 if (lcore_info[lcore].device_num < device_num_min) {
1262                         device_num_min = lcore_info[lcore].device_num;
1263                         core_add = lcore;
1264                 }
1265         }
1266         vdev->coreid = core_add;
1267
1268         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1269                           lcore_vdev_entry);
1270         lcore_info[vdev->coreid].device_num++;
1271
1272         /* Disable notifications. */
1273         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1274         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1275
1276         RTE_LOG(INFO, VHOST_DATA,
1277                 "(%d) device has been added to data core %d\n",
1278                 vid, vdev->coreid);
1279
1280         return 0;
1281 }
1282
1283 /*
1284  * These callbacks allow devices to be added to the data core when configuration
1285  * has fully completed.
1286  */
1287 static const struct vhost_device_ops virtio_net_device_ops =
1288 {
1289         .new_device =  new_device,
1290         .destroy_device = destroy_device,
1291 };
1292
1293 /*
1294  * This thread wakes up periodically to print stats if the user has
1295  * enabled them.
1296  */
1297 static void *
1298 print_stats(__rte_unused void *arg)
1299 {
1300         struct vhost_dev *vdev;
1301         uint64_t tx_dropped, rx_dropped;
1302         uint64_t tx, tx_total, rx, rx_total;
1303         const char clr[] = { 27, '[', '2', 'J', '\0' };
1304         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1305
1306         while(1) {
1307                 sleep(enable_stats);
1308
1309                 /* Clear screen and move to top left */
1310                 printf("%s%s\n", clr, top_left);
1311                 printf("Device statistics =================================\n");
1312
1313                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1314                         tx_total   = vdev->stats.tx_total;
1315                         tx         = vdev->stats.tx;
1316                         tx_dropped = tx_total - tx;
1317
1318                         rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1319                         rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1320                         rx_dropped = rx_total - rx;
1321
1322                         printf("Statistics for device %d\n"
1323                                 "-----------------------\n"
1324                                 "TX total:              %" PRIu64 "\n"
1325                                 "TX dropped:            %" PRIu64 "\n"
1326                                 "TX successful:         %" PRIu64 "\n"
1327                                 "RX total:              %" PRIu64 "\n"
1328                                 "RX dropped:            %" PRIu64 "\n"
1329                                 "RX successful:         %" PRIu64 "\n",
1330                                 vdev->vid,
1331                                 tx_total, tx_dropped, tx,
1332                                 rx_total, rx_dropped, rx);
1333                 }
1334
1335                 printf("===================================================\n");
1336         }
1337
1338         return NULL;
1339 }
1340
1341 static void
1342 unregister_drivers(int socket_num)
1343 {
1344         int i, ret;
1345
1346         for (i = 0; i < socket_num; i++) {
1347                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1348                 if (ret != 0)
1349                         RTE_LOG(ERR, VHOST_CONFIG,
1350                                 "Failed to unregister vhost driver for %s.\n",
1351                                 socket_files + i * PATH_MAX);
1352         }
1353 }
1354
1355 /* When we receive an INT signal, unregister the vhost driver */
1356 static void
1357 sigint_handler(__rte_unused int signum)
1358 {
1359         /* Unregister vhost driver. */
1360         unregister_drivers(nb_sockets);
1361
1362         exit(0);
1363 }
1364
1365 /*
1366  * While creating an mbuf pool, one key thing is to figure out how
1367  * many mbuf entries are enough for our use. FYI, here are some
1368  * guidelines:
1369  *
1370  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1371  *
1372  * - For each switch core (a CPU core that does the packet switching), we also
1373  *   need to reserve some mbufs for receiving the packets from the virtio
1374  *   Tx queue. How many are enough depends on the usage. It's normally
1375  *   a simple calculation like the following:
1376  *
1377  *       MAX_PKT_BURST * max packet size / mbuf size
1378  *
1379  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1380  *
1381  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1382  *   mbufs for receiving the packets from the physical NIC device.
1383  *
1384  * - We also need to make sure, for each switch core, we have allocated
1385  *   enough mbufs to fill up the mbuf cache.
1386  */
1387 static void
1388 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1389         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1390 {
1391         uint32_t nr_mbufs;
1392         uint32_t nr_mbufs_per_core;
1393         uint32_t mtu = 1500;
1394
1395         if (mergeable)
1396                 mtu = 9000;
1397         if (enable_tso)
1398                 mtu = 64 * 1024;
1399
1400         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1401                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1402         nr_mbufs_per_core += nr_rx_desc;
1403         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1404
1405         nr_mbufs  = nr_queues * nr_rx_desc;
1406         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1407         nr_mbufs *= nr_port;
1408
1409         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1410                                             nr_mbuf_cache, 0, mbuf_size,
1411                                             rte_socket_id());
1412         if (mbuf_pool == NULL)
1413                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1414 }
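/*
 * A rough worked example of the sizing above (illustrative only, assuming the
 * defaults in this file, MAX_PKT_BURST = 32, RTE_PKTMBUF_HEADROOM = 128 and
 * mbuf_size = 2176): with mtu = 1500,
 *   nr_mbufs_per_core = (1500 + 2176) * 32 / (2176 - 128) + 1024 = 1081,
 * i.e. roughly 1.1K mbufs per switch core, on top of the nr_queues * nr_rx_desc
 * mbufs reserved for the NIC Rx queues.
 */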
1415
1416 /*
1417  * Main function, does initialisation and calls the per-lcore functions.
1418  */
1419 int
1420 main(int argc, char *argv[])
1421 {
1422         unsigned lcore_id, core_id = 0;
1423         unsigned nb_ports, valid_num_ports;
1424         int ret, i;
1425         uint16_t portid;
1426         static pthread_t tid;
1427         uint64_t flags = 0;
1428
1429         signal(SIGINT, sigint_handler);
1430
1431         /* init EAL */
1432         ret = rte_eal_init(argc, argv);
1433         if (ret < 0)
1434                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1435         argc -= ret;
1436         argv += ret;
1437
1438         /* parse app arguments */
1439         ret = us_vhost_parse_args(argc, argv);
1440         if (ret < 0)
1441                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1442
1443         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1444                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1445
1446                 if (rte_lcore_is_enabled(lcore_id))
1447                         lcore_ids[core_id++] = lcore_id;
1448         }
1449
1450         if (rte_lcore_count() > RTE_MAX_LCORE)
1451                 rte_exit(EXIT_FAILURE,"Not enough cores\n");
1452
1453         /* Get the number of physical ports. */
1454         nb_ports = rte_eth_dev_count_avail();
1455
1456         /*
1457          * Update the global var NUM_PORTS and global array PORTS
1458          * and get the value of var VALID_NUM_PORTS according to the system port count
1459          */
1460         valid_num_ports = check_ports_num(nb_ports);
1461
1462         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1463                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1464                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1465                 return -1;
1466         }
1467
1468         /*
1469          * FIXME: here we are trying to allocate an mbuf pool big enough for
1470          * @MAX_QUEUES, but the truth is we're never going to use that
1471          * many queues here. We probably should only do allocation for
1472          * those queues we are going to use.
1473          */
1474         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1475                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1476
1477         if (vm2vm_mode == VM2VM_HARDWARE) {
1478                 /* Enable VT loop back to let L2 switch to do it. */
1479                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1480                 RTE_LOG(DEBUG, VHOST_CONFIG,
1481                         "Enable loop back for L2 switch in vmdq.\n");
1482         }
1483
1484         /* initialize all ports */
1485         RTE_ETH_FOREACH_DEV(portid) {
1486                 /* skip ports that are not enabled */
1487                 if ((enabled_port_mask & (1 << portid)) == 0) {
1488                         RTE_LOG(INFO, VHOST_PORT,
1489                                 "Skipping disabled port %d\n", portid);
1490                         continue;
1491                 }
1492                 if (port_init(portid) != 0)
1493                         rte_exit(EXIT_FAILURE,
1494                                 "Cannot initialize network ports\n");
1495         }
1496
1497         /* Enable stats if the user option is set. */
1498         if (enable_stats) {
1499                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1500                                         print_stats, NULL);
1501                 if (ret < 0)
1502                         rte_exit(EXIT_FAILURE,
1503                                 "Cannot create print-stats thread\n");
1504         }
1505
1506         /* Launch all data cores. */
1507         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1508                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1509
1510         if (client_mode)
1511                 flags |= RTE_VHOST_USER_CLIENT;
1512
1513         if (dequeue_zero_copy)
1514                 flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1515
1516         /* Register vhost user driver to handle vhost messages. */
1517         for (i = 0; i < nb_sockets; i++) {
1518                 char *file = socket_files + i * PATH_MAX;
1519                 ret = rte_vhost_driver_register(file, flags);
1520                 if (ret != 0) {
1521                         unregister_drivers(i);
1522                         rte_exit(EXIT_FAILURE,
1523                                 "vhost driver register failure.\n");
1524                 }
1525
1526                 if (builtin_net_driver)
1527                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1528
1529                 if (mergeable == 0) {
1530                         rte_vhost_driver_disable_features(file,
1531                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1532                 }
1533
1534                 if (enable_tx_csum == 0) {
1535                         rte_vhost_driver_disable_features(file,
1536                                 1ULL << VIRTIO_NET_F_CSUM);
1537                 }
1538
1539                 if (enable_tso == 0) {
1540                         rte_vhost_driver_disable_features(file,
1541                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1542                         rte_vhost_driver_disable_features(file,
1543                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1544                         rte_vhost_driver_disable_features(file,
1545                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1546                         rte_vhost_driver_disable_features(file,
1547                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1548                 }
1549
1550                 if (promiscuous) {
1551                         rte_vhost_driver_enable_features(file,
1552                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1553                 }
1554
1555                 ret = rte_vhost_driver_callback_register(file,
1556                         &virtio_net_device_ops);
1557                 if (ret != 0) {
1558                         rte_exit(EXIT_FAILURE,
1559                                 "failed to register vhost driver callbacks.\n");
1560                 }
1561
1562                 if (rte_vhost_driver_start(file) < 0) {
1563                         rte_exit(EXIT_FAILURE,
1564                                 "failed to start vhost driver.\n");
1565                 }
1566         }
1567
1568         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1569                 rte_eal_wait_lcore(lcore_id);
1570
1571         return 0;
1572
1573 }