1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "main.h"
29
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36
37 #define MBUF_CACHE_SIZE 128
38 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
39
40 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
41
42 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
44
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX                       1
50 #define DEVICE_SAFE_REMOVE      2
51
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55
56 #define INVALID_PORT_ID 0xFF
57
58 /* Max number of devices. Limited by vmdq. */
59 #define MAX_DEVICES 64
60
61 /* Size of buffers used for snprintfs. */
62 #define MAX_PRINT_BUFF 6072
63
64 /* Maximum long option length for option parsing. */
65 #define MAX_LONG_OPT_SZ 64
66
67 /* mask of enabled ports */
68 static uint32_t enabled_port_mask = 0;
69
70 /* Promiscuous mode */
71 static uint32_t promiscuous;
72
73 /* number of devices/queues to support*/
74 static uint32_t num_queues = 0;
75 static uint32_t num_devices;
76
77 static struct rte_mempool *mbuf_pool;
78 static int mergeable;
79
80 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
81 typedef enum {
82         VM2VM_DISABLED = 0,
83         VM2VM_SOFTWARE = 1,
84         VM2VM_HARDWARE = 2,
85         VM2VM_LAST
86 } vm2vm_type;
87 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
88
89 /* Enable stats. */
90 static uint32_t enable_stats = 0;
91 /* Enable retries on RX. */
92 static uint32_t enable_retry = 1;
93
94 /* Disable TX checksum offload */
95 static uint32_t enable_tx_csum;
96
97 /* Disable TSO offload */
98 static uint32_t enable_tso;
99
100 static int client_mode;
101 static int dequeue_zero_copy;
102
103 static int builtin_net_driver;
104
105 /* Specify timeout (in microseconds) between retries on RX. */
106 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
107 /* Specify the number of retries on RX. */
108 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
109
110 /* Socket file paths. Can be set by user */
111 static char *socket_files;
112 static int nb_sockets;
113
114 /* empty vmdq configuration structure. Filled in programmatically */
115 static struct rte_eth_conf vmdq_conf_default = {
116         .rxmode = {
117                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
118                 .split_hdr_size = 0,
119                 .ignore_offload_bitfield = 1,
120                 /*
121                  * VLAN strip is necessary for 1G NICs such as the I350;
122                  * it fixes a bug where IPv4 forwarding in the guest could
123                  * not forward packets from one virtio dev to another.
124                  */
125                 .offloads = (DEV_RX_OFFLOAD_CRC_STRIP |
126                              DEV_RX_OFFLOAD_VLAN_STRIP),
127         },
128
129         .txmode = {
130                 .mq_mode = ETH_MQ_TX_NONE,
131                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
132                              DEV_TX_OFFLOAD_TCP_CKSUM |
133                              DEV_TX_OFFLOAD_VLAN_INSERT |
134                              DEV_TX_OFFLOAD_MULTI_SEGS |
135                              DEV_TX_OFFLOAD_TCP_TSO),
136         },
137         .rx_adv_conf = {
138                 /*
139                  * should be overridden separately in code with
140                  * appropriate values
141                  */
142                 .vmdq_rx_conf = {
143                         .nb_queue_pools = ETH_8_POOLS,
144                         .enable_default_pool = 0,
145                         .default_pool = 0,
146                         .nb_pool_maps = 0,
147                         .pool_map = {{0, 0},},
148                 },
149         },
150 };
151
152
153 static unsigned lcore_ids[RTE_MAX_LCORE];
154 static uint16_t ports[RTE_MAX_ETHPORTS];
155 static unsigned num_ports = 0; /**< The number of ports specified in command line */
156 static uint16_t num_pf_queues, num_vmdq_queues;
157 static uint16_t vmdq_pool_base, vmdq_queue_base;
158 static uint16_t queues_per_pool;
159
160 const uint16_t vlan_tags[] = {
161         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
162         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
163         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
164         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
165         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
166         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
167         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
168         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
169 };
170
171 /* ethernet addresses of ports */
172 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
173
174 static struct vhost_dev_tailq_list vhost_dev_list =
175         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
176
177 static struct lcore_info lcore_info[RTE_MAX_LCORE];
178
179 /* Used for queueing bursts of TX packets. */
180 struct mbuf_table {
181         unsigned len;
182         unsigned txq_id;
183         struct rte_mbuf *m_table[MAX_PKT_BURST];
184 };
185
186 /* TX queue for each data core. */
187 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
188
189 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
190                                  / US_PER_S * BURST_TX_DRAIN_US)
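/*
 * For illustration only (assuming a 2 GHz TSC, which the code itself does
 * not fix): MBUF_TABLE_DRAIN_TSC = (2,000,000,000 + 1,000,000 - 1)
 * / 1,000,000 * 100 = 200,000 cycles, i.e. the per-lcore TX table is
 * flushed roughly every 100 us even when bursts stay below MAX_PKT_BURST.
 */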
191 #define VLAN_HLEN       4
192
193 /*
194  * Builds up the correct configuration for VMDQ VLAN pool map
195  * according to the pool & queue limits.
196  */
197 static inline int
198 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
199 {
200         struct rte_eth_vmdq_rx_conf conf;
201         struct rte_eth_vmdq_rx_conf *def_conf =
202                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
203         unsigned i;
204
205         memset(&conf, 0, sizeof(conf));
206         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
207         conf.nb_pool_maps = num_devices;
208         conf.enable_loop_back = def_conf->enable_loop_back;
209         conf.rx_mode = def_conf->rx_mode;
210
211         for (i = 0; i < conf.nb_pool_maps; i++) {
212                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
213                 conf.pool_map[i].pools = (1UL << i);
214         }
215
216         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
217         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
218                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
219         return 0;
220 }
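/*
 * For example, with num_devices == 8 the loop above yields the mapping
 * vlan_id 1000 -> pool 0, 1001 -> pool 1, ..., 1007 -> pool 7, so every
 * vhost device ends up with its own VMDQ pool, selected by its VLAN tag.
 */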
221
222 /*
223  * Validate the device number against the max pool number obtained from
224  * dev_info. If the device number is invalid, print an error message and
225  * return -1. Each device must have its own pool.
226  */
227 static inline int
228 validate_num_devices(uint32_t max_nb_devices)
229 {
230         if (num_devices > max_nb_devices) {
231                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
232                 return -1;
233         }
234         return 0;
235 }
236
237 /*
238  * Initialises a given port using global settings and with the rx buffers
239  * coming from the mbuf_pool passed as parameter
240  */
241 static inline int
242 port_init(uint16_t port)
243 {
244         struct rte_eth_dev_info dev_info;
245         struct rte_eth_conf port_conf;
246         struct rte_eth_rxconf *rxconf;
247         struct rte_eth_txconf *txconf;
248         int16_t rx_rings, tx_rings;
249         uint16_t rx_ring_size, tx_ring_size;
250         int retval;
251         uint16_t q;
252
253         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
254         rte_eth_dev_info_get (port, &dev_info);
255
256         rxconf = &dev_info.default_rxconf;
257         txconf = &dev_info.default_txconf;
258         rxconf->rx_drop_en = 1;
259         txconf->txq_flags = ETH_TXQ_FLAGS_IGNORE;
260
261         /* Configure the number of supported virtio devices based on VMDQ limits */
262         num_devices = dev_info.max_vmdq_pools;
263
264         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
265         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
266
267         /*
268          * When dequeue zero copy is enabled, guest Tx used vring will be
269          * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
270          * (tx_ring_size here) must be small enough so that the driver will
271          * hit the free threshold easily and free mbufs timely. Otherwise,
272          * guest Tx vring would be starved.
273          */
274         if (dequeue_zero_copy)
275                 tx_ring_size = 64;
276
277         tx_rings = (uint16_t)rte_lcore_count();
278
279         retval = validate_num_devices(MAX_DEVICES);
280         if (retval < 0)
281                 return retval;
282
283         /* Get port configuration. */
284         retval = get_eth_conf(&port_conf, num_devices);
285         if (retval < 0)
286                 return retval;
287         /* NIC queues are divided into pf queues and vmdq queues.  */
288         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
289         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
290         num_vmdq_queues = num_devices * queues_per_pool;
291         num_queues = num_pf_queues + num_vmdq_queues;
292         vmdq_queue_base = dev_info.vmdq_queue_base;
293         vmdq_pool_base  = dev_info.vmdq_pool_base;
294         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
295                 num_pf_queues, num_devices, queues_per_pool);
296
297         if (port >= rte_eth_dev_count()) return -1;
298
299         rx_rings = (uint16_t)dev_info.max_rx_queues;
300         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
301                 port_conf.txmode.offloads |=
302                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
303         /* Configure ethernet device. */
304         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
305         if (retval != 0) {
306                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
307                         port, strerror(-retval));
308                 return retval;
309         }
310
311         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
312                 &tx_ring_size);
313         if (retval != 0) {
314                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
315                         "for port %u: %s.\n", port, strerror(-retval));
316                 return retval;
317         }
318         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
319                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
320                         "for Rx queues on port %u.\n", port);
321                 return -1;
322         }
323
324         /* Setup the queues. */
325         rxconf->offloads = port_conf.rxmode.offloads;
326         for (q = 0; q < rx_rings; q ++) {
327                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
328                                                 rte_eth_dev_socket_id(port),
329                                                 rxconf,
330                                                 mbuf_pool);
331                 if (retval < 0) {
332                         RTE_LOG(ERR, VHOST_PORT,
333                                 "Failed to setup rx queue %u of port %u: %s.\n",
334                                 q, port, strerror(-retval));
335                         return retval;
336                 }
337         }
338         txconf->offloads = port_conf.txmode.offloads;
339         for (q = 0; q < tx_rings; q ++) {
340                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
341                                                 rte_eth_dev_socket_id(port),
342                                                 txconf);
343                 if (retval < 0) {
344                         RTE_LOG(ERR, VHOST_PORT,
345                                 "Failed to setup tx queue %u of port %u: %s.\n",
346                                 q, port, strerror(-retval));
347                         return retval;
348                 }
349         }
350
351         /* Start the device. */
352         retval  = rte_eth_dev_start(port);
353         if (retval < 0) {
354                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
355                         port, strerror(-retval));
356                 return retval;
357         }
358
359         if (promiscuous)
360                 rte_eth_promiscuous_enable(port);
361
362         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
363         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
364         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
365                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
366                         port,
367                         vmdq_ports_eth_addr[port].addr_bytes[0],
368                         vmdq_ports_eth_addr[port].addr_bytes[1],
369                         vmdq_ports_eth_addr[port].addr_bytes[2],
370                         vmdq_ports_eth_addr[port].addr_bytes[3],
371                         vmdq_ports_eth_addr[port].addr_bytes[4],
372                         vmdq_ports_eth_addr[port].addr_bytes[5]);
373
374         return 0;
375 }
376
377 /*
378  * Set socket file path.
379  */
380 static int
381 us_vhost_parse_socket_path(const char *q_arg)
382 {
383         /* parse number string */
384         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
385                 return -1;
386
387         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
388         snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
389         nb_sockets++;
390
391         return 0;
392 }
393
394 /*
395  * Parse the portmask provided at run time.
396  */
397 static int
398 parse_portmask(const char *portmask)
399 {
400         char *end = NULL;
401         unsigned long pm;
402
403         errno = 0;
404
405         /* parse hexadecimal string */
406         pm = strtoul(portmask, &end, 16);
407         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
408                 return -1;
409
410         if (pm == 0)
411                 return -1;
412
413         return pm;
414
415 }
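/*
 * The portmask is a hex bitmap of port ids; for instance "-p 0x1" enables
 * port 0 only and "-p 0x3" enables ports 0 and 1 (subject to the
 * MAX_SUP_PORTS limit checked later in us_vhost_parse_args()).
 */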
416
417 /*
418  * Parse num options at run time.
419  */
420 static int
421 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
422 {
423         char *end = NULL;
424         unsigned long num;
425
426         errno = 0;
427
428         /* parse unsigned int string */
429         num = strtoul(q_arg, &end, 10);
430         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
431                 return -1;
432
433         if (num > max_valid_value)
434                 return -1;
435
436         return num;
437
438 }
439
440 /*
441  * Display usage
442  */
443 static void
444 us_vhost_usage(const char *prgname)
445 {
446         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
447         "               --vm2vm [0|1|2]\n"
448         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
449         "               --socket-file <path>\n"
450         "               --nb-devices ND\n"
451         "               -p PORTMASK: Set mask for ports to be used by application\n"
452         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
453         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
454         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if rx retries are enabled\n"
455         "               --rx-retry-num [0-N]: the number of retries on rx. Takes effect only if rx retries are enabled\n"
456         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
457         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
458         "               --socket-file: The path of the socket file.\n"
459         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
460         "               --tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
461         "               --client register a vhost-user socket as client mode.\n"
462         "               --dequeue-zero-copy enables dequeue zero copy\n",
463                prgname);
464 }
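/*
 * A typical invocation might look as follows (the core list, memory
 * channels and socket path below are only placeholders, not required
 * values):
 *
 *   ./vhost-switch -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *                  --mergeable 1 --stats 2
 */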
465
466 /*
467  * Parse the arguments given in the command line of the application.
468  */
469 static int
470 us_vhost_parse_args(int argc, char **argv)
471 {
472         int opt, ret;
473         int option_index;
474         unsigned i;
475         const char *prgname = argv[0];
476         static struct option long_option[] = {
477                 {"vm2vm", required_argument, NULL, 0},
478                 {"rx-retry", required_argument, NULL, 0},
479                 {"rx-retry-delay", required_argument, NULL, 0},
480                 {"rx-retry-num", required_argument, NULL, 0},
481                 {"mergeable", required_argument, NULL, 0},
482                 {"stats", required_argument, NULL, 0},
483                 {"socket-file", required_argument, NULL, 0},
484                 {"tx-csum", required_argument, NULL, 0},
485                 {"tso", required_argument, NULL, 0},
486                 {"client", no_argument, &client_mode, 1},
487                 {"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
488                 {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
489                 {NULL, 0, 0, 0},
490         };
491
492         /* Parse command line */
493         while ((opt = getopt_long(argc, argv, "p:P",
494                         long_option, &option_index)) != EOF) {
495                 switch (opt) {
496                 /* Portmask */
497                 case 'p':
498                         enabled_port_mask = parse_portmask(optarg);
499                         if (enabled_port_mask == 0) {
500                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
501                                 us_vhost_usage(prgname);
502                                 return -1;
503                         }
504                         break;
505
506                 case 'P':
507                         promiscuous = 1;
508                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
509                                 ETH_VMDQ_ACCEPT_BROADCAST |
510                                 ETH_VMDQ_ACCEPT_MULTICAST;
511
512                         break;
513
514                 case 0:
515                         /* Enable/disable vm2vm comms. */
516                         if (!strncmp(long_option[option_index].name, "vm2vm",
517                                 MAX_LONG_OPT_SZ)) {
518                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
519                                 if (ret == -1) {
520                                         RTE_LOG(INFO, VHOST_CONFIG,
521                                                 "Invalid argument for "
522                                                 "vm2vm [0|1|2]\n");
523                                         us_vhost_usage(prgname);
524                                         return -1;
525                                 } else {
526                                         vm2vm_mode = (vm2vm_type)ret;
527                                 }
528                         }
529
530                         /* Enable/disable retries on RX. */
531                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
532                                 ret = parse_num_opt(optarg, 1);
533                                 if (ret == -1) {
534                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
535                                         us_vhost_usage(prgname);
536                                         return -1;
537                                 } else {
538                                         enable_retry = ret;
539                                 }
540                         }
541
542                         /* Enable/disable TX checksum offload. */
543                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
544                                 ret = parse_num_opt(optarg, 1);
545                                 if (ret == -1) {
546                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
547                                         us_vhost_usage(prgname);
548                                         return -1;
549                                 } else
550                                         enable_tx_csum = ret;
551                         }
552
553                         /* Enable/disable TSO offload. */
554                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
555                                 ret = parse_num_opt(optarg, 1);
556                                 if (ret == -1) {
557                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
558                                         us_vhost_usage(prgname);
559                                         return -1;
560                                 } else
561                                         enable_tso = ret;
562                         }
563
564                         /* Specify the retry delay time (in microseconds) on RX. */
565                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
566                                 ret = parse_num_opt(optarg, INT32_MAX);
567                                 if (ret == -1) {
568                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
569                                         us_vhost_usage(prgname);
570                                         return -1;
571                                 } else {
572                                         burst_rx_delay_time = ret;
573                                 }
574                         }
575
576                         /* Specify the number of retries on RX. */
577                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
578                                 ret = parse_num_opt(optarg, INT32_MAX);
579                                 if (ret == -1) {
580                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
581                                         us_vhost_usage(prgname);
582                                         return -1;
583                                 } else {
584                                         burst_rx_retry_num = ret;
585                                 }
586                         }
587
588                         /* Enable/disable RX mergeable buffers. */
589                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
590                                 ret = parse_num_opt(optarg, 1);
591                                 if (ret == -1) {
592                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
593                                         us_vhost_usage(prgname);
594                                         return -1;
595                                 } else {
596                                         mergeable = !!ret;
597                                         if (ret) {
598                                                 vmdq_conf_default.rxmode.offloads |=
599                                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
600                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
601                                                         = JUMBO_FRAME_MAX_SIZE;
602                                         }
603                                 }
604                         }
605
606                         /* Enable/disable stats. */
607                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
608                                 ret = parse_num_opt(optarg, INT32_MAX);
609                                 if (ret == -1) {
610                                         RTE_LOG(INFO, VHOST_CONFIG,
611                                                 "Invalid argument for stats [0..N]\n");
612                                         us_vhost_usage(prgname);
613                                         return -1;
614                                 } else {
615                                         enable_stats = ret;
616                                 }
617                         }
618
619                         /* Set socket file path. */
620                         if (!strncmp(long_option[option_index].name,
621                                                 "socket-file", MAX_LONG_OPT_SZ)) {
622                                 if (us_vhost_parse_socket_path(optarg) == -1) {
623                                         RTE_LOG(INFO, VHOST_CONFIG,
624                                         "Invalid argument for socket name (Max %d characters)\n",
625                                         PATH_MAX);
626                                         us_vhost_usage(prgname);
627                                         return -1;
628                                 }
629                         }
630
631                         break;
632
633                         /* Invalid option - print options. */
634                 default:
635                         us_vhost_usage(prgname);
636                         return -1;
637                 }
638         }
639
640         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
641                 if (enabled_port_mask & (1 << i))
642                         ports[num_ports++] = i;
643         }
644
645         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
646                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
647                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
648                 return -1;
649         }
650
651         return 0;
652 }
653
654 /*
655  * Update the global variable num_ports and the ports array according to the
656  * number of ports in the system, and return the number of valid ports.
657  */
658 static unsigned check_ports_num(unsigned nb_ports)
659 {
660         unsigned valid_num_ports = num_ports;
661         unsigned portid;
662
663         if (num_ports > nb_ports) {
664                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
665                         num_ports, nb_ports);
666                 num_ports = nb_ports;
667         }
668
669         for (portid = 0; portid < num_ports; portid ++) {
670                 if (ports[portid] >= nb_ports) {
671                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
672                                 ports[portid], (nb_ports - 1));
673                         ports[portid] = INVALID_PORT_ID;
674                         valid_num_ports--;
675                 }
676         }
677         return valid_num_ports;
678 }
679
680 static __rte_always_inline struct vhost_dev *
681 find_vhost_dev(struct ether_addr *mac)
682 {
683         struct vhost_dev *vdev;
684
685         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
686                 if (vdev->ready == DEVICE_RX &&
687                     is_same_ether_addr(mac, &vdev->mac_address))
688                         return vdev;
689         }
690
691         return NULL;
692 }
693
694 /*
695  * This function learns the MAC address of the device and registers it, along
696  * with a VLAN tag, to a VMDQ pool.
697  */
698 static int
699 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
700 {
701         struct ether_hdr *pkt_hdr;
702         int i, ret;
703
704         /* Learn MAC address of guest device from packet */
705         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
706
707         if (find_vhost_dev(&pkt_hdr->s_addr)) {
708                 RTE_LOG(ERR, VHOST_DATA,
709                         "(%d) device is using a registered MAC!\n",
710                         vdev->vid);
711                 return -1;
712         }
713
714         for (i = 0; i < ETHER_ADDR_LEN; i++)
715                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
716
717         /* vlan_tag currently uses the device_id. */
718         vdev->vlan_tag = vlan_tags[vdev->vid];
719
720         /* Print out VMDQ registration info. */
721         RTE_LOG(INFO, VHOST_DATA,
722                 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
723                 vdev->vid,
724                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
725                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
726                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
727                 vdev->vlan_tag);
728
729         /* Register the MAC address. */
730         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
731                                 (uint32_t)vdev->vid + vmdq_pool_base);
732         if (ret)
733                 RTE_LOG(ERR, VHOST_DATA,
734                         "(%d) failed to add device MAC address to VMDQ\n",
735                         vdev->vid);
736
737         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
738
739         /* Set device as ready for RX. */
740         vdev->ready = DEVICE_RX;
741
742         return 0;
743 }
744
745 /*
746  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
747  * queue before disabling RX on the device.
748  */
749 static inline void
750 unlink_vmdq(struct vhost_dev *vdev)
751 {
752         unsigned i = 0;
753         unsigned rx_count;
754         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
755
756         if (vdev->ready == DEVICE_RX) {
757                 /*clear MAC and VLAN settings*/
758                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
759                 for (i = 0; i < 6; i++)
760                         vdev->mac_address.addr_bytes[i] = 0;
761
762                 vdev->vlan_tag = 0;
763
764                 /*Clear out the receive buffers*/
765                 rx_count = rte_eth_rx_burst(ports[0],
766                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
767
768                 while (rx_count) {
769                         for (i = 0; i < rx_count; i++)
770                                 rte_pktmbuf_free(pkts_burst[i]);
771
772                         rx_count = rte_eth_rx_burst(ports[0],
773                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
774                 }
775
776                 vdev->ready = DEVICE_MAC_LEARNING;
777         }
778 }
779
780 static __rte_always_inline void
781 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
782             struct rte_mbuf *m)
783 {
784         uint16_t ret;
785
786         if (builtin_net_driver) {
787                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
788         } else {
789                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
790         }
791
792         if (enable_stats) {
793                 rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
794                 rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
795                 src_vdev->stats.tx_total++;
796                 src_vdev->stats.tx += ret;
797         }
798 }
799
800 /*
801  * Check if the packet destination MAC address is for a local device. If so then put
802  * the packet on that device's RX queue. If not then return.
803  */
804 static __rte_always_inline int
805 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
806 {
807         struct ether_hdr *pkt_hdr;
808         struct vhost_dev *dst_vdev;
809
810         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
811
812         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
813         if (!dst_vdev)
814                 return -1;
815
816         if (vdev->vid == dst_vdev->vid) {
817                 RTE_LOG_DP(DEBUG, VHOST_DATA,
818                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
819                         vdev->vid);
820                 return 0;
821         }
822
823         RTE_LOG_DP(DEBUG, VHOST_DATA,
824                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
825
826         if (unlikely(dst_vdev->remove)) {
827                 RTE_LOG_DP(DEBUG, VHOST_DATA,
828                         "(%d) device is marked for removal\n", dst_vdev->vid);
829                 return 0;
830         }
831
832         virtio_xmit(dst_vdev, vdev, m);
833         return 0;
834 }
835
836 /*
837  * Check if the destination MAC of a packet is one local VM,
838  * and get its vlan tag, and offset if it is.
839  */
840 static __rte_always_inline int
841 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
842         uint32_t *offset, uint16_t *vlan_tag)
843 {
844         struct vhost_dev *dst_vdev;
845         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
846
847         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
848         if (!dst_vdev)
849                 return 0;
850
851         if (vdev->vid == dst_vdev->vid) {
852                 RTE_LOG_DP(DEBUG, VHOST_DATA,
853                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
854                         vdev->vid);
855                 return -1;
856         }
857
858         /*
859          * HW VLAN strip reduces the packet length by the
860          * length of the VLAN tag, so the packet length
861          * needs to be restored by adding it back.
862          */
863         *offset  = VLAN_HLEN;
864         *vlan_tag = vlan_tags[vdev->vid];
865
866         RTE_LOG_DP(DEBUG, VHOST_DATA,
867                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
868                 vdev->vid, dst_vdev->vid, *vlan_tag);
869
870         return 0;
871 }
872
873 static uint16_t
874 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
875 {
876         if (ol_flags & PKT_TX_IPV4)
877                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
878         else /* assume ethertype == ETHER_TYPE_IPv6 */
879                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
880 }
881
882 static void virtio_tx_offload(struct rte_mbuf *m)
883 {
884         void *l3_hdr;
885         struct ipv4_hdr *ipv4_hdr = NULL;
886         struct tcp_hdr *tcp_hdr = NULL;
887         struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
888
889         l3_hdr = (char *)eth_hdr + m->l2_len;
890
891         if (m->ol_flags & PKT_TX_IPV4) {
892                 ipv4_hdr = l3_hdr;
893                 ipv4_hdr->hdr_checksum = 0;
894                 m->ol_flags |= PKT_TX_IP_CKSUM;
895         }
896
897         tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
898         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
899 }
900
901 static inline void
902 free_pkts(struct rte_mbuf **pkts, uint16_t n)
903 {
904         while (n--)
905                 rte_pktmbuf_free(pkts[n]);
906 }
907
908 static __rte_always_inline void
909 do_drain_mbuf_table(struct mbuf_table *tx_q)
910 {
911         uint16_t count;
912
913         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
914                                  tx_q->m_table, tx_q->len);
915         if (unlikely(count < tx_q->len))
916                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
917
918         tx_q->len = 0;
919 }
920
921 /*
922  * This function routes the TX packet to the correct interface. This
923  * may be a local device or the physical port.
924  */
925 static __rte_always_inline void
926 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
927 {
928         struct mbuf_table *tx_q;
929         unsigned offset = 0;
930         const uint16_t lcore_id = rte_lcore_id();
931         struct ether_hdr *nh;
932
933
934         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
935         if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
936                 struct vhost_dev *vdev2;
937
938                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
939                         if (vdev2 != vdev)
940                                 virtio_xmit(vdev2, vdev, m);
941                 }
942                 goto queue2nic;
943         }
944
945         /*check if destination is local VM*/
946         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
947                 rte_pktmbuf_free(m);
948                 return;
949         }
950
951         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
952                 if (unlikely(find_local_dest(vdev, m, &offset,
953                                              &vlan_tag) != 0)) {
954                         rte_pktmbuf_free(m);
955                         return;
956                 }
957         }
958
959         RTE_LOG_DP(DEBUG, VHOST_DATA,
960                 "(%d) TX: MAC address is external\n", vdev->vid);
961
962 queue2nic:
963
964         /*Add packet to the port tx queue*/
965         tx_q = &lcore_tx_queue[lcore_id];
966
967         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
968         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
969                 /* Guest has inserted the vlan tag. */
970                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
971                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
972                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
973                         (vh->vlan_tci != vlan_tag_be))
974                         vh->vlan_tci = vlan_tag_be;
975         } else {
976                 m->ol_flags |= PKT_TX_VLAN_PKT;
977
978                 /*
979                  * Find the right seg to adjust the data len when offset is
980                  * bigger than tail room size.
981                  */
982                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
983                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
984                                 m->data_len += offset;
985                         else {
986                                 struct rte_mbuf *seg = m;
987
988                                 while ((seg->next != NULL) &&
989                                         (offset > rte_pktmbuf_tailroom(seg)))
990                                         seg = seg->next;
991
992                                 seg->data_len += offset;
993                         }
994                         m->pkt_len += offset;
995                 }
996
997                 m->vlan_tci = vlan_tag;
998         }
999
1000         if (m->ol_flags & PKT_TX_TCP_SEG)
1001                 virtio_tx_offload(m);
1002
1003         tx_q->m_table[tx_q->len++] = m;
1004         if (enable_stats) {
1005                 vdev->stats.tx_total++;
1006                 vdev->stats.tx++;
1007         }
1008
1009         if (unlikely(tx_q->len == MAX_PKT_BURST))
1010                 do_drain_mbuf_table(tx_q);
1011 }
1012
1013
1014 static __rte_always_inline void
1015 drain_mbuf_table(struct mbuf_table *tx_q)
1016 {
1017         static uint64_t prev_tsc;
1018         uint64_t cur_tsc;
1019
1020         if (tx_q->len == 0)
1021                 return;
1022
1023         cur_tsc = rte_rdtsc();
1024         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1025                 prev_tsc = cur_tsc;
1026
1027                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1028                         "TX queue drained after timeout with burst size %u\n",
1029                         tx_q->len);
1030                 do_drain_mbuf_table(tx_q);
1031         }
1032 }
1033
1034 static __rte_always_inline void
1035 drain_eth_rx(struct vhost_dev *vdev)
1036 {
1037         uint16_t rx_count, enqueue_count;
1038         struct rte_mbuf *pkts[MAX_PKT_BURST];
1039
1040         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1041                                     pkts, MAX_PKT_BURST);
1042         if (!rx_count)
1043                 return;
1044
1045         /*
1046          * When "enable_retry" is set, here we wait and retry when there
1047          * are not enough free slots in the queue to hold @rx_count packets,
1048          * which helps to diminish packet loss.
1049          */
1050         if (enable_retry &&
1051             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1052                         VIRTIO_RXQ))) {
1053                 uint32_t retry;
1054
1055                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1056                         rte_delay_us(burst_rx_delay_time);
1057                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1058                                         VIRTIO_RXQ))
1059                                 break;
1060                 }
1061         }
1062
1063         if (builtin_net_driver) {
1064                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1065                                                 pkts, rx_count);
1066         } else {
1067                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1068                                                 pkts, rx_count);
1069         }
1070         if (enable_stats) {
1071                 rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1072                 rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1073         }
1074
1075         free_pkts(pkts, rx_count);
1076 }
1077
1078 static __rte_always_inline void
1079 drain_virtio_tx(struct vhost_dev *vdev)
1080 {
1081         struct rte_mbuf *pkts[MAX_PKT_BURST];
1082         uint16_t count;
1083         uint16_t i;
1084
1085         if (builtin_net_driver) {
1086                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1087                                         pkts, MAX_PKT_BURST);
1088         } else {
1089                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1090                                         mbuf_pool, pkts, MAX_PKT_BURST);
1091         }
1092
1093         /* setup VMDq for the first packet */
1094         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1095                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1096                         free_pkts(pkts, count);
1097         }
1098
1099         for (i = 0; i < count; ++i)
1100                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1101 }
1102
1103 /*
1104  * Main function of vhost-switch. It basically does:
1105  *
1106  * for each vhost device {
1107  *    - drain_eth_rx()
1108  *
1109  *      Which drains the host eth Rx queue linked to the vhost device
1110  *      and delivers all of the packets to the guest virtio Rx ring
1111  *      associated with this vhost device.
1112  *
1113  *    - drain_virtio_tx()
1114  *
1115  *      Which drains the guest virtio Tx queue and delivers all of them
1116  *      to the target, which could be another vhost device, or the
1117  *      physical eth dev. The route is done in function "virtio_tx_route".
1118  * }
1119  */
1120 static int
1121 switch_worker(void *arg __rte_unused)
1122 {
1123         unsigned i;
1124         unsigned lcore_id = rte_lcore_id();
1125         struct vhost_dev *vdev;
1126         struct mbuf_table *tx_q;
1127
1128         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1129
1130         tx_q = &lcore_tx_queue[lcore_id];
1131         for (i = 0; i < rte_lcore_count(); i++) {
1132                 if (lcore_ids[i] == lcore_id) {
1133                         tx_q->txq_id = i;
1134                         break;
1135                 }
1136         }
1137
1138         while(1) {
1139                 drain_mbuf_table(tx_q);
1140
1141                 /*
1142                  * Inform the configuration core that we have exited the
1143                  * linked list and that no devices are in use if requested.
1144                  */
1145                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1146                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1147
1148                 /*
1149                  * Process vhost devices
1150                  */
1151                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1152                               lcore_vdev_entry) {
1153                         if (unlikely(vdev->remove)) {
1154                                 unlink_vmdq(vdev);
1155                                 vdev->ready = DEVICE_SAFE_REMOVE;
1156                                 continue;
1157                         }
1158
1159                         if (likely(vdev->ready == DEVICE_RX))
1160                                 drain_eth_rx(vdev);
1161
1162                         if (likely(!vdev->remove))
1163                                 drain_virtio_tx(vdev);
1164                 }
1165         }
1166
1167         return 0;
1168 }
1169
1170 /*
1171  * Remove a device from the specific data core linked list and from the
1172  * main linked list. Synchronization occurs through the use of the
1173  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1174  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1175  */
1176 static void
1177 destroy_device(int vid)
1178 {
1179         struct vhost_dev *vdev = NULL;
1180         int lcore;
1181
1182         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1183                 if (vdev->vid == vid)
1184                         break;
1185         }
1186         if (!vdev)
1187                 return;
1188         /*set the remove flag. */
1189         vdev->remove = 1;
1190         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1191                 rte_pause();
1192         }
1193
1194         if (builtin_net_driver)
1195                 vs_vhost_net_remove(vdev);
1196
1197         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1198                      lcore_vdev_entry);
1199         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1200
1201
1202         /* Set the dev_removal_flag on each lcore. */
1203         RTE_LCORE_FOREACH_SLAVE(lcore)
1204                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1205
1206         /*
1207          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1208          * we can be sure that they can no longer access the device removed
1209          * from the linked lists and that the devices are no longer in use.
1210          */
1211         RTE_LCORE_FOREACH_SLAVE(lcore) {
1212                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1213                         rte_pause();
1214         }
1215
1216         lcore_info[vdev->coreid].device_num--;
1217
1218         RTE_LOG(INFO, VHOST_DATA,
1219                 "(%d) device has been removed from data core\n",
1220                 vdev->vid);
1221
1222         rte_free(vdev);
1223 }
1224
1225 /*
1226  * A new device is added to a data core. First the device is added to the main linked list
1227  * and then allocated to a specific data core.
1228  */
1229 static int
1230 new_device(int vid)
1231 {
1232         int lcore, core_add = 0;
1233         uint32_t device_num_min = num_devices;
1234         struct vhost_dev *vdev;
1235
1236         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1237         if (vdev == NULL) {
1238                 RTE_LOG(INFO, VHOST_DATA,
1239                         "(%d) couldn't allocate memory for vhost dev\n",
1240                         vid);
1241                 return -1;
1242         }
1243         vdev->vid = vid;
1244
1245         if (builtin_net_driver)
1246                 vs_vhost_net_setup(vdev);
1247
1248         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1249         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1250
1251         /*reset ready flag*/
1252         vdev->ready = DEVICE_MAC_LEARNING;
1253         vdev->remove = 0;
1254
1255         /* Find a suitable lcore to add the device. */
1256         RTE_LCORE_FOREACH_SLAVE(lcore) {
1257                 if (lcore_info[lcore].device_num < device_num_min) {
1258                         device_num_min = lcore_info[lcore].device_num;
1259                         core_add = lcore;
1260                 }
1261         }
1262         vdev->coreid = core_add;
1263
1264         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1265                           lcore_vdev_entry);
1266         lcore_info[vdev->coreid].device_num++;
1267
1268         /* Disable notifications. */
1269         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1270         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1271
1272         RTE_LOG(INFO, VHOST_DATA,
1273                 "(%d) device has been added to data core %d\n",
1274                 vid, vdev->coreid);
1275
1276         return 0;
1277 }
1278
1279 /*
1280  * These callbacks allow devices to be added to the data core when
1281  * configuration is fully complete.
1282  */
1283 static const struct vhost_device_ops virtio_net_device_ops =
1284 {
1285         .new_device =  new_device,
1286         .destroy_device = destroy_device,
1287 };
1288
1289 /*
1290  * This thread wakes up periodically to print stats if the user has
1291  * enabled them.
1292  */
1293 static void
1294 print_stats(void)
1295 {
1296         struct vhost_dev *vdev;
1297         uint64_t tx_dropped, rx_dropped;
1298         uint64_t tx, tx_total, rx, rx_total;
1299         const char clr[] = { 27, '[', '2', 'J', '\0' };
1300         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1301
1302         while(1) {
1303                 sleep(enable_stats);
1304
1305                 /* Clear screen and move to top left */
1306                 printf("%s%s\n", clr, top_left);
1307                 printf("Device statistics =================================\n");
1308
1309                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1310                         tx_total   = vdev->stats.tx_total;
1311                         tx         = vdev->stats.tx;
1312                         tx_dropped = tx_total - tx;
1313
1314                         rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1315                         rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1316                         rx_dropped = rx_total - rx;
1317
1318                         printf("Statistics for device %d\n"
1319                                 "-----------------------\n"
1320                                 "TX total:              %" PRIu64 "\n"
1321                                 "TX dropped:            %" PRIu64 "\n"
1322                                 "TX successful:         %" PRIu64 "\n"
1323                                 "RX total:              %" PRIu64 "\n"
1324                                 "RX dropped:            %" PRIu64 "\n"
1325                                 "RX successful:         %" PRIu64 "\n",
1326                                 vdev->vid,
1327                                 tx_total, tx_dropped, tx,
1328                                 rx_total, rx_dropped, rx);
1329                 }
1330
1331                 printf("===================================================\n");
1332         }
1333 }
1334
1335 static void
1336 unregister_drivers(int socket_num)
1337 {
1338         int i, ret;
1339
1340         for (i = 0; i < socket_num; i++) {
1341                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1342                 if (ret != 0)
1343                         RTE_LOG(ERR, VHOST_CONFIG,
1344                                 "Failed to unregister vhost driver for %s.\n",
1345                                 socket_files + i * PATH_MAX);
1346         }
1347 }
1348
1349 /* When we receive an INT signal, unregister the vhost driver */
1350 static void
1351 sigint_handler(__rte_unused int signum)
1352 {
1353         /* Unregister vhost driver. */
1354         unregister_drivers(nb_sockets);
1355
1356         exit(0);
1357 }
1358
1359 /*
1360  * While creating an mbuf pool, one key thing is to figure out how
1361  * many mbuf entries are enough for our use. FYI, here are some
1362  * guidelines:
1363  *
1364  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1365  *
1366  * - For each switch core (a CPU core that does the packet switching), we
1367  *   also need to reserve some mbufs for receiving the packets from the
1368  *   virtio Tx queue. How many are enough depends on the usage. It's
1369  *   normally a simple calculation like the following:
1370  *
1371  *       MAX_PKT_BURST * max packet size / mbuf size
1372  *
1373  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1374  *
1375  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1376  *   mbufs for receiving the packets from the physical NIC device.
1377  *
1378  * - We also need to make sure that, for each switch core, we have
1379  *   allocated enough mbufs to fill up the mbuf cache.
1380  */
1381 static void
1382 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1383         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1384 {
1385         uint32_t nr_mbufs;
1386         uint32_t nr_mbufs_per_core;
1387         uint32_t mtu = 1500;
1388
1389         if (mergeable)
1390                 mtu = 9000;
1391         if (enable_tso)
1392                 mtu = 64 * 1024;
1393
1394         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1395                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1396         nr_mbufs_per_core += nr_rx_desc;
1397         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1398
1399         nr_mbufs  = nr_queues * nr_rx_desc;
1400         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1401         nr_mbufs *= nr_port;
1402
1403         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1404                                             nr_mbuf_cache, 0, mbuf_size,
1405                                             rte_socket_id());
1406         if (mbuf_pool == NULL)
1407                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1408 }
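/*
 * Rough worked example, assuming the defaults above and typical DPDK values
 * (mtu = 1500, mbuf_size = RTE_MBUF_DEFAULT_BUF_SIZE = 2176,
 * RTE_PKTMBUF_HEADROOM = 128, a burst of 32 and nr_rx_desc = 1024): each
 * switch core then needs about (1500 + 2176) * 32 / (2176 - 128) + 1024
 * ~= 1081 mbufs, on top of the nr_queues * nr_rx_desc mbufs reserved for
 * the NIC RX queues.
 */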
1409
1410 /*
1411  * Main function, does initialisation and calls the per-lcore functions.
1412  */
1413 int
1414 main(int argc, char *argv[])
1415 {
1416         unsigned lcore_id, core_id = 0;
1417         unsigned nb_ports, valid_num_ports;
1418         int ret, i;
1419         uint16_t portid;
1420         static pthread_t tid;
1421         char thread_name[RTE_MAX_THREAD_NAME_LEN];
1422         uint64_t flags = 0;
1423
1424         signal(SIGINT, sigint_handler);
1425
1426         /* init EAL */
1427         ret = rte_eal_init(argc, argv);
1428         if (ret < 0)
1429                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1430         argc -= ret;
1431         argv += ret;
1432
1433         /* parse app arguments */
1434         ret = us_vhost_parse_args(argc, argv);
1435         if (ret < 0)
1436                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1437
1438         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1439                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1440
1441                 if (rte_lcore_is_enabled(lcore_id))
1442                         lcore_ids[core_id++] = lcore_id;
1443         }
1444
1445         if (rte_lcore_count() > RTE_MAX_LCORE)
1446                 rte_exit(EXIT_FAILURE,"Not enough cores\n");
1447
1448         /* Get the number of physical ports. */
1449         nb_ports = rte_eth_dev_count();
1450
1451         /*
1452          * Update the global variable num_ports and the global ports array,
1453          * and get the number of valid ports according to the system port count.
1454          */
1455         valid_num_ports = check_ports_num(nb_ports);
1456
1457         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1458                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1459                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1460                 return -1;
1461         }
1462
1463         /*
1464          * FIXME: here we are trying to allocate mbufs big enough for
1465          * @MAX_QUEUES, but the truth is we're never going to use that
1466          * many queues here. We probably should only do allocation for
1467          * those queues we are going to use.
1468          */
1469         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1470                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1471
1472         if (vm2vm_mode == VM2VM_HARDWARE) {
1473                 /* Enable VT loop back to let L2 switch to do it. */
1474                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1475                 RTE_LOG(DEBUG, VHOST_CONFIG,
1476                         "Enable loop back for L2 switch in vmdq.\n");
1477         }
1478
1479         /* initialize all ports */
1480         for (portid = 0; portid < nb_ports; portid++) {
1481                 /* skip ports that are not enabled */
1482                 if ((enabled_port_mask & (1 << portid)) == 0) {
1483                         RTE_LOG(INFO, VHOST_PORT,
1484                                 "Skipping disabled port %d\n", portid);
1485                         continue;
1486                 }
1487                 if (port_init(portid) != 0)
1488                         rte_exit(EXIT_FAILURE,
1489                                 "Cannot initialize network ports\n");
1490         }
1491
1492         /* Enable stats if the user option is set. */
1493         if (enable_stats) {
1494                 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1495                 if (ret != 0)
1496                         rte_exit(EXIT_FAILURE,
1497                                 "Cannot create print-stats thread\n");
1498
1499                 /* Set thread_name for aid in debugging.  */
1500                 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1501                 ret = rte_thread_setname(tid, thread_name);
1502                 if (ret != 0)
1503                         RTE_LOG(DEBUG, VHOST_CONFIG,
1504                                 "Cannot set print-stats name\n");
1505         }
1506
1507         /* Launch all data cores. */
1508         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1509                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1510
1511         if (client_mode)
1512                 flags |= RTE_VHOST_USER_CLIENT;
1513
1514         if (dequeue_zero_copy)
1515                 flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1516
1517         /* Register vhost user driver to handle vhost messages. */
1518         for (i = 0; i < nb_sockets; i++) {
1519                 char *file = socket_files + i * PATH_MAX;
1520                 ret = rte_vhost_driver_register(file, flags);
1521                 if (ret != 0) {
1522                         unregister_drivers(i);
1523                         rte_exit(EXIT_FAILURE,
1524                                 "vhost driver register failure.\n");
1525                 }
1526
1527                 if (builtin_net_driver)
1528                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1529
1530                 if (mergeable == 0) {
1531                         rte_vhost_driver_disable_features(file,
1532                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1533                 }
1534
1535                 if (enable_tx_csum == 0) {
1536                         rte_vhost_driver_disable_features(file,
1537                                 1ULL << VIRTIO_NET_F_CSUM);
1538                 }
1539
1540                 if (enable_tso == 0) {
1541                         rte_vhost_driver_disable_features(file,
1542                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1543                         rte_vhost_driver_disable_features(file,
1544                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1545                         rte_vhost_driver_disable_features(file,
1546                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1547                         rte_vhost_driver_disable_features(file,
1548                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1549                 }
1550
1551                 if (promiscuous) {
1552                         rte_vhost_driver_enable_features(file,
1553                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1554                 }
1555
1556                 ret = rte_vhost_driver_callback_register(file,
1557                         &virtio_net_device_ops);
1558                 if (ret != 0) {
1559                         rte_exit(EXIT_FAILURE,
1560                                 "failed to register vhost driver callbacks.\n");
1561                 }
1562
1563                 if (rte_vhost_driver_start(file) < 0) {
1564                         rte_exit(EXIT_FAILURE,
1565                                 "failed to start vhost driver.\n");
1566                 }
1567         }
1568
1569         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1570                 rte_eal_wait_lcore(lcore_id);
1571
1572         return 0;
1573
1574 }