/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ip.h>
#include <rte_udp.h>
#include <rte_tcp.h>

#include "main.h"
#include "rte_virtio_net.h"
#include "vxlan.h"
#include "vxlan_setup.h"

#define IPV4_HEADER_LEN 20
#define UDP_HEADER_LEN  8
#define VXLAN_HEADER_LEN 8

#define IP_VERSION 0x40
#define IP_HDRLEN  0x05 /* default IP header length == five 32-bit words. */
#define IP_DEFTTL  64   /* from RFC 1340. */
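/* 0x45: IPv4, five 32-bit words, i.e. the default 20-byte header. */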
#define IP_VHL_DEF (IP_VERSION | IP_HDRLEN)

#define IP_DN_FRAGMENT_FLAG 0x0040
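/* (0x0040 is the IPv4 "Don't Fragment" bit 0x4000 in network byte order.) */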

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/* Default inner VLAN ID */
#define INNER_VLAN_ID 100

/* VXLAN device */
struct vxlan_conf vxdev;

struct ipv4_hdr app_ip_hdr[VXLAN_N_PORTS];
struct ether_hdr app_l2_hdr[VXLAN_N_PORTS];

/* Local VTEP IP address */
uint8_t vxlan_multicast_ips[2][4] = { {239, 1, 1, 1 }, {239, 1, 2, 1 } };

/* Remote VTEP IP address */
uint8_t vxlan_overlay_ips[2][4] = { {192, 168, 10, 1}, {192, 168, 30, 1} };

/* Remote VTEP MAC address */
uint8_t peer_mac[6] = {0x00, 0x11, 0x01, 0x00, 0x00, 0x01};

/* VXLAN RX filter type */
uint8_t tep_filter_type[] = {RTE_TUNNEL_FILTER_IMAC_TENID,
			RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID,
			RTE_TUNNEL_FILTER_OMAC_TENID_IMAC,};
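/* The runtime filter_idx (selected via a command-line option; see main.c)
 * indexes this table.
 */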

/* Options for configuring ethernet port */
static const struct rte_eth_conf port_conf = {
	.rxmode = {
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
	},
	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
};

/**
 * One or two devices that belong to the same tenant ID can be
 * assigned to a VM.
 */
const uint16_t tenant_id_conf[] = {
	1000, 1000, 1001, 1001, 1002, 1002, 1003, 1003,
	1004, 1004, 1005, 1005, 1006, 1006, 1007, 1007,
	1008, 1008, 1009, 1009, 1010, 1010, 1011, 1011,
	1012, 1012, 1013, 1013, 1014, 1014, 1015, 1015,
	1016, 1016, 1017, 1017, 1018, 1018, 1019, 1019,
	1020, 1020, 1021, 1021, 1022, 1022, 1023, 1023,
	1024, 1024, 1025, 1025, 1026, 1026, 1027, 1027,
	1028, 1028, 1029, 1029, 1030, 1030, 1031, 1031,
};
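/* Indexed by the device RX queue, so e.g. devices on queues 0 and 1 both
 * get tenant ID (VNI) 1000; see the tenant_id_conf[vdev->rx_q] uses below.
 */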

/**
 * Initialises a given port using global settings, with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
int
vxlan_port_init(uint8_t port, struct rte_mempool *mbuf_pool)
{
	int retval;
	uint16_t q;
	struct rte_eth_dev_info dev_info;
	uint16_t rx_rings, tx_rings = (uint16_t)rte_lcore_count();
	const uint16_t rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	const uint16_t tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
	struct rte_eth_udp_tunnel tunnel_udp;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	struct vxlan_conf *pconf = &vxdev;

	pconf->dst_port = udp_port;

	if (port >= rte_eth_dev_count())
		return -1;

	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES to be no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
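	/* Clear the default txq_flags so that no TX offloads are disabled. */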
	txconf->txq_flags = 0;

	rx_rings = nb_devices;

	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0)
		return retval;

	/* Configure UDP port for UDP tunneling */
	tunnel_udp.udp_port = udp_port;
	tunnel_udp.prot_type = RTE_TUNNEL_TYPE_VXLAN;
	retval = rte_eth_dev_udp_tunnel_port_add(port, &tunnel_udp);
	if (retval < 0)
		return retval;
	rte_eth_macaddr_get(port, &ports_eth_addr[port]);
	RTE_LOG(INFO, PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			ports_eth_addr[port].addr_bytes[0],
			ports_eth_addr[port].addr_bytes[1],
			ports_eth_addr[port].addr_bytes[2],
			ports_eth_addr[port].addr_bytes[3],
			ports_eth_addr[port].addr_bytes[4],
			ports_eth_addr[port].addr_bytes[5]);

	if (tso_segsz != 0) {
		rte_eth_dev_info_get(port, &dev_info);
		if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) == 0)
			RTE_LOG(WARNING, PORT,
				"hardware TSO offload is not supported\n");
	}
	return 0;
}
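
/*
 * Usage sketch (illustrative only; the pool name and sizing below are
 * assumptions, not the values used by this example's main.c):
 *
 *	struct rte_mempool *pool = rte_pktmbuf_pool_create("MBUF_POOL",
 *			8192, 256, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
 *			rte_socket_id());
 *	if (pool == NULL || vxlan_port_init(0, pool) != 0)
 *		rte_exit(EXIT_FAILURE, "Cannot init port 0\n");
 */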

static int
vxlan_rx_process(struct rte_mbuf *pkt)
{
	int ret = 0;

	if (rx_decap)
		ret = decapsulation(pkt);

	return ret;
}

static void
vxlan_tx_process(uint8_t queue_id, struct rte_mbuf *pkt)
{
	if (tx_encap)
		encapsulation(pkt, queue_id);
}

/*
 * Learns the MAC address of the device from its first packet and
 * initialises the L2 and L3 header templates used for encapsulation.
 */
int
vxlan_link(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	int i, ret;
	struct ether_hdr *pkt_hdr;
	uint64_t portid = vdev->vid;
	struct ipv4_hdr *ip;

	struct rte_eth_tunnel_filter_conf tunnel_filter_conf;

	if (unlikely(portid >= VXLAN_N_PORTS)) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) WARNING: Not configuring device, "
			"as %d ports are already used for VXLAN.\n",
			vdev->vid, VXLAN_N_PORTS);
		return -1;
	}

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (is_same_ether_addr(&(pkt_hdr->s_addr), &vdev->mac_address)) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) WARNING: This device is using an existing"
			" MAC address and has not been registered.\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++) {
		vdev->mac_address.addr_bytes[i] =
			vxdev.port[portid].vport_mac.addr_bytes[i] =
			pkt_hdr->s_addr.addr_bytes[i];
		vxdev.port[portid].peer_mac.addr_bytes[i] = peer_mac[i];
	}

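	/* Register a tunnel (cloud) filter so that traffic for this inner
	 * MAC and VNI is steered to this device's RX queue.
	 */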
	memset(&tunnel_filter_conf, 0,
		sizeof(struct rte_eth_tunnel_filter_conf));

	ether_addr_copy(&ports_eth_addr[0], &tunnel_filter_conf.outer_mac);
	tunnel_filter_conf.filter_type = tep_filter_type[filter_idx];

	/* inner MAC */
	ether_addr_copy(&vdev->mac_address, &tunnel_filter_conf.inner_mac);

	tunnel_filter_conf.queue_id = vdev->rx_q;
	tunnel_filter_conf.tenant_id = tenant_id_conf[vdev->rx_q];

	if (tep_filter_type[filter_idx] == RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID)
		tunnel_filter_conf.inner_vlan = INNER_VLAN_ID;

	tunnel_filter_conf.tunnel_type = RTE_TUNNEL_TYPE_VXLAN;

	ret = rte_eth_dev_filter_ctrl(ports[0],
		RTE_ETH_FILTER_TUNNEL,
		RTE_ETH_FILTER_ADD,
		&tunnel_filter_conf);
	if (ret) {
		RTE_LOG(ERR, VHOST_DATA,
			"%d Failed to add device MAC address to cloud filter\n",
			vdev->rx_q);
		return -1;
	}

	/* Print out inner MAC and VNI info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VNI %d registered\n",
		vdev->rx_q,
		vdev->mac_address.addr_bytes[0],
		vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2],
		vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4],
		vdev->mac_address.addr_bytes[5],
		tenant_id_conf[vdev->rx_q]);

	vxdev.port[portid].vport_id = portid;

	for (i = 0; i < 4; i++) {
		/* Local VTEP IP */
		vxdev.port_ip |= vxlan_multicast_ips[portid][i] << (8 * i);
		/* Remote VTEP IP */
		vxdev.port[portid].peer_ip |=
			vxlan_overlay_ips[portid][i] << (8 * i);
	}

	vxdev.out_key = tenant_id_conf[vdev->rx_q];
	ether_addr_copy(&vxdev.port[portid].peer_mac,
			&app_l2_hdr[portid].d_addr);
	ether_addr_copy(&ports_eth_addr[0],
			&app_l2_hdr[portid].s_addr);
	app_l2_hdr[portid].ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);

	ip = &app_ip_hdr[portid];
	ip->version_ihl = IP_VHL_DEF;
	ip->type_of_service = 0;
	ip->total_length = 0;
	ip->packet_id = 0;
	ip->fragment_offset = IP_DN_FRAGMENT_FLAG;
	ip->time_to_live = IP_DEFTTL;
	ip->next_proto_id = IPPROTO_UDP;
	ip->hdr_checksum = 0;
	ip->src_addr = vxdev.port_ip;
	ip->dst_addr = vxdev.port[portid].peer_ip;

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}
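
/*
 * After vxlan_link() returns, app_l2_hdr[port] holds the outer Ethernet
 * header (destination = remote VTEP MAC) and app_ip_hdr[port] the outer
 * IPv4 header (local VTEP -> remote VTEP, proto UDP); the UDP and VXLAN
 * headers are filled in per packet on the encapsulation path.
 */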

/**
 * Removes the cloud filter and ensures that nothing more is added to the
 * RX queue, draining any remaining buffers, before disabling RX on the
 * device.
 */
void
vxlan_unlink(struct vhost_dev *vdev)
{
	unsigned i = 0, rx_count;
	int ret;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct rte_eth_tunnel_filter_conf tunnel_filter_conf;

	if (vdev->ready == DEVICE_RX) {
		memset(&tunnel_filter_conf, 0,
			sizeof(struct rte_eth_tunnel_filter_conf));

		ether_addr_copy(&ports_eth_addr[0],
				&tunnel_filter_conf.outer_mac);
		ether_addr_copy(&vdev->mac_address,
				&tunnel_filter_conf.inner_mac);
		tunnel_filter_conf.tenant_id = tenant_id_conf[vdev->rx_q];
		tunnel_filter_conf.filter_type = tep_filter_type[filter_idx];

		if (tep_filter_type[filter_idx] ==
			RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID)
			tunnel_filter_conf.inner_vlan = INNER_VLAN_ID;

		tunnel_filter_conf.queue_id = vdev->rx_q;
		tunnel_filter_conf.tunnel_type = RTE_TUNNEL_TYPE_VXLAN;

		ret = rte_eth_dev_filter_ctrl(ports[0],
				RTE_ETH_FILTER_TUNNEL,
				RTE_ETH_FILTER_DELETE,
				&tunnel_filter_conf);
		if (ret) {
			RTE_LOG(ERR, VHOST_DATA,
				"%d Failed to remove device MAC address from cloud filter\n",
				vdev->rx_q);
			return;
		}
		for (i = 0; i < ETHER_ADDR_LEN; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->rx_q,
				pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->rx_q,
					pkts_burst, MAX_PKT_BURST);
		}
		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

/*
 * Encapsulates packets as needed and transmits them on the given queue.
 * Returns the number of packets actually sent; as with rte_eth_tx_burst(),
 * any unsent mbufs remain owned by the caller.
 */
int
vxlan_tx_pkts(uint8_t port_id, uint16_t queue_id,
		struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	int ret = 0;
	uint16_t i;

	for (i = 0; i < nb_pkts; i++)
		vxlan_tx_process(queue_id, tx_pkts[i]);

	ret = rte_eth_tx_burst(port_id, queue_id, tx_pkts, nb_pkts);

	return ret;
}

/* Decapsulates packets when RX decapsulation is enabled, then passes them
 * to the virtio device.
 */
int
vxlan_rx_pkts(int vid, struct rte_mbuf **pkts_burst, uint32_t rx_count)
{
	uint32_t i = 0;
	uint32_t count = 0;
	int ret;
	struct rte_mbuf *pkts_valid[rx_count];

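	/* Update checksum-error statistics, then decapsulate; packets that
	 * fail decapsulation are skipped, and mbuf ownership stays with the
	 * caller.
	 */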
	for (i = 0; i < rx_count; i++) {
		if (enable_stats) {
			rte_atomic64_add(
				&dev_statistics[vid].rx_bad_ip_csum,
				(pkts_burst[i]->ol_flags & PKT_RX_IP_CKSUM_BAD)
				!= 0);
			rte_atomic64_add(
				&dev_statistics[vid].rx_bad_l4_csum,
				(pkts_burst[i]->ol_flags & PKT_RX_L4_CKSUM_BAD)
				!= 0);
		}
		ret = vxlan_rx_process(pkts_burst[i]);
		if (unlikely(ret < 0))
			continue;

		pkts_valid[count] = pkts_burst[i];
		count++;
	}

	ret = rte_vhost_enqueue_burst(vid, VIRTIO_RXQ, pkts_valid, count);
	return ret;
}
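
/*
 * RX path sketch (illustrative; the real receive loop lives in main.c):
 *
 *	struct rte_mbuf *pkts[MAX_PKT_BURST];
 *	uint16_t nb_rx = rte_eth_rx_burst(ports[0], rx_q,
 *			pkts, MAX_PKT_BURST);
 *	if (nb_rx)
 *		vxlan_rx_pkts(vid, pkts, nb_rx);
 *
 * rte_vhost_enqueue_burst() copies packet data into the guest, so the
 * caller remains responsible for freeing the mbufs in the burst.
 */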