/* examples/tep_termination/vxlan_setup.c (DPDK 17.11-rc3) */
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ip.h>
#include <rte_udp.h>
#include <rte_tcp.h>

#include "main.h"
#include "rte_vhost.h"
#include "vxlan.h"
#include "vxlan_setup.h"

#define IPV4_HEADER_LEN 20
#define UDP_HEADER_LEN  8
#define VXLAN_HEADER_LEN 8

#define IP_VERSION 0x40
#define IP_HDRLEN  0x05 /* default IP header length == five 32-bit words. */
#define IP_DEFTTL  64   /* from RFC 1340. */
#define IP_VHL_DEF (IP_VERSION | IP_HDRLEN)

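/*
 * IPv4 Don't Fragment flag (0x4000), pre-swapped to network byte order so
 * it can be assigned directly to the big-endian fragment_offset field on
 * little-endian hosts.
 */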
#define IP_DN_FRAGMENT_FLAG 0x0040

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/* Default inner VLAN ID */
#define INNER_VLAN_ID 100

/* VXLAN device */
struct vxlan_conf vxdev;

struct ipv4_hdr app_ip_hdr[VXLAN_N_PORTS];
struct ether_hdr app_l2_hdr[VXLAN_N_PORTS];

/* Local VTEP IP addresses (VXLAN multicast groups) */
uint8_t vxlan_multicast_ips[2][4] = { {239, 1, 1, 1}, {239, 1, 2, 1} };

/* Remote VTEP IP addresses */
uint8_t vxlan_overlay_ips[2][4] = { {192, 168, 10, 1}, {192, 168, 30, 1} };

/* Remote VTEP MAC address */
uint8_t peer_mac[6] = {0x00, 0x11, 0x01, 0x00, 0x00, 0x01};

/* VXLAN RX filter types; filter_idx selects one at runtime */
uint8_t tep_filter_type[] = {RTE_TUNNEL_FILTER_IMAC_TENID,
			RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID,
			RTE_TUNNEL_FILTER_OMAC_TENID_IMAC};

/* Options for configuring ethernet port */
static const struct rte_eth_conf port_conf = {
	.rxmode = {
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
	},
	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
};

/**
 * One or two devices that belong to the same tenant ID can be
 * assigned to a VM.
 */
const uint16_t tenant_id_conf[] = {
	1000, 1000, 1001, 1001, 1002, 1002, 1003, 1003,
	1004, 1004, 1005, 1005, 1006, 1006, 1007, 1007,
	1008, 1008, 1009, 1009, 1010, 1010, 1011, 1011,
	1012, 1012, 1013, 1013, 1014, 1014, 1015, 1015,
	1016, 1016, 1017, 1017, 1018, 1018, 1019, 1019,
	1020, 1020, 1021, 1021, 1022, 1022, 1023, 1023,
	1024, 1024, 1025, 1025, 1026, 1026, 1027, 1027,
	1028, 1028, 1029, 1029, 1030, 1030, 1031, 1031,
};

/**
 * Initialises a given port using global settings, with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
int
vxlan_port_init(uint16_t port, struct rte_mempool *mbuf_pool)
{
	int retval;
	uint16_t q;
	struct rte_eth_dev_info dev_info;
	uint16_t rx_rings, tx_rings = (uint16_t)rte_lcore_count();
	uint16_t rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	uint16_t tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
	struct rte_eth_udp_tunnel tunnel_udp;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	struct vxlan_conf *pconf = &vxdev;

	pconf->dst_port = udp_port;

	/* Validate the port number before querying the device. */
	if (port >= rte_eth_dev_count())
		return -1;

	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES as no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
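	/* Clearing txq_flags keeps the full-featured TX path, so multi-segment
	 * mbufs and checksum/TSO offloads stay available (driver defaults may
	 * disable them on this 17.11-era API).
	 */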
	txconf->txq_flags = 0;

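	/* One RX queue per vhost device; TX queues were sized to the lcore count. */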
	rx_rings = nb_devices;

	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
			&tx_ring_size);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0)
		return retval;

	/* Configure UDP port for UDP tunneling */
	tunnel_udp.udp_port = udp_port;
	tunnel_udp.prot_type = RTE_TUNNEL_TYPE_VXLAN;
	retval = rte_eth_dev_udp_tunnel_port_add(port, &tunnel_udp);
	if (retval < 0)
		return retval;
	rte_eth_macaddr_get(port, &ports_eth_addr[port]);
	RTE_LOG(INFO, PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			port,
			ports_eth_addr[port].addr_bytes[0],
			ports_eth_addr[port].addr_bytes[1],
			ports_eth_addr[port].addr_bytes[2],
			ports_eth_addr[port].addr_bytes[3],
			ports_eth_addr[port].addr_bytes[4],
			ports_eth_addr[port].addr_bytes[5]);

	if (tso_segsz != 0) {
		/* dev_info was already fetched above for this port; reuse it. */
		if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) == 0)
			RTE_LOG(WARNING, PORT,
				"hardware TSO offload is not supported\n");
	}
	return 0;
}

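/* Decapsulate the outer Ethernet/IP/UDP/VXLAN headers when RX decap is enabled. */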
static int
vxlan_rx_process(struct rte_mbuf *pkt)
{
	int ret = 0;

	if (rx_decap)
		ret = decapsulation(pkt);

	return ret;
}

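/* Encapsulate the packet with outer VXLAN headers when TX encap is enabled. */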
static void
vxlan_tx_process(uint8_t queue_id, struct rte_mbuf *pkt)
{
	if (tx_encap)
		encapsulation(pkt, queue_id);
}

/*
 * This function learns the MAC address of the device and sets up the
 * initial L2 and L3 header info.
 */
int
vxlan_link(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	int i, ret;
	struct ether_hdr *pkt_hdr;
	uint64_t portid = vdev->vid;
	struct ipv4_hdr *ip;

	struct rte_eth_tunnel_filter_conf tunnel_filter_conf;

	if (unlikely(portid >= VXLAN_N_PORTS)) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) WARNING: Not configuring device, "
			"as %d ports are already in use for VXLAN.\n",
			vdev->vid, VXLAN_N_PORTS);
		return -1;
	}

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (is_same_ether_addr(&(pkt_hdr->s_addr), &vdev->mac_address)) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) WARNING: This device is using an existing"
			" MAC address and has not been registered.\n",
			vdev->vid);
		return -1;
	}

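	/* Record the learned MAC for this vport and fill in the static peer MAC. */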
	for (i = 0; i < ETHER_ADDR_LEN; i++) {
		vdev->mac_address.addr_bytes[i] =
			vxdev.port[portid].vport_mac.addr_bytes[i] =
			pkt_hdr->s_addr.addr_bytes[i];
		vxdev.port[portid].peer_mac.addr_bytes[i] = peer_mac[i];
	}

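	/* Build a tunnel filter so the NIC steers this tenant's VXLAN traffic
	 * to the device's RX queue.
	 */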
	memset(&tunnel_filter_conf, 0,
		sizeof(struct rte_eth_tunnel_filter_conf));

	ether_addr_copy(&ports_eth_addr[0], &tunnel_filter_conf.outer_mac);
	tunnel_filter_conf.filter_type = tep_filter_type[filter_idx];

	/* inner MAC */
	ether_addr_copy(&vdev->mac_address, &tunnel_filter_conf.inner_mac);

	tunnel_filter_conf.queue_id = vdev->rx_q;
	tunnel_filter_conf.tenant_id = tenant_id_conf[vdev->rx_q];

	if (tep_filter_type[filter_idx] == RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID)
		tunnel_filter_conf.inner_vlan = INNER_VLAN_ID;

	tunnel_filter_conf.tunnel_type = RTE_TUNNEL_TYPE_VXLAN;

	ret = rte_eth_dev_filter_ctrl(ports[0],
		RTE_ETH_FILTER_TUNNEL,
		RTE_ETH_FILTER_ADD,
		&tunnel_filter_conf);
	if (ret) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) Failed to add device MAC address to cloud filter\n",
			vdev->rx_q);
		return -1;
	}

	/* Print out inner MAC and VNI info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VNI %d registered\n",
		vdev->rx_q,
		vdev->mac_address.addr_bytes[0],
		vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2],
		vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4],
		vdev->mac_address.addr_bytes[5],
		tenant_id_conf[vdev->rx_q]);

	vxdev.port[portid].vport_id = portid;

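	/* Pack the dotted-quad addresses byte by byte; on little-endian hosts
	 * the resulting 32-bit values are already in network byte order.
	 */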
	for (i = 0; i < 4; i++) {
		/* Local VTEP IP */
		vxdev.port_ip |= vxlan_multicast_ips[portid][i] << (8 * i);
		/* Remote VTEP IP */
		vxdev.port[portid].peer_ip |=
			vxlan_overlay_ips[portid][i] << (8 * i);
	}

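	/* Prebuild the outer L2 and L3 headers used when encapsulating this
	 * port's traffic.
	 */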
	vxdev.out_key = tenant_id_conf[vdev->rx_q];
	ether_addr_copy(&vxdev.port[portid].peer_mac,
			&app_l2_hdr[portid].d_addr);
	ether_addr_copy(&ports_eth_addr[0],
			&app_l2_hdr[portid].s_addr);
	app_l2_hdr[portid].ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);

	ip = &app_ip_hdr[portid];
	ip->version_ihl = IP_VHL_DEF;
	ip->type_of_service = 0;
	ip->total_length = 0;
	ip->packet_id = 0;
	ip->fragment_offset = IP_DN_FRAGMENT_FLAG;
	ip->time_to_live = IP_DEFTTL;
	ip->next_proto_id = IPPROTO_UDP;
	ip->hdr_checksum = 0;
	ip->src_addr = vxdev.port_ip;
	ip->dst_addr = vxdev.port[portid].peer_ip;

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/**
 * Removes the cloud filter. Ensures that nothing is adding buffers to the
 * RX queue before disabling RX on the device.
 */
void
vxlan_unlink(struct vhost_dev *vdev)
{
	unsigned int i = 0, rx_count;
	int ret;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct rte_eth_tunnel_filter_conf tunnel_filter_conf;

	if (vdev->ready == DEVICE_RX) {
		memset(&tunnel_filter_conf, 0,
			sizeof(struct rte_eth_tunnel_filter_conf));

		ether_addr_copy(&ports_eth_addr[0],
				&tunnel_filter_conf.outer_mac);
		ether_addr_copy(&vdev->mac_address,
				&tunnel_filter_conf.inner_mac);
		tunnel_filter_conf.tenant_id = tenant_id_conf[vdev->rx_q];
		tunnel_filter_conf.filter_type = tep_filter_type[filter_idx];

		if (tep_filter_type[filter_idx] ==
			RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID)
			tunnel_filter_conf.inner_vlan = INNER_VLAN_ID;

		tunnel_filter_conf.queue_id = vdev->rx_q;
		tunnel_filter_conf.tunnel_type = RTE_TUNNEL_TYPE_VXLAN;

		ret = rte_eth_dev_filter_ctrl(ports[0],
				RTE_ETH_FILTER_TUNNEL,
				RTE_ETH_FILTER_DELETE,
				&tunnel_filter_conf);
		if (ret) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%d) Failed to delete device MAC address from cloud filter\n",
				vdev->rx_q);
			return;
		}
		for (i = 0; i < ETHER_ADDR_LEN; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->rx_q,
				pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->rx_q,
					pkts_burst, MAX_PKT_BURST);
		}
		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

/* Transmit packets after encapsulating */
int
vxlan_tx_pkts(uint16_t port_id, uint16_t queue_id,
		struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	int ret = 0;
	uint16_t i;

	for (i = 0; i < nb_pkts; i++)
		vxlan_tx_process(queue_id, tx_pkts[i]);

	ret = rte_eth_tx_burst(port_id, queue_id, tx_pkts, nb_pkts);

	return ret;
}

/* Check for decapsulation and pass packets directly to VIRTIO device */
int
vxlan_rx_pkts(int vid, struct rte_mbuf **pkts_burst, uint32_t rx_count)
{
	uint32_t i = 0;
	uint32_t count = 0;
	int ret;
	struct rte_mbuf *pkts_valid[rx_count];

	for (i = 0; i < rx_count; i++) {
		if (enable_stats) {
			rte_atomic64_add(
				&dev_statistics[vid].rx_bad_ip_csum,
				(pkts_burst[i]->ol_flags & PKT_RX_IP_CKSUM_BAD)
				!= 0);
			rte_atomic64_add(
				&dev_statistics[vid].rx_bad_l4_csum,
				(pkts_burst[i]->ol_flags & PKT_RX_L4_CKSUM_BAD)
				!= 0);
		}
		ret = vxlan_rx_process(pkts_burst[i]);
		if (unlikely(ret < 0))
			continue;

		pkts_valid[count] = pkts_burst[i];
		count++;
	}

	ret = rte_vhost_enqueue_burst(vid, VIRTIO_RXQ, pkts_valid, count);
	return ret;
}