/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2018 6WIND S.A.
 * Copyright 2018 Mellanox Technologies, Ltd
 */

#include <assert.h>
#include <errno.h>
#include <libmnl/libmnl.h>
#include <linux/gen_stats.h>
#include <linux/if_ether.h>
#include <linux/netlink.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/tc_act/tc_gact.h>
#include <linux/tc_act/tc_mirred.h>
#include <netinet/in.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/socket.h>

#include <rte_byteorder.h>
#include <rte_errno.h>
#include <rte_ether.h>
#include <rte_flow.h>
#include <rte_malloc.h>
#include <rte_common.h>

#include "mlx5.h"
#include "mlx5_flow.h"
#include "mlx5_autoconf.h"

#ifdef HAVE_TC_ACT_VLAN

#include <linux/tc_act/tc_vlan.h>

#else /* HAVE_TC_ACT_VLAN */

#define TCA_VLAN_ACT_POP 1
#define TCA_VLAN_ACT_PUSH 2
#define TCA_VLAN_ACT_MODIFY 3
#define TCA_VLAN_PARMS 2
#define TCA_VLAN_PUSH_VLAN_ID 3
#define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
#define TCA_VLAN_PAD 5
#define TCA_VLAN_PUSH_VLAN_PRIORITY 6

struct tc_vlan {
        tc_gen;
        int v_action;
};

#endif /* HAVE_TC_ACT_VLAN */

#ifdef HAVE_TC_ACT_PEDIT

#include <linux/tc_act/tc_pedit.h>

#else /* HAVE_TC_ACT_PEDIT */

enum {
        TCA_PEDIT_UNSPEC,
        TCA_PEDIT_TM,
        TCA_PEDIT_PARMS,
        TCA_PEDIT_PAD,
        TCA_PEDIT_PARMS_EX,
        TCA_PEDIT_KEYS_EX,
        TCA_PEDIT_KEY_EX,
        __TCA_PEDIT_MAX
};

enum {
        TCA_PEDIT_KEY_EX_HTYPE = 1,
        TCA_PEDIT_KEY_EX_CMD = 2,
        __TCA_PEDIT_KEY_EX_MAX
};

enum pedit_header_type {
        TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
        __PEDIT_HDR_TYPE_MAX,
};

enum pedit_cmd {
        TCA_PEDIT_KEY_EX_CMD_SET = 0,
        TCA_PEDIT_KEY_EX_CMD_ADD = 1,
        __PEDIT_CMD_MAX,
};

struct tc_pedit_key {
        __u32 mask; /* AND */
        __u32 val; /* XOR */
        __u32 off; /* Offset. */
        __u32 at;
        __u32 offmask;
        __u32 shift;
};

__extension__
struct tc_pedit_sel {
        tc_gen;
        unsigned char nkeys;
        unsigned char flags;
        struct tc_pedit_key keys[0];
};

#endif /* HAVE_TC_ACT_PEDIT */
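
/*
 * Note on pedit key semantics (as implemented by the kernel act_pedit):
 * each key rewrites one 32-bit word of the packet as
 * "word = (word & mask) ^ val", so a zero mask fully overwrites the
 * word with val, while set bits in the mask preserve the original data.
 */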

#ifdef HAVE_TC_ACT_TUNNEL_KEY

#include <linux/tc_act/tc_tunnel_key.h>

#ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#endif

#ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
#define TCA_TUNNEL_KEY_NO_CSUM 10
#endif

#else /* HAVE_TC_ACT_TUNNEL_KEY */

#define TCA_ACT_TUNNEL_KEY 17
#define TCA_TUNNEL_KEY_ACT_SET 1
#define TCA_TUNNEL_KEY_ACT_RELEASE 2
#define TCA_TUNNEL_KEY_PARMS 2
#define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
#define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
#define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
#define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
#define TCA_TUNNEL_KEY_ENC_KEY_ID 7
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#define TCA_TUNNEL_KEY_NO_CSUM 10

struct tc_tunnel_key {
        tc_gen;
        int t_action;
};

#endif /* HAVE_TC_ACT_TUNNEL_KEY */

/* Normally found in linux/netlink.h. */
#ifndef NETLINK_CAP_ACK
#define NETLINK_CAP_ACK 10
#endif

/* Normally found in linux/pkt_sched.h. */
#ifndef TC_H_MIN_INGRESS
#define TC_H_MIN_INGRESS 0xfff2u
#endif

/* Normally found in linux/pkt_cls.h. */
#ifndef TCA_CLS_FLAGS_SKIP_SW
#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
#endif
#ifndef TCA_CLS_FLAGS_IN_HW
#define TCA_CLS_FLAGS_IN_HW (1 << 2)
#endif
#ifndef HAVE_TCA_CHAIN
#define TCA_CHAIN 11
#endif
#ifndef HAVE_TCA_FLOWER_ACT
#define TCA_FLOWER_ACT 3
#endif
#ifndef HAVE_TCA_FLOWER_FLAGS
#define TCA_FLOWER_FLAGS 22
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
#define TCA_FLOWER_KEY_ETH_TYPE 8
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
#define TCA_FLOWER_KEY_ETH_DST 4
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
#define TCA_FLOWER_KEY_ETH_DST_MASK 5
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
#define TCA_FLOWER_KEY_ETH_SRC 6
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
#define TCA_FLOWER_KEY_ETH_SRC_MASK 7
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
#define TCA_FLOWER_KEY_IP_PROTO 9
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
#define TCA_FLOWER_KEY_IPV4_SRC 10
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
#define TCA_FLOWER_KEY_IPV4_DST 12
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
#define TCA_FLOWER_KEY_IPV4_DST_MASK 13
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
#define TCA_FLOWER_KEY_IPV6_SRC 14
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
#define TCA_FLOWER_KEY_IPV6_DST 16
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
#define TCA_FLOWER_KEY_IPV6_DST_MASK 17
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
#define TCA_FLOWER_KEY_TCP_SRC 18
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
#define TCA_FLOWER_KEY_TCP_SRC_MASK 35
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
#define TCA_FLOWER_KEY_TCP_DST 19
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
#define TCA_FLOWER_KEY_TCP_DST_MASK 36
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
#define TCA_FLOWER_KEY_UDP_SRC 20
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
#define TCA_FLOWER_KEY_UDP_SRC_MASK 37
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
#define TCA_FLOWER_KEY_UDP_DST 21
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
#define TCA_FLOWER_KEY_UDP_DST_MASK 38
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
#define TCA_FLOWER_KEY_VLAN_ID 23
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
#define TCA_FLOWER_KEY_VLAN_PRIO 24
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
#define TCA_FLOWER_KEY_ENC_KEY_ID 26
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
#define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
#define TCA_FLOWER_KEY_ENC_IPV4_DST 29
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
#define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
#define TCA_FLOWER_KEY_ENC_IPV6_DST 33
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
#define TCA_FLOWER_KEY_TCP_FLAGS 71
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
#define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
#endif
#ifndef HAVE_TC_ACT_GOTO_CHAIN
#define TC_ACT_GOTO_CHAIN 0x20000000
#endif

#ifndef IPV6_ADDR_LEN
#define IPV6_ADDR_LEN 16
#endif

#ifndef IPV4_ADDR_LEN
#define IPV4_ADDR_LEN 4
#endif

#ifndef TP_PORT_LEN
#define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
#endif

#ifndef TTL_LEN
#define TTL_LEN 1
#endif

#ifndef TCA_ACT_MAX_PRIO
#define TCA_ACT_MAX_PRIO 32
#endif

/** UDP port range of VXLAN devices created by driver. */
#define MLX5_VXLAN_PORT_MIN 30000
#define MLX5_VXLAN_PORT_MAX 60000
#define MLX5_VXLAN_DEVICE_PFX "vmlx_"

/** Tunnel action type, used for @p type in header structure. */
enum flow_tcf_tunact_type {
        FLOW_TCF_TUNACT_VXLAN_DECAP,
        FLOW_TCF_TUNACT_VXLAN_ENCAP,
};

/** Flags used for @p mask in tunnel action encap descriptors. */
#define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
#define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
#define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
#define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
#define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
#define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
#define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
#define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
#define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)
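
/*
 * Each flag above records that the corresponding field of the encap
 * descriptor (struct flow_tcf_vxlan_encap, defined below) was explicitly
 * specified by the application; e.g. a VXLAN-over-IPv4 tunnel would
 * typically set FLOW_TCF_ENCAP_IPV4_SRC | FLOW_TCF_ENCAP_IPV4_DST |
 * FLOW_TCF_ENCAP_UDP_DST | FLOW_TCF_ENCAP_VXLAN_VNI (illustrative
 * combination, not taken from the original code).
 */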

/**
 * Structure for holding netlink context.
 * Note the size of the message buffer, which is MNL_SOCKET_BUFFER_SIZE:
 * using this (8KB) buffer size ensures that netlink messages will never
 * be truncated.
 */
struct mlx5_flow_tcf_context {
        struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
        uint32_t seq; /* Message sequence number. */
        uint32_t buf_size; /* Message buffer size. */
        uint8_t *buf; /* Message buffer. */
};
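
/*
 * A minimal sketch (not part of the original file) of how such a context
 * is typically brought up with libmnl; the real constructor lives
 * elsewhere in this driver:
 *
 *     struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
 *     if (nl && mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) == 0)
 *             ctx->nl = nl; // buf/buf_size point to an 8KB buffer.
 */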

/**
 * Neigh rule structure. The neigh rule is applied via Netlink to
 * outer tunnel iface in order to provide destination MAC address
 * for the VXLAN encapsulation. The neigh rule is implicitly related
 * to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_neigh_rule {
        LIST_ENTRY(tcf_neigh_rule) next;
        uint32_t refcnt;
        struct ether_addr eth;
        uint16_t mask;
        union {
                struct {
                        rte_be32_t dst;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                } ipv6;
        };
};

/**
 * Local rule structure. The local rule is applied via Netlink to
 * outer tunnel iface in order to provide local and peer IP addresses
 * of the VXLAN tunnel for encapsulation. The local rule is implicitly
 * related to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_local_rule {
        LIST_ENTRY(tcf_local_rule) next;
        uint32_t refcnt;
        uint16_t mask;
        union {
                struct {
                        rte_be32_t dst;
                        rte_be32_t src;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                        uint8_t src[IPV6_ADDR_LEN];
                } ipv6;
        };
};

/** VXLAN virtual netdev. */
struct tcf_vtep {
        LIST_ENTRY(tcf_vtep) next;
        LIST_HEAD(, tcf_neigh_rule) neigh;
        LIST_HEAD(, tcf_local_rule) local;
        uint32_t refcnt;
        unsigned int ifindex; /**< Own interface index. */
        unsigned int ifouter; /**< Index of device attached to. */
        uint16_t port;
        uint8_t created;
};

/** Tunnel descriptor header, common for all tunnel types. */
struct flow_tcf_tunnel_hdr {
        uint32_t type; /**< Tunnel action type. */
        struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
        unsigned int ifindex_org; /**< Original dst/src interface. */
        unsigned int *ifindex_ptr; /**< Interface ptr in message. */
};

struct flow_tcf_vxlan_decap {
        struct flow_tcf_tunnel_hdr hdr;
        uint16_t udp_port;
};

struct flow_tcf_vxlan_encap {
        struct flow_tcf_tunnel_hdr hdr;
        uint32_t mask;
        struct {
                struct ether_addr dst;
                struct ether_addr src;
        } eth;
        union {
                struct {
                        rte_be32_t dst;
                        rte_be32_t src;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                        uint8_t src[IPV6_ADDR_LEN];
                } ipv6;
        };
        struct {
                rte_be16_t src;
                rte_be16_t dst;
        } udp;
        struct {
                uint8_t vni[3];
        } vxlan;
};
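
/*
 * Illustrative only (not from the original code): for a flow carrying
 * VXLAN_ENCAP over IPv4, the descriptor above would be populated as
 *
 *     encap.ipv4.src  = <outer source address, big endian>;
 *     encap.ipv4.dst  = <outer destination address, big endian>;
 *     encap.udp.dst   = <VXLAN UDP port, big endian>;
 *     encap.vxlan.vni = { vni[0], vni[1], vni[2] };
 *     encap.mask     |= FLOW_TCF_ENCAP_IPV4_SRC | FLOW_TCF_ENCAP_IPV4_DST |
 *                       FLOW_TCF_ENCAP_UDP_DST | FLOW_TCF_ENCAP_VXLAN_VNI;
 */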

/** Structure used when extracting the values of a flow counter
 * from a netlink message.
 */
struct flow_tcf_stats_basic {
        bool valid;
        struct gnet_stats_basic counters;
};

/** Empty masks for known item types. */
static const union {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_empty;

/** Supported masks for known item types. */
static const struct {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_supported = {
        .port_id = {
                .id = 0xffffffff,
        },
        .eth = {
                .type = RTE_BE16(0xffff),
                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
        },
        .vlan = {
                /* PCP and VID only, no DEI. */
                .tci = RTE_BE16(0xefff),
                .inner_type = RTE_BE16(0xffff),
        },
        .ipv4.hdr = {
                .next_proto_id = 0xff,
                .src_addr = RTE_BE32(0xffffffff),
                .dst_addr = RTE_BE32(0xffffffff),
        },
        .ipv6.hdr = {
                .proto = 0xff,
                .src_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                .dst_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
        },
        .tcp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
                .tcp_flags = 0xff,
        },
        .udp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
        },
        .vxlan = {
                .vni = "\xff\xff\xff",
        },
};
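
/*
 * Note: flow_tcf_item_mask() below validates each pattern item mask
 * against these tables; a bit set outside flow_tcf_mask_supported
 * (e.g. the DEI bit in vlan.tci, excluded by the 0xefff mask above)
 * is rejected with ENOTSUP.
 */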

#define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
#define SZ_NLATTR_NEST SZ_NLATTR_HDR
#define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
#define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
#define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
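
/*
 * Worked example (assuming the usual 4-byte struct nlattr and
 * MNL_ALIGNTO == 4): SZ_NLATTR_HDR == 4, so
 * SZ_NLATTR_TYPE_OF(uint32_t) == MNL_ALIGN(4 + 4) == 8 and
 * SZ_NLATTR_STRZ_OF("pedit") == MNL_ALIGN(4 + 6) == 12.
 */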

#define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)

/** DPDK port to network interface index (ifindex) conversion. */
struct flow_tcf_ptoi {
        uint16_t port_id; /**< DPDK port ID. */
        unsigned int ifindex; /**< Network interface index. */
};

/* Due to a limitation on driver/FW. */
#define MLX5_TCF_GROUP_ID_MAX 3

/*
 * Due to a limitation on driver/FW, priority ranges from 1 to 16 in kernel.
 * Priority in the rte_flow attribute starts from 0 and is incremented by 1
 * during translation. This is subject to change: the maximum priority may be
 * determined by trial and error, as in the Verbs driver, once the restriction
 * is lifted or the range is extended.
 */
#define MLX5_TCF_GROUP_PRIORITY_MAX 15

#define MLX5_TCF_FATE_ACTIONS \
        (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
         MLX5_FLOW_ACTION_JUMP)

#define MLX5_TCF_VLAN_ACTIONS \
        (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
         MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)

#define MLX5_TCF_VXLAN_ACTIONS \
        (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)

#define MLX5_TCF_PEDIT_ACTIONS \
        (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
         MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
         MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
         MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
         MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)

#define MLX5_TCF_CONFIG_ACTIONS \
        (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
         MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
         MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
         (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))

#define MAX_PEDIT_KEYS 128
#define SZ_PEDIT_KEY_VAL 4

#define NUM_OF_PEDIT_KEYS(sz) \
        (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
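
/*
 * For example, NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN) == 2 (6 bytes need two
 * 32-bit keys) and NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN) == 4 (16 bytes fit
 * exactly in four keys).
 */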

struct pedit_key_ex {
        enum pedit_header_type htype;
        enum pedit_cmd cmd;
};

struct pedit_parser {
        struct tc_pedit_sel sel;
        struct tc_pedit_key keys[MAX_PEDIT_KEYS];
        struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
};

/**
 * Create space for using the implicitly created TC flow counter.
 *
 * @return
 *   A pointer to the counter data structure, NULL otherwise and
 *   rte_errno is set.
 */
static struct mlx5_flow_counter *
flow_tcf_counter_new(void)
{
        struct mlx5_flow_counter *cnt;

        /*
         * E-switch counters cannot be shared and their ID is unknown,
         * so all are currently returned with ID 0. Switching to unique
         * numbers may be better in the future.
         */
        struct mlx5_flow_counter tmpl = {
                .ref_cnt = 1,
        };
        cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
        if (!cnt) {
                rte_errno = ENOMEM;
                return NULL;
        }
        *cnt = tmpl;
        /* Implicit counter, do not add to list. */
        return cnt;
}

/**
 * Set pedit key of MAC address.
 *
 * @param[in] actions
 *   Pointer to action specification.
 * @param[in,out] p_parser
 *   Pointer to pedit_parser.
 */
static void
flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
                           struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;
        uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
                                        offsetof(struct ether_hdr, s_addr) :
                                        offsetof(struct ether_hdr, d_addr);
        const struct rte_flow_action_set_mac *conf =
                (const struct rte_flow_action_set_mac *)actions->conf;
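
        /*
         * A 6-byte MAC address does not fit in one 32-bit pedit key:
         * the first key overwrites its word fully (mask ~UINT32_MAX == 0,
         * i.e. no original bits kept), the second uses mask 0xFFFF0000 to
         * rewrite only the two bytes holding the remaining MAC octets
         * (the low-order bytes on little-endian hosts) and keep the rest.
         */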
        p_parser->keys[idx].off = off;
        p_parser->keys[idx].mask = ~UINT32_MAX;
        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        memcpy(&p_parser->keys[idx].val,
                conf->mac_addr, SZ_PEDIT_KEY_VAL);
        idx++;
        p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
        p_parser->keys[idx].mask = 0xFFFF0000;
        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        memcpy(&p_parser->keys[idx].val,
                conf->mac_addr + SZ_PEDIT_KEY_VAL,
                ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of decrement/set TTL.
 *
 * @param[in] actions
 *   Pointer to action specification.
 * @param[in,out] p_parser
 *   Pointer to pedit_parser.
 * @param[in] item_flags
 *   Flags of all items presented.
 */
static void
flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
                                struct pedit_parser *p_parser,
                                uint64_t item_flags)
{
        int idx = p_parser->sel.nkeys;

        p_parser->keys[idx].mask = 0xFFFFFF00;
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
                p_parser->keys[idx].off =
                        offsetof(struct ipv4_hdr, time_to_live);
        }
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
                p_parser->keys[idx].off =
                        offsetof(struct ipv6_hdr, hop_limits);
        }
        if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
                p_parser->keys[idx].val = 0x000000FF;
        } else {
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
                p_parser->keys[idx].val =
                        (__u32)((const struct rte_flow_action_set_ttl *)
                         actions->conf)->ttl_value;
        }
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of transport (TCP/UDP) port value.
 *
 * @param[in] actions
 *   Pointer to action specification.
 * @param[in,out] p_parser
 *   Pointer to pedit_parser.
 * @param[in] item_flags
 *   Flags of all items presented.
 */
static void
flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
                                struct pedit_parser *p_parser,
                                uint64_t item_flags)
{
        int idx = p_parser->sel.nkeys;

        if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        /* The offset of the src/dst port is the same for TCP and UDP. */
        p_parser->keys[idx].off =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
                offsetof(struct tcp_hdr, src_port) :
                offsetof(struct tcp_hdr, dst_port);
        p_parser->keys[idx].mask = 0xFFFF0000;
        p_parser->keys[idx].val =
                (__u32)((const struct rte_flow_action_set_tp *)
                                actions->conf)->port;
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of IPv6 address.
 *
 * @param[in] actions
 *   Pointer to action specification.
 * @param[in,out] p_parser
 *   Pointer to pedit_parser.
 */
static void
flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
                                 struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;
        int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
        int off_base =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
                offsetof(struct ipv6_hdr, src_addr) :
                offsetof(struct ipv6_hdr, dst_addr);
        const struct rte_flow_action_set_ipv6 *conf =
                (const struct rte_flow_action_set_ipv6 *)actions->conf;

        for (int i = 0; i < keys; i++, idx++) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
                p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
                p_parser->keys[idx].mask = ~UINT32_MAX;
                memcpy(&p_parser->keys[idx].val,
                        conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL,
                        SZ_PEDIT_KEY_VAL);
        }
        p_parser->sel.nkeys += keys;
}

/**
 * Set pedit key of IPv4 address.
 *
 * @param[in] actions
 *   Pointer to action specification.
 * @param[in,out] p_parser
 *   Pointer to pedit_parser.
 */
static void
flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
                                 struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;

        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        p_parser->keys[idx].off =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
                offsetof(struct ipv4_hdr, src_addr) :
                offsetof(struct ipv4_hdr, dst_addr);
        p_parser->keys[idx].mask = ~UINT32_MAX;
        p_parser->keys[idx].val =
                ((const struct rte_flow_action_set_ipv4 *)
                 actions->conf)->ipv4_addr;
        p_parser->sel.nkeys = (++idx);
}

/**
 * Create the pedit's netlink attributes in a pre-allocated netlink
 * message buffer.
 *
 * @param[in,out] nl
 *   Pointer to a pre-allocated netlink message buffer.
 * @param[in,out] actions
 *   Pointer to pointer of actions specification.
 * @param[in] item_flags
 *   Flags of all items presented.
 */
static void
flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
                              const struct rte_flow_action **actions,
                              uint64_t item_flags)
{
        struct pedit_parser p_parser;
        struct nlattr *na_act_options;
        struct nlattr *na_pedit_keys;

        memset(&p_parser, 0, sizeof(p_parser));
        mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
        na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
        /* All modify header actions should be in one tc-pedit action. */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        flow_tcf_pedit_key_set_tp_port(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        flow_tcf_pedit_key_set_dec_ttl(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        flow_tcf_pedit_key_set_mac(*actions, &p_parser);
                        break;
                default:
                        goto pedit_mnl_msg_done;
                }
        }
pedit_mnl_msg_done:
        p_parser.sel.action = TC_ACT_PIPE;
        mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
                     sizeof(p_parser.sel) +
                     p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
                     &p_parser);
        na_pedit_keys =
                mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
        for (int i = 0; i < p_parser.sel.nkeys; i++) {
                struct nlattr *na_pedit_key =
                        mnl_attr_nest_start(nl,
                                            TCA_PEDIT_KEY_EX | NLA_F_NESTED);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
                                 p_parser.keys_ex[i].htype);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
                                 p_parser.keys_ex[i].cmd);
                mnl_attr_nest_end(nl, na_pedit_key);
        }
        mnl_attr_nest_end(nl, na_pedit_keys);
        mnl_attr_nest_end(nl, na_act_options);
        (*actions)--;
}
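
/*
 * For reference, the attribute layout emitted above is:
 *
 *   TCA_ACT_KIND = "pedit"
 *   TCA_ACT_OPTIONS (nest)
 *     TCA_PEDIT_PARMS_EX = struct tc_pedit_sel + nkeys * struct tc_pedit_key
 *     TCA_PEDIT_KEYS_EX (nest)
 *       TCA_PEDIT_KEY_EX (nest, one per key)
 *         TCA_PEDIT_KEY_EX_HTYPE = u16
 *         TCA_PEDIT_KEY_EX_CMD = u16
 */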

/**
 * Calculate the maximum memory size of one TC-pedit action.
 * One TC-pedit action can contain a set of keys, each defining
 * a rewrite element (rte_flow action).
 *
 * @param[in,out] actions
 *   Actions specification.
 * @param[in,out] action_flags
 *   Actions flags.
 *
 * @return
 *   Maximum memory size of one TC-pedit action.
 */
static int
flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
                                uint64_t *action_flags)
{
        int pedit_size = 0;
        int keys = 0;
        uint64_t flags = 0;

        pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
                      SZ_NLATTR_STRZ_OF("pedit") +
                      SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                        keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                        keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                        /* TCP is the same as UDP. */
                        keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        /* TCP is the same as UDP. */
                        keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TP_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                        keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
                        flags |= MLX5_FLOW_ACTION_DEC_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                        keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
                        break;
                default:
                        goto get_pedit_action_size_done;
                }
        }
get_pedit_action_size_done:
        /* TCA_PEDIT_PARMS_EX. */
        pedit_size +=
                SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
                                  keys * sizeof(struct tc_pedit_key));
        pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS_EX. */
        pedit_size += keys *
                      /* TCA_PEDIT_KEY_EX + HTYPE + CMD. */
                      (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
                       SZ_NLATTR_DATA_OF(2));
        (*action_flags) |= flags;
        (*actions)--;
        return pedit_size;
}
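
/*
 * Sizing example (illustrative): a single SET_TP_SRC action yields
 * keys == 1, so the returned size covers the "pedit" kind string, the
 * TCA_ACT_OPTIONS nest, one tc_pedit_sel plus one tc_pedit_key inside
 * TCA_PEDIT_PARMS_EX, and one TCA_PEDIT_KEY_EX nest with two u16
 * attributes, mirroring what flow_tcf_create_pedit_mnl_msg() emits.
 */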

/**
 * Retrieve mask for pattern item.
 *
 * This function does basic sanity checks on a pattern item in order to
 * return the most appropriate mask for it.
 *
 * @param[in] item
 *   Item specification.
 * @param[in] mask_default
 *   Default mask for pattern item as specified by the flow API.
 * @param[in] mask_supported
 *   Mask fields supported by the implementation.
 * @param[in] mask_empty
 *   Empty mask to return when there is no specification.
 * @param[in] mask_size
 *   Size of the mask in bytes.
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 *
 * @return
 *   Either @p item->mask or one of the mask parameters on success, NULL
 *   otherwise and rte_errno is set.
 */
static const void *
flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
                   const void *mask_supported, const void *mask_empty,
                   size_t mask_size, struct rte_flow_error *error)
{
        const uint8_t *mask;
        size_t i;

        /* item->last and item->mask cannot exist without item->spec. */
        if (!item->spec && (item->mask || item->last)) {
                rte_flow_error_set(error, EINVAL,
                                   RTE_FLOW_ERROR_TYPE_ITEM, item,
                                   "\"mask\" or \"last\" field provided without"
                                   " a corresponding \"spec\"");
                return NULL;
        }
        /* No spec, no mask, no problem. */
        if (!item->spec)
                return mask_empty;
        mask = item->mask ? item->mask : mask_default;
        assert(mask);
        /*
         * Single-pass check to make sure that:
         * - Mask is supported, no bits are set outside mask_supported.
         * - Both item->spec and item->last are included in mask.
         */
        for (i = 0; i != mask_size; ++i) {
                if (!mask[i])
                        continue;
                if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
                    ((const uint8_t *)mask_supported)[i]) {
                        rte_flow_error_set(error, ENOTSUP,
                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                           "unsupported field found"
                                           " in \"mask\"");
                        return NULL;
                }
                if (item->last &&
                    (((const uint8_t *)item->spec)[i] & mask[i]) !=
                    (((const uint8_t *)item->last)[i] & mask[i])) {
                        rte_flow_error_set(error, EINVAL,
                                           RTE_FLOW_ERROR_TYPE_ITEM_LAST,
                                           item->last,
                                           "range between \"spec\" and \"last\""
                                           " not comprised in \"mask\"");
                        return NULL;
                }
        }
        return mask;
}
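
/*
 * Typical call pattern elsewhere in this file (shown for illustration):
 *
 *     mask.ipv4 = flow_tcf_item_mask(items, &rte_flow_item_ipv4_mask,
 *                                    &flow_tcf_mask_supported.ipv4,
 *                                    &flow_tcf_mask_empty.ipv4,
 *                                    sizeof(flow_tcf_mask_empty.ipv4),
 *                                    error);
 */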

/**
 * Build a conversion table between port ID and ifindex.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 * @param[out] ptoi
 *   Pointer to ptoi table.
 * @param[in] len
 *   Size of ptoi table provided.
 *
 * @return
 *   Size of ptoi table filled.
 */
static unsigned int
flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
                          unsigned int len)
{
        unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
        uint16_t port_id[n + 1];
        unsigned int i;
        unsigned int own = 0;

        /* At least one port is needed when no switch domain is present. */
        if (!n) {
                n = 1;
                port_id[0] = dev->data->port_id;
        } else {
                n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
        }
        if (n > len)
                return 0;
        for (i = 0; i != n; ++i) {
                struct rte_eth_dev_info dev_info;

                rte_eth_dev_info_get(port_id[i], &dev_info);
                if (port_id[i] == dev->data->port_id)
                        own = i;
                ptoi[i].port_id = port_id[i];
                ptoi[i].ifindex = dev_info.if_index;
        }
        /* Ensure first entry of ptoi[] is the current device. */
        if (own) {
                ptoi[n] = ptoi[0];
                ptoi[0] = ptoi[own];
                ptoi[own] = ptoi[n];
        }
        /* An entry with zero ifindex terminates ptoi[]. */
        ptoi[n].port_id = 0;
        ptoi[n].ifindex = 0;
        return n;
}

/**
 * Verify the @p attr will be correctly understood by the E-switch.
 *
 * @param[in] attr
 *   Pointer to flow attributes.
 * @param[out] error
 *   Pointer to error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
                             struct rte_flow_error *error)
{
        /*
         * Supported attributes: groups, some priorities and ingress only.
         * Group is supported only if the kernel supports chains. Don't
         * care about transfer as it is the caller's problem.
         */
        if (attr->group > MLX5_TCF_GROUP_ID_MAX)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
                                          "group ID larger than "
                                          RTE_STR(MLX5_TCF_GROUP_ID_MAX)
                                          " isn't supported");
        else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
                                          attr,
                                          "priority more than "
                                          RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
                                          " is not supported");
        if (!attr->ingress)
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
                                          attr, "only ingress is supported");
        if (attr->egress)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
                                          attr, "egress is not supported");
        return 0;
}
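
/*
 * In short, given the limits above: attributes pass validation when
 * attr->group <= MLX5_TCF_GROUP_ID_MAX (3), attr->priority <=
 * MLX5_TCF_GROUP_PRIORITY_MAX (15), attr->ingress == 1 and
 * attr->egress == 0.
 */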

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
 * The routine checks the L2 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
                                  struct rte_flow_error *error)
{
        const struct rte_flow_item_eth *spec = item->spec;
        const struct rte_flow_item_eth *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for L2 addresses can be empty
                 * because these are optional and not required
                 * directly by the tc rule. The kernel tries to
                 * resolve them on its own.
                 */
                return 0;
        }
        if (!mask) {
                /* If mask is not specified use the default one. */
                mask = &rte_flow_item_eth_mask;
        }
        if (memcmp(&mask->dst,
                   &flow_tcf_mask_empty.eth.dst,
                   sizeof(flow_tcf_mask_empty.eth.dst))) {
                if (memcmp(&mask->dst,
                           &rte_flow_item_eth_mask.dst,
                           sizeof(rte_flow_item_eth_mask.dst)))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.dst\" field");
        }
        if (memcmp(&mask->src,
                   &flow_tcf_mask_empty.eth.src,
                   sizeof(flow_tcf_mask_empty.eth.src))) {
                if (memcmp(&mask->src,
                           &rte_flow_item_eth_mask.src,
                           sizeof(rte_flow_item_eth_mask.src)))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.src\" field");
        }
        if (mask->type != RTE_BE16(0x0000)) {
                if (mask->type != RTE_BE16(0xffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.type\" field");
                DRV_LOG(WARNING,
                        "outer ethernet type field"
                        " cannot be forced for vxlan"
                        " encapsulation, parameter ignored");
        }
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
 * The routine checks the IPv4 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
                                   struct rte_flow_error *error)
{
        const struct rte_flow_item_ipv4 *spec = item->spec;
        const struct rte_flow_item_ipv4 *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for IP addresses cannot be empty
                 * because it is required by tunnel_key parameter.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL outer ipv4 address"
                                          " specification for vxlan"
                                          " encapsulation");
        }
        if (!mask)
                mask = &rte_flow_item_ipv4_mask;
        if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
                if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"ipv4.hdr.dst_addr\" field"
                                 " for vxlan encapsulation");
                /* More IPv4 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the destination IP address to determine
                 * the routing path and obtain the MAC destination
                 * address, so IP destination address must be
                 * specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv4 destination address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
                if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"ipv4.hdr.src_addr\" field"
                                 " for vxlan encapsulation");
                /* More IPv4 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the source IP address to select the
                 * interface for egress encapsulated traffic, so
                 * it must be specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv4 source address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
 * The routine checks the IPv6 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
                                   struct rte_flow_error *error)
{
        const struct rte_flow_item_ipv6 *spec = item->spec;
        const struct rte_flow_item_ipv6 *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for IP addresses cannot be empty
                 * because it is required by tunnel_key parameter.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL outer ipv6 address"
                                          " specification for"
                                          " vxlan encapsulation");
        }
        if (!mask)
                mask = &rte_flow_item_ipv6_mask;
        if (memcmp(&mask->hdr.dst_addr,
                   &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
                   IPV6_ADDR_LEN)) {
                if (memcmp(&mask->hdr.dst_addr,
                           &rte_flow_item_ipv6_mask.hdr.dst_addr,
                           IPV6_ADDR_LEN))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"ipv6.hdr.dst_addr\" field"
                                         " for vxlan encapsulation");
                /* More IPv6 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the destination IP address to determine
                 * the routing path and obtain the MAC destination
                 * address (neighbor or gateway), so the IP destination
                 * address must be specified within the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv6 destination address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        if (memcmp(&mask->hdr.src_addr,
                   &flow_tcf_mask_empty.ipv6.hdr.src_addr,
                   IPV6_ADDR_LEN)) {
                if (memcmp(&mask->hdr.src_addr,
                           &rte_flow_item_ipv6_mask.hdr.src_addr,
                           IPV6_ADDR_LEN))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"ipv6.hdr.src_addr\" field"
                                         " for vxlan encapsulation");
                /* More L3 address validation can be put here. */
        } else {
                /*
                 * Kernel uses the source IP address to select the
                 * interface for egress encapsulated traffic, so
                 * it must be specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer L3 source address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
 * The routine checks the UDP fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
                                  struct rte_flow_error *error)
{
        const struct rte_flow_item_udp *spec = item->spec;
        const struct rte_flow_item_udp *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for UDP ports cannot be empty
                 * because it is required by tunnel_key parameter.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL UDP port specification"
                                          " for vxlan encapsulation");
        }
        if (!mask)
                mask = &rte_flow_item_udp_mask;
        if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
                if (mask->hdr.dst_port != RTE_BE16(0xffff))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"udp.hdr.dst_port\" field"
                                         " for vxlan encapsulation");
                if (!spec->hdr.dst_port)
                        return rte_flow_error_set
                                        (error, EINVAL,
                                         RTE_FLOW_ERROR_TYPE_ITEM, item,
                                         "outer UDP remote port cannot be"
                                         " 0 for vxlan encapsulation");
        } else {
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer UDP remote port"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        if (mask->hdr.src_port != RTE_BE16(0x0000)) {
                if (mask->hdr.src_port != RTE_BE16(0xffff))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"udp.hdr.src_port\" field"
                                         " for vxlan encapsulation");
                DRV_LOG(WARNING,
                        "outer UDP source port cannot be"
                        " forced for vxlan encapsulation,"
                        " parameter ignored");
        }
        return 0;
}
1425
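/*
 * Illustrative sketch only, not referenced by the driver: a UDP item
 * that passes the encapsulation checks above. 4789 is the
 * IANA-assigned VXLAN port and serves as a placeholder here. The
 * destination port must be fully masked; the mask deliberately leaves
 * the source port out, since a source port specification would only
 * be ignored with a warning.
 *
 *    static const struct rte_flow_item_udp encap_udp = {
 *            .hdr = { .dst_port = RTE_BE16(4789) },
 *    };
 *    static const struct rte_flow_item_udp encap_udp_mask = {
 *            .hdr = { .dst_port = RTE_BE16(0xffff) },
 *    };
 */
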
1426 /**
1427  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1428  * The routine checks the VNI field to be used in the encapsulation header.
1429  *
1430  * @param[in] item
1431  *   Pointer to the item structure.
1432  * @param[out] error
1433  *   Pointer to the error structure.
1434  *
1435  * @return
1436  *   0 on success, a negative errno value otherwise and rte_errno is set.
1437  */
1438 static int
1439 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1440                                   struct rte_flow_error *error)
1441 {
1442         const struct rte_flow_item_vxlan *spec = item->spec;
1443         const struct rte_flow_item_vxlan *mask = item->mask;
1444
1445         if (!spec) {
1446                 /* Outer VNI is required by tunnel_key parameter. */
1447                 return rte_flow_error_set(error, EINVAL,
1448                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1449                                           "NULL VNI specification"
1450                                           " for vxlan encapsulation");
1451         }
1452         if (!mask)
1453                 mask = &rte_flow_item_vxlan_mask;
1454         if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1455                 return rte_flow_error_set(error, EINVAL,
1456                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1457                                           "outer VNI must be specified "
1458                                           "for vxlan encapsulation");
1459         if (mask->vni[0] != 0xff ||
1460             mask->vni[1] != 0xff ||
1461             mask->vni[2] != 0xff)
1462                 return rte_flow_error_set(error, ENOTSUP,
1463                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1464                                           "no support for partial mask on"
1465                                           " \"vxlan.vni\" field");
1466
1467         if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1468                 return rte_flow_error_set(error, EINVAL,
1469                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1470                                           "vxlan vni cannot be 0");
1471         return 0;
1472 }
1473
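/*
 * Illustrative sketch only, not referenced by the driver: a VXLAN
 * item carrying a non-zero, fully masked 24-bit VNI (the value
 * 0x000abc is a placeholder). The default rte_flow_item_vxlan_mask
 * covers all three VNI bytes, as required above.
 *
 *    static const struct rte_flow_item_vxlan encap_vxlan = {
 *            .vni = { 0x00, 0x0a, 0xbc },
 *    };
 *    const struct rte_flow_item item = {
 *            .type = RTE_FLOW_ITEM_TYPE_VXLAN,
 *            .spec = &encap_vxlan,
 *            .mask = &rte_flow_item_vxlan_mask,
 *    };
 */
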
1474 /**
1475  * Validate VXLAN_ENCAP action item list for E-Switch.
1476  * The routine checks the items to be used in the encapsulation header.
1477  *
1478  * @param[in] action
1479  *   Pointer to the VXLAN_ENCAP action structure.
1480  * @param[out] error
1481  *   Pointer to the error structure.
1482  *
1483  * @return
1484  *   0 on success, a negative errno value otherwise and rte_errno is set.
1485  */
1486 static int
1487 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1488                               struct rte_flow_error *error)
1489 {
1490         const struct rte_flow_item *items;
1491         int ret;
1492         uint32_t item_flags = 0;
1493
1494         if (!action->conf)
1495                 return rte_flow_error_set(error, EINVAL,
1496                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1497                                           "Missing vxlan tunnel"
1498                                           " action configuration");
1499         items = ((const struct rte_flow_action_vxlan_encap *)
1500                                         action->conf)->definition;
1501         if (!items)
1502                 return rte_flow_error_set(error, EINVAL,
1503                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1504                                           "Missing vxlan tunnel"
1505                                           " encapsulation parameters");
1506         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1507                 switch (items->type) {
1508                 case RTE_FLOW_ITEM_TYPE_VOID:
1509                         break;
1510                 case RTE_FLOW_ITEM_TYPE_ETH:
1511                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1512                                                           error);
1513                         if (ret < 0)
1514                                 return ret;
1515                         ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1516                         if (ret < 0)
1517                                 return ret;
1518                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1519                         break;
1521                 case RTE_FLOW_ITEM_TYPE_IPV4:
1522                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
1523                                                            error);
1524                         if (ret < 0)
1525                                 return ret;
1526                         ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1527                         if (ret < 0)
1528                                 return ret;
1529                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1530                         break;
1531                 case RTE_FLOW_ITEM_TYPE_IPV6:
1532                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
1533                                                            error);
1534                         if (ret < 0)
1535                                 return ret;
1536                         ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1537                         if (ret < 0)
1538                                 return ret;
1539                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1540                         break;
1541                 case RTE_FLOW_ITEM_TYPE_UDP:
1542                         ret = mlx5_flow_validate_item_udp(items, item_flags,
1543                                                            0xFF, error);
1544                         if (ret < 0)
1545                                 return ret;
1546                         ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1547                         if (ret < 0)
1548                                 return ret;
1549                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1550                         break;
1551                 case RTE_FLOW_ITEM_TYPE_VXLAN:
1552                         ret = mlx5_flow_validate_item_vxlan(items,
1553                                                             item_flags, error);
1554                         if (ret < 0)
1555                                 return ret;
1556                         ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1557                         if (ret < 0)
1558                                 return ret;
1559                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
1560                         break;
1561                 default:
1562                         return rte_flow_error_set
1563                                         (error, ENOTSUP,
1564                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1565                                          "vxlan encap item not supported");
1566                 }
1567         }
1568         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1569                 return rte_flow_error_set(error, EINVAL,
1570                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1571                                           "no outer IP layer found"
1572                                           " for vxlan encapsulation");
1573         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1574                 return rte_flow_error_set(error, EINVAL,
1575                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1576                                           "no outer UDP layer found"
1577                                           " for vxlan encapsulation");
1578         if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1579                 return rte_flow_error_set(error, EINVAL,
1580                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1581                                           "no VXLAN VNI found"
1582                                           " for vxlan encapsulation");
1583         return 0;
1584 }
1585
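/*
 * Illustrative sketch only, not referenced by the driver: a
 * VXLAN_ENCAP action whose definition list satisfies the checks above
 * (outer L3, outer UDP and VNI items are all present). The spec/mask
 * objects are the placeholders from the previous sketches.
 *
 *    struct rte_flow_item encap_def[] = {
 *            { .type = RTE_FLOW_ITEM_TYPE_IPV4,
 *              .spec = &encap_ipv4, .mask = &rte_flow_item_ipv4_mask },
 *            { .type = RTE_FLOW_ITEM_TYPE_UDP,
 *              .spec = &encap_udp, .mask = &encap_udp_mask },
 *            { .type = RTE_FLOW_ITEM_TYPE_VXLAN,
 *              .spec = &encap_vxlan, .mask = &rte_flow_item_vxlan_mask },
 *            { .type = RTE_FLOW_ITEM_TYPE_END },
 *    };
 *    struct rte_flow_action_vxlan_encap encap_conf = {
 *            .definition = encap_def,
 *    };
 *    const struct rte_flow_action action = {
 *            .type = RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP,
 *            .conf = &encap_conf,
 *    };
 */
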
1586 /**
1587  * Validate RTE_FLOW_ITEM_TYPE_IPV4 item if VXLAN_DECAP action
1588  * is present in actions list.
1589  *
1590  * @param[in] ipv4
1591  *   Outer IPv4 address item (if any, NULL otherwise).
1592  * @param[out] error
1593  *   Pointer to the error structure.
1594  *
1595  * @return
1596  *   0 on success, a negative errno value otherwise and rte_errno is set.
1597  */
1598 static int
1599 flow_tcf_validate_vxlan_decap_ipv4(const struct rte_flow_item *ipv4,
1600                                    struct rte_flow_error *error)
1601 {
1602         const struct rte_flow_item_ipv4 *spec = ipv4->spec;
1603         const struct rte_flow_item_ipv4 *mask = ipv4->mask;
1604
1605         if (!spec) {
1606                 /*
1607                  * Specification for IP addresses cannot be empty
1608                  * because it is required as decap parameter.
1609                  */
1610                 return rte_flow_error_set(error, EINVAL,
1611                                           RTE_FLOW_ERROR_TYPE_ITEM, ipv4,
1612                                           "NULL outer ipv4 address"
1613                                           " specification for vxlan"
1614                                           " decapsulation");
1615         }
1616         if (!mask)
1617                 mask = &rte_flow_item_ipv4_mask;
1618         if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
1619                 if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
1620                         return rte_flow_error_set
1621                                         (error, ENOTSUP,
1622                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1623                                          "no support for partial mask on"
1624                                          " \"ipv4.hdr.dst_addr\" field");
1625                 /* More IP address validations can be put here. */
1626         } else {
1627                 /*
1628                  * Kernel uses the destination IP address
1629                  * to determine the ingress network interface
1630                  * for traffic being decapsulated.
1631                  */
1632                 return rte_flow_error_set(error, EINVAL,
1633                                           RTE_FLOW_ERROR_TYPE_ITEM, ipv4,
1634                                           "outer ipv4 destination address"
1635                                           " must be specified for"
1636                                           " vxlan decapsulation");
1637         }
1638         /* Source IP address is optional for decap. */
1639         if (mask->hdr.src_addr != RTE_BE32(0x00000000) &&
1640             mask->hdr.src_addr != RTE_BE32(0xffffffff))
1641                 return rte_flow_error_set(error, ENOTSUP,
1642                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1643                                           "no support for partial mask on"
1644                                           " \"ipv4.hdr.src_addr\" field");
1645         return 0;
1646 }
1647
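/*
 * Illustrative sketch only, not referenced by the driver: an outer
 * IPv4 item accepted by the decapsulation checks above. Only the
 * destination (local VTEP) address is mandatory and must be fully
 * masked; the source address is optional and left unmatched here.
 * 0xc0a80001 is a placeholder for 192.168.0.1.
 *
 *    static const struct rte_flow_item_ipv4 decap_ipv4 = {
 *            .hdr = { .dst_addr = RTE_BE32(0xc0a80001) },
 *    };
 *    static const struct rte_flow_item_ipv4 decap_ipv4_mask = {
 *            .hdr = { .dst_addr = RTE_BE32(0xffffffff) },
 *    };
 */
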
1648 /**
1649  * Validate RTE_FLOW_ITEM_TYPE_IPV6 item if VXLAN_DECAP action
1650  * is present in actions list.
1651  *
1652  * @param[in] ipv6
1653  *   Outer IPv6 address item (if any, NULL otherwise).
1654  * @param[out] error
1655  *   Pointer to the error structure.
1656  *
1657  * @return
1658  *   0 on success, a negative errno value otherwise and rte_errno is set.
1659  */
1660 static int
1661 flow_tcf_validate_vxlan_decap_ipv6(const struct rte_flow_item *ipv6,
1662                                    struct rte_flow_error *error)
1663 {
1664         const struct rte_flow_item_ipv6 *spec = ipv6->spec;
1665         const struct rte_flow_item_ipv6 *mask = ipv6->mask;
1666
1667         if (!spec) {
1668                 /*
1669                  * Specification for IP addresses cannot be empty
1670                  * because it is required as decap parameter.
1671                  */
1672                 return rte_flow_error_set(error, EINVAL,
1673                                           RTE_FLOW_ERROR_TYPE_ITEM, ipv6,
1674                                           "NULL outer ipv6 address"
1675                                           " specification for vxlan"
1676                                           " decapsulation");
1677         }
1678         if (!mask)
1679                 mask = &rte_flow_item_ipv6_mask;
1680         if (memcmp(&mask->hdr.dst_addr,
1681                    &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
1682                    IPV6_ADDR_LEN)) {
1683                 if (memcmp(&mask->hdr.dst_addr,
1684                         &rte_flow_item_ipv6_mask.hdr.dst_addr,
1685                         IPV6_ADDR_LEN))
1686                         return rte_flow_error_set
1687                                         (error, ENOTSUP,
1688                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1689                                          "no support for partial mask on"
1690                                          " \"ipv6.hdr.dst_addr\" field");
1691                 /* More IP address validations can be put here. */
1692         } else {
1693                 /*
1694                  * Kernel uses the destination IP address
1695                  * to determine the ingress network interface
1696                  * for traffic being decapsulated.
1697                  */
1698                 return rte_flow_error_set(error, EINVAL,
1699                                           RTE_FLOW_ERROR_TYPE_ITEM, ipv6,
1700                                           "outer ipv6 destination address must be "
1701                                           "specified for vxlan decapsulation");
1702         }
1703         /* Source IP address is optional for decap. */
1704         if (memcmp(&mask->hdr.src_addr,
1705                    &flow_tcf_mask_empty.ipv6.hdr.src_addr,
1706                    IPV6_ADDR_LEN)) {
1707                 if (memcmp(&mask->hdr.src_addr,
1708                            &rte_flow_item_ipv6_mask.hdr.src_addr,
1709                            IPV6_ADDR_LEN))
1710                         return rte_flow_error_set
1711                                         (error, ENOTSUP,
1712                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1713                                          "no support for partial mask on"
1714                                          " \"ipv6.hdr.src_addr\" field");
1715         }
1716         return 0;
1717 }
1718
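/*
 * Illustrative sketch only, not referenced by the driver: an outer
 * IPv6 item accepted by the memcmp()-based decapsulation checks
 * above. The destination (local VTEP) address is fully masked and the
 * optional source address is left unmatched; 2001:db8::1 is a
 * placeholder.
 *
 *    static const struct rte_flow_item_ipv6 decap_ipv6 = {
 *            .hdr = {
 *                    .dst_addr = { 0x20, 0x01, 0x0d, 0xb8, 0, 0, 0, 0,
 *                                  0, 0, 0, 0, 0, 0, 0, 0x01 },
 *            },
 *    };
 *    static const struct rte_flow_item_ipv6 decap_ipv6_mask = {
 *            .hdr = {
 *                    .dst_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 *                                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 *                                  0xff, 0xff, 0xff, 0xff },
 *            },
 *    };
 */
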
1719 /**
1720  * Validate RTE_FLOW_ITEM_TYPE_UDP item if VXLAN_DECAP action
1721  * is present in actions list.
1722  *
1723  * @param[in] udp
1724  *   Outer UDP layer item (if any, NULL otherwise).
1725  * @param[out] error
1726  *   Pointer to the error structure.
1727  *
1728  * @return
1729  *   0 on success, a negative errno value otherwise and rte_errno is set.
1730  */
1731 static int
1732 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1733                                   struct rte_flow_error *error)
1734 {
1735         const struct rte_flow_item_udp *spec = udp->spec;
1736         const struct rte_flow_item_udp *mask = udp->mask;
1737
1738         if (!spec)
1739                 /*
1740                  * Specification for UDP ports cannot be empty
1741                  * because it is required as decap parameter.
1742                  */
1743                 return rte_flow_error_set(error, EINVAL,
1744                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1745                                           "NULL UDP port specification"
1746                                           " for VXLAN decapsulation");
1747         if (!mask)
1748                 mask = &rte_flow_item_udp_mask;
1749         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1750                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1751                         return rte_flow_error_set
1752                                         (error, ENOTSUP,
1753                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1754                                          "no support for partial mask on"
1755                                          " \"udp.hdr.dst_port\" field");
1756                 if (!spec->hdr.dst_port)
1757                         return rte_flow_error_set
1758                                         (error, EINVAL,
1759                                          RTE_FLOW_ERROR_TYPE_ITEM, udp,
1760                                          "zero decap local UDP port");
1761         } else {
1762                 return rte_flow_error_set(error, EINVAL,
1763                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1764                                           "outer UDP destination port must be "
1765                                           "specified for vxlan decapsulation");
1766         }
1767         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1768                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1769                         return rte_flow_error_set
1770                                         (error, ENOTSUP,
1771                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1772                                          "no support for partial mask on"
1773                                          " \"udp.hdr.src_port\" field");
1774                 DRV_LOG(WARNING,
1775                         "outer UDP local port cannot be "
1776                         "forced for VXLAN decapsulation, "
1777                         "parameter ignored");
1778         }
1779         return 0;
1780 }
1781
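/*
 * Illustrative sketch only, not referenced by the driver: a UDP item
 * accepted by the decapsulation checks above. The destination (local
 * VTEP) port is mandatory and fully masked; 4789 is the IANA VXLAN
 * port, used here as a placeholder. Matching on the source port is
 * ignored with a warning, so the mask leaves it out.
 *
 *    static const struct rte_flow_item_udp decap_udp = {
 *            .hdr = { .dst_port = RTE_BE16(4789) },
 *    };
 *    static const struct rte_flow_item_udp decap_udp_mask = {
 *            .hdr = { .dst_port = RTE_BE16(0xffff) },
 *    };
 */
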
1782 /**
1783  * Validate flow for E-Switch.
1784  *
1785  * @param[in] priv
1786  *   Pointer to the priv structure.
1787  * @param[in] attr
1788  *   Pointer to the flow attributes.
1789  * @param[in] items
1790  *   Pointer to the list of items.
1791  * @param[in] actions
1792  *   Pointer to the list of actions.
1793  * @param[out] error
1794  *   Pointer to the error structure.
1795  *
1796  * @return
1797  *   0 on success, a negative errno value otherwise and rte_errno is set.
1798  */
1799 static int
1800 flow_tcf_validate(struct rte_eth_dev *dev,
1801                   const struct rte_flow_attr *attr,
1802                   const struct rte_flow_item items[],
1803                   const struct rte_flow_action actions[],
1804                   struct rte_flow_error *error)
1805 {
1806         union {
1807                 const struct rte_flow_item_port_id *port_id;
1808                 const struct rte_flow_item_eth *eth;
1809                 const struct rte_flow_item_vlan *vlan;
1810                 const struct rte_flow_item_ipv4 *ipv4;
1811                 const struct rte_flow_item_ipv6 *ipv6;
1812                 const struct rte_flow_item_tcp *tcp;
1813                 const struct rte_flow_item_udp *udp;
1814                 const struct rte_flow_item_vxlan *vxlan;
1815         } spec, mask;
1816         union {
1817                 const struct rte_flow_action_port_id *port_id;
1818                 const struct rte_flow_action_jump *jump;
1819                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1820                 const struct rte_flow_action_of_set_vlan_vid *
1821                         of_set_vlan_vid;
1822                 const struct rte_flow_action_of_set_vlan_pcp *
1823                         of_set_vlan_pcp;
1824                 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1825                 const struct rte_flow_action_set_ipv4 *set_ipv4;
1826                 const struct rte_flow_action_set_ipv6 *set_ipv6;
1827         } conf;
1828         uint64_t item_flags = 0;
1829         uint64_t action_flags = 0;
1830         uint8_t next_protocol = -1; /* Wraps to 0xff: no L4 restriction yet. */
1831         unsigned int tcm_ifindex = 0;
1832         uint8_t pedit_validated = 0;
1833         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1834         struct rte_eth_dev *port_id_dev = NULL;
1835         bool in_port_id_set = false;
1836         int ret;
1837
1838         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1839                                                 PTOI_TABLE_SZ_MAX(dev)));
1840         ret = flow_tcf_validate_attributes(attr, error);
1841         if (ret < 0)
1842                 return ret;
1843         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1844                 unsigned int i;
1845                 uint64_t current_action_flag = 0;
1846
1847                 switch (actions->type) {
1848                 case RTE_FLOW_ACTION_TYPE_VOID:
1849                         break;
1850                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1851                         current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1852                         if (!actions->conf)
1853                                 break;
1854                         conf.port_id = actions->conf;
1855                         if (conf.port_id->original)
1856                                 i = 0;
1857                         else
1858                                 for (i = 0; ptoi[i].ifindex; ++i)
1859                                         if (ptoi[i].port_id == conf.port_id->id)
1860                                                 break;
1861                         if (!ptoi[i].ifindex)
1862                                 return rte_flow_error_set
1863                                         (error, ENODEV,
1864                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1865                                          conf.port_id,
1866                                          "missing data to convert port ID to"
1867                                          " ifindex");
1868                         port_id_dev = &rte_eth_devices[conf.port_id->id];
1869                         break;
1870                 case RTE_FLOW_ACTION_TYPE_JUMP:
1871                         current_action_flag = MLX5_FLOW_ACTION_JUMP;
1872                         if (!actions->conf)
1873                                 break;
1874                         conf.jump = actions->conf;
1875                         if (attr->group >= conf.jump->group)
1876                                 return rte_flow_error_set
1877                                         (error, ENOTSUP,
1878                                          RTE_FLOW_ERROR_TYPE_ACTION,
1879                                          actions,
1880                                          "can jump only to a higher group");
1881                         break;
1882                 case RTE_FLOW_ACTION_TYPE_DROP:
1883                         current_action_flag = MLX5_FLOW_ACTION_DROP;
1884                         break;
1885                 case RTE_FLOW_ACTION_TYPE_COUNT:
1886                         break;
1887                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1888                         current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1889                         break;
1890                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: {
1891                         rte_be16_t ethertype;
1892
1893                         current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1894                         if (!actions->conf)
1895                                 break;
1896                         conf.of_push_vlan = actions->conf;
1897                         ethertype = conf.of_push_vlan->ethertype;
1898                         if (ethertype != RTE_BE16(ETH_P_8021Q) &&
1899                             ethertype != RTE_BE16(ETH_P_8021AD))
1900                                 return rte_flow_error_set
1901                                         (error, EINVAL,
1902                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1903                                          "vlan push TPID must be "
1904                                          "802.1Q or 802.1AD");
1905                         break;
1906                 }
1907                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1908                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1909                                 return rte_flow_error_set
1910                                         (error, ENOTSUP,
1911                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1912                                          "vlan modify is not supported,"
1913                                          " set action must follow push action");
1914                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1915                         break;
1916                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1917                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1918                                 return rte_flow_error_set
1919                                         (error, ENOTSUP,
1920                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1921                                          "vlan modify is not supported,"
1922                                          " set action must follow push action");
1923                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1924                         break;
1925                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1926                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1927                         break;
1928                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1929                         ret = flow_tcf_validate_vxlan_encap(actions, error);
1930                         if (ret < 0)
1931                                 return ret;
1932                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1933                         break;
1934                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1935                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1936                         break;
1937                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1938                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1939                         break;
1940                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1941                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1942                         break;
1943                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1944                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1945                         break;
1946                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1947                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1948                         break;
1949                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1950                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1951                         break;
1952                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1953                         current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1954                         break;
1955                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1956                         current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1957                         break;
1958                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1959                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1960                         break;
1961                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1962                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1963                         break;
1964                 default:
1965                         return rte_flow_error_set(error, ENOTSUP,
1966                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1967                                                   actions,
1968                                                   "action not supported");
1969                 }
1970                 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1971                         if (!actions->conf)
1972                                 return rte_flow_error_set
1973                                         (error, EINVAL,
1974                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1975                                          actions,
1976                                          "action configuration not set");
1977                 }
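                /*
                 * Pedit (header modify) actions must form one
                 * contiguous group: once any other action follows a
                 * pedit action, pedit_validated is raised and a later
                 * pedit action is rejected by the check below.
                 */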
1978                 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1979                     pedit_validated)
1980                         return rte_flow_error_set(error, ENOTSUP,
1981                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1982                                                   actions,
1983                                                   "set actions should be "
1984                                                   "listed successively");
1985                 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1986                     (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1987                         pedit_validated = 1;
1988                 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1989                     (action_flags & MLX5_TCF_FATE_ACTIONS))
1990                         return rte_flow_error_set(error, EINVAL,
1991                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1992                                                   actions,
1993                                                   "can't have multiple fate"
1994                                                   " actions");
1995                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1996                     (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1997                         return rte_flow_error_set(error, EINVAL,
1998                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1999                                                   actions,
2000                                                   "can't have multiple vxlan"
2001                                                   " actions");
2002                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
2003                     (action_flags & MLX5_TCF_VLAN_ACTIONS))
2004                         return rte_flow_error_set(error, ENOTSUP,
2005                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2006                                                   actions,
2007                                                   "can't have vxlan and vlan"
2008                                                   " actions in the same rule");
2009                 action_flags |= current_action_flag;
2010         }
2011         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2012                 unsigned int i;
2013
2014                 if ((item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2015                     items->type != RTE_FLOW_ITEM_TYPE_ETH)
2016                         return rte_flow_error_set(error, ENOTSUP,
2017                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2018                                                   items,
2019                                                   "only L2 inner item"
2020                                                   " is supported");
2021                 switch (items->type) {
2022                 case RTE_FLOW_ITEM_TYPE_VOID:
2023                         break;
2024                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2025                         mask.port_id = flow_tcf_item_mask
2026                                 (items, &rte_flow_item_port_id_mask,
2027                                  &flow_tcf_mask_supported.port_id,
2028                                  &flow_tcf_mask_empty.port_id,
2029                                  sizeof(flow_tcf_mask_supported.port_id),
2030                                  error);
2031                         if (!mask.port_id)
2032                                 return -rte_errno;
2033                         if (mask.port_id == &flow_tcf_mask_empty.port_id) {
2034                                 in_port_id_set = 1;
2035                                 break;
2036                         }
2037                         spec.port_id = items->spec;
2038                         if (mask.port_id->id && mask.port_id->id != 0xffffffff)
2039                                 return rte_flow_error_set
2040                                         (error, ENOTSUP,
2041                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2042                                          mask.port_id,
2043                                          "no support for partial mask on"
2044                                          " \"id\" field");
2045                         if (!mask.port_id->id)
2046                                 i = 0;
2047                         else
2048                                 for (i = 0; ptoi[i].ifindex; ++i)
2049                                         if (ptoi[i].port_id == spec.port_id->id)
2050                                                 break;
2051                         if (!ptoi[i].ifindex)
2052                                 return rte_flow_error_set
2053                                         (error, ENODEV,
2054                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2055                                          spec.port_id,
2056                                          "missing data to convert port ID to"
2057                                          " ifindex");
2058                         if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
2059                                 return rte_flow_error_set
2060                                         (error, ENOTSUP,
2061                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2062                                          spec.port_id,
2063                                          "cannot match traffic for"
2064                                          " several port IDs through"
2065                                          " a single flow rule");
2066                         tcm_ifindex = ptoi[i].ifindex;
2067                         in_port_id_set = 1;
2068                         break;
2069                 case RTE_FLOW_ITEM_TYPE_ETH:
2070                         ret = mlx5_flow_validate_item_eth(items, item_flags,
2071                                                           error);
2072                         if (ret < 0)
2073                                 return ret;
2074                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2075                                         MLX5_FLOW_LAYER_INNER_L2 :
2076                                         MLX5_FLOW_LAYER_OUTER_L2;
2077                         /* TODO:
2078                          * Redundant check due to different supported mask.
2079                          * Same for the rest of items.
2080                          */
2081                         mask.eth = flow_tcf_item_mask
2082                                 (items, &rte_flow_item_eth_mask,
2083                                  &flow_tcf_mask_supported.eth,
2084                                  &flow_tcf_mask_empty.eth,
2085                                  sizeof(flow_tcf_mask_supported.eth),
2086                                  error);
2087                         if (!mask.eth)
2088                                 return -rte_errno;
2089                         if (mask.eth->type && mask.eth->type !=
2090                             RTE_BE16(0xffff))
2091                                 return rte_flow_error_set
2092                                         (error, ENOTSUP,
2093                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2094                                          mask.eth,
2095                                          "no support for partial mask on"
2096                                          " \"type\" field");
2097                         break;
2098                 case RTE_FLOW_ITEM_TYPE_VLAN:
2099                         ret = mlx5_flow_validate_item_vlan(items, item_flags,
2100                                                            error);
2101                         if (ret < 0)
2102                                 return ret;
2103                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2104                         mask.vlan = flow_tcf_item_mask
2105                                 (items, &rte_flow_item_vlan_mask,
2106                                  &flow_tcf_mask_supported.vlan,
2107                                  &flow_tcf_mask_empty.vlan,
2108                                  sizeof(flow_tcf_mask_supported.vlan),
2109                                  error);
2110                         if (!mask.vlan)
2111                                 return -rte_errno;
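                        /*
                         * The TCI sub-fields PCP (0xe000) and VID
                         * (0x0fff), as well as inner_type, must each
                         * be matched either in full or not at all;
                         * partial bit masks cannot be expressed in
                         * flower and are rejected below.
                         */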
2112                         if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2113                              (mask.vlan->tci & RTE_BE16(0xe000)) !=
2114                               RTE_BE16(0xe000)) ||
2115                             (mask.vlan->tci & RTE_BE16(0x0fff) &&
2116                              (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2117                               RTE_BE16(0x0fff)) ||
2118                             (mask.vlan->inner_type &&
2119                              mask.vlan->inner_type != RTE_BE16(0xffff)))
2120                                 return rte_flow_error_set
2121                                         (error, ENOTSUP,
2122                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2123                                          mask.vlan,
2124                                          "no support for partial masks on"
2125                                          " \"tci\" (PCP and VID parts) and"
2126                                          " \"inner_type\" fields");
2127                         break;
2128                 case RTE_FLOW_ITEM_TYPE_IPV4:
2129                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
2130                                                            error);
2131                         if (ret < 0)
2132                                 return ret;
2133                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2134                         mask.ipv4 = flow_tcf_item_mask
2135                                 (items, &rte_flow_item_ipv4_mask,
2136                                  &flow_tcf_mask_supported.ipv4,
2137                                  &flow_tcf_mask_empty.ipv4,
2138                                  sizeof(flow_tcf_mask_supported.ipv4),
2139                                  error);
2140                         if (!mask.ipv4)
2141                                 return -rte_errno;
2142                         if (mask.ipv4->hdr.next_proto_id &&
2143                             mask.ipv4->hdr.next_proto_id != 0xff)
2144                                 return rte_flow_error_set
2145                                         (error, ENOTSUP,
2146                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2147                                          mask.ipv4,
2148                                          "no support for partial mask on"
2149                                          " \"hdr.next_proto_id\" field");
2150                         else if (mask.ipv4->hdr.next_proto_id)
2151                                 next_protocol =
2152                                         ((const struct rte_flow_item_ipv4 *)
2153                                          (items->spec))->hdr.next_proto_id;
2154                         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2155                                 ret = flow_tcf_validate_vxlan_decap_ipv4
2156                                                                 (items, error);
2157                                 if (ret < 0)
2158                                         return ret;
2159                         }
2160                         break;
2161                 case RTE_FLOW_ITEM_TYPE_IPV6:
2162                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
2163                                                            error);
2164                         if (ret < 0)
2165                                 return ret;
2166                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2167                         mask.ipv6 = flow_tcf_item_mask
2168                                 (items, &rte_flow_item_ipv6_mask,
2169                                  &flow_tcf_mask_supported.ipv6,
2170                                  &flow_tcf_mask_empty.ipv6,
2171                                  sizeof(flow_tcf_mask_supported.ipv6),
2172                                  error);
2173                         if (!mask.ipv6)
2174                                 return -rte_errno;
2175                         if (mask.ipv6->hdr.proto &&
2176                             mask.ipv6->hdr.proto != 0xff)
2177                                 return rte_flow_error_set
2178                                         (error, ENOTSUP,
2179                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2180                                          mask.ipv6,
2181                                          "no support for partial mask on"
2182                                          " \"hdr.proto\" field");
2183                         else if (mask.ipv6->hdr.proto)
2184                                 next_protocol =
2185                                         ((const struct rte_flow_item_ipv6 *)
2186                                          (items->spec))->hdr.proto;
2187                         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2188                                 ret = flow_tcf_validate_vxlan_decap_ipv6
2189                                                                 (items, error);
2190                                 if (ret < 0)
2191                                         return ret;
2192                         }
2193                         break;
2194                 case RTE_FLOW_ITEM_TYPE_UDP:
2195                         ret = mlx5_flow_validate_item_udp(items, item_flags,
2196                                                           next_protocol, error);
2197                         if (ret < 0)
2198                                 return ret;
2199                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
2200                         mask.udp = flow_tcf_item_mask
2201                                 (items, &rte_flow_item_udp_mask,
2202                                  &flow_tcf_mask_supported.udp,
2203                                  &flow_tcf_mask_empty.udp,
2204                                  sizeof(flow_tcf_mask_supported.udp),
2205                                  error);
2206                         if (!mask.udp)
2207                                 return -rte_errno;
2208                         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2209                                 ret = flow_tcf_validate_vxlan_decap_udp
2210                                                                 (items, error);
2211                                 if (ret < 0)
2212                                         return ret;
2213                         }
2214                         break;
2215                 case RTE_FLOW_ITEM_TYPE_TCP:
2216                         ret = mlx5_flow_validate_item_tcp
2217                                              (items, item_flags,
2218                                               next_protocol,
2219                                               &flow_tcf_mask_supported.tcp,
2220                                               error);
2221                         if (ret < 0)
2222                                 return ret;
2223                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
2224                         mask.tcp = flow_tcf_item_mask
2225                                 (items, &rte_flow_item_tcp_mask,
2226                                  &flow_tcf_mask_supported.tcp,
2227                                  &flow_tcf_mask_empty.tcp,
2228                                  sizeof(flow_tcf_mask_supported.tcp),
2229                                  error);
2230                         if (!mask.tcp)
2231                                 return -rte_errno;
2232                         break;
2233                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2234                         if (!(action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP))
2235                                 return rte_flow_error_set
2236                                         (error, ENOTSUP,
2237                                          RTE_FLOW_ERROR_TYPE_ITEM,
2238                                          items,
2239                                          "vni pattern should be followed by"
2240                                          " vxlan decapsulation action");
2241                         ret = mlx5_flow_validate_item_vxlan(items,
2242                                                             item_flags, error);
2243                         if (ret < 0)
2244                                 return ret;
2245                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
2246                         mask.vxlan = flow_tcf_item_mask
2247                                 (items, &rte_flow_item_vxlan_mask,
2248                                  &flow_tcf_mask_supported.vxlan,
2249                                  &flow_tcf_mask_empty.vxlan,
2250                                  sizeof(flow_tcf_mask_supported.vxlan), error);
2251                         if (!mask.vxlan)
2252                                 return -rte_errno;
2253                         if (mask.vxlan->vni[0] != 0xff ||
2254                             mask.vxlan->vni[1] != 0xff ||
2255                             mask.vxlan->vni[2] != 0xff)
2256                                 return rte_flow_error_set
2257                                         (error, ENOTSUP,
2258                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2259                                          mask.vxlan,
2260                                          "no support for partial or "
2261                                          "empty mask on \"vxlan.vni\" field");
2262                         break;
2263                 default:
2264                         return rte_flow_error_set(error, ENOTSUP,
2265                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2266                                                   items, "item not supported");
2267                 }
2268         }
2269         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2270             (action_flags & MLX5_FLOW_ACTION_DROP))
2271                 return rte_flow_error_set(error, ENOTSUP,
2272                                           RTE_FLOW_ERROR_TYPE_ACTION,
2273                                           actions,
2274                                           "set action is not compatible with "
2275                                           "drop action");
2276         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2277             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2278                 return rte_flow_error_set(error, ENOTSUP,
2279                                           RTE_FLOW_ERROR_TYPE_ACTION,
2280                                           actions,
2281                                           "set action must be followed by "
2282                                           "port_id action");
2283         if (action_flags &
2284            (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2285                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2286                         return rte_flow_error_set(error, EINVAL,
2287                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2288                                                   actions,
2289                                                   "no ipv4 item found in"
2290                                                   " pattern");
2291         }
2292         if (action_flags &
2293            (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2294                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2295                         return rte_flow_error_set(error, EINVAL,
2296                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2297                                                   actions,
2298                                                   "no ipv6 item found in"
2299                                                   " pattern");
2300         }
2301         if (action_flags &
2302            (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2303                 if (!(item_flags &
2304                      (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2305                       MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2306                         return rte_flow_error_set(error, EINVAL,
2307                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2308                                                   actions,
2309                                                   "no TCP/UDP item found in"
2310                                                   " pattern");
2311         }
2312         /*
2313          * FW syndrome (0xA9C090):
2314          *     set_flow_table_entry: push vlan action fte in fdb can ONLY be
2315          *     forward to the uplink.
2316          */
2317         if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2318             (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2319             ((struct priv *)port_id_dev->data->dev_private)->representor)
2320                 return rte_flow_error_set(error, ENOTSUP,
2321                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2322                                           "vlan push can only be applied"
2323                                           " when forwarding to uplink port");
2324         /*
2325          * FW syndrome (0x294609):
2326          *     set_flow_table_entry: modify/pop/push actions in fdb flow table
2327          *     are supported only while forwarding to vport.
2328          */
2329         if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2330             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2331                 return rte_flow_error_set(error, ENOTSUP,
2332                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2333                                           "vlan actions are supported"
2334                                           " only with port_id action");
2335         if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2336             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2337                 return rte_flow_error_set(error, ENOTSUP,
2338                                           RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2339                                           "vxlan actions are supported"
2340                                           " only with port_id action");
2341         if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2342                 return rte_flow_error_set(error, EINVAL,
2343                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2344                                           "no fate action is found");
2345         if (action_flags &
2346            (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2347                 if (!(item_flags &
2348                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2349                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2350                         return rte_flow_error_set(error, EINVAL,
2351                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2352                                                   actions,
2353                                                   "no IP found in pattern");
2354         }
2355         if (action_flags &
2356             (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2357                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2358                         return rte_flow_error_set(error, ENOTSUP,
2359                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2360                                                   actions,
2361                                                   "no ethernet found in"
2362                                                   " pattern");
2363         }
2364         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2365                 if (!(item_flags &
2366                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2367                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2368                         return rte_flow_error_set(error, EINVAL,
2369                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2370                                                   NULL,
2371                                                   "no outer IP pattern found"
2372                                                   " for vxlan decap action");
2373                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2374                         return rte_flow_error_set(error, EINVAL,
2375                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2376                                                   NULL,
2377                                                   "no outer UDP pattern found"
2378                                                   " for vxlan decap action");
2379                 if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
2380                         return rte_flow_error_set(error, EINVAL,
2381                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2382                                                   NULL,
2383                                                   "no VNI pattern found"
2384                                                   " for vxlan decap action");
2385         }
2386         return 0;
2387 }
2388
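/*
 * Illustrative sketch only, not referenced by the driver: the shape
 * of a pattern/action list that flow_tcf_validate() accepts for VXLAN
 * decapsulation. All spec/mask objects (decap_ipv4, decap_udp and a
 * fully masked VNI spec decap_vxlan) are placeholders along the lines
 * of the earlier sketches, and port_conf is a placeholder
 * rte_flow_action_port_id naming the destination port.
 *
 *    const struct rte_flow_item pattern[] = {
 *            { .type = RTE_FLOW_ITEM_TYPE_PORT_ID },
 *            { .type = RTE_FLOW_ITEM_TYPE_ETH },
 *            { .type = RTE_FLOW_ITEM_TYPE_IPV4,
 *              .spec = &decap_ipv4, .mask = &decap_ipv4_mask },
 *            { .type = RTE_FLOW_ITEM_TYPE_UDP,
 *              .spec = &decap_udp, .mask = &decap_udp_mask },
 *            { .type = RTE_FLOW_ITEM_TYPE_VXLAN,
 *              .spec = &decap_vxlan, .mask = &rte_flow_item_vxlan_mask },
 *            { .type = RTE_FLOW_ITEM_TYPE_END },
 *    };
 *    const struct rte_flow_action actions[] = {
 *            { .type = RTE_FLOW_ACTION_TYPE_VXLAN_DECAP },
 *            { .type = RTE_FLOW_ACTION_TYPE_PORT_ID, .conf = &port_conf },
 *            { .type = RTE_FLOW_ACTION_TYPE_END },
 *    };
 */
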
2389 /**
2390  * Calculate maximum size of memory for flow items of Linux TC flower.
2391  *
2392  * @param[in] attr
2393  *   Pointer to the flow attributes.
2394  * @param[in] items
2395  *   Pointer to the list of items.
2396  *
2397  * @return
2398  *   Maximum size of memory for items.
2399  */
2400 static int
2401 flow_tcf_get_items_size(const struct rte_flow_attr *attr,
2402                         const struct rte_flow_item items[])
2403 {
2404         int size = 0;
2405
2406         size += SZ_NLATTR_STRZ_OF("flower") +
2407                 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2408                 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2409         if (attr->group > 0)
2410                 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2411         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2412                 switch (items->type) {
2413                 case RTE_FLOW_ITEM_TYPE_VOID:
2414                         break;
2415                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2416                         break;
2417                 case RTE_FLOW_ITEM_TYPE_ETH:
2418                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2419                                 SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2420                                 /* dst/src MAC addr and mask. */
2421                         break;
2422                 case RTE_FLOW_ITEM_TYPE_VLAN:
2423                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2424                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2425                                 /* VLAN Ether type. */
2426                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2427                                 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2428                         break;
2429                 case RTE_FLOW_ITEM_TYPE_IPV4:
2430                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2431                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2432                                 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2433                                 /* dst/src IP addr and mask. */
2434                         break;
2435                 case RTE_FLOW_ITEM_TYPE_IPV6:
2436                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2437                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2438                                 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2439                                 /* dst/src IP addr and mask. */
2440                         break;
2441                 case RTE_FLOW_ITEM_TYPE_UDP:
2442                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2443                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2444                                 /* dst/src port and mask. */
2445                         break;
2446                 case RTE_FLOW_ITEM_TYPE_TCP:
2447                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2448                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2449                                 /* dst/src port and mask. */
2450                         break;
2451                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2452                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2453                         break;
2454                 default:
2455                         DRV_LOG(WARNING,
2456                                 "unsupported item %p type %d,"
2457                                 " items must be validated before flow creation",
2458                                 (const void *)items, items->type);
2459                         break;
2460                 }
2461         }
2462         return size;
2463 }
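
/*
 * A worked example of the bound above, assuming the SZ_NLATTR_*
 * helpers defined earlier in this file wrap MNL_ALIGN() in the usual
 * libmnl fashion (4-byte attribute header, 4-byte alignment):
 *
 *   RTE_FLOW_ITEM_TYPE_ETH:
 *     SZ_NLATTR_TYPE_OF(uint16_t)       = MNL_ALIGN(4 + 2) =  8
 *     SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) = MNL_ALIGN(4 + 6) = 12
 *     total = 8 + 12 * 4 = 56 bytes
 *
 * for the Ether type plus dst/src addresses and their masks. This is
 * a conservative upper bound, the translation step may emit fewer
 * attributes than counted here.
 */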
2464
2465 /**
2466  * Calculate size of memory to store the VXLAN encapsulation
2467  * related items in the Netlink message buffer. Items list
2468  * is specified by RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action.
2469  * The item list should be validated.
2470  *
2471  * @param[in] action
2472  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2473  *   List of pattern items to scan data from.
2474  *
2475  * @return
2476  *   The size of the part of the Netlink message buffer needed to
2477  *   store the VXLAN encapsulation item attributes.
2478  */
2479 static int
2480 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2481 {
2482         const struct rte_flow_item *items;
2483         int size = 0;
2484
2485         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2486         assert(action->conf);
2487
2488         items = ((const struct rte_flow_action_vxlan_encap *)
2489                                         action->conf)->definition;
2490         assert(items);
2491         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2492                 switch (items->type) {
2493                 case RTE_FLOW_ITEM_TYPE_VOID:
2494                         break;
2495                 case RTE_FLOW_ITEM_TYPE_ETH:
2496                         /* This item does not require message buffer. */
2497                         break;
2498                 case RTE_FLOW_ITEM_TYPE_IPV4:
2499                         size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2500                         break;
2501                 case RTE_FLOW_ITEM_TYPE_IPV6:
2502                         size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2503                         break;
2504                 case RTE_FLOW_ITEM_TYPE_UDP: {
2505                         const struct rte_flow_item_udp *udp = items->mask;
2506
2507                         size += SZ_NLATTR_TYPE_OF(uint16_t);
2508                         if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2509                                 size += SZ_NLATTR_TYPE_OF(uint16_t);
2510                         break;
2511                 }
2512                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2513                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2514                         break;
2515                 default:
2516                         assert(false);
2517                         DRV_LOG(WARNING,
2518                                 "unsupported item %p type %d,"
2519                                 " items must be validated"
2520                                 " before flow creation",
2521                                 (const void *)items, items->type);
2522                         return 0;
2523                 }
2524         }
2525         return size;
2526 }
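
/*
 * A minimal usage sketch (not part of the driver, guarded by a
 * hypothetical MLX5_FLOW_TCF_EXAMPLES macro): sizing a VXLAN_ENCAP
 * action whose definition carries ETH, IPv4, UDP and VXLAN items.
 */
#ifdef MLX5_FLOW_TCF_EXAMPLES
static int
flow_tcf_vxlan_encap_size_example(void)
{
	struct rte_flow_item definition[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_ETH },
		{ .type = RTE_FLOW_ITEM_TYPE_IPV4 },
		{ .type = RTE_FLOW_ITEM_TYPE_UDP },
		{ .type = RTE_FLOW_ITEM_TYPE_VXLAN },
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	struct rte_flow_action_vxlan_encap conf = {
		.definition = definition,
	};
	struct rte_flow_action action = {
		.type = RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP,
		.conf = &conf,
	};

	/*
	 * IPv4 addresses, both UDP ports (the UDP mask is NULL) and the
	 * VNI are counted: 40 bytes with the usual 4-byte alignment.
	 */
	return flow_tcf_vxlan_encap_size(&action);
}
#endif /* MLX5_FLOW_TCF_EXAMPLES */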
2527
2528 /**
2529  * Calculate maximum size of memory for flow actions of Linux TC flower and
2530  * extract specified actions.
2531  *
2532  * @param[in] actions
2533  *   Pointer to the list of actions.
2534  * @param[out] action_flags
2535  *   Pointer to the detected action flags.
2536  *
2537  * @return
2538  *   Maximum size of memory for actions.
2539  */
2540 static int
2541 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2542                               uint64_t *action_flags)
2543 {
2544         int size = 0;
2545         uint64_t flags = 0;
2546
2547         size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2548         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2549                 switch (actions->type) {
2550                 case RTE_FLOW_ACTION_TYPE_VOID:
2551                         break;
2552                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2553                         size += SZ_NLATTR_NEST + /* na_act_index. */
2554                                 SZ_NLATTR_STRZ_OF("mirred") +
2555                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2556                                 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2557                         flags |= MLX5_FLOW_ACTION_PORT_ID;
2558                         break;
2559                 case RTE_FLOW_ACTION_TYPE_JUMP:
2560                         size += SZ_NLATTR_NEST + /* na_act_index. */
2561                                 SZ_NLATTR_STRZ_OF("gact") +
2562                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2563                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2564                         flags |= MLX5_FLOW_ACTION_JUMP;
2565                         break;
2566                 case RTE_FLOW_ACTION_TYPE_DROP:
2567                         size += SZ_NLATTR_NEST + /* na_act_index. */
2568                                 SZ_NLATTR_STRZ_OF("gact") +
2569                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2570                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2571                         flags |= MLX5_FLOW_ACTION_DROP;
2572                         break;
2573                 case RTE_FLOW_ACTION_TYPE_COUNT:
2574                         break;
2575                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2576                         flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2577                         goto action_of_vlan;
2578                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2579                         flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2580                         goto action_of_vlan;
2581                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2582                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2583                         goto action_of_vlan;
2584                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2585                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2586                         goto action_of_vlan;
2587 action_of_vlan:
2588                         size += SZ_NLATTR_NEST + /* na_act_index. */
2589                                 SZ_NLATTR_STRZ_OF("vlan") +
2590                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2591                                 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2592                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2593                                 /* VLAN protocol. */
2594                                 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2595                                 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2596                         break;
2597                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2598                         size += SZ_NLATTR_NEST + /* na_act_index. */
2599                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2600                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2601                                 SZ_NLATTR_TYPE_OF(uint8_t);
2602                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2603                         size += flow_tcf_vxlan_encap_size(actions) +
2604                                 RTE_ALIGN_CEIL /* preceding encap params. */
2605                                 (sizeof(struct flow_tcf_vxlan_encap),
2606                                 MNL_ALIGNTO);
2607                         flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2608                         break;
2609                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2610                         size += SZ_NLATTR_NEST + /* na_act_index. */
2611                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2612                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2613                                 SZ_NLATTR_TYPE_OF(uint8_t);
2614                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2615                         size += RTE_ALIGN_CEIL /* preceding decap params. */
2616                                 (sizeof(struct flow_tcf_vxlan_decap),
2617                                 MNL_ALIGNTO);
2618                         flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2619                         break;
2620                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2621                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2622                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2623                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2624                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2625                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2626                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2627                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2628                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2629                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2630                         size += flow_tcf_get_pedit_actions_size(&actions,
2631                                                                 &flags);
2632                         break;
2633                 default:
2634                         DRV_LOG(WARNING,
2635                                 "unsupported action %p type %d,"
2636                                 " actions must be validated before flow creation",
2637                                 (const void *)actions, actions->type);
2638                         break;
2639                 }
2640         }
2641         *action_flags = flags;
2642         return size;
2643 }
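
/*
 * A worked example of the bound above for RTE_FLOW_ACTION_TYPE_DROP,
 * under the same alignment assumptions (4-byte attribute header,
 * MNL_ALIGNTO of 4, tc_gen being five 32-bit fields):
 *
 *   SZ_NLATTR_NEST (na_act_index)     =  4
 *   SZ_NLATTR_STRZ_OF("gact")         = MNL_ALIGN(4 + 5)  = 12
 *   SZ_NLATTR_NEST (TCA_ACT_OPTIONS)  =  4
 *   SZ_NLATTR_TYPE_OF(struct tc_gact) = MNL_ALIGN(4 + 20) = 24
 *   total                             = 44 bytes
 */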
2644
2645 /**
2646  * Brand rtnetlink buffer with unique handle.
2647  *
2648  * This handle should be unique for a given network interface to avoid
2649  * collisions.
2650  *
2651  * @param nlh
2652  *   Pointer to Netlink message.
2653  * @param handle
2654  *   Unique 32-bit handle to use.
2655  */
2656 static void
2657 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
2658 {
2659         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
2660
2661         tcm->tcm_handle = handle;
2662         DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
2663                 (void *)nlh, handle);
2664 }
2665
2666 /**
2667  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2668  * memory required, allocates the memory, initializes Netlink message headers
2669  * and sets a unique TC message handle.
2670  *
2671  * @param[in] attr
2672  *   Pointer to the flow attributes.
2673  * @param[in] items
2674  *   Pointer to the list of items.
2675  * @param[in] actions
2676  *   Pointer to the list of actions.
2677  * @param[out] error
2678  *   Pointer to the error structure.
2679  *
2680  * @return
2681  *   Pointer to mlx5_flow object on success,
2682  *   otherwise NULL and rte_errno is set.
2683  */
2684 static struct mlx5_flow *
2685 flow_tcf_prepare(const struct rte_flow_attr *attr,
2686                  const struct rte_flow_item items[],
2687                  const struct rte_flow_action actions[],
2688                  struct rte_flow_error *error)
2689 {
2690         size_t size = RTE_ALIGN_CEIL
2691                         (sizeof(struct mlx5_flow),
2692                          alignof(struct flow_tcf_tunnel_hdr)) +
2693                       MNL_ALIGN(sizeof(struct nlmsghdr)) +
2694                       MNL_ALIGN(sizeof(struct tcmsg));
2695         struct mlx5_flow *dev_flow;
2696         uint64_t action_flags = 0;
2697         struct nlmsghdr *nlh;
2698         struct tcmsg *tcm;
2699         uint8_t *sp, *tun = NULL;
2700
2701         size += flow_tcf_get_items_size(attr, items);
2702         size += flow_tcf_get_actions_and_size(actions, &action_flags);
2703         dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2704         if (!dev_flow) {
2705                 rte_flow_error_set(error, ENOMEM,
2706                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2707                                    "not enough memory to create E-Switch flow");
2708                 return NULL;
2709         }
2710         sp = (uint8_t *)(dev_flow + 1);
2711         if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2712                 sp = RTE_PTR_ALIGN
2713                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2714                 tun = sp;
2715                 sp += RTE_ALIGN_CEIL
2716                         (sizeof(struct flow_tcf_vxlan_encap),
2717                         MNL_ALIGNTO);
2718 #ifndef NDEBUG
2719                 size -= RTE_ALIGN_CEIL
2720                         (sizeof(struct flow_tcf_vxlan_encap),
2721                         MNL_ALIGNTO);
2722 #endif
2723         } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2724                 sp = RTE_PTR_ALIGN
2725                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2726                 tun = sp;
2727                 sp += RTE_ALIGN_CEIL
2728                         (sizeof(struct flow_tcf_vxlan_decap),
2729                         MNL_ALIGNTO);
2730 #ifndef NDEBUG
2731                 size -= RTE_ALIGN_CEIL
2732                         (sizeof(struct flow_tcf_vxlan_decap),
2733                         MNL_ALIGNTO);
2734 #endif
2735         } else {
2736                 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
2737         }
2738         nlh = mnl_nlmsg_put_header(sp);
2739         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2740         *dev_flow = (struct mlx5_flow){
2741                 .tcf = (struct mlx5_flow_tcf){
2742 #ifndef NDEBUG
2743                         .nlsize = size - RTE_ALIGN_CEIL
2744                                 (sizeof(struct mlx5_flow),
2745                                  alignof(struct flow_tcf_tunnel_hdr)),
2746 #endif
2747                         .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
2748                         .nlh = nlh,
2749                         .tcm = tcm,
2750                 },
2751         };
2752         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2753                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2754         else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2755                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2756         /*
2757          * Generate a reasonably unique handle based on the address of the
2758          * target buffer.
2759          *
2760          * This is straightforward on 32-bit systems where the flow pointer
2761          * can be used directly. Otherwise, the pointer is shifted right by
2762          * log2 of the previous power of two of the buffer size and its
2763          * least significant bits are used.
2764          */
2765         if (sizeof(dev_flow) <= 4)
2766                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
2767         else
2768                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
2769                                        rte_log2_u32(rte_align32prevpow2(size)));
2770         return dev_flow;
2771 }
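
/*
 * A sketch of the resulting buffer layout for the VXLAN encap case
 * (offsets are illustrative, the actual padding comes from the
 * alignment arithmetic above):
 *
 *   +------------------------------+ <- rte_zmalloc(), MNL_ALIGNTO
 *   | struct mlx5_flow             |
 *   +------------------------------+ <- flow_tcf_tunnel_hdr alignment
 *   | struct flow_tcf_vxlan_encap  |    (tcf.tunnel points here)
 *   +------------------------------+ <- MNL_ALIGNTO
 *   | struct nlmsghdr (tcf.nlh)    |
 *   | struct tcmsg (tcf.tcm)       |
 *   | flower item/action attrs ... |
 *   +------------------------------+
 *
 * Branding example: with size = 512, rte_align32prevpow2(512) = 512,
 * so the 64-bit branch brands the message with (uintptr_t)dev_flow >> 9.
 * Live flows cannot share a buffer, so their addresses differ by at
 * least the buffer size, which keeps the shifted values reasonably
 * unique up to the 32-bit truncation of tcm_handle.
 */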
2772
2773 /**
2774  * Make adjustments for supporting count actions.
2775  *
2776  * @param[in] dev
2777  *   Pointer to the Ethernet device structure.
2778  * @param[in] dev_flow
2779  *   Pointer to mlx5_flow.
2780  * @param[out] error
2781  *   Pointer to error structure.
2782  *
2783  * @return
2784  *   0 on success, a negative errno value otherwise and rte_errno is set.
2785  */
2786 static int
2787 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2788                                   struct mlx5_flow *dev_flow,
2789                                   struct rte_flow_error *error)
2790 {
2791         struct rte_flow *flow = dev_flow->flow;
2792
2793         if (!flow->counter) {
2794                 flow->counter = flow_tcf_counter_new();
2795                 if (!flow->counter)
2796                         return rte_flow_error_set(error, rte_errno,
2797                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2798                                                   NULL,
2799                                                   "cannot get counter"
2800                                                   " context.");
2801         }
2802         return 0;
2803 }
2804
2805 /**
2806  * Convert VXLAN VNI to 32-bit integer.
2807  *
2808  * @param[in] vni
2809  *   VXLAN VNI in 24-bit wire format.
2810  *
2811  * @return
2812  *   VXLAN VNI as a 32-bit integer value in network endian.
2813  */
2814 static inline rte_be32_t
2815 vxlan_vni_as_be32(const uint8_t vni[3])
2816 {
2817         union {
2818                 uint8_t vni[4];
2819                 rte_be32_t dword;
2820         } ret = {
2821                 .vni = { 0, vni[0], vni[1], vni[2] },
2822         };
2823         return ret.dword;
2824 }
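
/*
 * For example, vni[] = {0x12, 0x34, 0x56} yields the in-memory bytes
 * {0x00, 0x12, 0x34, 0x56}, i.e. 0x00123456 read as a big-endian
 * 32-bit word, which is the layout TCA_FLOWER_KEY_ENC_KEY_ID and
 * TCA_TUNNEL_KEY_ENC_KEY_ID expect.
 */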
2825
2826 /**
2827  * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2828  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2829  * in the encapsulation parameters structure. The item must be
2830  * prevalidated; no validation checks are performed by this function.
2831  *
2832  * @param[in] spec
2833  *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
2834  * @param[in] mask
2835  *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
2836  * @param[out] encap
2837  *   Structure to fill the gathered MAC address data.
2838  */
2839 static void
2840 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2841                                const struct rte_flow_item_eth *mask,
2842                                struct flow_tcf_vxlan_encap *encap)
2843 {
2844         /* Item must be validated before. No redundant checks. */
2845         assert(spec);
2846         if (!mask || !memcmp(&mask->dst,
2847                              &rte_flow_item_eth_mask.dst,
2848                              sizeof(rte_flow_item_eth_mask.dst))) {
2849                 /*
2850                  * Ethernet addresses are not supported by
2851                  * tc as tunnel_key parameters. Destination
2852                  * address is needed to form encap packet
2853                  * header and retrieved by kernel from
2854                  * implicit sources (ARP table, etc),
2855                  * address masks are not supported at all.
2856                  */
2857                 encap->eth.dst = spec->dst;
2858                 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2859         }
2860         if (!mask || !memcmp(&mask->src,
2861                              &rte_flow_item_eth_mask.src,
2862                              sizeof(rte_flow_item_eth_mask.src))) {
2863                 /*
2864                  * Ethernet addresses are not supported by
2865                  * tc as tunnel_key parameters. Source ethernet
2866                  * address is ignored anyway.
2867                  */
2868                 encap->eth.src = spec->src;
2869                 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
2870         }
2871 }
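
/*
 * For example, a definition item with spec->dst = 11:22:33:44:55:66
 * and a NULL (or all-ones) destination mask stores that address in
 * encap->eth.dst and raises FLOW_TCF_ENCAP_ETH_DST; a partial address
 * mask leaves the field unset, so the kernel derives the address from
 * its own sources (ARP table, etc).
 */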
2872
2873 /**
2874  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
2875  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
2876  * in the encapsulation parameters structure. The item must be
2877  * prevalidated; no validation checks are performed by this function.
2878  *
2879  * @param[in] spec
2880  *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
2881  * @param[out] encap
2882  *   Structure to fill the gathered IPV4 address data.
2883  */
2884 static void
2885 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
2886                                 struct flow_tcf_vxlan_encap *encap)
2887 {
2888         /* Item must be validated before. No redundant checks. */
2889         assert(spec);
2890         encap->ipv4.dst = spec->hdr.dst_addr;
2891         encap->ipv4.src = spec->hdr.src_addr;
2892         encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
2893                        FLOW_TCF_ENCAP_IPV4_DST;
2894 }
2895
2896 /**
2897  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
2898  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
2899  * in the encapsulation parameters structure. The item must be
2900  * prevalidated; no validation checks are performed by this function.
2901  *
2902  * @param[in] spec
2903  *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
2904  * @param[out] encap
2905  *   Structure to fill the gathered IPV6 address data.
2906  */
2907 static void
2908 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
2909                                 struct flow_tcf_vxlan_encap *encap)
2910 {
2911         /* Item must be validated before. No redundant checks. */
2912         assert(spec);
2913         memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
2914         memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
2915         encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
2916                        FLOW_TCF_ENCAP_IPV6_DST;
2917 }
2918
2919 /**
2920  * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
2921  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
2922  * in the encapsulation parameters structure. The item must be
2923  * prevalidated; no validation checks are performed by this function.
2924  *
2925  * @param[in] spec
2926  *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
2927  * @param[in] mask
2928  *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
2929  * @param[out] encap
2930  *   Structure to fill the gathered UDP port data.
2931  */
2932 static void
2933 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
2934                                const struct rte_flow_item_udp *mask,
2935                                struct flow_tcf_vxlan_encap *encap)
2936 {
2937         assert(spec);
2938         encap->udp.dst = spec->hdr.dst_port;
2939         encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
2940         if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
2941                 encap->udp.src = spec->hdr.src_port;
2942                 encap->mask |= FLOW_TCF_ENCAP_UDP_SRC;
2943         }
2944 }
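
/*
 * For example, a definition with dst_port = RTE_BE16(4789) and a mask
 * whose src_port is zero stores only the destination port; source port
 * selection for the encapsulated traffic is then left to the kernel.
 */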
2945
2946 /**
2947  * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
2948  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
2949  * in the encapsulation parameters structure. The item must be
2950  * prevalidated; no validation checks are performed by this function.
2951  *
2952  * @param[in] spec
2953  *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
2954  * @param[out] encap
2955  *   Structure to fill the gathered VNI address data.
2956  */
2957 static void
2958 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
2959                                struct flow_tcf_vxlan_encap *encap)
2960 {
2961         /* Item must be validated before. No redundant checks. */
2962         assert(spec);
2963         memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
2964         encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
2965 }
2966
2967 /**
2968  * Populate consolidated encapsulation object from list of pattern items.
2969  *
2970  * Helper function to process the configuration of an action such as
2971  * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
2972  * validated; there is no way to return a meaningful error.
2973  *
2974  * @param[in] action
2975  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2976  *   List of pattern items to gather data from.
2977  * @param[out] encap
2978  *   Structure to fill the gathered data.
2979  */
2980 static void
2981 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
2982                            struct flow_tcf_vxlan_encap *encap)
2983 {
2984         union {
2985                 const struct rte_flow_item_eth *eth;
2986                 const struct rte_flow_item_ipv4 *ipv4;
2987                 const struct rte_flow_item_ipv6 *ipv6;
2988                 const struct rte_flow_item_udp *udp;
2989                 const struct rte_flow_item_vxlan *vxlan;
2990         } spec, mask;
2991         const struct rte_flow_item *items;
2992
2993         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2994         assert(action->conf);
2995
2996         items = ((const struct rte_flow_action_vxlan_encap *)
2997                                         action->conf)->definition;
2998         assert(items);
2999         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3000                 switch (items->type) {
3001                 case RTE_FLOW_ITEM_TYPE_VOID:
3002                         break;
3003                 case RTE_FLOW_ITEM_TYPE_ETH:
3004                         mask.eth = items->mask;
3005                         spec.eth = items->spec;
3006                         flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3007                                                        encap);
3008                         break;
3009                 case RTE_FLOW_ITEM_TYPE_IPV4:
3010                         spec.ipv4 = items->spec;
3011                         flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, encap);
3012                         break;
3013                 case RTE_FLOW_ITEM_TYPE_IPV6:
3014                         spec.ipv6 = items->spec;
3015                         flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, encap);
3016                         break;
3017                 case RTE_FLOW_ITEM_TYPE_UDP:
3018                         mask.udp = items->mask;
3019                         spec.udp = items->spec;
3020                         flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3021                                                        encap);
3022                         break;
3023                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3024                         spec.vxlan = items->spec;
3025                         flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
3026                         break;
3027                 default:
3028                         assert(false);
3029                         DRV_LOG(WARNING,
3030                                 "unsupported item %p type %d,"
3031                                 " items must be validated"
3032                                 " before flow creation",
3033                                 (const void *)items, items->type);
3034                         encap->mask = 0;
3035                         return;
3036                 }
3037         }
3038 }
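
/*
 * A minimal usage sketch (not part of the driver, guarded by the same
 * hypothetical MLX5_FLOW_TCF_EXAMPLES macro as above): a prevalidated
 * IPv4/UDP/VXLAN definition consolidated into the encap object.
 */
#ifdef MLX5_FLOW_TCF_EXAMPLES
static void
flow_tcf_vxlan_encap_parse_example(struct flow_tcf_vxlan_encap *encap)
{
	struct rte_flow_item_ipv4 ipv4 = {
		.hdr = {
			.src_addr = RTE_BE32(0xC0A80001), /* 192.168.0.1 */
			.dst_addr = RTE_BE32(0xC0A80002), /* 192.168.0.2 */
		},
	};
	struct rte_flow_item_udp udp = {
		.hdr = { .dst_port = RTE_BE16(4789) },
	};
	struct rte_flow_item_vxlan vxlan = {
		.vni = { 0x12, 0x34, 0x56 },
	};
	struct rte_flow_item definition[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &ipv4 },
		{ .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &udp },
		{ .type = RTE_FLOW_ITEM_TYPE_VXLAN, .spec = &vxlan },
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	struct rte_flow_action_vxlan_encap conf = {
		.definition = definition,
	};
	struct rte_flow_action action = {
		.type = RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP,
		.conf = &conf,
	};

	memset(encap, 0, sizeof(*encap)); /* the parser only ORs flags in */
	flow_tcf_vxlan_encap_parse(&action, encap);
	/*
	 * encap->mask now carries the IPV4_SRC/DST, UDP_SRC/DST (the UDP
	 * mask is NULL) and VXLAN_VNI bits.
	 */
}
#endif /* MLX5_FLOW_TCF_EXAMPLES */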
3039
3040 /**
3041  * Translate flow for Linux TC flower and construct Netlink message.
3042  *
3043  * @param[in] dev
3044  *   Pointer to the Ethernet device structure.
3045  * @param[in, out] dev_flow
3046  *   Pointer to the sub flow.
3047  * @param[in] attr
3048  *   Pointer to the flow attributes.
3049  * @param[in] items
3050  *   Pointer to the list of items.
3051  * @param[in] actions
3052  *   Pointer to the list of actions.
3053  * @param[out] error
3054  *   Pointer to the error structure.
3055  *
3056  * @return
3057  *   0 on success, a negative errno value otherwise and rte_errno is set.
3058  */
3059 static int
3060 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3061                    const struct rte_flow_attr *attr,
3062                    const struct rte_flow_item items[],
3063                    const struct rte_flow_action actions[],
3064                    struct rte_flow_error *error)
3065 {
3066         union {
3067                 const struct rte_flow_item_port_id *port_id;
3068                 const struct rte_flow_item_eth *eth;
3069                 const struct rte_flow_item_vlan *vlan;
3070                 const struct rte_flow_item_ipv4 *ipv4;
3071                 const struct rte_flow_item_ipv6 *ipv6;
3072                 const struct rte_flow_item_tcp *tcp;
3073                 const struct rte_flow_item_udp *udp;
3074                 const struct rte_flow_item_vxlan *vxlan;
3075         } spec, mask;
3076         union {
3077                 const struct rte_flow_action_port_id *port_id;
3078                 const struct rte_flow_action_jump *jump;
3079                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3080                 const struct rte_flow_action_of_set_vlan_vid *
3081                         of_set_vlan_vid;
3082                 const struct rte_flow_action_of_set_vlan_pcp *
3083                         of_set_vlan_pcp;
3084         } conf;
3085         union {
3086                 struct flow_tcf_tunnel_hdr *hdr;
3087                 struct flow_tcf_vxlan_decap *vxlan;
3088         } decap = {
3089                 .hdr = NULL,
3090         };
3091         union {
3092                 struct flow_tcf_tunnel_hdr *hdr;
3093                 struct flow_tcf_vxlan_encap *vxlan;
3094         } encap = {
3095                 .hdr = NULL,
3096         };
3097         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3098         struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3099         struct tcmsg *tcm = dev_flow->tcf.tcm;
3100         uint32_t na_act_index_cur;
3101         bool eth_type_set = 0;
3102         bool vlan_present = 0;
3103         bool vlan_eth_type_set = 0;
3104         bool ip_proto_set = 0;
3105         struct nlattr *na_flower;
3106         struct nlattr *na_flower_act;
3107         struct nlattr *na_vlan_id = NULL;
3108         struct nlattr *na_vlan_priority = NULL;
3109         uint64_t item_flags = 0;
3110         int ret;
3111
3112         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3113                                                 PTOI_TABLE_SZ_MAX(dev)));
3114         if (dev_flow->tcf.tunnel) {
3115                 switch (dev_flow->tcf.tunnel->type) {
3116                 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3117                         decap.vxlan = dev_flow->tcf.vxlan_decap;
3118                         break;
3119                 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3120                         encap.vxlan = dev_flow->tcf.vxlan_encap;
3121                         break;
3122                 /* New tunnel actions can be added here. */
3123                 default:
3124                         assert(false);
3125                         break;
3126                 }
3127         }
3128         nlh = dev_flow->tcf.nlh;
3129         tcm = dev_flow->tcf.tcm;
3130         /* Prepare API must have been called beforehand. */
3131         assert(nlh != NULL && tcm != NULL);
3132         tcm->tcm_family = AF_UNSPEC;
3133         tcm->tcm_ifindex = ptoi[0].ifindex;
3134         tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3135         /*
3136          * Priority cannot be zero to prevent the kernel from picking one
3137          * automatically.
3138          */
3139         tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3140                                   RTE_BE16(ETH_P_ALL));
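	/*
	 * For example, attr->priority == 0 becomes TC priority 1: the
	 * upper 16 bits of tcm_info carry the filter priority, the lower
	 * 16 bits the matched Ethernet protocol in network byte order
	 * (ETH_P_ALL here).
	 */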
3141         if (attr->group > 0)
3142                 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3143         mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3144         na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3145         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3146                 unsigned int i;
3147
3148                 switch (items->type) {
3149                 case RTE_FLOW_ITEM_TYPE_VOID:
3150                         break;
3151                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3152                         mask.port_id = flow_tcf_item_mask
3153                                 (items, &rte_flow_item_port_id_mask,
3154                                  &flow_tcf_mask_supported.port_id,
3155                                  &flow_tcf_mask_empty.port_id,
3156                                  sizeof(flow_tcf_mask_supported.port_id),
3157                                  error);
3158                         assert(mask.port_id);
3159                         if (mask.port_id == &flow_tcf_mask_empty.port_id)
3160                                 break;
3161                         spec.port_id = items->spec;
3162                         if (!mask.port_id->id)
3163                                 i = 0;
3164                         else
3165                                 for (i = 0; ptoi[i].ifindex; ++i)
3166                                         if (ptoi[i].port_id == spec.port_id->id)
3167                                                 break;
3168                         assert(ptoi[i].ifindex);
3169                         tcm->tcm_ifindex = ptoi[i].ifindex;
3170                         break;
3171                 case RTE_FLOW_ITEM_TYPE_ETH:
3172                         item_flags |= (item_flags & MLX5_FLOW_LAYER_VXLAN) ?
3173                                       MLX5_FLOW_LAYER_INNER_L2 :
3174                                       MLX5_FLOW_LAYER_OUTER_L2;
3175                         mask.eth = flow_tcf_item_mask
3176                                 (items, &rte_flow_item_eth_mask,
3177                                  &flow_tcf_mask_supported.eth,
3178                                  &flow_tcf_mask_empty.eth,
3179                                  sizeof(flow_tcf_mask_supported.eth),
3180                                  error);
3181                         assert(mask.eth);
3182                         if (mask.eth == &flow_tcf_mask_empty.eth)
3183                                 break;
3184                         spec.eth = items->spec;
3185                         if (decap.vxlan &&
3186                             !(item_flags & MLX5_FLOW_LAYER_VXLAN)) {
3187                                 DRV_LOG(WARNING,
3188                                         "outer L2 addresses cannot be forced"
3189                                         " for vxlan decapsulation, parameter"
3190                                         " ignored");
3191                                 break;
3192                         }
3193                         if (mask.eth->type) {
3194                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
3195                                                  spec.eth->type);
3196                                 eth_type_set = 1;
3197                         }
3198                         if (!is_zero_ether_addr(&mask.eth->dst)) {
3199                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3200                                              ETHER_ADDR_LEN,
3201                                              spec.eth->dst.addr_bytes);
3202                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3203                                              ETHER_ADDR_LEN,
3204                                              mask.eth->dst.addr_bytes);
3205                         }
3206                         if (!is_zero_ether_addr(&mask.eth->src)) {
3207                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3208                                              ETHER_ADDR_LEN,
3209                                              spec.eth->src.addr_bytes);
3210                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3211                                              ETHER_ADDR_LEN,
3212                                              mask.eth->src.addr_bytes);
3213                         }
3214                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3215                         break;
3216                 case RTE_FLOW_ITEM_TYPE_VLAN:
3217                         assert(!encap.hdr);
3218                         assert(!decap.hdr);
3219                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3220                         mask.vlan = flow_tcf_item_mask
3221                                 (items, &rte_flow_item_vlan_mask,
3222                                  &flow_tcf_mask_supported.vlan,
3223                                  &flow_tcf_mask_empty.vlan,
3224                                  sizeof(flow_tcf_mask_supported.vlan),
3225                                  error);
3226                         assert(mask.vlan);
3227                         if (!eth_type_set)
3228                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
3229                                                  RTE_BE16(ETH_P_8021Q));
3230                         eth_type_set = 1;
3231                         vlan_present = 1;
3232                         if (mask.vlan == &flow_tcf_mask_empty.vlan)
3233                                 break;
3234                         spec.vlan = items->spec;
3235                         if (mask.vlan->inner_type) {
3236                                 mnl_attr_put_u16(nlh,
3237                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3238                                                  spec.vlan->inner_type);
3239                                 vlan_eth_type_set = 1;
3240                         }
3241                         if (mask.vlan->tci & RTE_BE16(0xe000))
3242                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3243                                                 (rte_be_to_cpu_16
3244                                                  (spec.vlan->tci) >> 13) & 0x7);
3245                         if (mask.vlan->tci & RTE_BE16(0x0fff))
3246                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3247                                                  rte_be_to_cpu_16
3248                                                  (spec.vlan->tci &
3249                                                   RTE_BE16(0x0fff)));
3250                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3251                         break;
3252                 case RTE_FLOW_ITEM_TYPE_IPV4:
3253                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3254                         mask.ipv4 = flow_tcf_item_mask
3255                                 (items, &rte_flow_item_ipv4_mask,
3256                                  &flow_tcf_mask_supported.ipv4,
3257                                  &flow_tcf_mask_empty.ipv4,
3258                                  sizeof(flow_tcf_mask_supported.ipv4),
3259                                  error);
3260                         assert(mask.ipv4);
3261                         spec.ipv4 = items->spec;
3262                         if (!decap.vxlan) {
3263                                 if (!eth_type_set ||
3264                                     (!vlan_eth_type_set && vlan_present))
3265                                         mnl_attr_put_u16
3266                                                 (nlh,
3267                                                  vlan_present ?
3268                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE :
3269                                                  TCA_FLOWER_KEY_ETH_TYPE,
3270                                                  RTE_BE16(ETH_P_IP));
3271                                 eth_type_set = 1;
3272                                 vlan_eth_type_set = 1;
3273                                 if (mask.ipv4 == &flow_tcf_mask_empty.ipv4)
3274                                         break;
3275                                 if (mask.ipv4->hdr.next_proto_id) {
3276                                         mnl_attr_put_u8
3277                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3278                                                  spec.ipv4->hdr.next_proto_id);
3279                                         ip_proto_set = 1;
3280                                 }
3281                         } else {
3282                                 assert(mask.ipv4 != &flow_tcf_mask_empty.ipv4);
3283                         }
3284                         if (mask.ipv4->hdr.src_addr) {
3285                                 mnl_attr_put_u32
3286                                         (nlh, decap.vxlan ?
3287                                          TCA_FLOWER_KEY_ENC_IPV4_SRC :
3288                                          TCA_FLOWER_KEY_IPV4_SRC,
3289                                          spec.ipv4->hdr.src_addr);
3290                                 mnl_attr_put_u32
3291                                         (nlh, decap.vxlan ?
3292                                          TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3293                                          TCA_FLOWER_KEY_IPV4_SRC_MASK,
3294                                          mask.ipv4->hdr.src_addr);
3295                         }
3296                         if (mask.ipv4->hdr.dst_addr) {
3297                                 mnl_attr_put_u32
3298                                         (nlh, decap.vxlan ?
3299                                          TCA_FLOWER_KEY_ENC_IPV4_DST :
3300                                          TCA_FLOWER_KEY_IPV4_DST,
3301                                          spec.ipv4->hdr.dst_addr);
3302                                 mnl_attr_put_u32
3303                                         (nlh, decap.vxlan ?
3304                                          TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3305                                          TCA_FLOWER_KEY_IPV4_DST_MASK,
3306                                          mask.ipv4->hdr.dst_addr);
3307                         }
3308                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3309                         break;
3310                 case RTE_FLOW_ITEM_TYPE_IPV6:
3311                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3312                         mask.ipv6 = flow_tcf_item_mask
3313                                 (items, &rte_flow_item_ipv6_mask,
3314                                  &flow_tcf_mask_supported.ipv6,
3315                                  &flow_tcf_mask_empty.ipv6,
3316                                  sizeof(flow_tcf_mask_supported.ipv6),
3317                                  error);
3318                         assert(mask.ipv6);
3319                         spec.ipv6 = items->spec;
3320                         if (!decap.vxlan) {
3321                                 if (!eth_type_set ||
3322                                     (!vlan_eth_type_set && vlan_present))
3323                                         mnl_attr_put_u16
3324                                                 (nlh,
3325                                                  vlan_present ?
3326                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE :
3327                                                  TCA_FLOWER_KEY_ETH_TYPE,
3328                                                  RTE_BE16(ETH_P_IPV6));
3329                                 eth_type_set = 1;
3330                                 vlan_eth_type_set = 1;
3331                                 if (mask.ipv6 == &flow_tcf_mask_empty.ipv6)
3332                                         break;
3333                                 if (mask.ipv6->hdr.proto) {
3334                                         mnl_attr_put_u8
3335                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3336                                                  spec.ipv6->hdr.proto);
3337                                         ip_proto_set = 1;
3338                                 }
3339                         } else {
3340                                 assert(mask.ipv6 != &flow_tcf_mask_empty.ipv6);
3341                         }
3342                         if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) {
3343                                 mnl_attr_put(nlh, decap.vxlan ?
3344                                              TCA_FLOWER_KEY_ENC_IPV6_SRC :
3345                                              TCA_FLOWER_KEY_IPV6_SRC,
3346                                              IPV6_ADDR_LEN,
3347                                              spec.ipv6->hdr.src_addr);
3348                                 mnl_attr_put(nlh, decap.vxlan ?
3349                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3350                                              TCA_FLOWER_KEY_IPV6_SRC_MASK,
3351                                              IPV6_ADDR_LEN,
3352                                              mask.ipv6->hdr.src_addr);
3353                         }
3354                         if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) {
3355                                 mnl_attr_put(nlh, decap.vxlan ?
3356                                              TCA_FLOWER_KEY_ENC_IPV6_DST :
3357                                              TCA_FLOWER_KEY_IPV6_DST,
3358                                              IPV6_ADDR_LEN,
3359                                              spec.ipv6->hdr.dst_addr);
3360                                 mnl_attr_put(nlh, decap.vxlan ?
3361                                              TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3362                                              TCA_FLOWER_KEY_IPV6_DST_MASK,
3363                                              IPV6_ADDR_LEN,
3364                                              mask.ipv6->hdr.dst_addr);
3365                         }
3366                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3367                         break;
3368                 case RTE_FLOW_ITEM_TYPE_UDP:
3369                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
3370                         mask.udp = flow_tcf_item_mask
3371                                 (items, &rte_flow_item_udp_mask,
3372                                  &flow_tcf_mask_supported.udp,
3373                                  &flow_tcf_mask_empty.udp,
3374                                  sizeof(flow_tcf_mask_supported.udp),
3375                                  error);
3376                         assert(mask.udp);
3377                         spec.udp = items->spec;
3378                         if (!decap.vxlan) {
3379                                 if (!ip_proto_set)
3380                                         mnl_attr_put_u8
3381                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3382                                                 IPPROTO_UDP);
3383                                 if (mask.udp == &flow_tcf_mask_empty.udp)
3384                                         break;
3385                         } else {
3386                                 assert(mask.udp != &flow_tcf_mask_empty.udp);
3387                                 decap.vxlan->udp_port =
3388                                         rte_be_to_cpu_16
3389                                                 (spec.udp->hdr.dst_port);
3390                         }
3391                         if (mask.udp->hdr.src_port) {
3392                                 mnl_attr_put_u16
3393                                         (nlh, decap.vxlan ?
3394                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3395                                          TCA_FLOWER_KEY_UDP_SRC,
3396                                          spec.udp->hdr.src_port);
3397                                 mnl_attr_put_u16
3398                                         (nlh, decap.vxlan ?
3399                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3400                                          TCA_FLOWER_KEY_UDP_SRC_MASK,
3401                                          mask.udp->hdr.src_port);
3402                         }
3403                         if (mask.udp->hdr.dst_port) {
3404                                 mnl_attr_put_u16
3405                                         (nlh, decap.vxlan ?
3406                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3407                                          TCA_FLOWER_KEY_UDP_DST,
3408                                          spec.udp->hdr.dst_port);
3409                                 mnl_attr_put_u16
3410                                         (nlh, decap.vxlan ?
3411                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3412                                          TCA_FLOWER_KEY_UDP_DST_MASK,
3413                                          mask.udp->hdr.dst_port);
3414                         }
3415                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3416                         break;
3417                 case RTE_FLOW_ITEM_TYPE_TCP:
3418                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
3419                         mask.tcp = flow_tcf_item_mask
3420                                 (items, &rte_flow_item_tcp_mask,
3421                                  &flow_tcf_mask_supported.tcp,
3422                                  &flow_tcf_mask_empty.tcp,
3423                                  sizeof(flow_tcf_mask_supported.tcp),
3424                                  error);
3425                         assert(mask.tcp);
3426                         if (!ip_proto_set)
3427                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3428                                                 IPPROTO_TCP);
3429                         if (mask.tcp == &flow_tcf_mask_empty.tcp)
3430                                 break;
3431                         spec.tcp = items->spec;
3432                         if (mask.tcp->hdr.src_port) {
3433                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3434                                                  spec.tcp->hdr.src_port);
3435                                 mnl_attr_put_u16(nlh,
3436                                                  TCA_FLOWER_KEY_TCP_SRC_MASK,
3437                                                  mask.tcp->hdr.src_port);
3438                         }
3439                         if (mask.tcp->hdr.dst_port) {
3440                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3441                                                  spec.tcp->hdr.dst_port);
3442                                 mnl_attr_put_u16(nlh,
3443                                                  TCA_FLOWER_KEY_TCP_DST_MASK,
3444                                                  mask.tcp->hdr.dst_port);
3445                         }
3446                         if (mask.tcp->hdr.tcp_flags) {
3447                                 mnl_attr_put_u16
3448                                         (nlh,
3449                                          TCA_FLOWER_KEY_TCP_FLAGS,
3450                                          rte_cpu_to_be_16
3451                                                 (spec.tcp->hdr.tcp_flags));
3452                                 mnl_attr_put_u16
3453                                         (nlh,
3454                                          TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3455                                          rte_cpu_to_be_16
3456                                                 (mask.tcp->hdr.tcp_flags));
3457                         }
3458                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3459                         break;
3460                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3461                         assert(decap.vxlan);
3462                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
3463                         spec.vxlan = items->spec;
3464                         mnl_attr_put_u32(nlh,
3465                                          TCA_FLOWER_KEY_ENC_KEY_ID,
3466                                          vxlan_vni_as_be32(spec.vxlan->vni));
3467                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3468                         break;
3469                 default:
3470                         return rte_flow_error_set(error, ENOTSUP,
3471                                                   RTE_FLOW_ERROR_TYPE_ITEM,
3472                                                   NULL, "item not supported");
3473                 }
3474         }
3475         na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3476         na_act_index_cur = 1;
3477         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3478                 struct nlattr *na_act_index;
3479                 struct nlattr *na_act;
3480                 unsigned int vlan_act;
3481                 unsigned int i;
3482
3483                 switch (actions->type) {
3484                 case RTE_FLOW_ACTION_TYPE_VOID:
3485                         break;
3486                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3487                         conf.port_id = actions->conf;
3488                         if (conf.port_id->original)
3489                                 i = 0;
3490                         else
3491                                 for (i = 0; ptoi[i].ifindex; ++i)
3492                                         if (ptoi[i].port_id == conf.port_id->id)
3493                                                 break;
3494                         assert(ptoi[i].ifindex);
3495                         na_act_index =
3496                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3497                         assert(na_act_index);
3498                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3499                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3500                         assert(na_act);
3501                         if (encap.hdr) {
3502                                 assert(dev_flow->tcf.tunnel);
3503                                 dev_flow->tcf.tunnel->ifindex_ptr =
3504                                         &((struct tc_mirred *)
3505                                         mnl_attr_get_payload
3506                                         (mnl_nlmsg_get_payload_tail
3507                                                 (nlh)))->ifindex;
3508                         }
3509                         mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3510                                      sizeof(struct tc_mirred),
3511                                      &(struct tc_mirred){
3512                                         .action = TC_ACT_STOLEN,
3513                                         .eaction = TCA_EGRESS_REDIR,
3514                                         .ifindex = ptoi[i].ifindex,
3515                                      });
3516                         mnl_attr_nest_end(nlh, na_act);
3517                         mnl_attr_nest_end(nlh, na_act_index);
3518                         break;
3519                 case RTE_FLOW_ACTION_TYPE_JUMP:
3520                         conf.jump = actions->conf;
3521                         na_act_index =
3522                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3523                         assert(na_act_index);
3524                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3525                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3526                         assert(na_act);
3527                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3528                                      sizeof(struct tc_gact),
3529                                      &(struct tc_gact){
3530                                         .action = TC_ACT_GOTO_CHAIN |
3531                                                   conf.jump->group,
3532                                      });
3533                         mnl_attr_nest_end(nlh, na_act);
3534                         mnl_attr_nest_end(nlh, na_act_index);
3535                         break;
3536                 case RTE_FLOW_ACTION_TYPE_DROP:
3537                         na_act_index =
3538                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3539                         assert(na_act_index);
3540                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3541                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3542                         assert(na_act);
3543                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3544                                      sizeof(struct tc_gact),
3545                                      &(struct tc_gact){
3546                                         .action = TC_ACT_SHOT,
3547                                      });
3548                         mnl_attr_nest_end(nlh, na_act);
3549                         mnl_attr_nest_end(nlh, na_act_index);
3550                         break;
3551                 case RTE_FLOW_ACTION_TYPE_COUNT:
3552                         /*
3553                          * Driver adds the count action implicitly for
3554                          * each rule it creates.
3555                          */
3556                         ret = flow_tcf_translate_action_count(dev,
3557                                                               dev_flow, error);
3558                         if (ret < 0)
3559                                 return ret;
3560                         break;
3561                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3562                         conf.of_push_vlan = NULL;
3563                         vlan_act = TCA_VLAN_ACT_POP;
3564                         goto action_of_vlan;
3565                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3566                         conf.of_push_vlan = actions->conf;
3567                         vlan_act = TCA_VLAN_ACT_PUSH;
3568                         goto action_of_vlan;
3569                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3570                         conf.of_set_vlan_vid = actions->conf;
3571                         if (na_vlan_id)
3572                                 goto override_na_vlan_id;
3573                         vlan_act = TCA_VLAN_ACT_MODIFY;
3574                         goto action_of_vlan;
3575                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3576                         conf.of_set_vlan_pcp = actions->conf;
3577                         if (na_vlan_priority)
3578                                 goto override_na_vlan_priority;
3579                         vlan_act = TCA_VLAN_ACT_MODIFY;
3580                         goto action_of_vlan;
3581 action_of_vlan:
3582                         na_act_index =
3583                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3584                         assert(na_act_index);
3585                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3586                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3587                         assert(na_act);
3588                         mnl_attr_put(nlh, TCA_VLAN_PARMS,
3589                                      sizeof(struct tc_vlan),
3590                                      &(struct tc_vlan){
3591                                         .action = TC_ACT_PIPE,
3592                                         .v_action = vlan_act,
3593                                      });
3594                         if (vlan_act == TCA_VLAN_ACT_POP) {
3595                                 mnl_attr_nest_end(nlh, na_act);
3596                                 mnl_attr_nest_end(nlh, na_act_index);
3597                                 break;
3598                         }
3599                         if (vlan_act == TCA_VLAN_ACT_PUSH)
3600                                 mnl_attr_put_u16(nlh,
3601                                                  TCA_VLAN_PUSH_VLAN_PROTOCOL,
3602                                                  conf.of_push_vlan->ethertype);
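                        /*
                         * Placeholder attributes are written with the
                         * TCA_VLAN_PAD type and retyped in place to
                         * TCA_VLAN_PUSH_VLAN_ID/PRIORITY when a
                         * SET_VLAN_VID/PCP action is translated (see the
                         * override_na_vlan_* labels below).
                         */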
3603                         na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3604                         mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3605                         na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3606                         mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3607                         mnl_attr_nest_end(nlh, na_act);
3608                         mnl_attr_nest_end(nlh, na_act_index);
3609                         if (actions->type ==
3610                             RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3611 override_na_vlan_id:
3612                                 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3613                                 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3614                                         rte_be_to_cpu_16
3615                                         (conf.of_set_vlan_vid->vlan_vid);
3616                         } else if (actions->type ==
3617                                    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3618 override_na_vlan_priority:
3619                                 na_vlan_priority->nla_type =
3620                                         TCA_VLAN_PUSH_VLAN_PRIORITY;
3621                                 *(uint8_t *)mnl_attr_get_payload
3622                                         (na_vlan_priority) =
3623                                         conf.of_set_vlan_pcp->vlan_pcp;
3624                         }
3625                         break;
3626                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3627                         assert(decap.vxlan);
3628                         assert(dev_flow->tcf.tunnel);
3629                         dev_flow->tcf.tunnel->ifindex_ptr =
3630                                 (unsigned int *)&tcm->tcm_ifindex;
3631                         na_act_index =
3632                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3633                         assert(na_act_index);
3634                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3635                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3636                         assert(na_act);
3637                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3638                                 sizeof(struct tc_tunnel_key),
3639                                 &(struct tc_tunnel_key){
3640                                         .action = TC_ACT_PIPE,
3641                                         .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3642                                         });
3643                         mnl_attr_nest_end(nlh, na_act);
3644                         mnl_attr_nest_end(nlh, na_act_index);
3645                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3646                         break;
3647                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3648                         assert(encap.vxlan);
3649                         flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3650                         na_act_index =
3651                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3652                         assert(na_act_index);
3653                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3654                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3655                         assert(na_act);
3656                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3657                                 sizeof(struct tc_tunnel_key),
3658                                 &(struct tc_tunnel_key){
3659                                         .action = TC_ACT_PIPE,
3660                                         .t_action = TCA_TUNNEL_KEY_ACT_SET,
3661                                         });
3662                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3663                                 mnl_attr_put_u16(nlh,
3664                                          TCA_TUNNEL_KEY_ENC_DST_PORT,
3665                                          encap.vxlan->udp.dst);
3666                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3667                                 mnl_attr_put_u32(nlh,
3668                                          TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3669                                          encap.vxlan->ipv4.src);
3670                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3671                                 mnl_attr_put_u32(nlh,
3672                                          TCA_TUNNEL_KEY_ENC_IPV4_DST,
3673                                          encap.vxlan->ipv4.dst);
3674                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3675                                 mnl_attr_put(nlh,
3676                                          TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3677                                          sizeof(encap.vxlan->ipv6.src),
3678                                          &encap.vxlan->ipv6.src);
3679                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3680                                 mnl_attr_put(nlh,
3681                                          TCA_TUNNEL_KEY_ENC_IPV6_DST,
3682                                          sizeof(encap.vxlan->ipv6.dst),
3683                                          &encap.vxlan->ipv6.dst);
3684                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3685                                 mnl_attr_put_u32(nlh,
3686                                          TCA_TUNNEL_KEY_ENC_KEY_ID,
3687                                          vxlan_vni_as_be32
3688                                                 (encap.vxlan->vxlan.vni));
3689                         mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3690                         mnl_attr_nest_end(nlh, na_act);
3691                         mnl_attr_nest_end(nlh, na_act_index);
3692                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3693                         break;
3694                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3695                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3696                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
3697                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
3698                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
3699                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
3700                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
3701                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
3702                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
3703                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
3704                         na_act_index =
3705                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3706                         flow_tcf_create_pedit_mnl_msg(nlh,
3707                                                       &actions, item_flags);
3708                         mnl_attr_nest_end(nlh, na_act_index);
3709                         break;
3710                 default:
3711                         return rte_flow_error_set(error, ENOTSUP,
3712                                                   RTE_FLOW_ERROR_TYPE_ACTION,
3713                                                   actions,
3714                                                   "action not supported");
3715                 }
3716         }
3717         assert(na_flower);
3718         assert(na_flower_act);
3719         mnl_attr_nest_end(nlh, na_flower_act);
3720         dev_flow->tcf.ptc_flags = mnl_attr_get_payload
3721                                         (mnl_nlmsg_get_payload_tail(nlh));
3722         mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
3723                                                 0 : TCA_CLS_FLAGS_SKIP_SW);
3724         mnl_attr_nest_end(nlh, na_flower);
3725         if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
3726                 dev_flow->tcf.tunnel->ifindex_org =
3727                         *dev_flow->tcf.tunnel->ifindex_ptr;
3728         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3729         return 0;
3730 }
3731
3732 /**
3733  * Send Netlink message with acknowledgment.
3734  *
3735  * @param tcf
3736  *   Flow context to use.
3737  * @param nlh
3738  *   Message to send. This function always raises the NLM_F_ACK flag before
3739  *   sending.
3740  * @param[in] cb
3741  *   Callback handler for received message.
3742  * @param[in] arg
3743  *   Context pointer for callback handler.
3744  *
3745  * @return
3746  *   0 on success, a negative errno value otherwise and rte_errno is set.
3747  */
3748 static int
3749 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
3750                 struct nlmsghdr *nlh,
3751                 mnl_cb_t cb, void *arg)
3752 {
3753         unsigned int portid = mnl_socket_get_portid(tcf->nl);
3754         uint32_t seq = tcf->seq++;
3755         int ret, err = 0;
3756
3757         assert(tcf->nl);
3758         assert(tcf->buf);
3759         if (!seq) {
3760                 /* seq 0 is reserved for kernel event-driven notifications. */
3761                 seq = tcf->seq++;
3762         }
3763         nlh->nlmsg_seq = seq;
3764         nlh->nlmsg_flags |= NLM_F_ACK;
3765         ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
3766         if (ret <= 0) {
3767                 /* Message send error occurred. */
3768                 rte_errno = errno;
3769                 return -rte_errno;
3770         }
3771         nlh = (struct nlmsghdr *)(tcf->buf);
3772         /*
3773          * The following loop postpones non-fatal errors until multipart
3774          * messages are complete.
3775          */
3776         while (true) {
3777                 ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
3778                 if (ret < 0) {
3779                         err = errno;
3780                         /*
3781                          * In case of overflow, keep receiving until the
3782                          * end of the multipart message. Part of the reply
3783                          * may be lost, so mark the error and return it.
3784                          */
3785                         if (err != ENOSPC ||
3786                             !(nlh->nlmsg_flags & NLM_F_MULTI) ||
3787                             nlh->nlmsg_type == NLMSG_DONE)
3788                                 break;
3789                 } else {
3790                         ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
3791                         if (!ret) {
3792                                 /*
3793                                  * libmnl returns 0 if DONE or
3794                                  * success ACK message found.
3795                                  */
3796                                 break;
3797                         }
3798                         if (ret < 0) {
3799                                 /*
3800                                  * ACK message with error found
3801                                  * or some error occurred.
3802                                  */
3803                                 err = errno;
3804                                 break;
3805                         }
3806                         /* We should continue receiving. */
3807                 }
3808         }
3809         if (!err)
3810                 return 0;
3811         rte_errno = err;
3812         return -err;
3813 }
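
/*
 * A minimal usage sketch (illustrative only): build a request in the
 * shared tcf->buf and wait for the kernel acknowledgment. The callback
 * and its context are optional and are only needed for dump requests:
 *
 *   struct nlmsghdr *nlh = mnl_nlmsg_put_header(tcf->buf);
 *
 *   nlh->nlmsg_type = RTM_GETLINK;
 *   nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
 *   if (flow_tcf_nl_ack(tcf, nlh, collect_cb, &ctx))
 *           DRV_LOG(WARNING, "netlink: request error %d", rte_errno);
 *
 * Here collect_cb/ctx stand for any mnl_cb_t handler and its argument,
 * e.g. the flow_tcf_collect_*_cb() routines defined below.
 */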
3814
3815 #define MNL_BUF_EXTRA_SPACE 16
3816 #define MNL_REQUEST_SIZE_MIN 256
3817 #define MNL_REQUEST_SIZE_MAX 2048
3818 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
3819                                  MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
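
/*
 * For example, with the common 4096-byte page size MNL_REQUEST_SIZE
 * evaluates to RTE_MIN(RTE_MAX(4096, 256), 2048) = 2048, i.e. the
 * command buffers are clamped to MNL_REQUEST_SIZE_MAX.
 */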
3820
3821 /* Data structures used by flow_tcf_xxx_cb() routines. */
3822 struct tcf_nlcb_buf {
3823         LIST_ENTRY(tcf_nlcb_buf) next;
3824         uint32_t size;
3825         alignas(struct nlmsghdr)
3826         uint8_t msg[]; /**< Netlink message data. */
3827 };
3828
3829 struct tcf_nlcb_context {
3830         unsigned int ifindex; /**< Base interface index. */
3831         uint32_t bufsize;
3832         LIST_HEAD(, tcf_nlcb_buf) nlbuf;
3833 };
3834
3835 /**
3836  * Allocate space for a netlink command in the buffer list.
3837  *
3838  * @param[in, out] ctx
3839  *   Pointer to callback context with command buffers list.
3840  * @param[in] size
3841  *   Required size of data buffer to be allocated.
3842  *
3843  * @return
3844  *   Pointer to allocated memory, aligned as message header.
3845  *   NULL if some error occurred.
3846  */
3847 static struct nlmsghdr *
3848 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
3849 {
3850         struct tcf_nlcb_buf *buf;
3851         struct nlmsghdr *nlh;
3852
3853         size = NLMSG_ALIGN(size);
3854         buf = LIST_FIRST(&ctx->nlbuf);
3855         if (buf && (buf->size + size) <= ctx->bufsize) {
3856                 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
3857                 buf->size += size;
3858                 return nlh;
3859         }
3860         if (size > ctx->bufsize) {
3861                 DRV_LOG(WARNING, "netlink: too long command buffer requested");
3862                 return NULL;
3863         }
3864         buf = rte_malloc(__func__,
3865                         ctx->bufsize + sizeof(struct tcf_nlcb_buf),
3866                         alignof(struct tcf_nlcb_buf));
3867         if (!buf) {
3868                 DRV_LOG(WARNING, "netlink: no memory for command buffer");
3869                 return NULL;
3870         }
3871         LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
3872         buf->size = size;
3873         nlh = (struct nlmsghdr *)&buf->msg[0];
3874         return nlh;
3875 }
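
/*
 * A minimal usage sketch (assuming an initialized tcf_nlcb_context):
 *
 *   struct nlmsghdr *cmd = flow_tcf_alloc_nlcmd(&ctx, size);
 *
 *   if (!cmd)
 *           return -ENOMEM;
 *   cmd = mnl_nlmsg_put_header(cmd);
 *   cmd->nlmsg_type = RTM_DELLINK;
 *   ... fill in the rest of the command ...
 *
 * The commands accumulated this way are sent later in one pass by
 * flow_tcf_send_nlcmd().
 */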
3876
3877 /**
3878  * Send the buffers with prepared netlink commands. Scans the list and
3879  * sends all found buffers. Buffers are sent and freed regardless of
3880  * errors in order to prevent memory leakage.
3881  *
3882  * @param[in] tcf
3883  *   Context object initialized by mlx5_flow_tcf_context_create().
3884  * @param[in, out] ctx
3885  *   Pointer to callback context with command buffers list.
3886  *
3887  * @return
3888  *   Zero value on success, negative errno value otherwise
3889  *   and rte_errno is set.
3890  */
3891 static int
3892 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
3893                     struct tcf_nlcb_context *ctx)
3894 {
3895         struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
3896         int ret = 0;
3897
3898         while (bc) {
3899                 struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
3900                 struct nlmsghdr *nlh;
3901                 uint32_t msg = 0;
3902                 int rc;
3903
3904                 while (msg < bc->size) {
3905                         /*
3906                          * Send Netlink commands from the buffer one by one.
3907                          * If multiple rule deletion commands were sent in a
3908                          * single Netlink message and an error occurred, it
3909                          * could cause multiple ACK error messages and break
3910                          * the sequence numbers of the Netlink communication,
3911                          * because only one ACK reply is expected.
3912                          */
3913                         assert((bc->size - msg) >= sizeof(struct nlmsghdr));
3914                         nlh = (struct nlmsghdr *)&bc->msg[msg];
3915                         assert((bc->size - msg) >= nlh->nlmsg_len);
3916                         msg += nlh->nlmsg_len;
3917                         rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
3918                         if (rc) {
3919                                 DRV_LOG(WARNING,
3920                                         "netlink: cleanup error %d", rc);
3921                                 if (!ret)
3922                                         ret = rc;
3923                         }
3924                 }
3925                 rte_free(bc);
3926                 bc = bn;
3927         }
3928         LIST_INIT(&ctx->nlbuf);
3929         return ret;
3930 }
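
/*
 * The collect/send routines below follow a common pattern (a sketch,
 * with error handling elided):
 *
 *   struct tcf_nlcb_context ctx = {
 *           .ifindex = ifindex,
 *           .bufsize = MNL_REQUEST_SIZE,
 *           .nlbuf = LIST_HEAD_INITIALIZER(),
 *   };
 *
 * A RTM_GET* dump request is issued via flow_tcf_nl_ack() with a
 * flow_tcf_collect_*_cb() callback that stores matching RTM_DEL*
 * commands through flow_tcf_alloc_nlcmd(), and the accumulated
 * commands are then flushed with flow_tcf_send_nlcmd(tcf, &ctx).
 */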
3931
3932 /**
3933  * Collect local IP address rules with scope link attribute on specified
3934  * network device. This is a callback routine called by libmnl
3935  * mnl_cb_run() in a loop for every message in a received packet.
3936  *
3937  * @param[in] nlh
3938  *   Pointer to reply header.
3939  * @param[in, out] arg
3940  *   Opaque data pointer for this callback.
3941  *
3942  * @return
3943  *   A positive, nonzero value on success, negative errno value otherwise
3944  *   and rte_errno is set.
3945  */
3946 static int
3947 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
3948 {
3949         struct tcf_nlcb_context *ctx = arg;
3950         struct nlmsghdr *cmd;
3951         struct ifaddrmsg *ifa;
3952         struct nlattr *na;
3953         struct nlattr *na_local = NULL;
3954         struct nlattr *na_peer = NULL;
3955         unsigned char family;
3956         uint32_t size;
3957
3958         if (nlh->nlmsg_type != RTM_NEWADDR) {
3959                 rte_errno = EINVAL;
3960                 return -rte_errno;
3961         }
3962         ifa = mnl_nlmsg_get_payload(nlh);
3963         family = ifa->ifa_family;
3964         if (ifa->ifa_index != ctx->ifindex ||
3965             ifa->ifa_scope != RT_SCOPE_LINK ||
3966             !(ifa->ifa_flags & IFA_F_PERMANENT) ||
3967             (family != AF_INET && family != AF_INET6))
3968                 return 1;
3969         mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
3970                 switch (mnl_attr_get_type(na)) {
3971                 case IFA_LOCAL:
3972                         na_local = na;
3973                         break;
3974                 case IFA_ADDRESS:
3975                         na_peer = na;
3976                         break;
3977                 }
3978                 if (na_local && na_peer)
3979                         break;
3980         }
3981         if (!na_local || !na_peer)
3982                 return 1;
3983         /* Local rule found with scope link, permanent and assigned peer. */
3984         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
3985                MNL_ALIGN(sizeof(struct ifaddrmsg)) +
3986                (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
3987                                    : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
3988         cmd = flow_tcf_alloc_nlcmd(ctx, size);
3989         if (!cmd) {
3990                 rte_errno = ENOMEM;
3991                 return -rte_errno;
3992         }
3993         cmd = mnl_nlmsg_put_header(cmd);
3994         cmd->nlmsg_type = RTM_DELADDR;
3995         cmd->nlmsg_flags = NLM_F_REQUEST;
3996         ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
3997         ifa->ifa_flags = IFA_F_PERMANENT;
3998         ifa->ifa_scope = RT_SCOPE_LINK;
3999         ifa->ifa_index = ctx->ifindex;
4000         if (family == AF_INET) {
4001                 ifa->ifa_family = AF_INET;
4002                 ifa->ifa_prefixlen = 32;
4003                 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
4004                 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
4005         } else {
4006                 ifa->ifa_family = AF_INET6;
4007                 ifa->ifa_prefixlen = 128;
4008                 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4009                         mnl_attr_get_payload(na_local));
4010                 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4011                         mnl_attr_get_payload(na_peer));
4012         }
4013         assert(size == cmd->nlmsg_len);
4014         return 1;
4015 }
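
/*
 * The stored RTM_DELADDR command is roughly equivalent to:
 *   ip addr del <local>/32 peer <peer> dev <ifindex>    (IPv4)
 *   ip addr del <local>/128 peer <peer> dev <ifindex>   (IPv6)
 */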
4016
4017 /**
4018  * Cleanup the local IP addresses on outer interface.
4019  *
4020  * @param[in] tcf
4021  *   Context object initialized by mlx5_flow_tcf_context_create().
4022  * @param[in] ifindex
4023  *   Network interface index to perform cleanup.
4024  */
4025 static void
4026 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4027                             unsigned int ifindex)
4028 {
4029         struct nlmsghdr *nlh;
4030         struct ifaddrmsg *ifa;
4031         struct tcf_nlcb_context ctx = {
4032                 .ifindex = ifindex,
4033                 .bufsize = MNL_REQUEST_SIZE,
4034                 .nlbuf = LIST_HEAD_INITIALIZER(),
4035         };
4036         int ret;
4037
4038         assert(ifindex);
4039         /*
4040          * Seek and destroy leftovers of local IP addresses with
4041          * the matching "scope link" property.
4042          */
4043         nlh = mnl_nlmsg_put_header(tcf->buf);
4044         nlh->nlmsg_type = RTM_GETADDR;
4045         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4046         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4047         ifa->ifa_family = AF_UNSPEC;
4048         ifa->ifa_index = ifindex;
4049         ifa->ifa_scope = RT_SCOPE_LINK;
4050         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
4051         if (ret)
4052                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4053         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4054         if (ret)
4055                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4056 }
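
/*
 * Roughly equivalent to flushing the matching addresses by hand:
 *   ip addr flush dev <ifindex> scope link permanent
 * except that only the addresses with an assigned peer are removed.
 */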
4057
4058 /**
4059  * Collect permanent neigh rules on specified network device.
4060  * This is a callback routine called by libmnl mnl_cb_run() in a loop
4061  * for every message in a received packet.
4062  *
4063  * @param[in] nlh
4064  *   Pointer to reply header.
4065  * @param[in, out] arg
4066  *   Opaque data pointer for this callback.
4067  *
4068  * @return
4069  *   A positive, nonzero value on success, negative errno value otherwise
4070  *   and rte_errno is set.
4071  */
4072 static int
4073 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4074 {
4075         struct tcf_nlcb_context *ctx = arg;
4076         struct nlmsghdr *cmd;
4077         struct ndmsg *ndm;
4078         struct nlattr *na;
4079         struct nlattr *na_ip = NULL;
4080         struct nlattr *na_mac = NULL;
4081         unsigned char family;
4082         uint32_t size;
4083
4084         if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4085                 rte_errno = EINVAL;
4086                 return -rte_errno;
4087         }
4088         ndm = mnl_nlmsg_get_payload(nlh);
4089         family = ndm->ndm_family;
4090         if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4091            !(ndm->ndm_state & NUD_PERMANENT) ||
4092            (family != AF_INET && family != AF_INET6))
4093                 return 1;
4094         mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4095                 switch (mnl_attr_get_type(na)) {
4096                 case NDA_DST:
4097                         na_ip = na;
4098                         break;
4099                 case NDA_LLADDR:
4100                         na_mac = na;
4101                         break;
4102                 }
4103                 if (na_mac && na_ip)
4104                         break;
4105         }
4106         if (!na_mac || !na_ip)
4107                 return 1;
4108         /* Neigh rule with permanent attribute found. */
4109         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4110                MNL_ALIGN(sizeof(struct ndmsg)) +
4111                SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4112                (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4113                                    : SZ_NLATTR_TYPE_OF(uint32_t));
4114         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4115         if (!cmd) {
4116                 rte_errno = ENOMEM;
4117                 return -rte_errno;
4118         }
4119         cmd = mnl_nlmsg_put_header(cmd);
4120         cmd->nlmsg_type = RTM_DELNEIGH;
4121         cmd->nlmsg_flags = NLM_F_REQUEST;
4122         ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4123         ndm->ndm_ifindex = ctx->ifindex;
4124         ndm->ndm_state = NUD_PERMANENT;
4125         ndm->ndm_flags = 0;
4126         ndm->ndm_type = 0;
4127         if (family == AF_INET) {
4128                 ndm->ndm_family = AF_INET;
4129                 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4130         } else {
4131                 ndm->ndm_family = AF_INET6;
4132                 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4133                              mnl_attr_get_payload(na_ip));
4134         }
4135         mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4136                      mnl_attr_get_payload(na_mac));
4137         assert(size == cmd->nlmsg_len);
4138         return 1;
4139 }
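
/*
 * The stored RTM_DELNEIGH command is roughly equivalent to:
 *   ip neigh del <dst_ip> lladdr <mac> dev <ifindex>
 */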
4140
4141 /**
4142  * Cleanup the neigh rules on outer interface.
4143  *
4144  * @param[in] tcf
4145  *   Context object initialized by mlx5_flow_tcf_context_create().
4146  * @param[in] ifindex
4147  *   Network interface index to perform cleanup.
4148  */
4149 static void
4150 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4151                             unsigned int ifindex)
4152 {
4153         struct nlmsghdr *nlh;
4154         struct ndmsg *ndm;
4155         struct tcf_nlcb_context ctx = {
4156                 .ifindex = ifindex,
4157                 .bufsize = MNL_REQUEST_SIZE,
4158                 .nlbuf = LIST_HEAD_INITIALIZER(),
4159         };
4160         int ret;
4161
4162         assert(ifindex);
4163         /* Seek and destroy leftovers of neigh rules. */
4164         nlh = mnl_nlmsg_put_header(tcf->buf);
4165         nlh->nlmsg_type = RTM_GETNEIGH;
4166         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4167         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4168         ndm->ndm_family = AF_UNSPEC;
4169         ndm->ndm_ifindex = ifindex;
4170         ndm->ndm_state = NUD_PERMANENT;
4171         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
4172         if (ret)
4173                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4174         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4175         if (ret)
4176                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4177 }
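
/*
 * Roughly equivalent to:
 *   ip neigh flush dev <ifindex> nud permanent
 */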
4178
4179 /**
4180  * Collect indices of VXLAN encap/decap interfaces associated with device.
4181  * This is a callback routine called by libmnl mnl_cb_run() in a loop
4182  * for every message in a received packet.
4183  *
4184  * @param[in] nlh
4185  *   Pointer to reply header.
4186  * @param[in, out] arg
4187  *   Opaque data pointer for this callback.
4188  *
4189  * @return
4190  *   A positive, nonzero value on success, negative errno value otherwise
4191  *   and rte_errno is set.
4192  */
4193 static int
4194 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4195 {
4196         struct tcf_nlcb_context *ctx = arg;
4197         struct nlmsghdr *cmd;
4198         struct ifinfomsg *ifm;
4199         struct nlattr *na;
4200         struct nlattr *na_info = NULL;
4201         struct nlattr *na_vxlan = NULL;
4202         bool found = false;
4203         unsigned int vxindex;
4204         uint32_t size;
4205
4206         if (nlh->nlmsg_type != RTM_NEWLINK) {
4207                 rte_errno = EINVAL;
4208                 return -rte_errno;
4209         }
4210         ifm = mnl_nlmsg_get_payload(nlh);
4211         if (!ifm->ifi_index) {
4212                 rte_errno = EINVAL;
4213                 return -rte_errno;
4214         }
4215         mnl_attr_for_each(na, nlh, sizeof(*ifm))
4216                 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
4217                         na_info = na;
4218                         break;
4219                 }
4220         if (!na_info)
4221                 return 1;
4222         mnl_attr_for_each_nested(na, na_info) {
4223                 switch (mnl_attr_get_type(na)) {
4224                 case IFLA_INFO_KIND:
4225                         if (!strncmp("vxlan", mnl_attr_get_str(na),
4226                                      mnl_attr_get_len(na)))
4227                                 found = true;
4228                         break;
4229                 case IFLA_INFO_DATA:
4230                         na_vxlan = na;
4231                         break;
4232                 }
4233                 if (found && na_vxlan)
4234                         break;
4235         }
4236         if (!found || !na_vxlan)
4237                 return 1;
4238         found = false;
4239         mnl_attr_for_each_nested(na, na_vxlan) {
4240                 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4241                     mnl_attr_get_u32(na) == ctx->ifindex) {
4242                         found = true;
4243                         break;
4244                 }
4245         }
4246         if (!found)
4247                 return 1;
4248         /* Attached VXLAN device found, store the command to delete. */
4249         vxindex = ifm->ifi_index;
4250         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4251                MNL_ALIGN(sizeof(struct ifinfomsg));
4252         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4253         if (!cmd) {
4254                 rte_errno = ENOMEM;
4255                 return -rte_errno;
4256         }
4257         cmd = mnl_nlmsg_put_header(cmd);
4258         cmd->nlmsg_type = RTM_DELLINK;
4259         cmd->nlmsg_flags = NLM_F_REQUEST;
4260         ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4261         ifm->ifi_family = AF_UNSPEC;
4262         ifm->ifi_index = vxindex;
4263         assert(size == cmd->nlmsg_len);
4264         return 1;
4265 }
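
/*
 * The stored RTM_DELLINK command is roughly equivalent to:
 *   ip link del <vxlan_device>
 * for a VXLAN device whose "link" attribute points to ctx->ifindex.
 */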
4266
4267 /**
4268  * Cleanup the outer interface. Removes all found VXLAN devices
4269  * attached to the specified index, flushes the neigh and local IP
4270  * databases.
4271  *
4272  * @param[in] tcf
4273  *   Context object initialized by mlx5_flow_tcf_context_create().
4274  * @param[in] ifindex
4275  *   Network interface index to perform cleanup.
4276  */
4277 static void
4278 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4279                             unsigned int ifindex)
4280 {
4281         struct nlmsghdr *nlh;
4282         struct ifinfomsg *ifm;
4283         struct tcf_nlcb_context ctx = {
4284                 .ifindex = ifindex,
4285                 .bufsize = MNL_REQUEST_SIZE,
4286                 .nlbuf = LIST_HEAD_INITIALIZER(),
4287         };
4288         int ret;
4289
4290         assert(ifindex);
4291         /*
4292          * Seek and destroy leftover VXLAN encap/decap interfaces with
4293          * matching properties.
4294          */
4295         nlh = mnl_nlmsg_put_header(tcf->buf);
4296         nlh->nlmsg_type = RTM_GETLINK;
4297         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4298         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4299         ifm->ifi_family = AF_UNSPEC;
4300         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
4301         if (ret)
4302                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4303         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4304         if (ret)
4305                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4306 }
4307
4308 /**
4309  * Emit Netlink message to add/remove local address to the outer device.
4310  * The address being added is visible within the link only (scope link).
4311  *
4312  * Note that an implicit route is maintained by the kernel due to the
4313  * presence of a peer address (IFA_ADDRESS).
4314  *
4315  * These rules are used for encapsulation only and allow assigning
4316  * the outer tunnel source IP address.
4317  *
4318  * @param[in] tcf
4319  *   Libmnl socket context object.
4320  * @param[in] encap
4321  *   Encapsulation properties (source address and its peer).
4322  * @param[in] ifindex
4323  *   Network interface to apply rule.
4324  * @param[in] enable
4325  *   Toggle between add and remove.
4326  * @param[out] error
4327  *   Perform verbose error reporting if not NULL.
4328  *
4329  * @return
4330  *   0 on success, a negative errno value otherwise and rte_errno is set.
4331  */
4332 static int
4333 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4334                     const struct flow_tcf_vxlan_encap *encap,
4335                     unsigned int ifindex,
4336                     bool enable,
4337                     struct rte_flow_error *error)
4338 {
4339         struct nlmsghdr *nlh;
4340         struct ifaddrmsg *ifa;
4341         alignas(struct nlmsghdr)
4342         uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4343
4344         nlh = mnl_nlmsg_put_header(buf);
4345         nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4346         nlh->nlmsg_flags =
4347                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4348         nlh->nlmsg_seq = 0;
4349         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4350         ifa->ifa_flags = IFA_F_PERMANENT;
4351         ifa->ifa_scope = RT_SCOPE_LINK;
4352         ifa->ifa_index = ifindex;
4353         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4354                 ifa->ifa_family = AF_INET;
4355                 ifa->ifa_prefixlen = 32;
4356                 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4357                 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4358                         mnl_attr_put_u32(nlh, IFA_ADDRESS,
4359                                               encap->ipv4.dst);
4360         } else {
4361                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4362                 ifa->ifa_family = AF_INET6;
4363                 ifa->ifa_prefixlen = 128;
4364                 mnl_attr_put(nlh, IFA_LOCAL,
4365                                   sizeof(encap->ipv6.src),
4366                                   &encap->ipv6.src);
4367                 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4368                         mnl_attr_put(nlh, IFA_ADDRESS,
4369                                           sizeof(encap->ipv6.dst),
4370                                           &encap->ipv6.dst);
4371         }
4372         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4373                 return 0;
4374         return rte_flow_error_set(error, rte_errno,
4375                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4376                                   "netlink: cannot complete IFA request"
4377                                   " (ip addr add)");
4378 }
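
/*
 * Equivalent iproute2 commands (see also flow_tcf_encap_local()):
 *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifindex>
 *   ip addr del <src_ip> peer <dst_ip> scope link dev <ifindex>
 */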
4379
4380 /**
4381  * Emit Netlink message to add/remove neighbor.
4382  *
4383  * @param[in] tcf
4384  *   Libmnl socket context object.
4385  * @param[in] encap
4386  *   Encapsulation properties (destination address).
4387  * @param[in] ifindex
4388  *   Network interface.
4389  * @param[in] enable
4390  *   Toggle between add and remove.
4391  * @param[out] error
4392  *   Perform verbose error reporting if not NULL.
4393  *
4394  * @return
4395  *   0 on success, a negative errno value otherwise and rte_errno is set.
4396  */
4397 static int
4398 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4399                      const struct flow_tcf_vxlan_encap *encap,
4400                      unsigned int ifindex,
4401                      bool enable,
4402                      struct rte_flow_error *error)
4403 {
4404         struct nlmsghdr *nlh;
4405         struct ndmsg *ndm;
4406         alignas(struct nlmsghdr)
4407         uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4408
4409         nlh = mnl_nlmsg_put_header(buf);
4410         nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4411         nlh->nlmsg_flags =
4412                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4413         nlh->nlmsg_seq = 0;
4414         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4415         ndm->ndm_ifindex = ifindex;
4416         ndm->ndm_state = NUD_PERMANENT;
4417         ndm->ndm_flags = 0;
4418         ndm->ndm_type = 0;
4419         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4420                 ndm->ndm_family = AF_INET;
4421                 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4422         } else {
4423                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4424                 ndm->ndm_family = AF_INET6;
4425                 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
4426                                                  &encap->ipv6.dst);
4427         }
4428         if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4429                 DRV_LOG(WARNING,
4430                         "outer ethernet source address cannot be "
4431                         "forced for VXLAN encapsulation");
4432         if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4433                 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
4434                                                     &encap->eth.dst);
4435         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4436                 return 0;
4437         return rte_flow_error_set(error, rte_errno,
4438                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4439                                   "netlink: cannot complete ND request"
4440                                   " (ip neigh)");
4441 }
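
/*
 * Equivalent iproute2 commands (see also flow_tcf_encap_neigh()):
 *   ip neigh add dev <ifindex> lladdr <dst_mac> to <dst_ip> nud permanent
 *   ip neigh del dev <ifindex> to <dst_ip>
 */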
4442
4443 /**
4444  * Manage the local IP addresses and their peer IP addresses on the
4445  * outer interface for encapsulation purposes. The kernel searches for
4446  * the appropriate device for tunnel egress traffic using the outer
4447  * source IP; this IP should be assigned to the outer network device,
4448  * otherwise the kernel rejects the rule.
4449  *
4450  * Adds or removes the addresses using the Netlink command like this:
4451  *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4452  *
4453  * The addresses are local to the netdev ("scope link"), which reduces
4454  * the risk of conflicts. Note that an implicit route is maintained by
4455  * the kernel due to the presence of a peer address (IFA_ADDRESS).
4456  *
4457  * @param[in] tcf
4458  *   Libmnl socket context object.
4459  * @param[in] vtep
4460  *   VTEP object, contains rule database and ifouter index.
4461  * @param[in] dev_flow
4462  *   Flow object, contains the tunnel parameters (for encap only).
4463  * @param[in] enable
4464  *   Toggle between add and remove.
4465  * @param[out] error
4466  *   Perform verbose error reporting if not NULL.
4467  *
4468  * @return
4469  *   0 on success, a negative errno value otherwise and rte_errno is set.
4470  */
4471 static int
4472 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4473                      struct tcf_vtep *vtep,
4474                      struct mlx5_flow *dev_flow,
4475                      bool enable,
4476                      struct rte_flow_error *error)
4477 {
4478         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4479         struct tcf_local_rule *rule;
4480         bool found = false;
4481         int ret;
4482
4483         assert(encap);
4484         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4485         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4486                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4487                 LIST_FOREACH(rule, &vtep->local, next) {
4488                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4489                             encap->ipv4.src == rule->ipv4.src &&
4490                             encap->ipv4.dst == rule->ipv4.dst) {
4491                                 found = true;
4492                                 break;
4493                         }
4494                 }
4495         } else {
4496                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4497                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4498                 LIST_FOREACH(rule, &vtep->local, next) {
4499                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4500                             !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4501                                             sizeof(encap->ipv6.src)) &&
4502                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4503                                             sizeof(encap->ipv6.dst))) {
4504                                 found = true;
4505                                 break;
4506                         }
4507                 }
4508         }
4509         if (found) {
4510                 if (enable) {
4511                         rule->refcnt++;
4512                         return 0;
4513                 }
4514                 if (!rule->refcnt || !--rule->refcnt) {
4515                         LIST_REMOVE(rule, next);
4516                         return flow_tcf_rule_local(tcf, encap,
4517                                         vtep->ifouter, false, error);
4518                 }
4519                 return 0;
4520         }
4521         if (!enable) {
4522                 DRV_LOG(WARNING, "disabling non-existent local rule");
4523                 rte_flow_error_set(error, ENOENT,
4524                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4525                                    "disabling non-existent local rule");
4526                 return -ENOENT;
4527         }
4528         rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4529                                 alignof(struct tcf_local_rule));
4530         if (!rule) {
4531                 rte_flow_error_set(error, ENOMEM,
4532                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4533                                    "unable to allocate memory for local rule");
4534                 return -rte_errno;
4535         }
4536         *rule = (struct tcf_local_rule){.refcnt = 0,
4537                                         .mask = 0,
4538                                         };
4539         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4540                 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4541                            | FLOW_TCF_ENCAP_IPV4_DST;
4542                 rule->ipv4.src = encap->ipv4.src;
4543                 rule->ipv4.dst = encap->ipv4.dst;
4544         } else {
4545                 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4546                            | FLOW_TCF_ENCAP_IPV6_DST;
4547                 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4548                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4549         }
4550         ret = flow_tcf_rule_local(tcf, encap, vtep->ifouter, true, error);
4551         if (ret) {
4552                 rte_free(rule);
4553                 return ret;
4554         }
4555         rule->refcnt++;
4556         LIST_INSERT_HEAD(&vtep->local, rule, next);
4557         return 0;
4558 }
4559
4560 /**
4561  * Manage the destination MAC/IP address neigh database; the kernel uses
4562  * it to determine the destination MAC address within the encapsulation
4563  * header. Adds or removes the entries using a Netlink command like this:
4564  *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4565  *
4566  * @param[in] tcf
4567  *   Libmnl socket context object.
4568  * @param[in] vtep
4569  *   VTEP object, contains rule database and ifouter index.
4570  * @param[in] dev_flow
4571  *   Flow object, contains the tunnel parameters (for encap only).
4572  * @param[in] enable
4573  *   Toggle between add and remove.
4574  * @param[out] error
4575  *   Perform verbose error reporting if not NULL.
4576  *
4577  * @return
4578  *   0 on success, a negative errno value otherwise and rte_errno is set.
4579  */
4580 static int
4581 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4582                      struct tcf_vtep *vtep,
4583                      struct mlx5_flow *dev_flow,
4584                      bool enable,
4585                      struct rte_flow_error *error)
4586 {
4587         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4588         struct tcf_neigh_rule *rule;
4589         bool found = false;
4590         int ret;
4591
4592         assert(encap);
4593         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4594         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4595                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4596                 LIST_FOREACH(rule, &vtep->neigh, next) {
4597                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4598                             encap->ipv4.dst == rule->ipv4.dst) {
4599                                 found = true;
4600                                 break;
4601                         }
4602                 }
4603         } else {
4604                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4605                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4606                 LIST_FOREACH(rule, &vtep->neigh, next) {
4607                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4608                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4609                                                 sizeof(encap->ipv6.dst))) {
4610                                 found = true;
4611                                 break;
4612                         }
4613                 }
4614         }
4615         if (found) {
4616                 if (memcmp(&encap->eth.dst, &rule->eth,
4617                            sizeof(encap->eth.dst))) {
4618                         DRV_LOG(WARNING, "Destination MAC differs"
4619                                          " in neigh rule");
4620                         rte_flow_error_set(error, EEXIST,
4621                                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4622                                            NULL, "Different MAC address"
4623                                            " neigh rule for the same"
4624                                            " destination IP");
4625                         return -EEXIST;
4626                 }
4627                 if (enable) {
4628                         rule->refcnt++;
4629                         return 0;
4630                 }
4631                 if (!rule->refcnt || !--rule->refcnt) {
4632                         LIST_REMOVE(rule, next);
4633                         return flow_tcf_rule_neigh(tcf, encap,
4634                                                    vtep->ifouter,
4635                                                    false, error);
4636                 }
4637                 return 0;
4638         }
4639         if (!enable) {
4640                 DRV_LOG(WARNING, "disabling non-existent neigh rule");
4641                 rte_flow_error_set(error, ENOENT,
4642                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4643                                    "disabling non-existent neigh rule");
4644                 return -ENOENT;
4645         }
4646         rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4647                                 alignof(struct tcf_neigh_rule));
4648         if (!rule) {
4649                 rte_flow_error_set(error, ENOMEM,
4650                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4651                                    "unable to allocate memory for neigh rule");
4652                 return -rte_errno;
4653         }
4654         *rule = (struct tcf_neigh_rule){.refcnt = 0,
4655                                         .mask = 0,
4656                                         };
4657         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4658                 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4659                 rule->ipv4.dst = encap->ipv4.dst;
4660         } else {
4661                 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4662                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4663         }
4664         memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4665         ret = flow_tcf_rule_neigh(tcf, encap, vtep->ifouter, true, error);
4666         if (ret) {
4667                 rte_free(rule);
4668                 return ret;
4669         }
4670         rule->refcnt++;
4671         LIST_INSERT_HEAD(&vtep->neigh, rule, next);
4672         return 0;
4673 }
4674
4675 /* VTEP device list is shared between PMD port instances. */
4676 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
4677 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4678
4679 /**
4680  * Deletes VTEP network device.
4681  *
4682  * @param[in] tcf
4683  *   Context object initialized by mlx5_flow_tcf_context_create().
4684  * @param[in] vtep
4685  *   Object representing the network device to delete. Memory
4686  *   allocated for this object is freed by the routine.
4687  */
4688 static void
4689 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
4690                      struct tcf_vtep *vtep)
4691 {
4692         struct nlmsghdr *nlh;
4693         struct ifinfomsg *ifm;
4694         alignas(struct nlmsghdr)
4695         uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
4696                     MNL_BUF_EXTRA_SPACE];
4697         int ret;
4698
4699         assert(!vtep->refcnt);
4700         /* Delete only the ifaces we actually created. */
4701         if (vtep->created && vtep->ifindex) {
4702                 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
4703                 nlh = mnl_nlmsg_put_header(buf);
4704                 nlh->nlmsg_type = RTM_DELLINK;
4705                 nlh->nlmsg_flags = NLM_F_REQUEST;
4706                 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4707                 ifm->ifi_family = AF_UNSPEC;
4708                 ifm->ifi_index = vtep->ifindex;
4709                 assert(sizeof(buf) >= nlh->nlmsg_len);
4710                 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4711                 if (ret)
4712                         DRV_LOG(WARNING, "netlink: error deleting vxlan"
4713                                          " encap/decap ifindex %u",
4714                                          ifm->ifi_index);
4715         }
4716         rte_free(vtep);
4717 }
4718
4719 /**
4720  * Creates VTEP network device.
4721  *
4722  * @param[in] tcf
4723  *   Context object initialized by mlx5_flow_tcf_context_create().
4724  * @param[in] ifouter
4725  *   Outer interface to attach the newly created VXLAN device to.
4726  *   If zero, the VXLAN device will not be attached to any device.
4727  *   These VTEPs are used for decapsulation and can be precreated
4728  *   and shared between processes.
4729  * @param[in] port
4730  *   UDP port of created VTEP device.
4731  * @param[out] error
4732  *   Perform verbose error reporting if not NULL.
4733  *
4734  * @return
4735  * Pointer to created device structure on success,
4736  * NULL otherwise and rte_errno is set.
4737  */
4738 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
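/*
 * The RTM_NEWLINK request built below roughly corresponds to:
 *   ip link add <name> type vxlan external dstport <port> dev <ifouter>
 * where "external" stands for IFLA_VXLAN_COLLECT_METADATA; learning is
 * disabled and zero UDP checksums are allowed on IPv6 receive
 * (illustrative mapping; the exact iproute2 keywords may vary).
 */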
4739 static struct tcf_vtep*
4740 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
4741                      unsigned int ifouter,
4742                      uint16_t port, struct rte_flow_error *error)
4743 {
4744         struct tcf_vtep *vtep;
4745         struct nlmsghdr *nlh;
4746         struct ifinfomsg *ifm;
4747         char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
4748         alignas(struct nlmsghdr)
4749         uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
4750                     SZ_NLATTR_DATA_OF(sizeof(name)) +
4751                     SZ_NLATTR_NEST * 2 +
4752                     SZ_NLATTR_STRZ_OF("vxlan") +
4753                     SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
4754                     SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
4755                     SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
4756                     MNL_BUF_EXTRA_SPACE];
4757         struct nlattr *na_info;
4758         struct nlattr *na_vxlan;
4759         rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
4760         int ret;
4761
4762         vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
4763         if (!vtep) {
4764                 rte_flow_error_set(error, ENOMEM,
4765                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4766                                    "unable to allocate memory for VTEP");
4767                 return NULL;
4768         }
4769         *vtep = (struct tcf_vtep){
4770                         .port = port,
4771                         .local = LIST_HEAD_INITIALIZER(),
4772                         .neigh = LIST_HEAD_INITIALIZER(),
4773         };
4774         memset(buf, 0, sizeof(buf));
4775         nlh = mnl_nlmsg_put_header(buf);
4776         nlh->nlmsg_type = RTM_NEWLINK;
4777         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
4778         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4779         ifm->ifi_family = AF_UNSPEC;
4780         ifm->ifi_type = 0;
4781         ifm->ifi_index = 0;
4782         ifm->ifi_flags = IFF_UP;
4783         ifm->ifi_change = 0xffffffff;
4784         snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
4785         mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
4786         na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
4787         assert(na_info);
4788         mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
4789         na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
4790         if (ifouter)
4791                 mnl_attr_put_u32(nlh, IFLA_VXLAN_LINK, ifouter);
4792         assert(na_vxlan);
4793         mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
4794         mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
4795         mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
4796         mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
4797         mnl_attr_nest_end(nlh, na_vxlan);
4798         mnl_attr_nest_end(nlh, na_info);
4799         assert(sizeof(buf) >= nlh->nlmsg_len);
4800         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4801         if (ret) {
4802                 DRV_LOG(WARNING,
4803                         "netlink: VTEP %s create failure (%d)",
4804                         name, rte_errno);
4805                 if (rte_errno != EEXIST || ifouter)
4806                         /*
4807                          * Some unhandled error occurred or device is
4808                          * for encapsulation and cannot be shared.
4809                          */
4810                         goto error;
4811         } else {
4812                 /*
4813                  * Mark the device as one we actually created.
4814                  * It must be explicitly deleted when it is
4815                  * no longer needed.
4816                  */
4817                 vtep->created = 1;
4818         }
4819         /* Try to get the ifindex of the created or pre-existing device. */
4820         ret = if_nametoindex(name);
4821         if (!ret) {
4822                 DRV_LOG(WARNING,
4823                         "VTEP %s failed to get index (%d)", name, errno);
4824                 rte_flow_error_set
4825                         (error, errno,
4826                          RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4827                          "netlink: failed to retrieve VTEP ifindex");
4828                 goto error;
4829         }
4830         vtep->ifindex = ret;
4831         vtep->ifouter = ifouter;
4832         memset(buf, 0, sizeof(buf));
4833         nlh = mnl_nlmsg_put_header(buf);
4834         nlh->nlmsg_type = RTM_NEWLINK;
4835         nlh->nlmsg_flags = NLM_F_REQUEST;
4836         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4837         ifm->ifi_family = AF_UNSPEC;
4838         ifm->ifi_type = 0;
4839         ifm->ifi_index = vtep->ifindex;
4840         ifm->ifi_flags = IFF_UP;
4841         ifm->ifi_change = IFF_UP;
4842         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4843         if (ret) {
4844                 rte_flow_error_set(error, rte_errno,
4845                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4846                                    "netlink: failed to set VTEP link up");
4847                 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
4848                         name, rte_errno);
4849                 goto clean;
4850         }
4851         ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
4852         if (ret) {
4853                 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
4854                 goto clean;
4855         }
4856         DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
4857         vtep->refcnt = 1;
4858         return vtep;
4859 clean:
4860         flow_tcf_vtep_delete(tcf, vtep);
4861         return NULL;
4862 error:
4863         rte_free(vtep);
4864         return NULL;
4865 }
4866 #else
4867 static struct tcf_vtep*
4868 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf __rte_unused,
4869                      unsigned int ifouter __rte_unused,
4870                      uint16_t port __rte_unused,
4871                      struct rte_flow_error *error)
4872 {
4873         rte_flow_error_set(error, ENOTSUP,
4874                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4875                            "netlink: failed to create VTEP, "
4876                            "vxlan metadata are not supported by kernel");
4877         return NULL;
4878 }
4879 #endif /* HAVE_IFLA_VXLAN_COLLECT_METADATA */
4880
4881 /**
4882  * Acquire target interface index for VXLAN tunneling decapsulation.
4883  * In order to share the UDP port with other interfaces, the VXLAN
4884  * device is created unattached to any interface (if created at all).
4885  *
4886  * @param[in] tcf
4887  *   Context object initialized by mlx5_flow_tcf_context_create().
4888  * @param[in] dev_flow
4889  *   Flow tcf object with tunnel structure pointer set.
4890  * @param[out] error
4891  *   Perform verbose error reporting if not NULL.
4892  * @return
4893  *   Interface descriptor pointer on success,
4894  *   NULL otherwise and rte_errno is set.
4895  */
4896 static struct tcf_vtep*
4897 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
4898                             struct mlx5_flow *dev_flow,
4899                             struct rte_flow_error *error)
4900 {
4901         struct tcf_vtep *vtep;
4902         uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
4903
4904         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
4905                 if (vtep->port == port)
4906                         break;
4907         }
4908         if (vtep && vtep->ifouter) {
4909                 rte_flow_error_set(error, EEXIST,
4910                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4911                                    "Failed to create decap VTEP with specified"
4912                                    " UDP port, attached device exists");
4913                 return NULL;
4914         }
4915         if (vtep) {
4916                 /* Device exists, just increment the reference counter. */
4917                 vtep->refcnt++;
4918                 assert(vtep->ifindex);
4919                 return vtep;
4920         }
4921         /* No decapsulation device exists, try to create the new one. */
4922         vtep = flow_tcf_vtep_create(tcf, 0, port, error);
4923         if (vtep)
4924                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
4925         return vtep;
4926 }
4927
4928 /**
4929  * Acquire target interface index for VXLAN tunneling encapsulation.
4930  *
4931  * @param[in] tcf
4932  *   Context object initialized by mlx5_flow_tcf_context_create().
4933  * @param[in] ifouter
4934  *   Network interface index to attach VXLAN encap device to.
4935  * @param[in] dev_flow
4936  *   Flow tcf object with tunnel structure pointer set.
4937  * @param[out] error
4938  *   Perform verbose error reporting if not NULL.
4939  * @return
4940  *   Interface descriptor pointer on success,
4941  *   NULL otherwise and rte_errno is set.
4942  */
4943 static struct tcf_vtep*
4944 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
4945                             unsigned int ifouter,
4946                             struct mlx5_flow *dev_flow __rte_unused,
4947                             struct rte_flow_error *error)
4948 {
4949         static uint16_t encap_port = MLX5_VXLAN_PORT_MIN - 1;
4950         struct tcf_vtep *vtep;
4951         int ret;
4952
4953         assert(ifouter);
4954         /* Check whether an attached encap VTEP already exists. */
4955         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
4956                 if (vtep->ifouter == ifouter)
4957                         break;
4958         }
4959         if (vtep) {
4960                 /* VTEP already exists, just increment the reference. */
4961                 vtep->refcnt++;
4962         } else {
4963                 uint16_t pcnt;
4964
4965                 /* Not found, create a new attached VTEP. */
4966                 flow_tcf_encap_iface_cleanup(tcf, ifouter);
4967                 flow_tcf_encap_local_cleanup(tcf, ifouter);
4968                 flow_tcf_encap_neigh_cleanup(tcf, ifouter);
4969                 for (pcnt = 0; pcnt <= (MLX5_VXLAN_PORT_MAX
4970                                      - MLX5_VXLAN_PORT_MIN); pcnt++) {
4971                         encap_port++;
4972                         /* Wrap around the UDP port index. */
4973                         if (encap_port < MLX5_VXLAN_PORT_MIN ||
4974                             encap_port > MLX5_VXLAN_PORT_MAX)
4975                                 encap_port = MLX5_VXLAN_PORT_MIN;
4976                         /* Check whether the UDP port is already in use. */
4977                         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
4978                                 if (vtep->port == encap_port)
4979                                         break;
4980                         }
4981                         if (vtep) {
4982                                 /* Port is in use, try the next one. */
4983                                 vtep = NULL;
4984                                 continue;
4985                         }
4986                         vtep = flow_tcf_vtep_create(tcf, ifouter,
4987                                                     encap_port, error);
4988                         if (vtep) {
4989                                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
4990                                 break;
4991                         }
4992                         if (rte_errno != EEXIST)
4993                                 break;
4994                 }
4995                 if (!vtep)
4996                         return NULL;
4997         }
4998         assert(vtep->ifouter == ifouter);
4999         assert(vtep->ifindex);
5000         /* Create local IP address rule with peer to specify the outer IPs. */
5001         ret = flow_tcf_encap_local(tcf, vtep, dev_flow, true, error);
5002         if (!ret) {
5003                 /* Create neigh rule to specify outer destination MAC. */
5004                 ret = flow_tcf_encap_neigh(tcf, vtep, dev_flow, true, error);
5005                 if (ret)
5006                         flow_tcf_encap_local(tcf, vtep,
5007                                              dev_flow, false, error);
5008         }
5009         if (ret) {
5010                 if (--vtep->refcnt == 0)
5011                         flow_tcf_vtep_delete(tcf, vtep);
5012                 return NULL;
5013         }
5014         return vtep;
5015 }
5016
5017 /**
5018  * Acquire target interface index for tunneling of any type.
5019  * Create a new VTEP if needed.
5020  *
5021  * @param[in] tcf
5022  *   Context object initialized by mlx5_flow_tcf_context_create().
5023  * @param[in] ifouter
5024  *   Network interface index to attach VXLAN encap device to.
5025  * @param[in] dev_flow
5026  *   Flow tcf object with tunnel structure pointer set.
5027  * @param[out] error
5028  *   Perform verbose error reporting if not NULL.
5029  * @return
5030  *   Interface descriptor pointer on success,
5031  *   NULL otherwise and rte_errno is set.
5032  */
5033 static struct tcf_vtep*
5034 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5035                       unsigned int ifouter,
5036                       struct mlx5_flow *dev_flow,
5037                       struct rte_flow_error *error)
5038 {
5039         struct tcf_vtep *vtep = NULL;
5040
5041         assert(dev_flow->tcf.tunnel);
5042         pthread_mutex_lock(&vtep_list_mutex);
5043         switch (dev_flow->tcf.tunnel->type) {
5044         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5045                 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5046                                                   dev_flow, error);
5047                 break;
5048         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5049                 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
5050                 break;
5051         default:
5052                 rte_flow_error_set(error, ENOTSUP,
5053                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5054                                    "unsupported tunnel type");
5055                 break;
5056         }
5057         pthread_mutex_unlock(&vtep_list_mutex);
5058         return vtep;
5059 }
5060
5061 /**
5062  * Release tunneling interface by ifindex. Decrements reference
5063  * counter and actually removes the device if counter is zero.
5064  *
5065  * @param[in] tcf
5066  *   Context object initialized by mlx5_flow_tcf_context_create().
5067  * @param[in] vtep
5068  *   VTEP device descriptor structure.
5069  * @param[in] dev_flow
5070  *   Flow tcf object with tunnel structure pointer set.
5071  */
5072 static void
5073 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5074                       struct tcf_vtep *vtep,
5075                       struct mlx5_flow *dev_flow)
5076 {
5077         assert(dev_flow->tcf.tunnel);
5078         pthread_mutex_lock(&vtep_list_mutex);
5079         switch (dev_flow->tcf.tunnel->type) {
5080         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5081                 break;
5082         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5083                 /* Remove the encap ancillary rules first. */
5084                 flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL);
5085                 flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL);
5086                 break;
5087         default:
5088                 assert(false);
5089                 DRV_LOG(WARNING, "Unsupported tunnel type");
5090                 break;
5091         }
5092         assert(vtep->refcnt);
5093         if (--vtep->refcnt == 0) {
5094                 LIST_REMOVE(vtep, next);
5095                 flow_tcf_vtep_delete(tcf, vtep);
5096         }
5097         pthread_mutex_unlock(&vtep_list_mutex);
5098 }
5099
5100 struct tcf_nlcb_query {
5101         uint32_t handle;
5102         uint32_t tc_flags;
5103         uint32_t flags_valid:1;
5104 };
5105
5106 /**
5107  * Collect queried rule attributes. This is a callback routine called
5108  * by libmnl mnl_cb_run() in a loop for every message in the received
5109  * packet. The current implementation collects the flower flags only.
5110  *
5111  * @param[in] nlh
5112  *   Pointer to reply header.
5113  * @param[in, out] arg
5114  *   Context pointer for this callback.
5115  *
5116  * @return
5117  *   A positive, nonzero value on success (required by libmnl
5118  *   to continue message processing).
5119  */
5120 static int
5121 flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg)
5122 {
5123         struct tcf_nlcb_query *query = arg;
5124         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5125         struct nlattr *na, *na_opt;
5126         bool flower = false;
5127
5128         if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5129             tcm->tcm_handle != query->handle)
5130                 return 1;
5131         mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5132                 switch (mnl_attr_get_type(na)) {
5133                 case TCA_KIND:
5134                         if (strcmp(mnl_attr_get_payload(na), "flower")) {
5135                                 /* Not flower filter, drop entire message. */
5136                                 return 1;
5137                         }
5138                         flower = true;
5139                         break;
5140                 case TCA_OPTIONS:
5141                         if (!flower) {
5142                                 /* Not flower options, drop entire message. */
5143                                 return 1;
5144                         }
5145                         /* Check nested flower options. */
5146                         mnl_attr_for_each_nested(na_opt, na) {
5147                                 switch (mnl_attr_get_type(na_opt)) {
5148                                 case TCA_FLOWER_FLAGS:
5149                                         query->flags_valid = 1;
5150                                         query->tc_flags =
5151                                                 mnl_attr_get_u32(na_opt);
5152                                         break;
5153                                 }
5154                         }
5155                         break;
5156                 }
5157         }
5158         return 1;
5159 }
5160
5161 /**
5162  * Query a TC flower rule flags via netlink.
5163  *
5164  * @param[in] tcf
5165  *   Context object initialized by mlx5_flow_tcf_context_create().
5166  * @param[in] dev_flow
5167  *   Pointer to the flow.
5168  * @param[out] pflags
5169  *   Pointer to the data retrieved by the query.
5170  *
5171  * @return
5172  *   0 on success, a negative errno value otherwise.
5173  */
5174 static int
5175 flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf,
5176                      struct mlx5_flow *dev_flow,
5177                      uint32_t *pflags)
5178 {
5179         struct nlmsghdr *nlh;
5180         struct tcmsg *tcm;
5181         struct tcf_nlcb_query query = {
5182                 .handle = dev_flow->tcf.tcm->tcm_handle,
5183         };
5184
5185         nlh = mnl_nlmsg_put_header(tcf->buf);
5186         nlh->nlmsg_type = RTM_GETTFILTER;
5187         nlh->nlmsg_flags = NLM_F_REQUEST;
5188         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5189         memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm));
5190         /*
5191          * Ignore the Netlink error for filter query operations.
5192          * The reply length is sent by the kernel as errno.
5193          * Just check that we got the flags option.
5194          */
5195         flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query);
5196         if (!query.flags_valid) {
5197                 *pflags = 0;
5198                 return -ENOENT;
5199         }
5200         *pflags = query.tc_flags;
5201         return 0;
5202 }
5203
5204 /**
5205  * Query and check the in_hw set for specified rule.
5206  *
5207  * @param[in] tcf
5208  *   Context object initialized by mlx5_flow_tcf_context_create().
5209  * @param[in] dev_flow
5210  *   Pointer to the flow to check.
5211  *
5212  * @return
5213  *   0 on success, a negative errno value otherwise.
5214  */
5215 static int
5216 flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf,
5217                     struct mlx5_flow *dev_flow)
5218 {
5219         uint32_t flags;
5220         int ret;
5221
5222         ret = flow_tcf_query_flags(tcf, dev_flow, &flags);
5223         if (ret)
5224                 return ret;
5225         return (flags & TCA_CLS_FLAGS_IN_HW) ? 0 : -ENOENT;
5226 }
5227
5228 /**
5229  * Remove flow from E-Switch by sending Netlink message.
5230  *
5231  * @param[in] dev
5232  *   Pointer to Ethernet device.
5233  * @param[in, out] flow
5234  *   Pointer to the sub flow.
5235  */
5236 static void
5237 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5238 {
5239         struct priv *priv = dev->data->dev_private;
5240         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5241         struct mlx5_flow *dev_flow;
5242         struct nlmsghdr *nlh;
5243
5244         if (!flow)
5245                 return;
5246         dev_flow = LIST_FIRST(&flow->dev_flows);
5247         if (!dev_flow)
5248                 return;
5249         /* E-Switch flow can't be expanded. */
5250         assert(!LIST_NEXT(dev_flow, next));
5251         if (dev_flow->tcf.applied) {
5252                 nlh = dev_flow->tcf.nlh;
5253                 nlh->nlmsg_type = RTM_DELTFILTER;
5254                 nlh->nlmsg_flags = NLM_F_REQUEST;
5255                 flow_tcf_nl_ack(ctx, nlh, NULL, NULL);
5256                 if (dev_flow->tcf.tunnel) {
5257                         assert(dev_flow->tcf.tunnel->vtep);
5258                         flow_tcf_vtep_release(ctx,
5259                                 dev_flow->tcf.tunnel->vtep,
5260                                 dev_flow);
5261                         dev_flow->tcf.tunnel->vtep = NULL;
5262                 }
5263                 dev_flow->tcf.applied = 0;
5264         }
5265 }
5266
5267 /**
5268  * Apply flow to E-Switch by sending Netlink message.
5269  *
5270  * @param[in] dev
5271  *   Pointer to Ethernet device.
5272  * @param[in, out] flow
5273  *   Pointer to the sub flow.
5274  * @param[out] error
5275  *   Pointer to the error structure.
5276  *
5277  * @return
5278  *   0 on success, a negative errno value otherwise and rte_errno is set.
5279  */
5280 static int
5281 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5282                struct rte_flow_error *error)
5283 {
5284         struct priv *priv = dev->data->dev_private;
5285         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5286         struct mlx5_flow *dev_flow;
5287         struct nlmsghdr *nlh;
5288
5289         dev_flow = LIST_FIRST(&flow->dev_flows);
5290         /* E-Switch flow can't be expanded. */
5291         assert(!LIST_NEXT(dev_flow, next));
5292         if (dev_flow->tcf.applied)
5293                 return 0;
5294         nlh = dev_flow->tcf.nlh;
5295         nlh->nlmsg_type = RTM_NEWTFILTER;
5296         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5297         if (dev_flow->tcf.tunnel) {
5298                 /*
5299                  * Replace the interface index, target for
5300                  * encapsulation, source for decapsulation.
5301                  */
5302                 assert(!dev_flow->tcf.tunnel->vtep);
5303                 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5304                 /* Acquire actual VTEP device when rule is being applied. */
5305                 dev_flow->tcf.tunnel->vtep =
5306                         flow_tcf_vtep_acquire(ctx,
5307                                         dev_flow->tcf.tunnel->ifindex_org,
5308                                         dev_flow, error);
5309                 if (!dev_flow->tcf.tunnel->vtep)
5310                         return -rte_errno;
5311                 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5312                                 dev_flow->tcf.tunnel->vtep->ifindex,
5313                                 dev_flow->tcf.tunnel->ifindex_org);
5314                 *dev_flow->tcf.tunnel->ifindex_ptr =
5315                         dev_flow->tcf.tunnel->vtep->ifindex;
5316         }
5317         if (!flow_tcf_nl_ack(ctx, nlh, NULL, NULL)) {
5318                 dev_flow->tcf.applied = 1;
5319                 if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
5320                         return 0;
5321                 /*
5322                  * The rule was applied without the skip_sw flag set.
5323                  * We should check whether the rule was actually
5324                  * accepted by hardware (by looking at the in_hw flag).
5325                  */
5326                 if (flow_tcf_check_inhw(ctx, dev_flow)) {
5327                         flow_tcf_remove(dev, flow);
5328                         return rte_flow_error_set
5329                                 (error, ENOENT,
5330                                  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5331                                  "netlink: rule has no in_hw flag set");
5332                 }
5333                 return 0;
5334         }
5335         if (dev_flow->tcf.tunnel) {
5336                 /* Rollback the VTEP configuration if rule apply failed. */
5337                 assert(dev_flow->tcf.tunnel->vtep);
5338                 flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
5339                                       dev_flow);
5340                 dev_flow->tcf.tunnel->vtep = NULL;
5341         }
5342         return rte_flow_error_set(error, rte_errno,
5343                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5344                                   "netlink: failed to create TC flow rule");
5345 }
5346
5347 /**
5348  * Remove flow from E-Switch and release resources of the device flow.
5349  *
5350  * @param[in] dev
5351  *   Pointer to Ethernet device.
5352  * @param[in, out] flow
5353  *   Pointer to the sub flow.
5354  */
5355 static void
5356 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5357 {
5358         struct mlx5_flow *dev_flow;
5359
5360         if (!flow)
5361                 return;
5362         flow_tcf_remove(dev, flow);
5363         if (flow->counter) {
5364                 if (--flow->counter->ref_cnt == 0) {
5365                         rte_free(flow->counter);
5366                         flow->counter = NULL;
5367                 }
5368         }
5369         dev_flow = LIST_FIRST(&flow->dev_flows);
5370         if (!dev_flow)
5371                 return;
5372         /* E-Switch flow can't be expanded. */
5373         assert(!LIST_NEXT(dev_flow, next));
5374         LIST_REMOVE(dev_flow, next);
5375         rte_free(dev_flow);
5376 }
5377
5378 /**
5379  * Helper routine for figuring the space size required for a parse buffer.
5380  *
5381  * @param array
5382  *   Array of values to use.
5383  * @param idx
5384  *   Current location in array.
5385  * @param value
5386  *   Value to compare with.
5387  *
5388  * @return
5389  *   The maximum between the given value and the array value on index.
5390  */
5391 static uint16_t
5392 flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
5393 {
5394         return idx < 0 ? value : RTE_MAX(array[idx], value);
5395 }
5396
5397 /**
5398  * Parse rtnetlink message attributes filling the attribute table with the info
5399  * retrieved.
5400  *
5401  * @param tb
5402  *   Attribute table to be filled.
5403  * @param max
5404  *   Maximum entry in the attribute table.
5405  * @param rta
5406  *   The attributes section in the message to be parsed.
5407  * @param len
5408  *   The length of the attributes section in the message.
5409  */
5410 static void
5411 flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
5412                          struct rtattr *rta, int len)
5413 {
5414         unsigned short type;
5415         memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
5416         while (RTA_OK(rta, len)) {
5417                 type = rta->rta_type;
5418                 if (type <= max && !tb[type])
5419                         tb[type] = rta;
5420                 rta = RTA_NEXT(rta, len);
5421         }
5422 }
5423
5424 /**
5425  * Extract flow counters from flower action.
5426  *
5427  * @param rta
5428  *   flower action stats properties in the Netlink message received.
5429  * @param rta_type
5430  *   The backward sequence of rta_types, as written in the attribute table,
5431  *   we need to traverse in order to get to the requested object.
5432  * @param idx
5433  *   Current location in rta_type table.
5434  * @param[out] data
5435  *   data holding the count statistics of the rte_flow retrieved from
5436  *   the message.
5437  *
5438  * @return
5439  *   0 if data was found and retrieved, -1 otherwise.
5440  */
5441 static int
5442 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
5443                                        uint16_t rta_type[], int idx,
5444                                        struct gnet_stats_basic *data)
5445 {
5446         int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
5447                                                  TCA_STATS_BASIC);
5448         struct rtattr *tbs[tca_stats_max + 1];
5449
5450         if (rta == NULL || idx < 0)
5451                 return -1;
5452         flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
5453                                  RTA_DATA(rta), RTA_PAYLOAD(rta));
5454         switch (rta_type[idx]) {
5455         case TCA_STATS_BASIC:
5456                 if (tbs[TCA_STATS_BASIC]) {
5457                         memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
5458                                RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
5459                                sizeof(*data)));
5460                         return 0;
5461                 }
5462                 break;
5463         default:
5464                 break;
5465         }
5466         return -1;
5467 }
5468
5469 /**
5470  * Parse flower single action retrieving the requested action attribute,
5471  * if found.
5472  *
5473  * @param arg
5474  *   flower action properties in the Netlink message received.
5475  * @param rta_type
5476  *   The backward sequence of rta_types, as written in the attribute table,
5477  *   we need to traverse in order to get to the requested object.
5478  * @param idx
5479  *   Current location in rta_type table.
5480  * @param[out] data
5481  *   Count statistics retrieved from the message query.
5482  *
5483  * @return
5484  *   0 if data was found and retrieved, -1 otherwise.
5485  */
5486 static int
5487 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
5488                                      uint16_t rta_type[], int idx, void *data)
5489 {
5490         int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
5491         struct rtattr *tb[tca_act_max + 1];
5492
5493         if (arg == NULL || idx < 0)
5494                 return -1;
5495         flow_tcf_nl_parse_rtattr(tb, tca_act_max,
5496                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5497         if (tb[TCA_ACT_KIND] == NULL)
5498                 return -1;
5499         switch (rta_type[idx]) {
5500         case TCA_ACT_STATS:
5501                 if (tb[TCA_ACT_STATS])
5502                         return flow_tcf_nl_action_stats_parse_and_get
5503                                         (tb[TCA_ACT_STATS],
5504                                          rta_type, --idx,
5505                                          (struct gnet_stats_basic *)data);
5506                 break;
5507         default:
5508                 break;
5509         }
5510         return -1;
5511 }
5512
5513 /**
5514  * Parse flower action section in the message retrieving the requested
5515  * attribute from the first action that provides it.
5516  *
5517  * @param arg
5518  *   flower section in the Netlink message received.
5519  * @param rta_type
5520  *   The backward sequence of rta_types, as written in the attribute table,
5521  *   we need to traverse in order to get to the requested object.
5522  * @param idx
5523  *   Current location in rta_type table.
5524  * @param[out] data
5525  *   data retrieved from the message query.
5526  *
5527  * @return
5528  *   0 if data was found and retrieved, -1 otherwise.
5529  */
5530 static int
5531 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
5532                                  uint16_t rta_type[], int idx, void *data)
5533 {
5534         struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
5535         int i;
5536
5537         if (arg == NULL || idx < 0)
5538                 return -1;
5539         flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
5540                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5541         switch (rta_type[idx]) {
5542         /*
5543          * flow counters are stored in the actions defined by the flow
5544          * and not in the flow itself, therefore we need to traverse the
5545          * flower chain of actions in search for them.
5546          *
5547          * Note that the index is not decremented here.
5548          */
5549         case TCA_ACT_STATS:
5550                 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
5551                         if (tb[i] &&
5552                         !flow_tcf_nl_parse_one_action_and_get(tb[i],
5553                                                               rta_type,
5554                                                               idx, data))
5555                                 return 0;
5556                 }
5557                 break;
5558         default:
5559                 break;
5560         }
5561         return -1;
5562 }
5563
5564 /**
5565  * Parse flower classifier options in the message, retrieving the requested
5566  * attribute if found.
5567  *
5568  * @param opt
5569  *   flower section in the Netlink message received.
5570  * @param rta_type
5571  *   The backward sequence of rta_types, as written in the attribute table,
5572  *   we need to traverse in order to get to the requested object.
5573  * @param idx
5574  *   Current location in rta_type table.
5575  * @param[out] data
5576  *   data retrieved from the message query.
5577  *
5578  * @return
5579  *   0 if data was found and retrieved, -1 otherwise.
5580  */
5581 static int
5582 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
5583                                uint16_t rta_type[], int idx, void *data)
5584 {
5585         int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
5586                                                   TCA_FLOWER_ACT);
5587         struct rtattr *tb[tca_flower_max + 1];
5588
5589         if (!opt || idx < 0)
5590                 return -1;
5591         flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
5592                                  RTA_DATA(opt), RTA_PAYLOAD(opt));
5593         switch (rta_type[idx]) {
5594         case TCA_FLOWER_ACT:
5595                 if (tb[TCA_FLOWER_ACT])
5596                         return flow_tcf_nl_action_parse_and_get
5597                                                         (tb[TCA_FLOWER_ACT],
5598                                                          rta_type, --idx, data);
5599                 break;
5600         default:
5601                 break;
5602         }
5603         return -1;
5604 }
5605
5606 /**
5607  * Parse Netlink reply on filter query, retrieving the flow counters.
5608  *
5609  * @param cnlh
5610  *   Message received from Netlink.
5611  * @param rta_type
5612  *   The backward sequence of rta_types, as written in the attribute table,
5613  *   we need to traverse in order to get to the requested object.
5614  * @param idx
5615  *   Current location in rta_type table.
5616  * @param[out] data
5617  *   data retrieved from the message query.
5618  *
5619  * @return
5620  *   0 if data was found and retrieved, -1 otherwise.
5621  */
5622 static int
5623 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
5624                                  uint16_t rta_type[], int idx, void *data)
5625 {
5626         struct nlmsghdr *nlh = cnlh;
5627         struct tcmsg *t = NLMSG_DATA(nlh);
5628         int len = nlh->nlmsg_len;
5629         int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
5630         struct rtattr *tb[tca_max + 1];
5631
5632         if (idx < 0)
5633                 return -1;
5634         if (nlh->nlmsg_type != RTM_NEWTFILTER &&
5635             nlh->nlmsg_type != RTM_GETTFILTER &&
5636             nlh->nlmsg_type != RTM_DELTFILTER)
5637                 return -1;
5638         len -= NLMSG_LENGTH(sizeof(*t));
5639         if (len < 0)
5640                 return -1;
5641         flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
5642         /* Not a TC flower flow - bail out */
5643         if (!tb[TCA_KIND] ||
5644             strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
5645                 return -1;
5646         switch (rta_type[idx]) {
5647         case TCA_OPTIONS:
5648                 if (tb[TCA_OPTIONS])
5649                         return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
5650                                                               rta_type,
5651                                                               --idx, data);
5652                 break;
5653         default:
5654                 break;
5655         }
5656         return -1;
5657 }
5658
5659 /**
5660  * A callback to parse Netlink reply on TC flower query.
5661  *
5662  * @param nlh
5663  *   Message received from Netlink.
5664  * @param[out] data
5665  *   Pointer to data area to be filled by the parsing routine.
5666  *   Assumed to be a pointer to struct flow_tcf_stats_basic.
5667  *
5668  * @return
5669  *   MNL_CB_OK value.
5670  */
5671 static int
5672 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
5673 {
5674         /*
5675          * The backward sequence of rta_types to pass in order to get
5676          * to the counters.
5677          */
5678         uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
5679                                 TCA_FLOWER_ACT, TCA_OPTIONS };
5680         struct flow_tcf_stats_basic *sb_data = data;
5681         union {
5682                 const struct nlmsghdr *c;
5683                 struct nlmsghdr *nc;
5684         } tnlh = { .c = nlh };
5685
5686         if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
5687                                               RTE_DIM(rta_type) - 1,
5688                                               (void *)&sb_data->counters))
5689                 sb_data->valid = true;
5690         return MNL_CB_OK;
5691 }
5692
5693 /**
5694  * Query a TC flower rule for its statistics via netlink.
5695  *
5696  * @param[in] dev
5697  *   Pointer to Ethernet device.
5698  * @param[in] flow
5699  *   Pointer to the sub flow.
5700  * @param[out] data
5701  *   data retrieved by the query.
5702  * @param[out] error
5703  *   Perform verbose error reporting if not NULL.
5704  *
5705  * @return
5706  *   0 on success, a negative errno value otherwise and rte_errno is set.
5707  */
5708 static int
5709 flow_tcf_query_count(struct rte_eth_dev *dev,
5710                      struct rte_flow *flow,
5711                      void *data,
5712                      struct rte_flow_error *error)
5713 {
5714         struct flow_tcf_stats_basic sb_data;
5715         struct rte_flow_query_count *qc = data;
5716         struct priv *priv = dev->data->dev_private;
5717         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5718         struct mnl_socket *nl = ctx->nl;
5719         struct mlx5_flow *dev_flow;
5720         struct nlmsghdr *nlh;
5721         uint32_t seq = priv->tcf_context->seq++;
5722         ssize_t ret;
5723         assert(qc);
5724
5725         memset(&sb_data, 0, sizeof(sb_data));
5726         dev_flow = LIST_FIRST(&flow->dev_flows);
5727         /* E-Switch flow can't be expanded. */
5728         assert(!LIST_NEXT(dev_flow, next));
5729         if (!dev_flow->flow->counter)
5730                 goto notsup_exit;
5731         nlh = dev_flow->tcf.nlh;
5732         nlh->nlmsg_type = RTM_GETTFILTER;
5733         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
5734         nlh->nlmsg_seq = seq;
5735         if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
5736                 goto error_exit;
5737         do {
5738                 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
5739                 if (ret <= 0)
5740                         break;
5741                 ret = mnl_cb_run(ctx->buf, ret, seq,
5742                                  mnl_socket_get_portid(nl),
5743                                  flow_tcf_nl_message_get_stats_basic,
5744                                  (void *)&sb_data);
5745         } while (ret > 0);
5746         /* Return the delta from last reset. */
5747         if (sb_data.valid) {
5749                 qc->hits_set = 1;
5750                 qc->bytes_set = 1;
5751                 qc->hits = sb_data.counters.packets - flow->counter->hits;
5752                 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
5753                 if (qc->reset) {
5754                         flow->counter->hits = sb_data.counters.packets;
5755                         flow->counter->bytes = sb_data.counters.bytes;
5756                 }
5757                 return 0;
5758         }
5759         return rte_flow_error_set(error, EINVAL,
5760                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5761                                   NULL,
5762                                   "flow does not have counter");
5763 error_exit:
5764         return rte_flow_error_set
5765                         (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5766                          NULL, "netlink: failed to read flow rule counters");
5767 notsup_exit:
5768         return rte_flow_error_set
5769                         (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5770                          NULL, "counters are not available.");
5771 }
5772
5773 /**
5774  * Query a flow.
5775  *
5776  * @see rte_flow_query()
5777  * @see rte_flow_ops
5778  */
5779 static int
5780 flow_tcf_query(struct rte_eth_dev *dev,
5781                struct rte_flow *flow,
5782                const struct rte_flow_action *actions,
5783                void *data,
5784                struct rte_flow_error *error)
5785 {
5786         int ret = -EINVAL;
5787
5788         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
5789                 switch (actions->type) {
5790                 case RTE_FLOW_ACTION_TYPE_VOID:
5791                         break;
5792                 case RTE_FLOW_ACTION_TYPE_COUNT:
5793                         ret = flow_tcf_query_count(dev, flow, data, error);
5794                         break;
5795                 default:
5796                         return rte_flow_error_set(error, ENOTSUP,
5797                                                   RTE_FLOW_ERROR_TYPE_ACTION,
5798                                                   actions,
5799                                                   "action not supported");
5800                 }
5801         }
5802         return ret;
5803 }
5804
5805 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
5806         .validate = flow_tcf_validate,
5807         .prepare = flow_tcf_prepare,
5808         .translate = flow_tcf_translate,
5809         .apply = flow_tcf_apply,
5810         .remove = flow_tcf_remove,
5811         .destroy = flow_tcf_destroy,
5812         .query = flow_tcf_query,
5813 };
5814
5815 /**
5816  * Create and configure a libmnl socket for Netlink flow rules.
5817  *
5818  * @return
5819  *   A valid libmnl socket object pointer on success, NULL otherwise and
5820  *   rte_errno is set.
5821  */
5822 static struct mnl_socket *
5823 flow_tcf_mnl_socket_create(void)
5824 {
5825         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
5826
5827         if (nl) {
5828                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
5829                                       sizeof(int));
5830                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
5831                         return nl;
5832         }
5833         rte_errno = errno;
5834         if (nl)
5835                 mnl_socket_close(nl);
5836         return NULL;
5837 }
5838
5839 /**
5840  * Destroy a libmnl socket.
5841  *
5842  * @param nl
5843  *   Libmnl socket of the @p NETLINK_ROUTE kind.
5844  */
5845 static void
5846 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
5847 {
5848         if (nl)
5849                 mnl_socket_close(nl);
5850 }
5851
5852 /**
5853  * Initialize ingress qdisc of a given network interface.
5854  *
5855  * @param ctx
5856  *   Pointer to tc-flower context to use.
5857  * @param ifindex
5858  *   Index of network interface to initialize.
5859  * @param[out] error
5860  *   Perform verbose error reporting if not NULL.
5861  *
5862  * @return
5863  *   0 on success, a negative errno value otherwise and rte_errno is set.
5864  */
5865 int
5866 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
5867                    unsigned int ifindex, struct rte_flow_error *error)
5868 {
5869         struct nlmsghdr *nlh;
5870         struct tcmsg *tcm;
5871         alignas(struct nlmsghdr)
5872         uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
5873                     SZ_NLATTR_STRZ_OF("ingress") +
5874                     MNL_BUF_EXTRA_SPACE];
5875
5876         /* Destroy existing ingress qdisc and everything attached to it. */
5877         nlh = mnl_nlmsg_put_header(buf);
5878         nlh->nlmsg_type = RTM_DELQDISC;
5879         nlh->nlmsg_flags = NLM_F_REQUEST;
5880         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5881         tcm->tcm_family = AF_UNSPEC;
5882         tcm->tcm_ifindex = ifindex;
5883         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
5884         tcm->tcm_parent = TC_H_INGRESS;
5885         assert(sizeof(buf) >= nlh->nlmsg_len);
5886         /* Ignore errors when qdisc is already absent. */
5887         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) &&
5888             rte_errno != EINVAL && rte_errno != ENOENT)
5889                 return rte_flow_error_set(error, rte_errno,
5890                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5891                                           "netlink: failed to remove ingress"
5892                                           " qdisc");
5893         /* Create fresh ingress qdisc. */
5894         nlh = mnl_nlmsg_put_header(buf);
5895         nlh->nlmsg_type = RTM_NEWQDISC;
5896         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5897         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5898         tcm->tcm_family = AF_UNSPEC;
5899         tcm->tcm_ifindex = ifindex;
5900         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
5901         tcm->tcm_parent = TC_H_INGRESS;
5902         mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
5903         assert(sizeof(buf) >= nlh->nlmsg_len);
5904         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL))
5905                 return rte_flow_error_set(error, rte_errno,
5906                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5907                                           "netlink: failed to create ingress"
5908                                           " qdisc");
5909         return 0;
5910 }
5911
5912 /**
5913  * Create libmnl context for Netlink flow rules.
5914  *
5915  * @return
5916  *   A valid tc-flower context object pointer on success, NULL otherwise
5917  *   and rte_errno is set.
5918  */
5919 struct mlx5_flow_tcf_context *
5920 mlx5_flow_tcf_context_create(void)
5921 {
5922         struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
5923                                                         sizeof(*ctx),
5924                                                         sizeof(uint32_t));
5925         if (!ctx)
5926                 goto error;
5927         ctx->nl = flow_tcf_mnl_socket_create();
5928         if (!ctx->nl)
5929                 goto error;
5930         ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
5931         ctx->buf = rte_zmalloc(__func__,
5932                                ctx->buf_size, sizeof(uint32_t));
5933         if (!ctx->buf)
5934                 goto error;
5935         ctx->seq = random();
5936         return ctx;
5937 error:
5938         mlx5_flow_tcf_context_destroy(ctx);
5939         return NULL;
5940 }
5941
5942 /**
5943  * Destroy a libmnl context.
5944  *
5945  * @param ctx
5946  *   Context object created by mlx5_flow_tcf_context_create().
5947  */
5948 void
5949 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
5950 {
5951         if (!ctx)
5952                 return;
5953         flow_tcf_mnl_socket_destroy(ctx->nl);
5954         rte_free(ctx->buf);
5955         rte_free(ctx);
5956 }