deb_dpdk: drivers/net/mlx5/mlx5_flow_tcf.c (upstream version 18.11.1)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2018 6WIND S.A.
 * Copyright 2018 Mellanox Technologies, Ltd
 */

#include <assert.h>
#include <errno.h>
#include <libmnl/libmnl.h>
#include <linux/gen_stats.h>
#include <linux/if_ether.h>
#include <linux/netlink.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/tc_act/tc_gact.h>
#include <linux/tc_act/tc_mirred.h>
#include <netinet/in.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/socket.h>

#include <rte_byteorder.h>
#include <rte_errno.h>
#include <rte_ether.h>
#include <rte_flow.h>
#include <rte_malloc.h>
#include <rte_common.h>
#include <rte_cycles.h>

#include "mlx5.h"
#include "mlx5_flow.h"
#include "mlx5_autoconf.h"

#ifdef HAVE_TC_ACT_VLAN

#include <linux/tc_act/tc_vlan.h>

#else /* HAVE_TC_ACT_VLAN */

#define TCA_VLAN_ACT_POP 1
#define TCA_VLAN_ACT_PUSH 2
#define TCA_VLAN_ACT_MODIFY 3
#define TCA_VLAN_PARMS 2
#define TCA_VLAN_PUSH_VLAN_ID 3
#define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
#define TCA_VLAN_PAD 5
#define TCA_VLAN_PUSH_VLAN_PRIORITY 6

struct tc_vlan {
        tc_gen;
        int v_action;
};

#endif /* HAVE_TC_ACT_VLAN */

#ifdef HAVE_TC_ACT_PEDIT

#include <linux/tc_act/tc_pedit.h>

#else /* HAVE_TC_ACT_PEDIT */

enum {
        TCA_PEDIT_UNSPEC,
        TCA_PEDIT_TM,
        TCA_PEDIT_PARMS,
        TCA_PEDIT_PAD,
        TCA_PEDIT_PARMS_EX,
        TCA_PEDIT_KEYS_EX,
        TCA_PEDIT_KEY_EX,
        __TCA_PEDIT_MAX
};

enum {
        TCA_PEDIT_KEY_EX_HTYPE = 1,
        TCA_PEDIT_KEY_EX_CMD = 2,
        __TCA_PEDIT_KEY_EX_MAX
};

enum pedit_header_type {
        TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
        __PEDIT_HDR_TYPE_MAX,
};

enum pedit_cmd {
        TCA_PEDIT_KEY_EX_CMD_SET = 0,
        TCA_PEDIT_KEY_EX_CMD_ADD = 1,
        __PEDIT_CMD_MAX,
};

struct tc_pedit_key {
        __u32 mask; /* AND */
        __u32 val; /* XOR */
        __u32 off; /* offset */
        __u32 at;
        __u32 offmask;
        __u32 shift;
};

__extension__
struct tc_pedit_sel {
        tc_gen;
        unsigned char nkeys;
        unsigned char flags;
        struct tc_pedit_key keys[0];
};

#endif /* HAVE_TC_ACT_PEDIT */

#ifdef HAVE_TC_ACT_TUNNEL_KEY

#include <linux/tc_act/tc_tunnel_key.h>

#ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#endif

#ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
#define TCA_TUNNEL_KEY_NO_CSUM 10
#endif

#else /* HAVE_TC_ACT_TUNNEL_KEY */

#define TCA_ACT_TUNNEL_KEY 17
#define TCA_TUNNEL_KEY_ACT_SET 1
#define TCA_TUNNEL_KEY_ACT_RELEASE 2
#define TCA_TUNNEL_KEY_PARMS 2
#define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
#define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
#define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
#define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
#define TCA_TUNNEL_KEY_ENC_KEY_ID 7
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#define TCA_TUNNEL_KEY_NO_CSUM 10

struct tc_tunnel_key {
        tc_gen;
        int t_action;
};

#endif /* HAVE_TC_ACT_TUNNEL_KEY */

/* Normally found in linux/netlink.h. */
#ifndef NETLINK_CAP_ACK
#define NETLINK_CAP_ACK 10
#endif

/* Normally found in linux/pkt_sched.h. */
#ifndef TC_H_MIN_INGRESS
#define TC_H_MIN_INGRESS 0xfff2u
#endif

/* Normally found in linux/pkt_cls.h. */
#ifndef TCA_CLS_FLAGS_SKIP_SW
#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
#endif
#ifndef TCA_CLS_FLAGS_IN_HW
#define TCA_CLS_FLAGS_IN_HW (1 << 2)
#endif
#ifndef HAVE_TCA_CHAIN
#define TCA_CHAIN 11
#endif
#ifndef HAVE_TCA_FLOWER_ACT
#define TCA_FLOWER_ACT 3
#endif
#ifndef HAVE_TCA_FLOWER_FLAGS
#define TCA_FLOWER_FLAGS 22
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
#define TCA_FLOWER_KEY_ETH_TYPE 8
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
#define TCA_FLOWER_KEY_ETH_DST 4
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
#define TCA_FLOWER_KEY_ETH_DST_MASK 5
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
#define TCA_FLOWER_KEY_ETH_SRC 6
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
#define TCA_FLOWER_KEY_ETH_SRC_MASK 7
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
#define TCA_FLOWER_KEY_IP_PROTO 9
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
#define TCA_FLOWER_KEY_IPV4_SRC 10
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
#define TCA_FLOWER_KEY_IPV4_DST 12
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
#define TCA_FLOWER_KEY_IPV4_DST_MASK 13
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
#define TCA_FLOWER_KEY_IPV6_SRC 14
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
#define TCA_FLOWER_KEY_IPV6_DST 16
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
#define TCA_FLOWER_KEY_IPV6_DST_MASK 17
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
#define TCA_FLOWER_KEY_TCP_SRC 18
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
#define TCA_FLOWER_KEY_TCP_SRC_MASK 35
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
#define TCA_FLOWER_KEY_TCP_DST 19
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
#define TCA_FLOWER_KEY_TCP_DST_MASK 36
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
#define TCA_FLOWER_KEY_UDP_SRC 20
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
#define TCA_FLOWER_KEY_UDP_SRC_MASK 37
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
#define TCA_FLOWER_KEY_UDP_DST 21
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
#define TCA_FLOWER_KEY_UDP_DST_MASK 38
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
#define TCA_FLOWER_KEY_VLAN_ID 23
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
#define TCA_FLOWER_KEY_VLAN_PRIO 24
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
#define TCA_FLOWER_KEY_ENC_KEY_ID 26
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
#define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
#define TCA_FLOWER_KEY_ENC_IPV4_DST 29
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
#define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
#define TCA_FLOWER_KEY_ENC_IPV6_DST 33
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
#define TCA_FLOWER_KEY_TCP_FLAGS 71
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
#define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
#endif
#ifndef HAVE_TC_ACT_GOTO_CHAIN
#define TC_ACT_GOTO_CHAIN 0x20000000
#endif

#ifndef IPV6_ADDR_LEN
#define IPV6_ADDR_LEN 16
#endif

#ifndef IPV4_ADDR_LEN
#define IPV4_ADDR_LEN 4
#endif

#ifndef TP_PORT_LEN
#define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
#endif

#ifndef TTL_LEN
#define TTL_LEN 1
#endif

#ifndef TCA_ACT_MAX_PRIO
#define TCA_ACT_MAX_PRIO 32
#endif

/** UDP port range of VXLAN devices created by driver. */
#define MLX5_VXLAN_PORT_MIN 30000
#define MLX5_VXLAN_PORT_MAX 60000
#define MLX5_VXLAN_DEVICE_PFX "vmlx_"
/**
 * Timeout in milliseconds to wait for VXLAN UDP offloaded port
 * registration to complete within the mlx5 driver.
 */
#define MLX5_VXLAN_WAIT_PORT_REG_MS 250

/** Tunnel action type, used for @p type in header structure. */
enum flow_tcf_tunact_type {
        FLOW_TCF_TUNACT_VXLAN_DECAP,
        FLOW_TCF_TUNACT_VXLAN_ENCAP,
};

/** Flags used for @p mask in tunnel action encap descriptors. */
#define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
#define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
#define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
#define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
#define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
#define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
#define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
#define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
#define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)

/**
 * Structure for holding netlink context.
 * Note the size of the message buffer, which is MNL_SOCKET_BUFFER_SIZE.
 * Using this (8 KB) buffer size ensures that netlink messages will never
 * be truncated.
 */
struct mlx5_flow_tcf_context {
        struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
        uint32_t seq; /* Message sequence number. */
        uint32_t buf_size; /* Message buffer size. */
        uint8_t *buf; /* Message buffer. */
};

/**
 * Neigh rule structure. The neigh rule is applied via Netlink to
 * outer tunnel iface in order to provide destination MAC address
 * for the VXLAN encapsulation. The neigh rule is implicitly related
 * to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_neigh_rule {
        LIST_ENTRY(tcf_neigh_rule) next;
        uint32_t refcnt;
        struct ether_addr eth;
        uint16_t mask;
        union {
                struct {
                        rte_be32_t dst;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                } ipv6;
        };
};

/**
 * Local rule structure. The local rule is applied via Netlink to
 * outer tunnel iface in order to provide local and peer IP addresses
 * of the VXLAN tunnel for encapsulation. The local rule is implicitly
 * related to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_local_rule {
        LIST_ENTRY(tcf_local_rule) next;
        uint32_t refcnt;
        uint16_t mask;
        union {
                struct {
                        rte_be32_t dst;
                        rte_be32_t src;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                        uint8_t src[IPV6_ADDR_LEN];
                } ipv6;
        };
};

/** VXLAN virtual netdev. */
struct tcf_vtep {
        LIST_ENTRY(tcf_vtep) next;
        LIST_HEAD(, tcf_neigh_rule) neigh;
        LIST_HEAD(, tcf_local_rule) local;
        uint32_t refcnt;
        unsigned int ifindex; /**< Own interface index. */
        unsigned int ifouter; /**< Index of device attached to. */
        uint16_t port;
        uint32_t created:1; /**< Actually created by PMD. */
        uint32_t waitreg:1; /**< Wait for VXLAN UDP port registration. */
};

/** Tunnel descriptor header, common for all tunnel types. */
struct flow_tcf_tunnel_hdr {
        uint32_t type; /**< Tunnel action type. */
        struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
        unsigned int ifindex_org; /**< Original dst/src interface */
        unsigned int *ifindex_ptr; /**< Interface ptr in message. */
};

struct flow_tcf_vxlan_decap {
        struct flow_tcf_tunnel_hdr hdr;
        uint16_t udp_port;
};

struct flow_tcf_vxlan_encap {
        struct flow_tcf_tunnel_hdr hdr;
        uint32_t mask;
        struct {
                struct ether_addr dst;
                struct ether_addr src;
        } eth;
        union {
                struct {
                        rte_be32_t dst;
                        rte_be32_t src;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                        uint8_t src[IPV6_ADDR_LEN];
                } ipv6;
        };
        struct {
                rte_be16_t src;
                rte_be16_t dst;
        } udp;
        struct {
                uint8_t vni[3];
        } vxlan;
};

/** Structure used when extracting the values of flow counters
 * from a netlink message.
 */
struct flow_tcf_stats_basic {
        bool valid;
        struct gnet_stats_basic counters;
};

/** Empty masks for known item types. */
static const union {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_empty = {
        {0},
};

/** Supported masks for known item types. */
static const struct {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_supported = {
        .port_id = {
                .id = 0xffffffff,
        },
        .eth = {
                .type = RTE_BE16(0xffff),
                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
        },
        .vlan = {
                /* PCP and VID only, no DEI. */
                .tci = RTE_BE16(0xefff),
                .inner_type = RTE_BE16(0xffff),
        },
        .ipv4.hdr = {
                .next_proto_id = 0xff,
                .src_addr = RTE_BE32(0xffffffff),
                .dst_addr = RTE_BE32(0xffffffff),
        },
        .ipv6.hdr = {
                .proto = 0xff,
                .src_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                .dst_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
        },
        .tcp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
                .tcp_flags = 0xff,
        },
        .udp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
        },
        .vxlan = {
                .vni = "\xff\xff\xff",
        },
};

#define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
#define SZ_NLATTR_NEST SZ_NLATTR_HDR
#define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
#define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
#define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
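
/*
 * Example (illustrative): netlink attributes are 4-byte aligned, so
 * SZ_NLATTR_TYPE_OF(uint16_t) = MNL_ALIGN(4 + 2) = 8 bytes and
 * SZ_NLATTR_STRZ_OF("pedit") = MNL_ALIGN(4 + 6) = 12 bytes.
 */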

#define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)

/** DPDK port to network interface index (ifindex) conversion. */
struct flow_tcf_ptoi {
        uint16_t port_id; /**< DPDK port ID. */
        unsigned int ifindex; /**< Network interface index. */
};

/* Due to a limitation on driver/FW. */
#define MLX5_TCF_GROUP_ID_MAX 3

/*
 * Due to a limitation on driver/FW, priority ranges from 1 to 16 in the
 * kernel. Priority in the rte_flow attribute starts from 0 and is
 * incremented by 1 in translation. This is subject to change: the max
 * priority may be determined by trial and error, as the Verbs driver
 * does, once the restriction is lifted or the range is extended.
 */
#define MLX5_TCF_GROUP_PRIORITY_MAX 15

#define MLX5_TCF_FATE_ACTIONS \
        (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
         MLX5_FLOW_ACTION_JUMP)

#define MLX5_TCF_VLAN_ACTIONS \
        (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
         MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)

#define MLX5_TCF_VXLAN_ACTIONS \
        (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)

#define MLX5_TCF_PEDIT_ACTIONS \
        (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
         MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
         MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
         MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
         MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)

#define MLX5_TCF_CONFIG_ACTIONS \
        (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
         MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
         MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
         (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))

#define MAX_PEDIT_KEYS 128
#define SZ_PEDIT_KEY_VAL 4

#define NUM_OF_PEDIT_KEYS(sz) \
        (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
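
/*
 * Example (illustrative): a 6-byte MAC rewrite needs
 * NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN) = 6/4 + 1 = 2 pedit keys,
 * as each pedit key rewrites one 32-bit word of the packet.
 */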

struct pedit_key_ex {
        enum pedit_header_type htype;
        enum pedit_cmd cmd;
};

struct pedit_parser {
        struct tc_pedit_sel sel;
        struct tc_pedit_key keys[MAX_PEDIT_KEYS];
        struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
};

/**
 * Create space for using the implicitly created TC flow counter.
 *
 * @return
 *   A pointer to the counter data structure, NULL otherwise and
 *   rte_errno is set.
 */
static struct mlx5_flow_counter *
flow_tcf_counter_new(void)
{
        struct mlx5_flow_counter *cnt;

        /*
         * E-Switch counters cannot be shared and their IDs are unknown;
         * currently all are returned with ID 0. In the future it may be
         * better to switch to unique numbers.
         */
        struct mlx5_flow_counter tmpl = {
                .ref_cnt = 1,
        };
        cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
        if (!cnt) {
                rte_errno = ENOMEM;
                return NULL;
        }
        *cnt = tmpl;
        /* Implicit counter, do not add to list. */
        return cnt;
}

/**
 * Set pedit key of MAC address.
 *
 * @param[in] actions
 *   Pointer to action specification.
 * @param[in,out] p_parser
 *   Pointer to pedit_parser.
 */
static void
flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
                           struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;
        uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
                                        offsetof(struct ether_hdr, s_addr) :
                                        offsetof(struct ether_hdr, d_addr);
        const struct rte_flow_action_set_mac *conf =
                (const struct rte_flow_action_set_mac *)actions->conf;

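        /*
         * Note: the kernel pedit action rewrites each 32-bit packet word
         * as (old & mask) ^ val, so a zero mask (~UINT32_MAX below)
         * replaces the whole word, while 0xFFFF0000 (on little-endian
         * hosts) keeps the two bytes following the 6-byte MAC intact.
         */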
        p_parser->keys[idx].off = off;
        p_parser->keys[idx].mask = ~UINT32_MAX;
        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        memcpy(&p_parser->keys[idx].val,
                conf->mac_addr, SZ_PEDIT_KEY_VAL);
        idx++;
        p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
        p_parser->keys[idx].mask = 0xFFFF0000;
        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        memcpy(&p_parser->keys[idx].val,
                conf->mac_addr + SZ_PEDIT_KEY_VAL,
                ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of decrease/set TTL.
 *
 * @param[in] actions
 *   Pointer to action specification.
 * @param[in,out] p_parser
 *   Pointer to pedit_parser.
 * @param[in] item_flags
 *   Flags of all items presented.
 */
static void
flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
                                struct pedit_parser *p_parser,
                                uint64_t item_flags)
{
        int idx = p_parser->sel.nkeys;

        p_parser->keys[idx].mask = 0xFFFFFF00;
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
                p_parser->keys[idx].off =
                        offsetof(struct ipv4_hdr, time_to_live);
        }
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
                p_parser->keys[idx].off =
                        offsetof(struct ipv6_hdr, hop_limits);
        }
        if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
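                /*
                 * ADD of 0xFF to the one-byte TTL field decrements it:
                 * adding 255 equals subtracting 1 modulo 256.
                 */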
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
                p_parser->keys[idx].val = 0x000000FF;
        } else {
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
                p_parser->keys[idx].val =
                        (__u32)((const struct rte_flow_action_set_ttl *)
                         actions->conf)->ttl_value;
        }
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of transport (TCP/UDP) port value.
 *
 * @param[in] actions
 *   Pointer to action specification.
 * @param[in,out] p_parser
 *   Pointer to pedit_parser.
 * @param[in] item_flags
 *   Flags of all items presented.
 */
static void
flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
                                struct pedit_parser *p_parser,
                                uint64_t item_flags)
{
        int idx = p_parser->sel.nkeys;

        if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        /* The offset of the src/dst port is the same for TCP and UDP. */
        p_parser->keys[idx].off =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
                offsetof(struct tcp_hdr, src_port) :
                offsetof(struct tcp_hdr, dst_port);
        p_parser->keys[idx].mask = 0xFFFF0000;
        p_parser->keys[idx].val =
                (__u32)((const struct rte_flow_action_set_tp *)
                                actions->conf)->port;
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of IPv6 address.
 *
 * @param[in] actions
 *   Pointer to action specification.
 * @param[in,out] p_parser
 *   Pointer to pedit_parser.
 */
static void
flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
                                 struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;
        int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
        int off_base =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
                offsetof(struct ipv6_hdr, src_addr) :
                offsetof(struct ipv6_hdr, dst_addr);
        const struct rte_flow_action_set_ipv6 *conf =
                (const struct rte_flow_action_set_ipv6 *)actions->conf;

        for (int i = 0; i < keys; i++, idx++) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
                p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
                p_parser->keys[idx].mask = ~UINT32_MAX;
                memcpy(&p_parser->keys[idx].val,
                        conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL,
                        SZ_PEDIT_KEY_VAL);
        }
        p_parser->sel.nkeys += keys;
}

/**
 * Set pedit key of IPv4 address.
 *
 * @param[in] actions
 *   Pointer to action specification.
 * @param[in,out] p_parser
 *   Pointer to pedit_parser.
 */
static void
flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
                                 struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;

        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        p_parser->keys[idx].off =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
                offsetof(struct ipv4_hdr, src_addr) :
                offsetof(struct ipv4_hdr, dst_addr);
        p_parser->keys[idx].mask = ~UINT32_MAX;
        p_parser->keys[idx].val =
                ((const struct rte_flow_action_set_ipv4 *)
                 actions->conf)->ipv4_addr;
        p_parser->sel.nkeys = (++idx);
}

/**
 * Create the pedit's netlink attributes in a pre-allocated
 * netlink message buffer.
 *
 * @param[in,out] nl
 *   Pointer to pre-allocated netlink message buffer.
 * @param[in,out] actions
 *   Pointer to pointer of actions specification.
 * @param[in] item_flags
 *   Flags of all items presented.
 */
static void
flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
                              const struct rte_flow_action **actions,
                              uint64_t item_flags)
{
        struct pedit_parser p_parser;
        struct nlattr *na_act_options;
        struct nlattr *na_pedit_keys;

        memset(&p_parser, 0, sizeof(p_parser));
        mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
        na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
        /* All modify header actions should be in one TC pedit action. */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        flow_tcf_pedit_key_set_tp_port(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        flow_tcf_pedit_key_set_dec_ttl(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        flow_tcf_pedit_key_set_mac(*actions, &p_parser);
                        break;
                default:
                        goto pedit_mnl_msg_done;
                }
        }
pedit_mnl_msg_done:
        p_parser.sel.action = TC_ACT_PIPE;
        mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
                     sizeof(p_parser.sel) +
                     p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
                     &p_parser);
        na_pedit_keys =
                mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
        for (int i = 0; i < p_parser.sel.nkeys; i++) {
                struct nlattr *na_pedit_key =
                        mnl_attr_nest_start(nl,
                                            TCA_PEDIT_KEY_EX | NLA_F_NESTED);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
                                 p_parser.keys_ex[i].htype);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
                                 p_parser.keys_ex[i].cmd);
                mnl_attr_nest_end(nl, na_pedit_key);
        }
        mnl_attr_nest_end(nl, na_pedit_keys);
        mnl_attr_nest_end(nl, na_act_options);
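        /*
         * Step back one action; the caller's loop increment resumes at
         * the first action not consumed by this pedit run.
         */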
        (*actions)--;
}

/**
 * Calculate the maximum memory size of one TC pedit action.
 * One TC pedit action can contain a set of keys, each defining
 * a rewrite element (rte_flow action).
 *
 * @param[in,out] actions
 *   Actions specification.
 * @param[in,out] action_flags
 *   Actions flags.
 * @return
 *   Maximum memory size of one TC pedit action.
 */
static int
flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
                                uint64_t *action_flags)
{
        int pedit_size = 0;
        int keys = 0;
        uint64_t flags = 0;

        pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
                      SZ_NLATTR_STRZ_OF("pedit") +
                      SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                        keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                        keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                        /* TCP is the same as UDP here. */
                        keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        /* TCP is the same as UDP here. */
                        keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TP_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                        keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
                        flags |= MLX5_FLOW_ACTION_DEC_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                        keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
                        break;
                default:
                        goto get_pedit_action_size_done;
                }
        }
get_pedit_action_size_done:
        /* TCA_PEDIT_PARMS_EX */
        pedit_size +=
                SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
                                  keys * sizeof(struct tc_pedit_key));
        pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS_EX */
        pedit_size += keys *
                      /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
                      (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
                       SZ_NLATTR_DATA_OF(2));
        (*action_flags) |= flags;
        (*actions)--;
        return pedit_size;
}

/**
 * Retrieve mask for pattern item.
 *
 * This function does basic sanity checks on a pattern item in order to
 * return the most appropriate mask for it.
 *
 * @param[in] item
 *   Item specification.
 * @param[in] mask_default
 *   Default mask for pattern item as specified by the flow API.
 * @param[in] mask_supported
 *   Mask fields supported by the implementation.
 * @param[in] mask_empty
 *   Empty mask to return when there is no specification.
 * @param[in] mask_size
 *   Size of the mask in bytes.
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 *
 * @return
 *   Either @p item->mask or one of the mask parameters on success, NULL
 *   otherwise and rte_errno is set.
 */
static const void *
flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
                   const void *mask_supported, const void *mask_empty,
                   size_t mask_size, struct rte_flow_error *error)
{
        const uint8_t *mask;
        size_t i;

        /* item->last and item->mask cannot exist without item->spec. */
        if (!item->spec && (item->mask || item->last)) {
                rte_flow_error_set(error, EINVAL,
                                   RTE_FLOW_ERROR_TYPE_ITEM, item,
                                   "\"mask\" or \"last\" field provided without"
                                   " a corresponding \"spec\"");
                return NULL;
        }
        /* No spec, no mask, no problem. */
        if (!item->spec)
                return mask_empty;
        mask = item->mask ? item->mask : mask_default;
        assert(mask);
        /*
         * Single-pass check to make sure that:
         * - Mask is supported, no bits are set outside mask_supported.
         * - Both item->spec and item->last are included in mask.
         */
        for (i = 0; i != mask_size; ++i) {
                if (!mask[i])
                        continue;
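                /*
                 * OR-ing mask[i] into the supported byte changes it only
                 * if mask[i] has a bit outside mask_supported, i.e. an
                 * unsupported field.
                 */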
                if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
                    ((const uint8_t *)mask_supported)[i]) {
                        rte_flow_error_set(error, ENOTSUP,
                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                           "unsupported field found"
                                           " in \"mask\"");
                        return NULL;
                }
                if (item->last &&
                    (((const uint8_t *)item->spec)[i] & mask[i]) !=
                    (((const uint8_t *)item->last)[i] & mask[i])) {
                        rte_flow_error_set(error, EINVAL,
                                           RTE_FLOW_ERROR_TYPE_ITEM_LAST,
                                           item->last,
                                           "range between \"spec\" and \"last\""
                                           " not comprised in \"mask\"");
                        return NULL;
                }
        }
        return mask;
}

/**
 * Build a conversion table between port ID and ifindex.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 * @param[out] ptoi
 *   Pointer to ptoi table.
 * @param[in] len
 *   Size of ptoi table provided.
 *
 * @return
 *   Size of ptoi table filled.
 */
static unsigned int
flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
                          unsigned int len)
{
        unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
        uint16_t port_id[n + 1];
        unsigned int i;
        unsigned int own = 0;

        /* At least one port is needed when no switch domain is present. */
        if (!n) {
                n = 1;
                port_id[0] = dev->data->port_id;
        } else {
                n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
        }
        if (n > len)
                return 0;
        for (i = 0; i != n; ++i) {
                struct rte_eth_dev_info dev_info;

                rte_eth_dev_info_get(port_id[i], &dev_info);
                if (port_id[i] == dev->data->port_id)
                        own = i;
                ptoi[i].port_id = port_id[i];
                ptoi[i].ifindex = dev_info.if_index;
        }
        /* Ensure first entry of ptoi[] is the current device. */
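        /*
         * ptoi[n] is still unused at this point, so it can serve as swap
         * scratch space; it is overwritten by the terminator just below.
         */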
        if (own) {
                ptoi[n] = ptoi[0];
                ptoi[0] = ptoi[own];
                ptoi[own] = ptoi[n];
        }
        /* An entry with zero ifindex terminates ptoi[]. */
        ptoi[n].port_id = 0;
        ptoi[n].ifindex = 0;
        return n;
}

/**
 * Verify the @p attr will be correctly understood by the E-switch.
 *
 * @param[in] attr
 *   Pointer to flow attributes.
 * @param[out] error
 *   Pointer to error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
                             struct rte_flow_error *error)
{
        /*
         * Supported attributes: groups, some priorities and ingress only.
         * Group is supported only if the kernel supports chains. Don't care
         * about transfer as it is the caller's problem.
         */
        if (attr->group > MLX5_TCF_GROUP_ID_MAX)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
                                          "group ID larger than "
                                          RTE_STR(MLX5_TCF_GROUP_ID_MAX)
                                          " isn't supported");
        else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
                                          attr,
                                          "priority more than "
                                          RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
                                          " is not supported");
        if (!attr->ingress)
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
                                          attr, "only ingress is supported");
        if (attr->egress)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
                                          attr, "egress is not supported");
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
 * The routine checks the L2 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
                                  struct rte_flow_error *error)
{
        const struct rte_flow_item_eth *spec = item->spec;
        const struct rte_flow_item_eth *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for L2 addresses can be empty
                 * because they are optional and not required
                 * directly by the tc rule. The kernel tries to
                 * resolve them on its own.
                 */
                return 0;
        }
        if (!mask) {
                /* If mask is not specified use the default one. */
                mask = &rte_flow_item_eth_mask;
        }
        if (memcmp(&mask->dst,
                   &flow_tcf_mask_empty.eth.dst,
                   sizeof(flow_tcf_mask_empty.eth.dst))) {
                if (memcmp(&mask->dst,
                           &rte_flow_item_eth_mask.dst,
                           sizeof(rte_flow_item_eth_mask.dst)))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.dst\" field");
        }
        if (memcmp(&mask->src,
                   &flow_tcf_mask_empty.eth.src,
                   sizeof(flow_tcf_mask_empty.eth.src))) {
                if (memcmp(&mask->src,
                           &rte_flow_item_eth_mask.src,
                           sizeof(rte_flow_item_eth_mask.src)))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.src\" field");
        }
        if (mask->type != RTE_BE16(0x0000)) {
                if (mask->type != RTE_BE16(0xffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.type\" field");
                DRV_LOG(WARNING,
                        "outer ethernet type field"
                        " cannot be forced for vxlan"
                        " encapsulation, parameter ignored");
        }
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
 * The routine checks the IPv4 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
                                   struct rte_flow_error *error)
{
        const struct rte_flow_item_ipv4 *spec = item->spec;
        const struct rte_flow_item_ipv4 *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for IP addresses cannot be empty
                 * because it is required by tunnel_key parameter.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL outer ipv4 address"
                                          " specification for vxlan"
                                          " encapsulation");
        }
        if (!mask)
                mask = &rte_flow_item_ipv4_mask;
        if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
                if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"ipv4.hdr.dst_addr\" field"
                                 " for vxlan encapsulation");
                /* More IPv4 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the destination IP address to determine
                 * the routing path and obtain the MAC destination
                 * address, so IP destination address must be
                 * specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv4 destination address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
                if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"ipv4.hdr.src_addr\" field"
                                 " for vxlan encapsulation");
                /* More IPv4 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the source IP address to select the
                 * interface for egress encapsulated traffic, so
                 * it must be specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv4 source address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
 * The routine checks the IPv6 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
                                   struct rte_flow_error *error)
{
        const struct rte_flow_item_ipv6 *spec = item->spec;
        const struct rte_flow_item_ipv6 *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for IP addresses cannot be empty
                 * because it is required by tunnel_key parameter.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL outer ipv6 address"
                                          " specification for"
                                          " vxlan encapsulation");
        }
        if (!mask)
                mask = &rte_flow_item_ipv6_mask;
        if (memcmp(&mask->hdr.dst_addr,
                   &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
                   IPV6_ADDR_LEN)) {
                if (memcmp(&mask->hdr.dst_addr,
                           &rte_flow_item_ipv6_mask.hdr.dst_addr,
                           IPV6_ADDR_LEN))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"ipv6.hdr.dst_addr\" field"
                                         " for vxlan encapsulation");
                /* More IPv6 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the destination IP address to determine
                 * the routing path and obtain the MAC destination
                 * address (neigh or gateway), so the IP destination
                 * address must be specified within the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv6 destination address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        if (memcmp(&mask->hdr.src_addr,
                   &flow_tcf_mask_empty.ipv6.hdr.src_addr,
                   IPV6_ADDR_LEN)) {
                if (memcmp(&mask->hdr.src_addr,
                           &rte_flow_item_ipv6_mask.hdr.src_addr,
                           IPV6_ADDR_LEN))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"ipv6.hdr.src_addr\" field"
                                         " for vxlan encapsulation");
                /* More L3 address validation can be put here. */
        } else {
                /*
                 * Kernel uses the source IP address to select the
                 * interface for egress encapsulated traffic, so
                 * it must be specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer L3 source address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
 * The routine checks the UDP fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
                                  struct rte_flow_error *error)
{
        const struct rte_flow_item_udp *spec = item->spec;
        const struct rte_flow_item_udp *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for UDP ports cannot be empty
                 * because it is required by tunnel_key parameter.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL UDP port specification"
                                          " for vxlan encapsulation");
        }
        if (!mask)
                mask = &rte_flow_item_udp_mask;
        if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
                if (mask->hdr.dst_port != RTE_BE16(0xffff))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"udp.hdr.dst_port\" field"
                                         " for vxlan encapsulation");
                if (!spec->hdr.dst_port)
                        return rte_flow_error_set
                                        (error, EINVAL,
                                         RTE_FLOW_ERROR_TYPE_ITEM, item,
                                         "outer UDP remote port cannot be"
                                         " 0 for vxlan encapsulation");
        } else {
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer UDP remote port"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        if (mask->hdr.src_port != RTE_BE16(0x0000)) {
                if (mask->hdr.src_port != RTE_BE16(0xffff))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"udp.hdr.src_port\" field"
                                         " for vxlan encapsulation");
                DRV_LOG(WARNING,
                        "outer UDP source port cannot be"
                        " forced for vxlan encapsulation,"
                        " parameter ignored");
        }
        return 0;
}
1434
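/*
 * Illustrative sketch, not part of the driver: a UDP spec/mask pair
 * that satisfies flow_tcf_validate_vxlan_encap_udp() above. The
 * destination port is fully masked and non-zero; the source port is
 * left wildcarded because the validator ignores it anyway. The names
 * and the use of the IANA VXLAN port 4789 are hypothetical.
 */
static const struct rte_flow_item_udp tcf_doc_encap_udp_spec __rte_unused = {
        .hdr = { .dst_port = RTE_BE16(4789) },
};
static const struct rte_flow_item_udp tcf_doc_encap_udp_mask __rte_unused = {
        .hdr = { .dst_port = RTE_BE16(0xffff) }, /* Full mask only. */
};
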
1435 /**
1436  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1437  * The routine checks the VNI field to be used in the encapsulation header.
1438  *
1439  * @param[in] item
1440  *   Pointer to the item structure.
1441  * @param[out] error
1442  *   Pointer to the error structure.
1443  *
1444  * @return
1445  *   0 on success, a negative errno value otherwise and rte_errno is set.
1446  */
1447 static int
1448 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1449                                   struct rte_flow_error *error)
1450 {
1451         const struct rte_flow_item_vxlan *spec = item->spec;
1452         const struct rte_flow_item_vxlan *mask = item->mask;
1453
1454         if (!spec) {
1455         /* Outer VNI is required by the tunnel_key parameter. */
1456                 return rte_flow_error_set(error, EINVAL,
1457                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1458                                           "NULL VNI specification"
1459                                           " for vxlan encapsulation");
1460         }
1461         if (!mask)
1462                 mask = &rte_flow_item_vxlan_mask;
1463         if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1464                 return rte_flow_error_set(error, EINVAL,
1465                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1466                                           "outer VNI must be specified "
1467                                           "for vxlan encapsulation");
1468         if (mask->vni[0] != 0xff ||
1469             mask->vni[1] != 0xff ||
1470             mask->vni[2] != 0xff)
1471                 return rte_flow_error_set(error, ENOTSUP,
1472                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1473                                           "no support for partial mask on"
1474                                           " \"vxlan.vni\" field");
1475
1476         if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1477                 return rte_flow_error_set(error, EINVAL,
1478                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1479                                           "vxlan vni cannot be 0");
1480         return 0;
1481 }
1482
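/*
 * Illustrative sketch, not part of the driver: a VXLAN spec/mask pair
 * accepted by flow_tcf_validate_vxlan_encap_vni() above. The 24-bit
 * VNI must be non-zero and fully masked; partial masks are rejected.
 * The names and the VNI value 42 are hypothetical.
 */
static const struct rte_flow_item_vxlan tcf_doc_encap_vxlan_spec __rte_unused = {
        .vni = { 0x00, 0x00, 0x2a }, /* VNI 42, network byte order. */
};
static const struct rte_flow_item_vxlan tcf_doc_encap_vxlan_mask __rte_unused = {
        .vni = { 0xff, 0xff, 0xff }, /* Full mask is mandatory. */
};
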
1483 /**
1484  * Validate VXLAN_ENCAP action item list for E-Switch.
1485  * The routine checks the items to be used in the encapsulation header.
1486  *
1487  * @param[in] action
1488  *   Pointer to the VXLAN_ENCAP action structure.
1489  * @param[out] error
1490  *   Pointer to the error structure.
1491  *
1492  * @return
1493  *   0 on success, a negative errno value otherwise and rte_errno is set.
1494  */
1495 static int
1496 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1497                               struct rte_flow_error *error)
1498 {
1499         const struct rte_flow_item *items;
1500         int ret;
1501         uint32_t item_flags = 0;
1502
1503         if (!action->conf)
1504                 return rte_flow_error_set(error, EINVAL,
1505                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1506                                           "Missing vxlan tunnel"
1507                                           " action configuration");
1508         items = ((const struct rte_flow_action_vxlan_encap *)
1509                                         action->conf)->definition;
1510         if (!items)
1511                 return rte_flow_error_set(error, EINVAL,
1512                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1513                                           "Missing vxlan tunnel"
1514                                           " encapsulation parameters");
1515         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1516                 switch (items->type) {
1517                 case RTE_FLOW_ITEM_TYPE_VOID:
1518                         break;
1519                 case RTE_FLOW_ITEM_TYPE_ETH:
1520                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1521                                                           error);
1522                         if (ret < 0)
1523                                 return ret;
1524                         ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1525                         if (ret < 0)
1526                                 return ret;
1527                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1528                         break;
1530                 case RTE_FLOW_ITEM_TYPE_IPV4:
1531                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
1532                                                            error);
1533                         if (ret < 0)
1534                                 return ret;
1535                         ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1536                         if (ret < 0)
1537                                 return ret;
1538                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1539                         break;
1540                 case RTE_FLOW_ITEM_TYPE_IPV6:
1541                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
1542                                                            error);
1543                         if (ret < 0)
1544                                 return ret;
1545                         ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1546                         if (ret < 0)
1547                                 return ret;
1548                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1549                         break;
1550                 case RTE_FLOW_ITEM_TYPE_UDP:
1551                         ret = mlx5_flow_validate_item_udp(items, item_flags,
1552                                                           0xFF, error);
1553                         if (ret < 0)
1554                                 return ret;
1555                         ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1556                         if (ret < 0)
1557                                 return ret;
1558                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1559                         break;
1560                 case RTE_FLOW_ITEM_TYPE_VXLAN:
1561                         ret = mlx5_flow_validate_item_vxlan(items,
1562                                                             item_flags, error);
1563                         if (ret < 0)
1564                                 return ret;
1565                         ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1566                         if (ret < 0)
1567                                 return ret;
1568                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
1569                         break;
1570                 default:
1571                         return rte_flow_error_set
1572                                         (error, ENOTSUP,
1573                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1574                                          "vxlan encap item not supported");
1575                 }
1576         }
1577         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1578                 return rte_flow_error_set(error, EINVAL,
1579                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1580                                           "no outer IP layer found"
1581                                           " for vxlan encapsulation");
1582         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1583                 return rte_flow_error_set(error, EINVAL,
1584                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1585                                           "no outer UDP layer found"
1586                                           " for vxlan encapsulation");
1587         if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1588                 return rte_flow_error_set(error, EINVAL,
1589                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1590                                           "no VXLAN VNI found"
1591                                           " for vxlan encapsulation");
1592         return 0;
1593 }
1594
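/*
 * Illustrative sketch, not part of the driver: a minimal VXLAN_ENCAP
 * "definition" list that satisfies flow_tcf_validate_vxlan_encap()
 * above, in outer-to-inner order: ETH, IPV4, UDP, VXLAN, terminated
 * by END. The final checks above require at least the L3, UDP and VNI
 * items. It reuses the hypothetical spec/mask objects sketched above;
 * the ETH item is left empty purely for brevity, and whether that
 * passes depends on flow_tcf_validate_vxlan_encap_eth(), which is
 * outside this excerpt.
 */
static const struct rte_flow_item tcf_doc_encap_pattern[] __rte_unused = {
        { .type = RTE_FLOW_ITEM_TYPE_ETH },
        {
                .type = RTE_FLOW_ITEM_TYPE_IPV4,
                .spec = &tcf_doc_encap_ipv4_spec,
                .mask = &tcf_doc_encap_ipv4_mask,
        },
        {
                .type = RTE_FLOW_ITEM_TYPE_UDP,
                .spec = &tcf_doc_encap_udp_spec,
                .mask = &tcf_doc_encap_udp_mask,
        },
        {
                .type = RTE_FLOW_ITEM_TYPE_VXLAN,
                .spec = &tcf_doc_encap_vxlan_spec,
                .mask = &tcf_doc_encap_vxlan_mask,
        },
        { .type = RTE_FLOW_ITEM_TYPE_END },
};
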
1595 /**
1596  * Validate outer RTE_FLOW_ITEM_TYPE_UDP item if tunnel item
1597  * RTE_FLOW_ITEM_TYPE_VXLAN is present in the item list.
1598  *
1599  * @param[in] udp
1600  *   Outer UDP layer item (if any, NULL otherwise).
1601  * @param[out] error
1602  *   Pointer to the error structure.
1603  *
1604  * @return
1605  *   0 on success, a negative errno value otherwise and rte_errno is set.
1606  */
1607 static int
1608 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1609                                   struct rte_flow_error *error)
1610 {
1611         const struct rte_flow_item_udp *spec = udp->spec;
1612         const struct rte_flow_item_udp *mask = udp->mask;
1613
1614         if (!spec)
1615                 /*
1616                  * Specification for UDP ports cannot be empty
1617                  * because it is required as a decap parameter.
1618                  */
1619                 return rte_flow_error_set(error, EINVAL,
1620                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1621                                           "NULL UDP port specification"
1622                                           " for VXLAN decapsulation");
1623         if (!mask)
1624                 mask = &rte_flow_item_udp_mask;
1625         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1626                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1627                         return rte_flow_error_set
1628                                         (error, ENOTSUP,
1629                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1630                                          "no support for partial mask on"
1631                                          " \"udp.hdr.dst_port\" field");
1632                 if (!spec->hdr.dst_port)
1633                         return rte_flow_error_set
1634                                         (error, EINVAL,
1635                                          RTE_FLOW_ERROR_TYPE_ITEM, udp,
1636                                          "zero decap local UDP port");
1637         } else {
1638                 return rte_flow_error_set(error, EINVAL,
1639                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1640                                           "outer UDP destination port must be "
1641                                           "specified for vxlan decapsulation");
1642         }
1643         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1644                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1645                         return rte_flow_error_set
1646                                         (error, ENOTSUP,
1647                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1648                                          "no support for partial mask on"
1649                                          " \"udp.hdr.src_port\" field");
1650                 DRV_LOG(WARNING,
1651                         "outer UDP local port cannot be "
1652                         "forced for VXLAN decapsulation, "
1653                         "parameter ignored");
1654         }
1655         return 0;
1656 }
1657
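/*
 * Illustrative sketch, not part of the driver: the outer UDP spec/mask
 * pair an application might match on for VXLAN decapsulation so that
 * flow_tcf_validate_vxlan_decap_udp() above passes: the local
 * (destination) port fully masked and non-zero, the remote (source)
 * port left wildcarded. Names and the port value are hypothetical.
 */
static const struct rte_flow_item_udp tcf_doc_decap_udp_spec __rte_unused = {
        .hdr = { .dst_port = RTE_BE16(4789) },
};
static const struct rte_flow_item_udp tcf_doc_decap_udp_mask __rte_unused = {
        .hdr = { .dst_port = RTE_BE16(0xffff) },
};
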
1658 /**
1659  * Validate flow for E-Switch.
1660  *
1661  * @param[in] priv
1662  *   Pointer to the priv structure.
1663  * @param[in] attr
1664  *   Pointer to the flow attributes.
1665  * @param[in] items
1666  *   Pointer to the list of items.
1667  * @param[in] actions
1668  *   Pointer to the list of actions.
1669  * @param[out] error
1670  *   Pointer to the error structure.
1671  *
1672  * @return
1673  *   0 on success, a negative errno value otherwise and rte_errno is set.
1674  */
1675 static int
1676 flow_tcf_validate(struct rte_eth_dev *dev,
1677                   const struct rte_flow_attr *attr,
1678                   const struct rte_flow_item items[],
1679                   const struct rte_flow_action actions[],
1680                   struct rte_flow_error *error)
1681 {
1682         union {
1683                 const struct rte_flow_item_port_id *port_id;
1684                 const struct rte_flow_item_eth *eth;
1685                 const struct rte_flow_item_vlan *vlan;
1686                 const struct rte_flow_item_ipv4 *ipv4;
1687                 const struct rte_flow_item_ipv6 *ipv6;
1688                 const struct rte_flow_item_tcp *tcp;
1689                 const struct rte_flow_item_udp *udp;
1690                 const struct rte_flow_item_vxlan *vxlan;
1691         } spec, mask;
1692         union {
1693                 const struct rte_flow_action_port_id *port_id;
1694                 const struct rte_flow_action_jump *jump;
1695                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1696                 const struct rte_flow_action_of_set_vlan_vid *
1697                         of_set_vlan_vid;
1698                 const struct rte_flow_action_of_set_vlan_pcp *
1699                         of_set_vlan_pcp;
1700                 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1701                 const struct rte_flow_action_set_ipv4 *set_ipv4;
1702                 const struct rte_flow_action_set_ipv6 *set_ipv6;
1703         } conf;
1704         const struct rte_flow_item *outer_udp = NULL;
1705         rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
1706         rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
1707         rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
1708         uint64_t item_flags = 0;
1709         uint64_t action_flags = 0;
1710         uint8_t next_protocol = 0xff;
1711         unsigned int tcm_ifindex = 0;
1712         uint8_t pedit_validated = 0;
1713         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1714         struct rte_eth_dev *port_id_dev = NULL;
1715         bool in_port_id_set = false;
1716         int ret;
1717
1718         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1719                                                 PTOI_TABLE_SZ_MAX(dev)));
1720         ret = flow_tcf_validate_attributes(attr, error);
1721         if (ret < 0)
1722                 return ret;
1723         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1724                 unsigned int i;
1725                 uint64_t current_action_flag = 0;
1726
1727                 switch (actions->type) {
1728                 case RTE_FLOW_ACTION_TYPE_VOID:
1729                         break;
1730                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1731                         current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1732                         if (!actions->conf)
1733                                 break;
1734                         conf.port_id = actions->conf;
1735                         if (conf.port_id->original)
1736                                 i = 0;
1737                         else
1738                                 for (i = 0; ptoi[i].ifindex; ++i)
1739                                         if (ptoi[i].port_id == conf.port_id->id)
1740                                                 break;
1741                         if (!ptoi[i].ifindex)
1742                                 return rte_flow_error_set
1743                                         (error, ENODEV,
1744                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1745                                          conf.port_id,
1746                                          "missing data to convert port ID to"
1747                                          " ifindex");
1748                         port_id_dev = &rte_eth_devices[conf.port_id->id];
1749                         break;
1750                 case RTE_FLOW_ACTION_TYPE_JUMP:
1751                         current_action_flag = MLX5_FLOW_ACTION_JUMP;
1752                         if (!actions->conf)
1753                                 break;
1754                         conf.jump = actions->conf;
1755                         if (attr->group >= conf.jump->group)
1756                                 return rte_flow_error_set
1757                                         (error, ENOTSUP,
1758                                          RTE_FLOW_ERROR_TYPE_ACTION,
1759                                          actions,
1760                                          "can jump only to a higher group");
1761                         break;
1762                 case RTE_FLOW_ACTION_TYPE_DROP:
1763                         current_action_flag = MLX5_FLOW_ACTION_DROP;
1764                         break;
1765                 case RTE_FLOW_ACTION_TYPE_COUNT:
1766                         break;
1767                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1768                         current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1769                         break;
1770                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: {
1771                         rte_be16_t ethertype;
1772
1773                         current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1774                         if (!actions->conf)
1775                                 break;
1776                         conf.of_push_vlan = actions->conf;
1777                         ethertype = conf.of_push_vlan->ethertype;
1778                         if (ethertype != RTE_BE16(ETH_P_8021Q) &&
1779                             ethertype != RTE_BE16(ETH_P_8021AD))
1780                                 return rte_flow_error_set
1781                                         (error, EINVAL,
1782                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1783                                          "vlan push TPID must be "
1784                                          "802.1Q or 802.1AD");
1785                         break;
1786                 }
1787                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1788                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1789                                 return rte_flow_error_set
1790                                         (error, ENOTSUP,
1791                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1792                                          "vlan modify is not supported,"
1793                                          " set action must follow push action");
1794                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1795                         break;
1796                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1797                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1798                                 return rte_flow_error_set
1799                                         (error, ENOTSUP,
1800                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1801                                          "vlan modify is not supported,"
1802                                          " set action must follow push action");
1803                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1804                         break;
1805                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1806                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1807                         break;
1808                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1809                         ret = flow_tcf_validate_vxlan_encap(actions, error);
1810                         if (ret < 0)
1811                                 return ret;
1812                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1813                         break;
1814                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1815                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1816                         break;
1817                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1818                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1819                         break;
1820                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1821                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1822                         break;
1823                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1824                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1825                         break;
1826                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1827                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1828                         break;
1829                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1830                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1831                         break;
1832                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1833                         current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1834                         break;
1835                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1836                         current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1837                         break;
1838                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1839                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1840                         break;
1841                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1842                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1843                         break;
1844                 default:
1845                         return rte_flow_error_set(error, ENOTSUP,
1846                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1847                                                   actions,
1848                                                   "action not supported");
1849                 }
1850                 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1851                         if (!actions->conf)
1852                                 return rte_flow_error_set
1853                                         (error, EINVAL,
1854                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1855                                          actions,
1856                                          "action configuration not set");
1857                 }
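                /*
                 * Pedit-style "set" actions must form one contiguous run:
                 * once a non-pedit action has followed a pedit one
                 * (pedit_validated), any further pedit action is rejected.
                 */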
1858                 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1859                     pedit_validated)
1860                         return rte_flow_error_set(error, ENOTSUP,
1861                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1862                                                   actions,
1863                                                   "set actions should be "
1864                                                   "listed successively");
1865                 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1866                     (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1867                         pedit_validated = 1;
1868                 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1869                     (action_flags & MLX5_TCF_FATE_ACTIONS))
1870                         return rte_flow_error_set(error, EINVAL,
1871                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1872                                                   actions,
1873                                                   "can't have multiple fate"
1874                                                   " actions");
1875                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1876                     (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1877                         return rte_flow_error_set(error, EINVAL,
1878                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1879                                                   actions,
1880                                                   "can't have multiple vxlan"
1881                                                   " actions");
1882                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1883                     (action_flags & MLX5_TCF_VLAN_ACTIONS))
1884                         return rte_flow_error_set(error, ENOTSUP,
1885                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1886                                                   actions,
1887                                                   "can't have vxlan and vlan"
1888                                                   " actions in the same rule");
1889                 action_flags |= current_action_flag;
1890         }
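        /*
         * Validate the pattern. Items are expected outermost first;
         * everything preceding a VXLAN item is treated as outer headers
         * (see the outer_udp bookkeeping below).
         */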
1891         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1892                 unsigned int i;
1893
1894                 switch (items->type) {
1895                 case RTE_FLOW_ITEM_TYPE_VOID:
1896                         break;
1897                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1898                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1899                                 return rte_flow_error_set
1900                                         (error, ENOTSUP,
1901                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1902                                          "inner tunnel port id"
1903                                          " item is not supported");
1904                         mask.port_id = flow_tcf_item_mask
1905                                 (items, &rte_flow_item_port_id_mask,
1906                                  &flow_tcf_mask_supported.port_id,
1907                                  &flow_tcf_mask_empty.port_id,
1908                                  sizeof(flow_tcf_mask_supported.port_id),
1909                                  error);
1910                         if (!mask.port_id)
1911                                 return -rte_errno;
1912                         if (mask.port_id == &flow_tcf_mask_empty.port_id) {
1913                                 in_port_id_set = true;
1914                                 break;
1915                         }
1916                         spec.port_id = items->spec;
1917                         if (mask.port_id->id && mask.port_id->id != 0xffffffff)
1918                                 return rte_flow_error_set
1919                                         (error, ENOTSUP,
1920                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1921                                          mask.port_id,
1922                                          "no support for partial mask on"
1923                                          " \"id\" field");
1924                         if (!mask.port_id->id)
1925                                 i = 0;
1926                         else
1927                                 for (i = 0; ptoi[i].ifindex; ++i)
1928                                         if (ptoi[i].port_id == spec.port_id->id)
1929                                                 break;
1930                         if (!ptoi[i].ifindex)
1931                                 return rte_flow_error_set
1932                                         (error, ENODEV,
1933                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
1934                                          spec.port_id,
1935                                          "missing data to convert port ID to"
1936                                          " ifindex");
1937                         if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
1938                                 return rte_flow_error_set
1939                                         (error, ENOTSUP,
1940                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
1941                                          spec.port_id,
1942                                          "cannot match traffic for"
1943                                          " several port IDs through"
1944                                          " a single flow rule");
1945                         tcm_ifindex = ptoi[i].ifindex;
1946                         in_port_id_set = true;
1947                         break;
1948                 case RTE_FLOW_ITEM_TYPE_ETH:
1949                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1950                                                           error);
1951                         if (ret < 0)
1952                                 return ret;
1953                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
1954                                       MLX5_FLOW_LAYER_INNER_L2 :
1955                                       MLX5_FLOW_LAYER_OUTER_L2;
1956                         /*
1957                          * TODO: redundant check due to different supported
1958                          * mask. Same for the rest of the items.
1959                          */
1960                         mask.eth = flow_tcf_item_mask
1961                                 (items, &rte_flow_item_eth_mask,
1962                                  &flow_tcf_mask_supported.eth,
1963                                  &flow_tcf_mask_empty.eth,
1964                                  sizeof(flow_tcf_mask_supported.eth),
1965                                  error);
1966                         if (!mask.eth)
1967                                 return -rte_errno;
1968                         if (mask.eth->type && mask.eth->type !=
1969                             RTE_BE16(0xffff))
1970                                 return rte_flow_error_set
1971                                         (error, ENOTSUP,
1972                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1973                                          mask.eth,
1974                                          "no support for partial mask on"
1975                                          " \"type\" field");
1976                         assert(items->spec);
1977                         spec.eth = items->spec;
1978                         if (mask.eth->type &&
1979                             (item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
1980                             inner_etype != RTE_BE16(ETH_P_ALL) &&
1981                             inner_etype != spec.eth->type)
1982                                 return rte_flow_error_set
1983                                         (error, EINVAL,
1984                                          RTE_FLOW_ERROR_TYPE_ITEM,
1985                                          items,
1986                                          "inner eth_type conflict");
1987                         if (mask.eth->type &&
1988                             !(item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
1989                             outer_etype != RTE_BE16(ETH_P_ALL) &&
1990                             outer_etype != spec.eth->type)
1991                                 return rte_flow_error_set
1992                                         (error, EINVAL,
1993                                          RTE_FLOW_ERROR_TYPE_ITEM,
1994                                          items,
1995                                          "outer eth_type conflict");
1996                         if (mask.eth->type) {
1997                                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1998                                         inner_etype = spec.eth->type;
1999                                 else
2000                                         outer_etype = spec.eth->type;
2001                         }
2002                         break;
2003                 case RTE_FLOW_ITEM_TYPE_VLAN:
2004                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2005                                 return rte_flow_error_set
2006                                         (error, ENOTSUP,
2007                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2008                                          "inner tunnel VLAN"
2009                                          " is not supported");
2010                         ret = mlx5_flow_validate_item_vlan(items, item_flags,
2011                                                            error);
2012                         if (ret < 0)
2013                                 return ret;
2014                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2015                         mask.vlan = flow_tcf_item_mask
2016                                 (items, &rte_flow_item_vlan_mask,
2017                                  &flow_tcf_mask_supported.vlan,
2018                                  &flow_tcf_mask_empty.vlan,
2019                                  sizeof(flow_tcf_mask_supported.vlan),
2020                                  error);
2021                         if (!mask.vlan)
2022                                 return -rte_errno;
2023                         if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2024                              (mask.vlan->tci & RTE_BE16(0xe000)) !=
2025                               RTE_BE16(0xe000)) ||
2026                             (mask.vlan->tci & RTE_BE16(0x0fff) &&
2027                              (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2028                               RTE_BE16(0x0fff)) ||
2029                             (mask.vlan->inner_type &&
2030                              mask.vlan->inner_type != RTE_BE16(0xffff)))
2031                                 return rte_flow_error_set
2032                                         (error, ENOTSUP,
2033                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2034                                          mask.vlan,
2035                                          "no support for partial masks on"
2036                                          " \"tci\" (PCP and VID parts) and"
2037                                          " \"inner_type\" fields");
2038                         if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2039                             outer_etype != RTE_BE16(ETH_P_8021Q))
2040                                 return rte_flow_error_set
2041                                         (error, EINVAL,
2042                                          RTE_FLOW_ERROR_TYPE_ITEM,
2043                                          items,
2044                                          "outer eth_type conflict,"
2045                                          " must be 802.1Q");
2046                         outer_etype = RTE_BE16(ETH_P_8021Q);
2047                         assert(items->spec);
2048                         spec.vlan = items->spec;
2049                         if (mask.vlan->inner_type &&
2050                             vlan_etype != RTE_BE16(ETH_P_ALL) &&
2051                             vlan_etype != spec.vlan->inner_type)
2052                                 return rte_flow_error_set
2053                                         (error, EINVAL,
2054                                          RTE_FLOW_ERROR_TYPE_ITEM,
2055                                          items,
2056                                          "vlan eth_type conflict");
2057                         if (mask.vlan->inner_type)
2058                                 vlan_etype = spec.vlan->inner_type;
2059                         break;
2060                 case RTE_FLOW_ITEM_TYPE_IPV4:
2061                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
2062                                                            error);
2063                         if (ret < 0)
2064                                 return ret;
2065                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2066                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
2067                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2068                         mask.ipv4 = flow_tcf_item_mask
2069                                 (items, &rte_flow_item_ipv4_mask,
2070                                  &flow_tcf_mask_supported.ipv4,
2071                                  &flow_tcf_mask_empty.ipv4,
2072                                  sizeof(flow_tcf_mask_supported.ipv4),
2073                                  error);
2074                         if (!mask.ipv4)
2075                                 return -rte_errno;
2076                         if (mask.ipv4->hdr.next_proto_id &&
2077                             mask.ipv4->hdr.next_proto_id != 0xff)
2078                                 return rte_flow_error_set
2079                                         (error, ENOTSUP,
2080                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2081                                          mask.ipv4,
2082                                          "no support for partial mask on"
2083                                          " \"hdr.next_proto_id\" field");
2084                         else if (mask.ipv4->hdr.next_proto_id)
2085                                 next_protocol =
2086                                         ((const struct rte_flow_item_ipv4 *)
2087                                          (items->spec))->hdr.next_proto_id;
2088                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2089                                 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2090                                     inner_etype != RTE_BE16(ETH_P_IP))
2091                                         return rte_flow_error_set
2092                                                 (error, EINVAL,
2093                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2094                                                  items,
2095                                                  "inner eth_type conflict,"
2096                                                  " IPv4 is required");
2097                                 inner_etype = RTE_BE16(ETH_P_IP);
2098                         } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2099                                 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2100                                     vlan_etype != RTE_BE16(ETH_P_IP))
2101                                         return rte_flow_error_set
2102                                                 (error, EINVAL,
2103                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2104                                                  items,
2105                                                  "vlan eth_type conflict,"
2106                                                  " IPv4 is required");
2107                                 vlan_etype = RTE_BE16(ETH_P_IP);
2108                         } else {
2109                                 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2110                                     outer_etype != RTE_BE16(ETH_P_IP))
2111                                         return rte_flow_error_set
2112                                                 (error, EINVAL,
2113                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2114                                                  items,
2115                                                  "eth_type conflict,"
2116                                                  " IPv4 is required");
2117                                 outer_etype = RTE_BE16(ETH_P_IP);
2118                         }
2119                         break;
2120                 case RTE_FLOW_ITEM_TYPE_IPV6:
2121                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
2122                                                            error);
2123                         if (ret < 0)
2124                                 return ret;
2125                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2126                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
2127                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2128                         mask.ipv6 = flow_tcf_item_mask
2129                                 (items, &rte_flow_item_ipv6_mask,
2130                                  &flow_tcf_mask_supported.ipv6,
2131                                  &flow_tcf_mask_empty.ipv6,
2132                                  sizeof(flow_tcf_mask_supported.ipv6),
2133                                  error);
2134                         if (!mask.ipv6)
2135                                 return -rte_errno;
2136                         if (mask.ipv6->hdr.proto &&
2137                             mask.ipv6->hdr.proto != 0xff)
2138                                 return rte_flow_error_set
2139                                         (error, ENOTSUP,
2140                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2141                                          mask.ipv6,
2142                                          "no support for partial mask on"
2143                                          " \"hdr.proto\" field");
2144                         else if (mask.ipv6->hdr.proto)
2145                                 next_protocol =
2146                                         ((const struct rte_flow_item_ipv6 *)
2147                                          (items->spec))->hdr.proto;
2148                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2149                                 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2150                                     inner_etype != RTE_BE16(ETH_P_IPV6))
2151                                         return rte_flow_error_set
2152                                                 (error, EINVAL,
2153                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2154                                                  items,
2155                                                  "inner eth_type conflict,"
2156                                                  " IPv6 is required");
2157                                 inner_etype = RTE_BE16(ETH_P_IPV6);
2158                         } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2159                                 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2160                                     vlan_etype != RTE_BE16(ETH_P_IPV6))
2161                                         return rte_flow_error_set
2162                                                 (error, EINVAL,
2163                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2164                                                  items,
2165                                                  "vlan eth_type conflict,"
2166                                                  " IPv6 is required");
2167                                 vlan_etype = RTE_BE16(ETH_P_IPV6);
2168                         } else {
2169                                 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2170                                     outer_etype != RTE_BE16(ETH_P_IPV6))
2171                                         return rte_flow_error_set
2172                                                 (error, EINVAL,
2173                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2174                                                  items,
2175                                                  "eth_type conflict,"
2176                                                  " IPv6 is required");
2177                                 outer_etype = RTE_BE16(ETH_P_IPV6);
2178                         }
2179                         break;
2180                 case RTE_FLOW_ITEM_TYPE_UDP:
2181                         ret = mlx5_flow_validate_item_udp(items, item_flags,
2182                                                           next_protocol, error);
2183                         if (ret < 0)
2184                                 return ret;
2185                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2186                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
2187                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
2188                         mask.udp = flow_tcf_item_mask
2189                                 (items, &rte_flow_item_udp_mask,
2190                                  &flow_tcf_mask_supported.udp,
2191                                  &flow_tcf_mask_empty.udp,
2192                                  sizeof(flow_tcf_mask_supported.udp),
2193                                  error);
2194                         if (!mask.udp)
2195                                 return -rte_errno;
2196                         /*
2197                          * Save the presumed outer UDP item for an extra check
2198                          * in case a tunnel item is found later in the list.
2199                          */
2200                         if (!(item_flags & MLX5_FLOW_LAYER_TUNNEL))
2201                                 outer_udp = items;
2202                         break;
2203                 case RTE_FLOW_ITEM_TYPE_TCP:
2204                         ret = mlx5_flow_validate_item_tcp
2205                                              (items, item_flags,
2206                                               next_protocol,
2207                                               &flow_tcf_mask_supported.tcp,
2208                                               error);
2209                         if (ret < 0)
2210                                 return ret;
2211                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2212                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
2213                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
2214                         mask.tcp = flow_tcf_item_mask
2215                                 (items, &rte_flow_item_tcp_mask,
2216                                  &flow_tcf_mask_supported.tcp,
2217                                  &flow_tcf_mask_empty.tcp,
2218                                  sizeof(flow_tcf_mask_supported.tcp),
2219                                  error);
2220                         if (!mask.tcp)
2221                                 return -rte_errno;
2222                         break;
2223                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2224                         if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)
2225                                 return rte_flow_error_set
2226                                         (error, ENOTSUP,
2227                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2228                                          "vxlan tunnel over vlan"
2229                                          " is not supported");
2230                         ret = mlx5_flow_validate_item_vxlan(items,
2231                                                             item_flags, error);
2232                         if (ret < 0)
2233                                 return ret;
2234                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
2235                         mask.vxlan = flow_tcf_item_mask
2236                                 (items, &rte_flow_item_vxlan_mask,
2237                                  &flow_tcf_mask_supported.vxlan,
2238                                  &flow_tcf_mask_empty.vxlan,
2239                                  sizeof(flow_tcf_mask_supported.vxlan), error);
2240                         if (!mask.vxlan)
2241                                 return -rte_errno;
2242                         if (mask.vxlan->vni[0] != 0xff ||
2243                             mask.vxlan->vni[1] != 0xff ||
2244                             mask.vxlan->vni[2] != 0xff)
2245                                 return rte_flow_error_set
2246                                         (error, ENOTSUP,
2247                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2248                                          mask.vxlan,
2249                                          "no support for partial or "
2250                                          "empty mask on \"vxlan.vni\" field");
2251                         /*
2252                          * The VNI item assumes a VXLAN tunnel. At least the
2253                          * outer destination UDP port must be specified
2254                          * without wildcards to let the kernel select the
2255                          * virtual VXLAN device by port. An outer IPv4 or
2256                          * IPv6 item must also be specified (wildcards or even
2257                          * a zero mask are allowed) so the driver knows the
2258                          * tunnel IP version and processes UDP traffic correctly.
2259                          */
2260                         if (!(item_flags &
2261                              (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2262                               MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2263                                 return rte_flow_error_set
2264                                                  (error, EINVAL,
2265                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2266                                                   items,
2267                                                   "no outer IP pattern found"
2268                                                   " for vxlan tunnel");
2269                         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2270                                 return rte_flow_error_set
2271                                                  (error, EINVAL,
2272                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2273                                                   items,
2274                                                   "no outer UDP pattern found"
2275                                                   " for vxlan tunnel");
2276                         /*
2277                          * All items preceding the tunnel item become outer
2278                          * ones and need extra validation due to tc limitations
2279                          * on tunnel outer parameters. Currently only the outer
2280                          * UDP item requires an extra check; use the saved
2281                          * pointer instead of rescanning the item list.
2282                          */
2283                         assert(outer_udp);
2284                         ret = flow_tcf_validate_vxlan_decap_udp
2285                                                 (outer_udp, error);
2286                         if (ret < 0)
2287                                 return ret;
2288                         /* Reset L4 protocol for inner parameters. */
2289                         next_protocol = 0xff;
2290                         break;
2291                 default:
2292                         return rte_flow_error_set(error, ENOTSUP,
2293                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2294                                                   items, "item not supported");
2295                 }
2296         }
2297         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2298             (action_flags & MLX5_FLOW_ACTION_DROP))
2299                 return rte_flow_error_set(error, ENOTSUP,
2300                                           RTE_FLOW_ERROR_TYPE_ACTION,
2301                                           actions,
2302                                           "set action is not compatible with "
2303                                           "drop action");
2304         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2305             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2306                 return rte_flow_error_set(error, ENOTSUP,
2307                                           RTE_FLOW_ERROR_TYPE_ACTION,
2308                                           actions,
2309                                           "set action must be followed by "
2310                                           "port_id action");
2311         if (action_flags &
2312            (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2313                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2314                         return rte_flow_error_set(error, EINVAL,
2315                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2316                                                   actions,
2317                                                   "no ipv4 item found in"
2318                                                   " pattern");
2319         }
2320         if (action_flags &
2321            (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2322                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2323                         return rte_flow_error_set(error, EINVAL,
2324                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2325                                                   actions,
2326                                                   "no ipv6 item found in"
2327                                                   " pattern");
2328         }
2329         if (action_flags &
2330            (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2331                 if (!(item_flags &
2332                      (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2333                       MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2334                         return rte_flow_error_set(error, EINVAL,
2335                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2336                                                   actions,
2337                                                   "no TCP/UDP item found in"
2338                                                   " pattern");
2339         }
2340         /*
2341          * FW syndrome (0xA9C090):
2342          *     set_flow_table_entry: push vlan action fte in fdb can ONLY be
2343          *     forward to the uplink.
2344          */
2345         if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2346             (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2347             ((struct priv *)port_id_dev->data->dev_private)->representor)
2348                 return rte_flow_error_set(error, ENOTSUP,
2349                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2350                                           "vlan push can only be applied"
2351                                           " when forwarding to uplink port");
2352         /*
2353          * FW syndrome (0x294609):
2354          *     set_flow_table_entry: modify/pop/push actions in fdb flow table
2355          *     are supported only while forwarding to vport.
2356          */
2357         if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2358             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2359                 return rte_flow_error_set(error, ENOTSUP,
2360                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2361                                           "vlan actions are supported"
2362                                           " only with port_id action");
2363         if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2364             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2365                 return rte_flow_error_set(error, ENOTSUP,
2366                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2367                                           "vxlan actions are supported"
2368                                           " only with port_id action");
2369         if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2370                 return rte_flow_error_set(error, EINVAL,
2371                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2372                                           "no fate action is found");
2373         if (action_flags &
2374            (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2375                 if (!(item_flags &
2376                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2377                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2378                         return rte_flow_error_set(error, EINVAL,
2379                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2380                                                   actions,
2381                                                   "no IP found in pattern");
2382         }
2383         if (action_flags &
2384             (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2385                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2386                         return rte_flow_error_set(error, ENOTSUP,
2387                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2388                                                   actions,
2389                                                   "no ethernet found in"
2390                                                   " pattern");
2391         }
2392         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) &&
2393             !(item_flags & MLX5_FLOW_LAYER_VXLAN))
2394                 return rte_flow_error_set(error, EINVAL,
2395                                           RTE_FLOW_ERROR_TYPE_ACTION,
2396                                           NULL,
2397                                           "no VNI pattern found"
2398                                           " for vxlan decap action");
2399         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) &&
2400             (item_flags & MLX5_FLOW_LAYER_TUNNEL))
2401                 return rte_flow_error_set(error, EINVAL,
2402                                           RTE_FLOW_ERROR_TYPE_ACTION,
2403                                           NULL,
2404                                           "vxlan encap not supported"
2405                                           " for tunneled traffic");
2406         return 0;
2407 }
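
/*
 * Illustrative example (hypothetical testpmd-style syntax): the checks
 * above would reject a rule that pushes a VLAN tag while forwarding to
 * a representor port, since the firmware accepts VLAN push only when
 * forwarding to the uplink:
 *
 *   flow create 0 transfer ingress pattern eth / end
 *     actions of_push_vlan ethertype 0x8100 / port_id id 1 / end
 */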
2408
2409 /**
2410  * Calculate maximum size of memory for flow items of Linux TC flower.
2411  *
2412  * @param[in] attr
2413  *   Pointer to the flow attributes.
2414  * @param[in] items
2415  *   Pointer to the list of items.
2416  * @param[out] action_flags
2417  *   Pointer to the detected action flags.
2418  *
2419  * @return
2420  *   Maximum size of memory for items.
2421  */
2422 static int
2423 flow_tcf_get_items_size(const struct rte_flow_attr *attr,
2424                         const struct rte_flow_item items[],
2425                         uint64_t *action_flags)
2426 {
2427         int size = 0;
2428
2429         size += SZ_NLATTR_STRZ_OF("flower") +
2430                 SZ_NLATTR_TYPE_OF(uint16_t) + /* Outer ether type. */
2431                 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2432                 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2433         if (attr->group > 0)
2434                 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2435         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2436                 switch (items->type) {
2437                 case RTE_FLOW_ITEM_TYPE_VOID:
2438                         break;
2439                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2440                         break;
2441                 case RTE_FLOW_ITEM_TYPE_ETH:
2442                         size += SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2443                                 /* dst/src MAC addr and mask. */
2444                         break;
2445                 case RTE_FLOW_ITEM_TYPE_VLAN:
2446                         size += SZ_NLATTR_TYPE_OF(uint16_t) +
2447                                 /* VLAN Ether type. */
2448                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2449                                 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2450                         break;
2451                 case RTE_FLOW_ITEM_TYPE_IPV4:
2452                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2453                                 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2454                                 /* dst/src IP addr and mask. */
2455                         break;
2456                 case RTE_FLOW_ITEM_TYPE_IPV6:
2457                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2458                                 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2459                                 /* dst/src IP addr and mask. */
2460                         break;
2461                 case RTE_FLOW_ITEM_TYPE_UDP:
2462                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2463                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2464                                 /* dst/src port and mask. */
2465                         break;
2466                 case RTE_FLOW_ITEM_TYPE_TCP:
2467                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2468                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2469                                 /* dst/src port and mask. */
2470                         break;
2471                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2472                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2473                         /*
2474                          * There might be no VXLAN decap action in the action
2475                          * list, nonetheless the VXLAN tunnel flow requires
2476                          * the decap structure to be correctly applied to the
2477                          * VXLAN device, so set the flag to create the structure.
2478                          * The translation routine will not put the decap action
2479                          * in the Netlink message if there is no actual action
2480                          * in the list.
2481                          */
2482                         *action_flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2483                         break;
2484                 default:
2485                         DRV_LOG(WARNING,
2486                                 "unsupported item %p type %d,"
2487                                 " items must be validated before flow creation",
2488                                 (const void *)items, items->type);
2489                         break;
2490                 }
2491         }
2492         return size;
2493 }
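
/*
 * A rough worked example (illustrative only): for the pattern
 * eth / ipv4 / udp the loop above accumulates the fixed flower
 * overhead (kind string, outer ether type, TCA_OPTIONS nest and
 * classifier flags), four MAC address attributes, the IP protocol
 * plus four IPv4 address attributes, and four UDP port attributes.
 * The result is an upper bound on the required Netlink buffer
 * space, not an exact size.
 */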
2494
2495 /**
2496  * Calculate the size of memory to store the VXLAN encapsulation
2497  * related items in the Netlink message buffer. The item list
2498  * is specified by the RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action.
2499  * The item list must be validated beforehand.
2500  *
2501  * @param[in] action
2502  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2503  *   List of pattern items to scan data from.
2504  *
2505  * @return
2506  *   The size of the part of the Netlink message buffer needed to
2507  *   store the VXLAN encapsulation item attributes.
2508  */
2509 static int
2510 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2511 {
2512         const struct rte_flow_item *items;
2513         int size = 0;
2514
2515         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2516         assert(action->conf);
2517
2518         items = ((const struct rte_flow_action_vxlan_encap *)
2519                                         action->conf)->definition;
2520         assert(items);
2521         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2522                 switch (items->type) {
2523                 case RTE_FLOW_ITEM_TYPE_VOID:
2524                         break;
2525                 case RTE_FLOW_ITEM_TYPE_ETH:
2526                         /* This item does not require message buffer. */
2527                         break;
2528                 case RTE_FLOW_ITEM_TYPE_IPV4:
2529                         size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2530                         break;
2531                 case RTE_FLOW_ITEM_TYPE_IPV6:
2532                         size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2533                         break;
2534                 case RTE_FLOW_ITEM_TYPE_UDP: {
2535                         const struct rte_flow_item_udp *udp = items->mask;
2536
2537                         size += SZ_NLATTR_TYPE_OF(uint16_t);
2538                         if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2539                                 size += SZ_NLATTR_TYPE_OF(uint16_t);
2540                         break;
2541                 }
2542                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2543                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2544                         break;
2545                 default:
2546                         assert(false);
2547                         DRV_LOG(WARNING,
2548                                 "unsupported item %p type %d,"
2549                                 " items must be validated"
2550                                 " before flow creation",
2551                                 (const void *)items, items->type);
2552                         return 0;
2553                 }
2554         }
2555         return size;
2556 }
2557
2558 /**
2559  * Calculate maximum size of memory for flow actions of Linux TC flower and
2560  * extract specified actions.
2561  *
2562  * @param[in] actions
2563  *   Pointer to the list of actions.
2564  * @param[out] action_flags
2565  *   Pointer to the detected action flags.
2566  *
2567  * @return
2568  *   Maximum size of memory for actions.
2569  */
2570 static int
2571 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2572                               uint64_t *action_flags)
2573 {
2574         int size = 0;
2575         uint64_t flags = *action_flags;
2576
2577         size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2578         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2579                 switch (actions->type) {
2580                 case RTE_FLOW_ACTION_TYPE_VOID:
2581                         break;
2582                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2583                         size += SZ_NLATTR_NEST + /* na_act_index. */
2584                                 SZ_NLATTR_STRZ_OF("mirred") +
2585                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2586                                 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2587                         flags |= MLX5_FLOW_ACTION_PORT_ID;
2588                         break;
2589                 case RTE_FLOW_ACTION_TYPE_JUMP:
2590                         size += SZ_NLATTR_NEST + /* na_act_index. */
2591                                 SZ_NLATTR_STRZ_OF("gact") +
2592                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2593                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2594                         flags |= MLX5_FLOW_ACTION_JUMP;
2595                         break;
2596                 case RTE_FLOW_ACTION_TYPE_DROP:
2597                         size += SZ_NLATTR_NEST + /* na_act_index. */
2598                                 SZ_NLATTR_STRZ_OF("gact") +
2599                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2600                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2601                         flags |= MLX5_FLOW_ACTION_DROP;
2602                         break;
2603                 case RTE_FLOW_ACTION_TYPE_COUNT:
2604                         break;
2605                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2606                         flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2607                         goto action_of_vlan;
2608                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2609                         flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2610                         goto action_of_vlan;
2611                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2612                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2613                         goto action_of_vlan;
2614                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2615                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2616                         goto action_of_vlan;
2617 action_of_vlan:
2618                         size += SZ_NLATTR_NEST + /* na_act_index. */
2619                                 SZ_NLATTR_STRZ_OF("vlan") +
2620                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2621                                 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2622                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2623                                 /* VLAN protocol. */
2624                                 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2625                                 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2626                         break;
2627                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2628                         size += SZ_NLATTR_NEST + /* na_act_index. */
2629                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2630                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2631                                 SZ_NLATTR_TYPE_OF(uint8_t);
2632                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2633                         size += flow_tcf_vxlan_encap_size(actions) +
2634                                 RTE_ALIGN_CEIL /* preceding encap params. */
2635                                 (sizeof(struct flow_tcf_vxlan_encap),
2636                                 MNL_ALIGNTO);
2637                         flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2638                         break;
2639                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2640                         size += SZ_NLATTR_NEST + /* na_act_index. */
2641                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2642                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2643                                 SZ_NLATTR_TYPE_OF(uint8_t);
2644                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2645                         size += RTE_ALIGN_CEIL /* preceding decap params. */
2646                                 (sizeof(struct flow_tcf_vxlan_decap),
2647                                 MNL_ALIGNTO);
2648                         flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2649                         break;
2650                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2651                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2652                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2653                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2654                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2655                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2656                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2657                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2658                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2659                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2660                         size += flow_tcf_get_pedit_actions_size(&actions,
2661                                                                 &flags);
2662                         break;
2663                 default:
2664                         DRV_LOG(WARNING,
2665                                 "unsupported action %p type %d,"
2666                                 " actions must be validated before flow creation",
2667                                 (const void *)actions, actions->type);
2668                         break;
2669                 }
2670         }
2671         *action_flags = flags;
2672         return size;
2673 }
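
/*
 * Illustrative example: a plain drop rule ("actions drop / end")
 * reserves the TCA_FLOWER_ACT nest plus one action entry: the
 * na_act_index nest, the "gact" kind string, the TCA_ACT_OPTIONS
 * nest and a struct tc_gact parameter block, and the routine sets
 * MLX5_FLOW_ACTION_DROP in the returned flags.
 */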
2674
2675 /**
2676  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2677  * memory required, allocates the memory, initializes the Netlink message
2678  * headers and sets a unique TC message handle.
2679  *
2680  * @param[in] attr
2681  *   Pointer to the flow attributes.
2682  * @param[in] items
2683  *   Pointer to the list of items.
2684  * @param[in] actions
2685  *   Pointer to the list of actions.
2686  * @param[out] error
2687  *   Pointer to the error structure.
2688  *
2689  * @return
2690  *   Pointer to mlx5_flow object on success,
2691  *   otherwise NULL and rte_errno is set.
2692  */
2693 static struct mlx5_flow *
2694 flow_tcf_prepare(const struct rte_flow_attr *attr,
2695                  const struct rte_flow_item items[],
2696                  const struct rte_flow_action actions[],
2697                  struct rte_flow_error *error)
2698 {
2699         size_t size = RTE_ALIGN_CEIL
2700                         (sizeof(struct mlx5_flow),
2701                          alignof(struct flow_tcf_tunnel_hdr)) +
2702                       MNL_ALIGN(sizeof(struct nlmsghdr)) +
2703                       MNL_ALIGN(sizeof(struct tcmsg));
2704         struct mlx5_flow *dev_flow;
2705         uint64_t action_flags = 0;
2706         struct nlmsghdr *nlh;
2707         struct tcmsg *tcm;
2708         uint8_t *sp, *tun = NULL;
2709
2710         size += flow_tcf_get_items_size(attr, items, &action_flags);
2711         size += flow_tcf_get_actions_and_size(actions, &action_flags);
2712         dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2713         if (!dev_flow) {
2714                 rte_flow_error_set(error, ENOMEM,
2715                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2716                                    "not enough memory to create E-Switch flow");
2717                 return NULL;
2718         }
2719         sp = (uint8_t *)(dev_flow + 1);
2720         if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2721                 sp = RTE_PTR_ALIGN
2722                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2723                 tun = sp;
2724                 sp += RTE_ALIGN_CEIL
2725                         (sizeof(struct flow_tcf_vxlan_encap),
2726                         MNL_ALIGNTO);
2727 #ifndef NDEBUG
2728                 size -= RTE_ALIGN_CEIL
2729                         (sizeof(struct flow_tcf_vxlan_encap),
2730                         MNL_ALIGNTO);
2731 #endif
2732         } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2733                 sp = RTE_PTR_ALIGN
2734                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2735                 tun = sp;
2736                 sp += RTE_ALIGN_CEIL
2737                         (sizeof(struct flow_tcf_vxlan_decap),
2738                         MNL_ALIGNTO);
2739 #ifndef NDEBUG
2740                 size -= RTE_ALIGN_CEIL
2741                         (sizeof(struct flow_tcf_vxlan_decap),
2742                         MNL_ALIGNTO);
2743 #endif
2744         } else {
2745                 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
2746         }
2747         nlh = mnl_nlmsg_put_header(sp);
2748         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2749         *dev_flow = (struct mlx5_flow){
2750                 .tcf = (struct mlx5_flow_tcf){
2751 #ifndef NDEBUG
2752                         .nlsize = size - RTE_ALIGN_CEIL
2753                                 (sizeof(struct mlx5_flow),
2754                                  alignof(struct flow_tcf_tunnel_hdr)),
2755 #endif
2756                         .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
2757                         .nlh = nlh,
2758                         .tcm = tcm,
2759                 },
2760         };
2761         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2762                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2763         else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2764                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2765         return dev_flow;
2766 }
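
/*
 * Sketch of the memory layout produced above:
 *
 *   +--------------------------------------------------+
 *   | struct mlx5_flow                                 |
 *   | optional flow_tcf_vxlan_encap/decap (tunnel hdr) |
 *   | struct nlmsghdr                                  |
 *   | struct tcmsg                                     |
 *   | room for flower attributes (estimated above)     |
 *   +--------------------------------------------------+
 *
 * The tunnel header, when present, precedes the Netlink message so
 * that dev_flow->tcf.tunnel can be addressed independently of the
 * message being built.
 */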
2767
2768 /**
2769  * Make adjustments for supporting count actions.
2770  *
2771  * @param[in] dev
2772  *   Pointer to the Ethernet device structure.
2773  * @param[in] dev_flow
2774  *   Pointer to mlx5_flow.
2775  * @param[out] error
2776  *   Pointer to error structure.
2777  *
2778  * @return
2779  *   0 on success, otherwise a negative errno value is returned and rte_errno is set.
2780  */
2781 static int
2782 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2783                                   struct mlx5_flow *dev_flow,
2784                                   struct rte_flow_error *error)
2785 {
2786         struct rte_flow *flow = dev_flow->flow;
2787
2788         if (!flow->counter) {
2789                 flow->counter = flow_tcf_counter_new();
2790                 if (!flow->counter)
2791                         return rte_flow_error_set(error, rte_errno,
2792                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2793                                                   NULL,
2794                                                   "cannot get counter"
2795                                                   " context.");
2796         }
2797         return 0;
2798 }
2799
2800 /**
2801  * Convert VXLAN VNI to 32-bit integer.
2802  *
2803  * @param[in] vni
2804  *   VXLAN VNI in 24-bit wire format.
2805  *
2806  * @return
2807  *   VXLAN VNI as a 32-bit integer value in network byte order.
2808  */
2809 static inline rte_be32_t
2810 vxlan_vni_as_be32(const uint8_t vni[3])
2811 {
2812         union {
2813                 uint8_t vni[4];
2814                 rte_be32_t dword;
2815         } ret = {
2816                 .vni = { 0, vni[0], vni[1], vni[2] },
2817         };
2818         return ret.dword;
2819 }
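
/*
 * Example: a VNI of 0x123456 arrives as the byte array
 * {0x12, 0x34, 0x56}; the union above produces the bytes
 * {0x00, 0x12, 0x34, 0x56}, i.e. 0x123456 in network (big-endian)
 * byte order, which is what TCA_FLOWER_KEY_ENC_KEY_ID expects.
 */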
2820
2821 /**
2822  * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2823  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2824  * in the encapsulation parameters structure. The item must be prevalidated;
2825  * no validation checks are performed by this function.
2826  *
2827  * @param[in] spec
2828  *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
2829  * @param[in] mask
2830  *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
2831  * @param[out] encap
2832  *   Structure to fill the gathered MAC address data.
2833  */
2834 static void
2835 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2836                                const struct rte_flow_item_eth *mask,
2837                                struct flow_tcf_vxlan_encap *encap)
2838 {
2839         /* Item must be validated before. No redundant checks. */
2840         assert(spec);
2841         if (!mask || !memcmp(&mask->dst,
2842                              &rte_flow_item_eth_mask.dst,
2843                              sizeof(rte_flow_item_eth_mask.dst))) {
2844                 /*
2845                  * Ethernet addresses are not supported by
2846                  * tc as tunnel_key parameters. Destination
2847                  * address is needed to form encap packet
2848                  * header and is retrieved by the kernel from
2849                  * implicit sources (ARP table, etc.);
2850                  * address masks are not supported at all.
2851                  */
2852                 encap->eth.dst = spec->dst;
2853                 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2854         }
2855         if (!mask || !memcmp(&mask->src,
2856                              &rte_flow_item_eth_mask.src,
2857                              sizeof(rte_flow_item_eth_mask.src))) {
2858                 /*
2859                  * Ethernet addresses are not supported by
2860                  * tc as tunnel_key parameters. Source ethernet
2861                  * address is ignored anyway.
2862                  */
2863                 encap->eth.src = spec->src;
2864                 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
2865         }
2866 }
2867
2868 /**
2869  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
2870  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
2871  * in the encapsulation parameters structure. The item must be prevalidated;
2872  * no validation checks are performed by this function.
2873  *
2874  * @param[in] spec
2875  *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
2876  * @param[out] encap
2877  *   Structure to fill the gathered IPV4 address data.
2878  */
2879 static void
2880 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
2881                                 struct flow_tcf_vxlan_encap *encap)
2882 {
2883         /* Item must be validated before. No redundant checks. */
2884         assert(spec);
2885         encap->ipv4.dst = spec->hdr.dst_addr;
2886         encap->ipv4.src = spec->hdr.src_addr;
2887         encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
2888                        FLOW_TCF_ENCAP_IPV4_DST;
2889 }
2890
2891 /**
2892  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
2893  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
2894  * in the encapsulation parameters structure. The item must be prevalidated;
2895  * no validation checks are performed by this function.
2896  *
2897  * @param[in] spec
2898  *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
2899  * @param[out] encap
2900  *   Structure to fill the gathered IPV6 address data.
2901  */
2902 static void
2903 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
2904                                 struct flow_tcf_vxlan_encap *encap)
2905 {
2906         /* Item must be validated before. No redundant checks. */
2907         assert(spec);
2908         memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
2909         memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
2910         encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
2911                        FLOW_TCF_ENCAP_IPV6_DST;
2912 }
2913
2914 /**
2915  * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
2916  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
2917  * in the encapsulation parameters structure. The item must be prevalidated;
2918  * no validation checks are performed by this function.
2919  *
2920  * @param[in] spec
2921  *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
2922  * @param[in] mask
2923  *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
2924  * @param[out] encap
2925  *   Structure to fill the gathered UDP port data.
2926  */
2927 static void
2928 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
2929                                const struct rte_flow_item_udp *mask,
2930                                struct flow_tcf_vxlan_encap *encap)
2931 {
2932         assert(spec);
2933         encap->udp.dst = spec->hdr.dst_port;
2934         encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
2935         if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
2936                 encap->udp.src = spec->hdr.src_port;
2937                 encap->mask |= FLOW_TCF_ENCAP_UDP_SRC;
2938         }
2939 }
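
/*
 * Note (assumption about kernel behavior): when the source port is
 * not covered by the mask, FLOW_TCF_ENCAP_UDP_SRC stays clear, the
 * attribute is omitted from the tunnel_key action and the kernel is
 * left to choose the encapsulation source port on its own, typically
 * from a hash of the inner headers.
 */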
2940
2941 /**
2942  * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
2943  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
2944  * in the encapsulation parameters structure. The item must be prevalidated;
2945  * no validation checks are performed by this function.
2946  *
2947  * @param[in] spec
2948  *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
2949  * @param[out] encap
2950  *   Structure to fill with the gathered VNI data.
2951  */
2952 static void
2953 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
2954                                struct flow_tcf_vxlan_encap *encap)
2955 {
2956         /* Item must be validated beforehand. No redundant checks. */
2957         assert(spec);
2958         memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
2959         encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
2960 }
2961
2962 /**
2963  * Populate consolidated encapsulation object from list of pattern items.
2964  *
2965  * Helper function to process configuration of action such as
2966  * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list must be
2967  * validated beforehand, since there is no way to return a meaningful error.
2968  *
2969  * @param[in] action
2970  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2971  *   List of pattern items to gather data from.
2972  * @param[out] encap
2973  *   Structure to fill with the gathered data.
2974  */
2975 static void
2976 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
2977                            struct flow_tcf_vxlan_encap *encap)
2978 {
2979         union {
2980                 const struct rte_flow_item_eth *eth;
2981                 const struct rte_flow_item_ipv4 *ipv4;
2982                 const struct rte_flow_item_ipv6 *ipv6;
2983                 const struct rte_flow_item_udp *udp;
2984                 const struct rte_flow_item_vxlan *vxlan;
2985         } spec, mask;
2986         const struct rte_flow_item *items;
2987
2988         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2989         assert(action->conf);
2990
2991         items = ((const struct rte_flow_action_vxlan_encap *)
2992                                         action->conf)->definition;
2993         assert(items);
2994         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2995                 switch (items->type) {
2996                 case RTE_FLOW_ITEM_TYPE_VOID:
2997                         break;
2998                 case RTE_FLOW_ITEM_TYPE_ETH:
2999                         mask.eth = items->mask;
3000                         spec.eth = items->spec;
3001                         flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3002                                                        encap);
3003                         break;
3004                 case RTE_FLOW_ITEM_TYPE_IPV4:
3005                         spec.ipv4 = items->spec;
3006                         flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, encap);
3007                         break;
3008                 case RTE_FLOW_ITEM_TYPE_IPV6:
3009                         spec.ipv6 = items->spec;
3010                         flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, encap);
3011                         break;
3012                 case RTE_FLOW_ITEM_TYPE_UDP:
3013                         mask.udp = items->mask;
3014                         spec.udp = items->spec;
3015                         flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3016                                                        encap);
3017                         break;
3018                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3019                         spec.vxlan = items->spec;
3020                         flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
3021                         break;
3022                 default:
3023                         assert(false);
3024                         DRV_LOG(WARNING,
3025                                 "unsupported item %p type %d,"
3026                                 " items must be validated"
3027                                 " before flow creation",
3028                                 (const void *)items, items->type);
3029                         encap->mask = 0;
3030                         return;
3031                 }
3032         }
3033 }
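
/*
 * Illustrative example (hypothetical values): the definition list of
 * a VXLAN_ENCAP action could look like
 *
 *   eth / ipv4 src is 1.1.1.1 dst is 2.2.2.2 /
 *   udp dst is 4789 / vxlan vni is 0x123456 / end
 *
 * and the loop above folds it into a single flow_tcf_vxlan_encap
 * object with the corresponding FLOW_TCF_ENCAP_* mask bits set.
 */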
3034
3035 /**
3036  * Translate flow for Linux TC flower and construct Netlink message.
3037  *
3038  * @param[in] dev
3039  *   Pointer to the Ethernet device structure.
3040  * @param[in, out] dev_flow
3041  *   Pointer to the mlx5 sub flow.
3042  * @param[in] attr
3043  *   Pointer to the flow attributes.
3044  * @param[in] items
3045  *   Pointer to the list of items.
3046  * @param[in] actions
3047  *   Pointer to the list of actions.
3048  * @param[out] error
3049  *   Pointer to the error structure.
3050  *
3051  * @return
3052  *   0 on success, a negative errno value otherwise and rte_errno is set.
3053  */
3054 static int
3055 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3056                    const struct rte_flow_attr *attr,
3057                    const struct rte_flow_item items[],
3058                    const struct rte_flow_action actions[],
3059                    struct rte_flow_error *error)
3060 {
3061         union {
3062                 const struct rte_flow_item_port_id *port_id;
3063                 const struct rte_flow_item_eth *eth;
3064                 const struct rte_flow_item_vlan *vlan;
3065                 const struct rte_flow_item_ipv4 *ipv4;
3066                 const struct rte_flow_item_ipv6 *ipv6;
3067                 const struct rte_flow_item_tcp *tcp;
3068                 const struct rte_flow_item_udp *udp;
3069                 const struct rte_flow_item_vxlan *vxlan;
3070         } spec, mask;
3071         union {
3072                 const struct rte_flow_action_port_id *port_id;
3073                 const struct rte_flow_action_jump *jump;
3074                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3075                 const struct rte_flow_action_of_set_vlan_vid *
3076                         of_set_vlan_vid;
3077                 const struct rte_flow_action_of_set_vlan_pcp *
3078                         of_set_vlan_pcp;
3079         } conf;
3080         union {
3081                 struct flow_tcf_tunnel_hdr *hdr;
3082                 struct flow_tcf_vxlan_decap *vxlan;
3083         } decap = {
3084                 .hdr = NULL,
3085         };
3086         union {
3087                 struct flow_tcf_tunnel_hdr *hdr;
3088                 struct flow_tcf_vxlan_encap *vxlan;
3089         } encap = {
3090                 .hdr = NULL,
3091         };
3092         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3093         struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3094         struct tcmsg *tcm = dev_flow->tcf.tcm;
3095         uint32_t na_act_index_cur;
3096         rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
3097         rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
3098         rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
3099         bool ip_proto_set = false;
3100         bool tunnel_outer = false;
3101         struct nlattr *na_flower;
3102         struct nlattr *na_flower_act;
3103         struct nlattr *na_vlan_id = NULL;
3104         struct nlattr *na_vlan_priority = NULL;
3105         uint64_t item_flags = 0;
3106         int ret;
3107
3108         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3109                                                 PTOI_TABLE_SZ_MAX(dev)));
3110         if (dev_flow->tcf.tunnel) {
3111                 switch (dev_flow->tcf.tunnel->type) {
3112                 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3113                         decap.vxlan = dev_flow->tcf.vxlan_decap;
3114                         tunnel_outer = 1;
3115                         break;
3116                 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3117                         encap.vxlan = dev_flow->tcf.vxlan_encap;
3118                         break;
3119                 /* New tunnel actions can be added here. */
3120                 default:
3121                         assert(false);
3122                         break;
3123                 }
3124         }
3125         nlh = dev_flow->tcf.nlh;
3126         tcm = dev_flow->tcf.tcm;
3127         /* Prepare API must have been called beforehand. */
3128         assert(nlh != NULL && tcm != NULL);
3129         tcm->tcm_family = AF_UNSPEC;
3130         tcm->tcm_ifindex = ptoi[0].ifindex;
3131         tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3132         /*
3133          * Priority cannot be zero, otherwise the kernel picks one
3134          * automatically.
3135          */
3136         tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, outer_etype);
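        /*
         * For example, attr->priority == 0 yields TC priority 1 in the
         * upper 16 bits of tcm_info above, combined with the protocol
         * (ETH_P_ALL for now, overwritten after the item loop once the
         * actual ether type is known) in the lower 16 bits.
         */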
3137         if (attr->group > 0)
3138                 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3139         mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3140         na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3141         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3142                 unsigned int i;
3143
3144                 switch (items->type) {
3145                 case RTE_FLOW_ITEM_TYPE_VOID:
3146                         break;
3147                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3148                         mask.port_id = flow_tcf_item_mask
3149                                 (items, &rte_flow_item_port_id_mask,
3150                                  &flow_tcf_mask_supported.port_id,
3151                                  &flow_tcf_mask_empty.port_id,
3152                                  sizeof(flow_tcf_mask_supported.port_id),
3153                                  error);
3154                         assert(mask.port_id);
3155                         if (mask.port_id == &flow_tcf_mask_empty.port_id)
3156                                 break;
3157                         spec.port_id = items->spec;
3158                         if (!mask.port_id->id)
3159                                 i = 0;
3160                         else
3161                                 for (i = 0; ptoi[i].ifindex; ++i)
3162                                         if (ptoi[i].port_id == spec.port_id->id)
3163                                                 break;
3164                         assert(ptoi[i].ifindex);
3165                         tcm->tcm_ifindex = ptoi[i].ifindex;
3166                         break;
3167                 case RTE_FLOW_ITEM_TYPE_ETH:
3168                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3169                                       MLX5_FLOW_LAYER_INNER_L2 :
3170                                       MLX5_FLOW_LAYER_OUTER_L2;
3171                         mask.eth = flow_tcf_item_mask
3172                                 (items, &rte_flow_item_eth_mask,
3173                                  &flow_tcf_mask_supported.eth,
3174                                  &flow_tcf_mask_empty.eth,
3175                                  sizeof(flow_tcf_mask_supported.eth),
3176                                  error);
3177                         assert(mask.eth);
3178                         if (mask.eth == &flow_tcf_mask_empty.eth)
3179                                 break;
3180                         spec.eth = items->spec;
3181                         if (mask.eth->type) {
3182                                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
3183                                         inner_etype = spec.eth->type;
3184                                 else
3185                                         outer_etype = spec.eth->type;
3186                         }
3187                         if (tunnel_outer) {
3188                                 DRV_LOG(WARNING,
3189                                         "outer L2 addresses cannot be"
3190                                         " matched on the tunnel outer header,"
3191                                         " parameter is ignored");
3192                                 break;
3193                         }
3194                         if (!is_zero_ether_addr(&mask.eth->dst)) {
3195                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3196                                              ETHER_ADDR_LEN,
3197                                              spec.eth->dst.addr_bytes);
3198                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3199                                              ETHER_ADDR_LEN,
3200                                              mask.eth->dst.addr_bytes);
3201                         }
3202                         if (!is_zero_ether_addr(&mask.eth->src)) {
3203                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3204                                              ETHER_ADDR_LEN,
3205                                              spec.eth->src.addr_bytes);
3206                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3207                                              ETHER_ADDR_LEN,
3208                                              mask.eth->src.addr_bytes);
3209                         }
3210                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3211                         break;
3212                 case RTE_FLOW_ITEM_TYPE_VLAN:
3213                         assert(!encap.hdr);
3214                         assert(!decap.hdr);
3215                         assert(!tunnel_outer);
3216                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3217                         mask.vlan = flow_tcf_item_mask
3218                                 (items, &rte_flow_item_vlan_mask,
3219                                  &flow_tcf_mask_supported.vlan,
3220                                  &flow_tcf_mask_empty.vlan,
3221                                  sizeof(flow_tcf_mask_supported.vlan),
3222                                  error);
3223                         assert(mask.vlan);
3224                         if (mask.vlan == &flow_tcf_mask_empty.vlan)
3225                                 break;
3226                         spec.vlan = items->spec;
3227                         assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3228                                outer_etype == RTE_BE16(ETH_P_8021Q));
3229                         outer_etype = RTE_BE16(ETH_P_8021Q);
3230                         if (mask.vlan->inner_type)
3231                                 vlan_etype = spec.vlan->inner_type;
3232                         if (mask.vlan->tci & RTE_BE16(0xe000))
3233                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3234                                                 (rte_be_to_cpu_16
3235                                                  (spec.vlan->tci) >> 13) & 0x7);
3236                         if (mask.vlan->tci & RTE_BE16(0x0fff))
3237                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3238                                                  rte_be_to_cpu_16
3239                                                  (spec.vlan->tci &
3240                                                   RTE_BE16(0x0fff)));
3241                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3242                         break;
3243                 case RTE_FLOW_ITEM_TYPE_IPV4:
3244                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3245                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
3246                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3247                         mask.ipv4 = flow_tcf_item_mask
3248                                 (items, &rte_flow_item_ipv4_mask,
3249                                  &flow_tcf_mask_supported.ipv4,
3250                                  &flow_tcf_mask_empty.ipv4,
3251                                  sizeof(flow_tcf_mask_supported.ipv4),
3252                                  error);
3253                         assert(mask.ipv4);
3254                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3255                                 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3256                                        inner_etype == RTE_BE16(ETH_P_IP));
3257                                 inner_etype = RTE_BE16(ETH_P_IP);
3258                         } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3259                                 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3260                                        vlan_etype == RTE_BE16(ETH_P_IP));
3261                                 vlan_etype = RTE_BE16(ETH_P_IP);
3262                         } else {
3263                                 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3264                                        outer_etype == RTE_BE16(ETH_P_IP));
3265                                 outer_etype = RTE_BE16(ETH_P_IP);
3266                         }
3267                         spec.ipv4 = items->spec;
3268                         if (!tunnel_outer && mask.ipv4->hdr.next_proto_id) {
3269                                 /*
3270                                  * No way to set IP protocol for outer tunnel
3271                                  * layers. Usually it is fixed, for example,
3272                                  * to UDP for VXLAN/GPE.
3273                                  */
3274                                 assert(spec.ipv4); /* Mask is not empty. */
3275                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3276                                                 spec.ipv4->hdr.next_proto_id);
3277                                 ip_proto_set = 1;
3278                         }
3279                         if (mask.ipv4 == &flow_tcf_mask_empty.ipv4 ||
3280                              (!mask.ipv4->hdr.src_addr &&
3281                               !mask.ipv4->hdr.dst_addr)) {
3282                                 if (!tunnel_outer)
3283                                         break;
3284                                 /*
3285                                  * For tunnel outer we must set the outer IP key
3286                                  * anyway, even if the specification/mask is
3287                                  * empty. There is no other way to tell the
3288                                  * kernel about the outer layer protocol.
3289                                  */
3290                                 mnl_attr_put_u32
3291                                         (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC,
3292                                          mask.ipv4->hdr.src_addr);
3293                                 mnl_attr_put_u32
3294                                         (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
3295                                          mask.ipv4->hdr.src_addr);
3296                                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3297                                 break;
3298                         }
3299                         if (mask.ipv4->hdr.src_addr) {
3300                                 mnl_attr_put_u32
3301                                         (nlh, tunnel_outer ?
3302                                          TCA_FLOWER_KEY_ENC_IPV4_SRC :
3303                                          TCA_FLOWER_KEY_IPV4_SRC,
3304                                          spec.ipv4->hdr.src_addr);
3305                                 mnl_attr_put_u32
3306                                         (nlh, tunnel_outer ?
3307                                          TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3308                                          TCA_FLOWER_KEY_IPV4_SRC_MASK,
3309                                          mask.ipv4->hdr.src_addr);
3310                         }
3311                         if (mask.ipv4->hdr.dst_addr) {
3312                                 mnl_attr_put_u32
3313                                         (nlh, tunnel_outer ?
3314                                          TCA_FLOWER_KEY_ENC_IPV4_DST :
3315                                          TCA_FLOWER_KEY_IPV4_DST,
3316                                          spec.ipv4->hdr.dst_addr);
3317                                 mnl_attr_put_u32
3318                                         (nlh, tunnel_outer ?
3319                                          TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3320                                          TCA_FLOWER_KEY_IPV4_DST_MASK,
3321                                          mask.ipv4->hdr.dst_addr);
3322                         }
3323                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3324                         break;
3325                 case RTE_FLOW_ITEM_TYPE_IPV6: {
3326                         bool ipv6_src, ipv6_dst;
3327
3328                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3329                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
3330                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3331                         mask.ipv6 = flow_tcf_item_mask
3332                                 (items, &rte_flow_item_ipv6_mask,
3333                                  &flow_tcf_mask_supported.ipv6,
3334                                  &flow_tcf_mask_empty.ipv6,
3335                                  sizeof(flow_tcf_mask_supported.ipv6),
3336                                  error);
3337                         assert(mask.ipv6);
3338                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3339                                 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3340                                        inner_etype == RTE_BE16(ETH_P_IPV6));
3341                                 inner_etype = RTE_BE16(ETH_P_IPV6);
3342                         } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3343                                 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3344                                        vlan_etype == RTE_BE16(ETH_P_IPV6));
3345                                 vlan_etype = RTE_BE16(ETH_P_IPV6);
3346                         } else {
3347                                 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3348                                        outer_etype == RTE_BE16(ETH_P_IPV6));
3349                                 outer_etype = RTE_BE16(ETH_P_IPV6);
3350                         }
3351                         spec.ipv6 = items->spec;
3352                         if (!tunnel_outer && mask.ipv6->hdr.proto) {
3353                                 /*
3354                                  * No way to set IP protocol for outer tunnel
3355                                  * layers. Usually it is fixed, for example,
3356                                  * to UDP for VXLAN/GPE.
3357                                  */
3358                                 assert(spec.ipv6); /* Mask is not empty. */
3359                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3360                                                 spec.ipv6->hdr.proto);
3361                                 ip_proto_set = 1;
3362                         }
3363                         ipv6_dst = !IN6_IS_ADDR_UNSPECIFIED
3364                                                 (mask.ipv6->hdr.dst_addr);
3365                         ipv6_src = !IN6_IS_ADDR_UNSPECIFIED
3366                                                 (mask.ipv6->hdr.src_addr);
3367                         if (mask.ipv6 == &flow_tcf_mask_empty.ipv6 ||
3368                              (!ipv6_dst && !ipv6_src)) {
3369                                 if (!tunnel_outer)
3370                                         break;
3371                                 /*
3372                                  * For tunnel outer we must set the outer IP key
3373                                  * anyway, even if the specification/mask is
3374                                  * empty. There is no other way to tell the
3375                                  * kernel about the outer layer protocol.
3376                                  */
3377                                 mnl_attr_put(nlh,
3378                                              TCA_FLOWER_KEY_ENC_IPV6_SRC,
3379                                              IPV6_ADDR_LEN,
3380                                              mask.ipv6->hdr.src_addr);
3381                                 mnl_attr_put(nlh,
3382                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
3383                                              IPV6_ADDR_LEN,
3384                                              mask.ipv6->hdr.src_addr);
3385                                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3386                                 break;
3387                         }
3388                         if (ipv6_src) {
3389                                 mnl_attr_put(nlh, tunnel_outer ?
3390                                              TCA_FLOWER_KEY_ENC_IPV6_SRC :
3391                                              TCA_FLOWER_KEY_IPV6_SRC,
3392                                              IPV6_ADDR_LEN,
3393                                              spec.ipv6->hdr.src_addr);
3394                                 mnl_attr_put(nlh, tunnel_outer ?
3395                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3396                                              TCA_FLOWER_KEY_IPV6_SRC_MASK,
3397                                              IPV6_ADDR_LEN,
3398                                              mask.ipv6->hdr.src_addr);
3399                         }
3400                         if (ipv6_dst) {
3401                                 mnl_attr_put(nlh, tunnel_outer ?
3402                                              TCA_FLOWER_KEY_ENC_IPV6_DST :
3403                                              TCA_FLOWER_KEY_IPV6_DST,
3404                                              IPV6_ADDR_LEN,
3405                                              spec.ipv6->hdr.dst_addr);
3406                                 mnl_attr_put(nlh, tunnel_outer ?
3407                                              TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3408                                              TCA_FLOWER_KEY_IPV6_DST_MASK,
3409                                              IPV6_ADDR_LEN,
3410                                              mask.ipv6->hdr.dst_addr);
3411                         }
3412                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3413                         break;
3414                 }
3415                 case RTE_FLOW_ITEM_TYPE_UDP:
3416                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3417                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
3418                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
3419                         mask.udp = flow_tcf_item_mask
3420                                 (items, &rte_flow_item_udp_mask,
3421                                  &flow_tcf_mask_supported.udp,
3422                                  &flow_tcf_mask_empty.udp,
3423                                  sizeof(flow_tcf_mask_supported.udp),
3424                                  error);
3425                         assert(mask.udp);
3426                         spec.udp = items->spec;
3427                         if (!tunnel_outer) {
3428                                 if (!ip_proto_set)
3429                                         mnl_attr_put_u8
3430                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3431                                                 IPPROTO_UDP);
3432                                 if (mask.udp == &flow_tcf_mask_empty.udp)
3433                                         break;
3434                         } else {
3435                                 assert(mask.udp != &flow_tcf_mask_empty.udp);
3436                                 decap.vxlan->udp_port =
3437                                         rte_be_to_cpu_16
3438                                                 (spec.udp->hdr.dst_port);
3439                         }
3440                         if (mask.udp->hdr.src_port) {
3441                                 mnl_attr_put_u16
3442                                         (nlh, tunnel_outer ?
3443                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3444                                          TCA_FLOWER_KEY_UDP_SRC,
3445                                          spec.udp->hdr.src_port);
3446                                 mnl_attr_put_u16
3447                                         (nlh, tunnel_outer ?
3448                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3449                                          TCA_FLOWER_KEY_UDP_SRC_MASK,
3450                                          mask.udp->hdr.src_port);
3451                         }
3452                         if (mask.udp->hdr.dst_port) {
3453                                 mnl_attr_put_u16
3454                                         (nlh, tunnel_outer ?
3455                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3456                                          TCA_FLOWER_KEY_UDP_DST,
3457                                          spec.udp->hdr.dst_port);
3458                                 mnl_attr_put_u16
3459                                         (nlh, tunnel_outer ?
3460                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3461                                          TCA_FLOWER_KEY_UDP_DST_MASK,
3462                                          mask.udp->hdr.dst_port);
3463                         }
3464                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3465                         break;
3466                 case RTE_FLOW_ITEM_TYPE_TCP:
3467                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3468                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
3469                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
3470                         mask.tcp = flow_tcf_item_mask
3471                                 (items, &rte_flow_item_tcp_mask,
3472                                  &flow_tcf_mask_supported.tcp,
3473                                  &flow_tcf_mask_empty.tcp,
3474                                  sizeof(flow_tcf_mask_supported.tcp),
3475                                  error);
3476                         assert(mask.tcp);
3477                         if (!ip_proto_set)
3478                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3479                                                 IPPROTO_TCP);
3480                         if (mask.tcp == &flow_tcf_mask_empty.tcp)
3481                                 break;
3482                         spec.tcp = items->spec;
3483                         if (mask.tcp->hdr.src_port) {
3484                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3485                                                  spec.tcp->hdr.src_port);
3486                                 mnl_attr_put_u16(nlh,
3487                                                  TCA_FLOWER_KEY_TCP_SRC_MASK,
3488                                                  mask.tcp->hdr.src_port);
3489                         }
3490                         if (mask.tcp->hdr.dst_port) {
3491                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3492                                                  spec.tcp->hdr.dst_port);
3493                                 mnl_attr_put_u16(nlh,
3494                                                  TCA_FLOWER_KEY_TCP_DST_MASK,
3495                                                  mask.tcp->hdr.dst_port);
3496                         }
3497                         if (mask.tcp->hdr.tcp_flags) {
3498                                 mnl_attr_put_u16
3499                                         (nlh,
3500                                          TCA_FLOWER_KEY_TCP_FLAGS,
3501                                          rte_cpu_to_be_16
3502                                                 (spec.tcp->hdr.tcp_flags));
3503                                 mnl_attr_put_u16
3504                                         (nlh,
3505                                          TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3506                                          rte_cpu_to_be_16
3507                                                 (mask.tcp->hdr.tcp_flags));
3508                         }
3509                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3510                         break;
3511                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3512                         assert(decap.vxlan);
3513                         tunnel_outer = 0;
3514                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
3515                         spec.vxlan = items->spec;
3516                         mnl_attr_put_u32(nlh,
3517                                          TCA_FLOWER_KEY_ENC_KEY_ID,
3518                                          vxlan_vni_as_be32(spec.vxlan->vni));
3519                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3520                         break;
3521                 default:
3522                         return rte_flow_error_set(error, ENOTSUP,
3523                                                   RTE_FLOW_ERROR_TYPE_ITEM,
3524                                                   NULL, "item not supported");
3525                 }
3526         }
3527         /*
3528          * Set the ether_type flower key and tc rule protocol:
3529          * - if there is neither a VLAN nor a VXLAN item, the key is
3530          *   taken from the eth item directly or deduced from L3 items.
3531          * - if there is a vlan item, the key is fixed to 802.1Q.
3532          * - if there is a vxlan item, the key is set to the inner tunnel type.
3533          * - simultaneous vlan and vxlan items are prohibited.
3534          */
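        /*
         * Editor's illustration of the rules above (the item sequences are
         * hypothetical examples, not taken from the driver):
         *   eth / ipv4          -> tc protocol and ETH_TYPE key: ETH_P_IP
         *   eth / vlan / ipv4   -> tc protocol and ETH_TYPE key: ETH_P_8021Q,
         *                          VLAN_ETH_TYPE key: ETH_P_IP
         *   ... vxlan / eth ... -> tc protocol: outer type, ETH_TYPE key:
         *                          inner type
         */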
3535         if (outer_etype != RTE_BE16(ETH_P_ALL)) {
3536                 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3537                                            outer_etype);
3538                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3539                         if (inner_etype != RTE_BE16(ETH_P_ALL))
3540                                 mnl_attr_put_u16(nlh,
3541                                                  TCA_FLOWER_KEY_ETH_TYPE,
3542                                                  inner_etype);
3543                 } else {
3544                         mnl_attr_put_u16(nlh,
3545                                          TCA_FLOWER_KEY_ETH_TYPE,
3546                                          outer_etype);
3547                         if (outer_etype == RTE_BE16(ETH_P_8021Q) &&
3548                             vlan_etype != RTE_BE16(ETH_P_ALL))
3549                                 mnl_attr_put_u16(nlh,
3550                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3551                                                  vlan_etype);
3552                 }
3553                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3554         }
3555         na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3556         na_act_index_cur = 1;
3557         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3558                 struct nlattr *na_act_index;
3559                 struct nlattr *na_act;
3560                 unsigned int vlan_act;
3561                 unsigned int i;
3562
3563                 switch (actions->type) {
3564                 case RTE_FLOW_ACTION_TYPE_VOID:
3565                         break;
3566                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3567                         conf.port_id = actions->conf;
3568                         if (conf.port_id->original)
3569                                 i = 0;
3570                         else
3571                                 for (i = 0; ptoi[i].ifindex; ++i)
3572                                         if (ptoi[i].port_id == conf.port_id->id)
3573                                                 break;
3574                         assert(ptoi[i].ifindex);
3575                         na_act_index =
3576                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3577                         assert(na_act_index);
3578                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3579                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3580                         assert(na_act);
3581                         if (encap.hdr) {
3582                                 assert(dev_flow->tcf.tunnel);
3583                                 dev_flow->tcf.tunnel->ifindex_ptr =
3584                                         &((struct tc_mirred *)
3585                                         mnl_attr_get_payload
3586                                         (mnl_nlmsg_get_payload_tail
3587                                                 (nlh)))->ifindex;
3588                         } else if (decap.hdr) {
3589                                 assert(dev_flow->tcf.tunnel);
3590                                 dev_flow->tcf.tunnel->ifindex_ptr =
3591                                         (unsigned int *)&tcm->tcm_ifindex;
3592                         }
3593                         mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3594                                      sizeof(struct tc_mirred),
3595                                      &(struct tc_mirred){
3596                                         .action = TC_ACT_STOLEN,
3597                                         .eaction = TCA_EGRESS_REDIR,
3598                                         .ifindex = ptoi[i].ifindex,
3599                                      });
3600                         mnl_attr_nest_end(nlh, na_act);
3601                         mnl_attr_nest_end(nlh, na_act_index);
3602                         break;
3603                 case RTE_FLOW_ACTION_TYPE_JUMP:
3604                         conf.jump = actions->conf;
3605                         na_act_index =
3606                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3607                         assert(na_act_index);
3608                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3609                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3610                         assert(na_act);
3611                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3612                                      sizeof(struct tc_gact),
3613                                      &(struct tc_gact){
3614                                         .action = TC_ACT_GOTO_CHAIN |
3615                                                   conf.jump->group,
3616                                      });
3617                         mnl_attr_nest_end(nlh, na_act);
3618                         mnl_attr_nest_end(nlh, na_act_index);
3619                         break;
3620                 case RTE_FLOW_ACTION_TYPE_DROP:
3621                         na_act_index =
3622                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3623                         assert(na_act_index);
3624                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3625                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3626                         assert(na_act);
3627                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3628                                      sizeof(struct tc_gact),
3629                                      &(struct tc_gact){
3630                                         .action = TC_ACT_SHOT,
3631                                      });
3632                         mnl_attr_nest_end(nlh, na_act);
3633                         mnl_attr_nest_end(nlh, na_act_index);
3634                         break;
3635                 case RTE_FLOW_ACTION_TYPE_COUNT:
3636                         /*
3637                          * Driver adds the count action implicitly for
3638                          * each rule it creates.
3639                          */
3640                         ret = flow_tcf_translate_action_count(dev,
3641                                                               dev_flow, error);
3642                         if (ret < 0)
3643                                 return ret;
3644                         break;
3645                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3646                         conf.of_push_vlan = NULL;
3647                         vlan_act = TCA_VLAN_ACT_POP;
3648                         goto action_of_vlan;
3649                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3650                         conf.of_push_vlan = actions->conf;
3651                         vlan_act = TCA_VLAN_ACT_PUSH;
3652                         goto action_of_vlan;
3653                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3654                         conf.of_set_vlan_vid = actions->conf;
3655                         if (na_vlan_id)
3656                                 goto override_na_vlan_id;
3657                         vlan_act = TCA_VLAN_ACT_MODIFY;
3658                         goto action_of_vlan;
3659                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3660                         conf.of_set_vlan_pcp = actions->conf;
3661                         if (na_vlan_priority)
3662                                 goto override_na_vlan_priority;
3663                         vlan_act = TCA_VLAN_ACT_MODIFY;
3664                         goto action_of_vlan;
3665 action_of_vlan:
3666                         na_act_index =
3667                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3668                         assert(na_act_index);
3669                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3670                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3671                         assert(na_act);
3672                         mnl_attr_put(nlh, TCA_VLAN_PARMS,
3673                                      sizeof(struct tc_vlan),
3674                                      &(struct tc_vlan){
3675                                         .action = TC_ACT_PIPE,
3676                                         .v_action = vlan_act,
3677                                      });
3678                         if (vlan_act == TCA_VLAN_ACT_POP) {
3679                                 mnl_attr_nest_end(nlh, na_act);
3680                                 mnl_attr_nest_end(nlh, na_act_index);
3681                                 break;
3682                         }
3683                         if (vlan_act == TCA_VLAN_ACT_PUSH)
3684                                 mnl_attr_put_u16(nlh,
3685                                                  TCA_VLAN_PUSH_VLAN_PROTOCOL,
3686                                                  conf.of_push_vlan->ethertype);
3687                         na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3688                         mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3689                         na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3690                         mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3691                         mnl_attr_nest_end(nlh, na_act);
3692                         mnl_attr_nest_end(nlh, na_act_index);
3693                         if (actions->type ==
3694                             RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3695 override_na_vlan_id:
3696                                 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3697                                 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3698                                         rte_be_to_cpu_16
3699                                         (conf.of_set_vlan_vid->vlan_vid);
3700                         } else if (actions->type ==
3701                                    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3702 override_na_vlan_priority:
3703                                 na_vlan_priority->nla_type =
3704                                         TCA_VLAN_PUSH_VLAN_PRIORITY;
3705                                 *(uint8_t *)mnl_attr_get_payload
3706                                         (na_vlan_priority) =
3707                                         conf.of_set_vlan_pcp->vlan_pcp;
3708                         }
3709                         break;
3710                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3711                         assert(decap.vxlan);
3712                         assert(dev_flow->tcf.tunnel);
3713                         dev_flow->tcf.tunnel->ifindex_ptr =
3714                                 (unsigned int *)&tcm->tcm_ifindex;
3715                         na_act_index =
3716                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3717                         assert(na_act_index);
3718                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3719                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3720                         assert(na_act);
3721                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3722                                 sizeof(struct tc_tunnel_key),
3723                                 &(struct tc_tunnel_key){
3724                                         .action = TC_ACT_PIPE,
3725                                         .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3726                                         });
3727                         mnl_attr_nest_end(nlh, na_act);
3728                         mnl_attr_nest_end(nlh, na_act_index);
3729                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3730                         break;
3731                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3732                         assert(encap.vxlan);
3733                         flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3734                         na_act_index =
3735                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3736                         assert(na_act_index);
3737                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3738                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3739                         assert(na_act);
3740                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3741                                 sizeof(struct tc_tunnel_key),
3742                                 &(struct tc_tunnel_key){
3743                                         .action = TC_ACT_PIPE,
3744                                         .t_action = TCA_TUNNEL_KEY_ACT_SET,
3745                                         });
3746                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3747                                 mnl_attr_put_u16(nlh,
3748                                          TCA_TUNNEL_KEY_ENC_DST_PORT,
3749                                          encap.vxlan->udp.dst);
3750                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3751                                 mnl_attr_put_u32(nlh,
3752                                          TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3753                                          encap.vxlan->ipv4.src);
3754                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3755                                 mnl_attr_put_u32(nlh,
3756                                          TCA_TUNNEL_KEY_ENC_IPV4_DST,
3757                                          encap.vxlan->ipv4.dst);
3758                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3759                                 mnl_attr_put(nlh,
3760                                          TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3761                                          sizeof(encap.vxlan->ipv6.src),
3762                                          &encap.vxlan->ipv6.src);
3763                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3764                                 mnl_attr_put(nlh,
3765                                          TCA_TUNNEL_KEY_ENC_IPV6_DST,
3766                                          sizeof(encap.vxlan->ipv6.dst),
3767                                          &encap.vxlan->ipv6.dst);
3768                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3769                                 mnl_attr_put_u32(nlh,
3770                                          TCA_TUNNEL_KEY_ENC_KEY_ID,
3771                                          vxlan_vni_as_be32
3772                                                 (encap.vxlan->vxlan.vni));
3773                         mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3774                         mnl_attr_nest_end(nlh, na_act);
3775                         mnl_attr_nest_end(nlh, na_act_index);
3776                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3777                         break;
3778                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3779                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3780                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
3781                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
3782                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
3783                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
3784                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
3785                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
3786                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
3787                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
3788                         na_act_index =
3789                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3790                         flow_tcf_create_pedit_mnl_msg(nlh,
3791                                                       &actions, item_flags);
3792                         mnl_attr_nest_end(nlh, na_act_index);
3793                         break;
3794                 default:
3795                         return rte_flow_error_set(error, ENOTSUP,
3796                                                   RTE_FLOW_ERROR_TYPE_ACTION,
3797                                                   actions,
3798                                                   "action not supported");
3799                 }
3800         }
3801         assert(na_flower);
3802         assert(na_flower_act);
3803         mnl_attr_nest_end(nlh, na_flower_act);
3804         dev_flow->tcf.ptc_flags = mnl_attr_get_payload
3805                                         (mnl_nlmsg_get_payload_tail(nlh));
3806         mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
3807                                                 0 : TCA_CLS_FLAGS_SKIP_SW);
3808         mnl_attr_nest_end(nlh, na_flower);
3809         if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
3810                 dev_flow->tcf.tunnel->ifindex_org =
3811                         *dev_flow->tcf.tunnel->ifindex_ptr;
3812         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3813         return 0;
3814 }
3815
3816 /**
3817  * Send Netlink message with acknowledgment.
3818  *
3819  * @param tcf
3820  *   Flow context to use.
3821  * @param nlh
3822  *   Message to send. This function always raises the NLM_F_ACK flag before
3823  *   sending.
3824  * @param[in] cb
3825  *   Callback handler for received message.
3826  * @param[in] arg
3827  *   Context pointer for callback handler.
3828  *
3829  * @return
3830  *   0 on success, a negative errno value otherwise and rte_errno is set.
3831  */
3832 static int
3833 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
3834                 struct nlmsghdr *nlh,
3835                 mnl_cb_t cb, void *arg)
3836 {
3837         unsigned int portid = mnl_socket_get_portid(tcf->nl);
3838         uint32_t seq = tcf->seq++;
3839         int ret, err = 0;
3840
3841         assert(tcf->nl);
3842         assert(tcf->buf);
3843         if (!seq) {
3844                 /* seq 0 is reserved for kernel event-driven notifications. */
3845                 seq = tcf->seq++;
3846         }
3847         nlh->nlmsg_seq = seq;
3848         nlh->nlmsg_flags |= NLM_F_ACK;
3849         ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
3850         if (ret <= 0) {
3851                 /* Message send error occurred. */
3852                 rte_errno = errno;
3853                 return -rte_errno;
3854         }
3855         nlh = (struct nlmsghdr *)(tcf->buf);
3856         /*
3857          * The following loop postpones non-fatal errors until multipart
3858          * messages are complete.
3859          */
3860         while (true) {
3861                 ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
3862                 if (ret < 0) {
3863                         err = errno;
3864                         /*
3865                          * In case of overflow, keep receiving until the
3866                          * end of the multipart message. Part of the reply
3867                          * may be lost; mark and return an error anyway.
3868                          */
3869                         if (err != ENOSPC ||
3870                             !(nlh->nlmsg_flags & NLM_F_MULTI) ||
3871                             nlh->nlmsg_type == NLMSG_DONE)
3872                                 break;
3873                 } else {
3874                         ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
3875                         if (!ret) {
3876                                 /*
3877                                  * libmnl returns 0 if DONE or
3878                                  * success ACK message found.
3879                                  */
3880                                 break;
3881                         }
3882                         if (ret < 0) {
3883                                 /*
3884                                  * ACK message with error found
3885                                  * or some error occurred.
3886                                  */
3887                                 err = errno;
3888                                 break;
3889                         }
3890                         /* We should continue receiving. */
3891                 }
3892         }
3893         if (!err)
3894                 return 0;
3895         rte_errno = err;
3896         return -err;
3897 }
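
/*
 * Usage sketch (editor's illustration, guarded out of the build): shows how
 * flow_tcf_nl_ack() combines with an RTM_GETLINK dump request and a collect
 * callback, mirroring the cleanup routines below. The function name is
 * hypothetical.
 */
#ifdef MLX5_TCF_USAGE_SKETCH
static int
example_dump_links(struct mlx5_flow_tcf_context *tcf, mnl_cb_t cb, void *arg)
{
	struct nlmsghdr *nlh;
	struct ifinfomsg *ifm;

	/* Build the dump request in the shared context buffer. */
	nlh = mnl_nlmsg_put_header(tcf->buf);
	nlh->nlmsg_type = RTM_GETLINK;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
	ifm->ifi_family = AF_UNSPEC;
	/* Send with NLM_F_ACK raised and run @p cb on every reply message. */
	return flow_tcf_nl_ack(tcf, nlh, cb, arg);
}
#endif /* MLX5_TCF_USAGE_SKETCH */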
3898
3899 #define MNL_BUF_EXTRA_SPACE 16
3900 #define MNL_REQUEST_SIZE_MIN 256
3901 #define MNL_REQUEST_SIZE_MAX 2048
3902 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
3903                                  MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
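
/*
 * Worked example (editor's note): with the common 4 KiB page size,
 * MNL_REQUEST_SIZE = RTE_MIN(RTE_MAX(4096, 256), 2048) = 2048, i.e. the
 * request buffer is clamped to MNL_REQUEST_SIZE_MAX; the page size only
 * matters on systems with pages smaller than 2 KiB.
 */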
3904
3905 /* Data structures used by flow_tcf_xxx_cb() routines. */
3906 struct tcf_nlcb_buf {
3907         LIST_ENTRY(tcf_nlcb_buf) next;
3908         uint32_t size;
3909         alignas(struct nlmsghdr)
3910         uint8_t msg[]; /**< Netlink message data. */
3911 };
3912
3913 struct tcf_nlcb_context {
3914         unsigned int ifindex; /**< Base interface index. */
3915         uint32_t bufsize;
3916         LIST_HEAD(, tcf_nlcb_buf) nlbuf;
3917 };
3918
3919 /**
3920  * Allocate space for a netlink command in the buffer list
3921  *
3922  * @param[in, out] ctx
3923  *   Pointer to callback context with command buffers list.
3924  * @param[in] size
3925  *   Required size of data buffer to be allocated.
3926  *
3927  * @return
3928  *   Pointer to allocated memory, aligned as message header.
3929  *   NULL if some error occurred.
3930  */
3931 static struct nlmsghdr *
3932 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
3933 {
3934         struct tcf_nlcb_buf *buf;
3935         struct nlmsghdr *nlh;
3936
3937         size = NLMSG_ALIGN(size);
3938         buf = LIST_FIRST(&ctx->nlbuf);
3939         if (buf && (buf->size + size) <= ctx->bufsize) {
3940                 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
3941                 buf->size += size;
3942                 return nlh;
3943         }
3944         if (size > ctx->bufsize) {
3945                 DRV_LOG(WARNING, "netlink: too long command buffer requested");
3946                 return NULL;
3947         }
3948         buf = rte_malloc(__func__,
3949                         ctx->bufsize + sizeof(struct tcf_nlcb_buf),
3950                         alignof(struct tcf_nlcb_buf));
3951         if (!buf) {
3952                 DRV_LOG(WARNING, "netlink: no memory for command buffer");
3953                 return NULL;
3954         }
3955         LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
3956         buf->size = size;
3957         nlh = (struct nlmsghdr *)&buf->msg[0];
3958         return nlh;
3959 }
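
/*
 * Usage sketch (editor's illustration, guarded out of the build): queue a
 * header-only RTM_DELLINK command the same way the collect callbacks below
 * do. The function name is hypothetical.
 */
#ifdef MLX5_TCF_USAGE_SKETCH
static struct nlmsghdr *
example_queue_dellink(struct tcf_nlcb_context *ctx, unsigned int ifindex)
{
	uint32_t size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
			MNL_ALIGN(sizeof(struct ifinfomsg));
	struct nlmsghdr *cmd = flow_tcf_alloc_nlcmd(ctx, size);
	struct ifinfomsg *ifm;

	if (!cmd)
		return NULL;
	/* Initialize the header in the space reserved within the list. */
	cmd = mnl_nlmsg_put_header(cmd);
	cmd->nlmsg_type = RTM_DELLINK;
	cmd->nlmsg_flags = NLM_F_REQUEST;
	ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
	ifm->ifi_family = AF_UNSPEC;
	ifm->ifi_index = ifindex;
	return cmd;
}
#endif /* MLX5_TCF_USAGE_SKETCH */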
3960
3961 /**
3962  * Send the buffers with prepared netlink commands. Scans the list and
3963  * sends all found buffers. Buffers are sent and freed unconditionally in
3964  * order to prevent memory leakage if an error occurs for some message.
3965  *
3966  * @param[in] tcf
3967  *   Context object initialized by mlx5_flow_tcf_context_create().
3968  * @param[in, out] ctx
3969  *   Pointer to callback context with command buffers list.
3970  *
3971  * @return
3972  *   Zero value on success, negative errno value otherwise
3973  *   and rte_errno is set.
3974  */
3975 static int
3976 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
3977                     struct tcf_nlcb_context *ctx)
3978 {
3979         struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
3980         int ret = 0;
3981
3982         while (bc) {
3983                 struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
3984                 struct nlmsghdr *nlh;
3985                 uint32_t msg = 0;
3986                 int rc;
3987
3988                 while (msg < bc->size) {
3989                         /*
3990                          * Send the Netlink commands from the buffer one by
3991                          * one. If multiple rule deletion commands were sent
3992                          * in a single message and an error occurred, it
3993                          * could trigger multiple ACK error messages and
3994                          * break the Netlink sequence numbering, because
3995                          * only one ACK reply is expected.
3996                          */
3997                         assert((bc->size - msg) >= sizeof(struct nlmsghdr));
3998                         nlh = (struct nlmsghdr *)&bc->msg[msg];
3999                         assert((bc->size - msg) >= nlh->nlmsg_len);
4000                         msg += nlh->nlmsg_len;
4001                         rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4002                         if (rc) {
4003                                 DRV_LOG(WARNING,
4004                                         "netlink: cleanup error %d", rc);
4005                                 if (!ret)
4006                                         ret = rc;
4007                         }
4008                 }
4009                 rte_free(bc);
4010                 bc = bn;
4011         }
4012         LIST_INIT(&ctx->nlbuf);
4013         return ret;
4014 }
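
/*
 * Editor's note: the cleanup helpers below all follow the same pattern:
 * dump the kernel state with flow_tcf_nl_ack(), let a collect callback
 * queue RTM_DEL* commands via flow_tcf_alloc_nlcmd(), then flush the
 * queued commands with flow_tcf_send_nlcmd().
 */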
4015
4016 /**
4017  * Collect local IP address rules with the scope link attribute on the
4018  * specified network device. This is a callback routine called by libmnl
4019  * mnl_cb_run() in a loop for every message in the received packet.
4020  *
4021  * @param[in] nlh
4022  *   Pointer to reply header.
4023  * @param[in, out] arg
4024  *   Opaque data pointer for this callback.
4025  *
4026  * @return
4027  *   A positive, nonzero value on success, negative errno value otherwise
4028  *   and rte_errno is set.
4029  */
4030 static int
4031 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
4032 {
4033         struct tcf_nlcb_context *ctx = arg;
4034         struct nlmsghdr *cmd;
4035         struct ifaddrmsg *ifa;
4036         struct nlattr *na;
4037         struct nlattr *na_local = NULL;
4038         struct nlattr *na_peer = NULL;
4039         unsigned char family;
4040         uint32_t size;
4041
4042         if (nlh->nlmsg_type != RTM_NEWADDR) {
4043                 rte_errno = EINVAL;
4044                 return -rte_errno;
4045         }
4046         ifa = mnl_nlmsg_get_payload(nlh);
4047         family = ifa->ifa_family;
4048         if (ifa->ifa_index != ctx->ifindex ||
4049             ifa->ifa_scope != RT_SCOPE_LINK ||
4050             !(ifa->ifa_flags & IFA_F_PERMANENT) ||
4051             (family != AF_INET && family != AF_INET6))
4052                 return 1;
4053         mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
4054                 switch (mnl_attr_get_type(na)) {
4055                 case IFA_LOCAL:
4056                         na_local = na;
4057                         break;
4058                 case IFA_ADDRESS:
4059                         na_peer = na;
4060                         break;
4061                 }
4062                 if (na_local && na_peer)
4063                         break;
4064         }
4065         if (!na_local || !na_peer)
4066                 return 1;
4067         /* Local rule found with scope link, permanent and assigned peer. */
4068         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4069                MNL_ALIGN(sizeof(struct ifaddrmsg)) +
4070                (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4071                                    : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
4072         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4073         if (!cmd) {
4074                 rte_errno = ENOMEM;
4075                 return -rte_errno;
4076         }
4077         cmd = mnl_nlmsg_put_header(cmd);
4078         cmd->nlmsg_type = RTM_DELADDR;
4079         cmd->nlmsg_flags = NLM_F_REQUEST;
4080         ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
4081         ifa->ifa_flags = IFA_F_PERMANENT;
4082         ifa->ifa_scope = RT_SCOPE_LINK;
4083         ifa->ifa_index = ctx->ifindex;
4084         if (family == AF_INET) {
4085                 ifa->ifa_family = AF_INET;
4086                 ifa->ifa_prefixlen = 32;
4087                 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
4088                 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
4089         } else {
4090                 ifa->ifa_family = AF_INET6;
4091                 ifa->ifa_prefixlen = 128;
4092                 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4093                         mnl_attr_get_payload(na_local));
4094                 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4095                         mnl_attr_get_payload(na_peer));
4096         }
4097         assert(size == cmd->nlmsg_len);
4098         return 1;
4099 }
4100
4101 /**
4102  * Clean up the local IP addresses on the outer interface.
4103  *
4104  * @param[in] tcf
4105  *   Context object initialized by mlx5_flow_tcf_context_create().
4106  * @param[in] ifindex
4107  *   Network interface index to perform the cleanup on.
4108  */
4109 static void
4110 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4111                             unsigned int ifindex)
4112 {
4113         struct nlmsghdr *nlh;
4114         struct ifaddrmsg *ifa;
4115         struct tcf_nlcb_context ctx = {
4116                 .ifindex = ifindex,
4117                 .bufsize = MNL_REQUEST_SIZE,
4118                 .nlbuf = LIST_HEAD_INITIALIZER(),
4119         };
4120         int ret;
4121
4122         assert(ifindex);
4123         /*
4124          * Seek and destroy leftover local IP addresses with the
4125          * matching "scope link" property.
4126          */
4127         nlh = mnl_nlmsg_put_header(tcf->buf);
4128         nlh->nlmsg_type = RTM_GETADDR;
4129         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4130         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4131         ifa->ifa_family = AF_UNSPEC;
4132         ifa->ifa_index = ifindex;
4133         ifa->ifa_scope = RT_SCOPE_LINK;
4134         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
4135         if (ret)
4136                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4137         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4138         if (ret)
4139                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4140 }
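
/*
 * Editor's note: conceptually this matches listing the addresses with
 * "ip addr show dev <ifindex> scope link" and deleting every permanent
 * entry that has a peer, implemented as one RTM_GETADDR dump plus queued
 * RTM_DELADDR commands.
 */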
4141
4142 /**
4143  * Collect permanent neigh rules on the specified network device.
4144  * This is a callback routine called by libmnl mnl_cb_run() in a loop for
4145  * every message in the received packet.
4146  *
4147  * @param[in] nlh
4148  *   Pointer to reply header.
4149  * @param[in, out] arg
4150  *   Opaque data pointer for this callback.
4151  *
4152  * @return
4153  *   A positive, nonzero value on success, negative errno value otherwise
4154  *   and rte_errno is set.
4155  */
4156 static int
4157 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4158 {
4159         struct tcf_nlcb_context *ctx = arg;
4160         struct nlmsghdr *cmd;
4161         struct ndmsg *ndm;
4162         struct nlattr *na;
4163         struct nlattr *na_ip = NULL;
4164         struct nlattr *na_mac = NULL;
4165         unsigned char family;
4166         uint32_t size;
4167
4168         if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4169                 rte_errno = EINVAL;
4170                 return -rte_errno;
4171         }
4172         ndm = mnl_nlmsg_get_payload(nlh);
4173         family = ndm->ndm_family;
4174         if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4175            !(ndm->ndm_state & NUD_PERMANENT) ||
4176            (family != AF_INET && family != AF_INET6))
4177                 return 1;
4178         mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4179                 switch (mnl_attr_get_type(na)) {
4180                 case NDA_DST:
4181                         na_ip = na;
4182                         break;
4183                 case NDA_LLADDR:
4184                         na_mac = na;
4185                         break;
4186                 }
4187                 if (na_mac && na_ip)
4188                         break;
4189         }
4190         if (!na_mac || !na_ip)
4191                 return 1;
4192         /* Neigh rule with the permanent attribute found. */
4193         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4194                MNL_ALIGN(sizeof(struct ndmsg)) +
4195                SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4196                (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4197                                    : SZ_NLATTR_TYPE_OF(uint32_t));
4198         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4199         if (!cmd) {
4200                 rte_errno = ENOMEM;
4201                 return -rte_errno;
4202         }
4203         cmd = mnl_nlmsg_put_header(cmd);
4204         cmd->nlmsg_type = RTM_DELNEIGH;
4205         cmd->nlmsg_flags = NLM_F_REQUEST;
4206         ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4207         ndm->ndm_ifindex = ctx->ifindex;
4208         ndm->ndm_state = NUD_PERMANENT;
4209         ndm->ndm_flags = 0;
4210         ndm->ndm_type = 0;
4211         if (family == AF_INET) {
4212                 ndm->ndm_family = AF_INET;
4213                 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4214         } else {
4215                 ndm->ndm_family = AF_INET6;
4216                 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4217                              mnl_attr_get_payload(na_ip));
4218         }
4219         mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4220                      mnl_attr_get_payload(na_mac));
4221         assert(size == cmd->nlmsg_len);
4222         return 1;
4223 }
4224
4225 /**
4226  * Clean up the neigh rules on the outer interface.
4227  *
4228  * @param[in] tcf
4229  *   Context object initialized by mlx5_flow_tcf_context_create().
4230  * @param[in] ifindex
4231  *   Network interface index to perform the cleanup on.
4232  */
4233 static void
4234 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4235                             unsigned int ifindex)
4236 {
4237         struct nlmsghdr *nlh;
4238         struct ndmsg *ndm;
4239         struct tcf_nlcb_context ctx = {
4240                 .ifindex = ifindex,
4241                 .bufsize = MNL_REQUEST_SIZE,
4242                 .nlbuf = LIST_HEAD_INITIALIZER(),
4243         };
4244         int ret;
4245
4246         assert(ifindex);
4247         /* Seek and destroy leftover neigh rules. */
4248         nlh = mnl_nlmsg_put_header(tcf->buf);
4249         nlh->nlmsg_type = RTM_GETNEIGH;
4250         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4251         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4252         ndm->ndm_family = AF_UNSPEC;
4253         ndm->ndm_ifindex = ifindex;
4254         ndm->ndm_state = NUD_PERMANENT;
4255         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
4256         if (ret)
4257                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4258         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4259         if (ret)
4260                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4261 }
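
/*
 * Editor's note: conceptually this matches scanning "ip neigh show dev
 * <ifindex> nud permanent" and deleting every returned entry, implemented
 * as one RTM_GETNEIGH dump plus queued RTM_DELNEIGH commands.
 */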
4262
4263 /**
4264  * Collect indices of VXLAN encap/decap interfaces associated with the
4265  * device. This is a callback routine called by libmnl mnl_cb_run() in a
4266  * loop for every message in the received packet.
4267  *
4268  * @param[in] nlh
4269  *   Pointer to reply header.
4270  * @param[in, out] arg
4271  *   Opaque data pointer for this callback.
4272  *
4273  * @return
4274  *   A positive, nonzero value on success, negative errno value otherwise
4275  *   and rte_errno is set.
4276  */
4277 static int
4278 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4279 {
4280         struct tcf_nlcb_context *ctx = arg;
4281         struct nlmsghdr *cmd;
4282         struct ifinfomsg *ifm;
4283         struct nlattr *na;
4284         struct nlattr *na_info = NULL;
4285         struct nlattr *na_vxlan = NULL;
4286         bool found = false;
4287         unsigned int vxindex;
4288         uint32_t size;
4289
4290         if (nlh->nlmsg_type != RTM_NEWLINK) {
4291                 rte_errno = EINVAL;
4292                 return -rte_errno;
4293         }
4294         ifm = mnl_nlmsg_get_payload(nlh);
4295         if (!ifm->ifi_index) {
4296                 rte_errno = EINVAL;
4297                 return -rte_errno;
4298         }
4299         mnl_attr_for_each(na, nlh, sizeof(*ifm))
4300                 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
4301                         na_info = na;
4302                         break;
4303                 }
4304         if (!na_info)
4305                 return 1;
4306         mnl_attr_for_each_nested(na, na_info) {
4307                 switch (mnl_attr_get_type(na)) {
4308                 case IFLA_INFO_KIND:
4309                         if (!strncmp("vxlan", mnl_attr_get_str(na),
4310                                      mnl_attr_get_len(na)))
4311                                 found = true;
4312                         break;
4313                 case IFLA_INFO_DATA:
4314                         na_vxlan = na;
4315                         break;
4316                 }
4317                 if (found && na_vxlan)
4318                         break;
4319         }
4320         if (!found || !na_vxlan)
4321                 return 1;
4322         found = false;
4323         mnl_attr_for_each_nested(na, na_vxlan) {
4324                 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4325                     mnl_attr_get_u32(na) == ctx->ifindex) {
4326                         found = true;
4327                         break;
4328                 }
4329         }
4330         if (!found)
4331                 return 1;
4332         /* Attached VXLAN device found, store the command to delete. */
4333         vxindex = ifm->ifi_index;
4334         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4335                MNL_ALIGN(sizeof(struct ifinfomsg));
4336         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4337         if (!cmd) {
4338                 rte_errno = ENOMEM;
4339                 return -rte_errno;
4340         }
4341         cmd = mnl_nlmsg_put_header(cmd);
4342         cmd->nlmsg_type = RTM_DELLINK;
4343         cmd->nlmsg_flags = NLM_F_REQUEST;
4344         ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4345         ifm->ifi_family = AF_UNSPEC;
4346         ifm->ifi_index = vxindex;
4347         assert(size == cmd->nlmsg_len);
4348         return 1;
4349 }
4350
4351 /**
4352  * Clean up the outer interface. Removes all found VXLAN devices
4353  * attached to the specified index and flushes the neigh and local
4354  * IP databases.
4355  *
4356  * @param[in] tcf
4357  *   Context object initialized by mlx5_flow_tcf_context_create().
4358  * @param[in] ifindex
4359  *   Network interface index to perform the cleanup on.
4360  */
4361 static void
4362 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4363                             unsigned int ifindex)
4364 {
4365         struct nlmsghdr *nlh;
4366         struct ifinfomsg *ifm;
4367         struct tcf_nlcb_context ctx = {
4368                 .ifindex = ifindex,
4369                 .bufsize = MNL_REQUEST_SIZE,
4370                 .nlbuf = LIST_HEAD_INITIALIZER(),
4371         };
4372         int ret;
4373
4374         assert(ifindex);
4375         /*
4376          * Seek and destroy leftover VXLAN encap/decap interfaces with
4377          * matching properties.
4378          */
4379         nlh = mnl_nlmsg_put_header(tcf->buf);
4380         nlh->nlmsg_type = RTM_GETLINK;
4381         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4382         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4383         ifm->ifi_family = AF_UNSPEC;
4384         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
4385         if (ret)
4386                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4387         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4388         if (ret)
4389                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4390 }
4391
4392 /**
4393  * Emit a Netlink message to add/remove a local address on the outer device.
4394  * The address being added is visible within the link only (scope link).
4395  *
4396  * Note that an implicit route is maintained by the kernel due to the
4397  * presence of a peer address (IFA_ADDRESS).
4398  *
4399  * These rules are used for encapsulation only and allow assigning
4400  * the outer tunnel source IP address.
4401  *
4402  * @param[in] tcf
4403  *   Libmnl socket context object.
4404  * @param[in] encap
4405  *   Encapsulation properties (source address and its peer).
4406  * @param[in] ifindex
4407  *   Network interface to apply rule.
4408  * @param[in] enable
4409  *   Toggle between add and remove.
4410  * @param[out] error
4411  *   Perform verbose error reporting if not NULL.
4412  *
4413  * @return
4414  *   0 on success, a negative errno value otherwise and rte_errno is set.
4415  */
4416 static int
4417 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4418                     const struct flow_tcf_vxlan_encap *encap,
4419                     unsigned int ifindex,
4420                     bool enable,
4421                     struct rte_flow_error *error)
4422 {
4423         struct nlmsghdr *nlh;
4424         struct ifaddrmsg *ifa;
4425         alignas(struct nlmsghdr)
4426         uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4427
4428         nlh = mnl_nlmsg_put_header(buf);
4429         nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4430         nlh->nlmsg_flags =
4431                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4432         nlh->nlmsg_seq = 0;
4433         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4434         ifa->ifa_flags = IFA_F_PERMANENT;
4435         ifa->ifa_scope = RT_SCOPE_LINK;
4436         ifa->ifa_index = ifindex;
4437         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4438                 ifa->ifa_family = AF_INET;
4439                 ifa->ifa_prefixlen = 32;
4440                 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4441                 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4442                         mnl_attr_put_u32(nlh, IFA_ADDRESS,
4443                                               encap->ipv4.dst);
4444         } else {
4445                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4446                 ifa->ifa_family = AF_INET6;
4447                 ifa->ifa_prefixlen = 128;
4448                 mnl_attr_put(nlh, IFA_LOCAL,
4449                                   sizeof(encap->ipv6.src),
4450                                   &encap->ipv6.src);
4451                 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4452                         mnl_attr_put(nlh, IFA_ADDRESS,
4453                                           sizeof(encap->ipv6.dst),
4454                                           &encap->ipv6.dst);
4455         }
4456         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4457                 return 0;
4458         return rte_flow_error_set(error, rte_errno,
4459                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4460                                   "netlink: cannot complete IFA request"
4461                                   " (ip addr add)");
4462 }
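
/*
 * Editor's example: with enable=true, IPv4 source 192.168.10.1 and peer
 * 192.168.10.2 (hypothetical values), the emitted request is equivalent to:
 *   ip addr add 192.168.10.1 peer 192.168.10.2 scope link dev <ifouter>
 */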
4463
4464 /**
4465  * Emit Netlink message to add/remove neighbor.
4466  *
4467  * @param[in] tcf
4468  *   Libmnl socket context object.
4469  * @param[in] encap
4470  *   Encapsulation properties (destination address).
4471  * @param[in] ifindex
4472  *   Network interface.
4473  * @param[in] enable
4474  *   Toggle between add and remove.
4475  * @param[out] error
4476  *   Perform verbose error reporting if not NULL.
4477  *
4478  * @return
4479  *   0 on success, a negative errno value otherwise and rte_errno is set.
4480  */
4481 static int
4482 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4483                      const struct flow_tcf_vxlan_encap *encap,
4484                      unsigned int ifindex,
4485                      bool enable,
4486                      struct rte_flow_error *error)
4487 {
4488         struct nlmsghdr *nlh;
4489         struct ndmsg *ndm;
4490         alignas(struct nlmsghdr)
4491         uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4492
4493         nlh = mnl_nlmsg_put_header(buf);
4494         nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4495         nlh->nlmsg_flags =
4496                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4497         nlh->nlmsg_seq = 0;
4498         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4499         ndm->ndm_ifindex = ifindex;
4500         ndm->ndm_state = NUD_PERMANENT;
4501         ndm->ndm_flags = 0;
4502         ndm->ndm_type = 0;
4503         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4504                 ndm->ndm_family = AF_INET;
4505                 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4506         } else {
4507                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4508                 ndm->ndm_family = AF_INET6;
4509                 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
4510                                                  &encap->ipv6.dst);
4511         }
4512         if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4513                 DRV_LOG(WARNING,
4514                         "outer ethernet source address cannot be "
4515                         "forced for VXLAN encapsulation");
4516         if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4517                 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
4518                                                     &encap->eth.dst);
4519         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4520                 return 0;
4521         return rte_flow_error_set(error, rte_errno,
4522                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4523                                   "netlink: cannot complete ND request"
4524                                   " (ip neigh)");
4525 }
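
/*
 * Editor's example: with enable=true, destination IP 192.168.10.2 and MAC
 * 00:11:22:33:44:55 (hypothetical values), the emitted request is
 * equivalent to:
 *   ip neigh add dev <ifouter> lladdr 00:11:22:33:44:55 \
 *           to 192.168.10.2 nud permanent
 */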
4526
4527 /**
4528  * Manage the local IP addresses and their peer IP addresses on the
4529  * outer interface for encapsulation purposes. The kernel searches for
4530  * the appropriate egress device for tunnel traffic using the outer
4531  * source IP; this IP must be assigned to the outer network device,
4532  * otherwise the kernel rejects the rule.
4533  *
4534  * Adds or removes the addresses using the Netlink command like this:
4535  *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4536  *
4537  * The addresses are local to the netdev ("scope link"), which reduces
4538  * the risk of conflicts. Note that an implicit route is maintained by
4539  * the kernel due to the presence of a peer address (IFA_ADDRESS).
4540  *
4541  * @param[in] tcf
4542  *   Libmnl socket context object.
4543  * @param[in] vtep
4544  *   VTEP object, contains rule database and ifouter index.
4545  * @param[in] dev_flow
4546  *   Flow object, contains the tunnel parameters (for encap only).
4547  * @param[in] enable
4548  *   Toggle between add and remove.
4549  * @param[out] error
4550  *   Perform verbose error reporting if not NULL.
4551  *
4552  * @return
4553  *   0 on success, a negative errno value otherwise and rte_errno is set.
4554  */
4555 static int
4556 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4557                      struct tcf_vtep *vtep,
4558                      struct mlx5_flow *dev_flow,
4559                      bool enable,
4560                      struct rte_flow_error *error)
4561 {
4562         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4563         struct tcf_local_rule *rule;
4564         bool found = false;
4565         int ret;
4566
4567         assert(encap);
4568         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4569         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4570                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4571                 LIST_FOREACH(rule, &vtep->local, next) {
4572                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4573                             encap->ipv4.src == rule->ipv4.src &&
4574                             encap->ipv4.dst == rule->ipv4.dst) {
4575                                 found = true;
4576                                 break;
4577                         }
4578                 }
4579         } else {
4580                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4581                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4582                 LIST_FOREACH(rule, &vtep->local, next) {
4583                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4584                             !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4585                                             sizeof(encap->ipv6.src)) &&
4586                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4587                                             sizeof(encap->ipv6.dst))) {
4588                                 found = true;
4589                                 break;
4590                         }
4591                 }
4592         }
4593         if (found) {
4594                 if (enable) {
4595                         rule->refcnt++;
4596                         return 0;
4597                 }
4598                 if (!rule->refcnt || !--rule->refcnt) {
4599                         LIST_REMOVE(rule, next);
4600                         return flow_tcf_rule_local(tcf, encap,
4601                                         vtep->ifouter, false, error);
4602                 }
4603                 return 0;
4604         }
4605         if (!enable) {
4606                 DRV_LOG(WARNING, "disabling non-existing local rule");
4607                 rte_flow_error_set(error, ENOENT,
4608                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4609                                    "disabling non-existing local rule");
4610                 return -ENOENT;
4611         }
4612         rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4613                                 alignof(struct tcf_local_rule));
4614         if (!rule) {
4615                 rte_flow_error_set(error, ENOMEM,
4616                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4617                                    "unable to allocate memory for local rule");
4618                 return -rte_errno;
4619         }
4620         *rule = (struct tcf_local_rule){.refcnt = 0,
4621                                         .mask = 0,
4622                                         };
4623         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4624                 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4625                            | FLOW_TCF_ENCAP_IPV4_DST;
4626                 rule->ipv4.src = encap->ipv4.src;
4627                 rule->ipv4.dst = encap->ipv4.dst;
4628         } else {
4629                 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4630                            | FLOW_TCF_ENCAP_IPV6_DST;
4631                 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4632                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4633         }
4634         ret = flow_tcf_rule_local(tcf, encap, vtep->ifouter, true, error);
4635         if (ret) {
4636                 rte_free(rule);
4637                 return ret;
4638         }
4639         rule->refcnt++;
4640         LIST_INSERT_HEAD(&vtep->local, rule, next);
4641         return 0;
4642 }
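
/*
 * Usage sketch (editor's illustration; flow_a, flow_b and err are
 * hypothetical): two flows encapsulating to the same outer addresses share
 * one refcounted local rule, so the address is programmed once and removed
 * only when the last user releases it.
 *
 *   flow_tcf_encap_local(tcf, vtep, flow_a, true, &err);   refcnt 1, NEWADDR
 *   flow_tcf_encap_local(tcf, vtep, flow_b, true, &err);   refcnt 2
 *   flow_tcf_encap_local(tcf, vtep, flow_b, false, &err);  refcnt 1
 *   flow_tcf_encap_local(tcf, vtep, flow_a, false, &err);  refcnt 0, DELADDR
 */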
4643
4644 /**
4645  * Manage the destination MAC/IP address neigh database; the kernel uses
4646  * it to determine the destination MAC address within the encapsulation
4647  * header. Adds or removes the entries using a Netlink command like this:
4648  *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4649  *
4650  * @param[in] tcf
4651  *   Libmnl socket context object.
4652  * @param[in] vtep
4653  *   VTEP object, contains rule database and ifouter index.
4654  * @param[in] dev_flow
4655  *   Flow object, contains the tunnel parameters (for encap only).
4656  * @param[in] enable
4657  *   Toggle between add and remove.
4658  * @param[out] error
4659  *   Perform verbose error reporting if not NULL.
4660  *
4661  * @return
4662  *   0 on success, a negative errno value otherwise and rte_errno is set.
4663  */
4664 static int
4665 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4666                      struct tcf_vtep *vtep,
4667                      struct mlx5_flow *dev_flow,
4668                      bool enable,
4669                      struct rte_flow_error *error)
4670 {
4671         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4672         struct tcf_neigh_rule *rule;
4673         bool found = false;
4674         int ret;
4675
4676         assert(encap);
4677         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4678         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4679                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4680                 LIST_FOREACH(rule, &vtep->neigh, next) {
4681                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4682                             encap->ipv4.dst == rule->ipv4.dst) {
4683                                 found = true;
4684                                 break;
4685                         }
4686                 }
4687         } else {
4688                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4689                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4690                 LIST_FOREACH(rule, &vtep->neigh, next) {
4691                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4692                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4693                                                 sizeof(encap->ipv6.dst))) {
4694                                 found = true;
4695                                 break;
4696                         }
4697                 }
4698         }
4699         if (found) {
4700                 if (memcmp(&encap->eth.dst, &rule->eth,
4701                            sizeof(encap->eth.dst))) {
4702                         DRV_LOG(WARNING, "Destination MAC differs"
4703                                          " in neigh rule");
4704                         rte_flow_error_set(error, EEXIST,
4705                                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4706                                            NULL, "Different MAC address"
4707                                            " neigh rule for the same"
4708                                            " destination IP");
4709                         return -EEXIST;
4710                 }
4711                 if (enable) {
4712                         rule->refcnt++;
4713                         return 0;
4714                 }
4715                 if (!rule->refcnt || !--rule->refcnt) {
4716                         LIST_REMOVE(rule, next);
4717                         return flow_tcf_rule_neigh(tcf, encap,
4718                                                    vtep->ifouter,
4719                                                    false, error);
4720                 }
4721                 return 0;
4722         }
4723         if (!enable) {
4724                 DRV_LOG(WARNING, "Disabling non-existing neigh rule");
4725                 rte_flow_error_set(error, ENOENT,
4726                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4727                                    "unable to disable non-existing neigh rule");
4728                 return -ENOENT;
4729         }
4730         rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4731                                 alignof(struct tcf_neigh_rule));
4732         if (!rule) {
4733                 rte_flow_error_set(error, ENOMEM,
4734                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4735                                    "unable to allocate memory for neigh rule");
4736                 return -rte_errno;
4737         }
4738         *rule = (struct tcf_neigh_rule){.refcnt = 0,
4739                                         .mask = 0,
4740                                         };
4741         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4742                 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4743                 rule->ipv4.dst = encap->ipv4.dst;
4744         } else {
4745                 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4746                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4747         }
4748         memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4749         ret = flow_tcf_rule_neigh(tcf, encap, vtep->ifouter, true, error);
4750         if (ret) {
4751                 rte_free(rule);
4752                 return ret;
4753         }
4754         rule->refcnt++;
4755         LIST_INSERT_HEAD(&vtep->neigh, rule, next);
4756         return 0;
4757 }
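
/*
 * A minimal usage sketch for the helper above (illustrative only; the
 * calls mirror the actual users flow_tcf_encap_vtep_acquire() and
 * flow_tcf_vtep_release() in this file):
 *
 *   if (flow_tcf_encap_neigh(tcf, vtep, dev_flow, true, error))
 *           return -rte_errno;
 *   ... the rule using the encapsulation header is applied here ...
 *   flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL);
 *
 * The disable call drops the reference and, once the counter reaches
 * zero, removes the kernel entry - roughly the "ip neigh del" counterpart
 * of the command shown in the description above.
 */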
4758
4759 /* VTEP device list is shared between PMD port instances. */
4760 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
4761 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4762
4763 /**
4764  * Deletes VTEP network device.
4765  *
4766  * @param[in] tcf
4767  *   Context object initialized by mlx5_flow_tcf_context_create().
4768  * @param[in] vtep
4769  *   Object representing the network device to delete. Memory
4770  *   allocated for this object is freed by the routine.
4771  */
4772 static void
4773 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
4774                      struct tcf_vtep *vtep)
4775 {
4776         struct nlmsghdr *nlh;
4777         struct ifinfomsg *ifm;
4778         alignas(struct nlmsghdr)
4779         uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
4780                     MNL_BUF_EXTRA_SPACE];
4781         int ret;
4782
4783         assert(!vtep->refcnt);
4784         /* Delete only the interfaces we actually created. */
4785         if (vtep->created && vtep->ifindex) {
4786                 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
4787                 nlh = mnl_nlmsg_put_header(buf);
4788                 nlh->nlmsg_type = RTM_DELLINK;
4789                 nlh->nlmsg_flags = NLM_F_REQUEST;
4790                 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4791                 ifm->ifi_family = AF_UNSPEC;
4792                 ifm->ifi_index = vtep->ifindex;
4793                 assert(sizeof(buf) >= nlh->nlmsg_len);
4794                 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4795                 if (ret)
4796                         DRV_LOG(WARNING, "netlink: error deleting vxlan"
4797                                          " encap/decap ifindex %u",
4798                                          ifm->ifi_index);
4799         }
4800         rte_free(vtep);
4801 }
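
/*
 * For reference, the RTM_DELLINK request above is roughly equivalent to
 * the iproute2 command below (illustrative only; the actual request
 * addresses the device by ifindex rather than by name):
 *
 *   ip link del dev <vtep-name>
 */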
4802
4803 /**
4804  * Creates VTEP network device.
4805  *
4806  * @param[in] tcf
4807  *   Context object initialized by mlx5_flow_tcf_context_create().
4808  * @param[in] ifouter
4809  *   Outer interface to attach the newly created VXLAN device to.
4810  *   If zero, the VXLAN device will not be attached to any device.
4811  *   Such VTEPs are used for decapsulation and can be pre-created
4812  *   and shared between processes.
4813  * @param[in] port
4814  *   UDP port of created VTEP device.
4815  * @param[out] error
4816  *   Perform verbose error reporting if not NULL.
4817  *
4818  * @return
4819  *   Pointer to created device structure on success,
4820  *   NULL otherwise and rte_errno is set.
4821  */
4822 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
4823 static struct tcf_vtep*
4824 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
4825                      unsigned int ifouter,
4826                      uint16_t port, struct rte_flow_error *error)
4827 {
4828         struct tcf_vtep *vtep;
4829         struct nlmsghdr *nlh;
4830         struct ifinfomsg *ifm;
4831         char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
4832         alignas(struct nlmsghdr)
4833         uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
4834                     SZ_NLATTR_DATA_OF(sizeof(name)) +
4835                     SZ_NLATTR_NEST * 2 +
4836                     SZ_NLATTR_STRZ_OF("vxlan") +
4837                     SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
4838                     SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
4839                     SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
4840                     MNL_BUF_EXTRA_SPACE];
4841         struct nlattr *na_info;
4842         struct nlattr *na_vxlan;
4843         rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
4844         int ret;
4845
4846         vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
4847         if (!vtep) {
4848                 rte_flow_error_set(error, ENOMEM,
4849                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4850                                    "unable to allocate memory for VTEP");
4851                 return NULL;
4852         }
4853         *vtep = (struct tcf_vtep){
4854                         .port = port,
4855                         .local = LIST_HEAD_INITIALIZER(),
4856                         .neigh = LIST_HEAD_INITIALIZER(),
4857         };
4858         memset(buf, 0, sizeof(buf));
4859         nlh = mnl_nlmsg_put_header(buf);
4860         nlh->nlmsg_type = RTM_NEWLINK;
4861         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
4862         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4863         ifm->ifi_family = AF_UNSPEC;
4864         ifm->ifi_type = 0;
4865         ifm->ifi_index = 0;
4866         ifm->ifi_flags = IFF_UP;
4867         ifm->ifi_change = 0xffffffff;
4868         snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
4869         mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
4870         na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
4871         assert(na_info);
4872         mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
4873         na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
4874         if (ifouter)
4875                 mnl_attr_put_u32(nlh, IFLA_VXLAN_LINK, ifouter);
4876         assert(na_vxlan);
4877         mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
4878         mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
4879         mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
4880         mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
4881         mnl_attr_nest_end(nlh, na_vxlan);
4882         mnl_attr_nest_end(nlh, na_info);
4883         assert(sizeof(buf) >= nlh->nlmsg_len);
4884         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4885         if (ret) {
4886                 DRV_LOG(WARNING,
4887                         "netlink: VTEP %s create failure (%d)",
4888                         name, rte_errno);
4889                 if (rte_errno != EEXIST || ifouter)
4890                         /*
4891                          * Some unhandled error occurred or device is
4892                          * for encapsulation and cannot be shared.
4893                          */
4894                         goto error;
4895         } else {
4896                 /*
4897                  * Mark the device as created by us.
4898                  * It must be explicitly deleted
4899                  * when it is no longer needed.
4900                  */
4901                 vtep->created = 1;
4902                 vtep->waitreg = 1;
4903         }
4904         /* Try to get the ifindex of the created or pre-existing device. */
4905         ret = if_nametoindex(name);
4906         if (!ret) {
4907                 DRV_LOG(WARNING,
4908                         "VTEP %s failed to get index (%d)", name, errno);
4909                 rte_flow_error_set
4910                         (error, -errno,
4911                          RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4912                          "netlink: failed to retrieve VTEP ifindex");
4913                 goto error;
4914         }
4915         vtep->ifindex = ret;
4916         vtep->ifouter = ifouter;
4917         memset(buf, 0, sizeof(buf));
4918         nlh = mnl_nlmsg_put_header(buf);
4919         nlh->nlmsg_type = RTM_NEWLINK;
4920         nlh->nlmsg_flags = NLM_F_REQUEST;
4921         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4922         ifm->ifi_family = AF_UNSPEC;
4923         ifm->ifi_type = 0;
4924         ifm->ifi_index = vtep->ifindex;
4925         ifm->ifi_flags = IFF_UP;
4926         ifm->ifi_change = IFF_UP;
4927         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4928         if (ret) {
4929                 rte_flow_error_set(error, -errno,
4930                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4931                                    "netlink: failed to set VTEP link up");
4932                 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
4933                         name, rte_errno);
4934                 goto clean;
4935         }
4936         ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
4937         if (ret) {
4938                 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
4939                 goto clean;
4940         }
4941         DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
4942         vtep->refcnt = 1;
4943         return vtep;
4944 clean:
4945         flow_tcf_vtep_delete(tcf, vtep);
4946         return NULL;
4947 error:
4948         rte_free(vtep);
4949         return NULL;
4950 }
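
/*
 * For reference, the Netlink exchange above roughly corresponds to the
 * following iproute2 commands (illustrative only; "external" is the
 * iproute2 spelling of IFLA_VXLAN_COLLECT_METADATA and the device name
 * is built from MLX5_VXLAN_DEVICE_PFX and the UDP port):
 *
 *   ip link add name <prefix><port> type vxlan external nolearning \
 *           udp6zerocsumrx dstport <port> [dev <ifouter>]
 *   ip link set dev <prefix><port> up
 */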
4951 #else
4952 static struct tcf_vtep*
4953 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf __rte_unused,
4954                      unsigned int ifouter __rte_unused,
4955                      uint16_t port __rte_unused,
4956                      struct rte_flow_error *error)
4957 {
4958         rte_flow_error_set(error, ENOTSUP,
4959                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4960                            "netlink: failed to create VTEP, "
4961                            "vxlan metadata are not supported by kernel");
4962         return NULL;
4963 }
4964 #endif /* HAVE_IFLA_VXLAN_COLLECT_METADATA */
4965
4966 /**
4967  * Acquire the target interface index for VXLAN tunneling decapsulation.
4968  * In order to share the UDP port among the other interfaces the
4969  * VXLAN device is created as not attached to any interface (if created).
4970  *
4971  * @param[in] tcf
4972  *   Context object initialized by mlx5_flow_tcf_context_create().
4973  * @param[in] dev_flow
4974  *   Flow tcf object with tunnel structure pointer set.
4975  * @param[out] error
4976  *   Perform verbose error reporting if not NULL.
4977  * @return
4978  *   Interface descriptor pointer on success,
4979  *   NULL otherwise and rte_errno is set.
4980  */
4981 static struct tcf_vtep*
4982 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
4983                             struct mlx5_flow *dev_flow,
4984                             struct rte_flow_error *error)
4985 {
4986         struct tcf_vtep *vtep;
4987         uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
4988
4989         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
4990                 if (vtep->port == port)
4991                         break;
4992         }
4993         if (vtep && vtep->ifouter) {
4994                 rte_flow_error_set(error, -errno,
4995                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4996                                    "Failed to create decap VTEP with specified"
4997                                    " UDP port, attached device exists");
4998                 return NULL;
4999         }
5000         if (vtep) {
5001                 /* Device exists, just increment the reference counter. */
5002                 vtep->refcnt++;
5003                 assert(vtep->ifindex);
5004                 return vtep;
5005         }
5006         /* No decapsulation device exists, try to create a new one. */
5007         vtep = flow_tcf_vtep_create(tcf, 0, port, error);
5008         if (vtep)
5009                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5010         return vtep;
5011 }
5012
5013 /**
5014  * Acquire the target interface index for VXLAN tunneling encapsulation.
5015  *
5016  * @param[in] tcf
5017  *   Context object initialized by mlx5_flow_tcf_context_create().
5018  * @param[in] ifouter
5019  *   Network interface index to attach VXLAN encap device to.
5020  * @param[in] dev_flow
5021  *   Flow tcf object with tunnel structure pointer set.
5022  * @param[out] error
5023  *   Perform verbose error reporting if not NULL.
5024  * @return
5025  *   Interface descriptor pointer on success,
5026  *   NULL otherwise and rte_errno is set.
5027  */
5028 static struct tcf_vtep*
5029 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5030                             unsigned int ifouter,
5031                             struct mlx5_flow *dev_flow __rte_unused,
5032                             struct rte_flow_error *error)
5033 {
5034         static uint16_t encap_port = MLX5_VXLAN_PORT_MIN - 1;
5035         struct tcf_vtep *vtep;
5036         int ret;
5037
5038         assert(ifouter);
5039         /* Check whether an attached VTEP for encap already exists. */
5040         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5041                 if (vtep->ifouter == ifouter)
5042                         break;
5043         }
5044         if (vtep) {
5045                 /* VTEP already exists, just increment the reference. */
5046                 vtep->refcnt++;
5047         } else {
5048                 uint16_t pcnt;
5049
5050                 /* Not found, we should create a new attached VTEP. */
5051                 flow_tcf_encap_iface_cleanup(tcf, ifouter);
5052                 flow_tcf_encap_local_cleanup(tcf, ifouter);
5053                 flow_tcf_encap_neigh_cleanup(tcf, ifouter);
5054                 for (pcnt = 0; pcnt <= (MLX5_VXLAN_PORT_MAX
5055                                      - MLX5_VXLAN_PORT_MIN); pcnt++) {
5056                         encap_port++;
5057                         /* Wrap around the UDP port index. */
5058                         if (encap_port < MLX5_VXLAN_PORT_MIN ||
5059                             encap_port > MLX5_VXLAN_PORT_MAX)
5060                                 encap_port = MLX5_VXLAN_PORT_MIN;
5061                         /* Check whether the UDP port is already in use. */
5062                         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5063                                 if (vtep->port == encap_port)
5064                                         break;
5065                         }
5066                         if (vtep) {
5067                                 /* Port is in use, try the next one. */
5068                                 vtep = NULL;
5069                                 continue;
5070                         }
5071                         vtep = flow_tcf_vtep_create(tcf, ifouter,
5072                                                     encap_port, error);
5073                         if (vtep) {
5074                                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5075                                 break;
5076                         }
5077                         if (rte_errno != EEXIST)
5078                                 break;
5079                 }
5080                 if (!vtep)
5081                         return NULL;
5082         }
5083         assert(vtep->ifouter == ifouter);
5084         assert(vtep->ifindex);
5085         /* Create a local address rule with peer to specify the outer IPs. */
5086         ret = flow_tcf_encap_local(tcf, vtep, dev_flow, true, error);
5087         if (!ret) {
5088                 /* Create neigh rule to specify outer destination MAC. */
5089                 ret = flow_tcf_encap_neigh(tcf, vtep, dev_flow, true, error);
5090                 if (ret)
5091                         flow_tcf_encap_local(tcf, vtep,
5092                                              dev_flow, false, error);
5093         }
5094         if (ret) {
5095                 if (--vtep->refcnt == 0)
5096                         flow_tcf_vtep_delete(tcf, vtep);
5097                 return NULL;
5098         }
5099         return vtep;
5100 }
5101
5102 /**
5103  * Acquires target interface index for tunneling of any type.
5104  * Creates a new VTEP if needed.
5105  *
5106  * @param[in] tcf
5107  *   Context object initialized by mlx5_flow_tcf_context_create().
5108  * @param[in] ifouter
5109  *   Network interface index to attach VXLAN encap device to.
5110  * @param[in] dev_flow
5111  *   Flow tcf object with tunnel structure pointer set.
5112  * @param[out] error
5113  *   Perform verbose error reporting if not NULL.
5114  * @return
5115  *   Interface descriptor pointer on success,
5116  *   NULL otherwise and rte_errno is set.
5117  */
5118 static struct tcf_vtep*
5119 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5120                       unsigned int ifouter,
5121                       struct mlx5_flow *dev_flow,
5122                       struct rte_flow_error *error)
5123 {
5124         struct tcf_vtep *vtep = NULL;
5125
5126         assert(dev_flow->tcf.tunnel);
5127         pthread_mutex_lock(&vtep_list_mutex);
5128         switch (dev_flow->tcf.tunnel->type) {
5129         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5130                 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5131                                                   dev_flow, error);
5132                 break;
5133         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5134                 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
5135                 break;
5136         default:
5137                 rte_flow_error_set(error, ENOTSUP,
5138                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5139                                    "unsupported tunnel type");
5140                 break;
5141         }
5142         pthread_mutex_unlock(&vtep_list_mutex);
5143         return vtep;
5144 }
5145
5146 /**
5147  * Release tunneling interface by ifindex. Decrements reference
5148  * counter and actually removes the device if counter is zero.
5149  *
5150  * @param[in] tcf
5151  *   Context object initialized by mlx5_flow_tcf_context_create().
5152  * @param[in] vtep
5153  *   VTEP device descriptor structure.
5154  * @param[in] dev_flow
5155  *   Flow tcf object with tunnel structure pointer set.
5156  */
5157 static void
5158 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5159                       struct tcf_vtep *vtep,
5160                       struct mlx5_flow *dev_flow)
5161 {
5162         assert(dev_flow->tcf.tunnel);
5163         pthread_mutex_lock(&vtep_list_mutex);
5164         switch (dev_flow->tcf.tunnel->type) {
5165         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5166                 break;
5167         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5168                 /* Remove the encap ancillary rules first. */
5169                 flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL);
5170                 flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL);
5171                 break;
5172         default:
5173                 assert(false);
5174                 DRV_LOG(WARNING, "Unsupported tunnel type");
5175                 break;
5176         }
5177         assert(vtep->refcnt);
5178         if (--vtep->refcnt == 0) {
5179                 LIST_REMOVE(vtep, next);
5180                 flow_tcf_vtep_delete(tcf, vtep);
5181         }
5182         pthread_mutex_unlock(&vtep_list_mutex);
5183 }
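
/*
 * A minimal sketch of the expected acquire/release pairing (illustrative
 * only; it mirrors flow_tcf_apply() and flow_tcf_remove() below):
 *
 *   dev_flow->tcf.tunnel->vtep =
 *           flow_tcf_vtep_acquire(ctx, ifindex_org, dev_flow, error);
 *   if (!dev_flow->tcf.tunnel->vtep)
 *           return -rte_errno;
 *   ... apply the rule referencing vtep->ifindex ...
 *   flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep, dev_flow);
 *   dev_flow->tcf.tunnel->vtep = NULL;
 */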
5184
5185 struct tcf_nlcb_query {
5186         uint32_t handle;
5187         uint32_t tc_flags;
5188         uint32_t flags_valid:1;
5189 };
5190
5191 /**
5192  * Collect queried rule attributes. This is a callback routine called by
5193  * libmnl mnl_cb_run() in a loop for every message in the received packet.
5194  * Current implementation collects the flower flags only.
5195  *
5196  * @param[in] nlh
5197  *   Pointer to reply header.
5198  * @param[in, out] arg
5199  *   Context pointer for this callback.
5200  *
5201  * @return
5202  *   A positive, nonzero value on success (required by libmnl
5203  *   to continue message processing).
5204  */
5205 static int
5206 flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg)
5207 {
5208         struct tcf_nlcb_query *query = arg;
5209         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5210         struct nlattr *na, *na_opt;
5211         bool flower = false;
5212
5213         if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5214             tcm->tcm_handle != query->handle)
5215                 return 1;
5216         mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5217                 switch (mnl_attr_get_type(na)) {
5218                 case TCA_KIND:
5219                         if (strcmp(mnl_attr_get_payload(na), "flower")) {
5220                                 /* Not flower filter, drop entire message. */
5221                                 return 1;
5222                         }
5223                         flower = true;
5224                         break;
5225                 case TCA_OPTIONS:
5226                         if (!flower) {
5227                                 /* Not flower options, drop entire message. */
5228                                 return 1;
5229                         }
5230                         /* Check nested flower options. */
5231                         mnl_attr_for_each_nested(na_opt, na) {
5232                                 switch (mnl_attr_get_type(na_opt)) {
5233                                 case TCA_FLOWER_FLAGS:
5234                                         query->flags_valid = 1;
5235                                         query->tc_flags =
5236                                                 mnl_attr_get_u32(na_opt);
5237                                         break;
5238                                 }
5239                         }
5240                         break;
5241                 }
5242         }
5243         return 1;
5244 }
5245
5246 /**
5247  * Query the flags of a TC flower rule via netlink.
5248  *
5249  * @param[in] tcf
5250  *   Context object initialized by mlx5_flow_tcf_context_create().
5251  * @param[in] dev_flow
5252  *   Pointer to the flow.
5253  * @param[out] pflags
5254  *   Pointer to the data retrieved by the query.
5255  *
5256  * @return
5257  *   0 on success, a negative errno value otherwise.
5258  */
5259 static int
5260 flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf,
5261                      struct mlx5_flow *dev_flow,
5262                      uint32_t *pflags)
5263 {
5264         struct nlmsghdr *nlh;
5265         struct tcmsg *tcm;
5266         struct tcf_nlcb_query query = {
5267                 .handle = dev_flow->tcf.tcm->tcm_handle,
5268         };
5269
5270         nlh = mnl_nlmsg_put_header(tcf->buf);
5271         nlh->nlmsg_type = RTM_GETTFILTER;
5272         nlh->nlmsg_flags = NLM_F_REQUEST;
5273         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5274         memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm));
5275         /*
5276          * Ignore Netlink error for filter query operations.
5277          * The reply length is sent by the kernel as errno.
5278          * Just check we got the flags option.
5279          */
5280         flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query);
5281         if (!query.flags_valid) {
5282                 *pflags = 0;
5283                 return -ENOENT;
5284         }
5285         *pflags = query.tc_flags;
5286         return 0;
5287 }
5288
5289 /**
5290  * Query and check whether the in_hw flag is set for the specified rule.
5291  *
5292  * @param[in] tcf
5293  *   Context object initialized by mlx5_flow_tcf_context_create().
5294  * @param[in] dev_flow
5295  *   Pointer to the flow to check.
5296  *
5297  * @return
5298  *   0 on success, a negative errno value otherwise.
5299  */
5300 static int
5301 flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf,
5302                     struct mlx5_flow *dev_flow)
5303 {
5304         uint32_t flags;
5305         int ret;
5306
5307         ret = flow_tcf_query_flags(tcf, dev_flow, &flags);
5308         if (ret)
5309                 return ret;
5310         return (flags & TCA_CLS_FLAGS_IN_HW) ? 0 : -ENOENT;
5311 }
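
/*
 * For reference, this check corresponds to inspecting the in_hw/not_in_hw
 * flag the tc tool reports for the rule (illustrative only):
 *
 *   tc filter show dev <ifname> ingress
 */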
5312
5313 /**
5314  * Remove flow from E-Switch by sending Netlink message.
5315  *
5316  * @param[in] dev
5317  *   Pointer to Ethernet device.
5318  * @param[in, out] flow
5319  *   Pointer to the sub flow.
5320  */
5321 static void
5322 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5323 {
5324         struct priv *priv = dev->data->dev_private;
5325         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5326         struct mlx5_flow *dev_flow;
5327         struct nlmsghdr *nlh;
5328         struct tcmsg *tcm;
5329
5330         if (!flow)
5331                 return;
5332         dev_flow = LIST_FIRST(&flow->dev_flows);
5333         if (!dev_flow)
5334                 return;
5335         /* E-Switch flow can't be expanded. */
5336         assert(!LIST_NEXT(dev_flow, next));
5337         if (dev_flow->tcf.applied) {
5338                 nlh = dev_flow->tcf.nlh;
5339                 nlh->nlmsg_type = RTM_DELTFILTER;
5340                 nlh->nlmsg_flags = NLM_F_REQUEST;
5341                 flow_tcf_nl_ack(ctx, nlh, NULL, NULL);
5342                 if (dev_flow->tcf.tunnel) {
5343                         assert(dev_flow->tcf.tunnel->vtep);
5344                         flow_tcf_vtep_release(ctx,
5345                                 dev_flow->tcf.tunnel->vtep,
5346                                 dev_flow);
5347                         dev_flow->tcf.tunnel->vtep = NULL;
5348                 }
5349                 /* Cleanup the rule handle value. */
5350                 tcm = mnl_nlmsg_get_payload(nlh);
5351                 tcm->tcm_handle = 0;
5352                 dev_flow->tcf.applied = 0;
5353         }
5354 }
5355
5356 /**
5357  * Fetch the applied rule handle. This is a callback routine called by
5358  * libmnl mnl_cb_run() in a loop for every message in the received packet.
5359  * When the NLM_F_ECHO flag is specified the kernel sends the created
5360  * rule descriptor back to the application and we can retrieve the
5361  * actual rule handle from the updated descriptor.
5362  *
5363  * @param[in] nlh
5364  *   Pointer to reply header.
5365  * @param[in, out] arg
5366  *   Context pointer for this callback.
5367  *
5368  * @return
5369  *   A positive, nonzero value on success (required by libmnl
5370  *   to continue message processing).
5371  */
5372 static int
5373 flow_tcf_collect_apply_cb(const struct nlmsghdr *nlh, void *arg)
5374 {
5375         struct nlmsghdr *nlhrq = arg;
5376         struct tcmsg *tcmrq = mnl_nlmsg_get_payload(nlhrq);
5377         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5378         struct nlattr *na;
5379
5380         if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5381             nlh->nlmsg_seq != nlhrq->nlmsg_seq)
5382                 return 1;
5383         mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5384                 switch (mnl_attr_get_type(na)) {
5385                 case TCA_KIND:
5386                         if (strcmp(mnl_attr_get_payload(na), "flower")) {
5387                                 /* Not flower filter, drop entire message. */
5388                                 return 1;
5389                         }
5390                         tcmrq->tcm_handle = tcm->tcm_handle;
5391                         return 1;
5392                 }
5393         }
5394         return 1;
5395 }

5396 /**
5397  * Apply flow to E-Switch by sending Netlink message.
5398  *
5399  * @param[in] dev
5400  *   Pointer to Ethernet device.
5401  * @param[in, out] flow
5402  *   Pointer to the sub flow.
5403  * @param[out] error
5404  *   Pointer to the error structure.
5405  *
5406  * @return
5407  *   0 on success, a negative errno value otherwise and rte_errno is set.
5408  */
5409 static int
5410 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5411                struct rte_flow_error *error)
5412 {
5413         struct priv *priv = dev->data->dev_private;
5414         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5415         struct mlx5_flow *dev_flow;
5416         struct nlmsghdr *nlh;
5417         struct tcmsg *tcm;
5418         uint64_t start = 0;
5419         uint64_t twait = 0;
5420         int ret;
5421
5422         dev_flow = LIST_FIRST(&flow->dev_flows);
5423         /* E-Switch flow can't be expanded. */
5424         assert(!LIST_NEXT(dev_flow, next));
5425         if (dev_flow->tcf.applied)
5426                 return 0;
5427         nlh = dev_flow->tcf.nlh;
5428         nlh->nlmsg_type = RTM_NEWTFILTER;
5429         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
5430                            NLM_F_EXCL | NLM_F_ECHO;
5431         tcm = mnl_nlmsg_get_payload(nlh);
5432         /* Allow kernel to assign handle on its own. */
5433         tcm->tcm_handle = 0;
5434         if (dev_flow->tcf.tunnel) {
5435                 /*
5436                  * Replace the interface index, target for
5437                  * encapsulation, source for decapsulation.
5438                  */
5439                 assert(!dev_flow->tcf.tunnel->vtep);
5440                 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5441                 /* Acquire actual VTEP device when rule is being applied. */
5442                 dev_flow->tcf.tunnel->vtep =
5443                         flow_tcf_vtep_acquire(ctx,
5444                                         dev_flow->tcf.tunnel->ifindex_org,
5445                                         dev_flow, error);
5446                 if (!dev_flow->tcf.tunnel->vtep)
5447                         return -rte_errno;
5448                 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5449                                 dev_flow->tcf.tunnel->vtep->ifindex,
5450                                 dev_flow->tcf.tunnel->ifindex_org);
5451                 *dev_flow->tcf.tunnel->ifindex_ptr =
5452                         dev_flow->tcf.tunnel->vtep->ifindex;
5453                 if (dev_flow->tcf.tunnel->vtep->waitreg) {
5454                         /* Clear wait flag for VXLAN port registration. */
5455                         dev_flow->tcf.tunnel->vtep->waitreg = 0;
5456                         twait = rte_get_timer_hz();
5457                         assert(twait > MS_PER_S);
5458                         twait = twait * MLX5_VXLAN_WAIT_PORT_REG_MS;
5459                         twait = twait / MS_PER_S;
5460                         start = rte_get_timer_cycles();
5461                 }
5462         }
5463         /*
5464          * The kernel creates the VXLAN devices and registers the UDP
5465          * ports to be hardware offloaded within the NIC kernel drivers.
5466          * The registration process is performed in the context of a
5467          * working kernel thread and race conditions might happen. The
5468          * VXLAN device is created and success is returned to the calling
5469          * application, but the UDP port registration process may not be
5470          * completed yet. The next applied rule may be rejected by the
5471          * driver with the ENOTSUP code. We are going to wait a bit,
5472          * allowing the registration process to complete. The waiting
5473          * is performed once after the device has been created.
5474          */
5475         do {
5476                 struct timespec onems;
5477
5478                 ret = flow_tcf_nl_ack(ctx, nlh,
5479                                       flow_tcf_collect_apply_cb, nlh);
5480                 if (!ret || ret != -ENOTSUP || !twait)
5481                         break;
5482                 /* Wait one millisecond and try again till timeout. */
5483                 onems.tv_sec = 0;
5484                 onems.tv_nsec = NS_PER_S / MS_PER_S;
5485                 nanosleep(&onems, NULL);
5486                 if ((rte_get_timer_cycles() - start) > twait) {
5487                         /* Timeout elapsed, try once more and exit. */
5488                         twait = 0;
5489                 }
5490         } while (true);
5491         if (!ret) {
5492                 if (!tcm->tcm_handle) {
5493                         flow_tcf_remove(dev, flow);
5494                         return rte_flow_error_set
5495                                 (error, ENOENT,
5496                                  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5497                                  "netlink: rule zero handle returned");
5498                 }
5499                 dev_flow->tcf.applied = 1;
5500                 if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
5501                         return 0;
5502                 /*
5503                  * Rule was applied without skip_sw flag set.
5504                  * We should check whether the rule was actually
5505                  * accepted by the hardware (check the in_hw flag).
5506                  */
5507                 if (flow_tcf_check_inhw(ctx, dev_flow)) {
5508                         flow_tcf_remove(dev, flow);
5509                         return rte_flow_error_set
5510                                 (error, ENOENT,
5511                                  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5512                                  "netlink: rule has no in_hw flag set");
5513                 }
5514                 return 0;
5515         }
5516         if (dev_flow->tcf.tunnel) {
5517                 /* Rollback the VTEP configuration if rule apply failed. */
5518                 assert(dev_flow->tcf.tunnel->vtep);
5519                 flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
5520                                       dev_flow);
5521                 dev_flow->tcf.tunnel->vtep = NULL;
5522         }
5523         return rte_flow_error_set(error, rte_errno,
5524                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5525                                   "netlink: failed to create TC flow rule");
5526 }
5527
5528 /**
5529  * Remove flow from E-Switch and release resources of the device flow.
5530  *
5531  * @param[in] dev
5532  *   Pointer to Ethernet device.
5533  * @param[in, out] flow
5534  *   Pointer to the sub flow.
5535  */
5536 static void
5537 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5538 {
5539         struct mlx5_flow *dev_flow;
5540
5541         if (!flow)
5542                 return;
5543         flow_tcf_remove(dev, flow);
5544         if (flow->counter) {
5545                 if (--flow->counter->ref_cnt == 0) {
5546                         rte_free(flow->counter);
5547                         flow->counter = NULL;
5548                 }
5549         }
5550         dev_flow = LIST_FIRST(&flow->dev_flows);
5551         if (!dev_flow)
5552                 return;
5553         /* E-Switch flow can't be expanded. */
5554         assert(!LIST_NEXT(dev_flow, next));
5555         LIST_REMOVE(dev_flow, next);
5556         rte_free(dev_flow);
5557 }
5558
5559 /**
5560  * Helper routine to figure out the space size required for a parse buffer.
5561  *
5562  * @param array
5563  *   Array of values to use.
5564  * @param idx
5565  *   Current location in array.
5566  * @param value
5567  *   Value to compare with.
5568  *
5569  * @return
5570  *   The maximum between the given value and the array value on index.
5571  */
5572 static uint16_t
5573 flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
5574 {
5575         return idx < 0 ? value : RTE_MAX(array[idx], value);
5576 }
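
/*
 * Example (illustrative): this helper sizes the variable-length attribute
 * tables used by the parsers below, e.g. for the top level of a filter
 * message, with rta_type[] and idx following the conventions of
 * flow_tcf_nl_filter_parse_and_get():
 *
 *   int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
 *   struct rtattr *tb[tca_max + 1];
 */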
5577
5578 /**
5579  * Parse rtnetlink message attributes filling the attribute table with the info
5580  * retrieved.
5581  *
5582  * @param tb
5583  *   Attribute table to be filled.
5584  * @param[out] max
5585  *   Maximum entry in the attribute table.
5586  * @param rta
5587  *   The attributes section in the message to be parsed.
5588  * @param len
5589  *   The length of the attributes section in the message.
5590  */
5591 static void
5592 flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
5593                          struct rtattr *rta, int len)
5594 {
5595         unsigned short type;
5596         memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
5597         while (RTA_OK(rta, len)) {
5598                 type = rta->rta_type;
5599                 if (type <= max && !tb[type])
5600                         tb[type] = rta;
5601                 rta = RTA_NEXT(rta, len);
5602         }
5603 }
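
/*
 * A minimal usage sketch (illustrative only; "t" is a tcmsg payload and
 * "len" the remaining attribute length, computed as in
 * flow_tcf_nl_filter_parse_and_get() below):
 *
 *   struct rtattr *tb[TCA_MAX + 1];
 *
 *   flow_tcf_nl_parse_rtattr(tb, TCA_MAX, TCA_RTA(t), len);
 *   if (tb[TCA_KIND] && !strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
 *           ... the message describes a flower classifier ...
 */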
5604
5605 /**
5606  * Extract flow counters from flower action.
5607  *
5608  * @param rta
5609  *   flower action stats properties in the Netlink message received.
5610  * @param rta_type
5611  *   The backward sequence of rta_types, as written in the attribute table,
5612  *   that we need to traverse in order to reach the requested object.
5613  * @param idx
5614  *   Current location in rta_type table.
5615  * @param[out] data
5616  *   data holding the count statistics of the rte_flow retrieved from
5617  *   the message.
5618  *
5619  * @return
5620  *   0 if data was found and retrieved, -1 otherwise.
5621  */
5622 static int
5623 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
5624                                        uint16_t rta_type[], int idx,
5625                                        struct gnet_stats_basic *data)
5626 {
5627         int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
5628                                                  TCA_STATS_BASIC);
5629         struct rtattr *tbs[tca_stats_max + 1];
5630
5631         if (rta == NULL || idx < 0)
5632                 return -1;
5633         flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
5634                                  RTA_DATA(rta), RTA_PAYLOAD(rta));
5635         switch (rta_type[idx]) {
5636         case TCA_STATS_BASIC:
5637                 if (tbs[TCA_STATS_BASIC]) {
5638                         memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
5639                                RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
5640                                sizeof(*data)));
5641                         return 0;
5642                 }
5643                 break;
5644         default:
5645                 break;
5646         }
5647         return -1;
5648 }
5649
5650 /**
5651  * Parse flower single action retrieving the requested action attribute,
5652  * if found.
5653  *
5654  * @param arg
5655  *   flower action properties in the Netlink message received.
5656  * @param rta_type
5657  *   The backward sequence of rta_types, as written in the attribute table,
5658  *   that we need to traverse in order to reach the requested object.
5659  * @param idx
5660  *   Current location in rta_type table.
5661  * @param[out] data
5662  *   Count statistics retrieved from the message query.
5663  *
5664  * @return
5665  *   0 if data was found and retrieved, -1 otherwise.
5666  */
5667 static int
5668 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
5669                                      uint16_t rta_type[], int idx, void *data)
5670 {
5671         int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
5672         struct rtattr *tb[tca_act_max + 1];
5673
5674         if (arg == NULL || idx < 0)
5675                 return -1;
5676         flow_tcf_nl_parse_rtattr(tb, tca_act_max,
5677                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5678         if (tb[TCA_ACT_KIND] == NULL)
5679                 return -1;
5680         switch (rta_type[idx]) {
5681         case TCA_ACT_STATS:
5682                 if (tb[TCA_ACT_STATS])
5683                         return flow_tcf_nl_action_stats_parse_and_get
5684                                         (tb[TCA_ACT_STATS],
5685                                          rta_type, --idx,
5686                                          (struct gnet_stats_basic *)data);
5687                 break;
5688         default:
5689                 break;
5690         }
5691         return -1;
5692 }
5693
5694 /**
5695  * Parse flower action section in the message retrieving the requested
5696  * attribute from the first action that provides it.
5697  *
5698  * @param arg
5699  *   flower section in the Netlink message received.
5700  * @param rta_type
5701  *   The backward sequence of rta_types, as written in the attribute table,
5702  *   that we need to traverse in order to reach the requested object.
5703  * @param idx
5704  *   Current location in rta_type table.
5705  * @param[out] data
5706  *   data retrieved from the message query.
5707  *
5708  * @return
5709  *   0 if data was found and retrieved, -1 otherwise.
5710  */
5711 static int
5712 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
5713                                  uint16_t rta_type[], int idx, void *data)
5714 {
5715         struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
5716         int i;
5717
5718         if (arg == NULL || idx < 0)
5719                 return -1;
5720         flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
5721                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5722         switch (rta_type[idx]) {
5723         /*
5724          * flow counters are stored in the actions defined by the flow
5725          * and not in the flow itself, therefore we need to traverse the
5726          * flower chain of actions in search of them.
5727          *
5728          * Note that the index is not decremented here.
5729          */
5730         case TCA_ACT_STATS:
5731                 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
5732                         if (tb[i] &&
5733                         !flow_tcf_nl_parse_one_action_and_get(tb[i],
5734                                                               rta_type,
5735                                                               idx, data))
5736                                 return 0;
5737                 }
5738                 break;
5739         default:
5740                 break;
5741         }
5742         return -1;
5743 }
5744
5745 /**
5746  * Parse flower classifier options in the message, retrieving the requested
5747  * attribute if found.
5748  *
5749  * @param opt
5750  *   flower section in the Netlink message received.
5751  * @param rta_type
5752  *   The backward sequence of rta_types, as written in the attribute table,
5753  *   that we need to traverse in order to reach the requested object.
5754  * @param idx
5755  *   Current location in rta_type table.
5756  * @param[out] data
5757  *   data retrieved from the message query.
5758  *
5759  * @return
5760  *   0 if data was found and retrieved, -1 otherwise.
5761  */
5762 static int
5763 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
5764                                uint16_t rta_type[], int idx, void *data)
5765 {
5766         int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
5767                                                   TCA_FLOWER_ACT);
5768         struct rtattr *tb[tca_flower_max + 1];
5769
5770         if (!opt || idx < 0)
5771                 return -1;
5772         flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
5773                                  RTA_DATA(opt), RTA_PAYLOAD(opt));
5774         switch (rta_type[idx]) {
5775         case TCA_FLOWER_ACT:
5776                 if (tb[TCA_FLOWER_ACT])
5777                         return flow_tcf_nl_action_parse_and_get
5778                                                         (tb[TCA_FLOWER_ACT],
5779                                                          rta_type, --idx, data);
5780                 break;
5781         default:
5782                 break;
5783         }
5784         return -1;
5785 }
5786
5787 /**
5788  * Parse Netlink reply on filter query, retrieving the flow counters.
5789  *
5790  * @param cnlh
5791  *   Message received from Netlink.
5792  * @param rta_type
5793  *   The backward sequence of rta_types, as written in the attribute table,
5794  *   that we need to traverse in order to reach the requested object.
5795  * @param idx
5796  *   Current location in rta_type table.
5797  * @param[out] data
5798  *   data retrieved from the message query.
5799  *
5800  * @return
5801  *   0 if data was found and retrieved, -1 otherwise.
5802  */
5803 static int
5804 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
5805                                  uint16_t rta_type[], int idx, void *data)
5806 {
5807         struct nlmsghdr *nlh = cnlh;
5808         struct tcmsg *t = NLMSG_DATA(nlh);
5809         int len = nlh->nlmsg_len;
5810         int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
5811         struct rtattr *tb[tca_max + 1];
5812
5813         if (idx < 0)
5814                 return -1;
5815         if (nlh->nlmsg_type != RTM_NEWTFILTER &&
5816             nlh->nlmsg_type != RTM_GETTFILTER &&
5817             nlh->nlmsg_type != RTM_DELTFILTER)
5818                 return -1;
5819         len -= NLMSG_LENGTH(sizeof(*t));
5820         if (len < 0)
5821                 return -1;
5822         flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
5823         /* Not a TC flower flow - bail out */
5824         if (!tb[TCA_KIND] ||
5825             strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
5826                 return -1;
5827         switch (rta_type[idx]) {
5828         case TCA_OPTIONS:
5829                 if (tb[TCA_OPTIONS])
5830                         return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
5831                                                               rta_type,
5832                                                               --idx, data);
5833                 break;
5834         default:
5835                 break;
5836         }
5837         return -1;
5838 }
5839
5840 /**
5841  * A callback to parse Netlink reply on TC flower query.
5842  *
5843  * @param nlh
5844  *   Message received from Netlink.
5845  * @param[out] data
5846  *   Pointer to data area to be filled by the parsing routine.
5847  *   Assumed to be a pointer to struct flow_tcf_stats_basic.
5848  *
5849  * @return
5850  *   MNL_CB_OK value.
5851  */
5852 static int
5853 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
5854 {
5855         /*
5856          * The backward sequence of rta_types to pass in order to get
5857          * to the counters.
5858          */
5859         uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
5860                                 TCA_FLOWER_ACT, TCA_OPTIONS };
5861         struct flow_tcf_stats_basic *sb_data = data;
5862         union {
5863                 const struct nlmsghdr *c;
5864                 struct nlmsghdr *nc;
5865         } tnlh = { .c = nlh };
5866
5867         if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
5868                                               RTE_DIM(rta_type) - 1,
5869                                               (void *)&sb_data->counters))
5870                 sb_data->valid = true;
5871         return MNL_CB_OK;
5872 }
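
/*
 * For the rta_type[] chain above the parsers descend as follows, the index
 * decreasing at each nesting level (a summary of the call chain
 * flow_tcf_nl_filter_parse_and_get() -> flow_tcf_nl_opts_parse_and_get()
 * -> flow_tcf_nl_action_parse_and_get() -> ... described above):
 *
 *   idx 3: TCA_OPTIONS     - flower options of the filter message,
 *   idx 2: TCA_FLOWER_ACT  - action list within the flower options,
 *   idx 1: TCA_ACT_STATS   - statistics of an individual action,
 *   idx 0: TCA_STATS_BASIC - basic packet/byte counters.
 */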
5873
5874 /**
5875  * Query a TC flower rule for its statistics via netlink.
5876  *
5877  * @param[in] dev
5878  *   Pointer to Ethernet device.
5879  * @param[in] flow
5880  *   Pointer to the sub flow.
5881  * @param[out] data
5882  *   data retrieved by the query.
5883  * @param[out] error
5884  *   Perform verbose error reporting if not NULL.
5885  *
5886  * @return
5887  *   0 on success, a negative errno value otherwise and rte_errno is set.
5888  */
5889 static int
5890 flow_tcf_query_count(struct rte_eth_dev *dev,
5891                           struct rte_flow *flow,
5892                           void *data,
5893                           struct rte_flow_error *error)
5894 {
5895         struct flow_tcf_stats_basic sb_data;
5896         struct rte_flow_query_count *qc = data;
5897         struct priv *priv = dev->data->dev_private;
5898         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5899         struct mnl_socket *nl = ctx->nl;
5900         struct mlx5_flow *dev_flow;
5901         struct nlmsghdr *nlh;
5902         uint32_t seq = priv->tcf_context->seq++;
5903         ssize_t ret;
5904         assert(qc);
5905
5906         memset(&sb_data, 0, sizeof(sb_data));
5907         dev_flow = LIST_FIRST(&flow->dev_flows);
5908         /* E-Switch flow can't be expanded. */
5909         assert(!LIST_NEXT(dev_flow, next));
5910         if (!dev_flow->flow->counter)
5911                 goto notsup_exit;
5912         nlh = dev_flow->tcf.nlh;
5913         nlh->nlmsg_type = RTM_GETTFILTER;
5914         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
5915         nlh->nlmsg_seq = seq;
5916         if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
5917                 goto error_exit;
5918         do {
5919                 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
5920                 if (ret <= 0)
5921                         break;
5922                 ret = mnl_cb_run(ctx->buf, ret, seq,
5923                                  mnl_socket_get_portid(nl),
5924                                  flow_tcf_nl_message_get_stats_basic,
5925                                  (void *)&sb_data);
5926         } while (ret > 0);
5928         if (sb_data.valid) {
5929                 /* Return the delta from last reset. */
5930                 qc->hits_set = 1;
5931                 qc->bytes_set = 1;
5932                 qc->hits = sb_data.counters.packets - flow->counter->hits;
5933                 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
5934                 if (qc->reset) {
5935                         flow->counter->hits = sb_data.counters.packets;
5936                         flow->counter->bytes = sb_data.counters.bytes;
5937                 }
5938                 return 0;
5939         }
5940         return rte_flow_error_set(error, EINVAL,
5941                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5942                                   NULL,
5943                                   "flow does not have counter");
5944 error_exit:
5945         return rte_flow_error_set
5946                         (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5947                          NULL, "netlink: failed to read flow rule counters");
5948 notsup_exit:
5949         return rte_flow_error_set
5950                         (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5951                          NULL, "counters are not available");
5952 }
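
/*
 * For reference, the counters retrieved by the query above are the same
 * statistics the tc tool reports (illustrative only):
 *
 *   tc -s filter show dev <ifname> ingress
 */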
5953
5954 /**
5955  * Query a flow.
5956  *
5957  * @see rte_flow_query()
5958  * @see rte_flow_ops
5959  */
5960 static int
5961 flow_tcf_query(struct rte_eth_dev *dev,
5962                struct rte_flow *flow,
5963                const struct rte_flow_action *actions,
5964                void *data,
5965                struct rte_flow_error *error)
5966 {
5967         int ret = -EINVAL;
5968
5969         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
5970                 switch (actions->type) {
5971                 case RTE_FLOW_ACTION_TYPE_VOID:
5972                         break;
5973                 case RTE_FLOW_ACTION_TYPE_COUNT:
5974                         ret = flow_tcf_query_count(dev, flow, data, error);
5975                         break;
5976                 default:
5977                         return rte_flow_error_set(error, ENOTSUP,
5978                                                   RTE_FLOW_ERROR_TYPE_ACTION,
5979                                                   actions,
5980                                                   "action not supported");
5981                 }
5982         }
5983         return ret;
5984 }
5985
5986 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
5987         .validate = flow_tcf_validate,
5988         .prepare = flow_tcf_prepare,
5989         .translate = flow_tcf_translate,
5990         .apply = flow_tcf_apply,
5991         .remove = flow_tcf_remove,
5992         .destroy = flow_tcf_destroy,
5993         .query = flow_tcf_query,
5994 };
5995
5996 /**
5997  * Create and configure a libmnl socket for Netlink flow rules.
5998  *
5999  * @return
6000  *   A valid libmnl socket object pointer on success, NULL otherwise and
6001  *   rte_errno is set.
6002  */
6003 static struct mnl_socket *
6004 flow_tcf_mnl_socket_create(void)
6005 {
6006         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
6007
6008         if (nl) {
6009                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
6010                                       sizeof(int));
6011                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
6012                         return nl;
6013         }
6014         rte_errno = errno;
6015         if (nl)
6016                 mnl_socket_close(nl);
6017         return NULL;
6018 }
6019
6020 /**
6021  * Destroy a libmnl socket.
6022  *
6023  * @param nl
6024  *   Libmnl socket of the @p NETLINK_ROUTE kind.
6025  */
6026 static void
6027 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
6028 {
6029         if (nl)
6030                 mnl_socket_close(nl);
6031 }
6032
6033 /**
6034  * Initialize ingress qdisc of a given network interface.
6035  *
6036  * @param ctx
6037  *   Pointer to tc-flower context to use.
6038  * @param ifindex
6039  *   Index of network interface to initialize.
6040  * @param[out] error
6041  *   Perform verbose error reporting if not NULL.
6042  *
6043  * @return
6044  *   0 on success, a negative errno value otherwise and rte_errno is set.
6045  */
6046 int
6047 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
6048                    unsigned int ifindex, struct rte_flow_error *error)
6049 {
6050         struct nlmsghdr *nlh;
6051         struct tcmsg *tcm;
6052         alignas(struct nlmsghdr)
6053         uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
6054                     SZ_NLATTR_STRZ_OF("ingress") +
6055                     MNL_BUF_EXTRA_SPACE];
6056
6057         /* Destroy existing ingress qdisc and everything attached to it. */
6058         nlh = mnl_nlmsg_put_header(buf);
6059         nlh->nlmsg_type = RTM_DELQDISC;
6060         nlh->nlmsg_flags = NLM_F_REQUEST;
6061         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6062         tcm->tcm_family = AF_UNSPEC;
6063         tcm->tcm_ifindex = ifindex;
6064         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6065         tcm->tcm_parent = TC_H_INGRESS;
6066         assert(sizeof(buf) >= nlh->nlmsg_len);
6067         /* Ignore errors when qdisc is already absent. */
6068         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) &&
6069             rte_errno != EINVAL && rte_errno != ENOENT)
6070                 return rte_flow_error_set(error, rte_errno,
6071                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6072                                           "netlink: failed to remove ingress"
6073                                           " qdisc");
6074         /* Create fresh ingress qdisc. */
6075         nlh = mnl_nlmsg_put_header(buf);
6076         nlh->nlmsg_type = RTM_NEWQDISC;
6077         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
6078         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6079         tcm->tcm_family = AF_UNSPEC;
6080         tcm->tcm_ifindex = ifindex;
6081         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6082         tcm->tcm_parent = TC_H_INGRESS;
6083         mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
6084         assert(sizeof(buf) >= nlh->nlmsg_len);
6085         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL))
6086                 return rte_flow_error_set(error, rte_errno,
6087                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6088                                           "netlink: failed to create ingress"
6089                                           " qdisc");
6090         return 0;
6091 }
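
/*
 * For reference, the two requests above are roughly equivalent to the
 * following tc commands (illustrative only):
 *
 *   tc qdisc del dev <ifname> ingress
 *   tc qdisc add dev <ifname> ingress
 */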
6092
6093 /**
6094  * Create libmnl context for Netlink flow rules.
6095  *
6096  * @return
6097  *   A valid tc-flower context object pointer on success, NULL otherwise and
6098  *   rte_errno is set.
6099  */
6100 struct mlx5_flow_tcf_context *
6101 mlx5_flow_tcf_context_create(void)
6102 {
6103         struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
6104                                                         sizeof(*ctx),
6105                                                         sizeof(uint32_t));
6106         if (!ctx)
6107                 goto error;
6108         ctx->nl = flow_tcf_mnl_socket_create();
6109         if (!ctx->nl)
6110                 goto error;
6111         ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
6112         ctx->buf = rte_zmalloc(__func__,
6113                                ctx->buf_size, sizeof(uint32_t));
6114         if (!ctx->buf)
6115                 goto error;
6116         ctx->seq = random();
6117         return ctx;
6118 error:
6119         mlx5_flow_tcf_context_destroy(ctx);
6120         return NULL;
6121 }
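
/*
 * A minimal lifetime sketch for the context (illustrative only):
 *
 *   struct mlx5_flow_tcf_context *ctx = mlx5_flow_tcf_context_create();
 *
 *   if (!ctx)
 *           ... handle the failure ...
 *   ... use ctx with mlx5_flow_tcf_init() and the rule helpers above ...
 *   mlx5_flow_tcf_context_destroy(ctx);
 */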
6122
6123 /**
6124  * Destroy a libmnl context.
6125  *
6126  * @param ctx
6127  *   Context object embedding the libmnl socket of the @p NETLINK_ROUTE kind.
6128  */
6129 void
6130 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
6131 {
6132         if (!ctx)
6133                 return;
6134         flow_tcf_mnl_socket_destroy(ctx->nl);
6135         rte_free(ctx->buf);
6136         rte_free(ctx);
6137 }