/*-
 *   BSD LICENSE
 *
 *   Copyright 2017 6WIND S.A.
 *   Copyright 2017 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
33
34 #include <errno.h>
35 #include <string.h>
36 #include <sys/queue.h>
37
38 #include <rte_byteorder.h>
39 #include <rte_jhash.h>
40 #include <rte_malloc.h>
41 #include <rte_eth_tap.h>
42 #include <tap_flow.h>
43 #include <tap_autoconf.h>
44 #include <tap_tcmsgs.h>
45
46 #ifndef HAVE_TC_FLOWER
47 /*
48  * For kernels < 4.2, this enum is not defined. Runtime checks will be made to
49  * avoid sending TC messages the kernel cannot understand.
50  */
51 enum {
52         TCA_FLOWER_UNSPEC,
53         TCA_FLOWER_CLASSID,
54         TCA_FLOWER_INDEV,
55         TCA_FLOWER_ACT,
56         TCA_FLOWER_KEY_ETH_DST,         /* ETH_ALEN */
57         TCA_FLOWER_KEY_ETH_DST_MASK,    /* ETH_ALEN */
58         TCA_FLOWER_KEY_ETH_SRC,         /* ETH_ALEN */
59         TCA_FLOWER_KEY_ETH_SRC_MASK,    /* ETH_ALEN */
60         TCA_FLOWER_KEY_ETH_TYPE,        /* be16 */
61         TCA_FLOWER_KEY_IP_PROTO,        /* u8 */
62         TCA_FLOWER_KEY_IPV4_SRC,        /* be32 */
63         TCA_FLOWER_KEY_IPV4_SRC_MASK,   /* be32 */
64         TCA_FLOWER_KEY_IPV4_DST,        /* be32 */
65         TCA_FLOWER_KEY_IPV4_DST_MASK,   /* be32 */
66         TCA_FLOWER_KEY_IPV6_SRC,        /* struct in6_addr */
67         TCA_FLOWER_KEY_IPV6_SRC_MASK,   /* struct in6_addr */
68         TCA_FLOWER_KEY_IPV6_DST,        /* struct in6_addr */
69         TCA_FLOWER_KEY_IPV6_DST_MASK,   /* struct in6_addr */
70         TCA_FLOWER_KEY_TCP_SRC,         /* be16 */
71         TCA_FLOWER_KEY_TCP_DST,         /* be16 */
72         TCA_FLOWER_KEY_UDP_SRC,         /* be16 */
73         TCA_FLOWER_KEY_UDP_DST,         /* be16 */
74 };
75 #endif
76 #ifndef HAVE_TC_VLAN_ID
77 enum {
78         /* TCA_FLOWER_FLAGS, */
79         TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2, /* be16 */
80         TCA_FLOWER_KEY_VLAN_PRIO,       /* u8   */
81         TCA_FLOWER_KEY_VLAN_ETH_TYPE,   /* be16 */
82 };
83 #endif

/* Reserved TC filter handle used for the implicit isolate rule. */
#define ISOLATE_HANDLE 1

struct rte_flow {
        LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
        struct rte_flow *remote_flow; /* associated remote flow */
        struct nlmsg msg;
};

struct convert_data {
        uint16_t eth_type; /* ethertype seen in the ETH item, for later checks */
        uint16_t ip_proto; /* L4 protocol seen in the IPV4/IPV6 item */
        uint8_t vlan; /* set whenever a VLAN item is present */
        struct rte_flow *flow; /* flow to fill, NULL when only validating */
};

struct remote_rule {
        struct rte_flow_attr attr;
        struct rte_flow_item items[2];
        struct rte_flow_action actions[2];
        int mirred; /* TCA_EGRESS_REDIR/MIRROR, or 0 for no mirred action */
};

static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);

static int
tap_flow_validate(struct rte_eth_dev *dev,
                  const struct rte_flow_attr *attr,
                  const struct rte_flow_item items[],
                  const struct rte_flow_action actions[],
                  struct rte_flow_error *error);

static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
                const struct rte_flow_attr *attr,
                const struct rte_flow_item items[],
                const struct rte_flow_action actions[],
                struct rte_flow_error *error);

static int
tap_flow_destroy(struct rte_eth_dev *dev,
                 struct rte_flow *flow,
                 struct rte_flow_error *error);

static int
tap_flow_isolate(struct rte_eth_dev *dev,
                 int set,
                 struct rte_flow_error *error);

static const struct rte_flow_ops tap_flow_ops = {
        .validate = tap_flow_validate,
        .create = tap_flow_create,
        .destroy = tap_flow_destroy,
        .flush = tap_flow_flush,
        .isolate = tap_flow_isolate,
};

/* Static initializer for items. */
#define ITEMS(...) \
        (const enum rte_flow_item_type []){ \
                __VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
        }
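/*
 * For example, ITEMS(RTE_FLOW_ITEM_TYPE_IPV4, RTE_FLOW_ITEM_TYPE_IPV6)
 * expands to the compound literal { IPV4, IPV6, END }.
 */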

/* Structure to generate a simple graph of layers supported by the NIC. */
struct tap_flow_items {
        /* Bit-mask corresponding to what is supported for this item. */
        const void *mask;
        const unsigned int mask_sz; /* Bit-mask size in bytes. */
        /*
         * Bit-mask corresponding to the default mask, if none is provided
         * along with the item.
         */
        const void *default_mask;
        /**
         * Conversion function from rte_flow to netlink attributes.
         *
         * @param item
         *   rte_flow item to convert.
         * @param data
         *   Internal structure to store the conversion.
         *
         * @return
         *   0 on success, negative value otherwise.
         */
        int (*convert)(const struct rte_flow_item *item, void *data);
        /** List of possible following items. */
        const enum rte_flow_item_type *const items;
};

/* Graph of supported items and associated actions. */
static const struct tap_flow_items tap_flow_items[] = {
        [RTE_FLOW_ITEM_TYPE_END] = {
                .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
        },
        [RTE_FLOW_ITEM_TYPE_ETH] = {
                .items = ITEMS(
                        RTE_FLOW_ITEM_TYPE_VLAN,
                        RTE_FLOW_ITEM_TYPE_IPV4,
                        RTE_FLOW_ITEM_TYPE_IPV6),
                .mask = &(const struct rte_flow_item_eth){
                        .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        .type = -1,
                },
                .mask_sz = sizeof(struct rte_flow_item_eth),
                .default_mask = &rte_flow_item_eth_mask,
                .convert = tap_flow_create_eth,
        },
        [RTE_FLOW_ITEM_TYPE_VLAN] = {
                .items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
                               RTE_FLOW_ITEM_TYPE_IPV6),
                .mask = &(const struct rte_flow_item_vlan){
                        .tpid = -1,
                        /* DEI matching is not supported */
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
                        .tci = 0xffef,
#else
                        .tci = 0xefff,
#endif
                },
                .mask_sz = sizeof(struct rte_flow_item_vlan),
                .default_mask = &rte_flow_item_vlan_mask,
                .convert = tap_flow_create_vlan,
        },
        [RTE_FLOW_ITEM_TYPE_IPV4] = {
                .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
                               RTE_FLOW_ITEM_TYPE_TCP),
                .mask = &(const struct rte_flow_item_ipv4){
                        .hdr = {
                                .src_addr = -1,
                                .dst_addr = -1,
                                .next_proto_id = -1,
                        },
                },
                .mask_sz = sizeof(struct rte_flow_item_ipv4),
                .default_mask = &rte_flow_item_ipv4_mask,
                .convert = tap_flow_create_ipv4,
        },
        [RTE_FLOW_ITEM_TYPE_IPV6] = {
                .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
                               RTE_FLOW_ITEM_TYPE_TCP),
                .mask = &(const struct rte_flow_item_ipv6){
                        .hdr = {
                                .src_addr = {
                                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                                },
                                .dst_addr = {
                                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                                },
                                .proto = -1,
                        },
                },
                .mask_sz = sizeof(struct rte_flow_item_ipv6),
                .default_mask = &rte_flow_item_ipv6_mask,
                .convert = tap_flow_create_ipv6,
        },
        [RTE_FLOW_ITEM_TYPE_UDP] = {
                .mask = &(const struct rte_flow_item_udp){
                        .hdr = {
                                .src_port = -1,
                                .dst_port = -1,
                        },
                },
                .mask_sz = sizeof(struct rte_flow_item_udp),
                .default_mask = &rte_flow_item_udp_mask,
                .convert = tap_flow_create_udp,
        },
        [RTE_FLOW_ITEM_TYPE_TCP] = {
                .mask = &(const struct rte_flow_item_tcp){
                        .hdr = {
                                .src_port = -1,
                                .dst_port = -1,
                        },
                },
                .mask_sz = sizeof(struct rte_flow_item_tcp),
                .default_mask = &rte_flow_item_tcp_mask,
                .convert = tap_flow_create_tcp,
        },
};

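/*
 * Pattern validation walks the graph above starting from the END entry:
 * an ETH item must come first, optionally followed by VLAN, then IPV4 or
 * IPV6, then UDP or TCP.
 */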
/*
 *                TC rules, by growing priority
 *
 *        Remote netdevice                  Tap netdevice
 * +-------------+-------------+  +-------------+-------------+
 * |   Ingress   |   Egress    |  |   Ingress   |   Egress    |
 * |-------------|-------------|  |-------------|-------------|
 * |             |  \       /  |  |             |  REMOTE TX  | prio 1
 * |             |   \     /   |  |             |   \     /   | prio 2
 * |  EXPLICIT   |    \   /    |  |  EXPLICIT   |    \   /    |   .
 * |             |     \ /     |  |             |     \ /     |   .
 * |    RULES    |      X      |  |    RULES    |      X      |   .
 * |      .      |     / \     |  |      .      |     / \     |   .
 * |      .      |    /   \    |  |      .      |    /   \    |   .
 * |      .      |   /     \   |  |      .      |   /     \   |   .
 * |      .      |  /       \  |  |      .      |  /       \  |   .
 *
 *      ....           ....           ....           ....
 *
 * |      .      |  \       /  |  |      .      |  \       /  |   .
 * |      .      |   \     /   |  |      .      |   \     /   |   .
 * |             |    \   /    |  |             |    \   /    |
 * |  LOCAL_MAC  |     \ /     |  |    \   /    |     \ /     | last prio - 5
 * |   PROMISC   |      X      |  |     \ /     |      X      | last prio - 4
 * |   ALLMULTI  |     / \     |  |      X      |     / \     | last prio - 3
 * |  BROADCAST  |    /   \    |  |     / \     |    /   \    | last prio - 2
 * | BROADCASTV6 |   /     \   |  |    /   \    |   /     \   | last prio - 1
 * |     xx      |  /       \  |  |   ISOLATE   |  /       \  | last prio
 * +-------------+-------------+  +-------------+-------------+
 *
 * The implicit flow rules are stored in a list, and the last two entries
 * must be the REMOTE_TX and ISOLATE rules, e.g.:
 *
 * LOCAL_MAC -> BROADCAST -> BROADCASTV6 -> REMOTE_TX -> ISOLATE -> NULL
 *
 * That allows tap_flow_isolate() to remove implicit rules by popping the list
 * head and removing it, as long as it applies to the remote netdevice. The
 * implicit rule for TX redirection is not removed, as isolate only concerns
 * incoming traffic.
 */

static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = {
        [TAP_REMOTE_LOCAL_MAC] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                        .mask = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        },
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_REDIR,
        },
        [TAP_REMOTE_BROADCAST] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                        .mask = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        },
                        .spec = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        },
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_REMOTE_BROADCASTV6] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                        .mask = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
                        },
                        .spec = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
                        },
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_REMOTE_PROMISC] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_PROMISC,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_VOID,
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_REMOTE_ALLMULTI] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                        .mask = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
                        },
                        .spec = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
                        },
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_REMOTE_TX] = {
                .attr = {
                        .group = 0,
                        .priority = TAP_REMOTE_TX,
                        .egress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_VOID,
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_ISOLATE] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_ISOLATE,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_VOID,
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
        },
};

/**
 * Make as many checks as possible on an Ethernet item, and if a flow is
 * provided, fill it appropriately with Ethernet info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks pass, -1 otherwise.
 */
static int
tap_flow_create_eth(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_eth *spec = item->spec;
        const struct rte_flow_item_eth *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
        /* TC does not support eth_type masking. Only accept if exact match. */
        if (mask->type && mask->type != 0xffff)
                return -1;
        if (!spec)
                return 0;
        /* store eth_type for consistency if ipv4/6 pattern item comes next */
        if (spec->type & mask->type)
                info->eth_type = spec->type;
        if (!flow)
                return 0;
        msg = &flow->msg;
        if (!is_zero_ether_addr(&mask->dst)) {
                nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
                           &spec->dst.addr_bytes);
                nlattr_add(&msg->nh,
                           TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN,
                           &mask->dst.addr_bytes);
        }
        if (!is_zero_ether_addr(&mask->src)) {
                nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
                           &spec->src.addr_bytes);
                nlattr_add(&msg->nh,
                           TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
                           &mask->src.addr_bytes);
        }
        return 0;
}

/**
 * Make as many checks as possible on a VLAN item, and if a flow is provided,
 * fill it appropriately with VLAN info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks pass, -1 otherwise.
 */
static int
tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_vlan *spec = item->spec;
        const struct rte_flow_item_vlan *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
        /* TC does not support tpid masking. Only accept if exact match. */
        if (mask->tpid && mask->tpid != 0xffff)
                return -1;
        /* Double-tagging not supported. */
        if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
                return -1;
        info->vlan = 1;
        if (!flow)
                return 0;
        msg = &flow->msg;
        msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
#define VLAN_PRIO(tci) ((tci) >> 13)
#define VLAN_ID(tci) ((tci) & 0xfff)
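/*
 * TCI layout: PCP (3 bits) | DEI (1 bit) | VID (12 bits). DEI matching is
 * not supported, hence the 0xefff (big endian) mask in tap_flow_items.
 */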
        if (!spec)
                return 0;
        if (spec->tci) {
                uint16_t tci = ntohs(spec->tci) & mask->tci;
                uint16_t prio = VLAN_PRIO(tci);
                uint16_t vid = VLAN_ID(tci);

                if (prio)
                        nlattr_add8(&msg->nh, TCA_FLOWER_KEY_VLAN_PRIO, prio);
                if (vid)
                        nlattr_add16(&msg->nh, TCA_FLOWER_KEY_VLAN_ID, vid);
        }
        return 0;
}

/**
 * Make as many checks as possible on an IPv4 item, and if a flow is provided,
 * fill it appropriately with IPv4 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks pass, -1 otherwise.
 */
static int
tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_ipv4 *spec = item->spec;
        const struct rte_flow_item_ipv4 *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
        /* check that previous eth type is compatible with ipv4 */
        if (info->eth_type && info->eth_type != htons(ETH_P_IP))
                return -1;
        /* store ip_proto for consistency if udp/tcp pattern item comes next */
        if (spec)
                info->ip_proto = spec->hdr.next_proto_id;
        if (!flow)
                return 0;
        msg = &flow->msg;
        if (!info->eth_type)
                info->eth_type = htons(ETH_P_IP);
        if (!spec)
                return 0;
        if (mask->hdr.dst_addr) {
                nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
                             spec->hdr.dst_addr);
                nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
                             mask->hdr.dst_addr);
        }
        if (mask->hdr.src_addr) {
                nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
                             spec->hdr.src_addr);
                nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
                             mask->hdr.src_addr);
        }
        if (spec->hdr.next_proto_id)
                nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
                            spec->hdr.next_proto_id);
        return 0;
}

/**
 * Make as many checks as possible on an IPv6 item, and if a flow is provided,
 * fill it appropriately with IPv6 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks pass, -1 otherwise.
 */
static int
tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_ipv6 *spec = item->spec;
        const struct rte_flow_item_ipv6 *mask = item->mask;
        struct rte_flow *flow = info->flow;
        uint8_t empty_addr[16] = { 0 };
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
        /* check that previous eth type is compatible with ipv6 */
        if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
                return -1;
        /* store ip_proto for consistency if udp/tcp pattern item comes next */
        if (spec)
                info->ip_proto = spec->hdr.proto;
        if (!flow)
                return 0;
        msg = &flow->msg;
        if (!info->eth_type)
                info->eth_type = htons(ETH_P_IPV6);
        if (!spec)
                return 0;
        if (memcmp(mask->hdr.dst_addr, empty_addr, 16)) {
                nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
                           sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
                nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
                           sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
        }
        if (memcmp(mask->hdr.src_addr, empty_addr, 16)) {
                nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
                           sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
                nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
                           sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
        }
        if (spec->hdr.proto)
                nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
        return 0;
}

/**
 * Make as many checks as possible on a UDP item, and if a flow is provided,
 * fill it appropriately with UDP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks pass, -1 otherwise.
 */
static int
tap_flow_create_udp(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_udp *spec = item->spec;
        const struct rte_flow_item_udp *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
        /* check that previous ip_proto is compatible with udp */
        if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
                return -1;
        /* TC does not support UDP port masking. Only accept if exact match. */
        if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
            (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
                return -1;
        if (!flow)
                return 0;
        msg = &flow->msg;
        nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
        if (!spec)
                return 0;
        if (mask->hdr.dst_port)
                nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
                             spec->hdr.dst_port);
        if (mask->hdr.src_port)
                nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
                             spec->hdr.src_port);
        return 0;
}

/**
 * Make as many checks as possible on a TCP item, and if a flow is provided,
 * fill it appropriately with TCP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks pass, -1 otherwise.
 */
static int
tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_tcp *spec = item->spec;
        const struct rte_flow_item_tcp *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
        /* check that previous ip_proto is compatible with tcp */
        if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
                return -1;
        /* TC does not support TCP port masking. Only accept if exact match. */
        if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
            (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
                return -1;
        if (!flow)
                return 0;
        msg = &flow->msg;
        nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
        if (!spec)
                return 0;
        if (mask->hdr.dst_port)
                nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
                             spec->hdr.dst_port);
        if (mask->hdr.src_port)
                nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
                             spec->hdr.src_port);
        return 0;
}

/**
 * Check support for a given item.
 *
 * @param[in] item
 *   Item specification.
 * @param size
 *   Bit-mask size in bytes.
 * @param[in] supported_mask
 *   Bit-mask covering supported fields to compare with spec, last and mask in
 *   \item.
 * @param[in] default_mask
 *   Bit-mask default mask if none is provided in \item.
 *
 * @return
 *   0 on success.
 */
static int
tap_flow_item_validate(const struct rte_flow_item *item,
                       unsigned int size,
                       const uint8_t *supported_mask,
                       const uint8_t *default_mask)
{
        int ret = 0;

        /* An empty layer is allowed, as long as all fields are NULL */
        if (!item->spec && (item->mask || item->last))
                return -1;
        /* Is the item spec compatible with what the NIC supports? */
        if (item->spec && !item->mask) {
                unsigned int i;
                const uint8_t *spec = item->spec;

                for (i = 0; i < size; ++i)
                        if ((spec[i] | supported_mask[i]) != supported_mask[i])
                                return -1;
                /* Is the default mask compatible with what the NIC supports? */
                for (i = 0; i < size; i++)
                        if ((default_mask[i] | supported_mask[i]) !=
                            supported_mask[i])
                                return -1;
        }
        /* Is the item last compatible with what the NIC supports? */
        if (item->last && !item->mask) {
                unsigned int i;
                const uint8_t *spec = item->last;

                for (i = 0; i < size; ++i)
                        if ((spec[i] | supported_mask[i]) != supported_mask[i])
                                return -1;
        }
        /* Is the item mask compatible with what the NIC supports? */
        if (item->mask) {
                unsigned int i;
                const uint8_t *spec = item->mask;

                for (i = 0; i < size; ++i)
                        if ((spec[i] | supported_mask[i]) != supported_mask[i])
                                return -1;
        }
        /**
         * Once masked, are item spec and item last equal?
         * TC does not support ranges, so anything else is invalid.
         */
        if (item->spec && item->last) {
                uint8_t spec[size];
                uint8_t last[size];
                const uint8_t *apply = default_mask;
                unsigned int i;

                if (item->mask)
                        apply = item->mask;
                for (i = 0; i < size; ++i) {
                        spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
                        last[i] = ((const uint8_t *)item->last)[i] & apply[i];
                }
                ret = memcmp(spec, last, size);
        }
        return ret;
}

/**
 * Transform a DROP/PASSTHRU action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] action
 *   Appropriate action to be set in the TCA_GACT_PARMS structure.
 *
 * @return
 *   0 if checks pass, -1 otherwise.
 */
static int
add_action_gact(struct rte_flow *flow, int action)
{
        struct nlmsg *msg = &flow->msg;
        size_t act_index = 1;
        struct tc_gact p = {
                .action = action
        };

        if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
                return -1;
        if (nlattr_nested_start(msg, act_index++) < 0)
                return -1;
        nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("gact"), "gact");
        if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
                return -1;
        nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(p), &p);
        nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
        nlattr_nested_finish(msg); /* nested act_index */
        nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
        return 0;
}
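
/*
 * Note: this helper and the mirred/skbedit helpers below all emit the same
 * netlink attribute nesting:
 * TCA_FLOWER_ACT -> act_index -> TCA_ACT_KIND + TCA_ACT_OPTIONS -> *_PARMS.
 */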

/**
 * Transform a MIRRED action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] ifindex
 *   Netdevice ifindex, where to mirror/redirect packet to.
 * @param[in] action_type
 *   Either TCA_EGRESS_REDIR for redirection or TCA_EGRESS_MIRROR for mirroring.
 *
 * @return
 *   0 if checks pass, -1 otherwise.
 */
static int
add_action_mirred(struct rte_flow *flow, uint16_t ifindex, uint16_t action_type)
{
        struct nlmsg *msg = &flow->msg;
        size_t act_index = 1;
        struct tc_mirred p = {
                .eaction = action_type,
                .ifindex = ifindex,
        };

        if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
                return -1;
        if (nlattr_nested_start(msg, act_index++) < 0)
                return -1;
        nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("mirred"), "mirred");
        if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
                return -1;
        if (action_type == TCA_EGRESS_MIRROR)
                p.action = TC_ACT_PIPE;
        else /* REDIRECT */
                p.action = TC_ACT_STOLEN;
        nlattr_add(&msg->nh, TCA_MIRRED_PARMS, sizeof(p), &p);
        nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
        nlattr_nested_finish(msg); /* nested act_index */
        nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
        return 0;
}

/**
 * Transform a QUEUE action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] queue
 *   Queue id to use.
 *
 * @return
 *   0 if checks pass, -1 otherwise.
 */
static int
add_action_skbedit(struct rte_flow *flow, uint16_t queue)
{
        struct nlmsg *msg = &flow->msg;
        size_t act_index = 1;
        struct tc_skbedit p = {
                .action = TC_ACT_PIPE
        };

        if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
                return -1;
        if (nlattr_nested_start(msg, act_index++) < 0)
                return -1;
        nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit");
        if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
                return -1;
        nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, sizeof(p), &p);
        nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue);
        nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
        nlattr_nested_finish(msg); /* nested act_index */
        nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
        return 0;
}

/**
 * Validate a flow supported by TC.
 * If flow param is not NULL, then also fill the netlink message inside.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] attr
 *   Flow rule attributes.
 * @param[in] pattern
 *   Pattern specification (list terminated by the END pattern item).
 * @param[in] actions
 *   Associated actions (list terminated by the END action).
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 * @param[in, out] flow
 *   Flow structure to update.
 * @param[in] mirred
 *   If set to TCA_EGRESS_REDIR, provided actions will be replaced with a
 *   redirection to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd.
 *   If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a
 *   mirroring to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd. Matching packets will thus be duplicated.
 *   If set to 0, the standard behavior is to be used: set correct actions for
 *   the TC rule, and apply it on the tap netdevice.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
priv_flow_process(struct pmd_internals *pmd,
                  const struct rte_flow_attr *attr,
                  const struct rte_flow_item items[],
                  const struct rte_flow_action actions[],
                  struct rte_flow_error *error,
                  struct rte_flow *flow,
                  int mirred)
{
        const struct tap_flow_items *cur_item = tap_flow_items;
        struct convert_data data = {
                .eth_type = 0,
                .ip_proto = 0,
                .flow = flow,
        };
        int action = 0; /* Only one action authorized for now */

        if (attr->group > MAX_GROUP) {
                rte_flow_error_set(
                        error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
                        NULL, "group value too big: cannot exceed 15");
                return -rte_errno;
        }
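        /*
         * The TC priority is 16 bits wide: the rte_flow group goes in its
         * upper bits (GROUP_SHIFT) and the offset rule priority in the
         * lower ones; the result lands in the upper half of tcm_info.
         */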
        if (attr->priority > MAX_PRIORITY) {
                rte_flow_error_set(
                        error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
                        NULL, "priority value too big");
                return -rte_errno;
        } else if (flow) {
                uint16_t group = attr->group << GROUP_SHIFT;
                uint16_t prio = group | (attr->priority + PRIORITY_OFFSET);

                flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
                                                 flow->msg.t.tcm_info);
        }
        if (flow) {
                if (mirred) {
                        /*
                         * If attr->ingress, the rule applies on remote ingress
                         * to match incoming packets.
                         * If attr->egress, the rule applies on tap ingress (as
                         * seen from the kernel) to deal with packets going out
                         * from the DPDK app.
                         */
                        flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0);
                } else {
                        /* Standard rule on tap egress (kernel standpoint). */
                        flow->msg.t.tcm_parent =
                                TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
                }
                /* use flower filter type */
                nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
                if (nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
                        goto exit_item_not_supported;
        }
        for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
                const struct tap_flow_items *token = NULL;
                unsigned int i;
                int err = 0;

                if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
                        continue;
                for (i = 0;
                     cur_item->items &&
                     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
                     ++i) {
                        if (cur_item->items[i] == items->type) {
                                token = &tap_flow_items[items->type];
                                break;
                        }
                }
                if (!token)
                        goto exit_item_not_supported;
                cur_item = token;
                err = tap_flow_item_validate(
                        items, cur_item->mask_sz,
                        (const uint8_t *)cur_item->mask,
                        (const uint8_t *)cur_item->default_mask);
                if (err)
                        goto exit_item_not_supported;
                if (flow && cur_item->convert) {
                        err = cur_item->convert(items, &data);
                        if (err)
                                goto exit_item_not_supported;
                }
        }
        if (flow) {
                if (data.vlan) {
                        nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
                                     htons(ETH_P_8021Q));
                        nlattr_add16(&flow->msg.nh,
                                     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
                                     data.eth_type ?
                                     data.eth_type : htons(ETH_P_ALL));
                } else if (data.eth_type) {
                        nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
                                     data.eth_type);
                }
        }
        if (mirred && flow) {
                uint16_t if_index = pmd->if_index;

                /*
                 * If attr->egress && mirred, then this is a special
                 * case where the rule must be applied on the tap, to
                 * redirect packets coming from the DPDK App, out
                 * through the remote netdevice.
                 */
                if (attr->egress)
                        if_index = pmd->remote_if_index;
                if (add_action_mirred(flow, if_index, mirred) < 0)
                        goto exit_action_not_supported;
                else
                        goto end;
        }
        for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
                int err = 0;

                if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
                        continue;
                } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
                        if (action)
                                goto exit_action_not_supported;
                        action = 1;
                        if (flow)
                                err = add_action_gact(flow, TC_ACT_SHOT);
                } else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
                        if (action)
                                goto exit_action_not_supported;
                        action = 1;
                        if (flow)
                                err = add_action_gact(flow, TC_ACT_UNSPEC);
                } else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
                        const struct rte_flow_action_queue *queue =
                                (const struct rte_flow_action_queue *)
                                actions->conf;

                        if (action)
                                goto exit_action_not_supported;
                        action = 1;
                        if (!queue ||
                            (queue->index > pmd->dev->data->nb_rx_queues - 1))
                                goto exit_action_not_supported;
                        if (flow)
                                err = add_action_skbedit(flow, queue->index);
                } else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
                        /* Fake RSS support. */
                        const struct rte_flow_action_rss *rss =
                                (const struct rte_flow_action_rss *)
                                actions->conf;

                        if (action)
                                goto exit_action_not_supported;
                        action = 1;
                        if (!rss || rss->num < 1 ||
                            (rss->queue[0] > pmd->dev->data->nb_rx_queues - 1))
                                goto exit_action_not_supported;
                        if (flow)
                                err = add_action_skbedit(flow, rss->queue[0]);
                } else {
                        goto exit_action_not_supported;
                }
                if (err)
                        goto exit_action_not_supported;
        }
end:
        if (flow)
                nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
        return 0;
exit_item_not_supported:
        rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
                           items, "item not supported");
        return -rte_errno;
exit_action_not_supported:
        rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
                           actions, "action not supported");
        return -rte_errno;
}

/**
 * Validate a flow.
 *
 * @see rte_flow_validate()
 * @see rte_flow_ops
 */
static int
tap_flow_validate(struct rte_eth_dev *dev,
                  const struct rte_flow_attr *attr,
                  const struct rte_flow_item items[],
                  const struct rte_flow_action actions[],
                  struct rte_flow_error *error)
{
        struct pmd_internals *pmd = dev->data->dev_private;

        return priv_flow_process(pmd, attr, items, actions, error, NULL, 0);
}

/**
 * Set a unique handle in a flow.
 *
 * The kernel supports TC rules with equal priority, as long as they use the
 * same matching fields (e.g.: dst mac and ipv4) with different values (and
 * full mask to ensure no collision is possible).
 * In those rules, the handle (uint32_t) is the part that would identify
 * specifically each rule.
 *
 * On 32-bit architectures, the handle can simply be the flow's pointer address.
 * On 64-bit architectures, we rely on jhash(flow) to find a (sufficiently)
 * unique handle.
 *
 * @param[in, out] flow
 *   The flow that needs its handle set.
 */
static void
tap_flow_set_handle(struct rte_flow *flow)
{
        uint32_t handle = 0;

        if (sizeof(flow) > 4)
                handle = rte_jhash(&flow, sizeof(flow), 1);
        else
                handle = (uintptr_t)flow;
        /* must be at least 1 to avoid letting the kernel choose one for us */
        if (!handle)
                handle = 1;
        flow->msg.t.tcm_handle = handle;
}

/**
 * Create a flow.
 *
 * @see rte_flow_create()
 * @see rte_flow_ops
 */
static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
                const struct rte_flow_attr *attr,
                const struct rte_flow_item items[],
                const struct rte_flow_action actions[],
                struct rte_flow_error *error)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        struct rte_flow *remote_flow = NULL;
        struct rte_flow *flow = NULL;
        struct nlmsg *msg = NULL;
        int err;

        if (!pmd->if_index) {
                rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
                                   NULL,
                                   "can't create rule, ifindex not found");
                goto fail;
        }
        /*
         * No rules configured through standard rte_flow should be set on the
         * priorities used by implicit rules.
         */
        if ((attr->group == MAX_GROUP) &&
            attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) {
                rte_flow_error_set(
                        error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
                        NULL, "priority value too big");
                goto fail;
        }
        flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
        if (!flow) {
                rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
                                   NULL, "cannot allocate memory for rte_flow");
                goto fail;
        }
        msg = &flow->msg;
        tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
                    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
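        /*
         * The lower 16 bits of tcm_info carry the protocol to match (here
         * ETH_P_ALL); priv_flow_process() fills the upper 16 bits with the
         * computed TC priority.
         */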
        msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
        tap_flow_set_handle(flow);
        if (priv_flow_process(pmd, attr, items, actions, error, flow, 0))
                goto fail;
        err = nl_send(pmd->nlsk_fd, &msg->nh);
        if (err < 0) {
                rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
                                   NULL, "couldn't send request to kernel");
                goto fail;
        }
        err = nl_recv_ack(pmd->nlsk_fd);
        if (err < 0) {
                RTE_LOG(ERR, PMD,
                        "Kernel refused TC filter rule creation (%d): %s\n",
                        errno, strerror(errno));
                rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
                                   NULL,
                                   "overlapping rules or Kernel too old for flower support");
                goto fail;
        }
        LIST_INSERT_HEAD(&pmd->flows, flow, next);
        /**
         * If a remote device is configured, a TC rule with identical items for
         * matching must be set on that device, with a single action: redirect
         * to the local pmd->if_index.
         */
        if (pmd->remote_if_index) {
                remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
                if (!remote_flow) {
                        rte_flow_error_set(
                                error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
                                "cannot allocate memory for rte_flow");
                        goto fail;
                }
                msg = &remote_flow->msg;
                /* set the rule if_index for the remote netdevice */
                tc_init_msg(
                        msg, pmd->remote_if_index, RTM_NEWTFILTER,
                        NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
                msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
                tap_flow_set_handle(remote_flow);
                if (priv_flow_process(pmd, attr, items, NULL,
                                      error, remote_flow, TCA_EGRESS_REDIR)) {
                        rte_flow_error_set(
                                error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
                                NULL, "rte flow rule validation failed");
                        goto fail;
                }
                err = nl_send(pmd->nlsk_fd, &msg->nh);
                if (err < 0) {
                        rte_flow_error_set(
                                error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
                                NULL, "Failure sending nl request");
                        goto fail;
                }
                err = nl_recv_ack(pmd->nlsk_fd);
                if (err < 0) {
                        RTE_LOG(ERR, PMD,
                                "Kernel refused TC filter rule creation (%d): %s\n",
                                errno, strerror(errno));
                        rte_flow_error_set(
                                error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
                                NULL,
                                "overlapping rules or Kernel too old for flower support");
                        goto fail;
                }
                flow->remote_flow = remote_flow;
        }
        return flow;
fail:
        if (remote_flow)
                rte_free(remote_flow);
        if (flow)
                rte_free(flow);
        return NULL;
}

1308 /**
1309  * Destroy a flow using pointer to pmd_internal.
1310  *
1311  * @param[in, out] pmd
1312  *   Pointer to private structure.
1313  * @param[in] flow
1314  *   Pointer to the flow to destroy.
1315  * @param[in, out] error
1316  *   Pointer to the flow error handler
1317  *
1318  * @return 0 if the flow could be destroyed, -1 otherwise.
1319  */
static int
tap_flow_destroy_pmd(struct pmd_internals *pmd,
		     struct rte_flow *flow,
		     struct rte_flow_error *error)
{
	struct rte_flow *remote_flow = flow->remote_flow;
	int ret = 0;

	LIST_REMOVE(flow, next);
	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;

	ret = nl_send(pmd->nlsk_fd, &flow->msg.nh);
	if (ret < 0) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "couldn't send request to kernel");
		goto end;
	}
	ret = nl_recv_ack(pmd->nlsk_fd);
	/* If errno is ENOENT, the rule is already no longer in the kernel. */
	if (ret < 0 && errno == ENOENT)
		ret = 0;
	if (ret < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule deletion (%d): %s\n",
			errno, strerror(errno));
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"couldn't receive kernel ack to our request");
		goto end;
	}
	if (remote_flow) {
		remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
		remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER;

		ret = nl_send(pmd->nlsk_fd, &remote_flow->msg.nh);
		if (ret < 0) {
			rte_flow_error_set(
				error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure sending nl request");
			goto end;
		}
		ret = nl_recv_ack(pmd->nlsk_fd);
		if (ret < 0 && errno == ENOENT)
			ret = 0;
		if (ret < 0) {
			RTE_LOG(ERR, PMD,
				"Kernel refused TC filter rule deletion (%d): %s\n",
				errno, strerror(errno));
			rte_flow_error_set(
				error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure trying to receive nl ack");
			goto end;
		}
	}
end:
	if (remote_flow)
		rte_free(remote_flow);
	rte_free(flow);
	return ret;
}

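/*
 * Illustrative sketch (not part of the driver): the ENOENT-tolerant ack
 * check used twice above, written as a hypothetical helper for clarity;
 * a rule that is already gone from the kernel counts as deleted.
 * nl_send()/nl_recv_ack() are the driver's own netlink wrappers.
 *
 *	static int
 *	nl_recv_ack_tolerant(int nlsk_fd)
 *	{
 *		int ret = nl_recv_ack(nlsk_fd);
 *
 *		if (ret < 0 && errno == ENOENT)
 *			return 0;
 *		return ret;
 *	}
 */
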
/**
 * Destroy a flow.
 *
 * @see rte_flow_destroy()
 * @see rte_flow_ops
 */
static int
tap_flow_destroy(struct rte_eth_dev *dev,
		 struct rte_flow *flow,
		 struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	return tap_flow_destroy_pmd(pmd, flow, error);
}

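/*
 * Illustrative usage sketch (not part of the driver): applications reach
 * tap_flow_destroy() through the generic API; "flow" is assumed to come
 * from a successful rte_flow_create() on the same port.
 *
 *	struct rte_flow_error err;
 *
 *	if (rte_flow_destroy(port_id, flow, &err) < 0)
 *		printf("flow destroy failed: %s\n",
 *		       err.message ? err.message : "(no message)");
 */
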
/**
 * Enable/disable flow isolation.
 *
 * @see rte_flow_isolate()
 * @see rte_flow_ops
 */
static int
tap_flow_isolate(struct rte_eth_dev *dev,
		 int set,
		 struct rte_flow_error *error __rte_unused)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	pmd->flow_isolate = !!set;
	/*
	 * If the netdevice is already there, set up the appropriate flow
	 * rules immediately. Otherwise they will be set up when the
	 * netdevice is brought up (tun_alloc).
	 */
	if (!pmd->rxq[0].fd)
		return 0;
	if (set) {
		struct rte_flow *flow;

		while (1) {
			flow = LIST_FIRST(&pmd->implicit_flows);
			if (!flow)
				break;
			/*
			 * Remove all implicit rules on the remote.
			 * Keep the local rule to redirect packets on TX.
			 * Also keep the last implicit local rule: ISOLATE.
			 */
			if (flow->msg.t.tcm_ifindex == pmd->if_index)
				break;
			if (tap_flow_destroy_pmd(pmd, flow, NULL) < 0)
				goto error;
		}
		/* Switch the TC rule according to pmd->flow_isolate */
		if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
			goto error;
	} else {
		/* Switch the TC rule according to pmd->flow_isolate */
		if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
			goto error;
		if (!pmd->remote_if_index)
			return 0;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0)
			goto error;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0)
			goto error;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0)
			goto error;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0)
			goto error;
		if (dev->data->promiscuous &&
		    tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC) < 0)
			goto error;
		if (dev->data->all_multicast &&
		    tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI) < 0)
			goto error;
	}
	return 0;
error:
	pmd->flow_isolate = 0;
	return rte_flow_error_set(
		error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
		"TC rule creation failed");
}

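/*
 * Illustrative usage sketch (not part of the driver): an application
 * enables isolated mode, typically before the port is started, so that
 * only explicitly requested traffic is received.
 *
 *	struct rte_flow_error err;
 *
 *	if (rte_flow_isolate(port_id, 1, &err) < 0)
 *		printf("isolate failed: %s\n",
 *		       err.message ? err.message : "(no message)");
 */
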
/**
 * Destroy all flows.
 *
 * @see rte_flow_flush()
 * @see rte_flow_ops
 */
int
tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct rte_flow *flow;

	while (!LIST_EMPTY(&pmd->flows)) {
		flow = LIST_FIRST(&pmd->flows);
		if (tap_flow_destroy(dev, flow, error) < 0)
			return -1;
	}
	return 0;
}

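/*
 * Illustrative usage sketch (not part of the driver): flushing every
 * remaining flow on a port, e.g. during teardown.
 *
 *	struct rte_flow_error err;
 *
 *	if (rte_flow_flush(port_id, &err) < 0)
 *		printf("flow flush failed: %s\n",
 *		       err.message ? err.message : "(no message)");
 */
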
/**
 * Add an implicit flow rule on the remote device to make sure traffic gets to
 * the tap netdevice from there.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] idx
 *   The index in the implicit_rte_flows array specifying which rule to apply.
 *
 * @return -1 if the rule couldn't be applied, 0 otherwise.
 */
int tap_flow_implicit_create(struct pmd_internals *pmd,
			     enum implicit_rule_index idx)
{
	uint16_t flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
	struct rte_flow_action *actions = implicit_rte_flows[idx].actions;
	struct rte_flow_action isolate_actions[2] = {
		[1] = {
			.type = RTE_FLOW_ACTION_TYPE_END,
		},
	};
	struct rte_flow_item *items = implicit_rte_flows[idx].items;
	struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr;
	struct rte_flow_item_eth eth_local = { .type = 0 };
	uint16_t if_index = pmd->remote_if_index;
	struct rte_flow *remote_flow = NULL;
	struct nlmsg *msg = NULL;
	int err = 0;
	struct rte_flow_item items_local[2] = {
		[0] = {
			.type = items[0].type,
			.spec = &eth_local,
			.mask = items[0].mask,
		},
		[1] = {
			.type = items[1].type,
		}
	};

	remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
	if (!remote_flow) {
		RTE_LOG(ERR, PMD, "Cannot allocate memory for rte_flow\n");
		goto fail;
	}
	msg = &remote_flow->msg;
	if (idx == TAP_REMOTE_TX) {
		if_index = pmd->if_index;
	} else if (idx == TAP_ISOLATE) {
		if_index = pmd->if_index;
		/* Don't be exclusive for this rule; it may be changed later. */
		flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE;
		isolate_actions[0].type = pmd->flow_isolate ?
			RTE_FLOW_ACTION_TYPE_DROP :
			RTE_FLOW_ACTION_TYPE_PASSTHRU;
		actions = isolate_actions;
	} else if (idx == TAP_REMOTE_LOCAL_MAC) {
		/*
		 * The MAC address cannot be set in implicit_rte_flows[] as it
		 * is not known at compile time.
		 */
		memcpy(&eth_local.dst, &pmd->eth_addr, sizeof(pmd->eth_addr));
		items = items_local;
	}
	tc_init_msg(msg, if_index, RTM_NEWTFILTER, flags);
	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
	/*
	 * The ISOLATE rule is always present and must have a static handle, as
	 * its action is rewritten depending on whether the feature is enabled
	 * (DROP) or disabled (PASSTHRU).
	 */
	if (idx == TAP_ISOLATE)
		remote_flow->msg.t.tcm_handle = ISOLATE_HANDLE;
	else
		tap_flow_set_handle(remote_flow);
	if (priv_flow_process(pmd, attr, items, actions, NULL,
			      remote_flow, implicit_rte_flows[idx].mirred)) {
		RTE_LOG(ERR, PMD, "rte flow rule validation failed\n");
		goto fail;
	}
	err = nl_send(pmd->nlsk_fd, &msg->nh);
	if (err < 0) {
		RTE_LOG(ERR, PMD, "Failure sending nl request\n");
		goto fail;
	}
	err = nl_recv_ack(pmd->nlsk_fd);
	if (err < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule creation (%d): %s\n",
			errno, strerror(errno));
		goto fail;
	}
	LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next);
	return 0;
fail:
	if (remote_flow)
		rte_free(remote_flow);
	return -1;
}

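/*
 * Illustrative sketch (not part of the logic here): how this helper is
 * typically chained when the remote netdevice's implicit rules must be
 * (re)installed; error handling is abbreviated.
 *
 *	if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0 ||
 *	    tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0 ||
 *	    tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0)
 *		return -1;
 */
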
/**
 * Remove a specific implicit flow rule on the remote device.
 *
 * @param[in, out] pmd
 *   Pointer to private structure.
 * @param[in] idx
 *   The index in the implicit_rte_flows array specifying which rule to remove.
 *
 * @return -1 if the rule couldn't be removed, 0 otherwise.
 */
int tap_flow_implicit_destroy(struct pmd_internals *pmd,
			      enum implicit_rule_index idx)
{
	struct rte_flow *remote_flow;
	int cur_prio = -1;
	int idx_prio = implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET;

	for (remote_flow = LIST_FIRST(&pmd->implicit_flows);
	     remote_flow;
	     remote_flow = LIST_NEXT(remote_flow, next)) {
		cur_prio = (remote_flow->msg.t.tcm_info >> 16) & PRIORITY_MASK;
		if (cur_prio != idx_prio)
			continue;
		return tap_flow_destroy_pmd(pmd, remote_flow, NULL);
	}
	return 0;
}

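/*
 * Illustrative sketch (not part of the logic here): removing a single
 * implicit rule, e.g. when promiscuous mode is disabled on the remote.
 *
 *	if (tap_flow_implicit_destroy(pmd, TAP_REMOTE_PROMISC) < 0)
 *		RTE_LOG(ERR, PMD, "Cannot delete implicit promisc rule\n");
 */
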
/**
 * Destroy all implicit flows.
 *
 * @see rte_flow_flush()
 */
int
tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error)
{
	struct rte_flow *remote_flow;

	while (!LIST_EMPTY(&pmd->implicit_flows)) {
		remote_flow = LIST_FIRST(&pmd->implicit_flows);
		if (tap_flow_destroy_pmd(pmd, remote_flow, error) < 0)
			return -1;
	}
	return 0;
}

/**
 * Manage filter operations.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param filter_type
 *   Filter type.
 * @param filter_op
 *   Operation to perform.
 * @param arg
 *   Pointer to operation-specific structure.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
tap_dev_filter_ctrl(struct rte_eth_dev *dev,
		    enum rte_filter_type filter_type,
		    enum rte_filter_op filter_op,
		    void *arg)
{
	switch (filter_type) {
	case RTE_ETH_FILTER_GENERIC:
		if (filter_op != RTE_ETH_FILTER_GET)
			return -EINVAL;
		*(const void **)arg = &tap_flow_ops;
		return 0;
	default:
		RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported\n",
			(void *)dev, filter_type);
	}
	return -EINVAL;
}

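/*
 * Illustrative sketch (not part of the driver): the generic rte_flow
 * layer retrieves the driver's flow ops through this filter_ctrl hook;
 * applications normally never call it directly.
 *
 *	const struct rte_flow_ops *ops = NULL;
 *
 *	if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_GENERIC,
 *				    RTE_ETH_FILTER_GET, &ops) < 0)
 *		return -ENOTSUP;
 *
 * On success, ops points to tap_flow_ops for a tap port.
 */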