2 * Copyright (c) 2016 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
17 * lb-plugin implements a MagLev-like load balancer.
18 * http://research.google.com/pubs/pub44824.html
20 * It hasn't been tested for interoperability with the original MagLev
21 * but intends to provide similar functionality.
22 * The load-balancer receives traffic destined to VIP (Virtual IP)
23 * addresses from one or multiple (ECMP) routers.
24 * The load-balancer tunnels the traffic toward many application servers
25 * ensuring session stickiness (i.e. that a single session is tunneled
26 * towards a single application server).
30 #ifndef LB_PLUGIN_LB_LB_H_
31 #define LB_PLUGIN_LB_LB_H_
34 #include <vnet/util/refcount.h>
36 #include <vnet/vnet.h>
37 #include <vnet/ip/ip.h>
38 #include <vnet/dpo/dpo.h>
39 #include <vnet/fib/fib_table.h>
40 #include <vppinfra/hash.h>
41 #include <vppinfra/bihash_8_8.h>
42 #include <vppinfra/bihash_24_8.h>
43 #include <lb/lbhash.h>
44 #include <vppinfra/lock.h>
/*
 * Default tunables for the load balancer.
 *
 * Every expansion that contains an operator is parenthesized so the constant
 * keeps its value when used inside a larger expression (e.g. `X * 2`).
 */

/* Default number of buckets in the per-cpu sticky hash table. */
#define LB_DEFAULT_PER_CPU_STICKY_BUCKETS (1 << 10)
/* Default flow timeout (the timeout is expressed in seconds). */
#define LB_DEFAULT_FLOW_TIMEOUT 40
/*
 * Bucket count and memory size (64 MiB) for the mapping tables.
 * NOTE(review): presumably the sizing of the mapping_by_as4/mapping_by_as6
 * bihash tables in lb_main_t - confirm at the init call site.
 */
#define LB_MAPPING_BUCKETS 1024
#define LB_MAPPING_MEMORY_SIZE (64 << 20)

/*
 * Bucket count and memory size (64 MiB) for the per-port VIP lookup table.
 * NOTE(review): presumably the sizing of lb_main_t.vip_index_per_port -
 * confirm at the init call site.
 */
#define LB_VIP_PER_PORT_BUCKETS 1024
#define LB_VIP_PER_PORT_MEMORY_SIZE (64 << 20)
60 LB_NAT4_IN2OUT_NEXT_DROP,
61 LB_NAT4_IN2OUT_NEXT_LOOKUP,
62 LB_NAT4_IN2OUT_N_NEXT,
63 } LB_nat4_in2out_next_t;
66 LB_NAT6_IN2OUT_NEXT_DROP,
67 LB_NAT6_IN2OUT_NEXT_LOOKUP,
68 LB_NAT6_IN2OUT_N_NEXT,
69 } LB_nat6_in2out_next_t;
/*
 * X-macro list of the NAT in2out node errors.
 * Each entry is _(SYMBOL, "description"); the user defines _() before
 * expanding the list and undefines it afterwards.
 */
#define foreach_lb_nat_in2out_error                                 \
  _ (UNSUPPORTED_PROTOCOL, "Unsupported protocol")                  \
  _ (IN2OUT_PACKETS,       "Good in2out packets processed")         \
  _ (NO_TRANSLATION,       "No translation")
77 #define _(sym,str) LB_NAT_IN2OUT_ERROR_##sym,
78 foreach_lb_nat_in2out_error
80 LB_NAT_IN2OUT_N_ERROR,
81 } lb_nat_in2out_error_t;
84 * lb for kube-proxy supports three types of service
87 LB_SRV_TYPE_CLUSTERIP,
93 LB4_NODEPORT_NEXT_IP4_NAT4,
94 LB4_NODEPORT_NEXT_DROP,
96 } lb4_nodeport_next_t;
99 LB6_NODEPORT_NEXT_IP6_NAT6,
100 LB6_NODEPORT_NEXT_DROP,
102 } lb6_nodeport_next_t;
105 * Each VIP is configured with a set of
106 * application servers.
110 * Registration to FIB event.
115 * Destination address used to tunnel traffic towards
116 * that application server.
117 * The address is also used as ID and pseudo-random
118 * seed for the load-balancing process.
120 ip46_address_t address;
123 * ASs are indexed by address and VIP Index.
124 * This means there will be duplicates if the same server
125 * address is used for multiple VIPs.
131 * For now only LB_AS_FLAGS_USED is defined.
135 #define LB_AS_FLAGS_USED 0x1
138 * Rotating timestamp of when LB_AS_FLAGS_USED flag was last set.
140 * AS removal is based on garbage collection and reference counting.
141 * When an AS is removed, there is a race between configuration core
142 * and worker cores which may still add a reference while it should not
143 * be used. This timestamp is used to not remove the AS while a race condition
149 * The FIB entry index for the next-hop
151 fib_node_index_t next_hop_fib_entry_index;
154 * The child index on the FIB entry
156 u32 next_hop_child_index;
159 * The next DPO in the graph to follow.
165 format_function_t format_lb_as;
169 } lb_new_flow_entry_t;
/*
 * X-macro list of the per-VIP counters.
 * Each entry is _(SYMBOL, "description", value); the value is the explicit
 * numeric id given to the counter when the list is expanded into an enum.
 */
#define lb_foreach_vip_counter                                      \
  _ (NEXT_PACKET,      "packet from existing sessions", 0)          \
  _ (FIRST_PACKET,     "first session packet",          1)          \
  _ (UNTRACKED_PACKET, "untracked packet",              2)          \
  _ (NO_SERVER,        "no server configured",          3)
178 #define _(a,b,c) LB_VIP_COUNTER_##a = c,
179 lb_foreach_vip_counter
205 * The load balancer supports IPv4 and IPv6 traffic
206 * and GRE4, GRE6, L3DSR and NAT4, NAT6 encap.
209 LB_VIP_TYPE_IP6_GRE6,
210 LB_VIP_TYPE_IP6_GRE4,
211 LB_VIP_TYPE_IP4_GRE6,
212 LB_VIP_TYPE_IP4_GRE4,
213 LB_VIP_TYPE_IP4_L3DSR,
214 LB_VIP_TYPE_IP4_NAT4,
215 LB_VIP_TYPE_IP6_NAT6,
219 format_function_t format_lb_vip_type;
220 unformat_function_t unformat_lb_vip_type;
223 /* args for different vip encap types */
229 /* Service type. clusterip or nodeport */
232 /* Pod's port corresponding to specific service. network byte order */
235 /* DSCP bits for L3DSR */
239 } lb_vip_encap_args_t;
242 /* all fields in NET byte order */
245 u32 vip_prefix_index;
255 * Load balancing service is provided per VIP+protocol+port.
256 * In this data model, a VIP can be a whole prefix.
257 * But load balancing only
258 * occurs on a per-source-address/port basis. Meaning that if a given source
259 * reuses the same port for multiple destinations within the same VIP,
260 * they will be considered as a single flow.
267 * Vector mapping (flow-hash & new_flow_table_mask) to AS index.
268 * This is used for new flows.
270 lb_new_flow_entry_t *new_flow_table;
273 * New flows table length - 1
274 * (length MUST be a power of 2)
276 u32 new_flow_table_mask;
279 * Last time garbage collection was run to free the ASs.
281 u32 last_garbage_collection;
286 * A Virtual IP represents a given service delivered
287 * by a set of application servers. It can be a single
288 * address or a prefix.
289 * IPv4 prefixes are encoded using IPv4-in-IPv6 embedded address
290 * (i.e. ::/96 prefix).
292 ip46_address_t prefix;
295 * The VIP prefix length.
296 * In case of IPv4, plen = 96 + ip4_plen.
300 /* tcp or udp. If not per-port vip, set to ~0 */
303 /* tcp port or udp port. If not per-port vip, set to 0 */
306 /* Valid for per-port vip */
307 u32 vip_prefix_index;
310 * The type of traffic for this.
311 * LB_TYPE_UNDEFINED if unknown.
315 /* args for different vip encap types */
316 lb_vip_encap_args_t encap_args;
319 * Flags related to this VIP.
320 * LB_VIP_FLAGS_USED means the VIP is active.
321 * When it is not set, the VIP is in the process of being removed.
322 * We cannot immediately remove a VIP because the VIP index still may be stored
323 * in the adjacency index.
326 #define LB_VIP_FLAGS_USED 0x1
327 #define LB_VIP_FLAGS_SRC_IP_STICKY 0x2
330 * Pool of AS indexes used for this VIP.
331 * This also includes ASs that have been removed (but are still referenced).
/*
 * True when `type` is one of the LB_VIP_TYPE_IP4_* values (GRE6, GRE4,
 * L3DSR or NAT4 encap).
 *
 * The argument is parenthesized at every use so that an expression argument
 * (e.g. `cond ? a : b` or `x & mask`) is compared as a whole instead of
 * being re-associated with `==` / `||`. Note that `type` is evaluated up to
 * four times, so it must not have side effects.
 */
#define lb_vip_is_ip4(type) ((type) == LB_VIP_TYPE_IP4_GRE6 \
                            || (type) == LB_VIP_TYPE_IP4_GRE4 \
                            || (type) == LB_VIP_TYPE_IP4_L3DSR \
                            || (type) == LB_VIP_TYPE_IP4_NAT4 )
/*
 * True when `type` is one of the LB_VIP_TYPE_IP6_* values (GRE6, GRE4 or
 * NAT6 encap).
 *
 * The argument is parenthesized at every use so that an expression argument
 * (e.g. `cond ? a : b` or `x & mask`) is compared as a whole instead of
 * being re-associated with `==` / `||`. Note that `type` is evaluated up to
 * three times, so it must not have side effects.
 */
#define lb_vip_is_ip6(type) ((type) == LB_VIP_TYPE_IP6_GRE6 \
                            || (type) == LB_VIP_TYPE_IP6_GRE4 \
                            || (type) == LB_VIP_TYPE_IP6_NAT6 )
/*
 * True when the VIP's type is one of the IPv4 encap flavours:
 * IP6_GRE4, IP4_GRE4, IP4_L3DSR or IP4_NAT4.
 * `vip` is a pointer to an object with a `type` member; it is evaluated
 * up to four times.
 */
#define lb_encap_is_ip4(vip)                         \
  ((vip)->type == LB_VIP_TYPE_IP6_GRE4 ||            \
   (vip)->type == LB_VIP_TYPE_IP4_GRE4 ||            \
   (vip)->type == LB_VIP_TYPE_IP4_L3DSR ||           \
   (vip)->type == LB_VIP_TYPE_IP4_NAT4)
/*
 * 1 when LB_VIP_FLAGS_SRC_IP_STICKY is set in vip->flags, 0 otherwise.
 */
#define lb_vip_is_src_ip_sticky(vip) \
  (!!((vip)->flags & LB_VIP_FLAGS_SRC_IP_STICKY))
353 /* clang-format off */
/*
 * Non-sticky GRE predicates.
 *
 * Each one checks three things on `vip`: the encap (GRE4 or GRE6, for
 * either an IPv4 or an IPv6 VIP), whether vip->port is zero or not (the
 * `_port` variants require a non-zero port), and that the
 * source-IP-sticky flag is NOT set.
 */

/* GRE4 encap, port == 0, not source-IP sticky. */
#define lb_vip_is_gre4(vip)                              \
  (((vip)->type == LB_VIP_TYPE_IP6_GRE4 ||               \
    (vip)->type == LB_VIP_TYPE_IP4_GRE4) &&              \
   ((vip)->port == 0) &&                                 \
   !lb_vip_is_src_ip_sticky (vip))

/* GRE6 encap, port == 0, not source-IP sticky. */
#define lb_vip_is_gre6(vip)                              \
  (((vip)->type == LB_VIP_TYPE_IP6_GRE6 ||               \
    (vip)->type == LB_VIP_TYPE_IP4_GRE6) &&              \
   ((vip)->port == 0) &&                                 \
   !lb_vip_is_src_ip_sticky (vip))

/* GRE4 encap, port != 0, not source-IP sticky. */
#define lb_vip_is_gre4_port(vip)                         \
  (((vip)->type == LB_VIP_TYPE_IP6_GRE4 ||               \
    (vip)->type == LB_VIP_TYPE_IP4_GRE4) &&              \
   ((vip)->port != 0) &&                                 \
   !lb_vip_is_src_ip_sticky (vip))

/* GRE6 encap, port != 0, not source-IP sticky. */
#define lb_vip_is_gre6_port(vip)                         \
  (((vip)->type == LB_VIP_TYPE_IP6_GRE6 ||               \
    (vip)->type == LB_VIP_TYPE_IP4_GRE6) &&              \
   ((vip)->port != 0) &&                                 \
   !lb_vip_is_src_ip_sticky (vip))
373 /* clang-format on */
/*
 * Source-IP-sticky GRE predicates.
 *
 * Same encap / port tests as the non-sticky GRE predicates, but these
 * require the source-IP-sticky flag to be SET on the VIP.
 */

/* GRE4 encap, port == 0, source-IP sticky. */
#define lb_vip_is_gre4_sticky(vip)                                   \
  (((vip)->type == LB_VIP_TYPE_IP6_GRE4 ||                           \
    (vip)->type == LB_VIP_TYPE_IP4_GRE4) /* GRE4 encap */            \
   && ((vip)->port == 0)                 /* port is zero */          \
   && lb_vip_is_src_ip_sticky (vip))     /* sticky flag set */

/* GRE6 encap, port == 0, source-IP sticky. */
#define lb_vip_is_gre6_sticky(vip)                                   \
  (((vip)->type == LB_VIP_TYPE_IP6_GRE6 ||                           \
    (vip)->type == LB_VIP_TYPE_IP4_GRE6) /* GRE6 encap */            \
   && ((vip)->port == 0)                 /* port is zero */          \
   && lb_vip_is_src_ip_sticky (vip))     /* sticky flag set */

/* GRE4 encap, port != 0, source-IP sticky. */
#define lb_vip_is_gre4_port_sticky(vip)                              \
  (((vip)->type == LB_VIP_TYPE_IP6_GRE4 ||                           \
    (vip)->type == LB_VIP_TYPE_IP4_GRE4) /* GRE4 encap */            \
   && ((vip)->port != 0)                 /* port is non-zero */      \
   && lb_vip_is_src_ip_sticky (vip))     /* sticky flag set */

/* GRE6 encap, port != 0, source-IP sticky. */
#define lb_vip_is_gre6_port_sticky(vip)                              \
  (((vip)->type == LB_VIP_TYPE_IP6_GRE6 ||                           \
    (vip)->type == LB_VIP_TYPE_IP4_GRE6) /* GRE6 encap */            \
   && ((vip)->port != 0)                 /* port is non-zero */      \
   && lb_vip_is_src_ip_sticky (vip))     /* sticky flag set */
396 lb_vip_is_l3dsr(const lb_vip_t *vip)
398 return (vip->type == LB_VIP_TYPE_IP4_L3DSR && vip->port == 0 &&
399 !lb_vip_is_src_ip_sticky (vip));
403 lb_vip_is_l3dsr_port(const lb_vip_t *vip)
405 return (vip->type == LB_VIP_TYPE_IP4_L3DSR && vip->port != 0 &&
406 !lb_vip_is_src_ip_sticky (vip));
409 lb_vip_is_nat4_port(const lb_vip_t *vip)
411 return (vip->type == LB_VIP_TYPE_IP4_NAT4 && vip->port != 0 &&
412 !lb_vip_is_src_ip_sticky (vip));
415 lb_vip_is_nat6_port(const lb_vip_t *vip)
417 return (vip->type == LB_VIP_TYPE_IP6_NAT6 && vip->port != 0 &&
418 !lb_vip_is_src_ip_sticky (vip));
422 lb_vip_is_l3dsr_sticky (const lb_vip_t *vip)
424 return (vip->type == LB_VIP_TYPE_IP4_L3DSR && vip->port == 0 &&
425 lb_vip_is_src_ip_sticky (vip));
428 lb_vip_is_l3dsr_port_sticky (const lb_vip_t *vip)
430 return (vip->type == LB_VIP_TYPE_IP4_L3DSR && vip->port != 0 &&
431 lb_vip_is_src_ip_sticky (vip));
434 lb_vip_is_nat4_port_sticky (const lb_vip_t *vip)
436 return (vip->type == LB_VIP_TYPE_IP4_NAT4 && vip->port != 0 &&
437 lb_vip_is_src_ip_sticky (vip));
440 lb_vip_is_nat6_port_sticky (const lb_vip_t *vip)
442 return (vip->type == LB_VIP_TYPE_IP6_NAT6 && vip->port != 0 &&
443 lb_vip_is_src_ip_sticky (vip));
446 format_function_t format_lb_vip;
447 format_function_t format_lb_vip_detailed;
/*
 * X-macro list of the protocols handled by the NAT code.
 * Each entry is _(UPPER_NAME, numeric id, lower_name, "string name").
 */
#define foreach_lb_nat_protocol      \
  _ (UDP, 0, udp, "udp")             \
  _ (TCP, 1, tcp, "tcp")
454 #define _(N, i, n, s) LB_NAT_PROTOCOL_##N = i,
455 foreach_lb_nat_protocol
460 lb_ip_proto_to_nat_proto (u8 ip_proto)
464 nat_proto = (ip_proto == IP_PROTOCOL_UDP) ? LB_NAT_PROTOCOL_UDP : nat_proto;
465 nat_proto = (ip_proto == IP_PROTOCOL_TCP) ? LB_NAT_PROTOCOL_TCP : nat_proto;
470 /* Key for Pod's egress SNAT */
502 * for vip + port case, src_ip = vip;
503 * for node ip + node_port, src_ip = node_ip
505 ip46_address_t src_ip;
506 ip46_address_t as_ip;
511 * for vip + port case, src_port = port;
512 * for node ip + node_port, src_port = node_port
515 u16 target_port; /* Network byte order */
522 * Each CPU has its own sticky flow hash table.
523 * One single table is used for all VIPs.
525 lb_hash_t *sticky_ht;
530 * Pool of all Virtual IPs
535 * bitmap for vip prefix to support per-port vip
537 uword *vip_prefix_indexes;
541 * ASs are referenced by address and vip index.
542 * The first element (index 0) is special and used only to fill
543 * new_flow_tables when no AS has been configured.
548 * Each AS has an associated reference counter.
549 * As ass[0] has a special meaning, its associated counter
550 * starts at 0 and is decremented instead. i.e. do not use it.
552 vlib_refcount_t as_refcount;
554 /* hash lookup vip_index by key: {u16: nodeport} */
555 uword * vip_index_by_nodeport;
558 * Some global data is per-cpu
560 lb_per_cpu_t *per_cpu;
563 * Node next index for IP adjacencies, for each of the traffic types.
565 u32 ip_lookup_next_index[LB_VIP_N_TYPES];
568 * Source address used in IPv6 encapsulated traffic
570 ip6_address_t ip6_src_address;
573 * Source address used for IPv4 encapsulated traffic
575 ip4_address_t ip4_src_address;
578 * Number of buckets in the per-cpu sticky hash table.
580 u32 per_cpu_sticky_buckets;
583 * Flow timeout in seconds.
590 vlib_simple_counter_main_t vip_counters[LB_N_VIP_COUNTERS];
593 * DPO used to send packet from IP4/6 lookup to LB node.
595 dpo_type_t dpo_gre4_type;
596 dpo_type_t dpo_gre6_type;
597 dpo_type_t dpo_gre4_port_type;
598 dpo_type_t dpo_gre6_port_type;
599 dpo_type_t dpo_l3dsr_type;
600 dpo_type_t dpo_l3dsr_port_type;
601 dpo_type_t dpo_nat4_port_type;
602 dpo_type_t dpo_nat6_port_type;
603 dpo_type_t dpo_gre4_sticky_type;
604 dpo_type_t dpo_gre6_sticky_type;
605 dpo_type_t dpo_gre4_port_sticky_type;
606 dpo_type_t dpo_gre6_port_sticky_type;
607 dpo_type_t dpo_l3dsr_sticky_type;
608 dpo_type_t dpo_l3dsr_port_sticky_type;
609 dpo_type_t dpo_nat4_port_sticky_type;
610 dpo_type_t dpo_nat6_port_sticky_type;
612 * Node type for registering to fib changes.
614 fib_node_type_t fib_node_type;
616 /* lookup per_port vip by key */
617 clib_bihash_8_8_t vip_index_per_port;
619 /* Find a static mapping by AS IP : target_port */
620 clib_bihash_8_8_t mapping_by_as4;
621 clib_bihash_24_8_t mapping_by_as6;
623 /* Static mapping pool */
624 lb_snat_mapping_t * snat_mappings;
627 * API dynamically registered base ID.
631 clib_spinlock_t writer_lock;
634 vlib_main_t *vlib_main;
635 vnet_main_t *vnet_main;
638 /* args for different vip encap types */
640 ip46_address_t prefix;
647 lb_vip_encap_args_t encap_args;
650 extern lb_main_t lb_main;
651 extern vlib_node_registration_t lb4_node;
652 extern vlib_node_registration_t lb6_node;
653 extern vlib_node_registration_t lb4_nodeport_node;
654 extern vlib_node_registration_t lb6_nodeport_node;
655 extern vlib_node_registration_t lb_nat4_in2out_node;
656 extern vlib_node_registration_t lb_nat6_in2out_node;
659 * Fix global load-balancer parameters.
660 * @param ip4_address IPv4 source address used for encapsulated traffic
661 * @param ip6_address IPv6 source address used for encapsulated traffic
662 * @param sticky_buckets Number of buckets in the per-cpu sticky hash table
663 * @param flow_timeout Flow timeout in seconds
664 * @return 0 on success. VNET_LB_ERR_XXX on error
666 int lb_conf(ip4_address_t *ip4_address, ip6_address_t *ip6_address,
667 u32 sticky_buckets, u32 flow_timeout);
669 int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index);
671 int lb_vip_del(u32 vip_index);
673 int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u8 protocol,
674 u16 port, u32 *vip_index);
/*
 * Return a pointer to the VIP at `index` in the lb_main.vips pool, or NULL
 * when that pool index is free.
 *
 * `index` is parenthesized before being handed to the pool macros so that an
 * expression argument (e.g. `i & mask`) cannot be re-associated inside their
 * expansions. It is evaluated twice, so it must not have side effects.
 */
#define lb_vip_get_by_index(index)                                  \
  (pool_is_free_index (lb_main.vips, (index)) ?                     \
     NULL :                                                         \
     pool_elt_at_index (lb_main.vips, (index)))
678 int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n);
679 int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n, u8 flush);
680 int lb_flush_vip_as (u32 vip_index, u32 as_index);
682 u32 lb_hash_time_now(vlib_main_t * vm);
/*
 * Run the load balancer's garbage collection.
 * NOTE(review): per the comments in this header, garbage collection is what
 * frees removed ASs - confirm details against the definition.
 *
 * Declared with (void): before C23 an empty parameter list declares a
 * function with unspecified parameters, so calls are not type-checked.
 */
void lb_garbage_collection (void);
686 int lb_nat4_interface_add_del (u32 sw_if_index, int is_del);
687 int lb_nat6_interface_add_del (u32 sw_if_index, int is_del);
689 format_function_t format_lb_main;
691 #endif /* LB_PLUGIN_LB_LB_H_ */