2 * Copyright (c) 2016 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
17 #include <vnet/plugin/plugin.h>
18 #include <vpp/app/version.h>
19 #include <vnet/api_errno.h>
20 #include <vnet/udp/udp.h>
21 #include <vppinfra/lock.h>
// Tuning constants, writer-lock helpers and a forward declaration.
// LB_GARBAGE_RUN is the minimum spacing (seconds) between two garbage
// collection passes on the same VIP.
23 //GC runs at most once every so many seconds
24 #define LB_GARBAGE_RUN 60
// LB_CONCURRENCY_TIMEOUT is how long (seconds) an AS must have been unused
// before garbage collection is allowed to release it.
26 //After so many seconds. It is assumed that inter-core race condition will not occur.
27 #define LB_CONCURRENCY_TIMEOUT 10
// Acquire / release the single spinlock stored in lb_main.writer_lock.
31 #define lb_get_writer_lock() clib_spinlock_lock (&lb_main.writer_lock)
32 #define lb_put_writer_lock() clib_spinlock_unlock (&lb_main.writer_lock)
// Forward declaration; the definition is further down in this file.
34 static void lb_as_stack (lb_as_t *as);
// Node-name tables, one per encapsulation flavour, indexed by DPO protocol.
// Each *_nodes table is passed to dpo_register_new_type() in lb_init().
// The l3dsr and nat4 tables only provide an IPv4 entry; the nat6 table only
// provides an IPv6 entry.
37 const static char * const lb_dpo_gre4_ip4[] = { "lb4-gre4" , NULL };
38 const static char * const lb_dpo_gre4_ip6[] = { "lb6-gre4" , NULL };
39 const static char* const * const lb_dpo_gre4_nodes[DPO_PROTO_NUM] =
41 [DPO_PROTO_IP4] = lb_dpo_gre4_ip4,
42 [DPO_PROTO_IP6] = lb_dpo_gre4_ip6,
45 const static char * const lb_dpo_gre6_ip4[] = { "lb4-gre6" , NULL };
46 const static char * const lb_dpo_gre6_ip6[] = { "lb6-gre6" , NULL };
47 const static char* const * const lb_dpo_gre6_nodes[DPO_PROTO_NUM] =
49 [DPO_PROTO_IP4] = lb_dpo_gre6_ip4,
50 [DPO_PROTO_IP6] = lb_dpo_gre6_ip6,
// Per-port variants of the GRE tables.
53 const static char * const lb_dpo_gre4_ip4_port[] = { "lb4-gre4-port" , NULL };
54 const static char * const lb_dpo_gre4_ip6_port[] = { "lb6-gre4-port" , NULL };
55 const static char* const * const lb_dpo_gre4_port_nodes[DPO_PROTO_NUM] =
57 [DPO_PROTO_IP4] = lb_dpo_gre4_ip4_port,
58 [DPO_PROTO_IP6] = lb_dpo_gre4_ip6_port,
61 const static char * const lb_dpo_gre6_ip4_port[] = { "lb4-gre6-port" , NULL };
62 const static char * const lb_dpo_gre6_ip6_port[] = { "lb6-gre6-port" , NULL };
63 const static char* const * const lb_dpo_gre6_port_nodes[DPO_PROTO_NUM] =
65 [DPO_PROTO_IP4] = lb_dpo_gre6_ip4_port,
66 [DPO_PROTO_IP6] = lb_dpo_gre6_ip6_port,
// L3DSR: IPv4 only.
69 const static char * const lb_dpo_l3dsr_ip4[] = {"lb4-l3dsr" , NULL};
70 const static char* const * const lb_dpo_l3dsr_nodes[DPO_PROTO_NUM] =
72 [DPO_PROTO_IP4] = lb_dpo_l3dsr_ip4,
75 const static char * const lb_dpo_l3dsr_ip4_port[] = {"lb4-l3dsr-port" , NULL};
76 const static char* const * const lb_dpo_l3dsr_port_nodes[DPO_PROTO_NUM] =
78 [DPO_PROTO_IP4] = lb_dpo_l3dsr_ip4_port,
// NAT: one table per address family, per-port only.
81 const static char * const lb_dpo_nat4_ip4_port[] = { "lb4-nat4-port" , NULL };
82 const static char* const * const lb_dpo_nat4_port_nodes[DPO_PROTO_NUM] =
84 [DPO_PROTO_IP4] = lb_dpo_nat4_ip4_port,
87 const static char * const lb_dpo_nat6_ip6_port[] = { "lb6-nat6-port" , NULL };
88 const static char* const * const lb_dpo_nat6_port_nodes[DPO_PROTO_NUM] =
90 [DPO_PROTO_IP6] = lb_dpo_nat6_ip6_port,
93 u32 lb_hash_time_now(vlib_main_t * vm)
95 return (u32) (vlib_time_now(vm) + 10000);
// format() callback printing the global load-balancer state: the configured
// IPv4/IPv6 source addresses, the number of VIPs and ASs, and for each vlib
// thread the timeout and usage of its sticky hash table.
// Takes no explicit arguments from args; everything is read from lb_main.
98 u8 *format_lb_main (u8 * s, va_list * args)
100 vlib_thread_main_t *tm = vlib_get_thread_main();
101 lb_main_t *lbm = &lb_main;
102 s = format(s, "lb_main");
103 s = format(s, " ip4-src-address: %U \n", format_ip4_address, &lbm->ip4_src_address);
104 s = format(s, " ip6-src-address: %U \n", format_ip6_address, &lbm->ip6_src_address);
105 s = format(s, " #vips: %u\n", pool_elts(lbm->vips));
// One element is subtracted from the AS count — presumably the default AS
// allocated in lb_init(); confirm.
106 s = format(s, " #ass: %u\n", pool_elts(lbm->ass) - 1);
// One sticky hash table per vlib thread.
109 for(thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++ ) {
110 lb_hash_t *h = lbm->per_cpu[thread_index].sticky_ht;
112 s = format(s, "core %d\n", thread_index);
113 s = format(s, " timeout: %ds\n", h->timeout);
114 s = format(s, " usage: %d / %d\n", lb_hash_elts(h, lb_hash_time_now(vlib_get_main())), lb_hash_size(h));
// Textual name of every VIP type, indexed by the lb_vip_type_t enumerator.
// Used by both format_lb_vip_type() and unformat_lb_vip_type().
121 static char *lb_vip_type_strings[] = {
122 [LB_VIP_TYPE_IP6_GRE6] = "ip6-gre6",
123 [LB_VIP_TYPE_IP6_GRE4] = "ip6-gre4",
124 [LB_VIP_TYPE_IP4_GRE6] = "ip4-gre6",
125 [LB_VIP_TYPE_IP4_GRE4] = "ip4-gre4",
126 [LB_VIP_TYPE_IP4_L3DSR] = "ip4-l3dsr",
127 [LB_VIP_TYPE_IP4_NAT4] = "ip4-nat4",
128 [LB_VIP_TYPE_IP6_NAT6] = "ip6-nat6",
131 u8 *format_lb_vip_type (u8 * s, va_list * args)
133 lb_vip_type_t vipt = va_arg (*args, lb_vip_type_t);
135 for (i=0; i<LB_VIP_N_TYPES; i++)
137 return format(s, lb_vip_type_strings[i]);
138 return format(s, "_WRONG_TYPE_");
// unformat() callback for a VIP type: tries every name of
// lb_vip_type_strings against the input, in enum order.
// args carries a lb_vip_type_t * destination.
// NOTE(review): the store through vipt and the return statements are on
// lines not visible in this view — confirm.
141 uword unformat_lb_vip_type (unformat_input_t * input, va_list * args)
143 lb_vip_type_t *vipt = va_arg (*args, lb_vip_type_t *);
145 for (i=0; i<LB_VIP_N_TYPES; i++)
146 if (unformat(input, lb_vip_type_strings[i])) {
// format() callback giving a one-line summary of a VIP (lb_vip_t * in args):
// type, prefix, new-flow table size, number of ASs and a " removed" marker
// when the VIP is no longer flagged as used, followed by type-specific
// details (protocol/port, DSCP for L3DSR, service type and ports for NAT).
153 u8 *format_lb_vip (u8 * s, va_list * args)
155 lb_vip_t *vip = va_arg (*args, lb_vip_t *);
156 s = format(s, "%U %U new_size:%u #as:%u%s",
157 format_lb_vip_type, vip->type,
158 format_ip46_prefix, &vip->prefix, vip->plen, IP46_TYPE_ANY,
// The table size is the mask plus one.
159 vip->new_flow_table_mask + 1,
160 pool_elts(vip->as_indexes),
161 (vip->flags & LB_VIP_FLAGS_USED)?"":" removed");
165 s = format(s, " protocol:%u port:%u ", vip->protocol, vip->port);
168 if (vip->type == LB_VIP_TYPE_IP4_L3DSR)
170 s = format(s, " dscp:%u", vip->encap_args.dscp);
172 else if ((vip->type == LB_VIP_TYPE_IP4_NAT4)
173 || (vip->type == LB_VIP_TYPE_IP6_NAT6))
175 s = format (s, " type:%s port:%u target_port:%u",
176 (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP)?"clusterip":
// NOTE(review): vip->port is printed unconverted a few lines above and is
// passed through clib_host_to_net_u16() by the port-filter code, which
// suggests it is stored in host order; ntohs() on it here may print a
// byte-swapped value — confirm. target_port is stored after
// clib_host_to_net_u16() in lb_vip_add(), so ntohs() matches for it.
178 ntohs(vip->port), ntohs(vip->encap_args.target_port));
184 u8 *format_lb_as (u8 * s, va_list * args)
186 lb_as_t *as = va_arg (*args, lb_as_t *);
187 return format(s, "%U %s", format_ip46_address,
188 &as->address, IP46_TYPE_ANY,
189 (as->flags & LB_AS_FLAGS_USED)?"used":"removed");
// format() callback giving a multi-line description of a VIP (lb_vip_t * in
// args): header line, type-specific details, every VIP counter, and one line
// per AS with its bucket count in the new-flow table, its flow reference
// count and its used/removed state. Every line is prefixed with the
// indentation current at entry.
192 u8 *format_lb_vip_detailed (u8 * s, va_list * args)
194 lb_main_t *lbm = &lb_main;
195 lb_vip_t *vip = va_arg (*args, lb_vip_t *);
196 u32 indent = format_get_indent (s);
198 s = format(s, "%U %U [%lu] %U%s\n"
200 format_white_space, indent,
201 format_lb_vip_type, vip->type,
203 format_ip46_prefix, &vip->prefix, (u32) vip->plen, IP46_TYPE_ANY,
204 (vip->flags & LB_VIP_FLAGS_USED)?"":" removed",
205 format_white_space, indent,
206 vip->new_flow_table_mask + 1);
210 s = format(s, "%U protocol:%u port:%u\n",
211 format_white_space, indent,
212 vip->protocol, vip->port);
215 if (vip->type == LB_VIP_TYPE_IP4_L3DSR)
217 s = format(s, "%U dscp:%u\n",
218 format_white_space, indent,
219 vip->encap_args.dscp);
221 else if ((vip->type == LB_VIP_TYPE_IP4_NAT4)
222 || (vip->type == LB_VIP_TYPE_IP6_NAT6))
224 s = format (s, "%U type:%s port:%u target_port:%u",
225 format_white_space, indent,
226 (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP)?"clusterip":
228 ntohs(vip->port), ntohs(vip->encap_args.target_port));
232 s = format(s, "%U counters:\n",
233 format_white_space, indent);
// One line per VIP counter; the counter index is the VIP's pool index.
235 for (i=0; i<LB_N_VIP_COUNTERS; i++)
236 s = format(s, "%U %s: %Lu\n",
237 format_white_space, indent,
238 lbm->vip_counters[i].name,
239 vlib_get_simple_counter(&lbm->vip_counters[i], vip - lbm->vips));
242 s = format(s, "%U #as:%u\n",
243 format_white_space, indent,
244 pool_elts(vip->as_indexes));
246 //Let's count the buckets for each AS
// count[] is indexed by global AS pool index, hence sized on the whole pool.
248 vec_validate(count, pool_len(lbm->ass)); //Possibly big alloc for not much...
249 lb_new_flow_entry_t *nfe;
250 vec_foreach(nfe, vip->new_flow_table)
251 count[nfe->as_index]++;
// One line per AS attached to this VIP.
255 pool_foreach(as_index, vip->as_indexes, {
256 as = &lbm->ass[*as_index];
257 s = format(s, "%U %U %u buckets %Lu flows dpo:%u %s\n",
258 format_white_space, indent,
259 format_ip46_address, &as->address, IP46_TYPE_ANY,
260 count[as - lbm->ass],
261 vlib_refcount_get(&lbm->as_refcount, as - lbm->ass),
263 (as->flags & LB_AS_FLAGS_USED)?"used":" removed");
276 static int lb_pseudorand_compare(void *a, void *b)
279 lb_main_t *lbm = &lb_main;
280 asa = &lbm->ass[((lb_pseudorand_t *)a)->as_index];
281 asb = &lbm->ass[((lb_pseudorand_t *)b)->as_index];
282 return memcmp(&asa->address, &asb->address, sizeof(asb->address));
// Release the ASs of one VIP that are no longer needed.
// Must be called with the writer lock held (asserted below).
// An AS is released when it is not flagged USED, was last used more than
// LB_CONCURRENCY_TIMEOUT seconds ago and has a zero flow reference count.
// For NAT VIPs the matching SNAT mapping is removed as well. The pass is
// rate-limited per VIP by LB_GARBAGE_RUN.
285 static void lb_vip_garbage_collection(lb_vip_t *vip)
287 lb_main_t *lbm = &lb_main;
288 lb_snat4_key_t m_key4;
289 clib_bihash_kv_8_8_t kv4, value4;
290 lb_snat6_key_t m_key6;
291 clib_bihash_kv_24_8_t kv6, value6;
292 lb_snat_mapping_t *m = 0;
293 CLIB_SPINLOCK_ASSERT_LOCKED (&lbm->writer_lock);
// Rate limit, using a wrap-aware u32 comparison.
295 u32 now = (u32) vlib_time_now(vlib_get_main());
296 if (!clib_u32_loop_gt(now, vip->last_garbage_collection + LB_GARBAGE_RUN))
299 vip->last_garbage_collection = now;
302 pool_foreach(as_index, vip->as_indexes, {
303 as = &lbm->ass[*as_index];
304 if (!(as->flags & LB_AS_FLAGS_USED) && //Not used
305 clib_u32_loop_gt(now, as->last_used + LB_CONCURRENCY_TIMEOUT) &&
306 (vlib_refcount_get(&lbm->as_refcount, as - lbm->ass) == 0))
// NAT4: look up and delete the SNAT mapping keyed by (AS address, target port).
309 if (lb_vip_is_nat4_port(vip)) {
310 m_key4.addr = as->address.ip4;
311 m_key4.port = vip->encap_args.target_port;
313 m_key4.fib_index = 0;
315 kv4.key = m_key4.as_u64;
// NOTE(review): m is only assigned when the lookup succeeds; on a miss the
// code below would use NULL or the mapping from a previous iteration unless
// a guard exists on a line not visible here — confirm.
316 if(!clib_bihash_search_8_8(&lbm->mapping_by_as4, &kv4, &value4))
317 m = pool_elt_at_index (lbm->snat_mappings, value4.value);
320 kv4.value = m - lbm->snat_mappings;
321 clib_bihash_add_del_8_8(&lbm->mapping_by_as4, &kv4, 0);
322 pool_put (lbm->snat_mappings, m);
// NAT6: same as above with the 24-byte key.
323 } else if (lb_vip_is_nat6_port(vip)) {
324 m_key6.addr.as_u64[0] = as->address.ip6.as_u64[0];
325 m_key6.addr.as_u64[1] = as->address.ip6.as_u64[1];
326 m_key6.port = vip->encap_args.target_port;
328 m_key6.fib_index = 0;
330 kv6.key[0] = m_key6.as_u64[0];
331 kv6.key[1] = m_key6.as_u64[1];
332 kv6.key[2] = m_key6.as_u64[2];
334 if (!clib_bihash_search_24_8 (&lbm->mapping_by_as6, &kv6, &value6))
335 m = pool_elt_at_index (lbm->snat_mappings, value6.value);
338 kv6.value = m - lbm->snat_mappings;
339 clib_bihash_add_del_24_8(&lbm->mapping_by_as6, &kv6, 0);
340 pool_put (lbm->snat_mappings, m);
// Detach from the next-hop FIB entry and delete it.
342 fib_entry_child_remove(as->next_hop_fib_entry_index,
343 as->next_hop_child_index);
344 fib_table_entry_delete_index(as->next_hop_fib_entry_index,
346 as->next_hop_fib_entry_index = FIB_NODE_INDEX_INVALID;
// Give back both the per-VIP index slot and the global AS element.
348 pool_put(vip->as_indexes, as_index);
349 pool_put(lbm->ass, as);
354 void lb_garbage_collection()
356 lb_main_t *lbm = &lb_main;
357 lb_get_writer_lock();
359 u32 *to_be_removed_vips = 0, *i;
360 pool_foreach(vip, lbm->vips, {
361 lb_vip_garbage_collection(vip);
363 if (!(vip->flags & LB_VIP_FLAGS_USED) &&
364 (pool_elts(vip->as_indexes) == 0)) {
365 vec_add1(to_be_removed_vips, vip - lbm->vips);
369 vec_foreach(i, to_be_removed_vips) {
370 vip = &lbm->vips[*i];
371 pool_put(lbm->vips, vip);
372 pool_free(vip->as_indexes);
375 vec_free(to_be_removed_vips);
376 lb_put_writer_lock();
// Rebuild the new-flow table of a VIP from its currently used ASs and swap
// it in place of the previous table.
// Must be called with the writer lock held (asserted below).
// With no used AS every bucket points at AS index 0. Otherwise the used ASs
// are sorted by address, each gets a (skip, last) pair derived from a hash
// of its address, and buckets are claimed in turn until the table is full.
379 static void lb_vip_update_new_flow_table(lb_vip_t *vip)
381 lb_main_t *lbm = &lb_main;
382 lb_new_flow_entry_t *old_table;
384 lb_new_flow_entry_t *new_flow_table = 0;
386 lb_pseudorand_t *pr, *sort_arr = 0;
388 CLIB_SPINLOCK_ASSERT_LOCKED (&lbm->writer_lock); // We must have the lock
390 //Check if some AS is configured or not
392 pool_foreach(as_index, vip->as_indexes, {
393 as = &lbm->ass[*as_index];
394 if (as->flags & LB_AS_FLAGS_USED) { //At least one AS is in use
396 goto out; //Not sure 'break' works in this macro-loop
402 //Only the default. i.e. no AS
403 vec_validate(new_flow_table, vip->new_flow_table_mask);
404 for (i=0; i<vec_len(new_flow_table); i++)
405 new_flow_table[i].as_index = 0;
410 //First, let's sort the ASs
// Capacity is reserved for every AS of the VIP; only used ones are stored.
411 vec_alloc(sort_arr, pool_elts(vip->as_indexes));
414 pool_foreach(as_index, vip->as_indexes, {
415 as = &lbm->ass[*as_index];
416 if (!(as->flags & LB_AS_FLAGS_USED)) //Not used anymore
419 sort_arr[i].as_index = as - lbm->ass;
// The vector length is set to the number of entries actually written.
422 _vec_len(sort_arr) = i;
424 vec_sort_with_function(sort_arr, lb_pseudorand_compare);
426 //Now let's pseudo-randomly generate permutations
427 vec_foreach(pr, sort_arr) {
428 lb_as_t *as = &lbm->ass[pr->as_index];
430 u64 seed = clib_xxhash(as->address.as_u64[0] ^
431 as->address.as_u64[1]);
432 /* We have 2^n buckets.
433 * skip must be prime with 2^n.
434 * So skip must be odd.
435 * MagLev actually state that M should be prime,
436 * but this has a big computation cost (% operation).
437 * Using 2^n is more better (& operation).
// Low 32 bits of the seed give the (odd) skip, high 32 bits the start.
439 pr->skip = ((seed & 0xffffffff) | 1) & vip->new_flow_table_mask;
440 pr->last = (seed >> 32) & vip->new_flow_table_mask;
443 //Let's create a new flow table
444 vec_validate(new_flow_table, vip->new_flow_table_mask);
// as_index 0 marks a bucket as not yet claimed during the fill below.
445 for (i=0; i<vec_len(new_flow_table); i++)
446 new_flow_table[i].as_index = 0;
450 vec_foreach(pr, sort_arr) {
453 pr->last = (pr->last + pr->skip) & vip->new_flow_table_mask;
454 if (new_flow_table[last].as_index == 0) {
455 new_flow_table[last].as_index = pr->as_index;
460 if (done == vec_len(new_flow_table))
// Publish the new table; the previous one is kept in old_table.
468 old_table = vip->new_flow_table;
469 vip->new_flow_table = new_flow_table;
// Set the global load-balancer configuration: IPv4 and IPv6 source
// addresses, number of sticky buckets per CPU and flow timeout.
// Returns VNET_API_ERROR_INVALID_MEMORY_SIZE when per_cpu_sticky_buckets is
// not a power of two; in that case nothing is modified.
473 int lb_conf(ip4_address_t *ip4_address, ip6_address_t *ip6_address,
474 u32 per_cpu_sticky_buckets, u32 flow_timeout)
476 lb_main_t *lbm = &lb_main;
478 if (!is_pow2(per_cpu_sticky_buckets))
479 return VNET_API_ERROR_INVALID_MEMORY_SIZE;
481 lb_get_writer_lock(); //Not exactly necessary but just a reminder that it exists for my future self
482 lbm->ip4_src_address = *ip4_address;
483 lbm->ip6_src_address = *ip6_address;
484 lbm->per_cpu_sticky_buckets = per_cpu_sticky_buckets;
485 lbm->flow_timeout = flow_timeout;
486 lb_put_writer_lock();
// Search the VIP pool for a used VIP with the given prefix, according to
// lkp_type:
//   LB_LKP_SAME_IP_PORT - same protocol and same port;
//   LB_LKP_ALL_PORT_IP  - the all-port entry for the prefix;
//   LB_LKP_DIFF_IP_PORT - a different protocol or a different port.
// On a match the pool index is stored in *vip_index. Returns
// VNET_API_ERROR_NO_SUCH_ENTRY when nothing matches.
// The prefix is normalized in place before the search.
493 int lb_vip_port_find_index(ip46_address_t *prefix, u8 plen,
494 u8 protocol, u16 port,
495 lb_lkp_type_t lkp_type,
498 lb_main_t *lbm = &lb_main;
500 /* This must be called with the lock owned */
501 CLIB_SPINLOCK_ASSERT_LOCKED (&lbm->writer_lock);
502 ip46_prefix_normalize(prefix, plen);
503 pool_foreach(vip, lbm->vips, {
// NOTE(review): this tests a VIP's flags with the AS constant
// LB_AS_FLAGS_USED; the rest of the file uses LB_VIP_FLAGS_USED for VIPs —
// confirm both constants have the same value.
504 if ((vip->flags & LB_AS_FLAGS_USED) &&
506 vip->prefix.as_u64[0] == prefix->as_u64[0] &&
507 vip->prefix.as_u64[1] == prefix->as_u64[1])
509 if((lkp_type == LB_LKP_SAME_IP_PORT &&
510 vip->protocol == protocol &&
511 vip->port == port) ||
512 (lkp_type == LB_LKP_ALL_PORT_IP &&
514 (lkp_type == LB_LKP_DIFF_IP_PORT &&
515 (vip->protocol != protocol ||
516 vip->port != port) ) )
518 *vip_index = vip - lbm->vips;
523 return VNET_API_ERROR_NO_SUCH_ENTRY;
// Exact lookup (same prefix, protocol and port) of a VIP.
// Does not take the writer lock itself; the callee asserts that it is
// already held.
527 int lb_vip_port_find_index_with_lock(ip46_address_t *prefix, u8 plen,
528 u8 protocol, u16 port, u32 *vip_index)
530 return lb_vip_port_find_index(prefix, plen, protocol, port,
531 LB_LKP_SAME_IP_PORT, vip_index);
// Look up the all-port VIP for a prefix. Protocol and port are passed as
// ~0 and 0; the callee asserts that the writer lock is held.
535 int lb_vip_port_find_all_port_vip(ip46_address_t *prefix, u8 plen,
538 return lb_vip_port_find_index(prefix, plen, ~0, 0,
539 LB_LKP_ALL_PORT_IP, vip_index);
542 /* Find out per-port-vip entry with different protocol and port */
// Returns the callee's result: its failure code is
// VNET_API_ERROR_NO_SUCH_ENTRY. The callee asserts that the writer lock is
// held.
544 int lb_vip_port_find_diff_port(ip46_address_t *prefix, u8 plen,
545 u8 protocol, u16 port, u32 *vip_index)
547 return lb_vip_port_find_index(prefix, plen, protocol, port,
548 LB_LKP_DIFF_IP_PORT, vip_index);
// Public exact lookup of a VIP by prefix, protocol and port.
// Takes the writer lock around the locked variant and stores that call's
// result in ret.
551 int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u8 protocol,
552 u16 port, u32 *vip_index)
555 lb_get_writer_lock();
556 ret = lb_vip_port_find_index_with_lock(prefix, plen,
557 protocol, port, vip_index);
558 lb_put_writer_lock();
// Find, among the ASs attached to vip, the one whose address equals
// *address, and store its global AS pool index in *as_index.
// NOTE(review): the return statements are on lines not visible here;
// callers treat a zero result as "found" — confirm.
562 static int lb_as_find_index_vip(lb_vip_t *vip, ip46_address_t *address, u32 *as_index)
564 lb_main_t *lbm = &lb_main;
565 /* This must be called with the lock owned */
566 CLIB_SPINLOCK_ASSERT_LOCKED (&lbm->writer_lock);
569 pool_foreach(asi, vip->as_indexes, {
570 as = &lbm->ass[*asi];
571 if (as->vip_index == (vip - lbm->vips) &&
572 as->address.as_u64[0] == address->as_u64[0] &&
573 as->address.as_u64[1] == address->as_u64[1])
575 *as_index = as - lbm->ass;
// Attach n application servers (addresses[0..n-1]) to the VIP vip_index.
// Takes the writer lock for the whole operation.
// Returns:
//   VNET_API_ERROR_NO_SUCH_ENTRY             - no VIP at vip_index;
//   VNET_API_ERROR_VALUE_EXIST               - an address is already a used AS;
//   VNET_API_ERROR_INVALID_ADDRESS_FAMILY    - an address does not match the
//                                              VIP's encapsulation family.
// Addresses matching an existing but unused AS are re-enabled; the others
// get a new AS, a next-hop FIB entry and, for NAT VIPs, a SNAT mapping.
// The new-flow table is then rebuilt and garbage collection is attempted.
582 int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n)
584 lb_main_t *lbm = &lb_main;
585 lb_get_writer_lock();
587 if (!(vip = lb_vip_get_by_index(vip_index))) {
588 lb_put_writer_lock();
589 return VNET_API_ERROR_NO_SUCH_ENTRY;
// Address family every new AS must have.
592 ip46_type_t type = lb_encap_is_ip4(vip)?IP46_TYPE_IP4:IP46_TYPE_IP6;
// to_be_added holds indexes into addresses[]; to_be_updated holds AS pool
// indexes of existing entries to re-enable.
593 u32 *to_be_added = 0;
594 u32 *to_be_updated = 0;
597 lb_snat_mapping_t *m;
// Validation pass: nothing is modified until every address was checked.
602 if (!lb_as_find_index_vip(vip, &addresses[n], &i)) {
603 if (lbm->ass[i].flags & LB_AS_FLAGS_USED) {
604 vec_free(to_be_added);
605 vec_free(to_be_updated);
606 lb_put_writer_lock();
607 return VNET_API_ERROR_VALUE_EXIST;
609 vec_add1(to_be_updated, i);
613 if (ip46_address_type(&addresses[n]) != type) {
614 vec_free(to_be_added);
615 vec_free(to_be_updated);
616 lb_put_writer_lock();
617 return VNET_API_ERROR_INVALID_ADDRESS_FAMILY;
622 while(n2--) //Check for duplicates
623 if (addresses[n2].as_u64[0] == addresses[n].as_u64[0] &&
624 addresses[n2].as_u64[1] == addresses[n].as_u64[1])
628 vec_add1(to_be_added, n);
// Re-enable existing ASs. The flags are overwritten, not OR-ed.
635 vec_foreach(ip, to_be_updated) {
636 lbm->ass[*ip].flags = LB_AS_FLAGS_USED;
638 vec_free(to_be_updated);
640 //Create those who have to be created
641 vec_foreach(ip, to_be_added) {
644 pool_get(lbm->ass, as);
645 as->address = addresses[*ip];
646 as->flags = LB_AS_FLAGS_USED;
647 as->vip_index = vip_index;
// Record the global AS index in the VIP's own index pool.
648 pool_get(vip->as_indexes, as_index);
649 *as_index = as - lbm->ass;
652 * become a child of the FIB entry
653 * so we are informed when its forwarding changes
655 fib_prefix_t nh = {};
656 if (lb_encap_is_ip4(vip)) {
657 nh.fp_addr.ip4 = as->address.ip4;
659 nh.fp_proto = FIB_PROTOCOL_IP4;
661 nh.fp_addr.ip6 = as->address.ip6;
663 nh.fp_proto = FIB_PROTOCOL_IP6;
666 as->next_hop_fib_entry_index =
667 fib_table_entry_special_add(0,
670 FIB_ENTRY_FLAG_NONE);
671 as->next_hop_child_index =
672 fib_entry_child_add(as->next_hop_fib_entry_index,
678 if ( lb_vip_is_nat4_port(vip) || lb_vip_is_nat6_port(vip) )
680 /* Add SNAT static mapping */
681 pool_get (lbm->snat_mappings, m);
682 clib_memset (m, 0, sizeof (*m));
683 if (lb_vip_is_nat4_port(vip)) {
684 lb_snat4_key_t m_key4;
685 clib_bihash_kv_8_8_t kv4;
686 m_key4.addr = as->address.ip4;
687 m_key4.port = vip->encap_args.target_port;
689 m_key4.fib_index = 0;
// The mapping's source address depends on the service type: the VIP
// address for clusterip, the configured LB source address for nodeport.
691 if (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP)
693 m->src_ip.ip4 = vip->prefix.ip4;
695 else if (vip->encap_args.srv_type == LB_SRV_TYPE_NODEPORT)
697 m->src_ip.ip4 = lbm->ip4_src_address;
699 m->src_ip_is_ipv6 = 0;
700 m->as_ip.ip4 = as->address.ip4;
701 m->as_ip_is_ipv6 = 0;
702 m->src_port = vip->port;
703 m->target_port = vip->encap_args.target_port;
// Index the mapping by (AS address, target port).
707 kv4.key = m_key4.as_u64;
708 kv4.value = m - lbm->snat_mappings;
709 clib_bihash_add_del_8_8(&lbm->mapping_by_as4, &kv4, 1);
711 lb_snat6_key_t m_key6;
712 clib_bihash_kv_24_8_t kv6;
713 m_key6.addr.as_u64[0] = as->address.ip6.as_u64[0];
714 m_key6.addr.as_u64[1] = as->address.ip6.as_u64[1];
715 m_key6.port = vip->encap_args.target_port;
717 m_key6.fib_index = 0;
719 if (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP)
721 m->src_ip.ip6.as_u64[0] = vip->prefix.ip6.as_u64[0];
722 m->src_ip.ip6.as_u64[1] = vip->prefix.ip6.as_u64[1];
724 else if (vip->encap_args.srv_type == LB_SRV_TYPE_NODEPORT)
726 m->src_ip.ip6.as_u64[0] = lbm->ip6_src_address.as_u64[0];
727 m->src_ip.ip6.as_u64[1] = lbm->ip6_src_address.as_u64[1];
729 m->src_ip_is_ipv6 = 1;
730 m->as_ip.ip6.as_u64[0] = as->address.ip6.as_u64[0];
731 m->as_ip.ip6.as_u64[1] = as->address.ip6.as_u64[1];
732 m->as_ip_is_ipv6 = 1;
733 m->src_port = vip->port;
734 m->target_port = vip->encap_args.target_port;
738 kv6.key[0] = m_key6.as_u64[0];
739 kv6.key[1] = m_key6.as_u64[1];
740 kv6.key[2] = m_key6.as_u64[2];
741 kv6.value = m - lbm->snat_mappings;
742 clib_bihash_add_del_24_8(&lbm->mapping_by_as6, &kv6, 1);
746 vec_free(to_be_added);
// Redistribute buckets over the updated AS set.
749 lb_vip_update_new_flow_table(vip);
751 //Garbage collection maybe
752 lb_vip_garbage_collection(vip);
754 lb_put_writer_lock();
// Walk the sticky hash table of every vlib thread and process the entries
// selected by (vip_index, as_index):
//   vip_index == ~0                    - every entry;
//   as_index == ~0                     - every entry of that VIP;
//   otherwise                          - entries of that VIP pointing at
//                                        that AS.
// For each selected entry one flow reference is moved from the entry's AS
// to AS index 0.
// NOTE(review): what else happens to a selected entry, and the condition
// under which the per-thread table pointer is cleared, are on lines not
// visible here — confirm.
759 lb_flush_vip_as (u32 vip_index, u32 as_index)
762 vlib_thread_main_t *tm = vlib_get_thread_main();
763 lb_main_t *lbm = &lb_main;
765 for(thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++ ) {
766 lb_hash_t *h = lbm->per_cpu[thread_index].sticky_ht;
771 lb_hash_foreach_entry(h, b, i) {
772 if ((vip_index == ~0)
773 || ((b->vip[i] == vip_index) && (as_index == ~0))
774 || ((b->vip[i] == vip_index) && (b->value[i] == as_index)))
776 vlib_refcount_add(&lbm->as_refcount, thread_index, b->value[i], -1);
777 vlib_refcount_add(&lbm->as_refcount, thread_index, 0, 1);
785 lbm->per_cpu[thread_index].sticky_ht = 0;
// Detach n application servers (addresses[0..n-1]) from the VIP vip_index.
// The callees used here assert that the writer lock is held, so the caller
// must own it.
// Returns VNET_API_ERROR_NO_SUCH_ENTRY when the VIP does not exist or when
// one of the addresses is not an AS of the VIP.
// Matching ASs are only marked unused and time-stamped; they are released
// later by garbage collection. The new-flow table is rebuilt afterwards.
793 int lb_vip_del_ass_withlock(u32 vip_index, ip46_address_t *addresses, u32 n,
796 lb_main_t *lbm = &lb_main;
797 u32 now = (u32) vlib_time_now(vlib_get_main());
802 if (!(vip = lb_vip_get_by_index(vip_index))) {
803 return VNET_API_ERROR_NO_SUCH_ENTRY;
// Validation pass: collect the AS index of every requested address.
808 if (lb_as_find_index_vip(vip, &addresses[n], &as_index)) {
810 return VNET_API_ERROR_NO_SUCH_ENTRY;
813 if (n) { //Check for duplicates
816 if (addresses[n2].as_u64[0] == addresses[n].as_u64[0] &&
817 addresses[n2].as_u64[1] == addresses[n].as_u64[1])
822 vec_add1(indexes, as_index);
// Runs before the ASs below are marked unused.
827 //Garbage collection maybe
828 lb_vip_garbage_collection(vip);
830 if (indexes != NULL) {
831 vec_foreach(ip, indexes) {
832 lbm->ass[*ip].flags &= ~LB_AS_FLAGS_USED;
833 lbm->ass[*ip].last_used = now;
// NOTE(review): presumably guarded by the flush argument on a line not
// visible here — confirm.
837 /* flush flow table for deleted ASs*/
838 lb_flush_vip_as(vip_index, *ip);
843 lb_vip_update_new_flow_table(vip);
// Locking wrapper around lb_vip_del_ass_withlock(): takes the writer lock,
// forwards all arguments unchanged and releases the lock.
850 int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n, u8 flush)
852 lb_get_writer_lock();
853 int ret = lb_vip_del_ass_withlock(vip_index, addresses, n, flush);
854 lb_put_writer_lock();
// Allocate a VIP prefix index: picks the first clear bit of
// lbm->vip_prefix_indexes and sets it.
// NOTE(review): the return statement is on a line not visible here.
860 lb_vip_prefix_index_alloc (lb_main_t *lbm)
863 * Check for dynamically allocated instance number.
867 bit = clib_bitmap_first_clear (lbm->vip_prefix_indexes);
869 lbm->vip_prefix_indexes = clib_bitmap_set(lbm->vip_prefix_indexes, bit, 1);
// Give back a VIP prefix index previously handed out by
// lb_vip_prefix_index_alloc(). The bit is first checked: an index that is
// not currently set takes a separate path (not visible here).
875 lb_vip_prefix_index_free (lb_main_t *lbm, u32 instance)
878 if (clib_bitmap_get (lbm->vip_prefix_indexes, instance) == 0)
883 lbm->vip_prefix_indexes = clib_bitmap_set (lbm->vip_prefix_indexes,
890 * Add the VIP adjacency to the ip4 or ip6 fib
// Install the FIB entry that steers traffic for the VIP's prefix to the
// load-balancer node matching the VIP's encapsulation, and report through
// *vip_prefix_index the index carried by the installed DPO.
// The lookup used here asserts that the writer lock is held.
892 static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip,
893 u32 *vip_prefix_index)
895 dpo_proto_t proto = 0;
896 dpo_type_t dpo_type = 0;
901 /* for per-port vip, if VIP adjacency has been added,
902 * no need to add adjacency. */
903 if (!lb_vip_port_find_diff_port(&(vip->prefix), vip->plen,
904 vip->protocol, vip->port, &vip_idx))
909 /* Allocate an index for per-port vip */
910 *vip_prefix_index = lb_vip_prefix_index_alloc(lbm);
// Other branch: the VIP's own pool index is used as prefix index.
914 *vip_prefix_index = vip - lbm->vips;
917 dpo_id_t dpo = DPO_INVALID;
918 fib_prefix_t pfx = {};
919 if (lb_vip_is_ip4(vip->type)) {
920 pfx.fp_addr.ip4 = vip->prefix.ip4;
// 96 is subtracted for IPv4 — presumably plen is kept in 128-bit ip46
// terms; confirm.
921 pfx.fp_len = vip->plen - 96;
922 pfx.fp_proto = FIB_PROTOCOL_IP4;
923 proto = DPO_PROTO_IP4;
925 pfx.fp_addr.ip6 = vip->prefix.ip6;
926 pfx.fp_len = vip->plen;
927 pfx.fp_proto = FIB_PROTOCOL_IP6;
928 proto = DPO_PROTO_IP6;
// Pick the DPO type registered in lb_init() for this encapsulation.
931 if (lb_vip_is_gre4(vip))
932 dpo_type = lbm->dpo_gre4_type;
933 else if (lb_vip_is_gre6(vip))
934 dpo_type = lbm->dpo_gre6_type;
935 else if (lb_vip_is_gre4_port(vip))
936 dpo_type = lbm->dpo_gre4_port_type;
937 else if (lb_vip_is_gre6_port(vip))
938 dpo_type = lbm->dpo_gre6_port_type;
939 else if (lb_vip_is_l3dsr(vip))
940 dpo_type = lbm->dpo_l3dsr_type;
941 else if (lb_vip_is_l3dsr_port(vip))
942 dpo_type = lbm->dpo_l3dsr_port_type;
943 else if(lb_vip_is_nat4_port(vip))
944 dpo_type = lbm->dpo_nat4_port_type;
945 else if (lb_vip_is_nat6_port(vip))
946 dpo_type = lbm->dpo_nat6_port_type;
948 dpo_set(&dpo, dpo_type, proto, *vip_prefix_index);
949 fib_table_entry_special_dpo_add(0,
951 FIB_SOURCE_PLUGIN_HI,
952 FIB_ENTRY_FLAG_EXCLUSIVE,
958 * Add the VIP filter entry
// Add an entry to the vip_index_per_port hash table, keyed by
// (vip_prefix_index, protocol, port). The port is converted with
// clib_host_to_net_u16() before being put in the key.
// NOTE(review): the lines filling kv (key and value) are not visible here;
// vip_idx is presumably the stored value — confirm.
960 static int lb_vip_add_port_filter(lb_main_t *lbm, lb_vip_t *vip,
961 u32 vip_prefix_index, u32 vip_idx)
964 clib_bihash_kv_8_8_t kv;
966 key.vip_prefix_index = vip_prefix_index;
967 key.protocol = vip->protocol;
968 key.port = clib_host_to_net_u16(vip->port);
973 clib_bihash_add_del_8_8(&lbm->vip_index_per_port, &kv, 1);
979 * Del the VIP filter entry
// Remove the vip_index_per_port entry of a VIP, keyed by the VIP's stored
// vip_prefix_index, its protocol and its port (converted with
// clib_host_to_net_u16()).
// Returns VNET_API_ERROR_NO_SUCH_ENTRY, after logging a warning, when the
// entry is not found.
981 static int lb_vip_del_port_filter(lb_main_t *lbm, lb_vip_t *vip)
984 clib_bihash_kv_8_8_t kv, value;
987 key.vip_prefix_index = vip->vip_prefix_index;
988 key.protocol = vip->protocol;
989 key.port = clib_host_to_net_u16(vip->port);
993 if(clib_bihash_search_8_8(&lbm->vip_index_per_port, &kv, &value) != 0)
995 clib_warning("looking up vip_index_per_port failed.");
996 return VNET_API_ERROR_NO_SUCH_ENTRY;
// The stored value is a VIP pool index; it is resolved and written back
// into kv before the delete.
998 m = pool_elt_at_index (lbm->vips, value.value);
1001 kv.value = m - lbm->vips;
1002 clib_bihash_add_del_8_8(&lbm->vip_index_per_port, &kv, 0);
1008 * Deletes the adjacency associated with the VIP
1010 static void lb_vip_del_adjacency(lb_main_t *lbm, lb_vip_t *vip)
1012 fib_prefix_t pfx = {};
1017 /* If this vip adjacency is used by other per-port vip,
1018 * no need to del this adjacency. */
1019 if (!lb_vip_port_find_diff_port(&(vip->prefix), vip->plen,
1020 vip->protocol, vip->port, &vip_idx))
1022 lb_put_writer_lock();
1026 /* Return vip_prefix_index for per-port vip */
1027 lb_vip_prefix_index_free(lbm, vip->vip_prefix_index);
1031 if (lb_vip_is_ip4(vip->type)) {
1032 pfx.fp_addr.ip4 = vip->prefix.ip4;
1033 pfx.fp_len = vip->plen - 96;
1034 pfx.fp_proto = FIB_PROTOCOL_IP4;
1036 pfx.fp_addr.ip6 = vip->prefix.ip6;
1037 pfx.fp_len = vip->plen;
1038 pfx.fp_proto = FIB_PROTOCOL_IP6;
1040 fib_table_entry_special_remove(0, &pfx, FIB_SOURCE_PLUGIN_HI);
// Create a new VIP from args and report its pool index in *vip_index.
// Takes the writer lock for the whole operation.
// Returns:
//   VNET_API_ERROR_VALUE_EXIST            - same prefix/protocol/port already
//                                           configured, per-port vs all-port
//                                           conflict, nodeport already mapped,
//                                           or (see note) DSCP out of range;
//   VNET_API_ERROR_INVALID_ARGUMENT       - type differs from an existing
//                                           per-port VIP of the same prefix;
//   VNET_API_ERROR_INVALID_MEMORY_SIZE    - new_length is not a power of two;
//   VNET_API_ERROR_INVALID_ADDRESS_FAMILY - prefix family does not match type.
// *vip_index is also used as scratch output by the lookups below, so it may
// be modified even when an error is returned.
1043 int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index)
1045 lb_main_t *lbm = &lb_main;
1046 vlib_main_t *vm = vlib_get_main();
1048 lb_vip_type_t type = args.type;
1049 u32 vip_prefix_index = 0;
1051 lb_get_writer_lock();
1052 ip46_prefix_normalize(&(args.prefix), args.plen);
// Reject an exact duplicate.
1054 if (!lb_vip_port_find_index_with_lock(&(args.prefix), args.plen,
1055 args.protocol, args.port,
1058 lb_put_writer_lock();
1059 return VNET_API_ERROR_VALUE_EXIST;
1062 /* Make sure we can't add a per-port VIP entry
1063 * when there already is an all-port VIP for the same prefix. */
1064 if ((args.port != 0) &&
1065 !lb_vip_port_find_all_port_vip(&(args.prefix), args.plen, vip_index))
1067 lb_put_writer_lock();
1068 return VNET_API_ERROR_VALUE_EXIST;
1071 /* Make sure we can't add a all-port VIP entry
1072 * when there already is an per-port VIP for the same prefix. */
1073 if ((args.port == 0) &&
1074 !lb_vip_port_find_diff_port(&(args.prefix), args.plen,
1075 args.protocol, args.port, vip_index))
1077 lb_put_writer_lock();
1078 return VNET_API_ERROR_VALUE_EXIST;
1081 /* Make sure all VIP for a given prefix (using different ports) have the same type. */
1082 if ((args.port != 0) &&
1083 !lb_vip_port_find_diff_port(&(args.prefix), args.plen,
1084 args.protocol, args.port, vip_index)
1085 && (args.type != lbm->vips[*vip_index].type))
1087 lb_put_writer_lock();
1088 return VNET_API_ERROR_INVALID_ARGUMENT;
// The table size is later turned into a mask (new_length - 1).
1091 if (!is_pow2(args.new_length)) {
1092 lb_put_writer_lock();
1093 return VNET_API_ERROR_INVALID_MEMORY_SIZE;
1096 if (ip46_prefix_is_ip4(&(args.prefix), args.plen) &&
1097 !lb_vip_is_ip4(type)) {
1098 lb_put_writer_lock();
1099 return VNET_API_ERROR_INVALID_ADDRESS_FAMILY;
1102 if ((!ip46_prefix_is_ip4(&(args.prefix), args.plen)) &&
1103 !lb_vip_is_ip6(type)) {
1104 lb_put_writer_lock();
1105 return VNET_API_ERROR_INVALID_ADDRESS_FAMILY;
// NOTE(review): an out-of-range DSCP (>= 64) is reported as VALUE_EXIST;
// confirm whether an invalid-value error was intended.
1108 if ((type == LB_VIP_TYPE_IP4_L3DSR) &&
1109 (args.encap_args.dscp >= 64) )
1111 lb_put_writer_lock();
1112 return VNET_API_ERROR_VALUE_EXIST;
// All checks passed: allocate and fill the VIP.
1116 pool_get(lbm->vips, vip);
1119 memcpy (&(vip->prefix), &(args.prefix), sizeof(args.prefix));
1120 vip->plen = args.plen;
1123 vip->protocol = args.protocol;
1124 vip->port = args.port;
1128 vip->protocol = (u8)~0;
1131 vip->last_garbage_collection = (u32) vlib_time_now(vlib_get_main());
1132 vip->type = args.type;
1134 if (args.type == LB_VIP_TYPE_IP4_L3DSR) {
1135 vip->encap_args.dscp = args.encap_args.dscp;
1137 else if ((args.type == LB_VIP_TYPE_IP4_NAT4)
1138 ||(args.type == LB_VIP_TYPE_IP6_NAT6)) {
1139 vip->encap_args.srv_type = args.encap_args.srv_type;
// target_port is stored after host-to-network conversion.
1140 vip->encap_args.target_port =
1141 clib_host_to_net_u16(args.encap_args.target_port);
1144 vip->flags = LB_VIP_FLAGS_USED;
1145 vip->as_indexes = 0;
// Make sure each counter has a zeroed slot at the VIP's pool index.
1149 for (i = 0; i < LB_N_VIP_COUNTERS; i++) {
1150 vlib_validate_simple_counter(&lbm->vip_counters[i], vip - lbm->vips);
1151 vlib_zero_simple_counter(&lbm->vip_counters[i], vip - lbm->vips);
1154 //Configure new flow table
1155 vip->new_flow_table_mask = args.new_length - 1;
1156 vip->new_flow_table = 0;
1158 //Update flow hash table
1159 lb_vip_update_new_flow_table(vip);
1161 //Create adjacency to direct traffic
1162 lb_vip_add_adjacency(lbm, vip, &vip_prefix_index);
1164 if ( (lb_vip_is_nat4_port(vip) || lb_vip_is_nat6_port(vip))
1165 && (args.encap_args.srv_type == LB_SRV_TYPE_NODEPORT) )
1170 //Create maping from nodeport to vip_index
1171 key = clib_host_to_net_u16(args.port);
1172 entry = hash_get_mem (lbm->vip_index_by_nodeport, &key);
// NOTE(review): this early return happens after the VIP was taken from the
// pool and after lb_vip_add_adjacency(); no rollback is visible here —
// confirm.
1174 lb_put_writer_lock();
1175 return VNET_API_ERROR_VALUE_EXIST;
1178 hash_set_mem (lbm->vip_index_by_nodeport, &key, vip - lbm->vips);
1180 /* receive packets destined to NodeIP:NodePort */
1181 udp_register_dst_port (vm, args.port, lb4_nodeport_node.index, 1);
1182 udp_register_dst_port (vm, args.port, lb6_nodeport_node.index, 0);
1185 *vip_index = vip - lbm->vips;
1186 //Create per-port vip filtering table
1189 lb_vip_add_port_filter(lbm, vip, vip_prefix_index, *vip_index);
1190 vip->vip_prefix_index = vip_prefix_index;
1193 lb_put_writer_lock();
// Delete the VIP at vip_index: detach all of its ASs, remove its adjacency
// and per-port filter entry, and clear its USED flag. The pool element
// itself is not released here; lb_garbage_collection() does that once the
// VIP has no AS left.
// Returns VNET_API_ERROR_INVALID_VALUE for the default VIP and
// VNET_API_ERROR_NO_SUCH_ENTRY when no VIP exists at vip_index.
1197 int lb_vip_del(u32 vip_index)
1199 lb_main_t *lbm = &lb_main;
1203 /* Does not remove default vip, i.e. vip_index = 0 */
1205 return VNET_API_ERROR_INVALID_VALUE;
1207 lb_get_writer_lock();
1208 if (!(vip = lb_vip_get_by_index(vip_index))) {
1209 lb_put_writer_lock();
1210 return VNET_API_ERROR_NO_SUCH_ENTRY;
1213 //FIXME: This operation is actually not working
1214 //We will need to remove state before performing this.
// Collect the addresses of every AS of the VIP, then delete them in one
// call. The last argument (flush) is 0.
1218 ip46_address_t *ass = 0;
1222 pool_foreach(as_index, vip->as_indexes, {
1223 as = &lbm->ass[*as_index];
1224 vec_add1(ass, as->address);
1227 lb_vip_del_ass_withlock(vip_index, ass, vec_len(ass), 0);
1232 lb_vip_del_adjacency(lbm, vip);
1234 //Delete per-port vip filtering entry
1237 rv = lb_vip_del_port_filter(lbm, vip);
1240 //Set the VIP as unused
1241 vip->flags &= ~LB_VIP_FLAGS_USED;
1243 lb_put_writer_lock();
// Plugin registration record: build version and human-readable description.
1248 VLIB_PLUGIN_REGISTER () = {
1249 .version = VPP_BUILD_VER,
1250 .description = "Load Balancer (LB)",
1254 u8 *format_lb_dpo (u8 * s, va_list * va)
1256 index_t index = va_arg (*va, index_t);
1257 CLIB_UNUSED(u32 indent) = va_arg (*va, u32);
1258 lb_main_t *lbm = &lb_main;
1259 lb_vip_t *vip = pool_elt_at_index (lbm->vips, index);
1260 return format (s, "%U", format_lb_vip, vip);
// DPO lock / unlock callbacks (installed as dv_lock / dv_unlock in
// lb_init). Both are intentionally empty.
1263 static void lb_dpo_lock (dpo_id_t *dpo) {}
1264 static void lb_dpo_unlock (dpo_id_t *dpo) {}
// FIB node callback (installed as fnv_get in lb_init): index is an AS pool
// index; returns the fib_node embedded in that AS.
1267 lb_fib_node_get_node (fib_node_index_t index)
1269 lb_main_t *lbm = &lb_main;
1270 lb_as_t *as = pool_elt_at_index (lbm->ass, index);
1271 return (&as->fib_node);
// FIB node callback installed as fnv_last_lock in lb_init.
// NOTE(review): the body is not visible in this view.
1275 lb_fib_node_last_lock_gone (fib_node_t *node)
// Recover the enclosing lb_as_t from a pointer to its embedded fib_node
// member, by subtracting the member's offset within lb_as_t.
1280 lb_as_from_fib_node (fib_node_t *node)
1282 return ((lb_as_t*)(((char*)node) -
1283 STRUCT_OFFSET_OF(lb_as_t, fib_node)));
// (Re)stack the DPO of an AS on the forwarding contributed by its next-hop
// FIB entry. The DPO type is the one registered for the encapsulation of
// the VIP the AS belongs to, and the protocol follows the VIP's address
// family.
1287 lb_as_stack (lb_as_t *as)
1289 lb_main_t *lbm = &lb_main;
1290 lb_vip_t *vip = &lbm->vips[as->vip_index];
1291 dpo_type_t dpo_type = 0;
// Same encapsulation-to-DPO-type mapping as in lb_vip_add_adjacency().
1293 if (lb_vip_is_gre4(vip))
1294 dpo_type = lbm->dpo_gre4_type;
1295 else if (lb_vip_is_gre6(vip))
1296 dpo_type = lbm->dpo_gre6_type;
1297 else if (lb_vip_is_gre4_port(vip))
1298 dpo_type = lbm->dpo_gre4_port_type;
1299 else if (lb_vip_is_gre6_port(vip))
1300 dpo_type = lbm->dpo_gre6_port_type;
1301 else if (lb_vip_is_l3dsr(vip))
1302 dpo_type = lbm->dpo_l3dsr_type;
1303 else if (lb_vip_is_l3dsr_port(vip))
1304 dpo_type = lbm->dpo_l3dsr_port_type;
1305 else if(lb_vip_is_nat4_port(vip))
1306 dpo_type = lbm->dpo_nat4_port_type;
1307 else if (lb_vip_is_nat6_port(vip))
1308 dpo_type = lbm->dpo_nat6_port_type;
1311 lb_vip_is_ip4(vip->type)?DPO_PROTO_IP4:DPO_PROTO_IP6,
1313 fib_entry_contribute_ip_forwarding(
1314 as->next_hop_fib_entry_index));
1317 static fib_node_back_walk_rc_t
1318 lb_fib_node_back_walk_notify (fib_node_t *node,
1319 fib_node_back_walk_ctx_t *ctx)
1321 lb_as_stack(lb_as_from_fib_node(node));
1322 return (FIB_NODE_BACK_WALK_CONTINUE);
// Enable or disable the "lb-nat4-in2out" feature on the "ip4-unicast" arc
// of interface sw_if_index. The two calls below differ only in the
// enable/disable argument (0, then 1).
// NOTE(review): the test on is_del selecting between them is on a line not
// visible here; presumably is_del selects the call with 0 — confirm.
1325 int lb_nat4_interface_add_del (u32 sw_if_index, int is_del)
1329 vnet_feature_enable_disable ("ip4-unicast", "lb-nat4-in2out",
1330 sw_if_index, 0, 0, 0);
1334 vnet_feature_enable_disable ("ip4-unicast", "lb-nat4-in2out",
1335 sw_if_index, 1, 0, 0);
// Enable or disable the "lb-nat6-in2out" feature on the "ip6-unicast" arc
// of interface sw_if_index. The two calls below differ only in the
// enable/disable argument (0, then 1).
// NOTE(review): the test on is_del selecting between them is on a line not
// visible here; presumably is_del selects the call with 0 — confirm.
1341 int lb_nat6_interface_add_del (u32 sw_if_index, int is_del)
1345 vnet_feature_enable_disable ("ip6-unicast", "lb-nat6-in2out",
1346 sw_if_index, 0, 0, 0);
1350 vnet_feature_enable_disable ("ip6-unicast", "lb-nat6-in2out",
1351 sw_if_index, 1, 0, 0);
// Plugin initialisation: sets up lb_main. In order: default VIP, per-thread
// state and writer lock, default configuration values, DPO and FIB node
// type registration, AS reference counters, default AS, the default VIP's
// flow table, the lookup hash tables and the VIP counter names.
1358 lb_init (vlib_main_t * vm)
1360 vlib_thread_main_t *tm = vlib_get_thread_main ();
1361 lb_main_t *lbm = &lb_main;
1362 lbm->vnet_main = vnet_get_main ();
1363 lbm->vlib_main = vm;
1365 lb_vip_t *default_vip;
1366 lb_as_t *default_as;
// Callback tables for the FIB node type and the DPO types registered below.
1367 fib_node_vft_t lb_fib_node_vft = {
1368 .fnv_get = lb_fib_node_get_node,
1369 .fnv_last_lock = lb_fib_node_last_lock_gone,
1370 .fnv_back_walk = lb_fib_node_back_walk_notify,
1372 dpo_vft_t lb_vft = {
1373 .dv_lock = lb_dpo_lock,
1374 .dv_unlock = lb_dpo_unlock,
1375 .dv_format = format_lb_dpo,
1378 //Allocate and init default VIP.
// First element taken from the VIP pool; lb_vip_del() refers to the default
// VIP as vip_index 0.
1380 pool_get(lbm->vips, default_vip);
1381 default_vip->new_flow_table_mask = 0;
1382 default_vip->prefix.ip6.as_u64[0] = 0xffffffffffffffffL;
1383 default_vip->prefix.ip6.as_u64[1] = 0xffffffffffffffffL;
1384 default_vip->protocol = ~0;
1385 default_vip->port = 0;
1386 default_vip->flags = LB_VIP_FLAGS_USED;
// One per_cpu slot per vlib thread.
1389 vec_validate(lbm->per_cpu, tm->n_vlib_mains - 1);
1390 clib_spinlock_init (&lbm->writer_lock);
1391 lbm->per_cpu_sticky_buckets = LB_DEFAULT_PER_CPU_STICKY_BUCKETS;
1392 lbm->flow_timeout = LB_DEFAULT_FLOW_TIMEOUT;
// Source addresses start as all-ones until lb_conf() sets them.
1393 lbm->ip4_src_address.as_u32 = 0xffffffff;
1394 lbm->ip6_src_address.as_u64[0] = 0xffffffffffffffffL;
1395 lbm->ip6_src_address.as_u64[1] = 0xffffffffffffffffL;
// One DPO type per encapsulation flavour, each with its node-name table.
1396 lbm->dpo_gre4_type = dpo_register_new_type(&lb_vft, lb_dpo_gre4_nodes);
1397 lbm->dpo_gre6_type = dpo_register_new_type(&lb_vft, lb_dpo_gre6_nodes);
1398 lbm->dpo_gre4_port_type = dpo_register_new_type(&lb_vft,
1399 lb_dpo_gre4_port_nodes);
1400 lbm->dpo_gre6_port_type = dpo_register_new_type(&lb_vft,
1401 lb_dpo_gre6_port_nodes);
1402 lbm->dpo_l3dsr_type = dpo_register_new_type(&lb_vft,
1403 lb_dpo_l3dsr_nodes);
1404 lbm->dpo_l3dsr_port_type = dpo_register_new_type(&lb_vft,
1405 lb_dpo_l3dsr_port_nodes);
1406 lbm->dpo_nat4_port_type = dpo_register_new_type(&lb_vft,
1407 lb_dpo_nat4_port_nodes);
1408 lbm->dpo_nat6_port_type = dpo_register_new_type(&lb_vft,
1409 lb_dpo_nat6_port_nodes);
1410 lbm->fib_node_type = fib_node_register_new_type(&lb_fib_node_vft);
1412 //Init AS reference counters
1413 vlib_refcount_init(&lbm->as_refcount);
1415 //Allocate and init default AS.
// First element taken from the AS pool; flow-table buckets without a real
// AS hold as_index 0. Its DPO sends to LB_NEXT_DROP.
1417 pool_get(lbm->ass, default_as);
1418 default_as->flags = 0;
1419 default_as->dpo.dpoi_next_node = LB_NEXT_DROP;
1420 default_as->vip_index = ~0;
1421 default_as->address.ip6.as_u64[0] = 0xffffffffffffffffL;
1422 default_as->address.ip6.as_u64[1] = 0xffffffffffffffffL;
1424 /* Generate a valid flow table for default VIP */
1425 default_vip->as_indexes = NULL;
// The table builder asserts that the writer lock is held.
1426 lb_get_writer_lock();
1427 lb_vip_update_new_flow_table(default_vip);
1428 lb_put_writer_lock();
// nodeport (u16 key) -> VIP index.
1430 lbm->vip_index_by_nodeport
1431 = hash_create_mem (0, sizeof(u16), sizeof (uword));
// (prefix index, protocol, port) -> VIP index.
1433 clib_bihash_init_8_8 (&lbm->vip_index_per_port,
1434 "vip_index_per_port", LB_VIP_PER_PORT_BUCKETS,
1435 LB_VIP_PER_PORT_MEMORY_SIZE);
// SNAT mapping lookup by AS address and target port, IPv4 then IPv6.
1437 clib_bihash_init_8_8 (&lbm->mapping_by_as4,
1438 "mapping_by_as4", LB_MAPPING_BUCKETS,
1439 LB_MAPPING_MEMORY_SIZE);
1441 clib_bihash_init_24_8 (&lbm->mapping_by_as6,
1442 "mapping_by_as6", LB_MAPPING_BUCKETS,
1443 LB_MAPPING_MEMORY_SIZE);
// Expand the counter list once, assigning each counter its name.
1445 #define _(a,b,c) lbm->vip_counters[c].name = b;
1446 lb_foreach_vip_counter
// Register lb_init as a vlib init function.
1451 VLIB_INIT_FUNCTION (lb_init);