/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 #include <vnet/plugin/plugin.h>
18 #include <vpp/app/version.h>
19 #include <vnet/api_errno.h>
20 #include <vnet/udp/udp.h>
/* Garbage collection runs at most once every LB_GARBAGE_RUN seconds
 * per VIP (see lb_vip_garbage_collection). */
#define LB_GARBAGE_RUN 60

/* An unused AS is only reclaimed this many seconds after its last use.
 * It is assumed that an inter-core race condition cannot span this delay. */
#define LB_CONCURRENCY_TIMEOUT 10

/* Simple spin lock serializing all control-plane (writer) operations.
 * Data-plane readers never take it; they rely on refcounts and the
 * concurrency timeout above. */
#define lb_get_writer_lock() do {} while(__sync_lock_test_and_set (lb_main.writer_lock, 1))
#define lb_put_writer_lock() lb_main.writer_lock[0] = 0
33 static void lb_as_stack (lb_as_t *as);
36 const static char * const lb_dpo_gre4_ip4[] = { "lb4-gre4" , NULL };
37 const static char * const lb_dpo_gre4_ip6[] = { "lb6-gre4" , NULL };
38 const static char* const * const lb_dpo_gre4_nodes[DPO_PROTO_NUM] =
40 [DPO_PROTO_IP4] = lb_dpo_gre4_ip4,
41 [DPO_PROTO_IP6] = lb_dpo_gre4_ip6,
44 const static char * const lb_dpo_gre6_ip4[] = { "lb4-gre6" , NULL };
45 const static char * const lb_dpo_gre6_ip6[] = { "lb6-gre6" , NULL };
46 const static char* const * const lb_dpo_gre6_nodes[DPO_PROTO_NUM] =
48 [DPO_PROTO_IP4] = lb_dpo_gre6_ip4,
49 [DPO_PROTO_IP6] = lb_dpo_gre6_ip6,
52 const static char * const lb_dpo_l3dsr_ip4[] = { "lb4-l3dsr" , NULL };
53 const static char* const * const lb_dpo_l3dsr_nodes[DPO_PROTO_NUM] =
55 [DPO_PROTO_IP4] = lb_dpo_l3dsr_ip4,
58 const static char * const lb_dpo_nat4_ip4[] = { "lb4-nat4" , NULL };
59 const static char* const * const lb_dpo_nat4_nodes[DPO_PROTO_NUM] =
61 [DPO_PROTO_IP4] = lb_dpo_nat4_ip4,
64 const static char * const lb_dpo_nat6_ip6[] = { "lb6-nat6" , NULL };
65 const static char* const * const lb_dpo_nat6_nodes[DPO_PROTO_NUM] =
67 [DPO_PROTO_IP6] = lb_dpo_nat6_ip6,
70 u32 lb_hash_time_now(vlib_main_t * vm)
72 return (u32) (vlib_time_now(vm) + 10000);
75 u8 *format_lb_main (u8 * s, va_list * args)
77 vlib_thread_main_t *tm = vlib_get_thread_main();
78 lb_main_t *lbm = &lb_main;
79 s = format(s, "lb_main");
80 s = format(s, " ip4-src-address: %U \n", format_ip4_address, &lbm->ip4_src_address);
81 s = format(s, " ip6-src-address: %U \n", format_ip6_address, &lbm->ip6_src_address);
82 s = format(s, " #vips: %u\n", pool_elts(lbm->vips));
83 s = format(s, " #ass: %u\n", pool_elts(lbm->ass) - 1);
86 for(thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++ ) {
87 lb_hash_t *h = lbm->per_cpu[thread_index].sticky_ht;
89 s = format(s, "core %d\n", thread_index);
90 s = format(s, " timeout: %ds\n", h->timeout);
91 s = format(s, " usage: %d / %d\n", lb_hash_elts(h, lb_hash_time_now(vlib_get_main())), lb_hash_size(h));
98 static char *lb_vip_type_strings[] = {
99 [LB_VIP_TYPE_IP6_GRE6] = "ip6-gre6",
100 [LB_VIP_TYPE_IP6_GRE4] = "ip6-gre4",
101 [LB_VIP_TYPE_IP4_GRE6] = "ip4-gre6",
102 [LB_VIP_TYPE_IP4_GRE4] = "ip4-gre4",
103 [LB_VIP_TYPE_IP4_L3DSR] = "ip4-l3dsr",
104 [LB_VIP_TYPE_IP4_NAT4] = "ip4-nat4",
105 [LB_VIP_TYPE_IP6_NAT6] = "ip6-nat6",
108 u8 *format_lb_vip_type (u8 * s, va_list * args)
110 lb_vip_type_t vipt = va_arg (*args, lb_vip_type_t);
112 for (i=0; i<LB_VIP_N_TYPES; i++)
114 return format(s, lb_vip_type_strings[i]);
115 return format(s, "_WRONG_TYPE_");
118 uword unformat_lb_vip_type (unformat_input_t * input, va_list * args)
120 lb_vip_type_t *vipt = va_arg (*args, lb_vip_type_t *);
122 for (i=0; i<LB_VIP_N_TYPES; i++)
123 if (unformat(input, lb_vip_type_strings[i])) {
130 u8 *format_lb_vip (u8 * s, va_list * args)
132 lb_vip_t *vip = va_arg (*args, lb_vip_t *);
133 s = format(s, "%U %U new_size:%u #as:%u%s",
134 format_lb_vip_type, vip->type,
135 format_ip46_prefix, &vip->prefix, vip->plen, IP46_TYPE_ANY,
136 vip->new_flow_table_mask + 1,
137 pool_elts(vip->as_indexes),
138 (vip->flags & LB_VIP_FLAGS_USED)?"":" removed");
140 if (vip->type == LB_VIP_TYPE_IP4_L3DSR)
142 s = format(s, " dscp:%u", vip->encap_args.dscp);
144 else if ((vip->type == LB_VIP_TYPE_IP4_NAT4)
145 || (vip->type == LB_VIP_TYPE_IP6_NAT6))
147 if (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP)
148 s = format (s, " type:clusterip port:%u target_port:%u",
149 ntohs (vip->encap_args.port),
150 ntohs (vip->encap_args.target_port));
152 s = format (s, " type:nodeport node_port:%u target_port:%u",
153 ntohs (vip->encap_args.node_port),
154 ntohs (vip->encap_args.target_port));
160 u8 *format_lb_as (u8 * s, va_list * args)
162 lb_as_t *as = va_arg (*args, lb_as_t *);
163 return format(s, "%U %s", format_ip46_address,
164 &as->address, IP46_TYPE_ANY,
165 (as->flags & LB_AS_FLAGS_USED)?"used":"removed");
168 u8 *format_lb_vip_detailed (u8 * s, va_list * args)
170 lb_main_t *lbm = &lb_main;
171 lb_vip_t *vip = va_arg (*args, lb_vip_t *);
172 u32 indent = format_get_indent (s);
174 s = format(s, "%U %U [%lu] %U%s\n"
176 format_white_space, indent,
177 format_lb_vip_type, vip->type,
179 format_ip46_prefix, &vip->prefix, (u32) vip->plen, IP46_TYPE_ANY,
180 (vip->flags & LB_VIP_FLAGS_USED)?"":" removed",
181 format_white_space, indent,
182 vip->new_flow_table_mask + 1);
184 if (vip->type == LB_VIP_TYPE_IP4_L3DSR)
186 s = format(s, "%U dscp:%u\n",
187 format_white_space, indent,
188 vip->encap_args.dscp);
190 else if ((vip->type == LB_VIP_TYPE_IP4_NAT4)
191 || (vip->type == LB_VIP_TYPE_IP6_NAT6))
193 if (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP)
194 s = format (s, "%U type:clusterip port:%u target_port:%u",
195 format_white_space, indent, ntohs (vip->encap_args.port),
196 ntohs (vip->encap_args.target_port));
198 s = format (s, "%U type:nodeport node_port:%u target_port:%u",
199 format_white_space, indent,
200 ntohs (vip->encap_args.node_port),
201 ntohs (vip->encap_args.target_port));
205 s = format(s, "%U counters:\n",
206 format_white_space, indent);
208 for (i=0; i<LB_N_VIP_COUNTERS; i++)
209 s = format(s, "%U %s: %d\n",
210 format_white_space, indent,
211 lbm->vip_counters[i].name,
212 vlib_get_simple_counter(&lbm->vip_counters[i], vip - lbm->vips));
215 s = format(s, "%U #as:%u\n",
216 format_white_space, indent,
217 pool_elts(vip->as_indexes));
219 //Let's count the buckets for each AS
221 vec_validate(count, pool_len(lbm->ass)); //Possibly big alloc for not much...
222 lb_new_flow_entry_t *nfe;
223 vec_foreach(nfe, vip->new_flow_table)
224 count[nfe->as_index]++;
228 pool_foreach(as_index, vip->as_indexes, {
229 as = &lbm->ass[*as_index];
230 s = format(s, "%U %U %d buckets %d flows dpo:%u %s\n",
231 format_white_space, indent,
232 format_ip46_address, &as->address, IP46_TYPE_ANY,
233 count[as - lbm->ass],
234 vlib_refcount_get(&lbm->as_refcount, as - lbm->ass),
236 (as->flags & LB_AS_FLAGS_USED)?"used":" removed");
242 s = format(s, "%U new flows table:\n", format_white_space, indent);
243 lb_new_flow_entry_t *nfe;
244 vec_foreach(nfe, vip->new_flow_table) {
245 s = format(s, "%U %d: %d\n", format_white_space, indent, nfe - vip->new_flow_table, nfe->as_index);
257 static int lb_pseudorand_compare(void *a, void *b)
260 lb_main_t *lbm = &lb_main;
261 asa = &lbm->ass[((lb_pseudorand_t *)a)->as_index];
262 asb = &lbm->ass[((lb_pseudorand_t *)b)->as_index];
263 return memcmp(&asa->address, &asb->address, sizeof(asb->address));
266 static void lb_vip_garbage_collection(lb_vip_t *vip)
268 lb_main_t *lbm = &lb_main;
269 lb_snat4_key_t m_key4;
270 clib_bihash_kv_8_8_t kv4, value4;
271 lb_snat6_key_t m_key6;
272 clib_bihash_kv_24_8_t kv6, value6;
273 lb_snat_mapping_t *m = 0;
274 ASSERT (lbm->writer_lock[0]);
276 u32 now = (u32) vlib_time_now(vlib_get_main());
277 if (!clib_u32_loop_gt(now, vip->last_garbage_collection + LB_GARBAGE_RUN))
280 vip->last_garbage_collection = now;
283 pool_foreach(as_index, vip->as_indexes, {
284 as = &lbm->ass[*as_index];
285 if (!(as->flags & LB_AS_FLAGS_USED) && //Not used
286 clib_u32_loop_gt(now, as->last_used + LB_CONCURRENCY_TIMEOUT) && //Not recently used
287 (vlib_refcount_get(&lbm->as_refcount, as - lbm->ass) == 0))
290 if (lb_vip_is_nat4(vip)) {
291 m_key4.addr = as->address.ip4;
292 m_key4.port = vip->encap_args.target_port;
294 m_key4.fib_index = 0;
296 kv4.key = m_key4.as_u64;
297 if(!clib_bihash_search_8_8(&lbm->mapping_by_as4, &kv4, &value4))
298 m = pool_elt_at_index (lbm->snat_mappings, value4.value);
301 kv4.value = m - lbm->snat_mappings;
302 clib_bihash_add_del_8_8(&lbm->mapping_by_as4, &kv4, 0);
303 pool_put (lbm->snat_mappings, m);
304 } else if (lb_vip_is_nat6(vip)) {
305 m_key6.addr.as_u64[0] = as->address.ip6.as_u64[0];
306 m_key6.addr.as_u64[1] = as->address.ip6.as_u64[1];
307 m_key6.port = vip->encap_args.target_port;
309 m_key6.fib_index = 0;
311 kv6.key[0] = m_key6.as_u64[0];
312 kv6.key[1] = m_key6.as_u64[1];
313 kv6.key[2] = m_key6.as_u64[2];
315 if (!clib_bihash_search_24_8 (&lbm->mapping_by_as6, &kv6, &value6))
316 m = pool_elt_at_index (lbm->snat_mappings, value6.value);
319 kv6.value = m - lbm->snat_mappings;
320 clib_bihash_add_del_24_8(&lbm->mapping_by_as6, &kv6, 0);
321 pool_put (lbm->snat_mappings, m);
323 fib_entry_child_remove(as->next_hop_fib_entry_index,
324 as->next_hop_child_index);
325 fib_table_entry_delete_index(as->next_hop_fib_entry_index,
327 as->next_hop_fib_entry_index = FIB_NODE_INDEX_INVALID;
329 pool_put(vip->as_indexes, as_index);
330 pool_put(lbm->ass, as);
335 void lb_garbage_collection()
337 lb_main_t *lbm = &lb_main;
338 lb_get_writer_lock();
340 u32 *to_be_removed_vips = 0, *i;
341 pool_foreach(vip, lbm->vips, {
342 lb_vip_garbage_collection(vip);
344 if (!(vip->flags & LB_VIP_FLAGS_USED) &&
345 (pool_elts(vip->as_indexes) == 0)) {
346 vec_add1(to_be_removed_vips, vip - lbm->vips);
350 vec_foreach(i, to_be_removed_vips) {
351 vip = &lbm->vips[*i];
352 pool_put(lbm->vips, vip);
353 pool_free(vip->as_indexes);
356 vec_free(to_be_removed_vips);
357 lb_put_writer_lock();
360 static void lb_vip_update_new_flow_table(lb_vip_t *vip)
362 lb_main_t *lbm = &lb_main;
363 lb_new_flow_entry_t *old_table;
365 lb_new_flow_entry_t *new_flow_table = 0;
367 lb_pseudorand_t *pr, *sort_arr = 0;
370 ASSERT (lbm->writer_lock[0]); //We must have the lock
372 //Check if some AS is configured or not
374 pool_foreach(as_index, vip->as_indexes, {
375 as = &lbm->ass[*as_index];
376 if (as->flags & LB_AS_FLAGS_USED) { //Not used anymore
378 goto out; //Not sure 'break' works in this macro-loop
384 //Only the default. i.e. no AS
385 vec_validate(new_flow_table, vip->new_flow_table_mask);
386 for (i=0; i<vec_len(new_flow_table); i++)
387 new_flow_table[i].as_index = 0;
392 //First, let's sort the ASs
394 vec_alloc(sort_arr, pool_elts(vip->as_indexes));
397 pool_foreach(as_index, vip->as_indexes, {
398 as = &lbm->ass[*as_index];
399 if (!(as->flags & LB_AS_FLAGS_USED)) //Not used anymore
402 sort_arr[i].as_index = as - lbm->ass;
405 _vec_len(sort_arr) = i;
407 vec_sort_with_function(sort_arr, lb_pseudorand_compare);
409 //Now let's pseudo-randomly generate permutations
410 vec_foreach(pr, sort_arr) {
411 lb_as_t *as = &lbm->ass[pr->as_index];
413 u64 seed = clib_xxhash(as->address.as_u64[0] ^
414 as->address.as_u64[1]);
415 /* We have 2^n buckets.
416 * skip must be prime with 2^n.
417 * So skip must be odd.
418 * MagLev actually state that M should be prime,
419 * but this has a big computation cost (% operation).
420 * Using 2^n is more better (& operation).
422 pr->skip = ((seed & 0xffffffff) | 1) & vip->new_flow_table_mask;
423 pr->last = (seed >> 32) & vip->new_flow_table_mask;
426 //Let's create a new flow table
427 vec_validate(new_flow_table, vip->new_flow_table_mask);
428 for (i=0; i<vec_len(new_flow_table); i++)
429 new_flow_table[i].as_index = ~0;
433 vec_foreach(pr, sort_arr) {
436 pr->last = (pr->last + pr->skip) & vip->new_flow_table_mask;
437 if (new_flow_table[last].as_index == ~0) {
438 new_flow_table[last].as_index = pr->as_index;
443 if (done == vec_len(new_flow_table))
452 //Count number of changed entries
454 for (i=0; i<vec_len(new_flow_table); i++)
455 if (vip->new_flow_table == 0 ||
456 new_flow_table[i].as_index != vip->new_flow_table[i].as_index)
459 old_table = vip->new_flow_table;
460 vip->new_flow_table = new_flow_table;
464 int lb_conf(ip4_address_t *ip4_address, ip6_address_t *ip6_address,
465 u32 per_cpu_sticky_buckets, u32 flow_timeout)
467 lb_main_t *lbm = &lb_main;
469 if (!is_pow2(per_cpu_sticky_buckets))
470 return VNET_API_ERROR_INVALID_MEMORY_SIZE;
472 lb_get_writer_lock(); //Not exactly necessary but just a reminder that it exists for my future self
473 lbm->ip4_src_address = *ip4_address;
474 lbm->ip6_src_address = *ip6_address;
475 lbm->per_cpu_sticky_buckets = per_cpu_sticky_buckets;
476 lbm->flow_timeout = flow_timeout;
477 lb_put_writer_lock();
482 int lb_vip_find_index_with_lock(ip46_address_t *prefix, u8 plen, u32 *vip_index)
484 lb_main_t *lbm = &lb_main;
486 ASSERT (lbm->writer_lock[0]); //This must be called with the lock owned
487 ip46_prefix_normalize(prefix, plen);
488 pool_foreach(vip, lbm->vips, {
489 if ((vip->flags & LB_AS_FLAGS_USED) &&
491 vip->prefix.as_u64[0] == prefix->as_u64[0] &&
492 vip->prefix.as_u64[1] == prefix->as_u64[1]) {
493 *vip_index = vip - lbm->vips;
497 return VNET_API_ERROR_NO_SUCH_ENTRY;
500 int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index)
503 lb_get_writer_lock();
504 ret = lb_vip_find_index_with_lock(prefix, plen, vip_index);
505 lb_put_writer_lock();
509 static int lb_as_find_index_vip(lb_vip_t *vip, ip46_address_t *address, u32 *as_index)
511 lb_main_t *lbm = &lb_main;
512 ASSERT (lbm->writer_lock[0]); //This must be called with the lock owned
515 pool_foreach(asi, vip->as_indexes, {
516 as = &lbm->ass[*asi];
517 if (as->vip_index == (vip - lbm->vips) &&
518 as->address.as_u64[0] == address->as_u64[0] &&
519 as->address.as_u64[1] == address->as_u64[1]) {
520 *as_index = as - lbm->ass;
527 int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n)
529 lb_main_t *lbm = &lb_main;
530 lb_get_writer_lock();
532 if (!(vip = lb_vip_get_by_index(vip_index))) {
533 lb_put_writer_lock();
534 return VNET_API_ERROR_NO_SUCH_ENTRY;
537 ip46_type_t type = lb_encap_is_ip4(vip)?IP46_TYPE_IP4:IP46_TYPE_IP6;
538 u32 *to_be_added = 0;
539 u32 *to_be_updated = 0;
542 lb_snat_mapping_t *m;
547 if (!lb_as_find_index_vip(vip, &addresses[n], &i)) {
548 if (lbm->ass[i].flags & LB_AS_FLAGS_USED) {
549 vec_free(to_be_added);
550 vec_free(to_be_updated);
551 lb_put_writer_lock();
552 return VNET_API_ERROR_VALUE_EXIST;
554 vec_add1(to_be_updated, i);
558 if (ip46_address_type(&addresses[n]) != type) {
559 vec_free(to_be_added);
560 vec_free(to_be_updated);
561 lb_put_writer_lock();
562 return VNET_API_ERROR_INVALID_ADDRESS_FAMILY;
567 while(n2--) //Check for duplicates
568 if (addresses[n2].as_u64[0] == addresses[n].as_u64[0] &&
569 addresses[n2].as_u64[1] == addresses[n].as_u64[1])
573 vec_add1(to_be_added, n);
580 vec_foreach(ip, to_be_updated) {
581 lbm->ass[*ip].flags = LB_AS_FLAGS_USED;
583 vec_free(to_be_updated);
585 //Create those who have to be created
586 vec_foreach(ip, to_be_added) {
589 pool_get(lbm->ass, as);
590 as->address = addresses[*ip];
591 as->flags = LB_AS_FLAGS_USED;
592 as->vip_index = vip_index;
593 pool_get(vip->as_indexes, as_index);
594 *as_index = as - lbm->ass;
597 * become a child of the FIB entry
598 * so we are informed when its forwarding changes
600 fib_prefix_t nh = {};
601 if (lb_encap_is_ip4(vip)) {
602 nh.fp_addr.ip4 = as->address.ip4;
604 nh.fp_proto = FIB_PROTOCOL_IP4;
606 nh.fp_addr.ip6 = as->address.ip6;
608 nh.fp_proto = FIB_PROTOCOL_IP6;
611 as->next_hop_fib_entry_index =
612 fib_table_entry_special_add(0,
615 FIB_ENTRY_FLAG_NONE);
616 as->next_hop_child_index =
617 fib_entry_child_add(as->next_hop_fib_entry_index,
623 if ( lb_vip_is_nat4(vip) || lb_vip_is_nat6(vip) )
625 /* Add SNAT static mapping */
626 pool_get (lbm->snat_mappings, m);
627 memset (m, 0, sizeof (*m));
628 if (lb_vip_is_nat4(vip)) {
629 lb_snat4_key_t m_key4;
630 clib_bihash_kv_8_8_t kv4;
631 m_key4.addr = as->address.ip4;
632 m_key4.port = vip->encap_args.target_port;
634 m_key4.fib_index = 0;
636 if (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP)
638 m->src_ip.ip4 = vip->prefix.ip4;
639 m->src_port = vip->encap_args.port;
641 else if (vip->encap_args.srv_type == LB_SRV_TYPE_NODEPORT)
643 m->src_ip.ip4 = lbm->ip4_src_address;
644 m->src_port = vip->encap_args.node_port;
646 m->src_ip_is_ipv6 = 0;
647 m->as_ip.ip4 = as->address.ip4;
648 m->as_ip_is_ipv6 = 0;;
649 m->target_port = vip->encap_args.target_port;
653 kv4.key = m_key4.as_u64;
654 kv4.value = m - lbm->snat_mappings;
655 clib_bihash_add_del_8_8(&lbm->mapping_by_as4, &kv4, 1);
657 lb_snat6_key_t m_key6;
658 clib_bihash_kv_24_8_t kv6;
659 m_key6.addr.as_u64[0] = as->address.ip6.as_u64[0];
660 m_key6.addr.as_u64[1] = as->address.ip6.as_u64[1];
661 m_key6.port = vip->encap_args.target_port;
663 m_key6.fib_index = 0;
665 if (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP)
667 m->src_ip.ip6.as_u64[0] = vip->prefix.ip6.as_u64[0];
668 m->src_ip.ip6.as_u64[1] = vip->prefix.ip6.as_u64[1];
669 m->src_port = vip->encap_args.port;
671 else if (vip->encap_args.srv_type == LB_SRV_TYPE_NODEPORT)
673 m->src_ip.ip6.as_u64[0] = lbm->ip6_src_address.as_u64[0];
674 m->src_ip.ip6.as_u64[1] = lbm->ip6_src_address.as_u64[1];
675 m->src_port = vip->encap_args.node_port;
677 m->src_ip_is_ipv6 = 1;
678 m->as_ip.ip6.as_u64[0] = as->address.ip6.as_u64[0];
679 m->as_ip.ip6.as_u64[1] = as->address.ip6.as_u64[1];
680 m->as_ip_is_ipv6 = 1;
681 m->target_port = vip->encap_args.target_port;
685 kv6.key[0] = m_key6.as_u64[0];
686 kv6.key[1] = m_key6.as_u64[1];
687 kv6.key[2] = m_key6.as_u64[2];
688 kv6.value = m - lbm->snat_mappings;
689 clib_bihash_add_del_24_8(&lbm->mapping_by_as6, &kv6, 1);
693 vec_free(to_be_added);
696 lb_vip_update_new_flow_table(vip);
698 //Garbage collection maybe
699 lb_vip_garbage_collection(vip);
701 lb_put_writer_lock();
705 int lb_vip_del_ass_withlock(u32 vip_index, ip46_address_t *addresses, u32 n)
707 lb_main_t *lbm = &lb_main;
708 u32 now = (u32) vlib_time_now(vlib_get_main());
712 if (!(vip = lb_vip_get_by_index(vip_index))) {
713 return VNET_API_ERROR_NO_SUCH_ENTRY;
719 if (lb_as_find_index_vip(vip, &addresses[n], &i)) {
721 return VNET_API_ERROR_NO_SUCH_ENTRY;
724 if (n) { //Check for duplicates
727 if (addresses[n2].as_u64[0] == addresses[n].as_u64[0] &&
728 addresses[n2].as_u64[1] == addresses[n].as_u64[1])
733 vec_add1(indexes, i);
738 //Garbage collection maybe
739 lb_vip_garbage_collection(vip);
741 if (indexes != NULL) {
742 vec_foreach(ip, indexes) {
743 lbm->ass[*ip].flags &= ~LB_AS_FLAGS_USED;
744 lbm->ass[*ip].last_used = now;
748 lb_vip_update_new_flow_table(vip);
755 int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n)
757 lb_get_writer_lock();
758 int ret = lb_vip_del_ass_withlock(vip_index, addresses, n);
759 lb_put_writer_lock();
764 * Add the VIP adjacency to the ip4 or ip6 fib
766 static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip)
768 dpo_proto_t proto = 0;
769 dpo_type_t dpo_type = 0;
771 dpo_id_t dpo = DPO_INVALID;
772 fib_prefix_t pfx = {};
773 if (lb_vip_is_ip4(vip)) {
774 pfx.fp_addr.ip4 = vip->prefix.ip4;
775 pfx.fp_len = vip->plen - 96;
776 pfx.fp_proto = FIB_PROTOCOL_IP4;
777 proto = DPO_PROTO_IP4;
779 pfx.fp_addr.ip6 = vip->prefix.ip6;
780 pfx.fp_len = vip->plen;
781 pfx.fp_proto = FIB_PROTOCOL_IP6;
782 proto = DPO_PROTO_IP6;
785 if (lb_vip_is_gre4(vip))
786 dpo_type = lbm->dpo_gre4_type;
787 else if (lb_vip_is_gre6(vip))
788 dpo_type = lbm->dpo_gre6_type;
789 else if (lb_vip_is_l3dsr(vip))
790 dpo_type = lbm->dpo_l3dsr_type;
791 else if(lb_vip_is_nat4(vip))
792 dpo_type = lbm->dpo_nat4_type;
793 else if (lb_vip_is_nat6(vip))
794 dpo_type = lbm->dpo_nat6_type;
796 dpo_set(&dpo, dpo_type, proto, vip - lbm->vips);
797 fib_table_entry_special_dpo_add(0,
799 FIB_SOURCE_PLUGIN_HI,
800 FIB_ENTRY_FLAG_EXCLUSIVE,
806 * Deletes the adjacency associated with the VIP
808 static void lb_vip_del_adjacency(lb_main_t *lbm, lb_vip_t *vip)
810 fib_prefix_t pfx = {};
811 if (lb_vip_is_ip4(vip)) {
812 pfx.fp_addr.ip4 = vip->prefix.ip4;
813 pfx.fp_len = vip->plen - 96;
814 pfx.fp_proto = FIB_PROTOCOL_IP4;
816 pfx.fp_addr.ip6 = vip->prefix.ip6;
817 pfx.fp_len = vip->plen;
818 pfx.fp_proto = FIB_PROTOCOL_IP6;
820 fib_table_entry_special_remove(0, &pfx, FIB_SOURCE_PLUGIN_HI);
823 int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index)
825 lb_main_t *lbm = &lb_main;
826 vlib_main_t *vm = vlib_get_main();
828 lb_vip_type_t type = args.type;
829 u16 node_port = args.encap_args.node_port;
831 lb_get_writer_lock();
832 ip46_prefix_normalize(&(args.prefix), args.plen);
834 if (!lb_vip_find_index_with_lock(&(args.prefix), args.plen, vip_index)) {
835 lb_put_writer_lock();
836 return VNET_API_ERROR_VALUE_EXIST;
839 if (!is_pow2(args.new_length)) {
840 lb_put_writer_lock();
841 return VNET_API_ERROR_INVALID_MEMORY_SIZE;
844 if (ip46_prefix_is_ip4(&(args.prefix), args.plen) &&
845 (type != LB_VIP_TYPE_IP4_GRE4) &&
846 (type != LB_VIP_TYPE_IP4_GRE6) &&
847 (type != LB_VIP_TYPE_IP4_L3DSR) &&
848 (type != LB_VIP_TYPE_IP4_NAT4)) {
849 lb_put_writer_lock();
850 return VNET_API_ERROR_INVALID_ADDRESS_FAMILY;
853 if ((!ip46_prefix_is_ip4(&(args.prefix), args.plen)) &&
854 (type != LB_VIP_TYPE_IP6_GRE4) &&
855 (type != LB_VIP_TYPE_IP6_GRE6) &&
856 (type != LB_VIP_TYPE_IP6_NAT6)) {
857 lb_put_writer_lock();
858 return VNET_API_ERROR_INVALID_ADDRESS_FAMILY;
861 if ((type == LB_VIP_TYPE_IP4_L3DSR) && (args.encap_args.dscp >= 64 ) )
863 lb_put_writer_lock();
864 return VNET_API_ERROR_VALUE_EXIST;
868 pool_get(lbm->vips, vip);
871 memcpy (&(vip->prefix), &(args.prefix), sizeof(args.prefix));
872 vip->plen = args.plen;
873 vip->last_garbage_collection = (u32) vlib_time_now(vlib_get_main());
874 vip->type = args.type;
876 if (args.type == LB_VIP_TYPE_IP4_L3DSR) {
877 vip->encap_args.dscp = args.encap_args.dscp;
879 else if ((args.type == LB_VIP_TYPE_IP4_NAT4)
880 ||(args.type == LB_VIP_TYPE_IP6_NAT6)) {
881 vip->encap_args.srv_type = args.encap_args.srv_type;
882 vip->encap_args.port = clib_host_to_net_u16(args.encap_args.port);
883 vip->encap_args.target_port =
884 clib_host_to_net_u16(args.encap_args.target_port);
885 vip->encap_args.node_port = clib_host_to_net_u16(node_port);
888 vip->flags = LB_VIP_FLAGS_USED;
893 for (i = 0; i < LB_N_VIP_COUNTERS; i++) {
894 vlib_validate_simple_counter(&lbm->vip_counters[i], vip - lbm->vips);
895 vlib_zero_simple_counter(&lbm->vip_counters[i], vip - lbm->vips);
898 //Configure new flow table
899 vip->new_flow_table_mask = args.new_length - 1;
900 vip->new_flow_table = 0;
902 //Create a new flow hash table full of the default entry
903 lb_vip_update_new_flow_table(vip);
905 //Create adjacency to direct traffic
906 lb_vip_add_adjacency(lbm, vip);
908 if ( (lb_vip_is_nat4(vip) || lb_vip_is_nat6(vip))
909 && (args.encap_args.srv_type == LB_SRV_TYPE_NODEPORT) )
914 //Create maping from nodeport to vip_index
915 key = clib_host_to_net_u16(node_port);
916 entry = hash_get_mem (lbm->vip_index_by_nodeport, &key);
918 lb_put_writer_lock();
919 return VNET_API_ERROR_VALUE_EXIST;
922 hash_set_mem (lbm->vip_index_by_nodeport, &key, vip - lbm->vips);
924 /* receive packets destined to NodeIP:NodePort */
925 udp_register_dst_port (vm, node_port, lb4_nodeport_node.index, 1);
926 udp_register_dst_port (vm, node_port, lb6_nodeport_node.index, 0);
930 *vip_index = vip - lbm->vips;
932 lb_put_writer_lock();
936 int lb_vip_del(u32 vip_index)
938 lb_main_t *lbm = &lb_main;
940 lb_get_writer_lock();
941 if (!(vip = lb_vip_get_by_index(vip_index))) {
942 lb_put_writer_lock();
943 return VNET_API_ERROR_NO_SUCH_ENTRY;
946 //FIXME: This operation is actually not working
947 //We will need to remove state before performing this.
951 ip46_address_t *ass = 0;
954 pool_foreach(as_index, vip->as_indexes, {
955 as = &lbm->ass[*as_index];
956 vec_add1(ass, as->address);
959 lb_vip_del_ass_withlock(vip_index, ass, vec_len(ass));
964 lb_vip_del_adjacency(lbm, vip);
966 //Set the VIP as unused
967 vip->flags &= ~LB_VIP_FLAGS_USED;
969 lb_put_writer_lock();
974 VLIB_PLUGIN_REGISTER () = {
975 .version = VPP_BUILD_VER,
976 .description = "Load Balancer",
980 u8 *format_lb_dpo (u8 * s, va_list * va)
982 index_t index = va_arg (*va, index_t);
983 CLIB_UNUSED(u32 indent) = va_arg (*va, u32);
984 lb_main_t *lbm = &lb_main;
985 lb_vip_t *vip = pool_elt_at_index (lbm->vips, index);
986 return format (s, "%U", format_lb_vip, vip);
989 static void lb_dpo_lock (dpo_id_t *dpo) {}
990 static void lb_dpo_unlock (dpo_id_t *dpo) {}
993 lb_fib_node_get_node (fib_node_index_t index)
995 lb_main_t *lbm = &lb_main;
996 lb_as_t *as = pool_elt_at_index (lbm->ass, index);
997 return (&as->fib_node);
1001 lb_fib_node_last_lock_gone (fib_node_t *node)
1006 lb_as_from_fib_node (fib_node_t *node)
1008 return ((lb_as_t*)(((char*)node) -
1009 STRUCT_OFFSET_OF(lb_as_t, fib_node)));
1013 lb_as_stack (lb_as_t *as)
1015 lb_main_t *lbm = &lb_main;
1016 lb_vip_t *vip = &lbm->vips[as->vip_index];
1017 dpo_type_t dpo_type = 0;
1019 if (lb_vip_is_gre4(vip))
1020 dpo_type = lbm->dpo_gre4_type;
1021 else if (lb_vip_is_gre6(vip))
1022 dpo_type = lbm->dpo_gre6_type;
1023 else if (lb_vip_is_l3dsr(vip))
1024 dpo_type = lbm->dpo_l3dsr_type;
1025 else if(lb_vip_is_nat4(vip))
1026 dpo_type = lbm->dpo_nat4_type;
1027 else if (lb_vip_is_nat6(vip))
1028 dpo_type = lbm->dpo_nat6_type;
1031 lb_vip_is_ip4(vip)?DPO_PROTO_IP4:DPO_PROTO_IP6,
1033 fib_entry_contribute_ip_forwarding(
1034 as->next_hop_fib_entry_index));
1037 static fib_node_back_walk_rc_t
1038 lb_fib_node_back_walk_notify (fib_node_t *node,
1039 fib_node_back_walk_ctx_t *ctx)
1041 lb_as_stack(lb_as_from_fib_node(node));
1042 return (FIB_NODE_BACK_WALK_CONTINUE);
1045 int lb_nat4_interface_add_del (u32 sw_if_index, int is_del)
1049 vnet_feature_enable_disable ("ip4-unicast", "lb-nat4-in2out",
1050 sw_if_index, 0, 0, 0);
1054 vnet_feature_enable_disable ("ip4-unicast", "lb-nat4-in2out",
1055 sw_if_index, 1, 0, 0);
1061 int lb_nat6_interface_add_del (u32 sw_if_index, int is_del)
1065 vnet_feature_enable_disable ("ip6-unicast", "lb-nat6-in2out",
1066 sw_if_index, 0, 0, 0);
1070 vnet_feature_enable_disable ("ip6-unicast", "lb-nat6-in2out",
1071 sw_if_index, 1, 0, 0);
1078 lb_init (vlib_main_t * vm)
1080 vlib_thread_main_t *tm = vlib_get_thread_main ();
1081 lb_main_t *lbm = &lb_main;
1082 lbm->vnet_main = vnet_get_main ();
1083 lbm->vlib_main = vm;
1085 lb_as_t *default_as;
1086 fib_node_vft_t lb_fib_node_vft = {
1087 .fnv_get = lb_fib_node_get_node,
1088 .fnv_last_lock = lb_fib_node_last_lock_gone,
1089 .fnv_back_walk = lb_fib_node_back_walk_notify,
1091 dpo_vft_t lb_vft = {
1092 .dv_lock = lb_dpo_lock,
1093 .dv_unlock = lb_dpo_unlock,
1094 .dv_format = format_lb_dpo,
1099 vec_validate(lbm->per_cpu, tm->n_vlib_mains - 1);
1100 lbm->writer_lock = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES);
1101 lbm->writer_lock[0] = 0;
1102 lbm->per_cpu_sticky_buckets = LB_DEFAULT_PER_CPU_STICKY_BUCKETS;
1103 lbm->flow_timeout = LB_DEFAULT_FLOW_TIMEOUT;
1104 lbm->ip4_src_address.as_u32 = 0xffffffff;
1105 lbm->ip6_src_address.as_u64[0] = 0xffffffffffffffffL;
1106 lbm->ip6_src_address.as_u64[1] = 0xffffffffffffffffL;
1107 lbm->dpo_gre4_type = dpo_register_new_type(&lb_vft, lb_dpo_gre4_nodes);
1108 lbm->dpo_gre6_type = dpo_register_new_type(&lb_vft, lb_dpo_gre6_nodes);
1109 lbm->dpo_l3dsr_type = dpo_register_new_type(&lb_vft, lb_dpo_l3dsr_nodes);
1110 lbm->dpo_nat4_type = dpo_register_new_type(&lb_vft, lb_dpo_nat4_nodes);
1111 lbm->dpo_nat6_type = dpo_register_new_type(&lb_vft, lb_dpo_nat6_nodes);
1112 lbm->fib_node_type = fib_node_register_new_type(&lb_fib_node_vft);
1114 //Init AS reference counters
1115 vlib_refcount_init(&lbm->as_refcount);
1117 //Allocate and init default AS.
1119 pool_get(lbm->ass, default_as);
1120 default_as->flags = 0;
1121 default_as->dpo.dpoi_next_node = LB_NEXT_DROP;
1122 default_as->vip_index = ~0;
1123 default_as->address.ip6.as_u64[0] = 0xffffffffffffffffL;
1124 default_as->address.ip6.as_u64[1] = 0xffffffffffffffffL;
1126 lbm->vip_index_by_nodeport
1127 = hash_create_mem (0, sizeof(u16), sizeof (uword));
1129 clib_bihash_init_8_8 (&lbm->mapping_by_as4,
1130 "mapping_by_as4", LB_MAPPING_BUCKETS,
1131 LB_MAPPING_MEMORY_SIZE);
1133 clib_bihash_init_24_8 (&lbm->mapping_by_as6,
1134 "mapping_by_as6", LB_MAPPING_BUCKETS,
1135 LB_MAPPING_MEMORY_SIZE);
1137 #define _(a,b,c) lbm->vip_counters[c].name = b;
1138 lb_foreach_vip_counter
1143 VLIB_INIT_FUNCTION (lb_init);