2 * Copyright (c) 2016 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
17 #include <vnet/plugin/plugin.h>
18 #include <vnet/api_errno.h>
//Garbage collection runs at most once every so many seconds (per VIP)
#define LB_GARBAGE_RUN 60

//An AS may only be reclaimed after being unused for so many seconds.
//It is assumed that inter-core race condition will not occur within this window.
#define LB_CONCURRENCY_TIMEOUT 10

//Acquire the single global writer lock: spin on a test-and-set until it is free.
#define lb_get_writer_lock() do {} while(__sync_lock_test_and_set (lb_main.writer_lock, 1))
//Release the writer lock with a plain store.
//NOTE(review): no explicit release barrier here; presumably adequate on x86 —
//confirm before porting to a weaker memory model.
#define lb_put_writer_lock() lb_main.writer_lock[0] = 0
31 u32 lb_hash_time_now(vlib_main_t * vm)
33 return (u32) (vlib_time_now(vm) + 10000);
/** Pretty-print the global lb_main state: configured source addresses,
 * VIP/AS counts, and per-core sticky hash-table usage ("%U" callback). */
u8 *format_lb_main (u8 * s, va_list * args)
  vlib_thread_main_t *tm = vlib_get_thread_main();
  lb_main_t *lbm = &lb_main;
  s = format(s, "lb_main");
  s = format(s, " ip4-src-address: %U \n", format_ip4_address, &lbm->ip4_src_address);
  s = format(s, " ip6-src-address: %U \n", format_ip6_address, &lbm->ip6_src_address);
  s = format(s, " #vips: %u\n", pool_elts(lbm->vips));
  //-1 excludes the hidden "default" AS allocated once at init time (see lb_init)
  s = format(s, " #ass: %u\n", pool_elts(lbm->ass) - 1);
  //One sticky hash table per vlib main (worker)
  for(cpu_index = 0; cpu_index < tm->n_vlib_mains; cpu_index++ ) {
    lb_hash_t *h = lbm->per_cpu[cpu_index].sticky_ht;
    //NOTE(review): h is dereferenced unchecked below; presumably a NULL
    //guard exists in the full source — confirm sticky_ht is always set here.
    s = format(s, "core %d\n", cpu_index);
    s = format(s, " timeout: %ds\n", h->timeout);
    s = format(s, " usage: %d / %d\n", lb_hash_elts(h, lb_hash_time_now(vlib_get_main())), lb_hash_size(h));
//Human-readable name for each lb_vip_type_t, indexed by enum value
//(used by format_lb_vip_type / unformat_lb_vip_type below).
static char *lb_vip_type_strings[] = {
  [LB_VIP_TYPE_IP6_GRE6] = "ip6-gre6",
  [LB_VIP_TYPE_IP6_GRE4] = "ip6-gre4",
  [LB_VIP_TYPE_IP4_GRE6] = "ip4-gre6",
  [LB_VIP_TYPE_IP4_GRE4] = "ip4-gre4",
66 u8 *format_lb_vip_type (u8 * s, va_list * args)
68 lb_vip_type_t vipt = va_arg (*args, lb_vip_type_t);
70 for (i=0; i<LB_VIP_N_TYPES; i++)
72 return format(s, lb_vip_type_strings[i]);
73 return format(s, "_WRONG_TYPE_");
/** Parse a VIP type name (e.g. "ip6-gre4") from 'input' into *vipt.
 * Follows the usual unformat convention (non-zero on match). */
uword unformat_lb_vip_type (unformat_input_t * input, va_list * args)
  lb_vip_type_t *vipt = va_arg (*args, lb_vip_type_t *);
  //Try every known type name; the string table is indexed by enum value
  for (i=0; i<LB_VIP_N_TYPES; i++)
    if (unformat(input, lb_vip_type_strings[i])) {
/** One-line VIP summary: type, prefix, new-flow-table size, AS count,
 * and a " removed" tag once the VIP is logically deleted. */
u8 *format_lb_vip (u8 * s, va_list * args)
  lb_vip_t *vip = va_arg (*args, lb_vip_t *);
  //new_flow_table_mask is (table size - 1); size is a power of two
  return format(s, "%U %U new_size:%u #as:%u%s",
                format_lb_vip_type, vip->type,
                format_ip46_prefix, &vip->prefix, vip->plen, IP46_TYPE_ANY,
                vip->new_flow_table_mask + 1,
                pool_elts(vip->as_indexes),
                (vip->flags & LB_VIP_FLAGS_USED)?"":" removed");
/** One-line AS summary: address plus used/removed state. */
u8 *format_lb_as (u8 * s, va_list * args)
  lb_as_t *as = va_arg (*args, lb_as_t *);
  return format(s, "%U %s", format_ip46_address, &as->address, (as->flags & LB_AS_FLAGS_USED)?"used":"removed");
/** Multi-line detailed dump of one VIP: header line, per-VIP counters,
 * and for each AS its bucket share of the new-flow table plus its
 * established-flow reference count. */
u8 *format_lb_vip_detailed (u8 * s, va_list * args)
  lb_main_t *lbm = &lb_main;
  lb_vip_t *vip = va_arg (*args, lb_vip_t *);
  //Indent continuation lines to the current output column
  uword indent = format_get_indent (s);

  s = format(s, "%U %U [%u] %U%s\n"
             format_white_space, indent,
             format_lb_vip_type, vip->type,
             vip - lbm->vips, format_ip46_prefix, &vip->prefix, vip->plen, IP46_TYPE_ANY,
             (vip->flags & LB_VIP_FLAGS_USED)?"":" removed",
             format_white_space, indent,
             vip->new_flow_table_mask + 1);

  //Per-VIP simple counters (names assigned in lb_init)
  s = format(s, "%U counters:\n",
             format_white_space, indent);
  for (i=0; i<LB_N_VIP_COUNTERS; i++)
    s = format(s, "%U %s: %d\n",
               format_white_space, indent,
               lbm->vip_counters[i].name,
               vlib_get_simple_counter(&lbm->vip_counters[i], vip - lbm->vips));

  s = format(s, "%U #as:%u\n",
             format_white_space, indent,
             pool_elts(vip->as_indexes));

  //Count how many new-flow-table buckets each AS currently owns.
  //The vector is indexed by global AS pool index.
  vec_validate(count, pool_len(lbm->ass)); //Possibly big alloc for not much...
  lb_new_flow_entry_t *nfe;
  vec_foreach(nfe, vip->new_flow_table)
    count[nfe->as_index]++;

  pool_foreach(as_index, vip->as_indexes, {
      as = &lbm->ass[*as_index];
      //Flow count comes from the per-AS refcount (one ref per sticky flow)
      s = format(s, "%U %U %d buckets %d flows %s\n", format_white_space, indent,
                 format_ip46_address, &as->address, IP46_TYPE_ANY,
                 count[as - lbm->ass],
                 vlib_refcount_get(&lbm->as_refcount, as - lbm->ass),
                 (as->flags & LB_AS_FLAGS_USED)?"used":" removed");

  //Raw dump of the new-flow table: bucket index -> AS pool index
  s = format(s, "%U new flows table:\n", format_white_space, indent);
  lb_new_flow_entry_t *nfe;
  vec_foreach(nfe, vip->new_flow_table) {
    s = format(s, "%U %d: %d\n", format_white_space, indent, nfe - vip->new_flow_table, nfe->as_index);
172 static int lb_pseudorand_compare(void *a, void *b)
175 lb_main_t *lbm = &lb_main;
176 asa = &lbm->ass[((lb_pseudorand_t *)a)->as_index];
177 asb = &lbm->ass[((lb_pseudorand_t *)b)->as_index];
178 return memcmp(&asa->address, &asb->address, sizeof(asb->address));
/** Reclaim this VIP's deleted ASs once they are provably idle.
 * Rate-limited to one pass per LB_GARBAGE_RUN seconds.
 * Caller must hold the writer lock. */
static void lb_vip_garbage_collection(lb_vip_t *vip)
  lb_main_t *lbm = &lb_main;
  ASSERT (lbm->writer_lock[0]);

  u32 now = (u32) vlib_time_now(vlib_get_main());
  //Rate limit: skip unless LB_GARBAGE_RUN seconds elapsed (wrap-safe compare)
  if (!clib_u32_loop_gt(now, vip->last_garbage_collection + LB_GARBAGE_RUN))

  vip->last_garbage_collection = now;

  pool_foreach(as_index, vip->as_indexes, {
      as = &lbm->ass[*as_index];
      if (!(as->flags & LB_AS_FLAGS_USED) && //Not used
          clib_u32_loop_gt(now, as->last_used + LB_CONCURRENCY_TIMEOUT) && //Not recently used
          (vlib_refcount_get(&lbm->as_refcount, as - lbm->ass) == 0)) { //Not referenced
        //NOTE(review): entries are pool_put while pool_foreach is iterating;
        //this relies on pool_put never moving storage — confirm that the
        //pool implementation guarantees this during iteration.
        pool_put(vip->as_indexes, as_index);
        pool_put(lbm->ass, as);
204 void lb_garbage_collection()
206 lb_main_t *lbm = &lb_main;
207 lb_get_writer_lock();
209 u32 *to_be_removed_vips = 0, *i;
210 pool_foreach(vip, lbm->vips, {
211 lb_vip_garbage_collection(vip);
213 if (!(vip->flags & LB_VIP_FLAGS_USED) &&
214 (pool_elts(vip->as_indexes) == 0)) {
215 vec_add1(to_be_removed_vips, vip - lbm->vips);
219 vec_foreach(i, to_be_removed_vips) {
220 vip = &lbm->vips[*i];
221 pool_put(lbm->vips, vip);
222 pool_free(vip->as_indexes);
225 vec_free(to_be_removed_vips);
226 lb_put_writer_lock();
/**
 * Rebuild this VIP's MagLev "new flow" table from its current active ASs.
 * If no AS is active, every bucket points at the default AS (index 0).
 * Otherwise active ASs are sorted deterministically, each derives a
 * pseudo-random (offset, skip) pair from a hash of its address, and buckets
 * are filled round-robin MagLev-style until the table is complete.
 * Caller must hold the writer lock.
 */
static void lb_vip_update_new_flow_table(lb_vip_t *vip)
  lb_main_t *lbm = &lb_main;
  lb_new_flow_entry_t *old_table;
  lb_new_flow_entry_t *new_flow_table = 0;
  lb_pseudorand_t *pr, *sort_arr = 0;

  ASSERT (lbm->writer_lock[0]); //We must have the lock

  //Check if some AS is configured or not
  pool_foreach(as_index, vip->as_indexes, {
      as = &lbm->ass[*as_index];
      if (as->flags & LB_AS_FLAGS_USED) { //Found at least one active AS
        goto out; //Not sure 'break' works in this macro-loop

  //Only the default. i.e. no AS
  vec_validate(new_flow_table, vip->new_flow_table_mask);
  for (i=0; i<vec_len(new_flow_table); i++)
    new_flow_table[i].as_index = 0;

  //First, let's sort the ASs
  vec_alloc(sort_arr, pool_elts(vip->as_indexes));
  pool_foreach(as_index, vip->as_indexes, {
      as = &lbm->ass[*as_index];
      if (!(as->flags & LB_AS_FLAGS_USED)) //Not used anymore
      sort_arr[i].as_index = as - lbm->ass;
  //Shrink the vector to the number of active ASs actually copied in
  _vec_len(sort_arr) = i;

  //Sort by address so the permutation is independent of pool ordering
  vec_sort_with_function(sort_arr, lb_pseudorand_compare);

  //Now let's pseudo-randomly generate permutations
  vec_foreach(pr, sort_arr) {
    lb_as_t *as = &lbm->ass[pr->as_index];
    //Seed each AS's permutation from a hash of its address
    u64 seed = clib_xxhash(as->address.as_u64[0] ^
                           as->address.as_u64[1]);
    /* We have 2^n buckets.
     * skip must be coprime with 2^n.
     * So skip must be odd.
     * MagLev actually states that M should be prime,
     * but this has a big computation cost (% operation).
     * Using 2^n is cheaper (single & operation).
    pr->skip = ((seed & 0xffffffff) | 1) & vip->new_flow_table_mask;
    pr->last = (seed >> 32) & vip->new_flow_table_mask;

  //Let's create a new flow table
  vec_validate(new_flow_table, vip->new_flow_table_mask);
  for (i=0; i<vec_len(new_flow_table); i++)
    new_flow_table[i].as_index = ~0;   //~0 marks a still-unassigned bucket

  //Round-robin over the ASs, each claiming its next unclaimed bucket
  vec_foreach(pr, sort_arr) {
    pr->last = (pr->last + pr->skip) & vip->new_flow_table_mask;
    if (new_flow_table[last].as_index == ~0) {
      new_flow_table[last].as_index = pr->as_index;
  //Stop once every bucket has an owner
  if (done == vec_len(new_flow_table))

  //Count number of changed entries
  for (i=0; i<vec_len(new_flow_table); i++)
    if (vip->new_flow_table == 0 ||
        new_flow_table[i].as_index != vip->new_flow_table[i].as_index)

  //Swap in the freshly built table; old one is retired below
  old_table = vip->new_flow_table;
  vip->new_flow_table = new_flow_table;
333 int lb_conf(ip4_address_t *ip4_address, ip6_address_t *ip6_address,
334 u32 per_cpu_sticky_buckets, u32 flow_timeout)
336 lb_main_t *lbm = &lb_main;
338 if (!is_pow2(per_cpu_sticky_buckets))
339 return VNET_API_ERROR_INVALID_MEMORY_SIZE;
341 lb_get_writer_lock(); //Not exactly necessary but just a reminder that it exists for my future self
342 lbm->ip4_src_address = *ip4_address;
343 lbm->ip6_src_address = *ip6_address;
344 lbm->per_cpu_sticky_buckets = per_cpu_sticky_buckets;
345 lbm->flow_timeout = flow_timeout;
346 lb_put_writer_lock();
351 int lb_vip_find_index_with_lock(ip46_address_t *prefix, u8 plen, u32 *vip_index)
353 lb_main_t *lbm = &lb_main;
355 ASSERT (lbm->writer_lock[0]); //This must be called with the lock owned
356 ip46_prefix_normalize(prefix, plen);
357 pool_foreach(vip, lbm->vips, {
358 if ((vip->flags & LB_AS_FLAGS_USED) &&
360 vip->prefix.as_u64[0] == prefix->as_u64[0] &&
361 vip->prefix.as_u64[1] == prefix->as_u64[1]) {
362 *vip_index = vip - lbm->vips;
366 return VNET_API_ERROR_NO_SUCH_ENTRY;
369 int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index)
372 lb_get_writer_lock();
373 ret = lb_vip_find_index_with_lock(prefix, plen, vip_index);
374 lb_put_writer_lock();
/** Find the pool index of the AS with the given address inside one VIP's
 * AS set. Caller must hold the writer lock.
 * On match, sets *as_index to the global AS pool index. */
static int lb_as_find_index_vip(lb_vip_t *vip, ip46_address_t *address, u32 *as_index)
  lb_main_t *lbm = &lb_main;
  ASSERT (lbm->writer_lock[0]); //This must be called with the lock owned
  pool_foreach(asi, vip->as_indexes, {
      as = &lbm->ass[*asi];
      //Match on both owning VIP and the full 128-bit address
      if (as->vip_index == (vip - lbm->vips) &&
          as->address.as_u64[0] == address->as_u64[0] &&
          as->address.as_u64[1] == address->as_u64[1]) {
        *as_index = as - lbm->ass;
/** Add 'n' application-server addresses to a VIP.
 * Validates all addresses (existence, family, duplicates) before mutating any
 * state, so the call is all-or-nothing. Takes the writer lock internally.
 * Returns 0 on success or a VNET_API_ERROR_* code. */
int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n)
  lb_main_t *lbm = &lb_main;
  lb_get_writer_lock();
  if (!(vip = lb_vip_get_by_index(vip_index))) {
    lb_put_writer_lock();
    return VNET_API_ERROR_NO_SUCH_ENTRY;

  //Address family the VIP's encapsulation expects for its ASs
  ip46_type_t type = lb_vip_is_gre4(vip)?IP46_TYPE_IP4:IP46_TYPE_IP6;
  u32 *to_be_added = 0;
  u32 *to_be_updated = 0;

  //First pass: classify each address as re-activation or fresh creation
  if (!lb_as_find_index_vip(vip, &addresses[n], &i)) {
    if (lbm->ass[i].flags & LB_AS_FLAGS_USED) {
      //Already present and active: reject the whole batch
      vec_free(to_be_added);
      vec_free(to_be_updated);
      lb_put_writer_lock();
      return VNET_API_ERROR_VALUE_EXIST;
    //Present but logically removed: will be re-activated
    vec_add1(to_be_updated, i);

  if (ip46_address_type(&addresses[n]) != type) {
    vec_free(to_be_added);
    vec_free(to_be_updated);
    lb_put_writer_lock();
    return VNET_API_ERROR_INVALID_ADDRESS_FAMILY;

  while(n2--) //Check for duplicates
    if (addresses[n2].as_u64[0] == addresses[n].as_u64[0] &&
        addresses[n2].as_u64[1] == addresses[n].as_u64[1])

  vec_add1(to_be_added, n);

  //Re-activate those which only need their USED flag back
  vec_foreach(ip, to_be_updated) {
    lbm->ass[*ip].flags = LB_AS_FLAGS_USED;
  vec_free(to_be_updated);

  //Create those who have to be created
  vec_foreach(ip, to_be_added) {
    pool_get(lbm->ass, as);
    as->address = addresses[*ip];
    as->flags = LB_AS_FLAGS_USED;
    as->vip_index = vip_index;
    //Register the new AS in the VIP's index set
    pool_get(vip->as_indexes, as_index);
    *as_index = as - lbm->ass;
  vec_free(to_be_added);

  //Rebuild the MagLev table to include the new/re-activated ASs
  lb_vip_update_new_flow_table(vip);

  //Garbage collection maybe
  lb_vip_garbage_collection(vip);

  lb_put_writer_lock();
/** Logically remove 'n' AS addresses from a VIP (flags cleared, last_used
 * stamped); actual memory reclaim happens later in garbage collection so
 * in-flight flows can drain. Caller must hold the writer lock. */
int lb_vip_del_ass_withlock(u32 vip_index, ip46_address_t *addresses, u32 n)
  lb_main_t *lbm = &lb_main;
  //Timestamp recorded as each AS's last_used, starting the GC grace period
  u32 now = (u32) vlib_time_now(vlib_get_main());

  if (!(vip = lb_vip_get_by_index(vip_index))) {
    return VNET_API_ERROR_NO_SUCH_ENTRY;

  //Validate the whole batch before touching any state
  if (lb_as_find_index_vip(vip, &addresses[n], &i)) {
    return VNET_API_ERROR_NO_SUCH_ENTRY;

  if (n) { //Check for duplicates
    if (addresses[n2].as_u64[0] == addresses[n].as_u64[0] &&
        addresses[n2].as_u64[1] == addresses[n].as_u64[1])

  vec_add1(indexes, i);

  //Garbage collection maybe
  lb_vip_garbage_collection(vip);

  if (indexes != NULL) {
    vec_foreach(ip, indexes) {
      //Clear USED and stamp the time; GC frees the AS once flows drain
      lbm->ass[*ip].flags &= ~LB_AS_FLAGS_USED;
      lbm->ass[*ip].last_used = now;

    //Rebuild the MagLev table without the removed ASs
    lb_vip_update_new_flow_table(vip);
526 int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n)
528 lb_get_writer_lock();
529 int ret = lb_vip_del_ass_withlock(vip_index, addresses, n);
530 lb_put_writer_lock();
/**
 * Add the VIP adjacency to the ip4 or ip6 FIB so traffic for the VIP
 * prefix is steered into the LB lookup node.
 * Caller must hold the writer lock.
 */
static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip)
  memset (&adj, 0, sizeof (adj));
  adj.explicit_fib_index = ~0;
  //Stash the VIP index in the adjacency's opaque data for the data plane
  lb_adj_data_t *ad = (lb_adj_data_t *) &adj.opaque;
  ad->vip_index = vip - lbm->vips;

  ASSERT (lbm->writer_lock[0]); //This must be called with the lock owned
  //Next node depends on the VIP's encapsulation type
  u32 lookup_next_index = lbm->ip_lookup_next_index[vip->type];

  if (lb_vip_is_ip4(vip)) {
    adj.lookup_next_index = lookup_next_index;
    ip4_add_del_route_args_t route_args = {};
    ip4_main_t *im4 = &ip4_main;
    route_args.table_index_or_table_id = 0;
    route_args.flags = IP4_ROUTE_FLAG_ADD;
    route_args.dst_address = vip->prefix.ip4;
    //v4 prefixes are stored ip4-mapped in an ip6 container,
    //hence plen - 96 recovers the native ip4 prefix length
    route_args.dst_address_length = vip->plen - 96;
    route_args.adj_index = ~0;
    route_args.add_adj = &adj;
    route_args.n_add_adj = 1;
    ip4_add_del_route (im4, &route_args);

    adj.lookup_next_index = lookup_next_index;
    ip6_add_del_route_args_t route_args = {};
    ip6_main_t *im6 = &ip6_main;
    route_args.table_index_or_table_id = 0;
    route_args.flags = IP6_ROUTE_FLAG_ADD;
    route_args.dst_address = vip->prefix.ip6;
    route_args.dst_address_length = vip->plen;
    route_args.adj_index = ~0;
    route_args.add_adj = &adj;
    route_args.n_add_adj = 1;
    ip6_add_del_route (im6, &route_args);
578 * Deletes the adjacency associated with the VIP
580 static void lb_vip_del_adjacency(lb_main_t *lbm, lb_vip_t *vip)
582 ASSERT (lbm->writer_lock[0]); //This must be called with the lock owned
583 if (lb_vip_is_ip4(vip)) {
584 ip4_main_t *im4 = &ip4_main;
585 ip4_add_del_route_args_t route_args = {};
586 route_args.table_index_or_table_id = 0;
587 route_args.flags = IP4_ROUTE_FLAG_DEL;
588 route_args.dst_address = vip->prefix.ip4;
589 route_args.dst_address_length = vip->plen - 96;
590 route_args.adj_index = ~0;
591 route_args.add_adj = NULL;
592 route_args.n_add_adj = 0;
593 ip4_add_del_route (im4, &route_args);
595 ip6_main_t *im6 = &ip6_main;
596 ip6_add_del_route_args_t route_args = {};
597 route_args.table_index_or_table_id = 0;
598 route_args.flags = IP6_ROUTE_FLAG_DEL;
599 route_args.dst_address = vip->prefix.ip6;
600 route_args.dst_address_length = vip->plen;
601 route_args.adj_index = ~0;
602 route_args.add_adj = NULL;
603 route_args.n_add_adj = 0;
604 ip6_add_del_route (im6, &route_args);
608 int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u32 new_length, u32 *vip_index)
610 lb_main_t *lbm = &lb_main;
612 lb_get_writer_lock();
613 ip46_prefix_normalize(prefix, plen);
615 if (!lb_vip_find_index_with_lock(prefix, plen, vip_index)) {
616 lb_put_writer_lock();
617 return VNET_API_ERROR_VALUE_EXIST;
620 if (!is_pow2(new_length)) {
621 lb_put_writer_lock();
622 return VNET_API_ERROR_INVALID_MEMORY_SIZE;
625 if (ip46_prefix_is_ip4(prefix, plen) &&
626 (type != LB_VIP_TYPE_IP4_GRE4) &&
627 (type != LB_VIP_TYPE_IP4_GRE6))
628 return VNET_API_ERROR_INVALID_ADDRESS_FAMILY;
632 pool_get(lbm->vips, vip);
635 vip->prefix = *prefix;
637 vip->last_garbage_collection = (u32) vlib_time_now(vlib_get_main());
639 vip->flags = LB_VIP_FLAGS_USED;
644 for (i = 0; i < LB_N_VIP_COUNTERS; i++) {
645 vlib_validate_simple_counter(&lbm->vip_counters[i], vip - lbm->vips);
646 vlib_zero_simple_counter(&lbm->vip_counters[i], vip - lbm->vips);
649 //Configure new flow table
650 vip->new_flow_table_mask = new_length - 1;
651 vip->new_flow_table = 0;
653 //Create a new flow hash table full of the default entry
654 lb_vip_update_new_flow_table(vip);
656 //Create adjacency to direct traffic
657 lb_vip_add_adjacency(lbm, vip);
660 *vip_index = vip - lbm->vips;
662 lb_put_writer_lock();
/** Logically delete a VIP: remove all its ASs, withdraw its FIB route and
 * clear its USED flag. The pool slot itself is recycled later by
 * lb_garbage_collection(). Takes the writer lock internally. */
int lb_vip_del(u32 vip_index)
  lb_main_t *lbm = &lb_main;
  lb_get_writer_lock();
  if (!(vip = lb_vip_get_by_index(vip_index))) {
    lb_put_writer_lock();
    return VNET_API_ERROR_NO_SUCH_ENTRY;

  //FIXME: This operation is actually not working
  //We will need to remove state before performing this.

  //Snapshot every AS address, then remove them as one batch
  ip46_address_t *ass = 0;
  pool_foreach(as_index, vip->as_indexes, {
      as = &lbm->ass[*as_index];
      vec_add1(ass, as->address);

  lb_vip_del_ass_withlock(vip_index, ass, vec_len(ass));
  //NOTE(review): no vec_free(ass) visible in this view — presumably it is
  //freed in the elided lines; confirm the snapshot vector is not leaked.

  //Withdraw the FIB route steering traffic to this VIP
  lb_vip_del_adjacency(lbm, vip);

  //Set the VIP as unused
  vip->flags &= ~LB_VIP_FLAGS_USED;

  lb_put_writer_lock();
704 vlib_plugin_register (vlib_main_t * vm,
705 vnet_plugin_handoff_t * h,
708 clib_error_t *error = 0;
/** Module init: allocate per-core state, the writer lock, default
 * configuration, the AS refcounts and the reserved default AS. */
lb_init (vlib_main_t * vm)
  vlib_thread_main_t *tm = vlib_get_thread_main ();
  lb_main_t *lbm = &lb_main;

  //One per-core entry per vlib main (workers + main)
  vec_validate(lbm->per_cpu, tm->n_vlib_mains - 1);
  //Cache-line aligned lock word to avoid false sharing
  lbm->writer_lock = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES);
  lbm->writer_lock[0] = 0;
  lbm->per_cpu_sticky_buckets = LB_DEFAULT_PER_CPU_STICKY_BUCKETS;
  lbm->flow_timeout = LB_DEFAULT_FLOW_TIMEOUT;
  //All-ones source addresses: presumably a "not configured" sentinel until
  //lb_conf() is called — confirm how the data plane treats this value.
  lbm->ip4_src_address.as_u32 = 0xffffffff;
  lbm->ip6_src_address.as_u64[0] = 0xffffffffffffffffL;
  lbm->ip6_src_address.as_u64[1] = 0xffffffffffffffffL;

  //Init AS reference counters
  vlib_refcount_init(&lbm->as_refcount);

  //Allocate and init default AS (pool index 0, used by empty flow tables).
  pool_get(lbm->ass, default_as);
  default_as->flags = 0;
  default_as->vip_index = ~0;
  default_as->address.ip6.as_u64[0] = 0xffffffffffffffffL;
  default_as->address.ip6.as_u64[1] = 0xffffffffffffffffL;

//Assign the human-readable name to each per-VIP counter
#define _(a,b,c) lbm->vip_counters[c].name = b;
  lb_foreach_vip_counter

VLIB_INIT_FUNCTION (lb_init);