fib: add barrier sync, pool/vector expand cases
src/vnet/dpo/load_balance.c (vpp.git)
1 /*
2  * Copyright (c) 2016 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 #include <vnet/ip/lookup.h>
17 #include <vnet/dpo/load_balance.h>
18 #include <vnet/dpo/load_balance_map.h>
19 #include <vnet/dpo/drop_dpo.h>
20 #include <vppinfra/math.h>              /* for fabs */
21 #include <vnet/adj/adj.h>
22 #include <vnet/adj/adj_internal.h>
23 #include <vnet/fib/fib_urpf_list.h>
24 #include <vnet/bier/bier_fwd.h>
25 #include <vnet/fib/mpls_fib.h>
26
27 /*
28  * distribution error tolerance for load-balancing
29  */
30 const f64 multipath_next_hop_error_tolerance = 0.1;
31
32 static const char *load_balance_attr_names[] = LOAD_BALANCE_ATTR_NAMES;
33
34 /**
35  * the logger
36  */
37 vlib_log_class_t load_balance_logger;
38
39 #define LB_DBG(_lb, _fmt, _args...)                                     \
40 {                                                                       \
41     vlib_log_debug(load_balance_logger,                                 \
42                    "lb:[%U]:" _fmt,                                     \
43                    format_load_balance, load_balance_get_index(_lb),    \
44                    LOAD_BALANCE_FORMAT_NONE,                            \
45                    ##_args);                                            \
46 }
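/*
 * Usage (as seen later in this file): LB_DBG(lb, "create") logs the formatted
 * load-balance followed by the message, against the "dpo"/"load-balance"
 * log class registered in load_balance_module_init().
 */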
47
48 /**
49  * Pool of all load-balance DPOs. It's not static so the data-plane (DP) can access it directly.
50  */
51 load_balance_t *load_balance_pool;
52
53 /**
54  * The one instance of load-balance main
55  */
56 load_balance_main_t load_balance_main = {
57     .lbm_to_counters = {
58         .name = "route-to",
59         .stat_segment_name = "/net/route/to",
60     },
61     .lbm_via_counters = {
62         .name = "route-via",
63         .stat_segment_name = "/net/route/via",
64     }
65 };
66
67 f64
68 load_balance_get_multipath_tolerance (void)
69 {
70     return (multipath_next_hop_error_tolerance);
71 }
72
73 static inline index_t
74 load_balance_get_index (const load_balance_t *lb)
75 {
76     return (lb - load_balance_pool);
77 }
78
79 static inline dpo_id_t*
80 load_balance_get_buckets (load_balance_t *lb)
81 {
82     if (LB_HAS_INLINE_BUCKETS(lb))
83     {
84         return (lb->lb_buckets_inline);
85     }
86     else
87     {
88         return (lb->lb_buckets);
89     }
90 }
91
92 static load_balance_t *
93 load_balance_alloc_i (void)
94 {
95     load_balance_t *lb;
96     u8 need_barrier_sync = 0;
97     vlib_main_t *vm = vlib_get_main();
98     ASSERT (vm->thread_index == 0);
99
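    /*
     * The allocation below may expand the pool and/or the counter vectors,
     * which reallocates memory that worker threads read without locks. The
     * *_will_expand() checks predict that case so the workers can be stopped
     * (barrier sync) before the expansion and released once the new memory is
     * in place. In the common, no-expand case no barrier is taken.
     */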
100     pool_get_aligned_will_expand (load_balance_pool, need_barrier_sync,
101                                   CLIB_CACHE_LINE_BYTES);
102     if (need_barrier_sync)
103         vlib_worker_thread_barrier_sync (vm);
104
105     pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES);
106     clib_memset(lb, 0, sizeof(*lb));
107
108     lb->lb_map = INDEX_INVALID;
109     lb->lb_urpf = INDEX_INVALID;
110
111     if (need_barrier_sync == 0)
112     {
113         need_barrier_sync += vlib_validate_combined_counter_will_expand
114             (&(load_balance_main.lbm_to_counters),
115              load_balance_get_index(lb));
116         need_barrier_sync += vlib_validate_combined_counter_will_expand
117             (&(load_balance_main.lbm_via_counters),
118              load_balance_get_index(lb));
119         if (need_barrier_sync)
120             vlib_worker_thread_barrier_sync (vm);
121     }
122
123     vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters),
124                                    load_balance_get_index(lb));
125     vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters),
126                                    load_balance_get_index(lb));
127     vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters),
128                                load_balance_get_index(lb));
129     vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters),
130                                load_balance_get_index(lb));
131
132     if (need_barrier_sync)
133         vlib_worker_thread_barrier_release (vm);
134
135     return (lb);
136 }
137
138 static u8*
139 load_balance_format (index_t lbi,
140                      load_balance_format_flags_t flags,
141                      u32 indent,
142                      u8 *s)
143 {
144     vlib_counter_t to, via;
145     load_balance_t *lb;
146     dpo_id_t *buckets;
147     u32 i;
148
149     lb = load_balance_get(lbi);
150     vlib_get_combined_counter(&(load_balance_main.lbm_to_counters), lbi, &to);
151     vlib_get_combined_counter(&(load_balance_main.lbm_via_counters), lbi, &via);
152     buckets = load_balance_get_buckets(lb);
153
154     s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE);
155     s = format(s, "[proto:%U ", format_dpo_proto, lb->lb_proto);
156     s = format(s, "index:%d buckets:%d ", lbi, lb->lb_n_buckets);
157     s = format(s, "uRPF:%d ", lb->lb_urpf);
158     if (lb->lb_flags)
159     {
160         load_balance_attr_t attr;
161
162         s = format(s, "flags:[");
163
164         FOR_EACH_LOAD_BALANCE_ATTR(attr)
165         {
166             if (lb->lb_flags & (1 << attr))
167             {
168                 s = format (s, "%s", load_balance_attr_names[attr]);
169             }
170         }
171         s = format(s, "] ");
172     }
173     s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes);
174     if (0 != via.packets)
175     {
176         s = format(s, " via:[%Ld:%Ld]",
177                    via.packets, via.bytes);
178     }
179     s = format(s, "]");
180
181     if (INDEX_INVALID != lb->lb_map)
182     {
183         s = format(s, "\n%U%U",
184                    format_white_space, indent+4,
185                    format_load_balance_map, lb->lb_map, indent+4);
186     }
187     for (i = 0; i < lb->lb_n_buckets; i++)
188     {
189         s = format(s, "\n%U[%d] %U",
190                    format_white_space, indent+2,
191                    i,
192                    format_dpo_id,
193                    &buckets[i], indent+6);
194     }
195     return (s);
196 }
197
198 u8*
199 format_load_balance (u8 * s, va_list * args)
200 {
201     index_t lbi = va_arg(*args, index_t);
202     load_balance_format_flags_t flags = va_arg(*args, load_balance_format_flags_t);
203
204     return (load_balance_format(lbi, flags, 0, s));
205 }
206
207 static u8*
208 format_load_balance_dpo (u8 * s, va_list * args)
209 {
210     index_t lbi = va_arg(*args, index_t);
211     u32 indent = va_arg(*args, u32);
212
213     return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s));
214 }
215
216 flow_hash_config_t
217 load_balance_get_default_flow_hash (dpo_proto_t lb_proto)
218 {
219     switch (lb_proto)
220     {
221     case DPO_PROTO_IP4:
222     case DPO_PROTO_IP6:
223         return (IP_FLOW_HASH_DEFAULT);
224
225     case DPO_PROTO_MPLS:
226         return (MPLS_FLOW_HASH_DEFAULT);
227
228     case DPO_PROTO_ETHERNET:
229     case DPO_PROTO_BIER:
230     case DPO_PROTO_NSH:
231         break;
232     }
233
234     return (0);
235 }
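/*
 * Note: IP_FLOW_HASH_DEFAULT and MPLS_FLOW_HASH_DEFAULT come from the IP/MPLS
 * headers pulled in above. Protocols with no sensible default here (ethernet,
 * NSH, BIER) get 0; their hashes are computed in the per-protocol
 * load-balance nodes at the end of this file.
 */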
236
237 static load_balance_t *
238 load_balance_create_i (u32 num_buckets,
239                        dpo_proto_t lb_proto,
240                        flow_hash_config_t fhc)
241 {
242     load_balance_t *lb;
243
244     lb = load_balance_alloc_i();
245     lb->lb_hash_config = fhc;
246     lb->lb_n_buckets = num_buckets;
247     lb->lb_n_buckets_minus_1 = num_buckets-1;
248     lb->lb_proto = lb_proto;
249
250     if (!LB_HAS_INLINE_BUCKETS(lb))
251     {
252         vec_validate_aligned(lb->lb_buckets,
253                              lb->lb_n_buckets - 1,
254                              CLIB_CACHE_LINE_BYTES);
255     }
256
257     LB_DBG(lb, "create");
258
259     return (lb);
260 }
261
262 index_t
263 load_balance_create (u32 n_buckets,
264                      dpo_proto_t lb_proto,
265                      flow_hash_config_t fhc)
266 {
267     return (load_balance_get_index(load_balance_create_i(n_buckets, lb_proto, fhc)));
268 }
269
270 static inline void
271 load_balance_set_bucket_i (load_balance_t *lb,
272                            u32 bucket,
273                            dpo_id_t *buckets,
274                            const dpo_id_t *next)
275 {
276     dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, &buckets[bucket], next);
277 }
278
279 void
280 load_balance_set_bucket (index_t lbi,
281                          u32 bucket,
282                          const dpo_id_t *next)
283 {
284     load_balance_t *lb;
285     dpo_id_t *buckets;
286
287     lb = load_balance_get(lbi);
288     buckets = load_balance_get_buckets(lb);
289
290     ASSERT(bucket < lb->lb_n_buckets);
291
292     load_balance_set_bucket_i(lb, bucket, buckets, next);
293 }
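/*
 * Usage sketch (illustrative only): dpo_a and dpo_b are assumed to be
 * dpo_id_t's already owned by the caller.
 *
 *   index_t lbi;
 *
 *   lbi = load_balance_create(2, DPO_PROTO_IP4,
 *                             load_balance_get_default_flow_hash(DPO_PROTO_IP4));
 *   load_balance_set_bucket(lbi, 0, &dpo_a);
 *   load_balance_set_bucket(lbi, 1, &dpo_b);
 */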
294
295 int
296 load_balance_is_drop (const dpo_id_t *dpo)
297 {
298     load_balance_t *lb;
299
300     if (DPO_LOAD_BALANCE != dpo->dpoi_type)
301         return (0);
302
303     lb = load_balance_get(dpo->dpoi_index);
304
305     if (1 == lb->lb_n_buckets)
306     {
307         return (dpo_is_drop(load_balance_get_bucket_i(lb, 0)));
308     }
309     return (0);
310 }
311
312 u16
313 load_balance_n_buckets (index_t lbi)
314 {
315     load_balance_t *lb;
316
317     lb = load_balance_get(lbi);
318
319     return (lb->lb_n_buckets);
320 }
321
322 void
323 load_balance_set_fib_entry_flags (index_t lbi,
324                                   fib_entry_flag_t flags)
325 {
326     load_balance_t *lb;
327
328     lb = load_balance_get(lbi);
329     lb->lb_fib_entry_flags = flags;
330 }
331
332
333 void
334 load_balance_set_urpf (index_t lbi,
335                        index_t urpf)
336 {
337     load_balance_t *lb;
338     index_t old;
339
340     lb = load_balance_get(lbi);
341
342     /*
343      * Packets in flight may see this change, but the index write is atomic, so they use either the old or the new uRPF list.
344      */
345     old = lb->lb_urpf;
346     lb->lb_urpf = urpf;
347
348     fib_urpf_list_unlock(old);
349     fib_urpf_list_lock(urpf);
350 }
351
352 index_t
353 load_balance_get_urpf (index_t lbi)
354 {
355     load_balance_t *lb;
356
357     lb = load_balance_get(lbi);
358
359     return (lb->lb_urpf);
360 }
361
362 const dpo_id_t *
363 load_balance_get_bucket (index_t lbi,
364                          u32 bucket)
365 {
366     load_balance_t *lb;
367
368     lb = load_balance_get(lbi);
369
370     return (load_balance_get_bucket_i(lb, bucket));
371 }
372
373 static int
374 next_hop_sort_by_weight (const load_balance_path_t * n1,
375                          const load_balance_path_t * n2)
376 {
377     return ((int) n1->path_weight - (int) n2->path_weight);
378 }
379
380 /* The given next-hop vector is over-written with a normalized copy: the entries are
381    sorted by weight and each weight is re-scaled to the number of adjacencies (buckets)
382    assigned to that next hop. Returns the number of adjacencies in the block. */
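/* For example, with the default tolerance two next hops weighted 3:1 normalize
   to a 4-adjacency block carrying weights 3 and 1: the block size starts at the
   number of paths rounded up to a power of 2 and doubles until the rounding
   error is within tolerance. */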
383 u32
384 ip_multipath_normalize_next_hops (const load_balance_path_t * raw_next_hops,
385                                   load_balance_path_t ** normalized_next_hops,
386                                   u32 *sum_weight_in,
387                                   f64 multipath_next_hop_error_tolerance)
388 {
389     load_balance_path_t * nhs;
390     uword n_nhs, n_adj, n_adj_left, i, sum_weight;
391     f64 norm, error;
392
393     n_nhs = vec_len (raw_next_hops);
394     ASSERT (n_nhs > 0);
395     if (n_nhs == 0)
396         return 0;
397
398     /* Allocate enough space for 2 copies; we'll use the second copy to save the original weights. */
399     nhs = *normalized_next_hops;
400     vec_validate (nhs, 2*n_nhs - 1);
401
402     /* Fast path: 1 next hop in block. */
403     n_adj = n_nhs;
404     if (n_nhs == 1)
405     {
406         nhs[0] = raw_next_hops[0];
407         nhs[0].path_weight = 1;
408         _vec_len (nhs) = 1;
409         sum_weight = 1;
410         goto done;
411     }
412
413     else if (n_nhs == 2)
414     {
415         int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;
416
417         /* Fast sort. */
418         nhs[0] = raw_next_hops[cmp];
419         nhs[1] = raw_next_hops[cmp ^ 1];
420
421         /* Fast path: equal cost multipath with 2 next hops. */
422         if (nhs[0].path_weight == nhs[1].path_weight)
423         {
424             nhs[0].path_weight = nhs[1].path_weight = 1;
425             _vec_len (nhs) = 2;
426             sum_weight = 2;
427             goto done;
428         }
429     }
430     else
431     {
432         clib_memcpy_fast (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
433         qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
434     }
435
436     /* Find total weight to normalize weights. */
437     sum_weight = 0;
438     for (i = 0; i < n_nhs; i++)
439         sum_weight += nhs[i].path_weight;
440
441     /* In the unlikely case that all weights are given as 0, set them all to 1. */
442     if (sum_weight == 0)
443     {
444         for (i = 0; i < n_nhs; i++)
445             nhs[i].path_weight = 1;
446         sum_weight = n_nhs;
447     }
448
449     /* Save copies of all next hop weights to avoid being overwritten in loop below. */
450     for (i = 0; i < n_nhs; i++)
451         nhs[n_nhs + i].path_weight = nhs[i].path_weight;
452
453     /* Try larger and larger power of 2 sized adjacency blocks until we find
454        one where the distribution is within the configured error tolerance of the specified weights. */
455     for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
456     {
457         error = 0;
458
459         norm = n_adj / ((f64) sum_weight);
460         n_adj_left = n_adj;
461         for (i = 0; i < n_nhs; i++)
462         {
463             f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */
464             word n = flt_round_nearest (nf);
465
466             n = n > n_adj_left ? n_adj_left : n;
467             n_adj_left -= n;
468             error += fabs (nf - n);
469             nhs[i].path_weight = n;
470
471             if (0 == nhs[i].path_weight)
472             {
473                 /*
474                  * This happens when the weight skew is high (norm is small) and
475                  * nf rounds down to zero. Without this correction a path with a
476                  * low weight would have no representation in the load-balance -
477                  * we don't want that. Setting error = n_adj forces another pass
478                  * with more buckets, so the low-weight path gets at least one.
479                  */
480                 error = n_adj;
481                 break;
482             }
483         }
484
485         nhs[0].path_weight += n_adj_left;
486
487         /* Is the average error per adjacency within the configured tolerance for this block size? */
488         if (error <= multipath_next_hop_error_tolerance*n_adj)
489         {
490             /* Trim the vector back to the n_nhs normalized entries (dropping the saved weights). */
491             _vec_len (nhs) = i;
492             break;
493         }
494     }
495
496 done:
497     /* Save vector for next call. */
498     *normalized_next_hops = nhs;
499     *sum_weight_in = sum_weight;
500     return n_adj;
501 }
502
503 static load_balance_path_t *
504 load_balance_multipath_next_hop_fixup (const load_balance_path_t *nhs,
505                                        dpo_proto_t drop_proto)
506 {
507     if (0 == vec_len(nhs))
508     {
509         load_balance_path_t *new_nhs = NULL, *nh;
510
511         /*
512          * we need something for the load-balance. so use the drop
513          */
514         vec_add2(new_nhs, nh, 1);
515
516         nh->path_weight = 1;
517         dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto));
518
519         return (new_nhs);
520     }
521
522     return (NULL);
523 }
524
525 /*
526  * Fill in adjacencies in block based on corresponding
527  * next hop adjacencies.
528  */
529 static void
530 load_balance_fill_buckets_norm (load_balance_t *lb,
531                                 load_balance_path_t *nhs,
532                                 dpo_id_t *buckets,
533                                 u32 n_buckets)
534 {
535     load_balance_path_t *nh;
536     u16 ii, bucket;
537
538     bucket = 0;
539
540     /*
541      * the next-hops have normalised weights. that means their sum is the number
542      * of buckets we need to fill.
543      */
544     vec_foreach (nh, nhs)
545     {
546         for (ii = 0; ii < nh->path_weight; ii++)
547         {
548             ASSERT(bucket < n_buckets);
549             load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
550         }
551     }
552 }
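
/*
 * Sticky fill: buckets are laid out exactly as in the normal fill, but any
 * bucket that would point at a drop (an unresolved/down path) is back-filled
 * round-robin from the paths that are still forwarding. Flows hashed to
 * healthy paths therefore keep their buckets when another path goes down.
 */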
553 static void
554 load_balance_fill_buckets_sticky (load_balance_t *lb,
555                                   load_balance_path_t *nhs,
556                                   dpo_id_t *buckets,
557                                   u32 n_buckets)
558 {
559     load_balance_path_t *nh, *fwding_paths;
560     u16 ii, bucket, fpath;
561
562     fpath = bucket = 0;
563     fwding_paths = NULL;
564
565     vec_foreach (nh, nhs)
566     {
567         if (!dpo_is_drop(&nh->path_dpo))
568         {
569             vec_add1(fwding_paths, *nh);
570         }
571     }
572     if (vec_len(fwding_paths) == 0)
573         fwding_paths = vec_dup(nhs);
574
575     /*
576      * the next-hops have normalised weights. that means their sum is the number
577      * of buckets we need to fill.
578      */
579     vec_foreach (nh, nhs)
580     {
581         for (ii = 0; ii < nh->path_weight; ii++)
582         {
583             ASSERT(bucket < n_buckets);
584             if (!dpo_is_drop(&nh->path_dpo))
585             {
586                 load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
587             }
588             else
589             {
590                 /* fill the bucket from the next forwarding path (round-robin) */
591                 load_balance_set_bucket_i(lb, bucket++, buckets, &fwding_paths[fpath].path_dpo);
592                 fpath = (fpath + 1) % vec_len(fwding_paths);
593             }
594         }
595     }
596
597     vec_free(fwding_paths);
598 }
599
600 static void
601 load_balance_fill_buckets (load_balance_t *lb,
602                            load_balance_path_t *nhs,
603                            dpo_id_t *buckets,
604                            u32 n_buckets,
605                            load_balance_flags_t flags)
606 {
607     if (flags & LOAD_BALANCE_FLAG_STICKY)
608     {
609         load_balance_fill_buckets_sticky(lb, nhs, buckets, n_buckets);
610     }
611     else
612     {
613         load_balance_fill_buckets_norm(lb, nhs, buckets, n_buckets);
614     }
615 }
616
617 static inline void
618 load_balance_set_n_buckets (load_balance_t *lb,
619                             u32 n_buckets)
620 {
621     lb->lb_n_buckets = n_buckets;
622     lb->lb_n_buckets_minus_1 = n_buckets-1;
623 }
624
625 void
626 load_balance_multipath_update (const dpo_id_t *dpo,
627                                const load_balance_path_t * raw_nhs,
628                                load_balance_flags_t flags)
629 {
630     load_balance_path_t *nh, *nhs, *fixed_nhs;
631     u32 sum_of_weights, n_buckets, ii;
632     index_t lbmi, old_lbmi;
633     load_balance_t *lb;
634     dpo_id_t *tmp_dpo;
635
636     nhs = NULL;
637
638     ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
639     lb = load_balance_get(dpo->dpoi_index);
640     lb->lb_flags = flags;
641     fixed_nhs = load_balance_multipath_next_hop_fixup(raw_nhs, lb->lb_proto);
642     n_buckets =
643         ip_multipath_normalize_next_hops((NULL == fixed_nhs ?
644                                           raw_nhs :
645                                           fixed_nhs),
646                                          &nhs,
647                                          &sum_of_weights,
648                                          multipath_next_hop_error_tolerance);
649
650     ASSERT (n_buckets >= vec_len (raw_nhs));
651
652     /*
653      * Save the old load-balance map used, and get a new one if required.
654      */
655     old_lbmi = lb->lb_map;
656     if (flags & LOAD_BALANCE_FLAG_USES_MAP)
657     {
658         lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs);
659     }
660     else
661     {
662         lbmi = INDEX_INVALID;
663     }
664
665     if (0 == lb->lb_n_buckets)
666     {
667         /*
668          * first time initialisation. no packets in flight, so we can write
669          * at leisure.
670          */
671         load_balance_set_n_buckets(lb, n_buckets);
672
673         if (!LB_HAS_INLINE_BUCKETS(lb))
674             vec_validate_aligned(lb->lb_buckets,
675                                  lb->lb_n_buckets - 1,
676                                  CLIB_CACHE_LINE_BYTES);
677
678         load_balance_fill_buckets(lb, nhs,
679                                   load_balance_get_buckets(lb),
680                                   n_buckets, flags);
681         lb->lb_map = lbmi;
682     }
683     else
684     {
685         /*
686          * This is a modification of an existing load-balance.
687          * We need to ensure that packets in flight see a consistent state, that
688          * is, the number of buckets the LB reports (read from
689          * lb_n_buckets_minus_1) is never more than it actually has. So if the
690          * number of buckets is increasing, we must update the bucket array first,
691          * then the reported number; vice-versa if the number of buckets goes down.
692          */
693         if (n_buckets == lb->lb_n_buckets)
694         {
695             /*
696              * no change in the number of buckets. we can simply fill what
697              * is new over what is old.
698              */
699             load_balance_fill_buckets(lb, nhs,
700                                       load_balance_get_buckets(lb),
701                                       n_buckets, flags);
702             lb->lb_map = lbmi;
703         }
704         else if (n_buckets > lb->lb_n_buckets)
705         {
706             /*
707              * we have more buckets. the old load-balance map (if there is one)
708              * will remain valid, i.e. mapping to indices within range, so we
709              * update it last.
710              */
711             if (n_buckets > LB_NUM_INLINE_BUCKETS &&
712                 lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS)
713             {
714                 /*
715                  * the new, increased number of buckets crosses the threshold
716                  * from inline storage to out-of-line. Alloc the out-of-line
717                  * buckets first, then fix up the number, then reset the inlines.
718                  */
719                 ASSERT(NULL == lb->lb_buckets);
720                 vec_validate_aligned(lb->lb_buckets,
721                                      n_buckets - 1,
722                                      CLIB_CACHE_LINE_BYTES);
723
724                 load_balance_fill_buckets(lb, nhs,
725                                           lb->lb_buckets,
726                                           n_buckets, flags);
727                 CLIB_MEMORY_BARRIER();
728                 load_balance_set_n_buckets(lb, n_buckets);
729
730                 CLIB_MEMORY_BARRIER();
731
732                 for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++)
733                 {
734                     dpo_reset(&lb->lb_buckets_inline[ii]);
735                 }
736             }
737             else
738             {
739                 if (n_buckets <= LB_NUM_INLINE_BUCKETS)
740                 {
741                     /*
742                      * we are not crossing the threshold and the buckets stay inline,
743                      * so we can write the new buckets over the old.
744                      */
745                     load_balance_fill_buckets(lb, nhs,
746                                               load_balance_get_buckets(lb),
747                                               n_buckets, flags);
748                     CLIB_MEMORY_BARRIER();
749                     load_balance_set_n_buckets(lb, n_buckets);
750                 }
751                 else
752                 {
753                     /*
754                      * we are not crossing the threshold, but we do need a new, larger
755                      * bucket array to hold the increased number of choices.
756                      */
757                     dpo_id_t *new_buckets, *old_buckets, *tmp_dpo;
758
759                     new_buckets = NULL;
760                     old_buckets = load_balance_get_buckets(lb);
761
762                     vec_validate_aligned(new_buckets,
763                                          n_buckets - 1,
764                                          CLIB_CACHE_LINE_BYTES);
765
766                     load_balance_fill_buckets(lb, nhs, new_buckets,
767                                               n_buckets, flags);
768                     CLIB_MEMORY_BARRIER();
769                     lb->lb_buckets = new_buckets;
770                     CLIB_MEMORY_BARRIER();
771                     load_balance_set_n_buckets(lb, n_buckets);
772
773                     vec_foreach(tmp_dpo, old_buckets)
774                     {
775                         dpo_reset(tmp_dpo);
776                     }
777                     vec_free(old_buckets);
778                 }
779             }
780
781             /*
782              * buckets fixed. ready for the MAP update.
783              */
784             lb->lb_map = lbmi;
785         }
786         else
787         {
788             /*
789              * bucket size shrinkage.
790              * Any map we have will be based on the old
791              * larger number of buckets, so will be translating to indices
792              * out of range. So the new MAP must be installed first.
793              */
794             lb->lb_map = lbmi;
795             CLIB_MEMORY_BARRIER();
796
797
798             if (n_buckets <= LB_NUM_INLINE_BUCKETS &&
799                 lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS)
800             {
801                 /*
802                  * the new, decreased number of buckets crosses the threshold
803                  * from out-of-line storage to inline:
804                  *   1 - fill the inline buckets,
805                  *   2 - fix up the number (at this point the inline buckets are
806                  *       in use),
807                  *   3 - free the out-of-line buckets.
808                  */
809                 load_balance_fill_buckets(lb, nhs,
810                                           lb->lb_buckets_inline,
811                                           n_buckets, flags);
812                 CLIB_MEMORY_BARRIER();
813                 load_balance_set_n_buckets(lb, n_buckets);
814                 CLIB_MEMORY_BARRIER();
815
816                 vec_foreach(tmp_dpo, lb->lb_buckets)
817                 {
818                     dpo_reset(tmp_dpo);
819                 }
820                 vec_free(lb->lb_buckets);
821             }
822             else
823             {
824                 /*
825                  * not crossing the threshold.
826                  *  1 - update the number to the smaller size
827                  *  2 - write the new buckets
828                  *  3 - reset those no longer used.
829                  */
830                 dpo_id_t *buckets;
831                 u32 old_n_buckets;
832
833                 old_n_buckets = lb->lb_n_buckets;
834                 buckets = load_balance_get_buckets(lb);
835
836                 load_balance_set_n_buckets(lb, n_buckets);
837                 CLIB_MEMORY_BARRIER();
838
839                 load_balance_fill_buckets(lb, nhs, buckets,
840                                           n_buckets, flags);
841
842                 for (ii = n_buckets; ii < old_n_buckets; ii++)
843                 {
844                     dpo_reset(&buckets[ii]);
845                 }
846             }
847         }
848     }
849
850     vec_foreach (nh, nhs)
851     {
852         dpo_reset(&nh->path_dpo);
853     }
854     vec_free(nhs);
855     vec_free(fixed_nhs);
856
857     load_balance_map_unlock(old_lbmi);
858 }
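/*
 * Usage sketch (illustrative only): "dpo" is assumed to be a locked dpo_id_t
 * of type DPO_LOAD_BALANCE, dpo_a/dpo_b existing child DPOs, and
 * LOAD_BALANCE_FLAG_NONE the no-op flag value.
 *
 *   load_balance_path_t *paths = NULL, *path;
 *
 *   vec_add2(paths, path, 1);
 *   path->path_weight = 3;
 *   dpo_copy(&path->path_dpo, &dpo_a);
 *
 *   vec_add2(paths, path, 1);
 *   path->path_weight = 1;
 *   dpo_copy(&path->path_dpo, &dpo_b);
 *
 *   load_balance_multipath_update(&dpo, paths, LOAD_BALANCE_FLAG_NONE);
 *
 * With the default tolerance this yields 4 buckets: 3 for dpo_a and 1 for dpo_b.
 */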
859
860 static void
861 load_balance_lock (dpo_id_t *dpo)
862 {
863     load_balance_t *lb;
864
865     lb = load_balance_get(dpo->dpoi_index);
866
867     lb->lb_locks++;
868 }
869
870 static void
871 load_balance_destroy (load_balance_t *lb)
872 {
873     dpo_id_t *buckets;
874     int i;
875
876     buckets = load_balance_get_buckets(lb);
877
878     for (i = 0; i < lb->lb_n_buckets; i++)
879     {
880         dpo_reset(&buckets[i]);
881     }
882
883     LB_DBG(lb, "destroy");
884     if (!LB_HAS_INLINE_BUCKETS(lb))
885     {
886         vec_free(lb->lb_buckets);
887     }
888
889     fib_urpf_list_unlock(lb->lb_urpf);
890     load_balance_map_unlock(lb->lb_map);
891
892     pool_put(load_balance_pool, lb);
893 }
894
895 static void
896 load_balance_unlock (dpo_id_t *dpo)
897 {
898     load_balance_t *lb;
899
900     lb = load_balance_get(dpo->dpoi_index);
901
902     lb->lb_locks--;
903
904     if (0 == lb->lb_locks)
905     {
906         load_balance_destroy(lb);
907     }
908 }
909
910 static void
911 load_balance_mem_show (void)
912 {
913     fib_show_memory_usage("load-balance",
914                           pool_elts(load_balance_pool),
915                           pool_len(load_balance_pool),
916                           sizeof(load_balance_t));
917     load_balance_map_show_mem();
918 }
919
920 const static dpo_vft_t lb_vft = {
921     .dv_lock = load_balance_lock,
922     .dv_unlock = load_balance_unlock,
923     .dv_format = format_load_balance_dpo,
924     .dv_mem_show = load_balance_mem_show,
925 };
926
927 /**
928  * @brief The per-protocol VLIB graph nodes that are assigned to a load-balance
929  *        object.
930  *
931  * This means that these are the graph nodes for which a load-balance is the
932  * parent object in the DPO graph.
933  *
934  * We do not list all the load-balance nodes, such as the *-lookup nodes; instead
935  * we rely on the correct use of the .sibling_of field when setting up
936  * these sibling nodes.
937  */
938 const static char* const load_balance_ip4_nodes[] =
939 {
940     "ip4-load-balance",
941     NULL,
942 };
943 const static char* const load_balance_ip6_nodes[] =
944 {
945     "ip6-load-balance",
946     NULL,
947 };
948 const static char* const load_balance_mpls_nodes[] =
949 {
950     "mpls-load-balance",
951     NULL,
952 };
953 const static char* const load_balance_l2_nodes[] =
954 {
955     "l2-load-balance",
956     NULL,
957 };
958 const static char* const load_balance_nsh_nodes[] =
959 {
960     "nsh-load-balance",
961     NULL
962 };
963 const static char* const load_balance_bier_nodes[] =
964 {
965     "bier-load-balance",
966     NULL,
967 };
968 const static char* const * const load_balance_nodes[DPO_PROTO_NUM] =
969 {
970     [DPO_PROTO_IP4]  = load_balance_ip4_nodes,
971     [DPO_PROTO_IP6]  = load_balance_ip6_nodes,
972     [DPO_PROTO_MPLS] = load_balance_mpls_nodes,
973     [DPO_PROTO_ETHERNET] = load_balance_l2_nodes,
974     [DPO_PROTO_NSH] = load_balance_nsh_nodes,
975     [DPO_PROTO_BIER] = load_balance_bier_nodes,
976 };
977
978 void
979 load_balance_module_init (void)
980 {
981     index_t lbi;
982
983     dpo_register(DPO_LOAD_BALANCE, &lb_vft, load_balance_nodes);
984
985     /*
986      * Special LB with index zero. we need to define this since the v4 mtrie
987      * assumes an index of 0 implies the ply is empty. therefore all 'real'
988      * adjs need a non-zero index.
989      * This should never be used, but just in case, stack it on a drop.
990      */
991     lbi = load_balance_create(1, DPO_PROTO_IP4, 0);
992     load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4));
993
994     load_balance_logger =
995         vlib_log_register_class("dpo", "load-balance");
996
997     load_balance_map_module_init();
998 }
999
1000 static clib_error_t *
1001 load_balance_show (vlib_main_t * vm,
1002                    unformat_input_t * input,
1003                    vlib_cli_command_t * cmd)
1004 {
1005     index_t lbi = INDEX_INVALID;
1006
1007     while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
1008     {
1009         if (unformat (input, "%d", &lbi))
1010             ;
1011         else
1012             break;
1013     }
1014
1015     if (INDEX_INVALID != lbi)
1016     {
1017         if (pool_is_free_index(load_balance_pool, lbi))
1018         {
1019             vlib_cli_output (vm, "no such load-balance:%d", lbi);
1020         }
1021         else
1022         {
1023             vlib_cli_output (vm, "%U", format_load_balance, lbi,
1024                          LOAD_BALANCE_FORMAT_DETAIL);
1025         }
1026     }
1027     else
1028     {
1029         load_balance_t *lb;
1030
1031         pool_foreach(lb, load_balance_pool,
1032         ({
1033             vlib_cli_output (vm, "%U", format_load_balance,
1034                              load_balance_get_index(lb),
1035                              LOAD_BALANCE_FORMAT_NONE);
1036         }));
1037     }
1038
1039     return 0;
1040 }
1041
1042 VLIB_CLI_COMMAND (load_balance_show_command, static) = {
1043     .path = "show load-balance",
1044     .short_help = "show load-balance [<index>]",
1045     .function = load_balance_show,
1046 };
1047
1048
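/* Flow hash over a raw IP header: dispatch on the version nibble - 0x4 means
   IPv4, anything else is treated as IPv6. */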
1049 always_inline u32
1050 ip_flow_hash (void *data)
1051 {
1052   ip4_header_t *iph = (ip4_header_t *) data;
1053
1054   if ((iph->ip_version_and_header_length & 0xF0) == 0x40)
1055     return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT);
1056   else
1057     return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT);
1058 }
1059
1060 always_inline u64
1061 mac_to_u64 (u8 * m)
1062 {
1063   return (*((u64 *) m) & 0xffffffffffff);
1064 }
1065
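/* Flow hash for an ethernet frame: if the payload is IPv4/IPv6, hash the inner
   IP header; otherwise use the ethertype. In both cases the destination and
   source MACs are mixed in. */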
1066 always_inline u32
1067 l2_flow_hash (vlib_buffer_t * b0)
1068 {
1069   ethernet_header_t *eh;
1070   u64 a, b, c;
1071   uword is_ip, eh_size;
1072   u16 eh_type;
1073
1074   eh = vlib_buffer_get_current (b0);
1075   eh_type = clib_net_to_host_u16 (eh->type);
1076   eh_size = ethernet_buffer_header_size (b0);
1077
1078   is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6);
1079
1080   /* since we have 2 cache lines, use them */
1081   if (is_ip)
1082     a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size);
1083   else
1084     a = eh->type;
1085
1086   b = mac_to_u64 ((u8 *) eh->dst_address);
1087   c = mac_to_u64 ((u8 *) eh->src_address);
1088   hash_mix64 (a, b, c);
1089
1090   return (u32) c;
1091 }
1092
1093 typedef struct load_balance_trace_t_
1094 {
1095     index_t lb_index;
1096 } load_balance_trace_t;
1097
1098 always_inline uword
1099 load_balance_inline (vlib_main_t * vm,
1100                      vlib_node_runtime_t * node,
1101                      vlib_frame_t * frame,
1102                      int is_l2)
1103 {
1104   u32 n_left_from, next_index, *from, *to_next;
1105
1106   from = vlib_frame_vector_args (frame);
1107   n_left_from = frame->n_vectors;
1108
1109   next_index = node->cached_next_index;
1110
1111   while (n_left_from > 0)
1112     {
1113       u32 n_left_to_next;
1114
1115       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1116
1117       while (n_left_from > 0 && n_left_to_next > 0)
1118         {
1119           vlib_buffer_t *b0;
1120           u32 bi0, lbi0, next0;
1121           const dpo_id_t *dpo0;
1122           const load_balance_t *lb0;
1123
1124           bi0 = from[0];
1125           to_next[0] = bi0;
1126           from += 1;
1127           to_next += 1;
1128           n_left_from -= 1;
1129           n_left_to_next -= 1;
1130
1131           b0 = vlib_get_buffer (vm, bi0);
1132
1133           /* the load-balance index was written by the previous node */
1134           lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
1135           lb0 = load_balance_get(lbi0);
1136
1137           if (is_l2)
1138           {
1139               vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0);
1140           }
1141           else
1142           {
1143               /* it's BIER */
1144               const bier_hdr_t *bh0 = vlib_buffer_get_current(b0);
1145               vnet_buffer(b0)->ip.flow_hash = bier_compute_flow_hash(bh0);
1146           }
1147
1148           dpo0 = load_balance_get_bucket_i(lb0,
1149                                            vnet_buffer(b0)->ip.flow_hash &
1150                                            (lb0->lb_n_buckets_minus_1));
1151
1152           next0 = dpo0->dpoi_next_node;
1153           vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
1154
1155           if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
1156             {
1157               load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
1158                                                          sizeof (*tr));
1159               tr->lb_index = lbi0;
1160             }
1161           vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
1162                                            n_left_to_next, bi0, next0);
1163         }
1164
1165       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1166     }
1167
1168   return frame->n_vectors;
1169 }
1170
1171 static uword
1172 l2_load_balance (vlib_main_t * vm,
1173                  vlib_node_runtime_t * node,
1174                  vlib_frame_t * frame)
1175 {
1176     return (load_balance_inline(vm, node, frame, 1));
1177 }
1178
1179 static u8 *
1180 format_l2_load_balance_trace (u8 * s, va_list * args)
1181 {
1182   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1183   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1184   load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);
1185
1186   s = format (s, "L2-load-balance: index %d", t->lb_index);
1187   return s;
1188 }
1189
1190 /**
1191  * @brief
1192  */
1193 VLIB_REGISTER_NODE (l2_load_balance_node) = {
1194   .function = l2_load_balance,
1195   .name = "l2-load-balance",
1196   .vector_size = sizeof (u32),
1197
1198   .format_trace = format_l2_load_balance_trace,
1199   .n_next_nodes = 1,
1200   .next_nodes = {
1201       [0] = "error-drop",
1202   },
1203 };
1204
1205 static uword
1206 nsh_load_balance (vlib_main_t * vm,
1207                  vlib_node_runtime_t * node,
1208                  vlib_frame_t * frame)
1209 {
1210   u32 n_left_from, next_index, *from, *to_next;
1211
1212   from = vlib_frame_vector_args (frame);
1213   n_left_from = frame->n_vectors;
1214
1215   next_index = node->cached_next_index;
1216
1217   while (n_left_from > 0)
1218     {
1219       u32 n_left_to_next;
1220
1221       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1222
1223       while (n_left_from > 0 && n_left_to_next > 0)
1224         {
1225           vlib_buffer_t *b0;
1226           u32 bi0, lbi0, next0, *nsh0;
1227           const dpo_id_t *dpo0;
1228           const load_balance_t *lb0;
1229
1230           bi0 = from[0];
1231           to_next[0] = bi0;
1232           from += 1;
1233           to_next += 1;
1234           n_left_from -= 1;
1235           n_left_to_next -= 1;
1236
1237           b0 = vlib_get_buffer (vm, bi0);
1238
1239           lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
1240           lb0 = load_balance_get(lbi0);
1241
1242           /* SPI + SI are the second word of the NSH header */
1243           nsh0 = vlib_buffer_get_current (b0);
1244           vnet_buffer(b0)->ip.flow_hash = nsh0[1] % lb0->lb_n_buckets;
1245
1246           dpo0 = load_balance_get_bucket_i(lb0,
1247                                            vnet_buffer(b0)->ip.flow_hash &
1248                                            (lb0->lb_n_buckets_minus_1));
1249
1250           next0 = dpo0->dpoi_next_node;
1251           vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
1252
1253           if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
1254             {
1255               load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
1256                                                          sizeof (*tr));
1257               tr->lb_index = lbi0;
1258             }
1259           vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
1260                                            n_left_to_next, bi0, next0);
1261         }
1262
1263       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1264     }
1265
1266   return frame->n_vectors;
1267 }
1268
1269 static u8 *
1270 format_nsh_load_balance_trace (u8 * s, va_list * args)
1271 {
1272   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1273   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1274   load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);
1275
1276   s = format (s, "NSH-load-balance: index %d", t->lb_index);
1277   return s;
1278 }
1279
1280 /**
1281  * @brief
1282  */
1283 VLIB_REGISTER_NODE (nsh_load_balance_node) = {
1284   .function = nsh_load_balance,
1285   .name = "nsh-load-balance",
1286   .vector_size = sizeof (u32),
1287
1288   .format_trace = format_nsh_load_balance_trace,
1289   .n_next_nodes = 1,
1290   .next_nodes = {
1291       [0] = "error-drop",
1292   },
1293 };
1294
1295 static u8 *
1296 format_bier_load_balance_trace (u8 * s, va_list * args)
1297 {
1298   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1299   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1300   load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);
1301
1302   s = format (s, "BIER-load-balance: index %d", t->lb_index);
1303   return s;
1304 }
1305
1306 static uword
1307 bier_load_balance (vlib_main_t * vm,
1308                    vlib_node_runtime_t * node,
1309                    vlib_frame_t * frame)
1310 {
1311     return (load_balance_inline(vm, node, frame, 0));
1312 }
1313
1314 /**
1315  * @brief
1316  */
1317 VLIB_REGISTER_NODE (bier_load_balance_node) = {
1318   .function = bier_load_balance,
1319   .name = "bier-load-balance",
1320   .vector_size = sizeof (u32),
1321
1322   .format_trace = format_bier_load_balance_trace,
1323   .sibling_of = "mpls-load-balance",
1324 };