ff46d56e3e2496e12fc1acef609f13f8116330d4
[vpp.git] / src / vnet / dpo / load_balance.c
1 /*
2  * Copyright (c) 2016 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 #include <vnet/dpo/load_balance.h>
17 #include <vnet/dpo/load_balance_map.h>
18 #include <vnet/dpo/drop_dpo.h>
19 #include <vppinfra/math.h>              /* for fabs */
20 #include <vnet/adj/adj.h>
21 #include <vnet/adj/adj_internal.h>
22 #include <vnet/fib/fib_urpf_list.h>
23 #include <vnet/bier/bier_fwd.h>
24 #include <vnet/fib/mpls_fib.h>
25 #include <vnet/ip/ip4_inlines.h>
26 #include <vnet/ip/ip6_inlines.h>
27
28 // clang-format off
29
30 /*
31  * distribution error tolerance for load-balancing
32  */
33 const f64 multipath_next_hop_error_tolerance = 0.1;
34
35 static const char *load_balance_attr_names[] = LOAD_BALANCE_ATTR_NAMES;
36
37 /**
38  * the logger
39  */
40 vlib_log_class_t load_balance_logger;
41
42 #define LB_DBG(_lb, _fmt, _args...)                                     \
43 {                                                                       \
44     vlib_log_debug(load_balance_logger,                                 \
45                    "lb:[%U]:" _fmt,                                     \
46                    format_load_balance, load_balance_get_index(_lb),    \
47                    LOAD_BALANCE_FORMAT_NONE,                            \
48                    ##_args);                                            \
49 }
50
51 /**
52  * Pool of all DPOs. It's not static so the DP can have fast access
53  */
54 load_balance_t *load_balance_pool;
55
56 /**
57  * The one instance of load-balance main
58  */
59 load_balance_main_t load_balance_main = {
60     .lbm_to_counters = {
61         .name = "route-to",
62         .stat_segment_name = "/net/route/to",
63     },
64     .lbm_via_counters = {
65         .name = "route-via",
66         .stat_segment_name = "/net/route/via",
67     }
68 };
69
70 f64
71 load_balance_get_multipath_tolerance (void)
72 {
73     return (multipath_next_hop_error_tolerance);
74 }
75
76 static inline index_t
77 load_balance_get_index (const load_balance_t *lb)
78 {
79     return (lb - load_balance_pool);
80 }
81
82 static inline dpo_id_t*
83 load_balance_get_buckets (load_balance_t *lb)
84 {
85     if (LB_HAS_INLINE_BUCKETS(lb))
86     {
87         return (lb->lb_buckets_inline);
88     }
89     else
90     {
91         return (lb->lb_buckets);
92     }
93 }
94
/**
 * Allocate a new load-balance object from the pool and prepare its
 * to/via combined counters. Main-thread only (asserted); takes the
 * worker barrier whenever the pool or the counter vectors need to grow,
 * since workers may be reading them.
 */
static load_balance_t *
load_balance_alloc_i (void)
{
    load_balance_t *lb;
    u8 need_barrier_sync = 0;
    vlib_main_t *vm = vlib_get_main();
    ASSERT (vm->thread_index == 0);

    /* growing the pool reallocates it; workers must not be walking it
     * while that happens, so take the barrier first in that case */
    need_barrier_sync = pool_get_will_expand (load_balance_pool);

    if (need_barrier_sync)
        vlib_worker_thread_barrier_sync (vm);

    pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES);
    clib_memset(lb, 0, sizeof(*lb));

    lb->lb_map = INDEX_INVALID;
    lb->lb_urpf = INDEX_INVALID;

    /* the counter vectors may also need to grow to cover the new index;
     * only take the barrier here if we are not already holding it */
    if (need_barrier_sync == 0)
    {
        need_barrier_sync += vlib_validate_combined_counter_will_expand
            (&(load_balance_main.lbm_to_counters),
             load_balance_get_index(lb));
        need_barrier_sync += vlib_validate_combined_counter_will_expand
            (&(load_balance_main.lbm_via_counters),
             load_balance_get_index(lb));
        if (need_barrier_sync)
            vlib_worker_thread_barrier_sync (vm);
    }

    /* ensure the counters exist for this index and start them at zero */
    vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters),
                                   load_balance_get_index(lb));
    vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters),
                                   load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters),
                               load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters),
                               load_balance_get_index(lb));

    if (need_barrier_sync)
        vlib_worker_thread_barrier_release (vm);

    return (lb);
}
140
/**
 * Format a load-balance object: header (proto, index, bucket count, uRPF,
 * attribute flags), to/via counters, the load-balance map (if any) and
 * then each bucket on its own indented line.
 *
 * @param lbi    index of the load-balance object
 * @param flags  format detail flags
 *               (NOTE(review): currently unused in this function)
 * @param indent indentation applied to nested map/bucket lines
 * @param s      string to append to
 * @return the extended string
 */
static u8*
load_balance_format (index_t lbi,
                     load_balance_format_flags_t flags,
                     u32 indent,
                     u8 *s)
{
    vlib_counter_t to, via;
    load_balance_t *lb;
    dpo_id_t *buckets;
    u32 i;

    lb = load_balance_get(lbi);
    vlib_get_combined_counter(&(load_balance_main.lbm_to_counters), lbi, &to);
    vlib_get_combined_counter(&(load_balance_main.lbm_via_counters), lbi, &via);
    buckets = load_balance_get_buckets(lb);

    s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE);
    s = format(s, "[proto:%U ", format_dpo_proto, lb->lb_proto);
    s = format(s, "index:%d buckets:%d ", lbi, lb->lb_n_buckets);
    s = format(s, "uRPF:%d ", lb->lb_urpf);
    if (lb->lb_flags)
    {
        load_balance_attr_t attr;

        s = format(s, "flags:[");

        FOR_EACH_LOAD_BALANCE_ATTR(attr)
        {
            if (lb->lb_flags & (1 << attr))
            {
                s = format (s, "%s", load_balance_attr_names[attr]);
            }
        }
        s = format(s, "] ");
    }
    s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes);
    /* via counters are only shown when non-zero */
    if (0 != via.packets)
    {
        s = format(s, " via:[%Ld:%Ld]",
                   via.packets, via.bytes);
    }
    s = format(s, "]");

    if (INDEX_INVALID != lb->lb_map)
    {
        s = format(s, "\n%U%U",
                   format_white_space, indent+4,
                   format_load_balance_map, lb->lb_map, indent+4);
    }
    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        s = format(s, "\n%U[%d] %U",
                   format_white_space, indent+2,
                   i,
                   format_dpo_id,
                   &buckets[i], indent+6);
    }
    return (s);
}
200
201 u8*
202 format_load_balance (u8 * s, va_list * args)
203 {
204     index_t lbi = va_arg(*args, index_t);
205     load_balance_format_flags_t flags = va_arg(*args, load_balance_format_flags_t);
206
207     return (load_balance_format(lbi, flags, 0, s));
208 }
209
210 static u8*
211 format_load_balance_dpo (u8 * s, va_list * args)
212 {
213     index_t lbi = va_arg(*args, index_t);
214     u32 indent = va_arg(*args, u32);
215
216     return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s));
217 }
218
219 flow_hash_config_t
220 load_balance_get_default_flow_hash (dpo_proto_t lb_proto)
221 {
222     switch (lb_proto)
223     {
224     case DPO_PROTO_IP4:
225     case DPO_PROTO_IP6:
226         return (IP_FLOW_HASH_DEFAULT);
227
228     case DPO_PROTO_MPLS:
229         return (MPLS_FLOW_HASH_DEFAULT);
230
231     case DPO_PROTO_ETHERNET:
232     case DPO_PROTO_BIER:
233     case DPO_PROTO_NSH:
234         break;
235     }
236
237     return (0);
238 }
239
240 static load_balance_t *
241 load_balance_create_i (u32 num_buckets,
242                        dpo_proto_t lb_proto,
243                        flow_hash_config_t fhc)
244 {
245     load_balance_t *lb;
246
247     lb = load_balance_alloc_i();
248     lb->lb_hash_config = fhc;
249     lb->lb_n_buckets = num_buckets;
250     lb->lb_n_buckets_minus_1 = num_buckets-1;
251     lb->lb_proto = lb_proto;
252
253     if (!LB_HAS_INLINE_BUCKETS(lb))
254     {
255         vec_validate_aligned(lb->lb_buckets,
256                              lb->lb_n_buckets - 1,
257                              CLIB_CACHE_LINE_BYTES);
258     }
259
260     LB_DBG(lb, "create");
261
262     return (lb);
263 }
264
265 index_t
266 load_balance_create (u32 n_buckets,
267                      dpo_proto_t lb_proto,
268                      flow_hash_config_t fhc)
269 {
270     return (load_balance_get_index(load_balance_create_i(n_buckets, lb_proto, fhc)));
271 }
272
/**
 * Stack the 'next' DPO into the given bucket of the supplied bucket
 * array, parenting it on the load-balance's protocol.
 */
static inline void
load_balance_set_bucket_i (load_balance_t *lb,
                           u32 bucket,
                           dpo_id_t *buckets,
                           const dpo_id_t *next)
{
    dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, &buckets[bucket], next);
}
281
282 void
283 load_balance_set_bucket (index_t lbi,
284                          u32 bucket,
285                          const dpo_id_t *next)
286 {
287     load_balance_t *lb;
288     dpo_id_t *buckets;
289
290     lb = load_balance_get(lbi);
291     buckets = load_balance_get_buckets(lb);
292
293     ASSERT(bucket < lb->lb_n_buckets);
294
295     load_balance_set_bucket_i(lb, bucket, buckets, next);
296 }
297
298 int
299 load_balance_is_drop (const dpo_id_t *dpo)
300 {
301     load_balance_t *lb;
302
303     if (DPO_LOAD_BALANCE != dpo->dpoi_type)
304         return (0);
305
306     lb = load_balance_get(dpo->dpoi_index);
307
308     if (1 == lb->lb_n_buckets)
309     {
310         return (dpo_is_drop(load_balance_get_bucket_i(lb, 0)));
311     }
312     return (0);
313 }
314
315 u16
316 load_balance_n_buckets (index_t lbi)
317 {
318     load_balance_t *lb;
319
320     lb = load_balance_get(lbi);
321
322     return (lb->lb_n_buckets);
323 }
324
325 void
326 load_balance_set_fib_entry_flags (index_t lbi,
327                                   fib_entry_flag_t flags)
328 {
329     load_balance_t *lb;
330
331     lb = load_balance_get(lbi);
332     lb->lb_fib_entry_flags = flags;
333 }
334
335
336 void
337 load_balance_set_urpf (index_t lbi,
338                        index_t urpf)
339 {
340     load_balance_t *lb;
341     index_t old;
342
343     lb = load_balance_get(lbi);
344
345     /*
346      * packets in flight we see this change. but it's atomic, so :P
347      */
348     old = lb->lb_urpf;
349     lb->lb_urpf = urpf;
350
351     fib_urpf_list_unlock(old);
352     fib_urpf_list_lock(urpf);
353 }
354
355 index_t
356 load_balance_get_urpf (index_t lbi)
357 {
358     load_balance_t *lb;
359
360     lb = load_balance_get(lbi);
361
362     return (lb->lb_urpf);
363 }
364
365 const dpo_id_t *
366 load_balance_get_bucket (index_t lbi,
367                          u32 bucket)
368 {
369     load_balance_t *lb;
370
371     lb = load_balance_get(lbi);
372
373     return (load_balance_get_bucket_i(lb, bucket));
374 }
375
376 static int
377 next_hop_sort_by_weight (const load_balance_path_t * n1,
378                          const load_balance_path_t * n2)
379 {
380     return ((int) n1->path_weight - (int) n2->path_weight);
381 }
382
/* Given next hop vector is over-written with normalized one with sorted weights and
   with weights corresponding to the number of adjacencies for each next hop.
   Returns number of adjacencies in block. */
u32
ip_multipath_normalize_next_hops (const load_balance_path_t * raw_next_hops,
                                  load_balance_path_t ** normalized_next_hops,
                                  u32 *sum_weight_in,
                                  f64 multipath_next_hop_error_tolerance)
{
    load_balance_path_t * nhs;
    uword n_nhs, n_adj, n_adj_left, i, sum_weight;
    f64 norm, error;

    n_nhs = vec_len (raw_next_hops);
    ASSERT (n_nhs > 0);
    if (n_nhs == 0)
        return 0;

    /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */
    nhs = *normalized_next_hops;
    vec_validate (nhs, 2*n_nhs - 1);

    /* Fast path: 1 next hop in block. */
    n_adj = n_nhs;
    if (n_nhs == 1)
    {
        nhs[0] = raw_next_hops[0];
        nhs[0].path_weight = 1;
        vec_set_len (nhs, 1);
        sum_weight = 1;
        goto done;
    }

    else if (n_nhs == 2)
    {
        /* cmp==1 when hop[0] is the heavier one, i.e. should come second */
        int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;

        /* Fast sort. */
        nhs[0] = raw_next_hops[cmp];
        nhs[1] = raw_next_hops[cmp ^ 1];

        /* Fast path: equal cost multipath with 2 next hops. */
        if (nhs[0].path_weight == nhs[1].path_weight)
        {
            nhs[0].path_weight = nhs[1].path_weight = 1;
            vec_set_len (nhs, 2);
            sum_weight = 2;
            goto done;
        }
    }
    else
    {
        clib_memcpy_fast (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
        qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
    }

    /* Find total weight to normalize weights. */
    sum_weight = 0;
    for (i = 0; i < n_nhs; i++)
        sum_weight += nhs[i].path_weight;

    /* In the unlikely case that all weights are given as 0, set them all to 1. */
    if (sum_weight == 0)
    {
        for (i = 0; i < n_nhs; i++)
            nhs[i].path_weight = 1;
        sum_weight = n_nhs;
    }

    /* Save copies of all next hop weights to avoid being overwritten in loop below. */
    for (i = 0; i < n_nhs; i++)
        nhs[n_nhs + i].path_weight = nhs[i].path_weight;

    /* Try larger and larger power of 2 sized adjacency blocks until we
       find one where traffic flows to within 1% of specified weights. */
    for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
    {
        error = 0;

        /* scale factor from raw weight to bucket count for this block size */
        norm = n_adj / ((f64) sum_weight);
        n_adj_left = n_adj;
        for (i = 0; i < n_nhs; i++)
        {
            f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */
            word n = flt_round_nearest (nf);

            n = n > n_adj_left ? n_adj_left : n;
            n_adj_left -= n;
            error += fabs (nf - n);
            nhs[i].path_weight = n;

            if (0 == nhs[i].path_weight)
            {
                /*
                 * when the weight skew is high (norm is small) and n == nf.
                 * without this correction the path with a low weight would have
                 * no representation in the load-balanace - don't want that.
                 * If the weight skew is high so the load-balance has many buckets
                 * to allow it. pays ya money takes ya choice.
                 */
                error = n_adj;
                break;
            }
        }

        /* give any leftover buckets to the lightest path */
        nhs[0].path_weight += n_adj_left;

        /* Less than 5% average error per adjacency with this size adjacency block? */
        if (error <= multipath_next_hop_error_tolerance*n_adj)
        {
            /* Truncate any next hops with zero weight. */
            vec_set_len (nhs, i);
            break;
        }
    }

done:
    /* Save vector for next call. */
    *normalized_next_hops = nhs;
    *sum_weight_in = sum_weight;
    return n_adj;
}
505
506 static load_balance_path_t *
507 load_balance_multipath_next_hop_fixup (const load_balance_path_t *nhs,
508                                        dpo_proto_t drop_proto)
509 {
510     if (0 == vec_len(nhs))
511     {
512         load_balance_path_t *new_nhs = NULL, *nh;
513
514         /*
515          * we need something for the load-balance. so use the drop
516          */
517         vec_add2(new_nhs, nh, 1);
518
519         nh->path_weight = 1;
520         dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto));
521
522         return (new_nhs);
523     }
524
525     return (NULL);
526 }
527
528 /*
529  * Fill in adjacencies in block based on corresponding
530  * next hop adjacencies.
531  */
532 static void
533 load_balance_fill_buckets_norm (load_balance_t *lb,
534                                 load_balance_path_t *nhs,
535                                 dpo_id_t *buckets,
536                                 u32 n_buckets)
537 {
538     load_balance_path_t *nh;
539     u16 ii, bucket;
540
541     bucket = 0;
542
543     /*
544      * the next-hops have normalised weights. that means their sum is the number
545      * of buckets we need to fill.
546      */
547     vec_foreach (nh, nhs)
548     {
549         for (ii = 0; ii < nh->path_weight; ii++)
550         {
551             ASSERT(bucket < n_buckets);
552             load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
553         }
554     }
555 }
/**
 * Fill buckets in 'sticky' mode: a path keeps its bucket positions even
 * when it is down (a drop), but the buckets of down paths are backfilled
 * round-robin from the still-forwarding paths, so flows mapped to a down
 * path are redistributed without moving flows on healthy paths.
 */
static void
load_balance_fill_buckets_sticky (load_balance_t *lb,
                                  load_balance_path_t *nhs,
                                  dpo_id_t *buckets,
                                  u32 n_buckets)
{
    load_balance_path_t *nh, *fwding_paths;
    u16 ii, bucket, fpath;

    fpath = bucket = 0;
    fwding_paths = NULL;

    /* collect the subset of paths that still forward (non-drop) */
    vec_foreach (nh, nhs)
    {
        if (!dpo_is_drop(&nh->path_dpo))
        {
            vec_add1(fwding_paths, *nh);
        }
    }
    /* if everything is down, fall back to the full set (all drops) so
     * the backfill below still has something to index */
    if (vec_len(fwding_paths) == 0)
        fwding_paths = vec_dup(nhs);

    /*
     * the next-hops have normalised weights. that means their sum is the number
     * of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            if (!dpo_is_drop(&nh->path_dpo))
            {
                load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
            }
            else
            {
                /* fill the bucks from the next up path */
                /* NOTE(review): this ASSERT runs after fwding_paths has
                 * already been indexed on the previous line; it never fires
                 * in practice because fwding_paths is seeded above, but the
                 * ordering looks inverted - confirm intent upstream */
                load_balance_set_bucket_i(lb, bucket++, buckets, &fwding_paths[fpath].path_dpo);
                ASSERT(vec_len(fwding_paths) > 0);
                fpath = (fpath + 1) % vec_len(fwding_paths);
            }
        }
    }

    vec_free(fwding_paths);
}
603
604 static void
605 load_balance_fill_buckets (load_balance_t *lb,
606                            load_balance_path_t *nhs,
607                            dpo_id_t *buckets,
608                            u32 n_buckets,
609                            load_balance_flags_t flags)
610 {
611     if (flags & LOAD_BALANCE_FLAG_STICKY)
612     {
613         load_balance_fill_buckets_sticky(lb, nhs, buckets, n_buckets);
614     }
615     else
616     {
617         load_balance_fill_buckets_norm(lb, nhs, buckets, n_buckets);
618     }
619 }
620
/**
 * Update the bucket count and its cached minus-one form together.
 * The minus-one value is what the data-plane reads to mask flow hashes,
 * so callers sequence this write with memory barriers as appropriate.
 */
static inline void
load_balance_set_n_buckets (load_balance_t *lb,
                            u32 n_buckets)
{
    lb->lb_n_buckets = n_buckets;
    lb->lb_n_buckets_minus_1 = n_buckets-1;
}
628
/**
 * Update the path set of an existing load-balance object.
 *
 * The raw paths are normalised (weights scaled to a power-of-2 bucket
 * count), the bucket array is refilled, and any load-balance map is
 * swapped. The update is staged with memory barriers so that packets in
 * flight always see a consistent (bucket-array, bucket-count, map)
 * triple: grow array-before-count, shrink count/map-before-array.
 *
 * @param dpo     the load-balance DPO to update (must be DPO_LOAD_BALANCE)
 * @param raw_nhs the new path set; may be empty, in which case a single
 *                drop path is substituted
 * @param flags   LOAD_BALANCE_FLAG_* (map usage, stickiness)
 */
void
load_balance_multipath_update (const dpo_id_t *dpo,
                               const load_balance_path_t * raw_nhs,
                               load_balance_flags_t flags)
{
    load_balance_path_t *nh, *nhs, *fixed_nhs;
    u32 sum_of_weights, n_buckets, ii;
    index_t lbmi, old_lbmi;
    load_balance_t *lb;
    dpo_id_t *tmp_dpo;

    nhs = NULL;

    ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
    lb = load_balance_get(dpo->dpoi_index);
    lb->lb_flags = flags;
    /* empty path set => substitute a single drop path */
    fixed_nhs = load_balance_multipath_next_hop_fixup(raw_nhs, lb->lb_proto);
    n_buckets =
        ip_multipath_normalize_next_hops((NULL == fixed_nhs ?
                                          raw_nhs :
                                          fixed_nhs),
                                         &nhs,
                                         &sum_of_weights,
                                         multipath_next_hop_error_tolerance);

    ASSERT (n_buckets >= vec_len (raw_nhs));

    /*
     * Save the old load-balance map used, and get a new one if required.
     */
    old_lbmi = lb->lb_map;
    if (flags & LOAD_BALANCE_FLAG_USES_MAP)
    {
        lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs);
    }
    else
    {
        lbmi = INDEX_INVALID;
    }

    if (0 == lb->lb_n_buckets)
    {
        /*
         * first time initialisation. no packets inflight, so we can write
         * at leisure.
         */
        load_balance_set_n_buckets(lb, n_buckets);

        if (!LB_HAS_INLINE_BUCKETS(lb))
            vec_validate_aligned(lb->lb_buckets,
                                 lb->lb_n_buckets - 1,
                                 CLIB_CACHE_LINE_BYTES);

        load_balance_fill_buckets(lb, nhs,
                                  load_balance_get_buckets(lb),
                                  n_buckets, flags);
        lb->lb_map = lbmi;
    }
    else
    {
        /*
         * This is a modification of an existing load-balance.
         * We need to ensure that packets inflight see a consistent state, that
         * is the number of reported buckets the LB has (read from
         * lb_n_buckets_minus_1) is not more than it actually has. So if the
         * number of buckets is increasing, we must update the bucket array first,
         * then the reported number. vice-versa if the number of buckets goes down.
         */
        if (n_buckets == lb->lb_n_buckets)
        {
            /*
             * no change in the number of buckets. we can simply fill what
             * is new over what is old.
             */
            load_balance_fill_buckets(lb, nhs,
                                      load_balance_get_buckets(lb),
                                      n_buckets, flags);
            lb->lb_map = lbmi;
        }
        else if (n_buckets > lb->lb_n_buckets)
        {
            /*
             * we have more buckets. the old load-balance map (if there is one)
             * will remain valid, i.e. mapping to indices within range, so we
             * update it last.
             */
            if (n_buckets > LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new increased number of buckets is crossing the threshold
                 * from the inline storage to out-line. Alloc the outline buckets
                 * first, then fixup the number. then reset the inlines.
                 */
                ASSERT(NULL == lb->lb_buckets);
                vec_validate_aligned(lb->lb_buckets,
                                     n_buckets - 1,
                                     CLIB_CACHE_LINE_BYTES);

                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets,
                                          n_buckets, flags);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);

                CLIB_MEMORY_BARRIER();

                /* in-flight readers now use the out-of-line array; the
                 * inline copies can be released */
                for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++)
                {
                    dpo_reset(&lb->lb_buckets_inline[ii]);
                }
            }
            else
            {
                if (n_buckets <= LB_NUM_INLINE_BUCKETS)
                {
                    /*
                     * we are not crossing the threshold and it's still inline buckets.
                     * we can write the new on the old..
                     */
                    load_balance_fill_buckets(lb, nhs,
                                              load_balance_get_buckets(lb),
                                              n_buckets, flags);
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);
                }
                else
                {
                    /*
                     * we are not crossing the threshold. We need a new bucket array to
                     * hold the increased number of choices.
                     */
                    dpo_id_t *new_buckets, *old_buckets, *tmp_dpo;

                    new_buckets = NULL;
                    old_buckets = load_balance_get_buckets(lb);

                    vec_validate_aligned(new_buckets,
                                         n_buckets - 1,
                                         CLIB_CACHE_LINE_BYTES);

                    load_balance_fill_buckets(lb, nhs, new_buckets,
                                              n_buckets, flags);
                    CLIB_MEMORY_BARRIER();
                    /* swap the array pointer before publishing the larger
                     * count, so readers never index past the live array */
                    lb->lb_buckets = new_buckets;
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);

                    vec_foreach(tmp_dpo, old_buckets)
                    {
                        dpo_reset(tmp_dpo);
                    }
                    vec_free(old_buckets);
                }
            }

            /*
             * buckets fixed. ready for the MAP update.
             */
            lb->lb_map = lbmi;
        }
        else
        {
            /*
             * bucket size shrinkage.
             * Any map we have will be based on the old
             * larger number of buckets, so will be translating to indices
             * out of range. So the new MAP must be installed first.
             */
            lb->lb_map = lbmi;
            CLIB_MEMORY_BARRIER();


            if (n_buckets <= LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new decreased number of buckets is crossing the threshold
                 * from out-line storage to inline:
                 *   1 - Fill the inline buckets,
                 *   2 - fixup the number (and this point the inline buckets are
                 *       used).
                 *   3 - free the outline buckets
                 */
                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets_inline,
                                          n_buckets, flags);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                vec_foreach(tmp_dpo, lb->lb_buckets)
                {
                    dpo_reset(tmp_dpo);
                }
                vec_free(lb->lb_buckets);
            }
            else
            {
                /*
                 * not crossing the threshold.
                 *  1 - update the number to the smaller size
                 *  2 - write the new buckets
                 *  3 - reset those no longer used.
                 */
                dpo_id_t *buckets;
                u32 old_n_buckets;

                old_n_buckets = lb->lb_n_buckets;
                buckets = load_balance_get_buckets(lb);

                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                load_balance_fill_buckets(lb, nhs, buckets,
                                          n_buckets, flags);

                /* release the buckets beyond the (smaller) live range */
                for (ii = n_buckets; ii < old_n_buckets; ii++)
                {
                    dpo_reset(&buckets[ii]);
                }
            }
        }
    }

    /* drop the references taken by the normalised path vector */
    vec_foreach (nh, nhs)
    {
        dpo_reset(&nh->path_dpo);
    }
    vec_free(nhs);
    vec_free(fixed_nhs);

    load_balance_map_unlock(old_lbmi);
}
863
864 static void
865 load_balance_lock (dpo_id_t *dpo)
866 {
867     load_balance_t *lb;
868
869     lb = load_balance_get(dpo->dpoi_index);
870
871     lb->lb_locks++;
872 }
873
/**
 * Tear down a load-balance: release every bucket's DPO reference, free
 * any out-of-line bucket vector, drop the uRPF-list and map locks, and
 * return the object to the pool. Called only when the last lock goes.
 */
static void
load_balance_destroy (load_balance_t *lb)
{
    dpo_id_t *buckets;
    int i;

    buckets = load_balance_get_buckets(lb);

    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        dpo_reset(&buckets[i]);
    }

    LB_DBG(lb, "destroy");
    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_free(lb->lb_buckets);
    }

    fib_urpf_list_unlock(lb->lb_urpf);
    load_balance_map_unlock(lb->lb_map);

    pool_put(load_balance_pool, lb);
}
898
899 static void
900 load_balance_unlock (dpo_id_t *dpo)
901 {
902     load_balance_t *lb;
903
904     lb = load_balance_get(dpo->dpoi_index);
905
906     lb->lb_locks--;
907
908     if (0 == lb->lb_locks)
909     {
910         load_balance_destroy(lb);
911     }
912 }
913
/**
 * DPO vft mem-show hook: report pool usage for load-balances and their
 * maps (part of 'show fib memory').
 */
static void
load_balance_mem_show (void)
{
    fib_show_memory_usage("load-balance",
                          pool_elts(load_balance_pool),
                          pool_len(load_balance_pool),
                          sizeof(load_balance_t));
    load_balance_map_show_mem();
}
923
924 static u16
925 load_balance_dpo_get_mtu (const dpo_id_t *dpo)
926 {
927     const dpo_id_t *buckets;
928     load_balance_t *lb;
929     u16 i, mtu = 0xffff;
930
931     lb = load_balance_get(dpo->dpoi_index);
932     buckets = load_balance_get_buckets(lb);
933
934     for (i = 0; i < lb->lb_n_buckets; i++)
935     {
936         mtu = clib_min (mtu, dpo_get_mtu (&buckets[i]));
937     }
938
939     return (mtu);
940 }
941
/**
 * Virtual function table registered for DPO_LOAD_BALANCE objects.
 */
const static dpo_vft_t lb_vft = {
    .dv_lock = load_balance_lock,
    .dv_unlock = load_balance_unlock,
    .dv_format = format_load_balance_dpo,
    .dv_mem_show = load_balance_mem_show,
    .dv_get_mtu = load_balance_dpo_get_mtu,
};
949
/**
 * @brief The per-protocol VLIB graph nodes that are assigned to a load-balance
 *        object.
 *
 * this means that these graph nodes are ones from which a load-balance is the
 * parent object in the DPO-graph.
 *
 * We do not list all the load-balance nodes, such as the *-lookup. instead
 * we are relying on the correct use of the .sibling_of field when setting
 * up these sibling nodes.
 */
const static char* const load_balance_ip4_nodes[] =
{
    "ip4-load-balance",
    NULL,
};
const static char* const load_balance_ip6_nodes[] =
{
    "ip6-load-balance",
    NULL,
};
const static char* const load_balance_mpls_nodes[] =
{
    "mpls-load-balance",
    NULL,
};
const static char* const load_balance_l2_nodes[] =
{
    "l2-load-balance",
    NULL,
};
const static char* const load_balance_nsh_nodes[] =
{
    "nsh-load-balance",
    NULL
};
const static char* const load_balance_bier_nodes[] =
{
    "bier-load-balance",
    NULL,
};
/* per-protocol lookup table of the NULL-terminated node-name lists above */
const static char* const * const load_balance_nodes[DPO_PROTO_NUM] =
{
    [DPO_PROTO_IP4]  = load_balance_ip4_nodes,
    [DPO_PROTO_IP6]  = load_balance_ip6_nodes,
    [DPO_PROTO_MPLS] = load_balance_mpls_nodes,
    [DPO_PROTO_ETHERNET] = load_balance_l2_nodes,
    [DPO_PROTO_NSH] = load_balance_nsh_nodes,
    [DPO_PROTO_BIER] = load_balance_bier_nodes,
};
1000
/**
 * Module initialisation: register the load-balance DPO type, reserve
 * the special index-0 load-balance, and set up logging and the
 * load-balance-map sub-module.
 *
 * NOTE: the load_balance_create() call below must be the first
 * allocation from the load-balance pool so that it receives index 0.
 */
void
load_balance_module_init (void)
{
    index_t lbi;

    dpo_register(DPO_LOAD_BALANCE, &lb_vft, load_balance_nodes);

    /*
     * Special LB with index zero. we need to define this since the v4 mtrie
     * assumes an index of 0 implies the ply is empty. therefore all 'real'
     * adjs need a non-zero index.
     * This should never be used, but just in case, stack it on a drop.
     */
    lbi = load_balance_create(1, DPO_PROTO_IP4, 0);
    load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4));

    load_balance_logger =
        vlib_log_register_class("dpo", "load-balance");

    load_balance_map_module_init();
}
1022
1023 static clib_error_t *
1024 load_balance_show (vlib_main_t * vm,
1025                    unformat_input_t * input,
1026                    vlib_cli_command_t * cmd)
1027 {
1028     index_t lbi = INDEX_INVALID;
1029
1030     while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
1031     {
1032         if (unformat (input, "%d", &lbi))
1033             ;
1034         else
1035             break;
1036     }
1037
1038     if (INDEX_INVALID != lbi)
1039     {
1040         if (pool_is_free_index(load_balance_pool, lbi))
1041         {
1042             vlib_cli_output (vm, "no such load-balance:%d", lbi);
1043         }
1044         else
1045         {
1046             vlib_cli_output (vm, "%U", format_load_balance, lbi,
1047                          LOAD_BALANCE_FORMAT_DETAIL);
1048         }
1049     }
1050     else
1051     {
1052         load_balance_t *lb;
1053
1054         pool_foreach (lb, load_balance_pool)
1055          {
1056             vlib_cli_output (vm, "%U", format_load_balance,
1057                              load_balance_get_index(lb),
1058                              LOAD_BALANCE_FORMAT_NONE);
1059         }
1060     }
1061
1062     return 0;
1063 }
1064
/* CLI registration for "show load-balance" */
VLIB_CLI_COMMAND (load_balance_show_command, static) = {
    .path = "show load-balance",
    .short_help = "show load-balance [<index>]",
    .function = load_balance_show,
};
1070
1071
1072 always_inline u32
1073 ip_flow_hash (void *data)
1074 {
1075   ip4_header_t *iph = (ip4_header_t *) data;
1076
1077   if ((iph->ip_version_and_header_length & 0xF0) == 0x40)
1078     return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT);
1079   else
1080     return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT);
1081 }
1082
1083 always_inline u64
1084 mac_to_u64 (u8 * m)
1085 {
1086   return (*((u64 *) m) & 0xffffffffffff);
1087 }
1088
1089 always_inline u32
1090 l2_flow_hash (vlib_buffer_t * b0)
1091 {
1092   ethernet_header_t *eh;
1093   u64 a, b, c;
1094   uword is_ip, eh_size;
1095   u16 eh_type;
1096
1097   eh = vlib_buffer_get_current (b0);
1098   eh_type = clib_net_to_host_u16 (eh->type);
1099   eh_size = ethernet_buffer_header_size (b0);
1100
1101   is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6);
1102
1103   /* since we have 2 cache lines, use them */
1104   if (is_ip)
1105     a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size);
1106   else
1107     a = eh->type;
1108
1109   b = mac_to_u64 ((u8 *) eh->dst_address);
1110   c = mac_to_u64 ((u8 *) eh->src_address);
1111   hash_mix64 (a, b, c);
1112
1113   return (u32) c;
1114 }
1115
/**
 * Per-packet trace data recorded by the load-balance graph nodes
 */
typedef struct load_balance_trace_t_
{
    /* index of the load-balance object the packet hashed through */
    index_t lb_index;
} load_balance_trace_t;
1120
/**
 * Shared node function for L2 and BIER load-balancing.
 *
 * For each buffer: read the load-balance index from the TX adjacency
 * slot, compute a flow hash (ethernet or BIER, per is_l2), pick a
 * bucket with the hash, and forward to that bucket's DPO - rewriting
 * the adjacency slot so the child node sees its own DPO index.
 *
 * @param is_l2 non-zero: hash the ethernet header; zero: the packet
 *              is BIER and is hashed via bier_compute_flow_hash().
 * @return number of vectors processed.
 */
always_inline uword
load_balance_inline (vlib_main_t * vm,
                     vlib_node_runtime_t * node,
                     vlib_frame_t * frame,
                     int is_l2)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          /* move the buffer from the input to the output frame */
          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          /* the parent node stored the load-balance index in the
           * TX adjacency slot */
          lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          if (is_l2)
          {
              vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0);
          }
          else
          {
              /* it's BIER */
              const bier_hdr_t *bh0 = vlib_buffer_get_current(b0);
              vnet_buffer(b0)->ip.flow_hash = bier_compute_flow_hash(bh0);
          }

          /* mask assumes lb_n_buckets is a power of 2 for non-mapped LBs */
          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}
1193
1194 static uword
1195 l2_load_balance (vlib_main_t * vm,
1196                  vlib_node_runtime_t * node,
1197                  vlib_frame_t * frame)
1198 {
1199     return (load_balance_inline(vm, node, frame, 1));
1200 }
1201
1202 static u8 *
1203 format_l2_load_balance_trace (u8 * s, va_list * args)
1204 {
1205   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1206   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1207   load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);
1208
1209   s = format (s, "L2-load-balance: index %d", t->lb_index);
1210   return s;
1211 }
1212
/**
 * @brief Graph node registration for load-balancing L2 frames
 */
VLIB_REGISTER_NODE (l2_load_balance_node) = {
  .function = l2_load_balance,
  .name = "l2-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_l2_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};
1227
/**
 * Node function for load-balancing NSH packets.
 *
 * Same frame-walk as load_balance_inline(), but the flow hash is
 * derived from the second word of the NSH header (SPI + SI).
 */
static uword
nsh_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0, *nsh0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          /* move the buffer from the input to the output frame */
          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          /* SPI + SI are the second word of the NSH header */
          nsh0 = vlib_buffer_get_current (b0);
          /* NOTE(review): the hash is reduced modulo lb_n_buckets here and
           * then masked with lb_n_buckets_minus_1 below; these agree only
           * when lb_n_buckets is a power of 2 - confirm for mapped LBs */
          vnet_buffer(b0)->ip.flow_hash = nsh0[1] % lb0->lb_n_buckets;

          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}
1291
1292 static u8 *
1293 format_nsh_load_balance_trace (u8 * s, va_list * args)
1294 {
1295   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1296   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1297   load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);
1298
1299   s = format (s, "NSH-load-balance: index %d", t->lb_index);
1300   return s;
1301 }
1302
/**
 * @brief Graph node registration for load-balancing NSH packets
 */
VLIB_REGISTER_NODE (nsh_load_balance_node) = {
  .function = nsh_load_balance,
  .name = "nsh-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_nsh_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};
1317
1318 static u8 *
1319 format_bier_load_balance_trace (u8 * s, va_list * args)
1320 {
1321   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1322   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1323   load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);
1324
1325   s = format (s, "BIER-load-balance: index %d", t->lb_index);
1326   return s;
1327 }
1328
1329 static uword
1330 bier_load_balance (vlib_main_t * vm,
1331                    vlib_node_runtime_t * node,
1332                    vlib_frame_t * frame)
1333 {
1334     return (load_balance_inline(vm, node, frame, 0));
1335 }
1336
/**
 * @brief Graph node registration for load-balancing BIER packets;
 * shares next-node arcs with mpls-load-balance via .sibling_of.
 */
VLIB_REGISTER_NODE (bier_load_balance_node) = {
  .function = bier_load_balance,
  .name = "bier-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_bier_load_balance_trace,
  .sibling_of = "mpls-load-balance",
};
1348
1349 // clang-format on