/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vnet/dpo/load_balance.h>
#include <vnet/dpo/load_balance_map.h>
#include <vnet/dpo/drop_dpo.h>
#include <vppinfra/math.h>              /* for fabs */
#include <vnet/adj/adj.h>
#include <vnet/adj/adj_internal.h>
#include <vnet/fib/fib_urpf_list.h>
#include <vnet/bier/bier_fwd.h>
#include <vnet/fib/mpls_fib.h>
#include <vnet/ip/ip4_inlines.h>
#include <vnet/ip/ip6_inlines.h>

// clang-format off

/*
 * distribution error tolerance for load-balancing
 */
const f64 multipath_next_hop_error_tolerance = 0.1;

static const char *load_balance_attr_names[] = LOAD_BALANCE_ATTR_NAMES;

/**
 * the logger
 */
vlib_log_class_t load_balance_logger;

#define LB_DBG(_lb, _fmt, _args...)                                     \
{                                                                       \
    vlib_log_debug(load_balance_logger,                                 \
                   "lb:[%U]:" _fmt,                                     \
                   format_load_balance, load_balance_get_index(_lb),    \
                   LOAD_BALANCE_FORMAT_NONE,                            \
                   ##_args);                                            \
}

/**
 * Pool of all load-balance DPOs. It is not static so the data-plane can
 * access it directly.
 */
load_balance_t *load_balance_pool;

/**
 * The one instance of load-balance main
 */
load_balance_main_t load_balance_main = {
    .lbm_to_counters = {
        .name = "route-to",
        .stat_segment_name = "/net/route/to",
    },
    .lbm_via_counters = {
        .name = "route-via",
        .stat_segment_name = "/net/route/via",
    }
};

f64
load_balance_get_multipath_tolerance (void)
{
    return (multipath_next_hop_error_tolerance);
}

static inline index_t
load_balance_get_index (const load_balance_t *lb)
{
    return (lb - load_balance_pool);
}

static inline dpo_id_t*
load_balance_get_buckets (load_balance_t *lb)
{
    if (LB_HAS_INLINE_BUCKETS(lb))
    {
        return (lb->lb_buckets_inline);
    }
    else
    {
        return (lb->lb_buckets);
    }
}

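/*
 * Allocation happens on the main thread only. Growing the pool or the
 * counter vectors can reallocate memory that workers are reading, so when
 * an expansion is anticipated the worker barrier is held across it.
 */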
static load_balance_t *
load_balance_alloc_i (void)
{
    load_balance_t *lb;
    u8 need_barrier_sync = 0;
    vlib_main_t *vm = vlib_get_main();
    ASSERT (vm->thread_index == 0);

    pool_get_aligned_will_expand (load_balance_pool, need_barrier_sync,
                                  CLIB_CACHE_LINE_BYTES);
    if (need_barrier_sync)
        vlib_worker_thread_barrier_sync (vm);

    pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES);
    clib_memset(lb, 0, sizeof(*lb));

    lb->lb_map = INDEX_INVALID;
    lb->lb_urpf = INDEX_INVALID;

    if (need_barrier_sync == 0)
    {
        need_barrier_sync += vlib_validate_combined_counter_will_expand
            (&(load_balance_main.lbm_to_counters),
             load_balance_get_index(lb));
        need_barrier_sync += vlib_validate_combined_counter_will_expand
            (&(load_balance_main.lbm_via_counters),
             load_balance_get_index(lb));
        if (need_barrier_sync)
            vlib_worker_thread_barrier_sync (vm);
    }

    vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters),
                                   load_balance_get_index(lb));
    vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters),
                                   load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters),
                               load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters),
                               load_balance_get_index(lb));

    if (need_barrier_sync)
        vlib_worker_thread_barrier_release (vm);

    return (lb);
}

static u8*
load_balance_format (index_t lbi,
                     load_balance_format_flags_t flags,
                     u32 indent,
                     u8 *s)
{
    vlib_counter_t to, via;
    load_balance_t *lb;
    dpo_id_t *buckets;
    u32 i;

    lb = load_balance_get(lbi);
    vlib_get_combined_counter(&(load_balance_main.lbm_to_counters), lbi, &to);
    vlib_get_combined_counter(&(load_balance_main.lbm_via_counters), lbi, &via);
    buckets = load_balance_get_buckets(lb);

    s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE);
    s = format(s, "[proto:%U ", format_dpo_proto, lb->lb_proto);
    s = format(s, "index:%d buckets:%d ", lbi, lb->lb_n_buckets);
    s = format(s, "uRPF:%d ", lb->lb_urpf);
    if (lb->lb_flags)
    {
        load_balance_attr_t attr;

        s = format(s, "flags:[");

        FOR_EACH_LOAD_BALANCE_ATTR(attr)
        {
            if (lb->lb_flags & (1 << attr))
            {
                s = format (s, "%s", load_balance_attr_names[attr]);
            }
        }
        s = format(s, "] ");
    }
    s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes);
    if (0 != via.packets)
    {
        s = format(s, " via:[%Ld:%Ld]",
                   via.packets, via.bytes);
    }
    s = format(s, "]");

    if (INDEX_INVALID != lb->lb_map)
    {
        s = format(s, "\n%U%U",
                   format_white_space, indent+4,
                   format_load_balance_map, lb->lb_map, indent+4);
    }
    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        s = format(s, "\n%U[%d] %U",
                   format_white_space, indent+2,
                   i,
                   format_dpo_id,
                   &buckets[i], indent+6);
    }
    return (s);
}

u8*
format_load_balance (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    load_balance_format_flags_t flags = va_arg(*args, load_balance_format_flags_t);

    return (load_balance_format(lbi, flags, 0, s));
}

static u8*
format_load_balance_dpo (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    u32 indent = va_arg(*args, u32);

    return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s));
}

flow_hash_config_t
load_balance_get_default_flow_hash (dpo_proto_t lb_proto)
{
    switch (lb_proto)
    {
    case DPO_PROTO_IP4:
    case DPO_PROTO_IP6:
        return (IP_FLOW_HASH_DEFAULT);

    case DPO_PROTO_MPLS:
        return (MPLS_FLOW_HASH_DEFAULT);

    case DPO_PROTO_ETHERNET:
    case DPO_PROTO_BIER:
    case DPO_PROTO_NSH:
        break;
    }

    return (0);
}

static load_balance_t *
load_balance_create_i (u32 num_buckets,
                       dpo_proto_t lb_proto,
                       flow_hash_config_t fhc)
{
    load_balance_t *lb;

    lb = load_balance_alloc_i();
    lb->lb_hash_config = fhc;
    lb->lb_n_buckets = num_buckets;
    lb->lb_n_buckets_minus_1 = num_buckets-1;
    lb->lb_proto = lb_proto;

    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_validate_aligned(lb->lb_buckets,
                             lb->lb_n_buckets - 1,
                             CLIB_CACHE_LINE_BYTES);
    }

    LB_DBG(lb, "create");

    return (lb);
}

index_t
load_balance_create (u32 n_buckets,
                     dpo_proto_t lb_proto,
                     flow_hash_config_t fhc)
{
    return (load_balance_get_index(load_balance_create_i(n_buckets, lb_proto, fhc)));
}
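
/*
 * Illustrative usage (a sketch, not part of this file): a caller creates a
 * load-balance with a bucket count and protocol, then points each bucket at
 * a child DPO, assuming via_dpo has been initialised to a real child
 * (e.g. from an adjacency):
 *
 *   dpo_id_t via_dpo = DPO_INVALID;
 *   index_t lbi;
 *
 *   lbi = load_balance_create(2, DPO_PROTO_IP4,
 *                             load_balance_get_default_flow_hash(DPO_PROTO_IP4));
 *   load_balance_set_bucket(lbi, 0, &via_dpo);
 *   load_balance_set_bucket(lbi, 1, &via_dpo);
 */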

static inline void
load_balance_set_bucket_i (load_balance_t *lb,
                           u32 bucket,
                           dpo_id_t *buckets,
                           const dpo_id_t *next)
{
    dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, &buckets[bucket], next);
}

void
load_balance_set_bucket (index_t lbi,
                         u32 bucket,
                         const dpo_id_t *next)
{
    load_balance_t *lb;
    dpo_id_t *buckets;

    lb = load_balance_get(lbi);
    buckets = load_balance_get_buckets(lb);

    ASSERT(bucket < lb->lb_n_buckets);

    load_balance_set_bucket_i(lb, bucket, buckets, next);
}

int
load_balance_is_drop (const dpo_id_t *dpo)
{
    load_balance_t *lb;

    if (DPO_LOAD_BALANCE != dpo->dpoi_type)
        return (0);

    lb = load_balance_get(dpo->dpoi_index);

    if (1 == lb->lb_n_buckets)
    {
        return (dpo_is_drop(load_balance_get_bucket_i(lb, 0)));
    }
    return (0);
}

u16
load_balance_n_buckets (index_t lbi)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (lb->lb_n_buckets);
}

void
load_balance_set_fib_entry_flags (index_t lbi,
                                  fib_entry_flag_t flags)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);
    lb->lb_fib_entry_flags = flags;
}

void
load_balance_set_urpf (index_t lbi,
                       index_t urpf)
{
    load_balance_t *lb;
    index_t old;

    lb = load_balance_get(lbi);

    /*
     * packets in flight may see this change, but the write is atomic, so
     * they read either the old or the new value.
     */
    old = lb->lb_urpf;
    lb->lb_urpf = urpf;

    fib_urpf_list_unlock(old);
    fib_urpf_list_lock(urpf);
}

index_t
load_balance_get_urpf (index_t lbi)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (lb->lb_urpf);
}

const dpo_id_t *
load_balance_get_bucket (index_t lbi,
                         u32 bucket)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (load_balance_get_bucket_i(lb, bucket));
}

static int
next_hop_sort_by_weight (const load_balance_path_t * n1,
                         const load_balance_path_t * n2)
{
    return ((int) n1->path_weight - (int) n2->path_weight);
}

/* The given next-hop vector is overwritten with a normalized copy: the
   entries are sorted by weight and each weight is replaced by the number of
   adjacencies (buckets) assigned to that next hop.
   Returns the total number of adjacencies in the block. */
u32
ip_multipath_normalize_next_hops (const load_balance_path_t * raw_next_hops,
                                  load_balance_path_t ** normalized_next_hops,
                                  u32 *sum_weight_in,
                                  f64 multipath_next_hop_error_tolerance)
{
    load_balance_path_t * nhs;
    uword n_nhs, n_adj, n_adj_left, i, sum_weight;
    f64 norm, error;

    n_nhs = vec_len (raw_next_hops);
    ASSERT (n_nhs > 0);
    if (n_nhs == 0)
        return 0;

    /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */
    nhs = *normalized_next_hops;
    vec_validate (nhs, 2*n_nhs - 1);

    /* Fast path: 1 next hop in block. */
    n_adj = n_nhs;
    if (n_nhs == 1)
    {
        nhs[0] = raw_next_hops[0];
        nhs[0].path_weight = 1;
        _vec_len (nhs) = 1;
        sum_weight = 1;
        goto done;
    }

    else if (n_nhs == 2)
    {
        int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;

        /* Fast sort. */
        nhs[0] = raw_next_hops[cmp];
        nhs[1] = raw_next_hops[cmp ^ 1];

        /* Fast path: equal cost multipath with 2 next hops. */
        if (nhs[0].path_weight == nhs[1].path_weight)
        {
            nhs[0].path_weight = nhs[1].path_weight = 1;
            _vec_len (nhs) = 2;
            sum_weight = 2;
            goto done;
        }
    }
    else
    {
        clib_memcpy_fast (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
        qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
    }

    /* Find total weight to normalize weights. */
    sum_weight = 0;
    for (i = 0; i < n_nhs; i++)
        sum_weight += nhs[i].path_weight;

    /* In the unlikely case that all weights are given as 0, set them all to 1. */
    if (sum_weight == 0)
    {
        for (i = 0; i < n_nhs; i++)
            nhs[i].path_weight = 1;
        sum_weight = n_nhs;
    }

    /* Save copies of all next hop weights to avoid being overwritten in loop below. */
    for (i = 0; i < n_nhs; i++)
        nhs[n_nhs + i].path_weight = nhs[i].path_weight;

    /* Try larger and larger power of 2 sized adjacency blocks until we
       find one where the realised distribution is within the given error
       tolerance of the specified weights. */
    for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
    {
        error = 0;

        norm = n_adj / ((f64) sum_weight);
        n_adj_left = n_adj;
        for (i = 0; i < n_nhs; i++)
        {
            f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */
            word n = flt_round_nearest (nf);

            n = n > n_adj_left ? n_adj_left : n;
            n_adj_left -= n;
            error += fabs (nf - n);
            nhs[i].path_weight = n;

            if (0 == nhs[i].path_weight)
            {
                /*
                 * when the weight skew is high (norm is small), nf can round
                 * down to zero. Without this correction a path with a low
                 * weight would have no representation in the load-balance -
                 * we don't want that. Setting the error to the whole block
                 * size forces a retry with twice as many buckets, so every
                 * path gets at least one. A high weight skew therefore costs
                 * a load-balance with many buckets: pays ya money, takes ya
                 * choice.
                 */
                error = n_adj;
                break;
            }
        }

        nhs[0].path_weight += n_adj_left;

        /* Is the distribution error within tolerance for this size of adjacency block? */
        if (error <= multipath_next_hop_error_tolerance*n_adj)
        {
            /* Truncate any next hops with zero weight. */
            _vec_len (nhs) = i;
            break;
        }
    }

done:
    /* Save vector for next call. */
    *normalized_next_hops = nhs;
    *sum_weight_in = sum_weight;
    return n_adj;
}
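
/*
 * Worked example (illustrative): two next hops with raw weights {1, 3} and
 * the default tolerance of 0.1. A block of n_adj = 2 gives quotas {1.5, 0.5},
 * which round to {2, 0}; the zero-weight path forces a retry. With n_adj = 4
 * the quotas are exactly {3, 1}, the error is 0 and the function returns 4:
 * the heavier path fills three buckets, the lighter one. Heavily skewed
 * weights (e.g. {100, 1}) keep doubling the block until the light path earns
 * a bucket, which is why skew costs bucket count.
 */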

static load_balance_path_t *
load_balance_multipath_next_hop_fixup (const load_balance_path_t *nhs,
                                       dpo_proto_t drop_proto)
{
    if (0 == vec_len(nhs))
    {
        load_balance_path_t *new_nhs = NULL, *nh;

        /*
         * we need something for the load-balance. so use the drop
         */
        vec_add2(new_nhs, nh, 1);

        nh->path_weight = 1;
        dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto));

        return (new_nhs);
    }

    return (NULL);
}

/*
 * Fill in adjacencies in block based on corresponding
 * next hop adjacencies.
 */
static void
load_balance_fill_buckets_norm (load_balance_t *lb,
                                load_balance_path_t *nhs,
                                dpo_id_t *buckets,
                                u32 n_buckets)
{
    load_balance_path_t *nh;
    u16 ii, bucket;

    bucket = 0;

    /*
     * the next-hops have normalised weights. that means their sum is the number
     * of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
        }
    }
}

static void
load_balance_fill_buckets_sticky (load_balance_t *lb,
                                  load_balance_path_t *nhs,
                                  dpo_id_t *buckets,
                                  u32 n_buckets)
{
    load_balance_path_t *nh, *fwding_paths;
    u16 ii, bucket, fpath;

    fpath = bucket = 0;
    fwding_paths = NULL;

    vec_foreach (nh, nhs)
    {
        if (!dpo_is_drop(&nh->path_dpo))
        {
            vec_add1(fwding_paths, *nh);
        }
    }
    if (vec_len(fwding_paths) == 0)
        fwding_paths = vec_dup(nhs);

    /*
     * the next-hops have normalised weights. that means their sum is the number
     * of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            if (!dpo_is_drop(&nh->path_dpo))
            {
                load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
            }
            else
            {
                /* fill the bucket from the next forwarding path */
                load_balance_set_bucket_i(lb, bucket++, buckets, &fwding_paths[fpath].path_dpo);
                fpath = (fpath + 1) % vec_len(fwding_paths);
            }
        }
    }

    vec_free(fwding_paths);
}
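
/*
 * Sticky fill, by example (illustrative): two paths A and B, each with
 * normalized weight 2, 4 buckets. A normal fill with B down (a drop) yields
 * [A, A, drop, drop]; a sticky fill backfills B's buckets from the remaining
 * forwarding paths, giving [A, A, A, A]. Flows that hashed to A's buckets
 * keep those buckets, so only the flows that were on B are redistributed.
 */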

static void
load_balance_fill_buckets (load_balance_t *lb,
                           load_balance_path_t *nhs,
                           dpo_id_t *buckets,
                           u32 n_buckets,
                           load_balance_flags_t flags)
{
    if (flags & LOAD_BALANCE_FLAG_STICKY)
    {
        load_balance_fill_buckets_sticky(lb, nhs, buckets, n_buckets);
    }
    else
    {
        load_balance_fill_buckets_norm(lb, nhs, buckets, n_buckets);
    }
}

static inline void
load_balance_set_n_buckets (load_balance_t *lb,
                            u32 n_buckets)
{
    lb->lb_n_buckets = n_buckets;
    lb->lb_n_buckets_minus_1 = n_buckets-1;
}
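
/*
 * Since the bucket counts produced by the normalization above are powers of
 * 2, the data-plane can select a bucket with a mask rather than a modulo:
 *
 *   bucket = flow_hash & lb->lb_n_buckets_minus_1;  // == flow_hash % n
 *
 * which is why lb_n_buckets_minus_1 is stored alongside the count.
 */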

void
load_balance_multipath_update (const dpo_id_t *dpo,
                               const load_balance_path_t * raw_nhs,
                               load_balance_flags_t flags)
{
    load_balance_path_t *nh, *nhs, *fixed_nhs;
    u32 sum_of_weights, n_buckets, ii;
    index_t lbmi, old_lbmi;
    load_balance_t *lb;
    dpo_id_t *tmp_dpo;

    nhs = NULL;

    ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
    lb = load_balance_get(dpo->dpoi_index);
    lb->lb_flags = flags;
    fixed_nhs = load_balance_multipath_next_hop_fixup(raw_nhs, lb->lb_proto);
    n_buckets =
        ip_multipath_normalize_next_hops((NULL == fixed_nhs ?
                                          raw_nhs :
                                          fixed_nhs),
                                         &nhs,
                                         &sum_of_weights,
                                         multipath_next_hop_error_tolerance);

    ASSERT (n_buckets >= vec_len (raw_nhs));

    /*
     * Save the old load-balance map used, and get a new one if required.
     */
    old_lbmi = lb->lb_map;
    if (flags & LOAD_BALANCE_FLAG_USES_MAP)
    {
        lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs);
    }
    else
    {
        lbmi = INDEX_INVALID;
    }

    if (0 == lb->lb_n_buckets)
    {
        /*
         * first time initialisation. no packets inflight, so we can write
         * at leisure.
         */
        load_balance_set_n_buckets(lb, n_buckets);

        if (!LB_HAS_INLINE_BUCKETS(lb))
            vec_validate_aligned(lb->lb_buckets,
                                 lb->lb_n_buckets - 1,
                                 CLIB_CACHE_LINE_BYTES);

        load_balance_fill_buckets(lb, nhs,
                                  load_balance_get_buckets(lb),
                                  n_buckets, flags);
        lb->lb_map = lbmi;
    }
    else
    {
        /*
         * This is a modification of an existing load-balance.
         * We need to ensure that packets in flight see a consistent state;
         * that is, the number of buckets the LB reports (read from
         * lb_n_buckets_minus_1) must never exceed the number it actually
         * has. So if the number of buckets is increasing, we must update the
         * bucket array first, then the reported number; vice-versa if the
         * number of buckets goes down.
         */
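        /*
         * In outline (a sketch of the ordering, not additional logic):
         *
         *   grow:   fill buckets[0..new-1]; CLIB_MEMORY_BARRIER();
         *           set n_buckets = new;
         *   shrink: install the new map; CLIB_MEMORY_BARRIER();
         *           set n_buckets = new; then refill/reset buckets;
         *
         * so a worker that reads a stale count only ever indexes valid,
         * fully-written buckets.
         */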
        if (n_buckets == lb->lb_n_buckets)
        {
            /*
             * no change in the number of buckets. we can simply fill what
             * is new over what is old.
             */
            load_balance_fill_buckets(lb, nhs,
                                      load_balance_get_buckets(lb),
                                      n_buckets, flags);
            lb->lb_map = lbmi;
        }
        else if (n_buckets > lb->lb_n_buckets)
        {
            /*
             * we have more buckets. the old load-balance map (if there is one)
             * will remain valid, i.e. mapping to indices within range, so we
             * update it last.
             */
            if (n_buckets > LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new increased number of buckets is crossing the threshold
                 * from the inline storage to out-of-line. Alloc the out-of-line
                 * buckets first, then fixup the number. then reset the inlines.
                 */
                ASSERT(NULL == lb->lb_buckets);
                vec_validate_aligned(lb->lb_buckets,
                                     n_buckets - 1,
                                     CLIB_CACHE_LINE_BYTES);

                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets,
                                          n_buckets, flags);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);

                CLIB_MEMORY_BARRIER();

                for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++)
                {
                    dpo_reset(&lb->lb_buckets_inline[ii]);
                }
            }
            else
            {
                if (n_buckets <= LB_NUM_INLINE_BUCKETS)
                {
                    /*
                     * we are not crossing the threshold and it's still inline buckets.
                     * we can write the new over the old.
                     */
                    load_balance_fill_buckets(lb, nhs,
                                              load_balance_get_buckets(lb),
                                              n_buckets, flags);
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);
                }
                else
                {
                    /*
                     * we are not crossing the threshold. We need a new bucket array to
                     * hold the increased number of choices.
                     */
                    dpo_id_t *new_buckets, *old_buckets, *tmp_dpo;

                    new_buckets = NULL;
                    old_buckets = load_balance_get_buckets(lb);

                    vec_validate_aligned(new_buckets,
                                         n_buckets - 1,
                                         CLIB_CACHE_LINE_BYTES);

                    load_balance_fill_buckets(lb, nhs, new_buckets,
                                              n_buckets, flags);
                    CLIB_MEMORY_BARRIER();
                    lb->lb_buckets = new_buckets;
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);

                    vec_foreach(tmp_dpo, old_buckets)
                    {
                        dpo_reset(tmp_dpo);
                    }
                    vec_free(old_buckets);
                }
            }

            /*
             * buckets fixed. ready for the MAP update.
             */
            lb->lb_map = lbmi;
        }
        else
        {
            /*
             * bucket size shrinkage.
             * Any map we have will be based on the old
             * larger number of buckets, so will be translating to indices
             * out of range. So the new MAP must be installed first.
             */
            lb->lb_map = lbmi;
            CLIB_MEMORY_BARRIER();

            if (n_buckets <= LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new decreased number of buckets is crossing the threshold
                 * from out-of-line storage to inline:
                 *   1 - Fill the inline buckets,
                 *   2 - fixup the number (at this point the inline buckets are
                 *       used).
                 *   3 - free the out-of-line buckets
                 */
                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets_inline,
                                          n_buckets, flags);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                vec_foreach(tmp_dpo, lb->lb_buckets)
                {
                    dpo_reset(tmp_dpo);
                }
                vec_free(lb->lb_buckets);
            }
            else
            {
                /*
                 * not crossing the threshold.
                 *  1 - update the number to the smaller size
                 *  2 - write the new buckets
                 *  3 - reset those no longer used.
                 */
                dpo_id_t *buckets;
                u32 old_n_buckets;

                old_n_buckets = lb->lb_n_buckets;
                buckets = load_balance_get_buckets(lb);

                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                load_balance_fill_buckets(lb, nhs, buckets,
                                          n_buckets, flags);

                for (ii = n_buckets; ii < old_n_buckets; ii++)
                {
                    dpo_reset(&buckets[ii]);
                }
            }
        }
    }

    vec_foreach (nh, nhs)
    {
        dpo_reset(&nh->path_dpo);
    }
    vec_free(nhs);
    vec_free(fixed_nhs);

    load_balance_map_unlock(old_lbmi);
}

static void
load_balance_lock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks++;
}

static void
load_balance_destroy (load_balance_t *lb)
{
    dpo_id_t *buckets;
    int i;

    buckets = load_balance_get_buckets(lb);

    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        dpo_reset(&buckets[i]);
    }

    LB_DBG(lb, "destroy");
    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_free(lb->lb_buckets);
    }

    fib_urpf_list_unlock(lb->lb_urpf);
    load_balance_map_unlock(lb->lb_map);

    pool_put(load_balance_pool, lb);
}

static void
load_balance_unlock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks--;

    if (0 == lb->lb_locks)
    {
        load_balance_destroy(lb);
    }
}

static void
load_balance_mem_show (void)
{
    fib_show_memory_usage("load-balance",
                          pool_elts(load_balance_pool),
                          pool_len(load_balance_pool),
                          sizeof(load_balance_t));
    load_balance_map_show_mem();
}

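/*
 * The path MTU of a load-balance is the minimum MTU across its buckets:
 * whichever bucket a flow hashes to, a packet of that size will fit.
 */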
static u16
load_balance_dpo_get_mtu (const dpo_id_t *dpo)
{
    const dpo_id_t *buckets;
    load_balance_t *lb;
    u16 i, mtu = 0xffff;

    lb = load_balance_get(dpo->dpoi_index);
    buckets = load_balance_get_buckets(lb);

    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        mtu = clib_min (mtu, dpo_get_mtu (&buckets[i]));
    }

    return (mtu);
}

const static dpo_vft_t lb_vft = {
    .dv_lock = load_balance_lock,
    .dv_unlock = load_balance_unlock,
    .dv_format = format_load_balance_dpo,
    .dv_mem_show = load_balance_mem_show,
    .dv_get_mtu = load_balance_dpo_get_mtu,
};

/**
 * @brief The per-protocol VLIB graph nodes that are assigned to a load-balance
 *        object.
 *
 * this means that these graph nodes are ones from which a load-balance is the
 * parent object in the DPO-graph.
 *
 * We do not list all the load-balance nodes, such as the *-lookup. instead
 * we are relying on the correct use of the .sibling_of field when setting
 * up these sibling nodes.
 */
const static char* const load_balance_ip4_nodes[] =
{
    "ip4-load-balance",
    NULL,
};
const static char* const load_balance_ip6_nodes[] =
{
    "ip6-load-balance",
    NULL,
};
const static char* const load_balance_mpls_nodes[] =
{
    "mpls-load-balance",
    NULL,
};
const static char* const load_balance_l2_nodes[] =
{
    "l2-load-balance",
    NULL,
};
const static char* const load_balance_nsh_nodes[] =
{
    "nsh-load-balance",
    NULL
};
const static char* const load_balance_bier_nodes[] =
{
    "bier-load-balance",
    NULL,
};
const static char* const * const load_balance_nodes[DPO_PROTO_NUM] =
{
    [DPO_PROTO_IP4]  = load_balance_ip4_nodes,
    [DPO_PROTO_IP6]  = load_balance_ip6_nodes,
    [DPO_PROTO_MPLS] = load_balance_mpls_nodes,
    [DPO_PROTO_ETHERNET] = load_balance_l2_nodes,
    [DPO_PROTO_NSH] = load_balance_nsh_nodes,
    [DPO_PROTO_BIER] = load_balance_bier_nodes,
};

void
load_balance_module_init (void)
{
    index_t lbi;

    dpo_register(DPO_LOAD_BALANCE, &lb_vft, load_balance_nodes);

    /*
     * Special LB with index zero. we need to define this since the v4 mtrie
     * assumes an index of 0 implies the ply is empty. therefore all 'real'
     * adjs need a non-zero index.
     * This should never be used, but just in case, stack it on a drop.
     */
    lbi = load_balance_create(1, DPO_PROTO_IP4, 0);
    load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4));

    load_balance_logger =
        vlib_log_register_class("dpo", "load-balance");

    load_balance_map_module_init();
}

static clib_error_t *
load_balance_show (vlib_main_t * vm,
                   unformat_input_t * input,
                   vlib_cli_command_t * cmd)
{
    index_t lbi = INDEX_INVALID;

    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
        if (unformat (input, "%d", &lbi))
            ;
        else
            break;
    }

    if (INDEX_INVALID != lbi)
    {
        if (pool_is_free_index(load_balance_pool, lbi))
        {
            vlib_cli_output (vm, "no such load-balance:%d", lbi);
        }
        else
        {
            vlib_cli_output (vm, "%U", format_load_balance, lbi,
                             LOAD_BALANCE_FORMAT_DETAIL);
        }
    }
    else
    {
        load_balance_t *lb;

        pool_foreach (lb, load_balance_pool)
        {
            vlib_cli_output (vm, "%U", format_load_balance,
                             load_balance_get_index(lb),
                             LOAD_BALANCE_FORMAT_NONE);
        }
    }

    return 0;
}

VLIB_CLI_COMMAND (load_balance_show_command, static) = {
    .path = "show load-balance",
    .short_help = "show load-balance [<index>]",
    .function = load_balance_show,
};

always_inline u32
ip_flow_hash (void *data)
{
  ip4_header_t *iph = (ip4_header_t *) data;

  if ((iph->ip_version_and_header_length & 0xF0) == 0x40)
    return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT);
  else
    return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT);
}

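/*
 * Load a 6-byte MAC address as a u64: read 8 bytes and mask off the top 16
 * bits. The 2 bytes read past the address are still within the Ethernet
 * header, and the mask keeps the address bytes on little-endian hosts.
 */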
always_inline u64
mac_to_u64 (u8 * m)
{
  return (*((u64 *) m) & 0xffffffffffff);
}

always_inline u32
l2_flow_hash (vlib_buffer_t * b0)
{
  ethernet_header_t *eh;
  u64 a, b, c;
  uword is_ip, eh_size;
  u16 eh_type;

  eh = vlib_buffer_get_current (b0);
  eh_type = clib_net_to_host_u16 (eh->type);
  eh_size = ethernet_buffer_header_size (b0);

  is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6);

  /* since we have 2 cache lines, use them */
  if (is_ip)
    a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size);
  else
    a = eh->type;

  b = mac_to_u64 ((u8 *) eh->dst_address);
  c = mac_to_u64 ((u8 *) eh->src_address);
  hash_mix64 (a, b, c);

  return (u32) c;
}

typedef struct load_balance_trace_t_
{
    index_t lb_index;
} load_balance_trace_t;

always_inline uword
load_balance_inline (vlib_main_t * vm,
                     vlib_node_runtime_t * node,
                     vlib_frame_t * frame,
                     int is_l2)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          /* the load-balance to use is in the buffer's tx adjacency */
          lbi0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          if (is_l2)
          {
              vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0);
          }
          else
          {
              /* it's BIER */
              const bier_hdr_t *bh0 = vlib_buffer_get_current(b0);
              vnet_buffer(b0)->ip.flow_hash = bier_compute_flow_hash(bh0);
          }

          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

static uword
l2_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
    return (load_balance_inline(vm, node, frame, 1));
}

static u8 *
format_l2_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "L2-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (l2_load_balance_node) = {
  .function = l2_load_balance,
  .name = "l2-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_l2_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};

static uword
nsh_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0, *nsh0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          lbi0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          /* SPI + SI are the second word of the NSH header */
          nsh0 = vlib_buffer_get_current (b0);
          vnet_buffer(b0)->ip.flow_hash = nsh0[1] % lb0->lb_n_buckets;

          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

static u8 *
format_nsh_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "NSH-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (nsh_load_balance_node) = {
  .function = nsh_load_balance,
  .name = "nsh-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_nsh_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};

static u8 *
format_bier_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "BIER-load-balance: index %d", t->lb_index);
  return s;
}

static uword
bier_load_balance (vlib_main_t * vm,
                   vlib_node_runtime_t * node,
                   vlib_frame_t * frame)
{
    return (load_balance_inline(vm, node, frame, 0));
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (bier_load_balance_node) = {
  .function = bier_load_balance,
  .name = "bier-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_bier_load_balance_trace,
  .sibling_of = "mpls-load-balance",
};

// clang-format on