/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vnet/dpo/load_balance.h>
#include <vnet/dpo/load_balance_map.h>
#include <vnet/dpo/drop_dpo.h>
#include <vppinfra/math.h>              /* for fabs */
#include <vnet/adj/adj.h>
#include <vnet/adj/adj_internal.h>
#include <vnet/fib/fib_urpf_list.h>
#include <vnet/bier/bier_fwd.h>
#include <vnet/fib/mpls_fib.h>
#include <vnet/ip/ip4_inlines.h>
#include <vnet/ip/ip6_inlines.h>

// clang-format off

/*
 * distribution error tolerance for load-balancing
 */
const f64 multipath_next_hop_error_tolerance = 0.1;

static const char *load_balance_attr_names[] = LOAD_BALANCE_ATTR_NAMES;

/**
 * the logger
 */
vlib_log_class_t load_balance_logger;

#define LB_DBG(_lb, _fmt, _args...)                                     \
{                                                                       \
    vlib_log_debug(load_balance_logger,                                 \
                   "lb:[%U]:" _fmt,                                     \
                   format_load_balance, load_balance_get_index(_lb),    \
                   LOAD_BALANCE_FORMAT_NONE,                            \
                   ##_args);                                            \
}

/**
 * Pool of all load-balance DPOs. It's not static so the data-plane can
 * have fast access
 */
load_balance_t *load_balance_pool;

/**
 * The one instance of load-balance main
 */
load_balance_main_t load_balance_main = {
    .lbm_to_counters = {
        .name = "route-to",
        .stat_segment_name = "/net/route/to",
    },
    .lbm_via_counters = {
        .name = "route-via",
        .stat_segment_name = "/net/route/via",
    }
};
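
/*
 * Note: the to/via counters above are exported in the stats segment
 * under /net/route/to and /net/route/via, one combined (packets/bytes)
 * counter per load-balance index.
 */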

f64
load_balance_get_multipath_tolerance (void)
{
    return (multipath_next_hop_error_tolerance);
}

static inline index_t
load_balance_get_index (const load_balance_t *lb)
{
    return (lb - load_balance_pool);
}

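/**
 * Return the bucket array. Buckets are stored inline in the
 * load-balance object when there are no more than LB_NUM_INLINE_BUCKETS
 * of them; larger sets spill to a separately allocated vector, keeping
 * the common small-ECMP case on the object's own cache lines.
 */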
static inline dpo_id_t*
load_balance_get_buckets (load_balance_t *lb)
{
    if (LB_HAS_INLINE_BUCKETS(lb))
    {
        return (lb->lb_buckets_inline);
    }
    else
    {
        return (lb->lb_buckets);
    }
}

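/*
 * Allocation happens only on the main thread. If the pool or either
 * counter vector would reallocate, the workers are barrier-synced
 * first so the data-plane never chases a stale base pointer.
 */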
static load_balance_t *
load_balance_alloc_i (void)
{
    load_balance_t *lb;
    u8 need_barrier_sync = 0;
    vlib_main_t *vm = vlib_get_main();
    ASSERT (vm->thread_index == 0);

    need_barrier_sync = pool_get_will_expand (load_balance_pool);

    if (need_barrier_sync)
        vlib_worker_thread_barrier_sync (vm);

    pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES);
    clib_memset(lb, 0, sizeof(*lb));

    lb->lb_map = INDEX_INVALID;
    lb->lb_urpf = INDEX_INVALID;

    if (need_barrier_sync == 0)
    {
        need_barrier_sync += vlib_validate_combined_counter_will_expand
            (&(load_balance_main.lbm_to_counters),
             load_balance_get_index(lb));
        need_barrier_sync += vlib_validate_combined_counter_will_expand
            (&(load_balance_main.lbm_via_counters),
             load_balance_get_index(lb));
        if (need_barrier_sync)
            vlib_worker_thread_barrier_sync (vm);
    }

    vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters),
                                   load_balance_get_index(lb));
    vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters),
                                   load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters),
                               load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters),
                               load_balance_get_index(lb));

    if (need_barrier_sync)
        vlib_worker_thread_barrier_release (vm);

    return (lb);
}

static u8*
load_balance_format (index_t lbi,
                     load_balance_format_flags_t flags,
                     u32 indent,
                     u8 *s)
{
    vlib_counter_t to, via;
    load_balance_t *lb;
    dpo_id_t *buckets;
    u32 i;

    lb = load_balance_get_or_null(lbi);
    if (lb == NULL)
      {
        s = format(s, "DELETED lb:%u", lbi);
        return (s);
      }

    vlib_get_combined_counter(&(load_balance_main.lbm_to_counters), lbi, &to);
    vlib_get_combined_counter(&(load_balance_main.lbm_via_counters), lbi, &via);
    buckets = load_balance_get_buckets(lb);

    s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE);
    s = format(s, "[proto:%U ", format_dpo_proto, lb->lb_proto);
    s = format(s, "index:%d buckets:%d ", lbi, lb->lb_n_buckets);
    s = format(s, "uRPF:%d ", lb->lb_urpf);
    if (lb->lb_flags)
    {
        load_balance_attr_t attr;

        s = format(s, "flags:[");

        FOR_EACH_LOAD_BALANCE_ATTR(attr)
        {
            if (lb->lb_flags & (1 << attr))
            {
                s = format (s, "%s", load_balance_attr_names[attr]);
            }
        }
        s = format(s, "] ");
    }
    s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes);
    if (0 != via.packets)
    {
        s = format(s, " via:[%Ld:%Ld]",
                   via.packets, via.bytes);
    }
    s = format(s, "]");

    if (INDEX_INVALID != lb->lb_map)
    {
        s = format(s, "\n%U%U",
                   format_white_space, indent+4,
                   format_load_balance_map, lb->lb_map, indent+4);
    }
    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        s = format(s, "\n%U[%d] %U",
                   format_white_space, indent+2,
                   i,
                   format_dpo_id,
                   &buckets[i], indent+6);
    }
    return (s);
}

u8*
format_load_balance (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    load_balance_format_flags_t flags = va_arg(*args, load_balance_format_flags_t);

    return (load_balance_format(lbi, flags, 0, s));
}

static u8*
format_load_balance_dpo (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    u32 indent = va_arg(*args, u32);

    return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s));
}

flow_hash_config_t
load_balance_get_default_flow_hash (dpo_proto_t lb_proto)
{
    switch (lb_proto)
    {
    case DPO_PROTO_IP4:
    case DPO_PROTO_IP6:
        return (IP_FLOW_HASH_DEFAULT);

    case DPO_PROTO_MPLS:
        return (MPLS_FLOW_HASH_DEFAULT);

    case DPO_PROTO_ETHERNET:
    case DPO_PROTO_BIER:
    case DPO_PROTO_NSH:
        break;
    }

    return (0);
}

static load_balance_t *
load_balance_create_i (u32 num_buckets,
                       dpo_proto_t lb_proto,
                       flow_hash_config_t fhc)
{
    load_balance_t *lb;

    ASSERT (num_buckets <= LB_MAX_BUCKETS);

    lb = load_balance_alloc_i();
    lb->lb_hash_config = fhc;
    lb->lb_n_buckets = num_buckets;
    lb->lb_n_buckets_minus_1 = num_buckets-1;
    lb->lb_proto = lb_proto;

    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_validate_aligned(lb->lb_buckets,
                             lb->lb_n_buckets - 1,
                             CLIB_CACHE_LINE_BYTES);
    }

    LB_DBG(lb, "create");

    return (lb);
}

index_t
load_balance_create (u32 n_buckets,
                     dpo_proto_t lb_proto,
                     flow_hash_config_t fhc)
{
    return (load_balance_get_index(load_balance_create_i(n_buckets, lb_proto, fhc)));
}

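/*
 * A minimal usage sketch (illustrative; it mirrors what
 * load_balance_module_init() does at the bottom of this file): create
 * a single-bucket load-balance for IPv4 and stack that bucket on a
 * drop:
 *
 *   index_t lbi;
 *
 *   lbi = load_balance_create (1, DPO_PROTO_IP4,
 *                              load_balance_get_default_flow_hash
 *                                (DPO_PROTO_IP4));
 *   load_balance_set_bucket (lbi, 0, drop_dpo_get (DPO_PROTO_IP4));
 */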
static inline void
load_balance_set_bucket_i (load_balance_t *lb,
                           u32 bucket,
                           dpo_id_t *buckets,
                           const dpo_id_t *next)
{
    dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, &buckets[bucket], next);
}

void
load_balance_set_bucket (index_t lbi,
                         u32 bucket,
                         const dpo_id_t *next)
{
    load_balance_t *lb;
    dpo_id_t *buckets;

    lb = load_balance_get(lbi);
    buckets = load_balance_get_buckets(lb);

    ASSERT(bucket < lb->lb_n_buckets);

    load_balance_set_bucket_i(lb, bucket, buckets, next);
}

int
load_balance_is_drop (const dpo_id_t *dpo)
{
    load_balance_t *lb;

    if (DPO_LOAD_BALANCE != dpo->dpoi_type)
        return (0);

    lb = load_balance_get(dpo->dpoi_index);

    if (1 == lb->lb_n_buckets)
    {
        return (dpo_is_drop(load_balance_get_bucket_i(lb, 0)));
    }
    return (0);
}

u16
load_balance_n_buckets (index_t lbi)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (lb->lb_n_buckets);
}

void
load_balance_set_fib_entry_flags (index_t lbi,
                                  fib_entry_flag_t flags)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);
    lb->lb_fib_entry_flags = flags;
}

void
load_balance_set_urpf (index_t lbi,
                       index_t urpf)
{
    load_balance_t *lb;
    index_t old;

    lb = load_balance_get(lbi);

    /*
     * packets in flight may see this change, but the update is atomic,
     * so they will observe either the old or the new uRPF list
     */
    old = lb->lb_urpf;
    lb->lb_urpf = urpf;

    fib_urpf_list_unlock(old);
    fib_urpf_list_lock(urpf);
}

index_t
load_balance_get_urpf (index_t lbi)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (lb->lb_urpf);
}

const dpo_id_t *
load_balance_get_bucket (index_t lbi,
                         u32 bucket)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (load_balance_get_bucket_i(lb, bucket));
}

static int
next_hop_sort_by_weight (const load_balance_path_t * n1,
                         const load_balance_path_t * n2)
{
    return ((int) n1->path_weight - (int) n2->path_weight);
}

/* The given next-hop vector is over-written with a normalized copy:
   entries are sorted by weight, and each weight is rescaled to the number
   of adjacencies (buckets) assigned to that next hop.
   Returns the total number of adjacencies in the block. */
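/*
 * Worked example (illustrative, with the default tolerance of 0.1):
 * two paths with weights {1, 2}, sum 3. A 2-bucket block normalizes to
 * {1, 1} with rounding error |2/3 - 1| + |4/3 - 1| = 0.67 > 0.2;
 * 4 buckets give {1, 3} with error |4/3 - 1| + |8/3 - 3| = 0.67 > 0.4;
 * 8 buckets give {3, 5} with error |8/3 - 3| + |16/3 - 5| = 0.67 <= 0.8.
 * So the paths get a 37.5%/62.5% split over 8 buckets against the
 * ideal 33.3%/66.7%.
 */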
u32
ip_multipath_normalize_next_hops (const load_balance_path_t * raw_next_hops,
                                  load_balance_path_t ** normalized_next_hops,
                                  u32 *sum_weight_in,
                                  f64 multipath_next_hop_error_tolerance)
{
    load_balance_path_t * nhs;
    uword n_nhs, n_adj, n_adj_left, i, sum_weight;
    f64 norm, error;

    n_nhs = vec_len (raw_next_hops);
    ASSERT (n_nhs > 0);
    if (n_nhs == 0)
        return 0;

    /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */
    nhs = *normalized_next_hops;
    vec_validate (nhs, 2*n_nhs - 1);

    /* Fast path: 1 next hop in block. */
    n_adj = n_nhs;
    if (n_nhs == 1)
    {
        nhs[0] = raw_next_hops[0];
        nhs[0].path_weight = 1;
        vec_set_len (nhs, 1);
        sum_weight = 1;
        goto done;
    }

    else if (n_nhs == 2)
    {
        int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;

        /* Fast sort. */
        nhs[0] = raw_next_hops[cmp];
        nhs[1] = raw_next_hops[cmp ^ 1];

        /* Fast path: equal cost multipath with 2 next hops. */
        if (nhs[0].path_weight == nhs[1].path_weight)
        {
            nhs[0].path_weight = nhs[1].path_weight = 1;
            vec_set_len (nhs, 2);
            sum_weight = 2;
            goto done;
        }
    }
    else
    {
        clib_memcpy_fast (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
        qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
    }

    /* Find total weight to normalize weights. */
    sum_weight = 0;
    for (i = 0; i < n_nhs; i++)
        sum_weight += nhs[i].path_weight;

    /* In the unlikely case that all weights are given as 0, set them all to 1. */
    if (sum_weight == 0)
    {
        for (i = 0; i < n_nhs; i++)
            nhs[i].path_weight = 1;
        sum_weight = n_nhs;
    }

    /* Save copies of all next hop weights to avoid being overwritten in loop below. */
    for (i = 0; i < n_nhs; i++)
        nhs[n_nhs + i].path_weight = nhs[i].path_weight;

    /* Try larger and larger power of 2 sized adjacency blocks until we
       find one where the traffic distribution is within the given error
       tolerance of the specified weights. */
    for (n_adj = clib_min(max_pow2 (n_nhs), LB_MAX_BUCKETS); ; n_adj *= 2)
    {
        ASSERT (n_adj <= LB_MAX_BUCKETS);
        error = 0;

        norm = n_adj / ((f64) sum_weight);
        n_adj_left = n_adj;
        for (i = 0; i < n_nhs; i++)
        {
            f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */
            word n = flt_round_nearest (nf);

            n = n > n_adj_left ? n_adj_left : n;
            n_adj_left -= n;
            error += fabs (nf - n);
            nhs[i].path_weight = n;

            if (0 == nhs[i].path_weight)
            {
                /*
                 * this happens when the weight skew is high (norm is small)
                 * and rounding drops the path to zero. Without this
                 * correction a path with a low weight would have no
                 * representation in the load-balance - we don't want that.
                 * So force the error up and retry with a bigger block: if
                 * the weight skew is high, the load-balance needs many
                 * buckets to represent it. Pays ya money, takes ya choice.
                 */
                error = n_adj;
                break;
            }
        }

        nhs[0].path_weight += n_adj_left;

        /* Less than the tolerated average error per adjacency with this size
         * of adjacency block, or have we reached the maximum number of
         * buckets we support? */
        if (error <= multipath_next_hop_error_tolerance*n_adj ||
            n_adj >= LB_MAX_BUCKETS)
        {
          if (i < n_nhs)
          {
            /* Truncate any next hops in excess */
            vlib_log_err(load_balance_logger,
                         "Too many paths for load-balance, truncating %d -> %d",
                         n_nhs, i);
            for (int j = i; j < n_nhs; j++)
              dpo_reset (&vec_elt(nhs, j).path_dpo);
          }
          vec_set_len (nhs, i);
          break;
        }
    }

done:
    /* Save vector for next call. */
    *normalized_next_hops = nhs;
    *sum_weight_in = sum_weight;
    return n_adj;
}

static load_balance_path_t *
load_balance_multipath_next_hop_fixup (const load_balance_path_t *nhs,
                                       dpo_proto_t drop_proto)
{
    if (0 == vec_len(nhs))
    {
        load_balance_path_t *new_nhs = NULL, *nh;

        /*
         * we need something for the load-balance. so use the drop
         */
        vec_add2(new_nhs, nh, 1);

        nh->path_weight = 1;
        dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto));

        return (new_nhs);
    }

    return (NULL);
}

/*
 * Fill in adjacencies in block based on corresponding
 * next hop adjacencies.
 */
static void
load_balance_fill_buckets_norm (load_balance_t *lb,
                                load_balance_path_t *nhs,
                                dpo_id_t *buckets,
                                u32 n_buckets)
{
    load_balance_path_t *nh;
    u16 ii, bucket;

    bucket = 0;

    /*
     * the next-hops have normalised weights. that means their sum is the number
     * of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
        }
    }
}
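
/*
 * Sticky variant: buckets whose path is a drop are re-pointed,
 * round-robin, at the remaining forwarding paths, while buckets of
 * healthy paths keep their positions - so flows hashed to a surviving
 * path are not reshuffled when another member goes down.
 */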
static void
load_balance_fill_buckets_sticky (load_balance_t *lb,
                                  load_balance_path_t *nhs,
                                  dpo_id_t *buckets,
                                  u32 n_buckets)
{
    load_balance_path_t *nh, *fwding_paths;
    u16 ii, bucket, fpath;

    fpath = bucket = 0;
    fwding_paths = NULL;

    vec_foreach (nh, nhs)
    {
        if (!dpo_is_drop(&nh->path_dpo))
        {
            vec_add1(fwding_paths, *nh);
        }
    }
    if (vec_len(fwding_paths) == 0)
        fwding_paths = vec_dup(nhs);

    /*
     * the next-hops have normalised weights. that means their sum is the number
     * of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            if (!dpo_is_drop(&nh->path_dpo))
            {
                load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
            }
            else
            {
                /* fill the bucket from the next forwarding path, round-robin */
                load_balance_set_bucket_i(lb, bucket++, buckets, &fwding_paths[fpath].path_dpo);
                ASSERT(vec_len(fwding_paths) > 0);
                fpath = (fpath + 1) % vec_len(fwding_paths);
            }
        }
    }

    vec_free(fwding_paths);
}

static void
load_balance_fill_buckets (load_balance_t *lb,
                           load_balance_path_t *nhs,
                           dpo_id_t *buckets,
                           u32 n_buckets,
                           load_balance_flags_t flags)
{
    if (flags & LOAD_BALANCE_FLAG_STICKY)
    {
        load_balance_fill_buckets_sticky(lb, nhs, buckets, n_buckets);
    }
    else
    {
        load_balance_fill_buckets_norm(lb, nhs, buckets, n_buckets);
    }
}

static inline void
load_balance_set_n_buckets (load_balance_t *lb,
                            u32 n_buckets)
{
    ASSERT (n_buckets <= LB_MAX_BUCKETS);
    lb->lb_n_buckets = n_buckets;
    lb->lb_n_buckets_minus_1 = n_buckets-1;
}

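/*
 * Replace the set of next-hops for a load-balance. The write ordering
 * below ensures a worker that reads lb_n_buckets_minus_1 never indexes
 * past the bucket array that is live at that moment.
 */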
void
load_balance_multipath_update (const dpo_id_t *dpo,
                               const load_balance_path_t * raw_nhs,
                               load_balance_flags_t flags)
{
    load_balance_path_t *nh, *nhs, *fixed_nhs;
    u32 sum_of_weights, n_buckets, ii;
    index_t lbmi, old_lbmi;
    load_balance_t *lb;
    dpo_id_t *tmp_dpo;

    nhs = NULL;

    ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
    lb = load_balance_get(dpo->dpoi_index);
    lb->lb_flags = flags;
    fixed_nhs = load_balance_multipath_next_hop_fixup(raw_nhs, lb->lb_proto);
    n_buckets =
        ip_multipath_normalize_next_hops((NULL == fixed_nhs ?
                                          raw_nhs :
                                          fixed_nhs),
                                         &nhs,
                                         &sum_of_weights,
                                         multipath_next_hop_error_tolerance);

    /*
     * Save the old load-balance map used, and get a new one if required.
     */
    old_lbmi = lb->lb_map;
    if (flags & LOAD_BALANCE_FLAG_USES_MAP)
    {
        lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs);
    }
    else
    {
        lbmi = INDEX_INVALID;
    }

    if (0 == lb->lb_n_buckets)
    {
        /*
         * first time initialisation. no packets inflight, so we can write
         * at leisure.
         */
        load_balance_set_n_buckets(lb, n_buckets);

        if (!LB_HAS_INLINE_BUCKETS(lb))
            vec_validate_aligned(lb->lb_buckets,
                                 lb->lb_n_buckets - 1,
                                 CLIB_CACHE_LINE_BYTES);

        load_balance_fill_buckets(lb, nhs,
                                  load_balance_get_buckets(lb),
                                  n_buckets, flags);
        lb->lb_map = lbmi;
    }
    else
    {
        /*
         * This is a modification of an existing load-balance.
         * We need to ensure that packets in flight see a consistent state:
         * the number of buckets the LB reports (read from
         * lb_n_buckets_minus_1) must never exceed the number it actually
         * has. So if the number of buckets is increasing, we must update the
         * bucket array first, then the reported number; and vice-versa if
         * the number of buckets goes down.
         */
        if (n_buckets == lb->lb_n_buckets)
        {
            /*
             * no change in the number of buckets. we can simply fill what
             * is new over what is old.
             */
            load_balance_fill_buckets(lb, nhs,
                                      load_balance_get_buckets(lb),
                                      n_buckets, flags);
            lb->lb_map = lbmi;
        }
        else if (n_buckets > lb->lb_n_buckets)
        {
            /*
             * we have more buckets. the old load-balance map (if there is one)
             * will remain valid, i.e. mapping to indices within range, so we
             * update it last.
             */
            if (n_buckets > LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new increased number of buckets is crossing the threshold
                 * from the inline storage to out-line. Alloc the outline buckets
                 * first, then fixup the number. then reset the inlines.
                 */
                ASSERT(NULL == lb->lb_buckets);
                vec_validate_aligned(lb->lb_buckets,
                                     n_buckets - 1,
                                     CLIB_CACHE_LINE_BYTES);

                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets,
                                          n_buckets, flags);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);

                CLIB_MEMORY_BARRIER();

                for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++)
                {
                    dpo_reset(&lb->lb_buckets_inline[ii]);
                }
            }
            else
            {
                if (n_buckets <= LB_NUM_INLINE_BUCKETS)
                {
                    /*
                     * we are not crossing the threshold and it's still inline buckets.
                     * we can write the new over the old.
                     */
                    load_balance_fill_buckets(lb, nhs,
                                              load_balance_get_buckets(lb),
                                              n_buckets, flags);
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);
                }
                else
                {
                    /*
                     * we are not crossing the threshold. We need a new bucket array to
                     * hold the increased number of choices.
                     */
                    dpo_id_t *new_buckets, *old_buckets, *tmp_dpo;

                    new_buckets = NULL;
                    old_buckets = load_balance_get_buckets(lb);

                    vec_validate_aligned(new_buckets,
                                         n_buckets - 1,
                                         CLIB_CACHE_LINE_BYTES);

                    load_balance_fill_buckets(lb, nhs, new_buckets,
                                              n_buckets, flags);
                    CLIB_MEMORY_BARRIER();
                    lb->lb_buckets = new_buckets;
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);

                    vec_foreach(tmp_dpo, old_buckets)
                    {
                        dpo_reset(tmp_dpo);
                    }
                    vec_free(old_buckets);
                }
            }

            /*
             * buckets fixed. ready for the MAP update.
             */
            lb->lb_map = lbmi;
        }
        else
        {
            /*
             * bucket size shrinkage.
             * Any map we have will be based on the old
             * larger number of buckets, so will be translating to indices
             * out of range. So the new MAP must be installed first.
             */
            lb->lb_map = lbmi;
            CLIB_MEMORY_BARRIER();

            if (n_buckets <= LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new decreased number of buckets is crossing the threshold
                 * from out-line storage to inline:
                 *   1 - Fill the inline buckets,
                 *   2 - fixup the number (at this point the inline buckets are
                 *       used).
                 *   3 - free the out-line buckets
                 */
                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets_inline,
                                          n_buckets, flags);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                vec_foreach(tmp_dpo, lb->lb_buckets)
                {
                    dpo_reset(tmp_dpo);
                }
                vec_free(lb->lb_buckets);
            }
            else
            {
                /*
                 * not crossing the threshold.
                 *  1 - update the number to the smaller size
                 *  2 - write the new buckets
                 *  3 - reset those no longer used.
                 */
                dpo_id_t *buckets;
                u32 old_n_buckets;

                old_n_buckets = lb->lb_n_buckets;
                buckets = load_balance_get_buckets(lb);

                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                load_balance_fill_buckets(lb, nhs, buckets,
                                          n_buckets, flags);

                for (ii = n_buckets; ii < old_n_buckets; ii++)
                {
                    dpo_reset(&buckets[ii]);
                }
            }
        }
    }

    vec_foreach (nh, nhs)
    {
        dpo_reset(&nh->path_dpo);
    }
    vec_free(nhs);
    vec_free(fixed_nhs);

    load_balance_map_unlock(old_lbmi);
}

static void
load_balance_lock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks++;
}

static void
load_balance_destroy (load_balance_t *lb)
{
    dpo_id_t *buckets;
    int i;

    buckets = load_balance_get_buckets(lb);

    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        dpo_reset(&buckets[i]);
    }

    LB_DBG(lb, "destroy");
    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_free(lb->lb_buckets);
    }

    fib_urpf_list_unlock(lb->lb_urpf);
    load_balance_map_unlock(lb->lb_map);

    pool_put(load_balance_pool, lb);
}

static void
load_balance_unlock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks--;

    if (0 == lb->lb_locks)
    {
        load_balance_destroy(lb);
    }
}

static void
load_balance_mem_show (void)
{
    fib_show_memory_usage("load-balance",
                          pool_elts(load_balance_pool),
                          pool_len(load_balance_pool),
                          sizeof(load_balance_t));
    load_balance_map_show_mem();
}

static u16
load_balance_dpo_get_mtu (const dpo_id_t *dpo)
{
    const dpo_id_t *buckets;
    load_balance_t *lb;
    u16 i, mtu = 0xffff;

    lb = load_balance_get(dpo->dpoi_index);
    buckets = load_balance_get_buckets(lb);

    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        mtu = clib_min (mtu, dpo_get_mtu (&buckets[i]));
    }

    return (mtu);
}

const static dpo_vft_t lb_vft = {
    .dv_lock = load_balance_lock,
    .dv_unlock = load_balance_unlock,
    .dv_format = format_load_balance_dpo,
    .dv_mem_show = load_balance_mem_show,
    .dv_get_mtu = load_balance_dpo_get_mtu,
};

/**
 * @brief The per-protocol VLIB graph nodes that are assigned to a load-balance
 *        object.
 *
 * this means that these graph nodes are ones from which a load-balance is the
 * parent object in the DPO-graph.
 *
 * We do not list all the load-balance nodes, such as the *-lookup. instead
 * we are relying on the correct use of the .sibling_of field when setting
 * up these sibling nodes.
 */
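/*
 * (bier_load_balance_node at the bottom of this file is one such
 * sibling: it declares .sibling_of = "mpls-load-balance".)
 */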
const static char* const load_balance_ip4_nodes[] =
{
    "ip4-load-balance",
    NULL,
};
const static char* const load_balance_ip6_nodes[] =
{
    "ip6-load-balance",
    NULL,
};
const static char* const load_balance_mpls_nodes[] =
{
    "mpls-load-balance",
    NULL,
};
const static char* const load_balance_l2_nodes[] =
{
    "l2-load-balance",
    NULL,
};
const static char* const load_balance_nsh_nodes[] =
{
    "nsh-load-balance",
    NULL
};
const static char* const load_balance_bier_nodes[] =
{
    "bier-load-balance",
    NULL,
};
const static char* const * const load_balance_nodes[DPO_PROTO_NUM] =
{
    [DPO_PROTO_IP4]  = load_balance_ip4_nodes,
    [DPO_PROTO_IP6]  = load_balance_ip6_nodes,
    [DPO_PROTO_MPLS] = load_balance_mpls_nodes,
    [DPO_PROTO_ETHERNET] = load_balance_l2_nodes,
    [DPO_PROTO_NSH] = load_balance_nsh_nodes,
    [DPO_PROTO_BIER] = load_balance_bier_nodes,
};

void
load_balance_module_init (void)
{
    index_t lbi;

    dpo_register(DPO_LOAD_BALANCE, &lb_vft, load_balance_nodes);

    /*
     * Special LB with index zero. we need to define this since the v4 mtrie
     * assumes an index of 0 implies the ply is empty. therefore all 'real'
     * adjs need a non-zero index.
     * This should never be used, but just in case, stack it on a drop.
     */
    lbi = load_balance_create(1, DPO_PROTO_IP4, 0);
    load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4));

    load_balance_logger =
        vlib_log_register_class("dpo", "load-balance");

    load_balance_map_module_init();
}

static clib_error_t *
load_balance_show (vlib_main_t * vm,
                   unformat_input_t * input,
                   vlib_cli_command_t * cmd)
{
    index_t lbi = INDEX_INVALID;

    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
        if (unformat (input, "%d", &lbi))
            ;
        else
            break;
    }

    if (INDEX_INVALID != lbi)
    {
        if (pool_is_free_index(load_balance_pool, lbi))
        {
            vlib_cli_output (vm, "no such load-balance:%d", lbi);
        }
        else
        {
            vlib_cli_output (vm, "%U", format_load_balance, lbi,
                             LOAD_BALANCE_FORMAT_DETAIL);
        }
    }
    else
    {
        load_balance_t *lb;

        pool_foreach (lb, load_balance_pool)
        {
            vlib_cli_output (vm, "%U", format_load_balance,
                             load_balance_get_index(lb),
                             LOAD_BALANCE_FORMAT_NONE);
        }
    }

    return 0;
}

VLIB_CLI_COMMAND (load_balance_show_command, static) = {
    .path = "show load-balance",
    .short_help = "show load-balance [<index>]",
    .function = load_balance_show,
};
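
/*
 * Example (illustrative):
 *   vpp# show load-balance 12
 * prints the detailed format for that one object, including each
 * bucket's stacked DPO; with no index, every load-balance is listed
 * in the brief format.
 */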

always_inline u32
ip_flow_hash (void *data)
{
  ip4_header_t *iph = (ip4_header_t *) data;

  if ((iph->ip_version_and_header_length & 0xF0) == 0x40)
    return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT);
  else
    return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT);
}

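/*
 * Load 8 bytes at the MAC address and keep the low 48 bits; on a
 * little-endian host this yields the 6 MAC bytes. The 2 bytes read
 * beyond the address still lie within the ethernet header.
 */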
always_inline u64
mac_to_u64 (u8 * m)
{
  return (*((u64 *) m) & 0xffffffffffff);
}

always_inline u32
l2_flow_hash (vlib_buffer_t * b0)
{
  ethernet_header_t *eh;
  u64 a, b, c;
  uword is_ip, eh_size;
  u16 eh_type;

  eh = vlib_buffer_get_current (b0);
  eh_type = clib_net_to_host_u16 (eh->type);
  eh_size = ethernet_buffer_header_size (b0);

  is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6);

  /* since we have 2 cache lines, use them */
  if (is_ip)
    a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size);
  else
    a = eh->type;

  b = mac_to_u64 ((u8 *) eh->dst_address);
  c = mac_to_u64 ((u8 *) eh->src_address);
  hash_mix64 (a, b, c);

  return (u32) c;
}

typedef struct load_balance_trace_t_
{
    index_t lb_index;
} load_balance_trace_t;

always_inline uword
load_balance_inline (vlib_main_t * vm,
                     vlib_node_runtime_t * node,
                     vlib_frame_t * frame,
                     int is_l2)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          /* the previous node stored the load-balance index in the TX adjacency */
          lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          if (is_l2)
          {
              vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0);
          }
          else
          {
              /* it's BIER */
              const bier_hdr_t *bh0 = vlib_buffer_get_current(b0);
              vnet_buffer(b0)->ip.flow_hash = bier_compute_flow_hash(bh0);
          }

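          /* bucket counts are normalized to a power of 2, so a mask
           * selects the bucket without a modulo */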
          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

static uword
l2_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
    return (load_balance_inline(vm, node, frame, 1));
}

static u8 *
format_l2_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "L2-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (l2_load_balance_node) = {
  .function = l2_load_balance,
  .name = "l2-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_l2_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};

static uword
nsh_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0, *nsh0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          /* SPI + SI are the second word of the NSH header */
          nsh0 = vlib_buffer_get_current (b0);
          vnet_buffer(b0)->ip.flow_hash = nsh0[1] % lb0->lb_n_buckets;

          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

static u8 *
format_nsh_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "NSH-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (nsh_load_balance_node) = {
  .function = nsh_load_balance,
  .name = "nsh-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_nsh_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};

static u8 *
format_bier_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "BIER-load-balance: index %d", t->lb_index);
  return s;
}

static uword
bier_load_balance (vlib_main_t * vm,
                   vlib_node_runtime_t * node,
                   vlib_frame_t * frame)
{
    return (load_balance_inline(vm, node, frame, 0));
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (bier_load_balance_node) = {
  .function = bier_load_balance,
  .name = "bier-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_bier_load_balance_trace,
  .sibling_of = "mpls-load-balance",
};

// clang-format on