misc: move to new pool_foreach macros
src/vnet/dpo/load_balance.c
/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vnet/dpo/load_balance.h>
#include <vnet/dpo/load_balance_map.h>
#include <vnet/dpo/drop_dpo.h>
#include <vppinfra/math.h>              /* for fabs */
#include <vnet/adj/adj.h>
#include <vnet/adj/adj_internal.h>
#include <vnet/fib/fib_urpf_list.h>
#include <vnet/bier/bier_fwd.h>
#include <vnet/fib/mpls_fib.h>
#include <vnet/ip/ip4_inlines.h>
#include <vnet/ip/ip6_inlines.h>

/*
 * distribution error tolerance for load-balancing: the average per-bucket
 * error of a normalized weight distribution must be within this fraction
 */
const f64 multipath_next_hop_error_tolerance = 0.1;

static const char *load_balance_attr_names[] = LOAD_BALANCE_ATTR_NAMES;

/**
 * the logger
 */
vlib_log_class_t load_balance_logger;

#define LB_DBG(_lb, _fmt, _args...)                                     \
{                                                                       \
    vlib_log_debug(load_balance_logger,                                 \
                   "lb:[%U]:" _fmt,                                     \
                   format_load_balance, load_balance_get_index(_lb),    \
                   LOAD_BALANCE_FORMAT_NONE,                            \
                   ##_args);                                            \
}

/**
 * Pool of all DPOs. It's not static so the DP can have fast access
 */
load_balance_t *load_balance_pool;

/**
 * The one instance of load-balance main
 */
load_balance_main_t load_balance_main = {
    .lbm_to_counters = {
        .name = "route-to",
        .stat_segment_name = "/net/route/to",
    },
    .lbm_via_counters = {
        .name = "route-via",
        .stat_segment_name = "/net/route/via",
    }
};

f64
load_balance_get_multipath_tolerance (void)
{
    return (multipath_next_hop_error_tolerance);
}

static inline index_t
load_balance_get_index (const load_balance_t *lb)
{
    return (lb - load_balance_pool);
}

static inline dpo_id_t*
load_balance_get_buckets (load_balance_t *lb)
{
    if (LB_HAS_INLINE_BUCKETS(lb))
    {
        return (lb->lb_buckets_inline);
    }
    else
    {
        return (lb->lb_buckets);
    }
}

static load_balance_t *
load_balance_alloc_i (void)
{
    load_balance_t *lb;
    u8 need_barrier_sync = 0;
    vlib_main_t *vm = vlib_get_main();
    ASSERT (vm->thread_index == 0);

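    /*
     * if the pool is about to expand, the realloc may move memory that
     * workers are still reading, so take the worker barrier for the
     * expansion
     */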
    pool_get_aligned_will_expand (load_balance_pool, need_barrier_sync,
                                  CLIB_CACHE_LINE_BYTES);
    if (need_barrier_sync)
        vlib_worker_thread_barrier_sync (vm);

    pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES);
    clib_memset(lb, 0, sizeof(*lb));

    lb->lb_map = INDEX_INVALID;
    lb->lb_urpf = INDEX_INVALID;

    if (need_barrier_sync == 0)
    {
        need_barrier_sync += vlib_validate_combined_counter_will_expand
            (&(load_balance_main.lbm_to_counters),
             load_balance_get_index(lb));
        need_barrier_sync += vlib_validate_combined_counter_will_expand
            (&(load_balance_main.lbm_via_counters),
             load_balance_get_index(lb));
        if (need_barrier_sync)
            vlib_worker_thread_barrier_sync (vm);
    }

    vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters),
                                   load_balance_get_index(lb));
    vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters),
                                   load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters),
                               load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters),
                               load_balance_get_index(lb));

    if (need_barrier_sync)
        vlib_worker_thread_barrier_release (vm);

    return (lb);
}

static u8*
load_balance_format (index_t lbi,
                     load_balance_format_flags_t flags,
                     u32 indent,
                     u8 *s)
{
    vlib_counter_t to, via;
    load_balance_t *lb;
    dpo_id_t *buckets;
    u32 i;

    lb = load_balance_get(lbi);
    vlib_get_combined_counter(&(load_balance_main.lbm_to_counters), lbi, &to);
    vlib_get_combined_counter(&(load_balance_main.lbm_via_counters), lbi, &via);
    buckets = load_balance_get_buckets(lb);

    s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE);
    s = format(s, "[proto:%U ", format_dpo_proto, lb->lb_proto);
    s = format(s, "index:%d buckets:%d ", lbi, lb->lb_n_buckets);
    s = format(s, "uRPF:%d ", lb->lb_urpf);
    if (lb->lb_flags)
    {
        load_balance_attr_t attr;

        s = format(s, "flags:[");

        FOR_EACH_LOAD_BALANCE_ATTR(attr)
        {
            if (lb->lb_flags & (1 << attr))
            {
                s = format (s, "%s", load_balance_attr_names[attr]);
            }
        }
        s = format(s, "] ");
    }
    s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes);
    if (0 != via.packets)
    {
        s = format(s, " via:[%Ld:%Ld]",
                   via.packets, via.bytes);
    }
    s = format(s, "]");

    if (INDEX_INVALID != lb->lb_map)
    {
        s = format(s, "\n%U%U",
                   format_white_space, indent+4,
                   format_load_balance_map, lb->lb_map, indent+4);
    }
    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        s = format(s, "\n%U[%d] %U",
                   format_white_space, indent+2,
                   i,
                   format_dpo_id,
                   &buckets[i], indent+6);
    }
    return (s);
}

u8*
format_load_balance (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    load_balance_format_flags_t flags = va_arg(*args, load_balance_format_flags_t);

    return (load_balance_format(lbi, flags, 0, s));
}

static u8*
format_load_balance_dpo (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    u32 indent = va_arg(*args, u32);

    return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s));
}

flow_hash_config_t
load_balance_get_default_flow_hash (dpo_proto_t lb_proto)
{
    switch (lb_proto)
    {
    case DPO_PROTO_IP4:
    case DPO_PROTO_IP6:
        return (IP_FLOW_HASH_DEFAULT);

    case DPO_PROTO_MPLS:
        return (MPLS_FLOW_HASH_DEFAULT);

    case DPO_PROTO_ETHERNET:
    case DPO_PROTO_BIER:
    case DPO_PROTO_NSH:
        break;
    }

    return (0);
}

static load_balance_t *
load_balance_create_i (u32 num_buckets,
                       dpo_proto_t lb_proto,
                       flow_hash_config_t fhc)
{
    load_balance_t *lb;

    lb = load_balance_alloc_i();
    lb->lb_hash_config = fhc;
    lb->lb_n_buckets = num_buckets;
    lb->lb_n_buckets_minus_1 = num_buckets-1;
    lb->lb_proto = lb_proto;

    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_validate_aligned(lb->lb_buckets,
                             lb->lb_n_buckets - 1,
                             CLIB_CACHE_LINE_BYTES);
    }

    LB_DBG(lb, "create");

    return (lb);
}

index_t
load_balance_create (u32 n_buckets,
                     dpo_proto_t lb_proto,
                     flow_hash_config_t fhc)
{
    return (load_balance_get_index(load_balance_create_i(n_buckets, lb_proto, fhc)));
}

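/*
 * dpo_stack() points the bucket at 'next' and patches the edge from the
 * load-balance graph node to the parent's node, so the DP can jump there
 * directly
 */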
static inline void
load_balance_set_bucket_i (load_balance_t *lb,
                           u32 bucket,
                           dpo_id_t *buckets,
                           const dpo_id_t *next)
{
    dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, &buckets[bucket], next);
}

void
load_balance_set_bucket (index_t lbi,
                         u32 bucket,
                         const dpo_id_t *next)
{
    load_balance_t *lb;
    dpo_id_t *buckets;

    lb = load_balance_get(lbi);
    buckets = load_balance_get_buckets(lb);

    ASSERT(bucket < lb->lb_n_buckets);

    load_balance_set_bucket_i(lb, bucket, buckets, next);
}

int
load_balance_is_drop (const dpo_id_t *dpo)
{
    load_balance_t *lb;

    if (DPO_LOAD_BALANCE != dpo->dpoi_type)
        return (0);

    lb = load_balance_get(dpo->dpoi_index);

    if (1 == lb->lb_n_buckets)
    {
        return (dpo_is_drop(load_balance_get_bucket_i(lb, 0)));
    }
    return (0);
}

u16
load_balance_n_buckets (index_t lbi)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (lb->lb_n_buckets);
}

void
load_balance_set_fib_entry_flags (index_t lbi,
                                  fib_entry_flag_t flags)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);
    lb->lb_fib_entry_flags = flags;
}


void
load_balance_set_urpf (index_t lbi,
                       index_t urpf)
{
    load_balance_t *lb;
    index_t old;

    lb = load_balance_get(lbi);

    /*
     * packets in flight will see this change, but the write is atomic, so :P
     */
    old = lb->lb_urpf;
    lb->lb_urpf = urpf;

    fib_urpf_list_unlock(old);
    fib_urpf_list_lock(urpf);
}

index_t
load_balance_get_urpf (index_t lbi)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (lb->lb_urpf);
}

const dpo_id_t *
load_balance_get_bucket (index_t lbi,
                         u32 bucket)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (load_balance_get_bucket_i(lb, bucket));
}

static int
next_hop_sort_by_weight (const load_balance_path_t * n1,
                         const load_balance_path_t * n2)
{
    return ((int) n1->path_weight - (int) n2->path_weight);
}

/* The given next-hop vector is over-written with a normalized copy: sorted
   by weight, with each weight re-scaled to the number of adjacencies
   (buckets) assigned to that next hop.
   Returns the number of adjacencies in the block. */
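/*
 * e.g. with the default tolerance of 0.1: raw weights {1, 3} sum to 4.
 * A 2-bucket block cannot represent the 1:3 split within an average error
 * of 0.1 per bucket, but a 4-bucket block represents it exactly, so
 * n_adj = 4 is returned.
 */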
u32
ip_multipath_normalize_next_hops (const load_balance_path_t * raw_next_hops,
                                  load_balance_path_t ** normalized_next_hops,
                                  u32 *sum_weight_in,
                                  f64 multipath_next_hop_error_tolerance)
{
    load_balance_path_t * nhs;
    uword n_nhs, n_adj, n_adj_left, i, sum_weight;
    f64 norm, error;

    n_nhs = vec_len (raw_next_hops);
    ASSERT (n_nhs > 0);
    if (n_nhs == 0)
        return 0;

    /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */
    nhs = *normalized_next_hops;
    vec_validate (nhs, 2*n_nhs - 1);

    /* Fast path: 1 next hop in block. */
    n_adj = n_nhs;
    if (n_nhs == 1)
    {
        nhs[0] = raw_next_hops[0];
        nhs[0].path_weight = 1;
        _vec_len (nhs) = 1;
        sum_weight = 1;
        goto done;
    }

    else if (n_nhs == 2)
    {
        int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;

        /* Fast sort. */
        nhs[0] = raw_next_hops[cmp];
        nhs[1] = raw_next_hops[cmp ^ 1];

        /* Fast path: equal cost multipath with 2 next hops. */
        if (nhs[0].path_weight == nhs[1].path_weight)
        {
            nhs[0].path_weight = nhs[1].path_weight = 1;
            _vec_len (nhs) = 2;
            sum_weight = 2;
            goto done;
        }
    }
    else
    {
        clib_memcpy_fast (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
        qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
    }

    /* Find total weight to normalize weights. */
    sum_weight = 0;
    for (i = 0; i < n_nhs; i++)
        sum_weight += nhs[i].path_weight;

    /* In the unlikely case that all weights are given as 0, set them all to 1. */
    if (sum_weight == 0)
    {
        for (i = 0; i < n_nhs; i++)
            nhs[i].path_weight = 1;
        sum_weight = n_nhs;
    }

    /* Save copies of all next hop weights to avoid being overwritten in loop below. */
    for (i = 0; i < n_nhs; i++)
        nhs[n_nhs + i].path_weight = nhs[i].path_weight;

    /* Try larger and larger power of 2 sized adjacency blocks until we
       find one where traffic flows to within the given error tolerance of
       the specified weights. */
    for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
    {
        error = 0;

        norm = n_adj / ((f64) sum_weight);
        n_adj_left = n_adj;
        for (i = 0; i < n_nhs; i++)
        {
            f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */
            word n = flt_round_nearest (nf);

            n = n > n_adj_left ? n_adj_left : n;
            n_adj_left -= n;
            error += fabs (nf - n);
            nhs[i].path_weight = n;

            if (0 == nhs[i].path_weight)
            {
                /*
                 * this happens when the weight skew is high (norm is small)
                 * and nf rounds down to n == 0. Without this correction the
                 * path with a low weight would have no representation in
                 * the load-balance - don't want that. If the weight skew is
                 * high, the load-balance gets many buckets to allow for it.
                 * Pays ya money, takes ya choice.
                 */
                error = n_adj;
                break;
            }
        }

        nhs[0].path_weight += n_adj_left;

        /* Average error per adjacency within tolerance for this size adjacency block? */
        if (error <= multipath_next_hop_error_tolerance*n_adj)
        {
            /* Truncate any next hops with zero weight. */
            _vec_len (nhs) = i;
            break;
        }
    }

done:
    /* Save vector for next call. */
    *normalized_next_hops = nhs;
    *sum_weight_in = sum_weight;
    return n_adj;
}

static load_balance_path_t *
load_balance_multipath_next_hop_fixup (const load_balance_path_t *nhs,
                                       dpo_proto_t drop_proto)
{
    if (0 == vec_len(nhs))
    {
        load_balance_path_t *new_nhs = NULL, *nh;

        /*
         * we need something for the load-balance. so use the drop
         */
        vec_add2(new_nhs, nh, 1);

        nh->path_weight = 1;
        dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto));

        return (new_nhs);
    }

    return (NULL);
}

/*
 * Fill in adjacencies in block based on corresponding
 * next hop adjacencies.
 */
static void
load_balance_fill_buckets_norm (load_balance_t *lb,
                                load_balance_path_t *nhs,
                                dpo_id_t *buckets,
                                u32 n_buckets)
{
    load_balance_path_t *nh;
    u16 ii, bucket;

    bucket = 0;

    /*
     * the next-hops have normalised weights. that means their sum is the number
     * of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
        }
    }
}
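
/*
 * as above, but 'sticky': buckets that would have been filled from a path
 * that is down (a drop) are instead back-filled by cycling through the
 * paths that are still forwarding, so flows hashed to surviving paths do
 * not move
 */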
static void
load_balance_fill_buckets_sticky (load_balance_t *lb,
                                  load_balance_path_t *nhs,
                                  dpo_id_t *buckets,
                                  u32 n_buckets)
{
    load_balance_path_t *nh, *fwding_paths;
    u16 ii, bucket, fpath;

    fpath = bucket = 0;
    fwding_paths = NULL;

    vec_foreach (nh, nhs)
    {
        if (!dpo_is_drop(&nh->path_dpo))
        {
            vec_add1(fwding_paths, *nh);
        }
    }
    if (vec_len(fwding_paths) == 0)
        fwding_paths = vec_dup(nhs);

    /*
     * the next-hops have normalised weights. that means their sum is the number
     * of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            if (!dpo_is_drop(&nh->path_dpo))
            {
                load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
            }
            else
            {
                /* fill the bucket from the next forwarding ('up') path */
                load_balance_set_bucket_i(lb, bucket++, buckets, &fwding_paths[fpath].path_dpo);
                fpath = (fpath + 1) % vec_len(fwding_paths);
            }
        }
    }

    vec_free(fwding_paths);
}

static void
load_balance_fill_buckets (load_balance_t *lb,
                           load_balance_path_t *nhs,
                           dpo_id_t *buckets,
                           u32 n_buckets,
                           load_balance_flags_t flags)
{
    if (flags & LOAD_BALANCE_FLAG_STICKY)
    {
        load_balance_fill_buckets_sticky(lb, nhs, buckets, n_buckets);
    }
    else
    {
        load_balance_fill_buckets_norm(lb, nhs, buckets, n_buckets);
    }
}

static inline void
load_balance_set_n_buckets (load_balance_t *lb,
                            u32 n_buckets)
{
    lb->lb_n_buckets = n_buckets;
    lb->lb_n_buckets_minus_1 = n_buckets-1;
}

void
load_balance_multipath_update (const dpo_id_t *dpo,
                               const load_balance_path_t * raw_nhs,
                               load_balance_flags_t flags)
{
    load_balance_path_t *nh, *nhs, *fixed_nhs;
    u32 sum_of_weights, n_buckets, ii;
    index_t lbmi, old_lbmi;
    load_balance_t *lb;
    dpo_id_t *tmp_dpo;

    nhs = NULL;

    ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
    lb = load_balance_get(dpo->dpoi_index);
    lb->lb_flags = flags;
    fixed_nhs = load_balance_multipath_next_hop_fixup(raw_nhs, lb->lb_proto);
    n_buckets =
        ip_multipath_normalize_next_hops((NULL == fixed_nhs ?
                                          raw_nhs :
                                          fixed_nhs),
                                         &nhs,
                                         &sum_of_weights,
                                         multipath_next_hop_error_tolerance);

    ASSERT (n_buckets >= vec_len (raw_nhs));

    /*
     * Save the old load-balance map used, and get a new one if required.
     */
    old_lbmi = lb->lb_map;
    if (flags & LOAD_BALANCE_FLAG_USES_MAP)
    {
        lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs);
    }
    else
    {
        lbmi = INDEX_INVALID;
    }

    if (0 == lb->lb_n_buckets)
    {
        /*
         * first time initialisation. no packets inflight, so we can write
         * at leisure.
         */
        load_balance_set_n_buckets(lb, n_buckets);

        if (!LB_HAS_INLINE_BUCKETS(lb))
            vec_validate_aligned(lb->lb_buckets,
                                 lb->lb_n_buckets - 1,
                                 CLIB_CACHE_LINE_BYTES);

        load_balance_fill_buckets(lb, nhs,
                                  load_balance_get_buckets(lb),
                                  n_buckets, flags);
        lb->lb_map = lbmi;
    }
    else
    {
        /*
         * This is a modification of an existing load-balance.
         * We need to ensure that packets inflight see a consistent state, that
         * is the number of reported buckets the LB has (read from
         * lb_n_buckets_minus_1) is not more than it actually has. So if the
         * number of buckets is increasing, we must update the bucket array first,
         * then the reported number. vice-versa if the number of buckets goes down.
         */
        if (n_buckets == lb->lb_n_buckets)
        {
            /*
             * no change in the number of buckets. we can simply fill what
             * is new over what is old.
             */
            load_balance_fill_buckets(lb, nhs,
                                      load_balance_get_buckets(lb),
                                      n_buckets, flags);
            lb->lb_map = lbmi;
        }
        else if (n_buckets > lb->lb_n_buckets)
        {
            /*
             * we have more buckets. the old load-balance map (if there is one)
             * will remain valid, i.e. mapping to indices within range, so we
             * update it last.
             */
            if (n_buckets > LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new increased number of buckets is crossing the threshold
                 * from the inline storage to out-line. Alloc the outline buckets
                 * first, then fixup the number. then reset the inlines.
                 */
                ASSERT(NULL == lb->lb_buckets);
                vec_validate_aligned(lb->lb_buckets,
                                     n_buckets - 1,
                                     CLIB_CACHE_LINE_BYTES);

                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets,
                                          n_buckets, flags);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);

                CLIB_MEMORY_BARRIER();

                for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++)
                {
                    dpo_reset(&lb->lb_buckets_inline[ii]);
                }
            }
            else
            {
                if (n_buckets <= LB_NUM_INLINE_BUCKETS)
                {
                    /*
                     * we are not crossing the threshold and the buckets stay
                     * inline. we can write the new over the old.
                     */
                    load_balance_fill_buckets(lb, nhs,
                                              load_balance_get_buckets(lb),
                                              n_buckets, flags);
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);
                }
                else
                {
                    /*
                     * we are not crossing the threshold. We need a new bucket array to
                     * hold the increased number of choices.
                     */
                    dpo_id_t *new_buckets, *old_buckets, *tmp_dpo;

                    new_buckets = NULL;
                    old_buckets = load_balance_get_buckets(lb);

                    vec_validate_aligned(new_buckets,
                                         n_buckets - 1,
                                         CLIB_CACHE_LINE_BYTES);

                    load_balance_fill_buckets(lb, nhs, new_buckets,
                                              n_buckets, flags);
                    CLIB_MEMORY_BARRIER();
                    lb->lb_buckets = new_buckets;
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);

                    vec_foreach(tmp_dpo, old_buckets)
                    {
                        dpo_reset(tmp_dpo);
                    }
                    vec_free(old_buckets);
                }
            }

            /*
             * buckets fixed. ready for the MAP update.
             */
            lb->lb_map = lbmi;
        }
        else
        {
            /*
             * bucket size shrinkage.
             * Any map we have will be based on the old
             * larger number of buckets, so will be translating to indices
             * out of range. So the new MAP must be installed first.
             */
            lb->lb_map = lbmi;
            CLIB_MEMORY_BARRIER();

            if (n_buckets <= LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new decreased number of buckets is crossing the threshold
                 * from out-line storage to inline:
                 *   1 - Fill the inline buckets,
                 *   2 - fixup the number (at this point the inline buckets are
                 *       used).
                 *   3 - free the outline buckets
                 */
                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets_inline,
                                          n_buckets, flags);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                vec_foreach(tmp_dpo, lb->lb_buckets)
                {
                    dpo_reset(tmp_dpo);
                }
                vec_free(lb->lb_buckets);
            }
            else
            {
                /*
                 * not crossing the threshold.
                 *  1 - update the number to the smaller size
                 *  2 - write the new buckets
                 *  3 - reset those no longer used.
                 */
                dpo_id_t *buckets;
                u32 old_n_buckets;

                old_n_buckets = lb->lb_n_buckets;
                buckets = load_balance_get_buckets(lb);

                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                load_balance_fill_buckets(lb, nhs, buckets,
                                          n_buckets, flags);

                for (ii = n_buckets; ii < old_n_buckets; ii++)
                {
                    dpo_reset(&buckets[ii]);
                }
            }
        }
    }

    vec_foreach (nh, nhs)
    {
        dpo_reset(&nh->path_dpo);
    }
    vec_free(nhs);
    vec_free(fixed_nhs);

    load_balance_map_unlock(old_lbmi);
}

static void
load_balance_lock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks++;
}

static void
load_balance_destroy (load_balance_t *lb)
{
    dpo_id_t *buckets;
    int i;

    buckets = load_balance_get_buckets(lb);

    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        dpo_reset(&buckets[i]);
    }

    LB_DBG(lb, "destroy");
    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_free(lb->lb_buckets);
    }

    fib_urpf_list_unlock(lb->lb_urpf);
    load_balance_map_unlock(lb->lb_map);

    pool_put(load_balance_pool, lb);
}

static void
load_balance_unlock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks--;

    if (0 == lb->lb_locks)
    {
        load_balance_destroy(lb);
    }
}

static void
load_balance_mem_show (void)
{
    fib_show_memory_usage("load-balance",
                          pool_elts(load_balance_pool),
                          pool_len(load_balance_pool),
                          sizeof(load_balance_t));
    load_balance_map_show_mem();
}

const static dpo_vft_t lb_vft = {
    .dv_lock = load_balance_lock,
    .dv_unlock = load_balance_unlock,
    .dv_format = format_load_balance_dpo,
    .dv_mem_show = load_balance_mem_show,
};

/**
 * @brief The per-protocol VLIB graph nodes that are assigned to a load-balance
 *        object.
 *
 * i.e. these are the graph nodes whose parent object in the DPO graph is a
 * load-balance.
 *
 * We do not list all the load-balance nodes (such as the *-lookup nodes);
 * instead we rely on the correct use of the .sibling_of field when setting
 * up those sibling nodes.
 */
const static char* const load_balance_ip4_nodes[] =
{
    "ip4-load-balance",
    NULL,
};
const static char* const load_balance_ip6_nodes[] =
{
    "ip6-load-balance",
    NULL,
};
const static char* const load_balance_mpls_nodes[] =
{
    "mpls-load-balance",
    NULL,
};
const static char* const load_balance_l2_nodes[] =
{
    "l2-load-balance",
    NULL,
};
const static char* const load_balance_nsh_nodes[] =
{
    "nsh-load-balance",
    NULL
};
const static char* const load_balance_bier_nodes[] =
{
    "bier-load-balance",
    NULL,
};
const static char* const * const load_balance_nodes[DPO_PROTO_NUM] =
{
    [DPO_PROTO_IP4]  = load_balance_ip4_nodes,
    [DPO_PROTO_IP6]  = load_balance_ip6_nodes,
    [DPO_PROTO_MPLS] = load_balance_mpls_nodes,
    [DPO_PROTO_ETHERNET] = load_balance_l2_nodes,
    [DPO_PROTO_NSH] = load_balance_nsh_nodes,
    [DPO_PROTO_BIER] = load_balance_bier_nodes,
};

void
load_balance_module_init (void)
{
    index_t lbi;

    dpo_register(DPO_LOAD_BALANCE, &lb_vft, load_balance_nodes);

    /*
     * Special LB with index zero. we need to define this since the v4 mtrie
     * assumes an index of 0 implies the ply is empty. therefore all 'real'
     * adjs need a non-zero index.
     * This should never be used, but just in case, stack it on a drop.
     */
    lbi = load_balance_create(1, DPO_PROTO_IP4, 0);
    load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4));

    load_balance_logger =
        vlib_log_register_class("dpo", "load-balance");

    load_balance_map_module_init();
}

static clib_error_t *
load_balance_show (vlib_main_t * vm,
                   unformat_input_t * input,
                   vlib_cli_command_t * cmd)
{
    index_t lbi = INDEX_INVALID;

    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
        if (unformat (input, "%d", &lbi))
            ;
        else
            break;
    }

    if (INDEX_INVALID != lbi)
    {
        if (pool_is_free_index(load_balance_pool, lbi))
        {
            vlib_cli_output (vm, "no such load-balance:%d", lbi);
        }
        else
        {
            vlib_cli_output (vm, "%U", format_load_balance, lbi,
                             LOAD_BALANCE_FORMAT_DETAIL);
        }
    }
    else
    {
        load_balance_t *lb;

        pool_foreach (lb, load_balance_pool)
        {
            vlib_cli_output (vm, "%U", format_load_balance,
                             load_balance_get_index(lb),
                             LOAD_BALANCE_FORMAT_NONE);
        }
    }

    return 0;
}

VLIB_CLI_COMMAND (load_balance_show_command, static) = {
    .path = "show load-balance",
    .short_help = "show load-balance [<index>]",
    .function = load_balance_show,
};


always_inline u32
ip_flow_hash (void *data)
{
  ip4_header_t *iph = (ip4_header_t *) data;

  if ((iph->ip_version_and_header_length & 0xF0) == 0x40)
    return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT);
  else
    return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT);
}

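/*
 * load 8 bytes starting at the MAC and mask to the 48 address bits; the
 * header bytes that follow the address make the 2-byte over-read safe
 */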
always_inline u64
mac_to_u64 (u8 * m)
{
  return (*((u64 *) m) & 0xffffffffffff);
}

always_inline u32
l2_flow_hash (vlib_buffer_t * b0)
{
  ethernet_header_t *eh;
  u64 a, b, c;
  uword is_ip, eh_size;
  u16 eh_type;

  eh = vlib_buffer_get_current (b0);
  eh_type = clib_net_to_host_u16 (eh->type);
  eh_size = ethernet_buffer_header_size (b0);

  is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6);

  /* since we have 2 cache lines, use them */
  if (is_ip)
    a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size);
  else
    a = eh->type;

  b = mac_to_u64 ((u8 *) eh->dst_address);
  c = mac_to_u64 ((u8 *) eh->src_address);
  hash_mix64 (a, b, c);

  return (u32) c;
}

typedef struct load_balance_trace_t_
{
    index_t lb_index;
} load_balance_trace_t;

always_inline uword
load_balance_inline (vlib_main_t * vm,
                     vlib_node_runtime_t * node,
                     vlib_frame_t * frame,
                     int is_l2)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          /* lookup dst + src mac */
          lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          if (is_l2)
          {
              vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0);
          }
          else
          {
              /* it's BIER */
              const bier_hdr_t *bh0 = vlib_buffer_get_current(b0);
              vnet_buffer(b0)->ip.flow_hash = bier_compute_flow_hash(bh0);
          }

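          /*
           * bucket counts are kept to powers of 2, so masking the flow
           * hash with n_buckets-1 selects a bucket
           */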
          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

static uword
l2_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
    return (load_balance_inline(vm, node, frame, 1));
}

static u8 *
format_l2_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "L2-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (l2_load_balance_node) = {
  .function = l2_load_balance,
  .name = "l2-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_l2_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};

static uword
nsh_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0, *nsh0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          /* SPI + SI are the second word of the NSH header */
          nsh0 = vlib_buffer_get_current (b0);
          vnet_buffer(b0)->ip.flow_hash = nsh0[1] % lb0->lb_n_buckets;

          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

static u8 *
format_nsh_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "NSH-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (nsh_load_balance_node) = {
  .function = nsh_load_balance,
  .name = "nsh-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_nsh_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};

static u8 *
format_bier_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "BIER-load-balance: index %d", t->lb_index);
  return s;
}

static uword
bier_load_balance (vlib_main_t * vm,
                   vlib_node_runtime_t * node,
                   vlib_frame_t * frame)
{
    return (load_balance_inline(vm, node, frame, 0));
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (bier_load_balance_node) = {
  .function = bier_load_balance,
  .name = "bier-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_bier_load_balance_trace,
  .sibling_of = "mpls-load-balance",