c11 safe string handling support
[vpp.git] / src / vnet / dpo / load_balance.c
1 /*
2  * Copyright (c) 2016 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 #include <vnet/ip/lookup.h>
17 #include <vnet/dpo/load_balance.h>
18 #include <vnet/dpo/load_balance_map.h>
19 #include <vnet/dpo/drop_dpo.h>
20 #include <vppinfra/math.h>              /* for fabs */
21 #include <vnet/adj/adj.h>
22 #include <vnet/adj/adj_internal.h>
23 #include <vnet/fib/fib_urpf_list.h>
24 #include <vnet/bier/bier_fwd.h>
25
/*
 * distribution error tolerance for load-balancing: the normalised
 * bucket distribution may deviate from the requested weights by at
 * most this fraction (see ip_multipath_normalize_next_hops below).
 */
const f64 multipath_next_hop_error_tolerance = 0.1;

/* human readable names for each LOAD_BALANCE_ATTR_* flag bit */
static const char *load_balance_attr_names[] = LOAD_BALANCE_ATTR_NAMES;

/**
 * the logger
 */
vlib_log_class_t load_balance_logger;
37
/**
 * Debug-log helper: logs the load-balance (formatted by index) followed
 * by the caller's printf-style message.
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement and is safe in unbraced if/else bodies (the previous bare
 * { } block plus the caller's trailing ';' would break an if/else).
 */
#define LB_DBG(_lb, _fmt, _args...)                                     \
do {                                                                    \
    vlib_log_debug(load_balance_logger,                                 \
                   "lb:[%U]:" _fmt,                                     \
                   format_load_balance, load_balance_get_index(_lb),    \
                   LOAD_BALANCE_FORMAT_NONE,                            \
                   ##_args);                                            \
} while (0)
46
/**
 * Pool of all DPOs. It's not static so the DP can have fast access
 */
load_balance_t *load_balance_pool;

/**
 * The one instance of load-balance main.
 * Holds the per-route combined (packet+byte) counters, exported to the
 * stats segment under /net/route/{to,via}.
 */
load_balance_main_t load_balance_main = {
    .lbm_to_counters = {
        .name = "route-to",
        .stat_segment_name = "/net/route/to",
    },
    .lbm_via_counters = {
        .name = "route-via",
        .stat_segment_name = "/net/route/via",
    }
};
65
66 f64
67 load_balance_get_multipath_tolerance (void)
68 {
69     return (multipath_next_hop_error_tolerance);
70 }
71
72 static inline index_t
73 load_balance_get_index (const load_balance_t *lb)
74 {
75     return (lb - load_balance_pool);
76 }
77
78 static inline dpo_id_t*
79 load_balance_get_buckets (load_balance_t *lb)
80 {
81     if (LB_HAS_INLINE_BUCKETS(lb))
82     {
83         return (lb->lb_buckets_inline);
84     }
85     else
86     {
87         return (lb->lb_buckets);
88     }
89 }
90
/**
 * Allocate and zero a new load-balance from the pool, and ensure the
 * per-index 'to' and 'via' combined counters exist and are zeroed.
 *
 * @return the new object; its map and uRPF indices are INDEX_INVALID
 *         and no buckets are allocated yet.
 */
static load_balance_t *
load_balance_alloc_i (void)
{
    load_balance_t *lb;

    pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES);
    clib_memset(lb, 0, sizeof(*lb));

    /* no load-balance map and no uRPF list until explicitly set */
    lb->lb_map = INDEX_INVALID;
    lb->lb_urpf = INDEX_INVALID;
    /* grow the counter vectors to cover this index, then reset them */
    vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters),
                                   load_balance_get_index(lb));
    vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters),
                                   load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters),
                               load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters),
                               load_balance_get_index(lb));

    return (lb);
}
112
/**
 * Append a description of the load-balance to string 's': proto,
 * index, bucket count, uRPF index, flags, counters, then the map (if
 * any) and each bucket's DPO on indented lines.
 *
 * @param lbi    index of the load-balance to format
 * @param flags  requested detail level (currently unused in this body)
 * @param indent indentation of the nested map/bucket lines
 * @param s      string vector to append to
 * @return the (possibly reallocated) string vector
 */
static u8*
load_balance_format (index_t lbi,
                     load_balance_format_flags_t flags,
                     u32 indent,
                     u8 *s)
{
    vlib_counter_t to, via;
    load_balance_t *lb;
    dpo_id_t *buckets;
    u32 i;

    lb = load_balance_get(lbi);
    vlib_get_combined_counter(&(load_balance_main.lbm_to_counters), lbi, &to);
    vlib_get_combined_counter(&(load_balance_main.lbm_via_counters), lbi, &via);
    buckets = load_balance_get_buckets(lb);

    s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE);
    s = format(s, "[proto:%U ", format_dpo_proto, lb->lb_proto);
    s = format(s, "index:%d buckets:%d ", lbi, lb->lb_n_buckets);
    s = format(s, "uRPF:%d ", lb->lb_urpf);
    if (lb->lb_flags)
    {
        load_balance_attr_t attr;

        s = format(s, "flags:[");

        /* print the name of each attribute bit that is set */
        FOR_EACH_LOAD_BALANCE_ATTR(attr)
        {
            if (lb->lb_flags & (1 << attr))
            {
                s = format (s, "%s", load_balance_attr_names[attr]);
            }
        }
        s = format(s, "] ");
    }
    s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes);
    /* 'via' counters are only shown when non-zero */
    if (0 != via.packets)
    {
        s = format(s, " via:[%Ld:%Ld]",
                   via.packets, via.bytes);
    }
    s = format(s, "]");

    if (INDEX_INVALID != lb->lb_map)
    {
        s = format(s, "\n%U%U",
                   format_white_space, indent+4,
                   format_load_balance_map, lb->lb_map, indent+4);
    }
    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        s = format(s, "\n%U[%d] %U",
                   format_white_space, indent+2,
                   i,
                   format_dpo_id,
                   &buckets[i], indent+6);
    }
    return (s);
}
172
/**
 * vppinfra format callback for a load-balance.
 * va_args: index_t lbi, load_balance_format_flags_t flags.
 * The va_arg extraction order must match the callers' argument order.
 */
u8*
format_load_balance (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    load_balance_format_flags_t flags = va_arg(*args, load_balance_format_flags_t);

    return (load_balance_format(lbi, flags, 0, s));
}
181
/**
 * Format callback registered in the DPO vft (dv_format).
 * va_args: index_t lbi, u32 indent. Always formats at DETAIL level.
 */
static u8*
format_load_balance_dpo (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    u32 indent = va_arg(*args, u32);

    return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s));
}
190
/**
 * Allocate a load-balance and size it for the requested number of
 * buckets. If the count exceeds the inline storage, a separate
 * cache-line aligned bucket vector is allocated. The buckets
 * themselves are left unset (zeroed by the allocator).
 *
 * @param num_buckets number of buckets (callers use powers of 2 so
 *                    lb_n_buckets_minus_1 can serve as a hash mask)
 * @param lb_proto    payload protocol of the choices
 * @param fhc         flow-hash configuration
 */
static load_balance_t *
load_balance_create_i (u32 num_buckets,
                       dpo_proto_t lb_proto,
                       flow_hash_config_t fhc)
{
    load_balance_t *lb;

    lb = load_balance_alloc_i();
    lb->lb_hash_config = fhc;
    lb->lb_n_buckets = num_buckets;
    lb->lb_n_buckets_minus_1 = num_buckets-1;
    lb->lb_proto = lb_proto;

    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_validate_aligned(lb->lb_buckets,
                             lb->lb_n_buckets - 1,
                             CLIB_CACHE_LINE_BYTES);
    }

    LB_DBG(lb, "create");

    return (lb);
}
215
216 index_t
217 load_balance_create (u32 n_buckets,
218                      dpo_proto_t lb_proto,
219                      flow_hash_config_t fhc)
220 {
221     return (load_balance_get_index(load_balance_create_i(n_buckets, lb_proto, fhc)));
222 }
223
/**
 * Point one bucket of the load-balance at 'next' by stacking the
 * bucket's dpo_id on the new parent.
 */
static inline void
load_balance_set_bucket_i (load_balance_t *lb,
                           u32 bucket,
                           dpo_id_t *buckets,
                           const dpo_id_t *next)
{
    dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, &buckets[bucket], next);
}
232
233 void
234 load_balance_set_bucket (index_t lbi,
235                          u32 bucket,
236                          const dpo_id_t *next)
237 {
238     load_balance_t *lb;
239     dpo_id_t *buckets;
240
241     lb = load_balance_get(lbi);
242     buckets = load_balance_get_buckets(lb);
243
244     ASSERT(bucket < lb->lb_n_buckets);
245
246     load_balance_set_bucket_i(lb, bucket, buckets, next);
247 }
248
249 int
250 load_balance_is_drop (const dpo_id_t *dpo)
251 {
252     load_balance_t *lb;
253
254     if (DPO_LOAD_BALANCE != dpo->dpoi_type)
255         return (0);
256
257     lb = load_balance_get(dpo->dpoi_index);
258
259     if (1 == lb->lb_n_buckets)
260     {
261         return (dpo_is_drop(load_balance_get_bucket_i(lb, 0)));
262     }
263     return (0);
264 }
265
266 u16
267 load_balance_n_buckets (index_t lbi)
268 {
269     load_balance_t *lb;
270
271     lb = load_balance_get(lbi);
272
273     return (lb->lb_n_buckets);
274 }
275
276 void
277 load_balance_set_fib_entry_flags (index_t lbi,
278                                   fib_entry_flag_t flags)
279 {
280     load_balance_t *lb;
281
282     lb = load_balance_get(lbi);
283     lb->lb_fib_entry_flags = flags;
284 }
285
286
/**
 * Swap the load-balance's uRPF list: take a lock on the new list and
 * release the old one. The field itself is written with a single
 * store, so in-flight packets see either the old or the new index.
 */
void
load_balance_set_urpf (index_t lbi,
                       index_t urpf)
{
    load_balance_t *lb;
    index_t old;

    lb = load_balance_get(lbi);

    /*
     * packets in flight we see this change. but it's atomic, so :P
     */
    old = lb->lb_urpf;
    lb->lb_urpf = urpf;

    /* unlock-then-lock is safe even when old == urpf: the caller's
     * reference keeps the list alive across the unlock */
    fib_urpf_list_unlock(old);
    fib_urpf_list_lock(urpf);
}
305
306 index_t
307 load_balance_get_urpf (index_t lbi)
308 {
309     load_balance_t *lb;
310
311     lb = load_balance_get(lbi);
312
313     return (lb->lb_urpf);
314 }
315
316 const dpo_id_t *
317 load_balance_get_bucket (index_t lbi,
318                          u32 bucket)
319 {
320     load_balance_t *lb;
321
322     lb = load_balance_get(lbi);
323
324     return (load_balance_get_bucket_i(lb, bucket));
325 }
326
327 static int
328 next_hop_sort_by_weight (const load_balance_path_t * n1,
329                          const load_balance_path_t * n2)
330 {
331     return ((int) n1->path_weight - (int) n2->path_weight);
332 }
333
/* Given next hop vector is over-written with normalized one with sorted weights and
   with weights corresponding to the number of adjacencies for each next hop.
   Returns number of adjacencies in block. */
/**
 * @param raw_next_hops  the paths as given by the control plane
 * @param normalized_next_hops  in/out: a reusable vector that receives
 *        the sorted, normalised copy (weights become bucket counts)
 * @param sum_weight_in  out: sum of the normalised weights (== the
 *        returned adjacency-block size on the slow path)
 * @param multipath_next_hop_error_tolerance  max acceptable average
 *        distribution error per adjacency
 * @return the power-of-2 adjacency block size (1 or 2 on fast paths)
 */
u32
ip_multipath_normalize_next_hops (const load_balance_path_t * raw_next_hops,
                                  load_balance_path_t ** normalized_next_hops,
                                  u32 *sum_weight_in,
                                  f64 multipath_next_hop_error_tolerance)
{
    load_balance_path_t * nhs;
    uword n_nhs, n_adj, n_adj_left, i, sum_weight;
    f64 norm, error;

    n_nhs = vec_len (raw_next_hops);
    ASSERT (n_nhs > 0);
    if (n_nhs == 0)
        return 0;

    /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */
    nhs = *normalized_next_hops;
    vec_validate (nhs, 2*n_nhs - 1);

    /* Fast path: 1 next hop in block. */
    n_adj = n_nhs;
    if (n_nhs == 1)
    {
        nhs[0] = raw_next_hops[0];
        nhs[0].path_weight = 1;
        _vec_len (nhs) = 1;
        sum_weight = 1;
        goto done;
    }

    else if (n_nhs == 2)
    {
        int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;

        /* Fast sort. */
        nhs[0] = raw_next_hops[cmp];
        nhs[1] = raw_next_hops[cmp ^ 1];

        /* Fast path: equal cost multipath with 2 next hops. */
        if (nhs[0].path_weight == nhs[1].path_weight)
        {
            nhs[0].path_weight = nhs[1].path_weight = 1;
            _vec_len (nhs) = 2;
            sum_weight = 2;
            goto done;
        }
        /* unequal weights: fall through to the general normalisation */
    }
    else
    {
        clib_memcpy (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
        qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
    }

    /* Find total weight to normalize weights. */
    sum_weight = 0;
    for (i = 0; i < n_nhs; i++)
        sum_weight += nhs[i].path_weight;

    /* In the unlikely case that all weights are given as 0, set them all to 1. */
    if (sum_weight == 0)
    {
        for (i = 0; i < n_nhs; i++)
            nhs[i].path_weight = 1;
        sum_weight = n_nhs;
    }

    /* Save copies of all next hop weights to avoid being overwritten in loop below. */
    for (i = 0; i < n_nhs; i++)
        nhs[n_nhs + i].path_weight = nhs[i].path_weight;

    /* Try larger and larger power of 2 sized adjacency blocks until we
       find one where traffic flows to within the given tolerance of the
       specified weights. */
    for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
    {
        error = 0;

        norm = n_adj / ((f64) sum_weight);
        n_adj_left = n_adj;
        for (i = 0; i < n_nhs; i++)
        {
            f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */
            word n = flt_round_nearest (nf);

            n = n > n_adj_left ? n_adj_left : n;
            n_adj_left -= n;
            error += fabs (nf - n);
            nhs[i].path_weight = n;

            if (0 == nhs[i].path_weight)
            {
                /*
                 * when the weight skew is high (norm is small) and n == nf.
                 * without this correction the path with a low weight would have
                 * no representation in the load-balance - don't want that.
                 * If the weight skew is high so the load-balance has many buckets
                 * to allow it. pays ya money takes ya choice.
                 */
                error = n_adj;
                break;
            }
        }

        /* hand any leftover buckets to the first (heaviest) path */
        nhs[0].path_weight += n_adj_left;

        /* Is the average error per adjacency within tolerance for this block size? */
        if (error <= multipath_next_hop_error_tolerance*n_adj)
        {
            /* Truncate any next hops with zero weight. */
            _vec_len (nhs) = i;
            break;
        }
    }

done:
    /* Save vector for next call. */
    *normalized_next_hops = nhs;
    *sum_weight_in = sum_weight;
    return n_adj;
}
456
457 static load_balance_path_t *
458 load_balance_multipath_next_hop_fixup (const load_balance_path_t *nhs,
459                                        dpo_proto_t drop_proto)
460 {
461     if (0 == vec_len(nhs))
462     {
463         load_balance_path_t *new_nhs = NULL, *nh;
464
465         /*
466          * we need something for the load-balance. so use the drop
467          */
468         vec_add2(new_nhs, nh, 1);
469
470         nh->path_weight = 1;
471         dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto));
472
473         return (new_nhs);
474     }
475
476     return (NULL);
477 }
478
/*
 * Fill in adjacencies in block based on corresponding
 * next hop adjacencies.
 */
/**
 * Fill the bucket array in path order: each path occupies a run of
 * 'path_weight' consecutive buckets. Assumes the weights have been
 * normalised so they sum to exactly n_buckets.
 */
static void
load_balance_fill_buckets_norm (load_balance_t *lb,
                                load_balance_path_t *nhs,
                                dpo_id_t *buckets,
                                u32 n_buckets)
{
    load_balance_path_t *nh;
    u16 ii, bucket;

    bucket = 0;

    /*
     * the next-hops have normalised weights. that means their sum is the number
     * of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
        }
    }
}
/**
 * Fill the bucket array preserving each path's bucket positions
 * ("sticky"): a dropped path keeps its slots, but those slots are
 * backfilled round-robin from the still-forwarding paths so flows
 * mapped to healthy paths do not move. If every path is a drop, the
 * original set is used as-is.
 */
static void
load_balance_fill_buckets_sticky (load_balance_t *lb,
                                  load_balance_path_t *nhs,
                                  dpo_id_t *buckets,
                                  u32 n_buckets)
{
    load_balance_path_t *nh, *fwding_paths;
    u16 ii, bucket, fpath;

    fpath = bucket = 0;
    fwding_paths = NULL;

    /* collect the subset of paths that still forward */
    vec_foreach (nh, nhs)
    {
        if (!dpo_is_drop(&nh->path_dpo))
        {
            vec_add1(fwding_paths, *nh);
        }
    }
    /* all paths are drops: fall back to the full set */
    if (vec_len(fwding_paths) == 0)
        fwding_paths = vec_dup(nhs);

    /*
     * the next-hops have normalised weights. that means their sum is the number
     * of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            if (!dpo_is_drop(&nh->path_dpo))
            {
                load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
            }
            else
            {
                /* fill the buckets from the next forwarding path, round-robin */
                load_balance_set_bucket_i(lb, bucket++, buckets, &fwding_paths[fpath].path_dpo);
                fpath = (fpath + 1) % vec_len(fwding_paths);
            }
        }
    }

    vec_free(fwding_paths);
}
553
554 static void
555 load_balance_fill_buckets (load_balance_t *lb,
556                            load_balance_path_t *nhs,
557                            dpo_id_t *buckets,
558                            u32 n_buckets,
559                            load_balance_flags_t flags)
560 {
561     if (flags & LOAD_BALANCE_FLAG_STICKY)
562     {
563         load_balance_fill_buckets_sticky(lb, nhs, buckets, n_buckets);
564     }
565     else
566     {
567         load_balance_fill_buckets_norm(lb, nhs, buckets, n_buckets);
568     }
569 }
570
/**
 * Update both bucket-count fields. Callers bracket this with memory
 * barriers when modifying a live LB, since the data-plane reads
 * lb_n_buckets_minus_1 as the hash mask.
 */
static inline void
load_balance_set_n_buckets (load_balance_t *lb,
                            u32 n_buckets)
{
    lb->lb_n_buckets = n_buckets;
    lb->lb_n_buckets_minus_1 = n_buckets-1;
}
578
/**
 * Rebuild a load-balance from a new set of paths.
 *
 * Normalises the path weights to a power-of-2 bucket count, then
 * installs the new buckets and (optional) load-balance map in an order
 * that keeps the object consistent for packets in flight: the reported
 * bucket count must never exceed the number of valid buckets, and any
 * installed map must never translate to an out-of-range bucket.
 *
 * @param dpo      must identify a DPO_LOAD_BALANCE object
 * @param raw_nhs  the paths (may be empty; a drop path is substituted)
 * @param flags    LOAD_BALANCE_FLAG_* (map usage, stickiness, ...)
 */
void
load_balance_multipath_update (const dpo_id_t *dpo,
                               const load_balance_path_t * raw_nhs,
                               load_balance_flags_t flags)
{
    load_balance_path_t *nh, *nhs, *fixed_nhs;
    u32 sum_of_weights, n_buckets, ii;
    index_t lbmi, old_lbmi;
    load_balance_t *lb;
    dpo_id_t *tmp_dpo;

    nhs = NULL;

    ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
    lb = load_balance_get(dpo->dpoi_index);
    lb->lb_flags = flags;
    /* substitute a drop path if the caller gave us no paths at all */
    fixed_nhs = load_balance_multipath_next_hop_fixup(raw_nhs, lb->lb_proto);
    n_buckets =
        ip_multipath_normalize_next_hops((NULL == fixed_nhs ?
                                          raw_nhs :
                                          fixed_nhs),
                                         &nhs,
                                         &sum_of_weights,
                                         multipath_next_hop_error_tolerance);

    ASSERT (n_buckets >= vec_len (raw_nhs));

    /*
     * Save the old load-balance map used, and get a new one if required.
     */
    old_lbmi = lb->lb_map;
    if (flags & LOAD_BALANCE_FLAG_USES_MAP)
    {
        lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs);
    }
    else
    {
        lbmi = INDEX_INVALID;
    }

    if (0 == lb->lb_n_buckets)
    {
        /*
         * first time initialisation. no packets inflight, so we can write
         * at leisure.
         */
        load_balance_set_n_buckets(lb, n_buckets);

        if (!LB_HAS_INLINE_BUCKETS(lb))
            vec_validate_aligned(lb->lb_buckets,
                                 lb->lb_n_buckets - 1,
                                 CLIB_CACHE_LINE_BYTES);

        load_balance_fill_buckets(lb, nhs,
                                  load_balance_get_buckets(lb),
                                  n_buckets, flags);
        lb->lb_map = lbmi;
    }
    else
    {
        /*
         * This is a modification of an existing load-balance.
         * We need to ensure that packets inflight see a consistent state, that
         * is the number of reported buckets the LB has (read from
         * lb_n_buckets_minus_1) is not more than it actually has. So if the
         * number of buckets is increasing, we must update the bucket array first,
         * then the reported number. vice-versa if the number of buckets goes down.
         */
        if (n_buckets == lb->lb_n_buckets)
        {
            /*
             * no change in the number of buckets. we can simply fill what
             * is new over what is old.
             */
            load_balance_fill_buckets(lb, nhs,
                                      load_balance_get_buckets(lb),
                                      n_buckets, flags);
            lb->lb_map = lbmi;
        }
        else if (n_buckets > lb->lb_n_buckets)
        {
            /*
             * we have more buckets. the old load-balance map (if there is one)
             * will remain valid, i.e. mapping to indices within range, so we
             * update it last.
             */
            if (n_buckets > LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new increased number of buckets is crossing the threshold
                 * from the inline storage to out-line. Alloc the outline buckets
                 * first, then fixup the number. then reset the inlines.
                 */
                ASSERT(NULL == lb->lb_buckets);
                vec_validate_aligned(lb->lb_buckets,
                                     n_buckets - 1,
                                     CLIB_CACHE_LINE_BYTES);

                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets,
                                          n_buckets, flags);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);

                CLIB_MEMORY_BARRIER();

                /* the data-plane now uses the out-of-line array; the
                 * inline buckets can be released */
                for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++)
                {
                    dpo_reset(&lb->lb_buckets_inline[ii]);
                }
            }
            else
            {
                if (n_buckets <= LB_NUM_INLINE_BUCKETS)
                {
                    /*
                     * we are not crossing the threshold and it's still inline buckets.
                     * we can write the new on the old..
                     */
                    load_balance_fill_buckets(lb, nhs,
                                              load_balance_get_buckets(lb),
                                              n_buckets, flags);
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);
                }
                else
                {
                    /*
                     * we are not crossing the threshold: both old and new are
                     * out-of-line. We need a new bucket array to hold the
                     * increased number of choices.
                     */
                    dpo_id_t *new_buckets, *old_buckets, *tmp_dpo;

                    new_buckets = NULL;
                    old_buckets = load_balance_get_buckets(lb);

                    vec_validate_aligned(new_buckets,
                                         n_buckets - 1,
                                         CLIB_CACHE_LINE_BYTES);

                    load_balance_fill_buckets(lb, nhs, new_buckets,
                                              n_buckets, flags);
                    CLIB_MEMORY_BARRIER();
                    /* swap in the new array before advertising the
                     * larger bucket count */
                    lb->lb_buckets = new_buckets;
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);

                    vec_foreach(tmp_dpo, old_buckets)
                    {
                        dpo_reset(tmp_dpo);
                    }
                    vec_free(old_buckets);
                }
            }

            /*
             * buckets fixed. ready for the MAP update.
             */
            lb->lb_map = lbmi;
        }
        else
        {
            /*
             * bucket size shrinkage.
             * Any map we have will be based on the old
             * larger number of buckets, so will be translating to indices
             * out of range. So the new MAP must be installed first.
             */
            lb->lb_map = lbmi;
            CLIB_MEMORY_BARRIER();


            if (n_buckets <= LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new decreased number of buckets is crossing the threshold
                 * from out-line storage to inline:
                 *   1 - Fill the inline buckets,
                 *   2 - fixup the number (and this point the inline buckets are
                 *       used).
                 *   3 - free the outline buckets
                 */
                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets_inline,
                                          n_buckets, flags);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                vec_foreach(tmp_dpo, lb->lb_buckets)
                {
                    dpo_reset(tmp_dpo);
                }
                vec_free(lb->lb_buckets);
            }
            else
            {
                /*
                 * not crossing the threshold.
                 *  1 - update the number to the smaller size
                 *  2 - write the new buckets
                 *  3 - reset those no longer used.
                 */
                dpo_id_t *buckets;
                u32 old_n_buckets;

                old_n_buckets = lb->lb_n_buckets;
                buckets = load_balance_get_buckets(lb);

                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                load_balance_fill_buckets(lb, nhs, buckets,
                                          n_buckets, flags);

                for (ii = n_buckets; ii < old_n_buckets; ii++)
                {
                    dpo_reset(&buckets[ii]);
                }
            }
        }
    }

    /* release our references on the normalised paths and temporaries */
    vec_foreach (nh, nhs)
    {
        dpo_reset(&nh->path_dpo);
    }
    vec_free(nhs);
    vec_free(fixed_nhs);

    load_balance_map_unlock(old_lbmi);
}
813
814 static void
815 load_balance_lock (dpo_id_t *dpo)
816 {
817     load_balance_t *lb;
818
819     lb = load_balance_get(dpo->dpoi_index);
820
821     lb->lb_locks++;
822 }
823
/**
 * Tear down a load-balance whose last lock was released: unstack every
 * bucket, free the out-of-line bucket vector (if any), drop the uRPF
 * list and map references, and return the object to the pool.
 */
static void
load_balance_destroy (load_balance_t *lb)
{
    dpo_id_t *buckets;
    int i;

    buckets = load_balance_get_buckets(lb);

    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        dpo_reset(&buckets[i]);
    }

    LB_DBG(lb, "destroy");
    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_free(lb->lb_buckets);
    }

    fib_urpf_list_unlock(lb->lb_urpf);
    load_balance_map_unlock(lb->lb_map);

    pool_put(load_balance_pool, lb);
}
848
849 static void
850 load_balance_unlock (dpo_id_t *dpo)
851 {
852     load_balance_t *lb;
853
854     lb = load_balance_get(dpo->dpoi_index);
855
856     lb->lb_locks--;
857
858     if (0 == lb->lb_locks)
859     {
860         load_balance_destroy(lb);
861     }
862 }
863
/**
 * DPO vft mem-show callback: report pool usage for load-balances and
 * their maps.
 */
static void
load_balance_mem_show (void)
{
    fib_show_memory_usage("load-balance",
                          pool_elts(load_balance_pool),
                          pool_len(load_balance_pool),
                          sizeof(load_balance_t));
    load_balance_map_show_mem();
}
873
/**
 * The virtual function table registered for DPO_LOAD_BALANCE objects.
 */
const static dpo_vft_t lb_vft = {
    .dv_lock = load_balance_lock,
    .dv_unlock = load_balance_unlock,
    .dv_format = format_load_balance_dpo,
    .dv_mem_show = load_balance_mem_show,
};
880
881 /**
882  * @brief The per-protocol VLIB graph nodes that are assigned to a load-balance
883  *        object.
884  *
885  * this means that these graph nodes are ones from which a load-balance is the
886  * parent object in the DPO-graph.
887  *
888  * We do not list all the load-balance nodes, such as the *-lookup. instead
889  * we are relying on the correct use of the .sibling_of field when setting
890  * up these sibling nodes.
891  */
/* NULL-terminated lists of graph-node names, one list per payload proto */
const static char* const load_balance_ip4_nodes[] =
{
    "ip4-load-balance",
    NULL,
};
const static char* const load_balance_ip6_nodes[] =
{
    "ip6-load-balance",
    NULL,
};
const static char* const load_balance_mpls_nodes[] =
{
    "mpls-load-balance",
    NULL,
};
const static char* const load_balance_l2_nodes[] =
{
    "l2-load-balance",
    NULL,
};
const static char* const load_balance_nsh_nodes[] =
{
    "nsh-load-balance",
    NULL
};
const static char* const load_balance_bier_nodes[] =
{
    "bier-load-balance",
    NULL,
};
/* per-protocol node lists passed to dpo_register() */
const static char* const * const load_balance_nodes[DPO_PROTO_NUM] =
{
    [DPO_PROTO_IP4]  = load_balance_ip4_nodes,
    [DPO_PROTO_IP6]  = load_balance_ip6_nodes,
    [DPO_PROTO_MPLS] = load_balance_mpls_nodes,
    [DPO_PROTO_ETHERNET] = load_balance_l2_nodes,
    [DPO_PROTO_NSH] = load_balance_nsh_nodes,
    [DPO_PROTO_BIER] = load_balance_bier_nodes,
};
931
/**
 * Module initialisation: register the DPO type, burn pool index 0 on a
 * drop-stacked LB, create the log class and init the map module.
 */
void
load_balance_module_init (void)
{
    index_t lbi;

    dpo_register(DPO_LOAD_BALANCE, &lb_vft, load_balance_nodes);

    /*
     * Special LB with index zero. we need to define this since the v4 mtrie
     * assumes an index of 0 implies the ply is empty. therefore all 'real'
     * adjs need a non-zero index.
     * This should never be used, but just in case, stack it on a drop.
     */
    lbi = load_balance_create(1, DPO_PROTO_IP4, 0);
    load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4));

    load_balance_logger =
        vlib_log_register_class("dpo", "load-balance");

    load_balance_map_module_init();
}
953
/**
 * CLI handler for "show load-balance [<index>]": show one LB in detail
 * when an index is given, otherwise a one-line summary of every LB in
 * the pool.
 */
static clib_error_t *
load_balance_show (vlib_main_t * vm,
                   unformat_input_t * input,
                   vlib_cli_command_t * cmd)
{
    index_t lbi = INDEX_INVALID;

    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
        if (unformat (input, "%d", &lbi))
            ;
        else
            break;
    }

    if (INDEX_INVALID != lbi)
    {
        /* guard against a stale/garbage index from the user */
        if (pool_is_free_index(load_balance_pool, lbi))
        {
            vlib_cli_output (vm, "no such load-balance:%d", lbi);
        }
        else
        {
            vlib_cli_output (vm, "%U", format_load_balance, lbi,
                         LOAD_BALANCE_FORMAT_DETAIL);
        }
    }
    else
    {
        load_balance_t *lb;

        pool_foreach(lb, load_balance_pool,
        ({
            vlib_cli_output (vm, "%U", format_load_balance,
                             load_balance_get_index(lb),
                             LOAD_BALANCE_FORMAT_NONE);
        }));
    }

    return 0;
}
995
/* CLI registration for "show load-balance [<index>]" */
VLIB_CLI_COMMAND (load_balance_show_command, static) = {
    .path = "show load-balance",
    .short_help = "show load-balance [<index>]",
    .function = load_balance_show,
};
1001
1002
1003 always_inline u32
1004 ip_flow_hash (void *data)
1005 {
1006   ip4_header_t *iph = (ip4_header_t *) data;
1007
1008   if ((iph->ip_version_and_header_length & 0xF0) == 0x40)
1009     return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT);
1010   else
1011     return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT);
1012 }
1013
1014 always_inline u64
1015 mac_to_u64 (u8 * m)
1016 {
1017   return (*((u64 *) m) & 0xffffffffffff);
1018 }
1019
1020 always_inline u32
1021 l2_flow_hash (vlib_buffer_t * b0)
1022 {
1023   ethernet_header_t *eh;
1024   u64 a, b, c;
1025   uword is_ip, eh_size;
1026   u16 eh_type;
1027
1028   eh = vlib_buffer_get_current (b0);
1029   eh_type = clib_net_to_host_u16 (eh->type);
1030   eh_size = ethernet_buffer_header_size (b0);
1031
1032   is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6);
1033
1034   /* since we have 2 cache lines, use them */
1035   if (is_ip)
1036     a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size);
1037   else
1038     a = eh->type;
1039
1040   b = mac_to_u64 ((u8 *) eh->dst_address);
1041   c = mac_to_u64 ((u8 *) eh->src_address);
1042   hash_mix64 (a, b, c);
1043
1044   return (u32) c;
1045 }
1046
/**
 * Per-packet trace record shared by the l2-, nsh- and bier-load-balance
 * nodes.
 */
typedef struct load_balance_trace_t_
{
    index_t lb_index;   /**< pool index of the load-balance the packet used */
} load_balance_trace_t;
1051
/**
 * Shared dispatch loop for the l2- and bier-load-balance nodes.
 *
 * For each buffer: compute a flow hash (L2 header hash when is_l2 is
 * set, otherwise a BIER header hash), pick a bucket from the
 * load-balance object referenced by ip.adj_index[VLIB_TX], then
 * rewrite adj_index to the chosen bucket's DPO and enqueue the packet
 * to that DPO's next node.
 *
 * @param is_l2 compile-time constant selecting the flow-hash flavour
 * @return number of vectors processed
 */
always_inline uword
load_balance_inline (vlib_main_t * vm,
                     vlib_node_runtime_t * node,
                     vlib_frame_t * frame,
                     int is_l2)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          /* lookup dst + src mac */
          lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          if (is_l2)
          {
              vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0);
          }
          else
          {
              /* it's BIER */
              const bier_hdr_t *bh0 = vlib_buffer_get_current(b0);
              vnet_buffer(b0)->ip.flow_hash = bier_compute_flow_hash(bh0);
          }

          /* masking with lb_n_buckets_minus_1 maps the hash onto a
           * bucket; this relies on the bucket count being a power of 2 */
          dpo0 = load_balance_get_bucket_i(lb0, 
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          /* stack the packet on the chosen bucket's DPO */
          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}
1124
1125 static uword
1126 l2_load_balance (vlib_main_t * vm,
1127                  vlib_node_runtime_t * node,
1128                  vlib_frame_t * frame)
1129 {
1130     return (load_balance_inline(vm, node, frame, 1));
1131 }
1132
1133 static u8 *
1134 format_l2_load_balance_trace (u8 * s, va_list * args)
1135 {
1136   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1137   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1138   load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);
1139
1140   s = format (s, "L2-load-balance: index %d", t->lb_index);
1141   return s;
1142 }
1143
1144 /**
1145  * @brief
1146  */
1147 VLIB_REGISTER_NODE (l2_load_balance_node) = {
1148   .function = l2_load_balance,
1149   .name = "l2-load-balance",
1150   .vector_size = sizeof (u32),
1151
1152   .format_trace = format_l2_load_balance_trace,
1153   .n_next_nodes = 1,
1154   .next_nodes = {
1155       [0] = "error-drop",
1156   },
1157 };
1158
/**
 * Node function for 'nsh-load-balance'.
 *
 * Same shape as load_balance_inline, but the flow hash is derived from
 * the second 32-bit word of the NSH header (SPI + SI) rather than from
 * an L2 or BIER header.
 */
static uword
nsh_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0, *nsh0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          /* SPI + SI are the second word of the NSH header */
          /* NOTE(review): the hash is reduced twice - modulo
           * lb_n_buckets here, then masked with lb_n_buckets_minus_1
           * below. One reduction is redundant; confirm intent. */
          nsh0 = vlib_buffer_get_current (b0);
          vnet_buffer(b0)->ip.flow_hash = nsh0[1] % lb0->lb_n_buckets;

          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          /* stack the packet on the chosen bucket's DPO */
          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}
1222
1223 static u8 *
1224 format_nsh_load_balance_trace (u8 * s, va_list * args)
1225 {
1226   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1227   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1228   load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);
1229
1230   s = format (s, "NSH-load-balance: index %d", t->lb_index);
1231   return s;
1232 }
1233
1234 /**
1235  * @brief
1236  */
1237 VLIB_REGISTER_NODE (nsh_load_balance_node) = {
1238   .function = nsh_load_balance,
1239   .name = "nsh-load-balance",
1240   .vector_size = sizeof (u32),
1241
1242   .format_trace = format_nsh_load_balance_trace,
1243   .n_next_nodes = 1,
1244   .next_nodes = {
1245       [0] = "error-drop",
1246   },
1247 };
1248
1249 static u8 *
1250 format_bier_load_balance_trace (u8 * s, va_list * args)
1251 {
1252   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1253   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1254   load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);
1255
1256   s = format (s, "BIER-load-balance: index %d", t->lb_index);
1257   return s;
1258 }
1259
1260 static uword
1261 bier_load_balance (vlib_main_t * vm,
1262                    vlib_node_runtime_t * node,
1263                    vlib_frame_t * frame)
1264 {
1265     return (load_balance_inline(vm, node, frame, 0));
1266 }
1267
1268 /**
1269  * @brief
1270  */
1271 VLIB_REGISTER_NODE (bier_load_balance_node) = {
1272   .function = bier_load_balance,
1273   .name = "bier-load-balance",
1274   .vector_size = sizeof (u32),
1275
1276   .format_trace = format_bier_load_balance_trace,
1277   .sibling_of = "mpls-load-balance",
1278 };