Sticky load-balance support
vpp.git: src/vnet/dpo/load_balance.c
1 /*
2  * Copyright (c) 2016 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
#include <string.h>                     /* for memcpy */

#include <vnet/ip/lookup.h>
#include <vnet/dpo/load_balance.h>
#include <vnet/dpo/load_balance_map.h>
#include <vnet/dpo/drop_dpo.h>
#include <vppinfra/math.h>              /* for fabs */
#include <vnet/adj/adj.h>
#include <vnet/adj/adj_internal.h>
#include <vnet/fib/fib_urpf_list.h>
#include <vnet/bier/bier_fwd.h>
25
/*
 * distribution error tolerance for load-balancing
 */
const f64 multipath_next_hop_error_tolerance = 0.1;

/* printable names for the load-balance attribute bits, used by the formatter */
static const char *load_balance_attr_names[] = LOAD_BALANCE_ATTR_NAMES;
32
#undef LB_DEBUG
#ifdef LB_DEBUG
/*
 * Debug logging: format the LB object and emit it via clib_warning.
 * Compiled in only when LB_DEBUG is defined above.
 * NOTE(review): load_balance_format() takes (lbi, flags, indent, s) but this
 * call passes only three arguments - verify before enabling LB_DEBUG.
 */
#define LB_DBG(_lb, _fmt, _args...)                                     \
{                                                                       \
    u8* _tmp =NULL;                                                     \
    clib_warning("lb:[%s]:" _fmt,                                       \
                 load_balance_format(load_balance_get_index((_lb)),     \
                                     0, _tmp),                          \
                 ##_args);                                              \
    vec_free(_tmp);                                                     \
}
#else
/* no-op when debug logging is disabled */
#define LB_DBG(_p, _fmt, _args...)
#endif
47
48
/**
 * Pool of all DPOs. It's not static so the DP can have fast access
 */
load_balance_t *load_balance_pool;

/**
 * The one instance of load-balance main
 */
load_balance_main_t load_balance_main = {
    /* combined (packets+bytes) counters, indexed by LB index and
     * exported to the stats segment under the names below */
    .lbm_to_counters = {
        .name = "route-to",
        .stat_segment_name = "/net/route/to",
    },
    .lbm_via_counters = {
        .name = "route-via",
        .stat_segment_name = "/net/route/via",
    }
};
67
68 f64
69 load_balance_get_multipath_tolerance (void)
70 {
71     return (multipath_next_hop_error_tolerance);
72 }
73
74 static inline index_t
75 load_balance_get_index (const load_balance_t *lb)
76 {
77     return (lb - load_balance_pool);
78 }
79
80 static inline dpo_id_t*
81 load_balance_get_buckets (load_balance_t *lb)
82 {
83     if (LB_HAS_INLINE_BUCKETS(lb))
84     {
85         return (lb->lb_buckets_inline);
86     }
87     else
88     {
89         return (lb->lb_buckets);
90     }
91 }
92
93 static load_balance_t *
94 load_balance_alloc_i (void)
95 {
96     load_balance_t *lb;
97
98     pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES);
99     memset(lb, 0, sizeof(*lb));
100
101     lb->lb_map = INDEX_INVALID;
102     lb->lb_urpf = INDEX_INVALID;
103     vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters),
104                                    load_balance_get_index(lb));
105     vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters),
106                                    load_balance_get_index(lb));
107     vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters),
108                                load_balance_get_index(lb));
109     vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters),
110                                load_balance_get_index(lb));
111
112     return (lb);
113 }
114
/**
 * Render a load-balance object into the string 's'.
 *
 * @param lbi    pool index of the LB
 * @param flags  format flags. NOTE(review): 'flags' is not referenced in
 *               this body - bucket detail is always printed; confirm intended.
 * @param indent indentation used for the map and per-bucket lines
 * @param s      format vector to append to; returned (possibly reallocated)
 */
static u8*
load_balance_format (index_t lbi,
                     load_balance_format_flags_t flags,
                     u32 indent,
                     u8 *s)
{
    vlib_counter_t to, via;
    load_balance_t *lb;
    dpo_id_t *buckets;
    u32 i;

    lb = load_balance_get(lbi);
    /* both counter sets are indexed by the LB's pool index */
    vlib_get_combined_counter(&(load_balance_main.lbm_to_counters), lbi, &to);
    vlib_get_combined_counter(&(load_balance_main.lbm_via_counters), lbi, &via);
    buckets = load_balance_get_buckets(lb);

    s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE);
    s = format(s, "[proto:%U ", format_dpo_proto, lb->lb_proto);
    s = format(s, "index:%d buckets:%d ", lbi, lb->lb_n_buckets);
    s = format(s, "uRPF:%d ", lb->lb_urpf);
    if (lb->lb_flags)
    {
        load_balance_attr_t attr;

        /* name each attribute bit that is set */
        s = format(s, "flags:[");

        FOR_EACH_LOAD_BALANCE_ATTR(attr)
        {
            if (lb->lb_flags & (1 << attr))
            {
                s = format (s, "%s", load_balance_attr_names[attr]);
            }
        }
        s = format(s, "] ");
    }
    /* 'via' counters are only shown once traffic has hit them */
    s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes);
    if (0 != via.packets)
    {
        s = format(s, " via:[%Ld:%Ld]",
                   via.packets, via.bytes);
    }
    s = format(s, "]");

    if (INDEX_INVALID != lb->lb_map)
    {
        s = format(s, "\n%U%U",
                   format_white_space, indent+4,
                   format_load_balance_map, lb->lb_map, indent+4);
    }
    /* one continuation line per bucket showing the stacked DPO */
    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        s = format(s, "\n%U[%d] %U",
                   format_white_space, indent+2,
                   i,
                   format_dpo_id,
                   &buckets[i], indent+6);
    }
    return (s);
}
174
175 u8*
176 format_load_balance (u8 * s, va_list * args)
177 {
178     index_t lbi = va_arg(*args, index_t);
179     load_balance_format_flags_t flags = va_arg(*args, load_balance_format_flags_t);
180
181     return (load_balance_format(lbi, flags, 0, s));
182 }
183 static u8*
184 format_load_balance_dpo (u8 * s, va_list * args)
185 {
186     index_t lbi = va_arg(*args, index_t);
187     u32 indent = va_arg(*args, u32);
188
189     return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s));
190 }
191
192
193 static load_balance_t *
194 load_balance_create_i (u32 num_buckets,
195                        dpo_proto_t lb_proto,
196                        flow_hash_config_t fhc)
197 {
198     load_balance_t *lb;
199
200     lb = load_balance_alloc_i();
201     lb->lb_hash_config = fhc;
202     lb->lb_n_buckets = num_buckets;
203     lb->lb_n_buckets_minus_1 = num_buckets-1;
204     lb->lb_proto = lb_proto;
205
206     if (!LB_HAS_INLINE_BUCKETS(lb))
207     {
208         vec_validate_aligned(lb->lb_buckets,
209                              lb->lb_n_buckets - 1,
210                              CLIB_CACHE_LINE_BYTES);
211     }
212
213     LB_DBG(lb, "create");
214
215     return (lb);
216 }
217
218 index_t
219 load_balance_create (u32 n_buckets,
220                      dpo_proto_t lb_proto,
221                      flow_hash_config_t fhc)
222 {
223     return (load_balance_get_index(load_balance_create_i(n_buckets, lb_proto, fhc)));
224 }
225
/**
 * Stack the DPO 'next' into bucket 'bucket' of the LB.
 * 'buckets' is passed in so callers resolve the inline vs out-of-line
 * storage question once, rather than per bucket.
 */
static inline void
load_balance_set_bucket_i (load_balance_t *lb,
                           u32 bucket,
                           dpo_id_t *buckets,
                           const dpo_id_t *next)
{
    dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, &buckets[bucket], next);
}
234
235 void
236 load_balance_set_bucket (index_t lbi,
237                          u32 bucket,
238                          const dpo_id_t *next)
239 {
240     load_balance_t *lb;
241     dpo_id_t *buckets;
242
243     lb = load_balance_get(lbi);
244     buckets = load_balance_get_buckets(lb);
245
246     ASSERT(bucket < lb->lb_n_buckets);
247
248     load_balance_set_bucket_i(lb, bucket, buckets, next);
249 }
250
251 int
252 load_balance_is_drop (const dpo_id_t *dpo)
253 {
254     load_balance_t *lb;
255
256     if (DPO_LOAD_BALANCE != dpo->dpoi_type)
257         return (0);
258
259     lb = load_balance_get(dpo->dpoi_index);
260
261     if (1 == lb->lb_n_buckets)
262     {
263         return (dpo_is_drop(load_balance_get_bucket_i(lb, 0)));
264     }
265     return (0);
266 }
267
268 u16
269 load_balance_n_buckets (index_t lbi)
270 {
271     load_balance_t *lb;
272
273     lb = load_balance_get(lbi);
274
275     return (lb->lb_n_buckets);
276 }
277
278 void
279 load_balance_set_fib_entry_flags (index_t lbi,
280                                   fib_entry_flag_t flags)
281 {
282     load_balance_t *lb;
283
284     lb = load_balance_get(lbi);
285     lb->lb_fib_entry_flags = flags;
286 }
287
288
289 void
290 load_balance_set_urpf (index_t lbi,
291                        index_t urpf)
292 {
293     load_balance_t *lb;
294     index_t old;
295
296     lb = load_balance_get(lbi);
297
298     /*
299      * packets in flight we see this change. but it's atomic, so :P
300      */
301     old = lb->lb_urpf;
302     lb->lb_urpf = urpf;
303
304     fib_urpf_list_unlock(old);
305     fib_urpf_list_lock(urpf);
306 }
307
308 index_t
309 load_balance_get_urpf (index_t lbi)
310 {
311     load_balance_t *lb;
312
313     lb = load_balance_get(lbi);
314
315     return (lb->lb_urpf);
316 }
317
318 const dpo_id_t *
319 load_balance_get_bucket (index_t lbi,
320                          u32 bucket)
321 {
322     load_balance_t *lb;
323
324     lb = load_balance_get(lbi);
325
326     return (load_balance_get_bucket_i(lb, bucket));
327 }
328
/* qsort comparator: ascending by path weight.
 * NOTE(review): relies on weights being small enough that the int
 * subtraction cannot wrap - confirm path_weight's declared range. */
static int
next_hop_sort_by_weight (const load_balance_path_t * n1,
                         const load_balance_path_t * n2)
{
    return ((int) n1->path_weight - (int) n2->path_weight);
}
335
/* Given next hop vector is over-written with normalized one with sorted weights and
   with weights corresponding to the number of adjacencies for each next hop.
   Returns number of adjacencies in block. */
/**
 * Normalise a raw next-hop set into per-bucket weights.
 *
 * @param raw_next_hops        caller's paths (unsorted, arbitrary weights)
 * @param normalized_next_hops in/out reusable vector; on return holds the
 *                             sorted paths with bucket-count weights
 * @param sum_weight_in        out: sum of the (possibly adjusted) weights
 * @param multipath_next_hop_error_tolerance
 *                             acceptable total rounding error, as a fraction
 *                             of the bucket count (shadows the file-scope
 *                             constant of the same name)
 * @return the number of buckets (adjacencies) to allocate
 */
u32
ip_multipath_normalize_next_hops (const load_balance_path_t * raw_next_hops,
                                  load_balance_path_t ** normalized_next_hops,
                                  u32 *sum_weight_in,
                                  f64 multipath_next_hop_error_tolerance)
{
    load_balance_path_t * nhs;
    uword n_nhs, n_adj, n_adj_left, i, sum_weight;
    f64 norm, error;

    n_nhs = vec_len (raw_next_hops);
    ASSERT (n_nhs > 0);
    if (n_nhs == 0)
        return 0;

    /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */
    nhs = *normalized_next_hops;
    vec_validate (nhs, 2*n_nhs - 1);

    /* Fast path: 1 next hop in block. */
    n_adj = n_nhs;
    if (n_nhs == 1)
    {
        nhs[0] = raw_next_hops[0];
        nhs[0].path_weight = 1;
        _vec_len (nhs) = 1;
        sum_weight = 1;
        goto done;
    }

    else if (n_nhs == 2)
    {
        /* cmp==1 when [0] sorts after [1], i.e. pick the lighter path first */
        int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;

        /* Fast sort. */
        nhs[0] = raw_next_hops[cmp];
        nhs[1] = raw_next_hops[cmp ^ 1];

        /* Fast path: equal cost multipath with 2 next hops. */
        if (nhs[0].path_weight == nhs[1].path_weight)
        {
            nhs[0].path_weight = nhs[1].path_weight = 1;
            _vec_len (nhs) = 2;
            sum_weight = 2;
            goto done;
        }
    }
    else
    {
        clib_memcpy (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
        qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
    }

    /* Find total weight to normalize weights. */
    sum_weight = 0;
    for (i = 0; i < n_nhs; i++)
        sum_weight += nhs[i].path_weight;

    /* In the unlikely case that all weights are given as 0, set them all to 1. */
    if (sum_weight == 0)
    {
        for (i = 0; i < n_nhs; i++)
            nhs[i].path_weight = 1;
        sum_weight = n_nhs;
    }

    /* Save copies of all next hop weights to avoid being overwritten in loop below. */
    for (i = 0; i < n_nhs; i++)
        nhs[n_nhs + i].path_weight = nhs[i].path_weight;

    /* Try larger and larger power of 2 sized adjacency blocks until we
       find one where traffic flows to within 1% of specified weights.
       (the actual threshold is the tolerance parameter, not 1%) */
    for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
    {
        error = 0;

        /* scale factor from weight-space into bucket-space */
        norm = n_adj / ((f64) sum_weight);
        n_adj_left = n_adj;
        for (i = 0; i < n_nhs; i++)
        {
            f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */
            word n = flt_round_nearest (nf);

            n = n > n_adj_left ? n_adj_left : n;
            n_adj_left -= n;
            error += fabs (nf - n);
            nhs[i].path_weight = n;

            if (0 == nhs[i].path_weight)
            {
                /*
                 * when the weight skew is high (norm is small) and n == nf.
                 * without this correction the path with a low weight would have
                 * no represenation in the load-balanace - don't want that.
                 * If the weight skew is high so the load-balance has many buckets
                 * to allow it. pays ya money takes ya choice.
                 */
                error = n_adj;
                break;
            }
        }

        /* leftover buckets go to the first (lightest) path.
         * NOTE(review): nhs is sorted ascending - confirm giving the slack
         * to index 0 rather than the heaviest path is intentional */
        nhs[0].path_weight += n_adj_left;

        /* Less than 5% average error per adjacency with this size adjacency block? */
        if (error <= multipath_next_hop_error_tolerance*n_adj)
        {
            /* Truncate any next hops with zero weight. */
            _vec_len (nhs) = i;
            break;
        }
    }

done:
    /* Save vector for next call. */
    *normalized_next_hops = nhs;
    *sum_weight_in = sum_weight;
    return n_adj;
}
458
459 static load_balance_path_t *
460 load_balance_multipath_next_hop_fixup (const load_balance_path_t *nhs,
461                                        dpo_proto_t drop_proto)
462 {
463     if (0 == vec_len(nhs))
464     {
465         load_balance_path_t *new_nhs = NULL, *nh;
466
467         /*
468          * we need something for the load-balance. so use the drop
469          */
470         vec_add2(new_nhs, nh, 1);
471
472         nh->path_weight = 1;
473         dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto));
474
475         return (new_nhs);
476     }
477
478     return (NULL);
479 }
480
481 /*
482  * Fill in adjacencies in block based on corresponding
483  * next hop adjacencies.
484  */
485 static void
486 load_balance_fill_buckets_norm (load_balance_t *lb,
487                                 load_balance_path_t *nhs,
488                                 dpo_id_t *buckets,
489                                 u32 n_buckets)
490 {
491     load_balance_path_t *nh;
492     u16 ii, bucket;
493
494     bucket = 0;
495
496     /*
497      * the next-hops have normalised weights. that means their sum is the number
498      * of buckets we need to fill.
499      */
500     vec_foreach (nh, nhs)
501     {
502         for (ii = 0; ii < nh->path_weight; ii++)
503         {
504             ASSERT(bucket < n_buckets);
505             load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
506         }
507     }
508 }
/**
 * Fill buckets for a 'sticky' load-balance.
 *
 * Buckets whose own path is a drop are instead filled round-robin from
 * the set of still-forwarding paths, so flows hashed to a failed path
 * are redistributed without disturbing the buckets of healthy paths.
 */
static void
load_balance_fill_buckets_sticky (load_balance_t *lb,
                                  load_balance_path_t *nhs,
                                  dpo_id_t *buckets,
                                  u32 n_buckets)
{
    load_balance_path_t *nh, *fwding_paths;
    u16 ii, bucket, fpath;

    fpath = bucket = 0;
    fwding_paths = NULL;

    /* collect the subset of paths that can actually forward */
    vec_foreach (nh, nhs)
    {
        if (!dpo_is_drop(&nh->path_dpo))
        {
            vec_add1(fwding_paths, *nh);
        }
    }
    /* every path is a drop: fall back to using them all as-is */
    if (vec_len(fwding_paths) == 0)
        fwding_paths = vec_dup(nhs);

    /*
     * the next-hops have normalised weights. that means their sum is the number
     * of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            if (!dpo_is_drop(&nh->path_dpo))
            {
                load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
            }
            else
            {
                /* fill the bucket from the next usable forwarding path,
                 * cycling round-robin through them */
                load_balance_set_bucket_i(lb, bucket++, buckets, &fwding_paths[fpath].path_dpo);
                fpath = (fpath + 1) % vec_len(fwding_paths);
            }
        }
    }

    vec_free(fwding_paths);
}
555
556 static void
557 load_balance_fill_buckets (load_balance_t *lb,
558                            load_balance_path_t *nhs,
559                            dpo_id_t *buckets,
560                            u32 n_buckets,
561                            load_balance_flags_t flags)
562 {
563     if (flags & LOAD_BALANCE_FLAG_STICKY)
564     {
565         load_balance_fill_buckets_sticky(lb, nhs, buckets, n_buckets);
566     }
567     else
568     {
569         load_balance_fill_buckets_norm(lb, nhs, buckets, n_buckets);
570     }
571 }
572
/**
 * Set the bucket count and the cached count-minus-1.
 * The minus-1 value is what the DP reads; presumably it is used as a
 * mask for power-of-2 bucket counts - confirm against the DP nodes.
 */
static inline void
load_balance_set_n_buckets (load_balance_t *lb,
                            u32 n_buckets)
{
    lb->lb_n_buckets = n_buckets;
    lb->lb_n_buckets_minus_1 = n_buckets-1;
}
580
/**
 * Update the path-set (and hence the buckets) of an existing LB.
 *
 * Buckets are rewritten make-before-break so that packets in flight
 * always see a consistent object: the advertised bucket count never
 * exceeds the number of populated buckets, and any installed
 * load-balance map only translates to in-range bucket indices.
 *
 * @param dpo     the LB to update (must be DPO_LOAD_BALANCE)
 * @param raw_nhs the caller's paths; if empty a single drop path is used
 * @param flags   STICKY / USES_MAP etc. behaviour flags
 */
void
load_balance_multipath_update (const dpo_id_t *dpo,
                               const load_balance_path_t * raw_nhs,
                               load_balance_flags_t flags)
{
    load_balance_path_t *nh, *nhs, *fixed_nhs;
    u32 sum_of_weights, n_buckets, ii;
    index_t lbmi, old_lbmi;
    load_balance_t *lb;
    dpo_id_t *tmp_dpo;

    nhs = NULL;

    ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
    lb = load_balance_get(dpo->dpoi_index);
    lb->lb_flags = flags;
    /* an empty path-set is substituted with a single drop path */
    fixed_nhs = load_balance_multipath_next_hop_fixup(raw_nhs, lb->lb_proto);
    n_buckets =
        ip_multipath_normalize_next_hops((NULL == fixed_nhs ?
                                          raw_nhs :
                                          fixed_nhs),
                                         &nhs,
                                         &sum_of_weights,
                                         multipath_next_hop_error_tolerance);

    ASSERT (n_buckets >= vec_len (raw_nhs));

    /*
     * Save the old load-balance map used, and get a new one if required.
     */
    old_lbmi = lb->lb_map;
    if (flags & LOAD_BALANCE_FLAG_USES_MAP)
    {
        lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs);
    }
    else
    {
        lbmi = INDEX_INVALID;
    }

    if (0 == lb->lb_n_buckets)
    {
        /*
         * first time initialisation. no packets inflight, so we can write
         * at leisure.
         */
        load_balance_set_n_buckets(lb, n_buckets);

        if (!LB_HAS_INLINE_BUCKETS(lb))
            vec_validate_aligned(lb->lb_buckets,
                                 lb->lb_n_buckets - 1,
                                 CLIB_CACHE_LINE_BYTES);

        load_balance_fill_buckets(lb, nhs,
                                  load_balance_get_buckets(lb),
                                  n_buckets, flags);
        lb->lb_map = lbmi;
    }
    else
    {
        /*
         * This is a modification of an existing load-balance.
         * We need to ensure that packets inflight see a consistent state, that
         * is the number of reported buckets the LB has (read from
         * lb_n_buckets_minus_1) is not more than it actually has. So if the
         * number of buckets is increasing, we must update the bucket array first,
         * then the reported number. vice-versa if the number of buckets goes down.
         */
        if (n_buckets == lb->lb_n_buckets)
        {
            /*
             * no change in the number of buckets. we can simply fill what
             * is new over what is old.
             */
            load_balance_fill_buckets(lb, nhs,
                                      load_balance_get_buckets(lb),
                                      n_buckets, flags);
            lb->lb_map = lbmi;
        }
        else if (n_buckets > lb->lb_n_buckets)
        {
            /*
             * we have more buckets. the old load-balance map (if there is one)
             * will remain valid, i.e. mapping to indices within range, so we
             * update it last.
             */
            if (n_buckets > LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new increased number of buckets is crossing the threshold
                 * from the inline storage to out-line. Alloc the outline buckets
                 * first, then fixup the number. then reset the inlines.
                 */
                ASSERT(NULL == lb->lb_buckets);
                vec_validate_aligned(lb->lb_buckets,
                                     n_buckets - 1,
                                     CLIB_CACHE_LINE_BYTES);

                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets,
                                          n_buckets, flags);
                /* barrier: new buckets must be visible before the new count */
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);

                CLIB_MEMORY_BARRIER();

                /* the DP now uses the out-of-line array; drop the inline refs */
                for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++)
                {
                    dpo_reset(&lb->lb_buckets_inline[ii]);
                }
            }
            else
            {
                if (n_buckets <= LB_NUM_INLINE_BUCKETS)
                {
                    /*
                     * we are not crossing the threshold and it's still inline buckets.
                     * we can write the new on the old..
                     */
                    load_balance_fill_buckets(lb, nhs,
                                              load_balance_get_buckets(lb),
                                              n_buckets, flags);
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);
                }
                else
                {
                    /*
                     * we are not crossing the threshold. We need a new bucket array to
                     * hold the increased number of choices.
                     */
                    dpo_id_t *new_buckets, *old_buckets, *tmp_dpo;

                    new_buckets = NULL;
                    old_buckets = load_balance_get_buckets(lb);

                    vec_validate_aligned(new_buckets,
                                         n_buckets - 1,
                                         CLIB_CACHE_LINE_BYTES);

                    load_balance_fill_buckets(lb, nhs, new_buckets,
                                              n_buckets, flags);
                    CLIB_MEMORY_BARRIER();
                    /* swap in the fully-populated array before growing the count */
                    lb->lb_buckets = new_buckets;
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);

                    vec_foreach(tmp_dpo, old_buckets)
                    {
                        dpo_reset(tmp_dpo);
                    }
                    vec_free(old_buckets);
                }
            }

            /*
             * buckets fixed. ready for the MAP update.
             */
            lb->lb_map = lbmi;
        }
        else
        {
            /*
             * bucket size shrinkage.
             * Any map we have will be based on the old
             * larger number of buckets, so will be translating to indices
             * out of range. So the new MAP must be installed first.
             */
            lb->lb_map = lbmi;
            CLIB_MEMORY_BARRIER();


            if (n_buckets <= LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new decreased number of buckets is crossing the threshold
                 * from out-line storage to inline:
                 *   1 - Fill the inline buckets,
                 *   2 - fixup the number (and this point the inline buckets are
                 *       used).
                 *   3 - free the outline buckets
                 */
                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets_inline,
                                          n_buckets, flags);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                vec_foreach(tmp_dpo, lb->lb_buckets)
                {
                    dpo_reset(tmp_dpo);
                }
                vec_free(lb->lb_buckets);
            }
            else
            {
                /*
                 * not crossing the threshold.
                 *  1 - update the number to the smaller size
                 *  2 - write the new buckets
                 *  3 - reset those no longer used.
                 */
                dpo_id_t *buckets;
                u32 old_n_buckets;

                old_n_buckets = lb->lb_n_buckets;
                buckets = load_balance_get_buckets(lb);

                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                load_balance_fill_buckets(lb, nhs, buckets,
                                          n_buckets, flags);

                /* release the references held by the now-unused tail */
                for (ii = n_buckets; ii < old_n_buckets; ii++)
                {
                    dpo_reset(&buckets[ii]);
                }
            }
        }
    }

    /* the LB's buckets hold their own references; drop the working copies */
    vec_foreach (nh, nhs)
    {
        dpo_reset(&nh->path_dpo);
    }
    vec_free(nhs);
    vec_free(fixed_nhs);

    load_balance_map_unlock(old_lbmi);
}
815
816 static void
817 load_balance_lock (dpo_id_t *dpo)
818 {
819     load_balance_t *lb;
820
821     lb = load_balance_get(dpo->dpoi_index);
822
823     lb->lb_locks++;
824 }
825
826 static void
827 load_balance_destroy (load_balance_t *lb)
828 {
829     dpo_id_t *buckets;
830     int i;
831
832     buckets = load_balance_get_buckets(lb);
833
834     for (i = 0; i < lb->lb_n_buckets; i++)
835     {
836         dpo_reset(&buckets[i]);
837     }
838
839     LB_DBG(lb, "destroy");
840     if (!LB_HAS_INLINE_BUCKETS(lb))
841     {
842         vec_free(lb->lb_buckets);
843     }
844
845     fib_urpf_list_unlock(lb->lb_urpf);
846     load_balance_map_unlock(lb->lb_map);
847
848     pool_put(load_balance_pool, lb);
849 }
850
851 static void
852 load_balance_unlock (dpo_id_t *dpo)
853 {
854     load_balance_t *lb;
855
856     lb = load_balance_get(dpo->dpoi_index);
857
858     lb->lb_locks--;
859
860     if (0 == lb->lb_locks)
861     {
862         load_balance_destroy(lb);
863     }
864 }
865
/* DPO vft: report pool memory usage (and the maps') for show/debug CLI */
static void
load_balance_mem_show (void)
{
    fib_show_memory_usage("load-balance",
                          pool_elts(load_balance_pool),
                          pool_len(load_balance_pool),
                          sizeof(load_balance_t));
    load_balance_map_show_mem();
}
875
/**
 * Virtual function table registered (below, in module init) with the
 * DPO infra for DPO_LOAD_BALANCE objects
 */
const static dpo_vft_t lb_vft = {
    .dv_lock = load_balance_lock,
    .dv_unlock = load_balance_unlock,
    .dv_format = format_load_balance_dpo,
    .dv_mem_show = load_balance_mem_show,
};
882
883 /**
884  * @brief The per-protocol VLIB graph nodes that are assigned to a load-balance
885  *        object.
886  *
887  * this means that these graph nodes are ones from which a load-balance is the
888  * parent object in the DPO-graph.
889  *
890  * We do not list all the load-balance nodes, such as the *-lookup. instead
891  * we are relying on the correct use of the .sibling_of field when setting
892  * up these sibling nodes.
893  */
/* NULL-terminated graph-node name lists, one per supported protocol */
const static char* const load_balance_ip4_nodes[] =
{
    "ip4-load-balance",
    NULL,
};
const static char* const load_balance_ip6_nodes[] =
{
    "ip6-load-balance",
    NULL,
};
const static char* const load_balance_mpls_nodes[] =
{
    "mpls-load-balance",
    NULL,
};
const static char* const load_balance_l2_nodes[] =
{
    "l2-load-balance",
    NULL,
};
const static char* const load_balance_nsh_nodes[] =
{
    "nsh-load-balance",
    NULL
};
const static char* const load_balance_bier_nodes[] =
{
    "bier-load-balance",
    NULL,
};
/* table of the above lists, indexed by dpo_proto_t; passed to dpo_register */
const static char* const * const load_balance_nodes[DPO_PROTO_NUM] =
{
    [DPO_PROTO_IP4]  = load_balance_ip4_nodes,
    [DPO_PROTO_IP6]  = load_balance_ip6_nodes,
    [DPO_PROTO_MPLS] = load_balance_mpls_nodes,
    [DPO_PROTO_ETHERNET] = load_balance_l2_nodes,
    [DPO_PROTO_NSH] = load_balance_nsh_nodes,
    [DPO_PROTO_BIER] = load_balance_bier_nodes,
};
933
/**
 * One-time module initialisation: register the DPO type with its vft
 * and graph nodes, reserve LB index 0, and init the map sub-module.
 */
void
load_balance_module_init (void)
{
    index_t lbi;

    dpo_register(DPO_LOAD_BALANCE, &lb_vft, load_balance_nodes);

    /*
     * Special LB with index zero. we need to define this since the v4 mtrie
     * assumes an index of 0 implies the ply is empty. therefore all 'real'
     * adjs need a non-zero index.
     * This should never be used, but just in case, stack it on a drop.
     */
    lbi = load_balance_create(1, DPO_PROTO_IP4, 0);
    load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4));

    load_balance_map_module_init();
}
952
/**
 * CLI handler for 'show load-balance': with an index argument show that
 * LB in detail, otherwise list every LB in the pool briefly.
 */
static clib_error_t *
load_balance_show (vlib_main_t * vm,
                   unformat_input_t * input,
                   vlib_cli_command_t * cmd)
{
    index_t lbi = INDEX_INVALID;

    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
        if (unformat (input, "%d", &lbi))
            ;
        else
            break;
    }

    if (INDEX_INVALID != lbi)
    {
        /* guard against a stale or out-of-range index from the user */
        if (pool_is_free_index(load_balance_pool, lbi))
        {
            vlib_cli_output (vm, "no such load-balance:%d", lbi);
        }
        else
        {
            vlib_cli_output (vm, "%U", format_load_balance, lbi,
                         LOAD_BALANCE_FORMAT_DETAIL);
        }
    }
    else
    {
        load_balance_t *lb;

        pool_foreach(lb, load_balance_pool,
        ({
            vlib_cli_output (vm, "%U", format_load_balance,
                             load_balance_get_index(lb),
                             LOAD_BALANCE_FORMAT_NONE);
        }));
    }

    return 0;
}
994
/* CLI registration for "show load-balance [<index>]". */
VLIB_CLI_COMMAND (load_balance_show_command, static) = {
    .path = "show load-balance",
    .short_help = "show load-balance [<index>]",
    .function = load_balance_show,
};
1000
1001
1002 always_inline u32
1003 ip_flow_hash (void *data)
1004 {
1005   ip4_header_t *iph = (ip4_header_t *) data;
1006
1007   if ((iph->ip_version_and_header_length & 0xF0) == 0x40)
1008     return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT);
1009   else
1010     return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT);
1011 }
1012
1013 always_inline u64
1014 mac_to_u64 (u8 * m)
1015 {
1016   return (*((u64 *) m) & 0xffffffffffff);
1017 }
1018
/**
 * Compute a flow hash for an ethernet frame: the source and
 * destination MACs are mixed with either the L3 (IPv4/IPv6) flow hash
 * of the payload or, for non-IP frames, the raw ethertype.
 */
always_inline u32
l2_flow_hash (vlib_buffer_t * b0)
{
  ethernet_header_t *eh;
  u64 a, b, c;
  uword is_ip, eh_size;
  u16 eh_type;

  eh = vlib_buffer_get_current (b0);
  eh_type = clib_net_to_host_u16 (eh->type);
  /* eh_size accounts for the full L2 header as recorded on the
   * buffer (e.g. including VLAN tags). */
  eh_size = ethernet_buffer_header_size (b0);

  /* NOTE(review): eh->type is read from the base header; for
   * VLAN-tagged frames this is presumably the TPID, so tagged IP
   * frames would take the non-IP branch — confirm intent. */
  is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6);

  /* since we have 2 cache lines, use them */
  if (is_ip)
    a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size);
  else
    a = eh->type;

  b = mac_to_u64 ((u8 *) eh->dst_address);
  c = mac_to_u64 ((u8 *) eh->src_address);
  /* hash_mix64 mixes a, b and c in place; c carries the result. */
  hash_mix64 (a, b, c);

  return (u32) c;
}
1045
/**
 * Per-packet trace record for the l2/nsh/bier load-balance nodes:
 * the index of the load-balance object the packet hashed through.
 */
typedef struct load_balance_trace_t_
{
    index_t lb_index;
} load_balance_trace_t;
1050
1051 always_inline uword
1052 load_balance_inline (vlib_main_t * vm,
1053                      vlib_node_runtime_t * node,
1054                      vlib_frame_t * frame,
1055                      int is_l2)
1056 {
1057   u32 n_left_from, next_index, *from, *to_next;
1058
1059   from = vlib_frame_vector_args (frame);
1060   n_left_from = frame->n_vectors;
1061
1062   next_index = node->cached_next_index;
1063
1064   while (n_left_from > 0)
1065     {
1066       u32 n_left_to_next;
1067
1068       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1069
1070       while (n_left_from > 0 && n_left_to_next > 0)
1071         {
1072           vlib_buffer_t *b0;
1073           u32 bi0, lbi0, next0;
1074           const dpo_id_t *dpo0;
1075           const load_balance_t *lb0;
1076
1077           bi0 = from[0];
1078           to_next[0] = bi0;
1079           from += 1;
1080           to_next += 1;
1081           n_left_from -= 1;
1082           n_left_to_next -= 1;
1083
1084           b0 = vlib_get_buffer (vm, bi0);
1085
1086           /* lookup dst + src mac */
1087           lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
1088           lb0 = load_balance_get(lbi0);
1089
1090           if (is_l2)
1091           {
1092               vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0);
1093           }
1094           else
1095           {
1096               /* it's BIER */
1097               const bier_hdr_t *bh0 = vlib_buffer_get_current(b0);
1098               vnet_buffer(b0)->ip.flow_hash = bier_compute_flow_hash(bh0);
1099           }
1100
1101           dpo0 = load_balance_get_bucket_i(lb0, 
1102                                            vnet_buffer(b0)->ip.flow_hash &
1103                                            (lb0->lb_n_buckets_minus_1));
1104
1105           next0 = dpo0->dpoi_next_node;
1106           vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
1107
1108           if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
1109             {
1110               load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
1111                                                          sizeof (*tr));
1112               tr->lb_index = lbi0;
1113             }
1114           vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
1115                                            n_left_to_next, bi0, next0);
1116         }
1117
1118       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1119     }
1120
1121   return frame->n_vectors;
1122 }
1123
1124 static uword
1125 l2_load_balance (vlib_main_t * vm,
1126                  vlib_node_runtime_t * node,
1127                  vlib_frame_t * frame)
1128 {
1129     return (load_balance_inline(vm, node, frame, 1));
1130 }
1131
1132 static u8 *
1133 format_l2_load_balance_trace (u8 * s, va_list * args)
1134 {
1135   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1136   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1137   load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);
1138
1139   s = format (s, "L2-load-balance: index %d", t->lb_index);
1140   return s;
1141 }
1142
/**
 * @brief Graph-node registration for l2-load-balance: demuxes
 * ethernet frames through a load-balance object using a MAC-based
 * flow hash. Next nodes come from the children stacked on the
 * chosen DPO; slot 0 is the error-drop fallback.
 */
VLIB_REGISTER_NODE (l2_load_balance_node) = {
  .function = l2_load_balance,
  .name = "l2-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_l2_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};
1157
/**
 * nsh-load-balance node function: for each packet, derive a flow hash
 * from the second 32-bit word of the NSH header (SPI + SI), pick a
 * bucket from the load-balance indicated by ip.adj_index[VLIB_TX],
 * and enqueue to that bucket's DPO.
 */
static uword
nsh_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0, *nsh0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          /* The caller stacked us on this load-balance. */
          lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          /* SPI + SI are the second word of the NSH header */
          nsh0 = vlib_buffer_get_current (b0);
          /* NOTE(review): the hash is reduced modulo lb_n_buckets here
           * and then masked with lb_n_buckets_minus_1 below — the
           * double reduction is redundant when n_buckets is a power of
           * two; confirm intent. */
          vnet_buffer(b0)->ip.flow_hash = nsh0[1] % lb0->lb_n_buckets;

          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          /* Stack the packet on the chosen bucket's DPO. */
          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}
1221
1222 static u8 *
1223 format_nsh_load_balance_trace (u8 * s, va_list * args)
1224 {
1225   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1226   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1227   load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);
1228
1229   s = format (s, "NSH-load-balance: index %d", t->lb_index);
1230   return s;
1231 }
1232
/**
 * @brief Graph-node registration for nsh-load-balance: demuxes NSH
 * packets through a load-balance object, hashing on the SPI+SI word.
 * Next nodes come from the children stacked on the chosen DPO; slot 0
 * is the error-drop fallback.
 */
VLIB_REGISTER_NODE (nsh_load_balance_node) = {
  .function = nsh_load_balance,
  .name = "nsh-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_nsh_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};
1247
1248 static u8 *
1249 format_bier_load_balance_trace (u8 * s, va_list * args)
1250 {
1251   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1252   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1253   load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);
1254
1255   s = format (s, "BIER-load-balance: index %d", t->lb_index);
1256   return s;
1257 }
1258
1259 static uword
1260 bier_load_balance (vlib_main_t * vm,
1261                    vlib_node_runtime_t * node,
1262                    vlib_frame_t * frame)
1263 {
1264     return (load_balance_inline(vm, node, frame, 0));
1265 }
1266
/**
 * @brief Graph-node registration for bier-load-balance: demuxes BIER
 * packets through a load-balance object using the BIER entropy hash.
 * Inherits its next-node arcs from mpls-load-balance via sibling_of,
 * so no explicit next_nodes table is needed.
 */
VLIB_REGISTER_NODE (bier_load_balance_node) = {
  .function = bier_load_balance,
  .name = "bier-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_bier_load_balance_trace,
  .sibling_of = "mpls-load-balance",
};