/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vnet/ip/lookup.h>
#include <vnet/dpo/load_balance.h>
#include <vnet/dpo/load_balance_map.h>
#include <vnet/dpo/drop_dpo.h>
#include <vppinfra/math.h>              /* for fabs */
#include <vnet/adj/adj.h>
#include <vnet/adj/adj_internal.h>
#include <vnet/fib/fib_urpf_list.h>
#include <vnet/bier/bier_fwd.h>
#include <vnet/fib/mpls_fib.h>

/*
 * distribution error tolerance for load-balancing
 */
const f64 multipath_next_hop_error_tolerance = 0.1;

static const char *load_balance_attr_names[] = LOAD_BALANCE_ATTR_NAMES;

/**
 * the logger
 */
vlib_log_class_t load_balance_logger;

#define LB_DBG(_lb, _fmt, _args...)                                     \
{                                                                       \
    vlib_log_debug(load_balance_logger,                                 \
                   "lb:[%U]:" _fmt,                                     \
                   format_load_balance, load_balance_get_index(_lb),    \
                   LOAD_BALANCE_FORMAT_NONE,                            \
                   ##_args);                                            \
}

/**
 * Pool of all load-balance DPOs. It's not static so the data-plane can
 * have fast access.
 */
load_balance_t *load_balance_pool;

/**
 * The one instance of load-balance main
 */
load_balance_main_t load_balance_main = {
    .lbm_to_counters = {
        .name = "route-to",
        .stat_segment_name = "/net/route/to",
    },
    .lbm_via_counters = {
        .name = "route-via",
        .stat_segment_name = "/net/route/via",
    }
};

f64
load_balance_get_multipath_tolerance (void)
{
    return (multipath_next_hop_error_tolerance);
}

static inline index_t
load_balance_get_index (const load_balance_t *lb)
{
    return (lb - load_balance_pool);
}

static inline dpo_id_t*
load_balance_get_buckets (load_balance_t *lb)
{
    if (LB_HAS_INLINE_BUCKETS(lb))
    {
        return (lb->lb_buckets_inline);
    }
    else
    {
        return (lb->lb_buckets);
    }
}
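
/*
 * A note on bucket storage (an assumption drawn from the accessor above and
 * the create/resize code below, not a definitive statement of the macro's
 * definition): LB_HAS_INLINE_BUCKETS presumably reduces to comparing
 * lb_n_buckets against LB_NUM_INLINE_BUCKETS. Small load-balances keep their
 * buckets inside the object's own cache lines; larger ones spill to a
 * separately allocated, cache-line aligned vector. A minimal sketch:
 */
#if 0 /* example only, not part of the build */
static int
example_lb_buckets_are_inline (const load_balance_t *lb)
{
    /* assumption: the inline array holds LB_NUM_INLINE_BUCKETS entries */
    return (lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS);
}
#endif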

static load_balance_t *
load_balance_alloc_i (void)
{
    load_balance_t *lb;

    pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES);
    clib_memset(lb, 0, sizeof(*lb));

    lb->lb_map = INDEX_INVALID;
    lb->lb_urpf = INDEX_INVALID;
    vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters),
                                   load_balance_get_index(lb));
    vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters),
                                   load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters),
                               load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters),
                               load_balance_get_index(lb));

    return (lb);
}

static u8*
load_balance_format (index_t lbi,
                     load_balance_format_flags_t flags,
                     u32 indent,
                     u8 *s)
{
    vlib_counter_t to, via;
    load_balance_t *lb;
    dpo_id_t *buckets;
    u32 i;

    lb = load_balance_get(lbi);
    vlib_get_combined_counter(&(load_balance_main.lbm_to_counters), lbi, &to);
    vlib_get_combined_counter(&(load_balance_main.lbm_via_counters), lbi, &via);
    buckets = load_balance_get_buckets(lb);

    s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE);
    s = format(s, "[proto:%U ", format_dpo_proto, lb->lb_proto);
    s = format(s, "index:%d buckets:%d ", lbi, lb->lb_n_buckets);
    s = format(s, "uRPF:%d ", lb->lb_urpf);
    if (lb->lb_flags)
    {
        load_balance_attr_t attr;

        s = format(s, "flags:[");

        FOR_EACH_LOAD_BALANCE_ATTR(attr)
        {
            if (lb->lb_flags & (1 << attr))
            {
                s = format (s, "%s", load_balance_attr_names[attr]);
            }
        }
        s = format(s, "] ");
    }
    s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes);
    if (0 != via.packets)
    {
        s = format(s, " via:[%Ld:%Ld]",
                   via.packets, via.bytes);
    }
    s = format(s, "]");

    if (INDEX_INVALID != lb->lb_map)
    {
        s = format(s, "\n%U%U",
                   format_white_space, indent+4,
                   format_load_balance_map, lb->lb_map, indent+4);
    }
    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        s = format(s, "\n%U[%d] %U",
                   format_white_space, indent+2,
                   i,
                   format_dpo_id,
                   &buckets[i], indent+6);
    }
    return (s);
}

u8*
format_load_balance (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    load_balance_format_flags_t flags = va_arg(*args, load_balance_format_flags_t);

    return (load_balance_format(lbi, flags, 0, s));
}

static u8*
format_load_balance_dpo (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    u32 indent = va_arg(*args, u32);

    return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s));
}

flow_hash_config_t
load_balance_get_default_flow_hash (dpo_proto_t lb_proto)
{
    switch (lb_proto)
    {
    case DPO_PROTO_IP4:
    case DPO_PROTO_IP6:
        return (IP_FLOW_HASH_DEFAULT);

    case DPO_PROTO_MPLS:
        return (MPLS_FLOW_HASH_DEFAULT);

    case DPO_PROTO_ETHERNET:
    case DPO_PROTO_BIER:
    case DPO_PROTO_NSH:
        break;
    }

    return (0);
}
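
/*
 * Usage sketch (illustrative, not part of the build): callers creating a
 * load-balance pick the hash-config default for the payload protocol, so
 * IPv4/IPv6 get IP_FLOW_HASH_DEFAULT, MPLS gets MPLS_FLOW_HASH_DEFAULT,
 * and protocols with no flow-hash config (ethernet, BIER, NSH) get 0.
 */
#if 0 /* example only */
static index_t
example_create_with_default_hash (dpo_proto_t proto, u32 n_buckets)
{
    return (load_balance_create(n_buckets, proto,
                                load_balance_get_default_flow_hash(proto)));
}
#endif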

static load_balance_t *
load_balance_create_i (u32 num_buckets,
                       dpo_proto_t lb_proto,
                       flow_hash_config_t fhc)
{
    load_balance_t *lb;

    lb = load_balance_alloc_i();
    lb->lb_hash_config = fhc;
    lb->lb_n_buckets = num_buckets;
    lb->lb_n_buckets_minus_1 = num_buckets-1;
    lb->lb_proto = lb_proto;

    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_validate_aligned(lb->lb_buckets,
                             lb->lb_n_buckets - 1,
                             CLIB_CACHE_LINE_BYTES);
    }

    LB_DBG(lb, "create");

    return (lb);
}

index_t
load_balance_create (u32 n_buckets,
                     dpo_proto_t lb_proto,
                     flow_hash_config_t fhc)
{
    return (load_balance_get_index(load_balance_create_i(n_buckets, lb_proto, fhc)));
}

static inline void
load_balance_set_bucket_i (load_balance_t *lb,
                           u32 bucket,
                           dpo_id_t *buckets,
                           const dpo_id_t *next)
{
    dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, &buckets[bucket], next);
}

void
load_balance_set_bucket (index_t lbi,
                         u32 bucket,
                         const dpo_id_t *next)
{
    load_balance_t *lb;
    dpo_id_t *buckets;

    lb = load_balance_get(lbi);
    buckets = load_balance_get_buckets(lb);

    ASSERT(bucket < lb->lb_n_buckets);

    load_balance_set_bucket_i(lb, bucket, buckets, next);
}
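
/*
 * Usage sketch (illustrative, not part of the build): create a single-bucket
 * load-balance and stack that bucket on a drop, exactly as
 * load_balance_module_init() does below for the reserved index-zero LB.
 */
#if 0 /* example only */
static index_t
example_create_drop_lb (void)
{
    index_t lbi;

    lbi = load_balance_create(1, DPO_PROTO_IP4, 0 /* no flow hash */);
    load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4));

    return (lbi);
}
#endif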

int
load_balance_is_drop (const dpo_id_t *dpo)
{
    load_balance_t *lb;

    if (DPO_LOAD_BALANCE != dpo->dpoi_type)
        return (0);

    lb = load_balance_get(dpo->dpoi_index);

    if (1 == lb->lb_n_buckets)
    {
        return (dpo_is_drop(load_balance_get_bucket_i(lb, 0)));
    }
    return (0);
}

u16
load_balance_n_buckets (index_t lbi)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (lb->lb_n_buckets);
}

void
load_balance_set_fib_entry_flags (index_t lbi,
                                  fib_entry_flag_t flags)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);
    lb->lb_fib_entry_flags = flags;
}


void
load_balance_set_urpf (index_t lbi,
                       index_t urpf)
{
    load_balance_t *lb;
    index_t old;

    lb = load_balance_get(lbi);

    /*
     * packets in flight may observe this change, but the index write is
     * atomic, so they read either the old or the new uRPF list.
     */
    old = lb->lb_urpf;
    lb->lb_urpf = urpf;

    fib_urpf_list_unlock(old);
    fib_urpf_list_lock(urpf);
}

index_t
load_balance_get_urpf (index_t lbi)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (lb->lb_urpf);
}

const dpo_id_t *
load_balance_get_bucket (index_t lbi,
                         u32 bucket)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (load_balance_get_bucket_i(lb, bucket));
}

static int
next_hop_sort_by_weight (const load_balance_path_t * n1,
                         const load_balance_path_t * n2)
{
    return ((int) n1->path_weight - (int) n2->path_weight);
}

/* The given next-hop vector is over-written with a normalized one: sorted
   by weight, with each weight re-expressed as the number of adjacencies
   (buckets) that next hop should occupy.
   Returns the number of adjacencies in the block. */
u32
ip_multipath_normalize_next_hops (const load_balance_path_t * raw_next_hops,
                                  load_balance_path_t ** normalized_next_hops,
                                  u32 *sum_weight_in,
                                  f64 multipath_next_hop_error_tolerance)
{
    load_balance_path_t * nhs;
    uword n_nhs, n_adj, n_adj_left, i, sum_weight;
    f64 norm, error;

    n_nhs = vec_len (raw_next_hops);
    ASSERT (n_nhs > 0);
    if (n_nhs == 0)
        return 0;

    /* Allocate enough space for 2 copies; we'll use the second copy to save
       the original weights. */
    nhs = *normalized_next_hops;
    vec_validate (nhs, 2*n_nhs - 1);

    /* Fast path: 1 next hop in block. */
    n_adj = n_nhs;
    if (n_nhs == 1)
    {
        nhs[0] = raw_next_hops[0];
        nhs[0].path_weight = 1;
        _vec_len (nhs) = 1;
        sum_weight = 1;
        goto done;
    }
    else if (n_nhs == 2)
    {
        int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;

        /* Fast sort. */
        nhs[0] = raw_next_hops[cmp];
        nhs[1] = raw_next_hops[cmp ^ 1];

        /* Fast path: equal cost multipath with 2 next hops. */
        if (nhs[0].path_weight == nhs[1].path_weight)
        {
            nhs[0].path_weight = nhs[1].path_weight = 1;
            _vec_len (nhs) = 2;
            sum_weight = 2;
            goto done;
        }
    }
    else
    {
        clib_memcpy_fast (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
        qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
    }

    /* Find total weight to normalize weights. */
    sum_weight = 0;
    for (i = 0; i < n_nhs; i++)
        sum_weight += nhs[i].path_weight;

    /* In the unlikely case that all weights are given as 0, set them all to 1. */
    if (sum_weight == 0)
    {
        for (i = 0; i < n_nhs; i++)
            nhs[i].path_weight = 1;
        sum_weight = n_nhs;
    }

    /* Save copies of all next hop weights to avoid them being overwritten
       in the loop below. */
    for (i = 0; i < n_nhs; i++)
        nhs[n_nhs + i].path_weight = nhs[i].path_weight;

    /* Try larger and larger power-of-2 sized adjacency blocks until we find
       one where the traffic distribution is within the given tolerance of
       the specified weights. */
    for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
    {
        error = 0;

        norm = n_adj / ((f64) sum_weight);
        n_adj_left = n_adj;
        for (i = 0; i < n_nhs; i++)
        {
            f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */
            word n = flt_round_nearest (nf);

            n = n > n_adj_left ? n_adj_left : n;
            n_adj_left -= n;
            error += fabs (nf - n);
            nhs[i].path_weight = n;

            if (0 == nhs[i].path_weight)
            {
                /*
                 * This happens when the weight skew is high (norm is small):
                 * the rounded bucket count for a low-weight path comes out
                 * as zero. Without this correction that path would have no
                 * representation in the load-balance at all - we don't want
                 * that. So force the error up to reject this block size and
                 * retry with more buckets, giving the low-weight path room
                 * for at least one. You pays your money, you takes your
                 * choice.
                 */
                error = n_adj;
                break;
            }
        }

        nhs[0].path_weight += n_adj_left;

        /* Average error per adjacency within the tolerance for this size
           adjacency block? */
        if (error <= multipath_next_hop_error_tolerance*n_adj)
        {
            /* Truncate any next hops with zero weight. */
            _vec_len (nhs) = i;
            break;
        }
    }

done:
    /* Save vector for next call. */
    *normalized_next_hops = nhs;
    *sum_weight_in = sum_weight;
    return n_adj;
}
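
/*
 * Worked example (illustrative): two paths A and B with weights 3 and 1,
 * tolerance 0.1. A block of n_adj = 2 buckets gives A round(1.5) = 2
 * buckets and leaves none for B, which forces error = n_adj and rejects
 * that block size. Doubling to n_adj = 4 gives norm = 1.0, so A gets
 * exactly 3 buckets and B exactly 1, with error 0 <= 0.1 * 4. The call
 * would look like:
 */
#if 0 /* example only */
static void
example_normalize (const load_balance_path_t *raw /* A:3, B:1 */)
{
    load_balance_path_t *nhs = NULL;
    u32 sum_weight, n_buckets;

    n_buckets = ip_multipath_normalize_next_hops(raw, &nhs, &sum_weight,
                                                 multipath_next_hop_error_tolerance);
    /* n_buckets == 4; nhs[0] = {A, weight 3}, nhs[1] = {B, weight 1} */
    vec_free(nhs);
}
#endif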

static load_balance_path_t *
load_balance_multipath_next_hop_fixup (const load_balance_path_t *nhs,
                                       dpo_proto_t drop_proto)
{
    if (0 == vec_len(nhs))
    {
        load_balance_path_t *new_nhs = NULL, *nh;

        /*
         * The load-balance needs at least one path, so use the drop.
         */
        vec_add2(new_nhs, nh, 1);

        nh->path_weight = 1;
        dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto));

        return (new_nhs);
    }

    return (NULL);
}

/*
 * Fill in the adjacencies in the block based on the corresponding
 * next-hop adjacencies.
 */
static void
load_balance_fill_buckets_norm (load_balance_t *lb,
                                load_balance_path_t *nhs,
                                dpo_id_t *buckets,
                                u32 n_buckets)
{
    load_balance_path_t *nh;
    u16 ii, bucket;

    bucket = 0;

    /*
     * the next-hops have normalised weights. that means their sum is the
     * number of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
        }
    }
}
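
/*
 * Worked example (illustrative): with normalized paths {A, weight 3} and
 * {B, weight 1} from the example above, a 4-bucket block is filled as
 * [A, A, A, B]; the data-plane then selects a bucket with
 * flow_hash & (4 - 1).
 */
#if 0 /* example only: conceptual bucket layout */
buckets[0] = A; buckets[1] = A; buckets[2] = A; buckets[3] = B;
#endif
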
static void
load_balance_fill_buckets_sticky (load_balance_t *lb,
                                  load_balance_path_t *nhs,
                                  dpo_id_t *buckets,
                                  u32 n_buckets)
{
    load_balance_path_t *nh, *fwding_paths;
    u16 ii, bucket, fpath;

    fpath = bucket = 0;
    fwding_paths = NULL;

    vec_foreach (nh, nhs)
    {
        if (!dpo_is_drop(&nh->path_dpo))
        {
            vec_add1(fwding_paths, *nh);
        }
    }
    if (vec_len(fwding_paths) == 0)
        fwding_paths = vec_dup(nhs);

    /*
     * the next-hops have normalised weights. that means their sum is the
     * number of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            if (!dpo_is_drop(&nh->path_dpo))
            {
                load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
            }
            else
            {
                /* fill the buckets from the next forwarding path, round-robin */
                load_balance_set_bucket_i(lb, bucket++, buckets, &fwding_paths[fpath].path_dpo);
                fpath = (fpath + 1) % vec_len(fwding_paths);
            }
        }
    }

    vec_free(fwding_paths);
}
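
/*
 * Worked example (illustrative): sticky filling keeps bucket positions
 * stable for flows on surviving paths. With paths {A, 1}, {B, 1}, {C, 1}
 * where B is down (a drop), the normal fill would give [A, drop, C]; the
 * sticky fill instead leaves A's and C's buckets where they were and
 * back-fills B's bucket from the forwarding paths in round-robin order,
 * giving [A, A, C]. Flows hashed to buckets 0 and 2 are undisturbed.
 */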

static void
load_balance_fill_buckets (load_balance_t *lb,
                           load_balance_path_t *nhs,
                           dpo_id_t *buckets,
                           u32 n_buckets,
                           load_balance_flags_t flags)
{
    if (flags & LOAD_BALANCE_FLAG_STICKY)
    {
        load_balance_fill_buckets_sticky(lb, nhs, buckets, n_buckets);
    }
    else
    {
        load_balance_fill_buckets_norm(lb, nhs, buckets, n_buckets);
    }
}

static inline void
load_balance_set_n_buckets (load_balance_t *lb,
                            u32 n_buckets)
{
    lb->lb_n_buckets = n_buckets;
    lb->lb_n_buckets_minus_1 = n_buckets-1;
}

void
load_balance_multipath_update (const dpo_id_t *dpo,
                               const load_balance_path_t * raw_nhs,
                               load_balance_flags_t flags)
{
    load_balance_path_t *nh, *nhs, *fixed_nhs;
    u32 sum_of_weights, n_buckets, ii;
    index_t lbmi, old_lbmi;
    load_balance_t *lb;
    dpo_id_t *tmp_dpo;

    nhs = NULL;

    ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
    lb = load_balance_get(dpo->dpoi_index);
    lb->lb_flags = flags;
    fixed_nhs = load_balance_multipath_next_hop_fixup(raw_nhs, lb->lb_proto);
    n_buckets =
        ip_multipath_normalize_next_hops((NULL == fixed_nhs ?
                                          raw_nhs :
                                          fixed_nhs),
                                         &nhs,
                                         &sum_of_weights,
                                         multipath_next_hop_error_tolerance);

    ASSERT (n_buckets >= vec_len (raw_nhs));

    /*
     * Save the old load-balance map used, and get a new one if required.
     */
    old_lbmi = lb->lb_map;
    if (flags & LOAD_BALANCE_FLAG_USES_MAP)
    {
        lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs);
    }
    else
    {
        lbmi = INDEX_INVALID;
    }

    if (0 == lb->lb_n_buckets)
    {
        /*
         * first time initialisation. no packets inflight, so we can write
         * at leisure.
         */
        load_balance_set_n_buckets(lb, n_buckets);

        if (!LB_HAS_INLINE_BUCKETS(lb))
            vec_validate_aligned(lb->lb_buckets,
                                 lb->lb_n_buckets - 1,
                                 CLIB_CACHE_LINE_BYTES);

        load_balance_fill_buckets(lb, nhs,
                                  load_balance_get_buckets(lb),
                                  n_buckets, flags);
        lb->lb_map = lbmi;
    }
    else
    {
        /*
         * This is a modification of an existing load-balance.
         * We need to ensure that packets in flight see a consistent state;
         * that is, the number of buckets the LB reports (read from
         * lb_n_buckets_minus_1) is never more than it actually has. So if
         * the number of buckets is increasing, we must update the bucket
         * array first, then the reported number; vice-versa if the number
         * of buckets goes down.
         */
        if (n_buckets == lb->lb_n_buckets)
        {
            /*
             * no change in the number of buckets. we can simply fill what
             * is new over what is old.
             */
            load_balance_fill_buckets(lb, nhs,
                                      load_balance_get_buckets(lb),
                                      n_buckets, flags);
            lb->lb_map = lbmi;
        }
        else if (n_buckets > lb->lb_n_buckets)
        {
            /*
             * we have more buckets. the old load-balance map (if there is one)
             * will remain valid, i.e. mapping to indices within range, so we
             * update it last.
             */
            if (n_buckets > LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new increased number of buckets is crossing the
                 * threshold from inline storage to out-of-line. Alloc the
                 * out-of-line buckets first, then fixup the number, then
                 * reset the inlines.
                 */
                ASSERT(NULL == lb->lb_buckets);
                vec_validate_aligned(lb->lb_buckets,
                                     n_buckets - 1,
                                     CLIB_CACHE_LINE_BYTES);

                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets,
                                          n_buckets, flags);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);

                CLIB_MEMORY_BARRIER();

                for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++)
                {
                    dpo_reset(&lb->lb_buckets_inline[ii]);
                }
            }
            else
            {
                if (n_buckets <= LB_NUM_INLINE_BUCKETS)
                {
                    /*
                     * we are not crossing the threshold; the buckets are
                     * still inline. we can write the new over the old.
                     */
                    load_balance_fill_buckets(lb, nhs,
                                              load_balance_get_buckets(lb),
                                              n_buckets, flags);
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);
                }
                else
                {
                    /*
                     * we are not crossing the threshold; the buckets are
                     * already out-of-line, but we need a new, larger array
                     * to hold the increased number of choices.
                     */
                    dpo_id_t *new_buckets, *old_buckets, *tmp_dpo;

                    new_buckets = NULL;
                    old_buckets = load_balance_get_buckets(lb);

                    vec_validate_aligned(new_buckets,
                                         n_buckets - 1,
                                         CLIB_CACHE_LINE_BYTES);

                    load_balance_fill_buckets(lb, nhs, new_buckets,
                                              n_buckets, flags);
                    CLIB_MEMORY_BARRIER();
                    lb->lb_buckets = new_buckets;
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);

                    vec_foreach(tmp_dpo, old_buckets)
                    {
                        dpo_reset(tmp_dpo);
                    }
                    vec_free(old_buckets);
                }
            }

            /*
             * buckets fixed. ready for the MAP update.
             */
            lb->lb_map = lbmi;
        }
        else
        {
            /*
             * bucket size shrinkage.
             * Any map we have will be based on the old, larger number of
             * buckets, so it would translate to indices out of range. So
             * the new MAP must be installed first.
             */
            lb->lb_map = lbmi;
            CLIB_MEMORY_BARRIER();

            if (n_buckets <= LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new decreased number of buckets is crossing the
                 * threshold from out-of-line storage to inline:
                 *   1 - Fill the inline buckets,
                 *   2 - fixup the number (at this point the inline buckets
                 *       are used).
                 *   3 - free the out-of-line buckets
                 */
                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets_inline,
                                          n_buckets, flags);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                vec_foreach(tmp_dpo, lb->lb_buckets)
                {
                    dpo_reset(tmp_dpo);
                }
                vec_free(lb->lb_buckets);
            }
            else
            {
                /*
                 * not crossing the threshold.
                 *  1 - update the number to the smaller size
                 *  2 - write the new buckets
                 *  3 - reset those no longer used.
                 */
                dpo_id_t *buckets;
                u32 old_n_buckets;

                old_n_buckets = lb->lb_n_buckets;
                buckets = load_balance_get_buckets(lb);

                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                load_balance_fill_buckets(lb, nhs, buckets,
                                          n_buckets, flags);

                for (ii = n_buckets; ii < old_n_buckets; ii++)
                {
                    dpo_reset(&buckets[ii]);
                }
            }
        }
    }

    vec_foreach (nh, nhs)
    {
        dpo_reset(&nh->path_dpo);
    }
    vec_free(nhs);
    vec_free(fixed_nhs);

    load_balance_map_unlock(old_lbmi);
}
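
/*
 * Usage sketch (illustrative, not part of the build): updating an existing
 * load-balance with two weighted paths. The via_a/via_b DPOs here are
 * stand-ins; real callers (e.g. the FIB) pass the DPOs contributed by the
 * route's resolved paths.
 */
#if 0 /* example only */
static void
example_update_two_paths (const dpo_id_t *lb_dpo,
                          const dpo_id_t *via_a,
                          const dpo_id_t *via_b)
{
    load_balance_path_t *paths = NULL, *path;

    vec_add2(paths, path, 1);
    path->path_weight = 3;
    dpo_copy(&path->path_dpo, via_a);

    vec_add2(paths, path, 1);
    path->path_weight = 1;
    dpo_copy(&path->path_dpo, via_b);

    /* normalizes 3:1 into a power-of-2 bucket block, as described above */
    load_balance_multipath_update(lb_dpo, paths, LOAD_BALANCE_FLAG_NONE);

    /* the caller still owns its copies of the path DPOs */
    vec_foreach(path, paths)
        dpo_reset(&path->path_dpo);
    vec_free(paths);
}
#endif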

static void
load_balance_lock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks++;
}

static void
load_balance_destroy (load_balance_t *lb)
{
    dpo_id_t *buckets;
    int i;

    buckets = load_balance_get_buckets(lb);

    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        dpo_reset(&buckets[i]);
    }

    LB_DBG(lb, "destroy");
    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_free(lb->lb_buckets);
    }

    fib_urpf_list_unlock(lb->lb_urpf);
    load_balance_map_unlock(lb->lb_map);

    pool_put(load_balance_pool, lb);
}

static void
load_balance_unlock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks--;

    if (0 == lb->lb_locks)
    {
        load_balance_destroy(lb);
    }
}

static void
load_balance_mem_show (void)
{
    fib_show_memory_usage("load-balance",
                          pool_elts(load_balance_pool),
                          pool_len(load_balance_pool),
                          sizeof(load_balance_t));
    load_balance_map_show_mem();
}

const static dpo_vft_t lb_vft = {
    .dv_lock = load_balance_lock,
    .dv_unlock = load_balance_unlock,
    .dv_format = format_load_balance_dpo,
    .dv_mem_show = load_balance_mem_show,
};

/**
 * @brief The per-protocol VLIB graph nodes that are assigned to a
 *        load-balance object.
 *
 * This means that these graph nodes are ones from which a load-balance is
 * the parent object in the DPO-graph.
 *
 * We do not list all the load-balance nodes, such as the *-lookup nodes.
 * Instead we rely on the correct use of the .sibling_of field when setting
 * up these sibling nodes.
 */
const static char* const load_balance_ip4_nodes[] =
{
    "ip4-load-balance",
    NULL,
};
const static char* const load_balance_ip6_nodes[] =
{
    "ip6-load-balance",
    NULL,
};
const static char* const load_balance_mpls_nodes[] =
{
    "mpls-load-balance",
    NULL,
};
const static char* const load_balance_l2_nodes[] =
{
    "l2-load-balance",
    NULL,
};
const static char* const load_balance_nsh_nodes[] =
{
    "nsh-load-balance",
    NULL
};
const static char* const load_balance_bier_nodes[] =
{
    "bier-load-balance",
    NULL,
};
const static char* const * const load_balance_nodes[DPO_PROTO_NUM] =
{
    [DPO_PROTO_IP4]  = load_balance_ip4_nodes,
    [DPO_PROTO_IP6]  = load_balance_ip6_nodes,
    [DPO_PROTO_MPLS] = load_balance_mpls_nodes,
    [DPO_PROTO_ETHERNET] = load_balance_l2_nodes,
    [DPO_PROTO_NSH] = load_balance_nsh_nodes,
    [DPO_PROTO_BIER] = load_balance_bier_nodes,
};

void
load_balance_module_init (void)
{
    index_t lbi;

    dpo_register(DPO_LOAD_BALANCE, &lb_vft, load_balance_nodes);

    /*
     * Special LB with index zero. We need to define this since the v4 mtrie
     * assumes an index of 0 implies the ply is empty; therefore all 'real'
     * adjs need a non-zero index.
     * This should never be used, but just in case, stack it on a drop.
     */
    lbi = load_balance_create(1, DPO_PROTO_IP4, 0);
    load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4));

    load_balance_logger =
        vlib_log_register_class("dpo", "load-balance");

    load_balance_map_module_init();
}

static clib_error_t *
load_balance_show (vlib_main_t * vm,
                   unformat_input_t * input,
                   vlib_cli_command_t * cmd)
{
    index_t lbi = INDEX_INVALID;

    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
        if (unformat (input, "%d", &lbi))
            ;
        else
            break;
    }

    if (INDEX_INVALID != lbi)
    {
        if (pool_is_free_index(load_balance_pool, lbi))
        {
            vlib_cli_output (vm, "no such load-balance:%d", lbi);
        }
        else
        {
            vlib_cli_output (vm, "%U", format_load_balance, lbi,
                             LOAD_BALANCE_FORMAT_DETAIL);
        }
    }
    else
    {
        load_balance_t *lb;

        pool_foreach(lb, load_balance_pool,
        ({
            vlib_cli_output (vm, "%U", format_load_balance,
                             load_balance_get_index(lb),
                             LOAD_BALANCE_FORMAT_NONE);
        }));
    }

    return 0;
}

VLIB_CLI_COMMAND (load_balance_show_command, static) = {
    .path = "show load-balance",
    .short_help = "show load-balance [<index>]",
    .function = load_balance_show,
};
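
/*
 * CLI usage (illustrative): "show load-balance" walks the whole pool and
 * prints a one-line summary per object; "show load-balance <index>" prints
 * the detailed form rendered by load_balance_format() above, including the
 * per-bucket DPOs.
 */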

always_inline u32
ip_flow_hash (void *data)
{
  ip4_header_t *iph = (ip4_header_t *) data;

  if ((iph->ip_version_and_header_length & 0xF0) == 0x40)
    return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT);
  else
    return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT);
}

always_inline u64
mac_to_u64 (u8 * m)
{
  return (*((u64 *) m) & 0xffffffffffff);
}

always_inline u32
l2_flow_hash (vlib_buffer_t * b0)
{
  ethernet_header_t *eh;
  u64 a, b, c;
  uword is_ip, eh_size;
  u16 eh_type;

  eh = vlib_buffer_get_current (b0);
  eh_type = clib_net_to_host_u16 (eh->type);
  eh_size = ethernet_buffer_header_size (b0);

  is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6);

  /* since we have 2 cache lines, use them */
  if (is_ip)
    a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size);
  else
    a = eh->type;

  b = mac_to_u64 ((u8 *) eh->dst_address);
  c = mac_to_u64 ((u8 *) eh->src_address);
  hash_mix64 (a, b, c);

  return (u32) c;
}
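
/*
 * A note on the hash above (illustrative): for IP payloads the Ethernet
 * hash seeds 'a' with the IP flow hash and then mixes in both MACs, so L2
 * ECMP still spreads flows that share a MAC pair. mac_to_u64() reads 8
 * bytes starting at the MAC and masks down to the low 48 bits; the read
 * stays within the Ethernet header, since the two 6-byte addresses and the
 * 2-byte type field follow each other.
 */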

typedef struct load_balance_trace_t_
{
    index_t lb_index;
} load_balance_trace_t;

always_inline uword
load_balance_inline (vlib_main_t * vm,
                     vlib_node_runtime_t * node,
                     vlib_frame_t * frame,
                     int is_l2)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          /* the load-balance index was stored by the previous node */
          lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          if (is_l2)
          {
              vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0);
          }
          else
          {
              /* it's BIER */
              const bier_hdr_t *bh0 = vlib_buffer_get_current(b0);
              vnet_buffer(b0)->ip.flow_hash = bier_compute_flow_hash(bh0);
          }

          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}
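
/*
 * Bucket selection above is a single AND: because
 * ip_multipath_normalize_next_hops() always produces a power-of-2 bucket
 * count, "flow_hash & lb_n_buckets_minus_1" is equivalent to
 * "flow_hash % lb_n_buckets". E.g. with 4 buckets the mask is 0x3, so a
 * hash of 0x1d selects bucket 1.
 */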

static uword
l2_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
    return (load_balance_inline(vm, node, frame, 1));
}

static u8 *
format_l2_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "L2-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief The L2 load-balance node
 */
VLIB_REGISTER_NODE (l2_load_balance_node) = {
  .function = l2_load_balance,
  .name = "l2-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_l2_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};

static uword
nsh_load_balance (vlib_main_t * vm,
                  vlib_node_runtime_t * node,
                  vlib_frame_t * frame)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0, *nsh0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          /* SPI + SI are the second word of the NSH header */
          nsh0 = vlib_buffer_get_current (b0);
          vnet_buffer(b0)->ip.flow_hash = nsh0[1] % lb0->lb_n_buckets;

          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

static u8 *
format_nsh_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "NSH-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief The NSH load-balance node
 */
VLIB_REGISTER_NODE (nsh_load_balance_node) = {
  .function = nsh_load_balance,
  .name = "nsh-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_nsh_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};

static u8 *
format_bier_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "BIER-load-balance: index %d", t->lb_index);
  return s;
}

static uword
bier_load_balance (vlib_main_t * vm,
                   vlib_node_runtime_t * node,
                   vlib_frame_t * frame)
{
    return (load_balance_inline(vm, node, frame, 0));
}

/**
 * @brief The BIER load-balance node
 */
VLIB_REGISTER_NODE (bier_load_balance_node) = {
  .function = bier_load_balance,
  .name = "bier-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_bier_load_balance_trace,
  .sibling_of = "mpls-load-balance",
};