Route counters in the stats segment
src/vnet/dpo/load_balance.c
/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vnet/ip/lookup.h>
#include <vnet/dpo/load_balance.h>
#include <vnet/dpo/load_balance_map.h>
#include <vnet/dpo/drop_dpo.h>
#include <vppinfra/math.h>              /* for fabs */
#include <vnet/adj/adj.h>
#include <vnet/adj/adj_internal.h>
#include <vnet/fib/fib_urpf_list.h>
#include <vnet/bier/bier_fwd.h>

/*
 * distribution error tolerance for load-balancing
 */
const f64 multipath_next_hop_error_tolerance = 0.1;

#undef LB_DEBUG

#ifdef LB_DEBUG
#define LB_DBG(_lb, _fmt, _args...)                                     \
{                                                                       \
    u8 *_tmp = NULL;                                                    \
    clib_warning("lb:[%s]:" _fmt,                                       \
                 load_balance_format(load_balance_get_index((_lb)),     \
                                     0, 0, _tmp),                       \
                 ##_args);                                              \
    vec_free(_tmp);                                                     \
}
#else
#define LB_DBG(_p, _fmt, _args...)
#endif


/**
 * Pool of all load-balance DPOs. It is not static so the
 * data-plane can have fast access to it.
 */
load_balance_t *load_balance_pool;

/**
 * The one instance of load-balance main
 */
load_balance_main_t load_balance_main = {
    .lbm_to_counters = {
        .name = "route-to",
        .stat_segment_name = "/net/route/to",
    },
    .lbm_via_counters = {
        .name = "route-via",
        .stat_segment_name = "/net/route/via",
    }
};
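
/**
 * The "to" and "via" combined counters above are exported into the stats
 * segment as /net/route/to and /net/route/via, indexed by load-balance
 * index (see load_balance_alloc_i() below, which validates a counter slot
 * per LB).
 *
 * Minimal in-process sketch (illustrative only; 'lbi' is assumed to be a
 * valid load-balance index obtained elsewhere):
 *
 * @code
 *  vlib_counter_t to;
 *
 *  vlib_get_combined_counter (&(load_balance_main.lbm_to_counters),
 *                             lbi, &to);
 *  // to.packets and to.bytes are the accumulated counts for the route
 * @endcode
 */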

f64
load_balance_get_multipath_tolerance (void)
{
    return (multipath_next_hop_error_tolerance);
}

static inline index_t
load_balance_get_index (const load_balance_t *lb)
{
    return (lb - load_balance_pool);
}

static inline dpo_id_t*
load_balance_get_buckets (load_balance_t *lb)
{
    if (LB_HAS_INLINE_BUCKETS(lb))
    {
        return (lb->lb_buckets_inline);
    }
    else
    {
        return (lb->lb_buckets);
    }
}

static load_balance_t *
load_balance_alloc_i (void)
{
    load_balance_t *lb;

    pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES);
    memset(lb, 0, sizeof(*lb));

    lb->lb_map = INDEX_INVALID;
    lb->lb_urpf = INDEX_INVALID;
    vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters),
                                   load_balance_get_index(lb));
    vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters),
                                   load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters),
                               load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters),
                               load_balance_get_index(lb));

    return (lb);
}

static u8*
load_balance_format (index_t lbi,
                     load_balance_format_flags_t flags,
                     u32 indent,
                     u8 *s)
{
    vlib_counter_t to, via;
    load_balance_t *lb;
    dpo_id_t *buckets;
    u32 i;

    lb = load_balance_get(lbi);
    vlib_get_combined_counter(&(load_balance_main.lbm_to_counters), lbi, &to);
    vlib_get_combined_counter(&(load_balance_main.lbm_via_counters), lbi, &via);
    buckets = load_balance_get_buckets(lb);

    s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE);
    s = format(s, "[proto:%U ", format_dpo_proto, lb->lb_proto);
    s = format(s, "index:%d buckets:%d ", lbi, lb->lb_n_buckets);
    s = format(s, "uRPF:%d ", lb->lb_urpf);
    s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes);
    if (0 != via.packets)
    {
        s = format(s, " via:[%Ld:%Ld]",
                   via.packets, via.bytes);
    }
    s = format(s, "]");

    if (INDEX_INVALID != lb->lb_map)
    {
        s = format(s, "\n%U%U",
                   format_white_space, indent+4,
                   format_load_balance_map, lb->lb_map, indent+4);
    }
    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        s = format(s, "\n%U[%d] %U",
                   format_white_space, indent+2,
                   i,
                   format_dpo_id,
                   &buckets[i], indent+6);
    }
    return (s);
}

u8*
format_load_balance (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    load_balance_format_flags_t flags = va_arg(*args, load_balance_format_flags_t);

    return (load_balance_format(lbi, flags, 0, s));
}

static u8*
format_load_balance_dpo (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    u32 indent = va_arg(*args, u32);

    return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s));
}


static load_balance_t *
load_balance_create_i (u32 num_buckets,
                       dpo_proto_t lb_proto,
                       flow_hash_config_t fhc)
{
    load_balance_t *lb;

    lb = load_balance_alloc_i();
    lb->lb_hash_config = fhc;
    lb->lb_n_buckets = num_buckets;
    lb->lb_n_buckets_minus_1 = num_buckets-1;
    lb->lb_proto = lb_proto;

    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_validate_aligned(lb->lb_buckets,
                             lb->lb_n_buckets - 1,
                             CLIB_CACHE_LINE_BYTES);
    }

    LB_DBG(lb, "create");

    return (lb);
}

index_t
load_balance_create (u32 n_buckets,
                     dpo_proto_t lb_proto,
                     flow_hash_config_t fhc)
{
    return (load_balance_get_index(load_balance_create_i(n_buckets, lb_proto, fhc)));
}
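
/**
 * Usage sketch (illustrative): create a single-bucket load-balance and
 * stack that bucket on a drop. This mirrors what load_balance_module_init()
 * below does for the special index-zero entry.
 *
 * @code
 *  index_t lbi;
 *
 *  lbi = load_balance_create(1, DPO_PROTO_IP4, 0);  // no flow-hash config
 *  load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4));
 * @endcode
 */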

static inline void
load_balance_set_bucket_i (load_balance_t *lb,
                           u32 bucket,
                           dpo_id_t *buckets,
                           const dpo_id_t *next)
{
    dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, &buckets[bucket], next);
}

void
load_balance_set_bucket (index_t lbi,
                         u32 bucket,
                         const dpo_id_t *next)
{
    load_balance_t *lb;
    dpo_id_t *buckets;

    lb = load_balance_get(lbi);
    buckets = load_balance_get_buckets(lb);

    ASSERT(bucket < lb->lb_n_buckets);

    load_balance_set_bucket_i(lb, bucket, buckets, next);
}

int
load_balance_is_drop (const dpo_id_t *dpo)
{
    load_balance_t *lb;

    if (DPO_LOAD_BALANCE != dpo->dpoi_type)
        return (0);

    lb = load_balance_get(dpo->dpoi_index);

    if (1 == lb->lb_n_buckets)
    {
        return (dpo_is_drop(load_balance_get_bucket_i(lb, 0)));
    }
    return (0);
}

u16
load_balance_n_buckets (index_t lbi)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (lb->lb_n_buckets);
}

void
load_balance_set_fib_entry_flags (index_t lbi,
                                  fib_entry_flag_t flags)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);
    lb->lb_fib_entry_flags = flags;
}


void
load_balance_set_urpf (index_t lbi,
                       index_t urpf)
{
    load_balance_t *lb;
    index_t old;

    lb = load_balance_get(lbi);

    /*
     * packets in flight will see this change, but the update is a single
     * atomic write, so they observe either the old or the new uRPF list.
     */
    old = lb->lb_urpf;
    lb->lb_urpf = urpf;

    fib_urpf_list_unlock(old);
    fib_urpf_list_lock(urpf);
}

index_t
load_balance_get_urpf (index_t lbi)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (lb->lb_urpf);
}

const dpo_id_t *
load_balance_get_bucket (index_t lbi,
                         u32 bucket)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (load_balance_get_bucket_i(lb, bucket));
}

static int
next_hop_sort_by_weight (const load_balance_path_t * n1,
                         const load_balance_path_t * n2)
{
    return ((int) n1->path_weight - (int) n2->path_weight);
}

/* The given next-hop set is normalized into *normalized_next_hops:
   the next hops are sorted by weight and each weight is replaced by the
   number of adjacencies (buckets) assigned to that next hop.
   Returns the total number of adjacencies in the block. */
u32
ip_multipath_normalize_next_hops (const load_balance_path_t * raw_next_hops,
                                  load_balance_path_t ** normalized_next_hops,
                                  u32 *sum_weight_in,
                                  f64 multipath_next_hop_error_tolerance)
{
    load_balance_path_t * nhs;
    uword n_nhs, n_adj, n_adj_left, i, sum_weight;
    f64 norm, error;

    n_nhs = vec_len (raw_next_hops);
    ASSERT (n_nhs > 0);
    if (n_nhs == 0)
        return 0;

    /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */
    nhs = *normalized_next_hops;
    vec_validate (nhs, 2*n_nhs - 1);

    /* Fast path: 1 next hop in block. */
    n_adj = n_nhs;
    if (n_nhs == 1)
    {
        nhs[0] = raw_next_hops[0];
        nhs[0].path_weight = 1;
        _vec_len (nhs) = 1;
        sum_weight = 1;
        goto done;
    }

    else if (n_nhs == 2)
    {
        int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;

        /* Fast sort. */
        nhs[0] = raw_next_hops[cmp];
        nhs[1] = raw_next_hops[cmp ^ 1];

        /* Fast path: equal cost multipath with 2 next hops. */
        if (nhs[0].path_weight == nhs[1].path_weight)
        {
            nhs[0].path_weight = nhs[1].path_weight = 1;
            _vec_len (nhs) = 2;
            sum_weight = 2;
            goto done;
        }
    }
    else
    {
        clib_memcpy (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
        qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
    }

    /* Find total weight to normalize weights. */
    sum_weight = 0;
    for (i = 0; i < n_nhs; i++)
        sum_weight += nhs[i].path_weight;

    /* In the unlikely case that all weights are given as 0, set them all to 1. */
    if (sum_weight == 0)
    {
        for (i = 0; i < n_nhs; i++)
            nhs[i].path_weight = 1;
        sum_weight = n_nhs;
    }

    /* Save copies of all next hop weights to avoid being overwritten in loop below. */
    for (i = 0; i < n_nhs; i++)
        nhs[n_nhs + i].path_weight = nhs[i].path_weight;

    /* Try larger and larger power of 2 sized adjacency blocks until we
       find one where traffic flows to within the given error tolerance
       of the specified weights. */
    for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
    {
        error = 0;

        norm = n_adj / ((f64) sum_weight);
        n_adj_left = n_adj;
        for (i = 0; i < n_nhs; i++)
        {
            f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */
            word n = flt_round_nearest (nf);

            n = n > n_adj_left ? n_adj_left : n;
            n_adj_left -= n;
            error += fabs (nf - n);
            nhs[i].path_weight = n;

            if (0 == nhs[i].path_weight)
            {
                /*
                 * this happens when the weight skew is high (norm is small)
                 * and n == nf. without this correction the path with a low
                 * weight would have no representation in the load-balance,
                 * which we don't want. forcing the error up means the block
                 * is doubled until the low-weight path gets a bucket; the
                 * cost is a larger number of buckets.
                 */
                error = n_adj;
                break;
            }
        }

        nhs[0].path_weight += n_adj_left;

        /* Average error per adjacency within tolerance for this block size? */
        if (error <= multipath_next_hop_error_tolerance*n_adj)
        {
            /* Truncate any next hops with zero weight. */
            _vec_len (nhs) = i;
            break;
        }
    }

done:
    /* Save vector for next call. */
    *normalized_next_hops = nhs;
    *sum_weight_in = sum_weight;
    return n_adj;
}
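
/**
 * Worked example (illustrative): two paths with weights {3, 1} and the
 * default tolerance of 0.1, so sum_weight = 4.
 *
 *  - n_adj = 2: norm = 0.5. the weight-3 path rounds to 2 buckets, leaving
 *    0 for the weight-1 path; the zero-weight correction forces
 *    error = n_adj, so the block size is doubled.
 *  - n_adj = 4: norm = 1.0. the paths get exactly 3 and 1 buckets with
 *    zero error, which is within tolerance.
 *
 * The function therefore returns a block of 4 adjacencies: 3 for the
 * heavier path, 1 for the lighter one.
 */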

static load_balance_path_t *
load_balance_multipath_next_hop_fixup (const load_balance_path_t *nhs,
                                       dpo_proto_t drop_proto)
{
    if (0 == vec_len(nhs))
    {
        load_balance_path_t *new_nhs = NULL, *nh;

        /*
         * we need something for the load-balance. so use the drop
         */
        vec_add2(new_nhs, nh, 1);

        nh->path_weight = 1;
        dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto));

        return (new_nhs);
    }

    return (NULL);
}

/*
 * Fill in adjacencies in block based on corresponding
 * next hop adjacencies.
 */
static void
load_balance_fill_buckets (load_balance_t *lb,
                           load_balance_path_t *nhs,
                           dpo_id_t *buckets,
                           u32 n_buckets)
{
    load_balance_path_t * nh;
    u16 ii, bucket;

    bucket = 0;

    /*
     * the next-hops have normalised weights. that means their sum is the number
     * of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
        }
    }
}
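
/*
 * continuing the worked example above: with normalized weights {3, 1} the
 * 4-entry bucket array is filled as [nh0, nh0, nh0, nh1]; each next hop
 * is repeated according to its normalized weight.
 */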

static inline void
load_balance_set_n_buckets (load_balance_t *lb,
                            u32 n_buckets)
{
    lb->lb_n_buckets = n_buckets;
    lb->lb_n_buckets_minus_1 = n_buckets-1;
}

void
load_balance_multipath_update (const dpo_id_t *dpo,
                               const load_balance_path_t * raw_nhs,
                               load_balance_flags_t flags)
{
    load_balance_path_t *nh, *nhs, *fixed_nhs;
    u32 sum_of_weights, n_buckets, ii;
    index_t lbmi, old_lbmi;
    load_balance_t *lb;
    dpo_id_t *tmp_dpo;

    nhs = NULL;

    ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
    lb = load_balance_get(dpo->dpoi_index);
    fixed_nhs = load_balance_multipath_next_hop_fixup(raw_nhs, lb->lb_proto);
    n_buckets =
        ip_multipath_normalize_next_hops((NULL == fixed_nhs ?
                                          raw_nhs :
                                          fixed_nhs),
                                         &nhs,
                                         &sum_of_weights,
                                         multipath_next_hop_error_tolerance);

    ASSERT (n_buckets >= vec_len (raw_nhs));

    /*
     * Save the old load-balance map used, and get a new one if required.
     */
    old_lbmi = lb->lb_map;
    if (flags & LOAD_BALANCE_FLAG_USES_MAP)
    {
        lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs);
    }
    else
    {
        lbmi = INDEX_INVALID;
    }

    if (0 == lb->lb_n_buckets)
    {
        /*
         * first time initialisation. no packets inflight, so we can write
         * at leisure.
         */
        load_balance_set_n_buckets(lb, n_buckets);

        if (!LB_HAS_INLINE_BUCKETS(lb))
            vec_validate_aligned(lb->lb_buckets,
                                 lb->lb_n_buckets - 1,
                                 CLIB_CACHE_LINE_BYTES);

        load_balance_fill_buckets(lb, nhs,
                                  load_balance_get_buckets(lb),
                                  n_buckets);
        lb->lb_map = lbmi;
    }
    else
    {
        /*
         * This is a modification of an existing load-balance.
         * We need to ensure that packets inflight see a consistent state, that
         * is the number of reported buckets the LB has (read from
         * lb_n_buckets_minus_1) is not more than it actually has. So if the
         * number of buckets is increasing, we must update the bucket array first,
         * then the reported number. vice-versa if the number of buckets goes down.
         */
        if (n_buckets == lb->lb_n_buckets)
        {
            /*
             * no change in the number of buckets. we can simply fill what
             * is new over what is old.
             */
            load_balance_fill_buckets(lb, nhs,
                                      load_balance_get_buckets(lb),
                                      n_buckets);
            lb->lb_map = lbmi;
        }
        else if (n_buckets > lb->lb_n_buckets)
        {
            /*
             * we have more buckets. the old load-balance map (if there is one)
             * will remain valid, i.e. mapping to indices within range, so we
             * update it last.
             */
            if (n_buckets > LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new increased number of buckets is crossing the threshold
                 * from inline storage to out-of-line. alloc the out-of-line
                 * buckets first, then fixup the number. then reset the inlines.
                 */
                ASSERT(NULL == lb->lb_buckets);
                vec_validate_aligned(lb->lb_buckets,
                                     n_buckets - 1,
                                     CLIB_CACHE_LINE_BYTES);

                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets,
                                          n_buckets);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);

                CLIB_MEMORY_BARRIER();

                for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++)
                {
                    dpo_reset(&lb->lb_buckets_inline[ii]);
                }
            }
            else
            {
                if (n_buckets <= LB_NUM_INLINE_BUCKETS)
                {
                    /*
                     * we are not crossing the threshold and the buckets remain
                     * inline. we can write the new over the old.
                     */
                    load_balance_fill_buckets(lb, nhs,
                                              load_balance_get_buckets(lb),
                                              n_buckets);
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);
                }
                else
                {
                    /*
                     * we are not crossing the threshold, but we need a new
                     * bucket array to hold the increased number of choices.
                     */
                    dpo_id_t *new_buckets, *old_buckets, *tmp_dpo;

                    new_buckets = NULL;
                    old_buckets = load_balance_get_buckets(lb);

                    vec_validate_aligned(new_buckets,
                                         n_buckets - 1,
                                         CLIB_CACHE_LINE_BYTES);

                    load_balance_fill_buckets(lb, nhs, new_buckets, n_buckets);
                    CLIB_MEMORY_BARRIER();
                    lb->lb_buckets = new_buckets;
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);

                    vec_foreach(tmp_dpo, old_buckets)
                    {
                        dpo_reset(tmp_dpo);
                    }
                    vec_free(old_buckets);
                }
            }

            /*
             * buckets fixed. ready for the MAP update.
             */
            lb->lb_map = lbmi;
        }
        else
        {
            /*
             * bucket size shrinkage.
             * Any map we have will be based on the old
             * larger number of buckets, so will be translating to indices
             * out of range. So the new MAP must be installed first.
             */
            lb->lb_map = lbmi;
            CLIB_MEMORY_BARRIER();

            if (n_buckets <= LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new decreased number of buckets is crossing the threshold
                 * from out-of-line storage to inline:
                 *   1 - Fill the inline buckets,
                 *   2 - fixup the number (at this point the inline buckets are
                 *       used).
                 *   3 - free the out-of-line buckets
                 */
                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets_inline,
                                          n_buckets);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                vec_foreach(tmp_dpo, lb->lb_buckets)
                {
                    dpo_reset(tmp_dpo);
                }
                vec_free(lb->lb_buckets);
            }
            else
            {
                /*
                 * not crossing the threshold.
                 *  1 - update the number to the smaller size
                 *  2 - write the new buckets
                 *  3 - reset those no longer used.
                 */
                dpo_id_t *buckets;
                u32 old_n_buckets;

                old_n_buckets = lb->lb_n_buckets;
                buckets = load_balance_get_buckets(lb);

                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                load_balance_fill_buckets(lb, nhs,
                                          buckets,
                                          n_buckets);

                for (ii = n_buckets; ii < old_n_buckets; ii++)
                {
                    dpo_reset(&buckets[ii]);
                }
            }
        }
    }

    vec_foreach (nh, nhs)
    {
        dpo_reset(&nh->path_dpo);
    }
    vec_free(nhs);
    vec_free(fixed_nhs);

    load_balance_map_unlock(old_lbmi);
}
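
/**
 * Usage sketch (illustrative; path DPO setup elided): update an existing
 * load-balance with two unequal-cost paths. In practice the paths and
 * their DPOs come from FIB path-list resolution.
 *
 * @code
 *  load_balance_path_t *paths = NULL, *path;
 *
 *  vec_add2(paths, path, 2);
 *  paths[0].path_weight = 3;   // will receive 3 of the 4 buckets
 *  paths[1].path_weight = 1;   // will receive the remaining bucket
 *  // ... dpo_copy() each paths[i].path_dpo from its resolved next hop ...
 *
 *  load_balance_multipath_update(dpo, paths, LOAD_BALANCE_FLAG_NONE);
 *  vec_free(paths);
 * @endcode
 */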

static void
load_balance_lock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks++;
}

static void
load_balance_destroy (load_balance_t *lb)
{
    dpo_id_t *buckets;
    int i;

    buckets = load_balance_get_buckets(lb);

    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        dpo_reset(&buckets[i]);
    }

    LB_DBG(lb, "destroy");
    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_free(lb->lb_buckets);
    }

    fib_urpf_list_unlock(lb->lb_urpf);
    load_balance_map_unlock(lb->lb_map);

    pool_put(load_balance_pool, lb);
}

static void
load_balance_unlock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks--;

    if (0 == lb->lb_locks)
    {
        load_balance_destroy(lb);
    }
}

static void
load_balance_mem_show (void)
{
    fib_show_memory_usage("load-balance",
                          pool_elts(load_balance_pool),
                          pool_len(load_balance_pool),
                          sizeof(load_balance_t));
    load_balance_map_show_mem();
}

const static dpo_vft_t lb_vft = {
    .dv_lock = load_balance_lock,
    .dv_unlock = load_balance_unlock,
    .dv_format = format_load_balance_dpo,
    .dv_mem_show = load_balance_mem_show,
};

/**
 * @brief The per-protocol VLIB graph nodes that are assigned to a load-balance
 *        object.
 *
 * These are the graph nodes for which a load-balance is the parent object
 * in the DPO graph.
 *
 * We do not list all the load-balance nodes, such as the *-lookup nodes;
 * instead we rely on the correct use of the .sibling_of field when setting
 * up those sibling nodes.
 */
const static char* const load_balance_ip4_nodes[] =
{
    "ip4-load-balance",
    NULL,
};
const static char* const load_balance_ip6_nodes[] =
{
    "ip6-load-balance",
    NULL,
};
const static char* const load_balance_mpls_nodes[] =
{
    "mpls-load-balance",
    NULL,
};
const static char* const load_balance_l2_nodes[] =
{
    "l2-load-balance",
    NULL,
};
const static char* const load_balance_nsh_nodes[] =
{
    "nsh-load-balance",
    NULL
};
const static char* const load_balance_bier_nodes[] =
{
    "bier-load-balance",
    NULL,
};
const static char* const * const load_balance_nodes[DPO_PROTO_NUM] =
{
    [DPO_PROTO_IP4]  = load_balance_ip4_nodes,
    [DPO_PROTO_IP6]  = load_balance_ip6_nodes,
    [DPO_PROTO_MPLS] = load_balance_mpls_nodes,
    [DPO_PROTO_ETHERNET] = load_balance_l2_nodes,
    [DPO_PROTO_NSH] = load_balance_nsh_nodes,
    [DPO_PROTO_BIER] = load_balance_bier_nodes,
};

void
load_balance_module_init (void)
{
    index_t lbi;

    dpo_register(DPO_LOAD_BALANCE, &lb_vft, load_balance_nodes);

    /*
     * Special LB with index zero. we need to define this since the v4 mtrie
     * assumes an index of 0 implies the ply is empty. therefore all 'real'
     * adjs need a non-zero index.
     * This should never be used, but just in case, stack it on a drop.
     */
    lbi = load_balance_create(1, DPO_PROTO_IP4, 0);
    load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4));

    load_balance_map_module_init();
}

static clib_error_t *
load_balance_show (vlib_main_t * vm,
                   unformat_input_t * input,
                   vlib_cli_command_t * cmd)
{
    index_t lbi = INDEX_INVALID;

    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
        if (unformat (input, "%d", &lbi))
            ;
        else
            break;
    }

    if (INDEX_INVALID != lbi)
    {
        vlib_cli_output (vm, "%U", format_load_balance, lbi,
                         LOAD_BALANCE_FORMAT_DETAIL);
    }
    else
    {
        load_balance_t *lb;

        pool_foreach(lb, load_balance_pool,
        ({
            vlib_cli_output (vm, "%U", format_load_balance,
                             load_balance_get_index(lb),
                             LOAD_BALANCE_FORMAT_NONE);
        }));
    }

    return 0;
}

VLIB_CLI_COMMAND (load_balance_show_command, static) = {
    .path = "show load-balance",
    .short_help = "show load-balance [<index>]",
    .function = load_balance_show,
};


always_inline u32
ip_flow_hash (void *data)
{
  ip4_header_t *iph = (ip4_header_t *) data;

  if ((iph->ip_version_and_header_length & 0xF0) == 0x40)
    return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT);
  else
    return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT);
}
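
/*
 * the version check above works because the IP version is the top nibble
 * of the first header byte in both families: 0x4 for IPv4 (e.g. 0x45 for
 * a 20 byte header) and 0x6 for IPv6. a single read thus steers either
 * family to the right flow-hash function.
 */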

always_inline u64
mac_to_u64 (u8 * m)
{
  return (*((u64 *) m) & 0xffffffffffff);
}

always_inline u32
l2_flow_hash (vlib_buffer_t * b0)
{
  ethernet_header_t *eh;
  u64 a, b, c;
  uword is_ip, eh_size;
  u16 eh_type;

  eh = vlib_buffer_get_current (b0);
  eh_type = clib_net_to_host_u16 (eh->type);
  eh_size = ethernet_buffer_header_size (b0);

  is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6);

  /* since we have 2 cache lines, use them */
  if (is_ip)
    a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size);
  else
    a = eh->type;

  b = mac_to_u64 ((u8 *) eh->dst_address);
  c = mac_to_u64 ((u8 *) eh->src_address);
  hash_mix64 (a, b, c);

  return (u32) c;
}

typedef struct load_balance_trace_t_
{
    index_t lb_index;
} load_balance_trace_t;

always_inline uword
load_balance_inline (vlib_main_t * vm,
                     vlib_node_runtime_t * node,
                     vlib_frame_t * frame,
                     int is_l2)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          /* the load-balance index was set by the previous node */
          lbi0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          if (is_l2)
          {
              vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0);
          }
          else
          {
              /* it's BIER */
              const bier_hdr_t *bh0 = vlib_buffer_get_current(b0);
              vnet_buffer(b0)->ip.flow_hash = bier_compute_flow_hash(bh0);
          }

          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}
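
/*
 * note: bucket selection above uses hash & lb_n_buckets_minus_1 rather
 * than a modulo. this is valid here because bucket blocks produced by
 * ip_multipath_normalize_next_hops() are always a power of 2 in size.
 */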

static uword
l2_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
    return (load_balance_inline(vm, node, frame, 1));
}

static u8 *
format_l2_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "L2-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief The L2 load-balance node
 */
VLIB_REGISTER_NODE (l2_load_balance_node) = {
  .function = l2_load_balance,
  .name = "l2-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_l2_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};

static uword
nsh_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0, *nsh0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          lbi0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          /* SPI + SI are the second word of the NSH header */
          nsh0 = vlib_buffer_get_current (b0);
          vnet_buffer(b0)->ip.flow_hash = nsh0[1] % lb0->lb_n_buckets;

          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

static u8 *
format_nsh_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "NSH-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief The NSH load-balance node
 */
VLIB_REGISTER_NODE (nsh_load_balance_node) = {
  .function = nsh_load_balance,
  .name = "nsh-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_nsh_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};

static u8 *
format_bier_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "BIER-load-balance: index %d", t->lb_index);
  return s;
}

static uword
bier_load_balance (vlib_main_t * vm,
                   vlib_node_runtime_t * node,
                   vlib_frame_t * frame)
{
    return (load_balance_inline(vm, node, frame, 0));
}

/**
 * @brief The BIER load-balance node
 */
VLIB_REGISTER_NODE (bier_load_balance_node) = {
  .function = bier_load_balance,
  .name = "bier-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_bier_load_balance_trace,
  .sibling_of = "mpls-load-balance",
};