MTRIE Optimisations 2
[vpp.git] / src/vnet/dpo/load_balance.c
/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vnet/ip/lookup.h>
#include <vnet/dpo/load_balance.h>
#include <vnet/dpo/load_balance_map.h>
#include <vnet/dpo/drop_dpo.h>
#include <vppinfra/math.h>              /* for fabs */
#include <vnet/adj/adj.h>
#include <vnet/adj/adj_internal.h>
#include <vnet/fib/fib_urpf_list.h>

/*
 * distribution error tolerance for load-balancing: the average per-bucket
 * deviation from the requested path weights may be at most this fraction
 * (0.1 == 10%)
 */
const f64 multipath_next_hop_error_tolerance = 0.1;

#undef LB_DEBUG

#ifdef LB_DEBUG
#define LB_DBG(_lb, _fmt, _args...)                                     \
{                                                                       \
    u8 *_tmp = NULL;                                                    \
    clib_warning("lb:[%s]:" _fmt,                                       \
                 load_balance_format(load_balance_get_index((_lb)),     \
                                     0, 0, _tmp),                       \
                 ##_args);                                              \
    vec_free(_tmp);                                                     \
}
#else
#define LB_DBG(_p, _fmt, _args...)
#endif


/**
 * Pool of all load-balance DPOs. It's not static so the data-plane can
 * have fast access
 */
load_balance_t *load_balance_pool;

/**
 * The one instance of load-balance main
 */
load_balance_main_t load_balance_main;

f64
load_balance_get_multipath_tolerance (void)
{
    return (multipath_next_hop_error_tolerance);
}

static inline index_t
load_balance_get_index (const load_balance_t *lb)
{
    return (lb - load_balance_pool);
}

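/*
 * A load-balance with few buckets stores them inline in the object itself
 * (lb_buckets_inline); larger bucket sets spill to the heap-allocated
 * lb_buckets vector. LB_HAS_INLINE_BUCKETS makes the choice.
 */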
static inline dpo_id_t*
load_balance_get_buckets (load_balance_t *lb)
{
    if (LB_HAS_INLINE_BUCKETS(lb))
    {
        return (lb->lb_buckets_inline);
    }
    else
    {
        return (lb->lb_buckets);
    }
}

static load_balance_t *
load_balance_alloc_i (void)
{
    load_balance_t *lb;

    pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES);
    memset(lb, 0, sizeof(*lb));

    lb->lb_map = INDEX_INVALID;
    lb->lb_urpf = INDEX_INVALID;
    vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters),
                                   load_balance_get_index(lb));
    vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters),
                                   load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters),
                               load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters),
                               load_balance_get_index(lb));

    return (lb);
}

static u8*
load_balance_format (index_t lbi,
                     load_balance_format_flags_t flags,
                     u32 indent,
                     u8 *s)
{
    vlib_counter_t to, via;
    load_balance_t *lb;
    dpo_id_t *buckets;
    u32 i;

    lb = load_balance_get(lbi);
    vlib_get_combined_counter(&(load_balance_main.lbm_to_counters), lbi, &to);
    vlib_get_combined_counter(&(load_balance_main.lbm_via_counters), lbi, &via);
    buckets = load_balance_get_buckets(lb);

    s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE);
    s = format(s, "[index:%d buckets:%d ", lbi, lb->lb_n_buckets);
    s = format(s, "uRPF:%d ", lb->lb_urpf);
    s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes);
    if (0 != via.packets)
    {
        s = format(s, " via:[%Ld:%Ld]",
                   via.packets, via.bytes);
    }
    s = format(s, "]");

    if (INDEX_INVALID != lb->lb_map)
    {
        s = format(s, "\n%U%U",
                   format_white_space, indent+4,
                   format_load_balance_map, lb->lb_map, indent+4);
    }
    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        s = format(s, "\n%U[%d] %U",
                   format_white_space, indent+2,
                   i,
                   format_dpo_id,
                   &buckets[i], indent+6);
    }
    return (s);
}

u8*
format_load_balance (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    load_balance_format_flags_t flags = va_arg(*args, load_balance_format_flags_t);

    return (load_balance_format(lbi, flags, 0, s));
}

static u8*
format_load_balance_dpo (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    u32 indent = va_arg(*args, u32);

    return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s));
}

static load_balance_t *
load_balance_create_i (u32 num_buckets,
                       dpo_proto_t lb_proto,
                       flow_hash_config_t fhc)
{
    load_balance_t *lb;

    lb = load_balance_alloc_i();
    lb->lb_hash_config = fhc;
    lb->lb_n_buckets = num_buckets;
    lb->lb_n_buckets_minus_1 = num_buckets-1;
    lb->lb_proto = lb_proto;

    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_validate_aligned(lb->lb_buckets,
                             lb->lb_n_buckets - 1,
                             CLIB_CACHE_LINE_BYTES);
    }

    LB_DBG(lb, "create");

    return (lb);
}

index_t
load_balance_create (u32 n_buckets,
                     dpo_proto_t lb_proto,
                     flow_hash_config_t fhc)
{
    return (load_balance_get_index(load_balance_create_i(n_buckets, lb_proto, fhc)));
}

static inline void
load_balance_set_bucket_i (load_balance_t *lb,
                           u32 bucket,
                           dpo_id_t *buckets,
                           const dpo_id_t *next)
{
    dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, &buckets[bucket], next);
}

void
load_balance_set_bucket (index_t lbi,
                         u32 bucket,
                         const dpo_id_t *next)
{
    load_balance_t *lb;
    dpo_id_t *buckets;

    lb = load_balance_get(lbi);
    buckets = load_balance_get_buckets(lb);

    ASSERT(bucket < lb->lb_n_buckets);

    load_balance_set_bucket_i(lb, bucket, buckets, next);
}

int
load_balance_is_drop (const dpo_id_t *dpo)
{
    load_balance_t *lb;

    if (DPO_LOAD_BALANCE != dpo->dpoi_type)
        return (0);

    lb = load_balance_get(dpo->dpoi_index);

    if (1 == lb->lb_n_buckets)
    {
        return (dpo_is_drop(load_balance_get_bucket_i(lb, 0)));
    }
    return (0);
}

void
load_balance_set_fib_entry_flags (index_t lbi,
                                  fib_entry_flag_t flags)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);
    lb->lb_fib_entry_flags = flags;
}

void
load_balance_set_urpf (index_t lbi,
                       index_t urpf)
{
    load_balance_t *lb;
    index_t old;

    lb = load_balance_get(lbi);

    /*
     * packets in flight will see this change, but since the swap is a
     * single atomic word write they see either the old or the new uRPF
     * list, never an inconsistent state.
     */
    old = lb->lb_urpf;
    lb->lb_urpf = urpf;

    fib_urpf_list_unlock(old);
    fib_urpf_list_lock(urpf);
}

index_t
load_balance_get_urpf (index_t lbi)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (lb->lb_urpf);
}

const dpo_id_t *
load_balance_get_bucket (index_t lbi,
                         u32 bucket)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (load_balance_get_bucket_i(lb, bucket));
}

static int
next_hop_sort_by_weight (const load_balance_path_t * n1,
                         const load_balance_path_t * n2)
{
    return ((int) n1->path_weight - (int) n2->path_weight);
}

/* The given next-hop vector is overwritten with a normalized copy: entries
   are sorted by weight and each weight is replaced by the number of
   adjacencies (buckets) assigned to that next hop.
   Returns the number of adjacencies in the block. */
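/*
 * Worked example (illustrative numbers): two paths with weights {1, 3},
 * so sum_weight = 4. First try n_adj = max_pow2(2) = 2: norm = 2/4 = 0.5,
 * rounding gives shares {1, 2}, clamped by the remaining slots to {1, 1},
 * for error 0.5 + 0.5 = 1.0 > 0.1 * 2, so the block size doubles.
 * At n_adj = 4, norm = 1.0 and the shares are exactly {1, 3} with zero
 * error, so the paths occupy 1 and 3 of the 4 buckets.
 */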
u32
ip_multipath_normalize_next_hops (const load_balance_path_t * raw_next_hops,
                                  load_balance_path_t ** normalized_next_hops,
                                  u32 *sum_weight_in,
                                  f64 multipath_next_hop_error_tolerance)
{
    load_balance_path_t * nhs;
    uword n_nhs, n_adj, n_adj_left, i, sum_weight;
    f64 norm, error;

    n_nhs = vec_len (raw_next_hops);
    ASSERT (n_nhs > 0);
    if (n_nhs == 0)
        return 0;

    /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */
    nhs = *normalized_next_hops;
    vec_validate (nhs, 2*n_nhs - 1);

    /* Fast path: 1 next hop in block. */
    n_adj = n_nhs;
    if (n_nhs == 1)
    {
        nhs[0] = raw_next_hops[0];
        nhs[0].path_weight = 1;
        _vec_len (nhs) = 1;
        sum_weight = 1;
        goto done;
    }
    else if (n_nhs == 2)
    {
        int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;

        /* Fast sort. */
        nhs[0] = raw_next_hops[cmp];
        nhs[1] = raw_next_hops[cmp ^ 1];

        /* Fast path: equal cost multipath with 2 next hops. */
        if (nhs[0].path_weight == nhs[1].path_weight)
        {
            nhs[0].path_weight = nhs[1].path_weight = 1;
            _vec_len (nhs) = 2;
            sum_weight = 2;
            goto done;
        }
    }
    else
    {
        clib_memcpy (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
        qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
    }

    /* Find total weight to normalize weights. */
    sum_weight = 0;
    for (i = 0; i < n_nhs; i++)
        sum_weight += nhs[i].path_weight;

    /* In the unlikely case that all weights are given as 0, set them all to 1. */
    if (sum_weight == 0)
    {
        for (i = 0; i < n_nhs; i++)
            nhs[i].path_weight = 1;
        sum_weight = n_nhs;
    }

    /* Save copies of all next hop weights to avoid being overwritten in loop below. */
    for (i = 0; i < n_nhs; i++)
        nhs[n_nhs + i].path_weight = nhs[i].path_weight;

    /* Try larger and larger power of 2 sized adjacency blocks until we
       find one where traffic flows to within the specified tolerance of the
       given weights. */
    for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
    {
        error = 0;

        norm = n_adj / ((f64) sum_weight);
        n_adj_left = n_adj;
        for (i = 0; i < n_nhs; i++)
        {
            f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */
            word n = flt_round_nearest (nf);

            n = n > n_adj_left ? n_adj_left : n;
            n_adj_left -= n;
            error += fabs (nf - n);
            nhs[i].path_weight = n;

            if (0 == nhs[i].path_weight)
            {
                /*
                 * this happens when the weight skew is high (norm is small)
                 * and n == nf. Without this correction the path with a low
                 * weight would have no representation in the load-balance -
                 * don't want that. Forcing the error above the tolerance makes
                 * the block double, so a high weight skew costs many buckets.
                 * pays ya money takes ya choice.
                 */
                error = n_adj;
                break;
            }
        }

        nhs[0].path_weight += n_adj_left;

        /* Is the average error per adjacency within tolerance for this size
           adjacency block? */
        if (error <= multipath_next_hop_error_tolerance*n_adj)
        {
            /* Truncate any next hops with zero weight. */
            _vec_len (nhs) = i;
            break;
        }
    }

done:
    /* Save vector for next call. */
    *normalized_next_hops = nhs;
    *sum_weight_in = sum_weight;
    return n_adj;
}

static load_balance_path_t *
load_balance_multipath_next_hop_fixup (const load_balance_path_t *nhs,
                                       dpo_proto_t drop_proto)
{
    if (0 == vec_len(nhs))
    {
        load_balance_path_t *new_nhs = NULL, *nh;

        /*
         * we need something for the load-balance. so use the drop
         */
        vec_add2(new_nhs, nh, 1);

        nh->path_weight = 1;
        dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto));

        return (new_nhs);
    }

    return (NULL);
}

/*
 * Fill in adjacencies in block based on corresponding
 * next hop adjacencies.
 */
static void
load_balance_fill_buckets (load_balance_t *lb,
                           load_balance_path_t *nhs,
                           dpo_id_t *buckets,
                           u32 n_buckets)
{
    load_balance_path_t * nh;
    u16 ii, bucket;

    bucket = 0;

    /*
     * the next-hops have normalised weights. that means their sum is the number
     * of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
        }
    }
}

static inline void
load_balance_set_n_buckets (load_balance_t *lb,
                            u32 n_buckets)
{
    lb->lb_n_buckets = n_buckets;
    lb->lb_n_buckets_minus_1 = n_buckets-1;
}

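/*
 * Note: bucket counts chosen by ip_multipath_normalize_next_hops() are
 * powers of 2, so the data-plane selects a bucket with a cheap mask,
 * flow_hash & lb_n_buckets_minus_1, rather than a modulo. Both the count
 * and the mask are stored so neither needs deriving per-packet.
 */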
void
load_balance_multipath_update (const dpo_id_t *dpo,
                               const load_balance_path_t * raw_nhs,
                               load_balance_flags_t flags)
{
    load_balance_path_t *nh, *nhs, *fixed_nhs;
    u32 sum_of_weights, n_buckets, ii;
    index_t lbmi, old_lbmi;
    load_balance_t *lb;
    dpo_id_t *tmp_dpo;

    nhs = NULL;

    ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
    lb = load_balance_get(dpo->dpoi_index);
    fixed_nhs = load_balance_multipath_next_hop_fixup(raw_nhs, lb->lb_proto);
    n_buckets =
        ip_multipath_normalize_next_hops((NULL == fixed_nhs ?
                                          raw_nhs :
                                          fixed_nhs),
                                         &nhs,
                                         &sum_of_weights,
                                         multipath_next_hop_error_tolerance);

    ASSERT (n_buckets >= vec_len (raw_nhs));

    /*
     * Save the old load-balance map used, and get a new one if required.
     */
    old_lbmi = lb->lb_map;
    if (flags & LOAD_BALANCE_FLAG_USES_MAP)
    {
        lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs);
    }
    else
    {
        lbmi = INDEX_INVALID;
    }

    if (0 == lb->lb_n_buckets)
    {
        /*
         * first time initialisation. no packets inflight, so we can write
         * at leisure.
         */
        load_balance_set_n_buckets(lb, n_buckets);

        if (!LB_HAS_INLINE_BUCKETS(lb))
            vec_validate_aligned(lb->lb_buckets,
                                 lb->lb_n_buckets - 1,
                                 CLIB_CACHE_LINE_BYTES);

        load_balance_fill_buckets(lb, nhs,
                                  load_balance_get_buckets(lb),
                                  n_buckets);
        lb->lb_map = lbmi;
    }
    else
    {
        /*
         * This is a modification of an existing load-balance.
         * We need to ensure that packets inflight see a consistent state, that
         * is the number of reported buckets the LB has (read from
         * lb_n_buckets_minus_1) is not more than it actually has. So if the
         * number of buckets is increasing, we must update the bucket array first,
         * then the reported number. vice-versa if the number of buckets goes down.
         */
        if (n_buckets == lb->lb_n_buckets)
        {
            /*
             * no change in the number of buckets. we can simply fill what
             * is new over what is old.
             */
            load_balance_fill_buckets(lb, nhs,
                                      load_balance_get_buckets(lb),
                                      n_buckets);
            lb->lb_map = lbmi;
        }
        else if (n_buckets > lb->lb_n_buckets)
        {
            /*
             * we have more buckets. the old load-balance map (if there is one)
             * will remain valid, i.e. mapping to indices within range, so we
             * update it last.
             */
            if (n_buckets > LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new increased number of buckets is crossing the threshold
                 * from inline storage to out-of-line. Alloc the out-of-line
                 * buckets first, then fixup the number, then reset the inlines.
                 */
                ASSERT(NULL == lb->lb_buckets);
                vec_validate_aligned(lb->lb_buckets,
                                     n_buckets - 1,
                                     CLIB_CACHE_LINE_BYTES);

                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets,
                                          n_buckets);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);

                CLIB_MEMORY_BARRIER();

                for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++)
                {
                    dpo_reset(&lb->lb_buckets_inline[ii]);
                }
            }
            else
            {
                if (n_buckets <= LB_NUM_INLINE_BUCKETS)
                {
                    /*
                     * we are not crossing the threshold and it's still inline buckets.
                     * we can write the new over the old.
                     */
                    load_balance_fill_buckets(lb, nhs,
                                              load_balance_get_buckets(lb),
                                              n_buckets);
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);
                }
                else
                {
                    /*
                     * we are not crossing the threshold. We need a new bucket array to
                     * hold the increased number of choices.
                     */
                    dpo_id_t *new_buckets, *old_buckets, *tmp_dpo;

                    new_buckets = NULL;
                    old_buckets = load_balance_get_buckets(lb);

                    vec_validate_aligned(new_buckets,
                                         n_buckets - 1,
                                         CLIB_CACHE_LINE_BYTES);

                    load_balance_fill_buckets(lb, nhs, new_buckets, n_buckets);
                    CLIB_MEMORY_BARRIER();
                    lb->lb_buckets = new_buckets;
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);

                    vec_foreach(tmp_dpo, old_buckets)
                    {
                        dpo_reset(tmp_dpo);
                    }
                    vec_free(old_buckets);
                }
            }

            /*
             * buckets fixed. ready for the MAP update.
             */
            lb->lb_map = lbmi;
        }
        else
        {
            /*
             * bucket size shrinkage.
             * Any map we have will be based on the old
             * larger number of buckets, so will be translating to indices
             * out of range. So the new MAP must be installed first.
             */
            lb->lb_map = lbmi;
            CLIB_MEMORY_BARRIER();

            if (n_buckets <= LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new decreased number of buckets is crossing the threshold
                 * from out-of-line storage to inline:
                 *   1 - Fill the inline buckets,
                 *   2 - fixup the number (at this point the inline buckets are
                 *       in use).
                 *   3 - free the out-of-line buckets
                 */
                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets_inline,
                                          n_buckets);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                vec_foreach(tmp_dpo, lb->lb_buckets)
                {
                    dpo_reset(tmp_dpo);
                }
                vec_free(lb->lb_buckets);
            }
            else
            {
                /*
                 * not crossing the threshold.
                 *  1 - update the number to the smaller size
                 *  2 - write the new buckets
                 *  3 - reset those no longer used.
                 */
                dpo_id_t *buckets;
                u32 old_n_buckets;

                old_n_buckets = lb->lb_n_buckets;
                buckets = load_balance_get_buckets(lb);

                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                load_balance_fill_buckets(lb, nhs,
                                          buckets,
                                          n_buckets);

                for (ii = n_buckets; ii < old_n_buckets; ii++)
                {
                    dpo_reset(&buckets[ii]);
                }
            }
        }
    }

    vec_foreach (nh, nhs)
    {
        dpo_reset(&nh->path_dpo);
    }
    vec_free(nhs);
    vec_free(fixed_nhs);

    load_balance_map_unlock(old_lbmi);
}
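
/*
 * Typical control-plane usage (illustrative sketch, not a call-site in
 * this file): create an empty load-balance, then install the path set.
 * The zero initial bucket count takes the first-time-initialisation path
 * above.
 *
 *   dpo_id_t dpo = DPO_INVALID;
 *   dpo_set(&dpo, DPO_LOAD_BALANCE, DPO_PROTO_IP4,
 *           load_balance_create(0, DPO_PROTO_IP4, IP_FLOW_HASH_DEFAULT));
 *   load_balance_multipath_update(&dpo, paths, LOAD_BALANCE_FLAG_NONE);
 *
 * where 'paths' is a caller-built vector of load_balance_path_t
 * (hypothetical here).
 */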

static void
load_balance_lock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks++;
}

static void
load_balance_destroy (load_balance_t *lb)
{
    dpo_id_t *buckets;
    int i;

    buckets = load_balance_get_buckets(lb);

    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        dpo_reset(&buckets[i]);
    }

    LB_DBG(lb, "destroy");
    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_free(lb->lb_buckets);
    }

    fib_urpf_list_unlock(lb->lb_urpf);
    load_balance_map_unlock(lb->lb_map);

    pool_put(load_balance_pool, lb);
}

static void
load_balance_unlock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks--;

    if (0 == lb->lb_locks)
    {
        load_balance_destroy(lb);
    }
}

static void
load_balance_mem_show (void)
{
    fib_show_memory_usage("load-balance",
                          pool_elts(load_balance_pool),
                          pool_len(load_balance_pool),
                          sizeof(load_balance_t));
    load_balance_map_show_mem();
}

const static dpo_vft_t lb_vft = {
    .dv_lock = load_balance_lock,
    .dv_unlock = load_balance_unlock,
    .dv_format = format_load_balance_dpo,
    .dv_mem_show = load_balance_mem_show,
};

/**
 * @brief The per-protocol VLIB graph nodes that are assigned to a load-balance
 *        object.
 *
 * this means that these graph nodes are ones from which a load-balance is the
 * parent object in the DPO-graph.
 *
 * We do not list all the load-balance nodes, such as the *-lookup nodes.
 * Instead we rely on the correct use of the .sibling_of field when setting
 * up these sibling nodes.
 */
const static char* const load_balance_ip4_nodes[] =
{
    "ip4-load-balance",
    NULL,
};
const static char* const load_balance_ip6_nodes[] =
{
    "ip6-load-balance",
    NULL,
};
const static char* const load_balance_mpls_nodes[] =
{
    "mpls-load-balance",
    NULL,
};
const static char* const load_balance_l2_nodes[] =
{
    "l2-load-balance",
    NULL,
};
const static char* const load_balance_nsh_nodes[] =
{
    "nsh-load-balance",
    NULL,
};
const static char* const * const load_balance_nodes[DPO_PROTO_NUM] =
{
    [DPO_PROTO_IP4]  = load_balance_ip4_nodes,
    [DPO_PROTO_IP6]  = load_balance_ip6_nodes,
    [DPO_PROTO_MPLS] = load_balance_mpls_nodes,
    [DPO_PROTO_ETHERNET] = load_balance_l2_nodes,
    [DPO_PROTO_NSH] = load_balance_nsh_nodes,
};

void
load_balance_module_init (void)
{
    index_t lbi;

    dpo_register(DPO_LOAD_BALANCE, &lb_vft, load_balance_nodes);

    /*
     * Special LB with index zero. we need to define this since the v4 mtrie
     * assumes an index of 0 implies the ply is empty. therefore all 'real'
     * adjs need a non-zero index.
     * This should never be used, but just in case, stack it on a drop.
     */
    lbi = load_balance_create(1, DPO_PROTO_IP4, 0);
    load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4));

    load_balance_map_module_init();
}

static clib_error_t *
load_balance_show (vlib_main_t * vm,
                   unformat_input_t * input,
                   vlib_cli_command_t * cmd)
{
    index_t lbi = INDEX_INVALID;

    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
        if (unformat (input, "%d", &lbi))
            ;
        else
            break;
    }

    if (INDEX_INVALID != lbi)
    {
        vlib_cli_output (vm, "%U", format_load_balance, lbi,
                         LOAD_BALANCE_FORMAT_DETAIL);
    }
    else
    {
        load_balance_t *lb;

        pool_foreach(lb, load_balance_pool,
        ({
            vlib_cli_output (vm, "%U", format_load_balance,
                             load_balance_get_index(lb),
                             LOAD_BALANCE_FORMAT_NONE);
        }));
    }

    return 0;
}

VLIB_CLI_COMMAND (load_balance_show_command, static) = {
    .path = "show load-balance",
    .short_help = "show load-balance [<index>]",
    .function = load_balance_show,
};

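/*
 * Hash an IP packet whose header starts at 'data'. The version nibble
 * selects the v4 or v6 flow hash; the caller must ensure the buffer
 * really holds an IP header.
 */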
always_inline u32
ip_flow_hash (void *data)
{
  ip4_header_t *iph = (ip4_header_t *) data;

  if ((iph->ip_version_and_header_length & 0xF0) == 0x40)
    return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT);
  else
    return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT);
}

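/*
 * Read a 6-byte MAC address as the low 48 bits of a u64. Note this loads
 * 8 bytes and masks off the top two; the bytes read beyond the address
 * remain within the Ethernet header, and the mask keeps m[0..5] on a
 * little-endian host.
 */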
always_inline u64
mac_to_u64 (u8 * m)
{
  return (*((u64 *) m) & 0xffffffffffff);
}

always_inline u32
l2_flow_hash (vlib_buffer_t * b0)
{
  ethernet_header_t *eh;
  u64 a, b, c;
  uword is_ip, eh_size;
  u16 eh_type;

  eh = vlib_buffer_get_current (b0);
  eh_type = clib_net_to_host_u16 (eh->type);
  eh_size = ethernet_buffer_header_size (b0);

  is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6);

  /* since we have 2 cache lines, use them */
  if (is_ip)
    a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size);
  else
    a = eh->type;

  b = mac_to_u64 ((u8 *) eh->dst_address);
  c = mac_to_u64 ((u8 *) eh->src_address);
  hash_mix64 (a, b, c);

  return (u32) c;
}

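/*
 * Per-buffer trace data for the load-balance graph nodes: records which
 * load-balance object the packet used.
 */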
typedef struct load_balance_trace_t_
{
    index_t lb_index;
} load_balance_trace_t;

static uword
l2_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          /* lookup dst + src mac */
          lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0);

          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

static u8 *
format_l2_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "L2-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief The L2 load-balance graph node
 */
VLIB_REGISTER_NODE (l2_load_balance_node) = {
  .function = l2_load_balance,
  .name = "l2-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_l2_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};

static uword
nsh_load_balance (vlib_main_t * vm,
                  vlib_node_runtime_t * node,
                  vlib_frame_t * frame)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0, *nsh0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          /* SPI + SI are the second word of the NSH header */
          nsh0 = vlib_buffer_get_current (b0);
          vnet_buffer(b0)->ip.flow_hash = nsh0[1] % lb0->lb_n_buckets;

          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

static u8 *
format_nsh_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "NSH-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief The NSH load-balance graph node
 */
VLIB_REGISTER_NODE (nsh_load_balance_node) = {
  .function = nsh_load_balance,
  .name = "nsh-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_nsh_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};