FIB Memory Usage Diagnostics
vnet/vnet/dpo/load_balance.c (vpp.git)
/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vnet/ip/lookup.h>
#include <vnet/dpo/load_balance.h>
#include <vnet/dpo/load_balance_map.h>
#include <vnet/dpo/drop_dpo.h>
#include <vppinfra/math.h>              /* for fabs */
#include <vnet/adj/adj.h>
#include <vnet/adj/adj_internal.h>

/*
 * distribution error tolerance for load-balancing
 */
const f64 multipath_next_hop_error_tolerance = 0.1;

#undef LB_DEBUG

#ifdef LB_DEBUG
#define LB_DBG(_lb, _fmt, _args...)                                     \
{                                                                       \
    u8 *_tmp = NULL;                                                    \
    clib_warning("lb:[%s]:" _fmt,                                       \
                 load_balance_format(load_balance_get_index((_lb)),     \
                                     0, 0, _tmp),                       \
                 ##_args);                                              \
    vec_free(_tmp);                                                     \
}
#else
#define LB_DBG(_p, _fmt, _args...)
#endif


/**
 * Pool of all DPOs. It's not static so the DP can have fast access
 */
load_balance_t *load_balance_pool;

/**
 * The one instance of load-balance main
 */
load_balance_main_t load_balance_main;

f64
load_balance_get_multipath_tolerance (void)
{
    return (multipath_next_hop_error_tolerance);
}

static inline index_t
load_balance_get_index (const load_balance_t *lb)
{
    return (lb - load_balance_pool);
}

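/*
 * A load-balance stores its first few buckets inline in the object itself
 * and falls back to a separately allocated, cache-line-aligned vector for
 * larger bucket counts. LB_HAS_INLINE_BUCKETS() (declared in
 * load_balance.h) is assumed here to be a simple threshold test of the
 * form: lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS.
 */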
static inline dpo_id_t*
load_balance_get_buckets (load_balance_t *lb)
{
    if (LB_HAS_INLINE_BUCKETS(lb))
    {
        return (lb->lb_buckets_inline);
    }
    else
    {
        return (lb->lb_buckets);
    }
}

static load_balance_t *
load_balance_alloc_i (void)
{
    load_balance_t *lb;

    pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES);
    memset(lb, 0, sizeof(*lb));

    lb->lb_map = INDEX_INVALID;
    vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters),
                                   load_balance_get_index(lb));
    vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters),
                                   load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters),
                               load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters),
                               load_balance_get_index(lb));

    return (lb);
}

static u8*
load_balance_format (index_t lbi,
                     load_balance_format_flags_t flags,
                     u32 indent,
                     u8 *s)
{
    vlib_counter_t to, via;
    load_balance_t *lb;
    dpo_id_t *buckets;
    u32 i;

    lb = load_balance_get(lbi);
    vlib_get_combined_counter(&(load_balance_main.lbm_to_counters), lbi, &to);
    vlib_get_combined_counter(&(load_balance_main.lbm_via_counters), lbi, &via);
    buckets = load_balance_get_buckets(lb);

    s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE);
    s = format(s, "[index:%d buckets:%d ", lbi, lb->lb_n_buckets);
    s = format(s, "locks:%d ", lb->lb_locks);
    s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes);
    if (0 != via.packets)
    {
        s = format(s, " via:[%Ld:%Ld]",
                   via.packets, via.bytes);
    }
    s = format(s, "]");

    if (INDEX_INVALID != lb->lb_map)
    {
        s = format(s, "\n%U%U",
                   format_white_space, indent+4,
                   format_load_balance_map, lb->lb_map, indent+4);
    }
    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        s = format(s, "\n%U[%d] %U",
                   format_white_space, indent+2,
                   i,
                   format_dpo_id,
                   &buckets[i], indent+6);
    }
    return (s);
}

u8*
format_load_balance (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    load_balance_format_flags_t flags = va_arg(*args, load_balance_format_flags_t);

    return (load_balance_format(lbi, flags, 0, s));
}

static u8*
format_load_balance_dpo (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    u32 indent = va_arg(*args, u32);

    return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s));
}


static load_balance_t *
load_balance_create_i (u32 num_buckets,
                       dpo_proto_t lb_proto,
                       flow_hash_config_t fhc)
{
    load_balance_t *lb;

    lb = load_balance_alloc_i();
    lb->lb_hash_config = fhc;
    lb->lb_n_buckets = num_buckets;
    lb->lb_n_buckets_minus_1 = num_buckets-1;
    lb->lb_proto = lb_proto;

    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_validate_aligned(lb->lb_buckets,
                             lb->lb_n_buckets - 1,
                             CLIB_CACHE_LINE_BYTES);
    }

    LB_DBG(lb, "create");

    return (lb);
}

index_t
load_balance_create (u32 n_buckets,
                     dpo_proto_t lb_proto,
                     flow_hash_config_t fhc)
{
    return (load_balance_get_index(load_balance_create_i(n_buckets, lb_proto, fhc)));
}

static inline void
load_balance_set_bucket_i (load_balance_t *lb,
                           u32 bucket,
                           dpo_id_t *buckets,
                           const dpo_id_t *next)
{
    dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, &buckets[bucket], next);
}
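
/*
 * Note: dpo_stack() establishes the child-parent relationship: it ensures
 * a VLIB graph edge exists from the load-balance nodes to the parent
 * 'next' DPO's node, then writes the parent's identity (with the correct
 * next-node index) into the bucket. A dpo_id_t is sized to fit in a
 * single 64-bit store, which is what makes this in-place bucket write
 * safe with respect to concurrent data-path readers.
 */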

void
load_balance_set_bucket (index_t lbi,
                         u32 bucket,
                         const dpo_id_t *next)
{
    load_balance_t *lb;
    dpo_id_t *buckets;

    lb = load_balance_get(lbi);
    buckets = load_balance_get_buckets(lb);

    ASSERT(bucket < lb->lb_n_buckets);

    load_balance_set_bucket_i(lb, bucket, buckets, next);
}

int
load_balance_is_drop (const dpo_id_t *dpo)
{
    load_balance_t *lb;

    if (DPO_LOAD_BALANCE != dpo->dpoi_type)
        return (0);

    lb = load_balance_get(dpo->dpoi_index);

    if (1 == lb->lb_n_buckets)
    {
        return (dpo_is_drop(load_balance_get_bucket_i(lb, 0)));
    }
    return (0);
}

const dpo_id_t *
load_balance_get_bucket (index_t lbi,
                         u32 bucket)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (load_balance_get_bucket_i(lb, bucket));
}

static int
next_hop_sort_by_weight (load_balance_path_t * n1,
                         load_balance_path_t * n2)
{
    return ((int) n1->path_weight - (int) n2->path_weight);
}

/* The given next-hop vector is overwritten with a normalized one: sorted
   by weight, with each weight scaled to the number of adjacencies assigned
   to that next hop. Returns the number of adjacencies in the block. */
u32
ip_multipath_normalize_next_hops (load_balance_path_t * raw_next_hops,
                                  load_balance_path_t ** normalized_next_hops,
                                  u32 *sum_weight_in,
                                  f64 multipath_next_hop_error_tolerance)
{
    load_balance_path_t * nhs;
    uword n_nhs, n_adj, n_adj_left, i, sum_weight;
    f64 norm, error;

    n_nhs = vec_len (raw_next_hops);
    ASSERT (n_nhs > 0);
    if (n_nhs == 0)
        return 0;

    /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */
    nhs = *normalized_next_hops;
    vec_validate (nhs, 2*n_nhs - 1);

    /* Fast path: 1 next hop in block. */
    n_adj = n_nhs;
    if (n_nhs == 1)
    {
        nhs[0] = raw_next_hops[0];
        nhs[0].path_weight = 1;
        _vec_len (nhs) = 1;
        sum_weight = 1;
        goto done;
    }

    else if (n_nhs == 2)
    {
        int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;

        /* Fast sort. */
        nhs[0] = raw_next_hops[cmp];
        nhs[1] = raw_next_hops[cmp ^ 1];

        /* Fast path: equal cost multipath with 2 next hops. */
        if (nhs[0].path_weight == nhs[1].path_weight)
        {
            nhs[0].path_weight = nhs[1].path_weight = 1;
            _vec_len (nhs) = 2;
            sum_weight = 2;
            goto done;
        }
    }
    else
    {
        clib_memcpy (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
        qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
    }

    /* Find total weight to normalize weights. */
    sum_weight = 0;
    for (i = 0; i < n_nhs; i++)
        sum_weight += nhs[i].path_weight;

    /* In the unlikely case that all weights are given as 0, set them all to 1. */
    if (sum_weight == 0)
    {
        for (i = 0; i < n_nhs; i++)
            nhs[i].path_weight = 1;
        sum_weight = n_nhs;
    }

    /* Save copies of all next hop weights to avoid being overwritten in loop below. */
    for (i = 0; i < n_nhs; i++)
        nhs[n_nhs + i].path_weight = nhs[i].path_weight;

    /* Try larger and larger power of 2 sized adjacency blocks until we
       find one whose distribution error is within the specified tolerance. */
    for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
    {
        error = 0;

        norm = n_adj / ((f64) sum_weight);
        n_adj_left = n_adj;
        for (i = 0; i < n_nhs; i++)
        {
            f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */
            word n = flt_round_nearest (nf);

            n = n > n_adj_left ? n_adj_left : n;
            n_adj_left -= n;
            error += fabs (nf - n);
            nhs[i].path_weight = n;
        }
        nhs[0].path_weight += n_adj_left;

        /* Average error per adjacency within the specified tolerance? */
        if (error <= multipath_next_hop_error_tolerance*n_adj)
        {
            /* Drop the saved-weight copies; keep the n_nhs normalized entries. */
            _vec_len (nhs) = i;
            break;
        }
    }

done:
    /* Save vector for next call. */
    *normalized_next_hops = nhs;
    *sum_weight_in = sum_weight;
    return n_adj;
}
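
/*
 * Worked example: two next hops with weights 1 and 2 and the default
 * tolerance of 0.1 (i.e. average error <= 0.1 per adjacency):
 *
 *   n_adj = 2: norm = 2/3, rounded weights {1, 1}, error = 2/3  > 0.2
 *   n_adj = 4: norm = 4/3, rounded weights {1, 3}, error = 2/3  > 0.4
 *   n_adj = 8: norm = 8/3, rounded weights {3, 5}, error = 2/3 <= 0.8
 *
 * so the load-balance gets 8 buckets, 3 for the first path and 5 for the
 * second: a 3:5 split approximating the requested 1:2.
 */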

static load_balance_path_t *
load_balance_multipath_next_hop_fixup (load_balance_path_t *nhs,
                                       dpo_proto_t drop_proto)
{
    if (0 == vec_len(nhs))
    {
        load_balance_path_t *nh;

        /*
         * we need something for the load-balance. so use the drop
         */
        vec_add2(nhs, nh, 1);

        nh->path_weight = 1;
        dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto));
    }

    return (nhs);
}

/*
 * Fill in adjacencies in block based on corresponding
 * next hop adjacencies.
 */
static void
load_balance_fill_buckets (load_balance_t *lb,
                           load_balance_path_t *nhs,
                           dpo_id_t *buckets,
                           u32 n_buckets)
{
    load_balance_path_t * nh;
    u16 ii, bucket;

    bucket = 0;

    /*
     * the next-hops have normalised weights. that means their sum is the number
     * of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
        }
    }
}

static inline void
load_balance_set_n_buckets (load_balance_t *lb,
                            u32 n_buckets)
{
    lb->lb_n_buckets = n_buckets;
    lb->lb_n_buckets_minus_1 = n_buckets-1;
}
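
/*
 * The bucket counts produced by ip_multipath_normalize_next_hops() are
 * powers of two, so lb_n_buckets_minus_1 doubles as a mask: the data path
 * selects a bucket with 'flow_hash & lb_n_buckets_minus_1' (see
 * l2_load_balance() below) rather than a modulo.
 */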

void
load_balance_multipath_update (const dpo_id_t *dpo,
                               load_balance_path_t * raw_next_hops,
                               load_balance_flags_t flags)
{
    u32 sum_of_weights, n_buckets, ii;
    load_balance_path_t * nh, * nhs;
    index_t lbmi, old_lbmi;
    load_balance_t *lb;
    dpo_id_t *tmp_dpo;

    nhs = NULL;

    ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
    lb = load_balance_get(dpo->dpoi_index);
    raw_next_hops =
        load_balance_multipath_next_hop_fixup(raw_next_hops,
                                              lb->lb_proto);
    n_buckets =
        ip_multipath_normalize_next_hops(raw_next_hops,
                                         &nhs,
                                         &sum_of_weights,
                                         multipath_next_hop_error_tolerance);

    ASSERT (n_buckets >= vec_len (raw_next_hops));

    /*
     * Save the old load-balance map used, and get a new one if required.
     */
    old_lbmi = lb->lb_map;
    if (flags & LOAD_BALANCE_FLAG_USES_MAP)
    {
        lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs);
    }
    else
    {
        lbmi = INDEX_INVALID;
    }

    if (0 == lb->lb_n_buckets)
    {
        /*
         * first time initialisation. no packets inflight, so we can write
         * at leisure.
         */
        load_balance_set_n_buckets(lb, n_buckets);

        if (!LB_HAS_INLINE_BUCKETS(lb))
            vec_validate_aligned(lb->lb_buckets,
                                 lb->lb_n_buckets - 1,
                                 CLIB_CACHE_LINE_BYTES);

        load_balance_fill_buckets(lb, nhs,
                                  load_balance_get_buckets(lb),
                                  n_buckets);
        lb->lb_map = lbmi;
    }
    else
    {
        /*
         * This is a modification of an existing load-balance.
         * We need to ensure that packets inflight see a consistent state, that
         * is the number of reported buckets the LB has (read from
         * lb_n_buckets_minus_1) is not more than it actually has. So if the
         * number of buckets is increasing, we must update the bucket array first,
         * then the reported number. Vice-versa if the number of buckets goes down.
         */
        if (n_buckets == lb->lb_n_buckets)
        {
            /*
             * no change in the number of buckets. we can simply fill what
             * is new over what is old.
             */
            load_balance_fill_buckets(lb, nhs,
                                      load_balance_get_buckets(lb),
                                      n_buckets);
            lb->lb_map = lbmi;
        }
        else if (n_buckets > lb->lb_n_buckets)
        {
            /*
             * we have more buckets. the old load-balance map (if there is one)
             * will remain valid, i.e. mapping to indices within range, so we
             * update it last.
             */
            if (n_buckets > LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new increased number of buckets is crossing the threshold
                 * from inline storage to out-of-line. Alloc the out-of-line
                 * buckets first, then fix up the number, then reset the inlines.
                 */
                ASSERT(NULL == lb->lb_buckets);
                vec_validate_aligned(lb->lb_buckets,
                                     n_buckets - 1,
                                     CLIB_CACHE_LINE_BYTES);

                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets,
                                          n_buckets);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);

                CLIB_MEMORY_BARRIER();

                for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++)
                {
                    dpo_reset(&lb->lb_buckets_inline[ii]);
                }
            }
            else
            {
                /*
                 * we are not crossing the threshold. we can write the new on the
                 * old, whether they be inline or not.
                 */
                load_balance_fill_buckets(lb, nhs,
                                          load_balance_get_buckets(lb),
                                          n_buckets);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);
            }

            /*
             * buckets fixed. ready for the MAP update.
             */
            lb->lb_map = lbmi;
        }
        else
        {
            /*
             * bucket size shrinkage.
             * Any map we have will be based on the old
             * larger number of buckets, so will be translating to indices
             * out of range. So the new MAP must be installed first.
             */
            lb->lb_map = lbmi;
            CLIB_MEMORY_BARRIER();

            if (n_buckets <= LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new decreased number of buckets is crossing the threshold
                 * from out-of-line storage to inline:
                 *   1 - Fill the inline buckets,
                 *   2 - fix up the number (at this point the inline buckets are
                 *       in use).
                 *   3 - free the out-of-line buckets
                 */
                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets_inline,
                                          n_buckets);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                vec_foreach(tmp_dpo, lb->lb_buckets)
                {
                    dpo_reset(tmp_dpo);
                }
                vec_free(lb->lb_buckets);
            }
            else
            {
                /*
                 * not crossing the threshold.
                 *  1 - update the number to the smaller size
                 *  2 - write the new buckets
                 *  3 - reset those no longer used.
                 */
                dpo_id_t *buckets;
                u32 old_n_buckets;

                old_n_buckets = lb->lb_n_buckets;
                buckets = load_balance_get_buckets(lb);

                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                load_balance_fill_buckets(lb, nhs,
                                          buckets,
                                          n_buckets);

                for (ii = n_buckets; ii < old_n_buckets; ii++)
                {
                    dpo_reset(&buckets[ii]);
                }
            }
        }
    }

    vec_foreach (nh, nhs)
    {
        dpo_reset(&nh->path_dpo);
    }
    vec_free(nhs);

    load_balance_map_unlock(old_lbmi);
}
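
#ifdef LB_EXAMPLE
/*
 * Illustrative sketch only (not part of the build): build a one-path
 * next-hop vector and apply it to an existing load-balance DPO. This
 * assumes LOAD_BALANCE_FLAG_NONE from load_balance.h and a caller-provided
 * child DPO. Note that the update consumes the path DPO references via
 * the dpo_reset() loop above, so the caller only frees the vector.
 */
static void
load_balance_example_update (dpo_id_t *lb_dpo, const dpo_id_t *child)
{
    load_balance_path_t *paths = NULL, *path;

    vec_add2(paths, path, 1);
    path->path_weight = 1;
    dpo_copy(&path->path_dpo, child);    /* take a reference on the child */

    load_balance_multipath_update(lb_dpo, paths, LOAD_BALANCE_FLAG_NONE);
    vec_free(paths);
}
#endif /* LB_EXAMPLE */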

static void
load_balance_lock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks++;
}

static void
load_balance_destroy (load_balance_t *lb)
{
    dpo_id_t *buckets;
    int i;

    buckets = load_balance_get_buckets(lb);

    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        dpo_reset(&buckets[i]);
    }

    LB_DBG(lb, "destroy");
    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_free(lb->lb_buckets);
    }

    pool_put(load_balance_pool, lb);
}

static void
load_balance_unlock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks--;

    if (0 == lb->lb_locks)
    {
        load_balance_destroy(lb);
    }
}

static void
load_balance_mem_show (void)
{
    fib_show_memory_usage("load-balance",
                          pool_elts(load_balance_pool),
                          pool_len(load_balance_pool),
                          sizeof(load_balance_t));
}
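
/*
 * This is the hook behind the FIB memory-usage diagnostics: for the
 * load-balance DPO type, fib_show_memory_usage() is given the in-use
 * element count (pool_elts), the allocated count (pool_len) and the
 * per-element size, from which the total pool memory follows as
 * pool_len * sizeof(load_balance_t).
 */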

const static dpo_vft_t lb_vft = {
    .dv_lock = load_balance_lock,
    .dv_unlock = load_balance_unlock,
    .dv_format = format_load_balance_dpo,
    .dv_mem_show = load_balance_mem_show,
};

/**
 * @brief The per-protocol VLIB graph nodes that are assigned to a load-balance
 *        object.
 *
 * This means that these graph nodes are ones from which a load-balance is the
 * parent object in the DPO-graph.
 *
 * We do not list all the load-balance nodes, such as the *-lookup nodes.
 * Instead we rely on the correct use of the .sibling_of field when setting
 * up these sibling nodes.
 */
const static char* const load_balance_ip4_nodes[] =
{
    "ip4-load-balance",
    NULL,
};
const static char* const load_balance_ip6_nodes[] =
{
    "ip6-load-balance",
    NULL,
};
const static char* const load_balance_mpls_nodes[] =
{
    "mpls-load-balance",
    NULL,
};
const static char* const load_balance_l2_nodes[] =
{
    "l2-load-balance",
    NULL,
};
const static char* const * const load_balance_nodes[DPO_PROTO_NUM] =
{
    [DPO_PROTO_IP4]  = load_balance_ip4_nodes,
    [DPO_PROTO_IP6]  = load_balance_ip6_nodes,
    [DPO_PROTO_MPLS] = load_balance_mpls_nodes,
    [DPO_PROTO_ETHERNET] = load_balance_l2_nodes,
};

void
load_balance_module_init (void)
{
    dpo_register(DPO_LOAD_BALANCE, &lb_vft, load_balance_nodes);

    load_balance_map_module_init();
}

static clib_error_t *
load_balance_show (vlib_main_t * vm,
                   unformat_input_t * input,
                   vlib_cli_command_t * cmd)
{
    index_t lbi = INDEX_INVALID;

    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
        if (unformat (input, "%d", &lbi))
            ;
        else
            break;
    }

    if (INDEX_INVALID != lbi)
    {
        vlib_cli_output (vm, "%U", format_load_balance, lbi,
                         LOAD_BALANCE_FORMAT_DETAIL);
    }
    else
    {
        load_balance_t *lb;

        pool_foreach(lb, load_balance_pool,
        ({
            vlib_cli_output (vm, "%U", format_load_balance,
                             load_balance_get_index(lb),
                             LOAD_BALANCE_FORMAT_NONE);
        }));
    }

    return 0;
}

VLIB_CLI_COMMAND (load_balance_show_command, static) = {
    .path = "show load-balance",
    .short_help = "show load-balance [<index>]",
    .function = load_balance_show,
};
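
/*
 * Example: 'show load-balance 1' prints the detailed format for the
 * load-balance with index 1; with no index, every object in the pool is
 * printed in the brief format.
 */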


always_inline u32
ip_flow_hash (void *data)
{
  ip4_header_t *iph = (ip4_header_t *) data;

  if ((iph->ip_version_and_header_length & 0xF0) == 0x40)
    return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT);
  else
    return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT);
}

always_inline u64
mac_to_u64 (u8 * m)
{
  return (*((u64 *) m) & 0xffffffffffff);
}

always_inline u32
l2_flow_hash (vlib_buffer_t * b0)
{
  ethernet_header_t *eh;
  u64 a, b, c;
  uword is_ip, eh_size;
  u16 eh_type;

  eh = vlib_buffer_get_current (b0);
  eh_type = clib_net_to_host_u16 (eh->type);
  eh_size = ethernet_buffer_header_size (b0);

  is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6);

  /* since we have 2 cache lines, use them */
  if (is_ip)
    a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size);
  else
    a = eh->type;

  b = mac_to_u64 ((u8 *) eh->dst_address);
  c = mac_to_u64 ((u8 *) eh->src_address);
  hash_mix64 (a, b, c);

  return (u32) c;
}

typedef struct load_balance_trace_t_
{
    index_t lb_index;
} load_balance_trace_t;

static uword
l2_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          /* lookup dst + src mac */
          lbi0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get (lbi0);

          vnet_buffer (b0)->ip.flow_hash = l2_flow_hash (b0);

          dpo0 = load_balance_get_bucket_i (lb0,
                                            vnet_buffer (b0)->ip.flow_hash &
                                            (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

static u8 *
format_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "L2-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief The L2 load-balance graph node
 */
VLIB_REGISTER_NODE (l2_load_balance_node) = {
  .function = l2_load_balance,
  .name = "l2-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};