ip: Path MTU
[vpp.git] / src / vnet / dpo / load_balance.c
index af054f1..a212532 100644 (file)
@@ -13,7 +13,6 @@
  * limitations under the License.
  */
 
-#include <vnet/ip/lookup.h>
 #include <vnet/dpo/load_balance.h>
 #include <vnet/dpo/load_balance_map.h>
 #include <vnet/dpo/drop_dpo.h>
 #include <vnet/adj/adj.h>
 #include <vnet/adj/adj_internal.h>
 #include <vnet/fib/fib_urpf_list.h>
+#include <vnet/bier/bier_fwd.h>
+#include <vnet/fib/mpls_fib.h>
+#include <vnet/ip/ip4_inlines.h>
+#include <vnet/ip/ip6_inlines.h>
+
+// clang-format off
 
 /*
  * distribution error tolerance for load-balancing
  */
 const f64 multipath_next_hop_error_tolerance = 0.1;
 
-#undef LB_DEBUG
+static const char *load_balance_attr_names[] = LOAD_BALANCE_ATTR_NAMES;
+
+/**
+ * the logger
+ */
+vlib_log_class_t load_balance_logger;
 
-#ifdef LB_DEBUG
 #define LB_DBG(_lb, _fmt, _args...)                                     \
 {                                                                       \
-    u8* _tmp =NULL;                                                     \
-    clib_warning("lb:[%s]:" _fmt,                                       \
-                 load_balance_format(load_balance_get_index((_lb)),     \
-                                     0, _tmp),                          \
-                 ##_args);                                              \
-    vec_free(_tmp);                                                     \
+    vlib_log_debug(load_balance_logger,                                 \
+                   "lb:[%U]:" _fmt,                                     \
+                   format_load_balance, load_balance_get_index(_lb),    \
+                   LOAD_BALANCE_FORMAT_NONE,                            \
+                   ##_args);                                            \
 }
-#else
-#define LB_DBG(_p, _fmt, _args...)
-#endif
-
 
 /**
  * Pool of all DPOs. It's not static so the DP can have fast access
@@ -52,7 +56,16 @@ load_balance_t *load_balance_pool;
 /**
  * The one instance of load-balance main
  */
-load_balance_main_t load_balance_main;
+load_balance_main_t load_balance_main = {
+    /* each counter set carries a name and a stats-segment path */
+    .lbm_via_counters = {
+        .stat_segment_name = "/net/route/via",
+        .name = "route-via",
+    },
+    .lbm_to_counters = {
+        .stat_segment_name = "/net/route/to",
+        .name = "route-to",
+    },
+};
 
 f64
 load_balance_get_multipath_tolerance (void)
@@ -83,12 +96,33 @@ static load_balance_t *
 load_balance_alloc_i (void)
 {
     load_balance_t *lb;
+    u8 need_barrier_sync = 0;
+    vlib_main_t *vm = vlib_get_main();
+    ASSERT (vm->thread_index == 0);
+
+    pool_get_aligned_will_expand (load_balance_pool, need_barrier_sync,
+                                  CLIB_CACHE_LINE_BYTES);
+    if (need_barrier_sync)
+        vlib_worker_thread_barrier_sync (vm);
 
     pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES);
-    memset(lb, 0, sizeof(*lb));
+    clib_memset(lb, 0, sizeof(*lb));
 
     lb->lb_map = INDEX_INVALID;
     lb->lb_urpf = INDEX_INVALID;
+
+    if (need_barrier_sync == 0)
+    {
+        need_barrier_sync += vlib_validate_combined_counter_will_expand
+            (&(load_balance_main.lbm_to_counters),
+             load_balance_get_index(lb));
+        need_barrier_sync += vlib_validate_combined_counter_will_expand
+            (&(load_balance_main.lbm_via_counters),
+             load_balance_get_index(lb));
+        if (need_barrier_sync)
+            vlib_worker_thread_barrier_sync (vm);
+    }
+
     vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters),
                                    load_balance_get_index(lb));
     vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters),
@@ -98,6 +132,9 @@ load_balance_alloc_i (void)
     vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters),
                                load_balance_get_index(lb));
 
+    if (need_barrier_sync)
+        vlib_worker_thread_barrier_release (vm);
+
     return (lb);
 }
 
@@ -121,6 +158,21 @@ load_balance_format (index_t lbi,
     s = format(s, "[proto:%U ", format_dpo_proto, lb->lb_proto);
     s = format(s, "index:%d buckets:%d ", lbi, lb->lb_n_buckets);
     s = format(s, "uRPF:%d ", lb->lb_urpf);
+    if (lb->lb_flags)
+    {
+        load_balance_attr_t attr;
+
+        s = format(s, "flags:[");
+
+        FOR_EACH_LOAD_BALANCE_ATTR(attr)
+        {
+            if (lb->lb_flags & (1 << attr))
+            {
+                s = format (s, "%s", load_balance_attr_names[attr]);
+            }
+        }
+        s = format(s, "] ");
+    }
     s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes);
     if (0 != via.packets)
     {
@@ -154,6 +206,7 @@ format_load_balance (u8 * s, va_list * args)
 
     return (load_balance_format(lbi, flags, 0, s));
 }
+
 static u8*
 format_load_balance_dpo (u8 * s, va_list * args)
 {
@@ -163,6 +216,26 @@ format_load_balance_dpo (u8 * s, va_list * args)
     return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s));
 }
 
+/**
+ * Return the default flow-hash configuration for a DPO protocol.
+ *
+ * IPv4 and IPv6 share IP_FLOW_HASH_DEFAULT, MPLS uses
+ * MPLS_FLOW_HASH_DEFAULT; every other protocol yields 0.
+ */
+flow_hash_config_t
+load_balance_get_default_flow_hash (dpo_proto_t lb_proto)
+{
+    flow_hash_config_t dflt = 0;
+
+    switch (lb_proto)
+    {
+    case DPO_PROTO_MPLS:
+        dflt = MPLS_FLOW_HASH_DEFAULT;
+        break;
+
+    case DPO_PROTO_IP4:
+    case DPO_PROTO_IP6:
+        dflt = IP_FLOW_HASH_DEFAULT;
+        break;
+
+    case DPO_PROTO_ETHERNET:
+    case DPO_PROTO_BIER:
+    case DPO_PROTO_NSH:
+        /* no default for these protocols; keep 0 */
+        break;
+    }
+
+    return (dflt);
+}
 
 static load_balance_t *
 load_balance_create_i (u32 num_buckets,
@@ -239,6 +312,16 @@ load_balance_is_drop (const dpo_id_t *dpo)
     return (0);
 }
 
+/**
+ * Return the bucket count (lb_n_buckets) of the load-balance
+ * whose index is @a lbi.
+ */
+u16
+load_balance_n_buckets (index_t lbi)
+{
+    return (load_balance_get(lbi)->lb_n_buckets);
+}
+
 void
 load_balance_set_fib_entry_flags (index_t lbi,
                                   fib_entry_flag_t flags)
@@ -349,7 +432,7 @@ ip_multipath_normalize_next_hops (const load_balance_path_t * raw_next_hops,
     }
     else
     {
-        clib_memcpy (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
+        clib_memcpy_fast (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
         qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
     }
 
@@ -393,7 +476,7 @@ ip_multipath_normalize_next_hops (const load_balance_path_t * raw_next_hops,
                 /*
                  * when the weight skew is high (norm is small) and n == nf.
                  * without this correction the path with a low weight would have
-                 * no represenation in the load-balanace - don't want that.
+                 * no representation in the load-balance - don't want that.
                  * If the weight skew is high so the load-balance has many buckets
                  * to allow it. pays ya money takes ya choice.
                  */
@@ -447,12 +530,12 @@ load_balance_multipath_next_hop_fixup (const load_balance_path_t *nhs,
  * next hop adjacencies.
  */
 static void
-load_balance_fill_buckets (load_balance_t *lb,
-                           load_balance_path_t *nhs,
-                           dpo_id_t *buckets,
-                           u32 n_buckets)
+load_balance_fill_buckets_norm (load_balance_t *lb,
+                                load_balance_path_t *nhs,
+                                dpo_id_t *buckets,
+                                u32 n_buckets)
 {
-    load_balance_path_t * nh;
+    load_balance_path_t *nh;
     u16 ii, bucket;
 
     bucket = 0;
@@ -470,6 +553,69 @@ load_balance_fill_buckets (load_balance_t *lb,
         }
     }
 }
+/**
+ * Fill the buckets for a 'sticky' load-balance.
+ *
+ * Every path keeps the bucket positions its normalised weight gives it.
+ * A bucket belonging to a path that resolves via drop is filled instead
+ * with the DPO of a non-drop path, chosen round-robin. If every path is a
+ * drop, the substitutes are taken from the full path list.
+ */
+static void
+load_balance_fill_buckets_sticky (load_balance_t *lb,
+                                  load_balance_path_t *nhs,
+                                  dpo_id_t *buckets,
+                                  u32 n_buckets)
+{
+    load_balance_path_t *path, *live_paths = NULL;
+    u16 slot = 0, next_live = 0, n;
+
+    /* collect a copy of each path that is not a drop */
+    vec_foreach (path, nhs)
+    {
+        if (dpo_is_drop(&path->path_dpo))
+            continue;
+
+        vec_add1(live_paths, *path);
+    }
+
+    /* nothing forwards: fall back to the complete set of paths */
+    if (0 == vec_len(live_paths))
+        live_paths = vec_dup(nhs);
+
+    /*
+     * the weights are normalised, so summed over all the paths they
+     * give the number of buckets to fill.
+     */
+    vec_foreach (path, nhs)
+    {
+        for (n = 0; n < path->path_weight; n++)
+        {
+            dpo_id_t *chosen;
+
+            ASSERT(slot < n_buckets);
+
+            if (dpo_is_drop(&path->path_dpo))
+            {
+                /* fill this bucket from the next forwarding path */
+                chosen = &live_paths[next_live].path_dpo;
+                next_live = (next_live + 1) % vec_len(live_paths);
+            }
+            else
+            {
+                chosen = &path->path_dpo;
+            }
+
+            load_balance_set_bucket_i(lb, slot++, buckets, chosen);
+        }
+    }
+
+    vec_free(live_paths);
+}
+
+/**
+ * Fill the buckets from the normalised paths, using the sticky variant
+ * when LOAD_BALANCE_FLAG_STICKY is set in @a flags and the normal
+ * variant otherwise.
+ */
+static void
+load_balance_fill_buckets (load_balance_t *lb,
+                           load_balance_path_t *nhs,
+                           dpo_id_t *buckets,
+                           u32 n_buckets,
+                           load_balance_flags_t flags)
+{
+    if (!(flags & LOAD_BALANCE_FLAG_STICKY))
+    {
+        load_balance_fill_buckets_norm(lb, nhs, buckets, n_buckets);
+        return;
+    }
+
+    load_balance_fill_buckets_sticky(lb, nhs, buckets, n_buckets);
+}
 
 static inline void
 load_balance_set_n_buckets (load_balance_t *lb,
@@ -494,6 +640,7 @@ load_balance_multipath_update (const dpo_id_t *dpo,
 
     ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
     lb = load_balance_get(dpo->dpoi_index);
+    lb->lb_flags = flags;
     fixed_nhs = load_balance_multipath_next_hop_fixup(raw_nhs, lb->lb_proto);
     n_buckets =
         ip_multipath_normalize_next_hops((NULL == fixed_nhs ?
@@ -533,7 +680,7 @@ load_balance_multipath_update (const dpo_id_t *dpo,
 
         load_balance_fill_buckets(lb, nhs,
                                   load_balance_get_buckets(lb),
-                                  n_buckets);
+                                  n_buckets, flags);
         lb->lb_map = lbmi;
     }
     else
@@ -554,7 +701,7 @@ load_balance_multipath_update (const dpo_id_t *dpo,
              */
             load_balance_fill_buckets(lb, nhs,
                                       load_balance_get_buckets(lb),
-                                      n_buckets);
+                                      n_buckets, flags);
             lb->lb_map = lbmi;
         }
         else if (n_buckets > lb->lb_n_buckets)
@@ -579,7 +726,7 @@ load_balance_multipath_update (const dpo_id_t *dpo,
 
                 load_balance_fill_buckets(lb, nhs,
                                           lb->lb_buckets,
-                                          n_buckets);
+                                          n_buckets, flags);
                 CLIB_MEMORY_BARRIER();
                 load_balance_set_n_buckets(lb, n_buckets);
 
@@ -600,7 +747,7 @@ load_balance_multipath_update (const dpo_id_t *dpo,
                      */
                     load_balance_fill_buckets(lb, nhs,
                                               load_balance_get_buckets(lb),
-                                              n_buckets);
+                                              n_buckets, flags);
                     CLIB_MEMORY_BARRIER();
                     load_balance_set_n_buckets(lb, n_buckets);
                 }
@@ -619,7 +766,8 @@ load_balance_multipath_update (const dpo_id_t *dpo,
                                          n_buckets - 1,
                                          CLIB_CACHE_LINE_BYTES);
 
-                    load_balance_fill_buckets(lb, nhs, new_buckets, n_buckets);
+                    load_balance_fill_buckets(lb, nhs, new_buckets,
+                                              n_buckets, flags);
                     CLIB_MEMORY_BARRIER();
                     lb->lb_buckets = new_buckets;
                     CLIB_MEMORY_BARRIER();
@@ -663,7 +811,7 @@ load_balance_multipath_update (const dpo_id_t *dpo,
                  */
                 load_balance_fill_buckets(lb, nhs,
                                           lb->lb_buckets_inline,
-                                          n_buckets);
+                                          n_buckets, flags);
                 CLIB_MEMORY_BARRIER();
                 load_balance_set_n_buckets(lb, n_buckets);
                 CLIB_MEMORY_BARRIER();
@@ -691,9 +839,8 @@ load_balance_multipath_update (const dpo_id_t *dpo,
                 load_balance_set_n_buckets(lb, n_buckets);
                 CLIB_MEMORY_BARRIER();
 
-                load_balance_fill_buckets(lb, nhs,
-                                          buckets,
-                                          n_buckets);
+                load_balance_fill_buckets(lb, nhs, buckets,
+                                          n_buckets, flags);
 
                 for (ii = n_buckets; ii < old_n_buckets; ii++)
                 {
@@ -773,11 +920,30 @@ load_balance_mem_show (void)
     load_balance_map_show_mem();
 }
 
+/**
+ * Return the smallest value dpo_get_mtu() gives for any bucket of the
+ * load-balance identified by @a dpo. With no buckets the result is 0xffff.
+ */
+static u16
+load_balance_dpo_get_mtu (const dpo_id_t *dpo)
+{
+    load_balance_t *lb = load_balance_get(dpo->dpoi_index);
+    const dpo_id_t *bkts = load_balance_get_buckets(lb);
+    u16 lowest = 0xffff;
+    u16 bi = 0;
+
+    while (bi < lb->lb_n_buckets)
+    {
+        lowest = clib_min (lowest, dpo_get_mtu (&bkts[bi]));
+        bi++;
+    }
+
+    return (lowest);
+}
+
 const static dpo_vft_t lb_vft = {
     .dv_lock = load_balance_lock,
     .dv_unlock = load_balance_unlock,
     .dv_format = format_load_balance_dpo,
     .dv_mem_show = load_balance_mem_show,
+    .dv_get_mtu = load_balance_dpo_get_mtu,
 };
 
 /**
@@ -814,6 +980,11 @@ const static char* const load_balance_l2_nodes[] =
 const static char* const load_balance_nsh_nodes[] =
 {
     "nsh-load-balance",
+    NULL
+};
+const static char* const load_balance_bier_nodes[] =
+{
+    "bier-load-balance",
     NULL,
 };
 const static char* const * const load_balance_nodes[DPO_PROTO_NUM] =
@@ -823,6 +994,7 @@ const static char* const * const load_balance_nodes[DPO_PROTO_NUM] =
     [DPO_PROTO_MPLS] = load_balance_mpls_nodes,
     [DPO_PROTO_ETHERNET] = load_balance_l2_nodes,
     [DPO_PROTO_NSH] = load_balance_nsh_nodes,
+    [DPO_PROTO_BIER] = load_balance_bier_nodes,
 };
 
 void
@@ -841,6 +1013,9 @@ load_balance_module_init (void)
     lbi = load_balance_create(1, DPO_PROTO_IP4, 0);
     load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4));
 
+    load_balance_logger =
+        vlib_log_register_class("dpo", "load-balance");
+
     load_balance_map_module_init();
 }
 
@@ -861,19 +1036,26 @@ load_balance_show (vlib_main_t * vm,
 
     if (INDEX_INVALID != lbi)
     {
-        vlib_cli_output (vm, "%U", format_load_balance, lbi,
+        if (pool_is_free_index(load_balance_pool, lbi))
+        {
+            vlib_cli_output (vm, "no such load-balance:%d", lbi);
+        }
+        else
+        {
+            vlib_cli_output (vm, "%U", format_load_balance, lbi,
                          LOAD_BALANCE_FORMAT_DETAIL);
+        }
     }
     else
     {
         load_balance_t *lb;
 
-        pool_foreach(lb, load_balance_pool,
-        ({
+        pool_foreach (lb, load_balance_pool)
+         {
             vlib_cli_output (vm, "%U", format_load_balance,
                              load_balance_get_index(lb),
                              LOAD_BALANCE_FORMAT_NONE);
-        }));
+        }
     }
 
     return 0;
@@ -935,10 +1117,11 @@ typedef struct load_balance_trace_t_
     index_t lb_index;
 } load_balance_trace_t;
 
-static uword
-l2_load_balance (vlib_main_t * vm,
-                vlib_node_runtime_t * node,
-                vlib_frame_t * frame)
+always_inline uword
+load_balance_inline (vlib_main_t * vm,
+                    vlib_node_runtime_t * node,
+                    vlib_frame_t * frame,
+                    int is_l2)
 {
   u32 n_left_from, next_index, *from, *to_next;
 
@@ -973,9 +1156,18 @@ l2_load_balance (vlib_main_t * vm,
          lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);
 
-         vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0);
-
-         dpo0 = load_balance_get_bucket_i(lb0, 
+         if (is_l2)
+         {
+             vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0);
+         }
+         else
+         {
+             /* it's BIER */
+             const bier_hdr_t *bh0 = vlib_buffer_get_current(b0);
+             vnet_buffer(b0)->ip.flow_hash = bier_compute_flow_hash(bh0);
+         }
+
+         dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));
 
@@ -998,6 +1190,14 @@ l2_load_balance (vlib_main_t * vm,
   return frame->n_vectors;
 }
 
+/**
+ * L2 load-balance node function: runs the shared load_balance_inline
+ * loop with the is_l2 argument set.
+ */
+static uword
+l2_load_balance (vlib_main_t * vm,
+                vlib_node_runtime_t * node,
+                vlib_frame_t * frame)
+{
+    const int is_l2 = 1;
+
+    return (load_balance_inline(vm, node, frame, is_l2));
+}
+
 static u8 *
 format_l2_load_balance_trace (u8 * s, va_list * args)
 {
@@ -1113,3 +1313,36 @@ VLIB_REGISTER_NODE (nsh_load_balance_node) = {
       [0] = "error-drop",
   },
 };
+
+/**
+ * Format a BIER load-balance trace record: prints the load-balance index
+ * stored in the trace.
+ */
+static u8 *
+format_bier_load_balance_trace (u8 * s, va_list * args)
+{
+  load_balance_trace_t *trace;
+
+  /* the vlib main and node arguments come first and are not used */
+  (void) va_arg (*args, vlib_main_t *);
+  (void) va_arg (*args, vlib_node_t *);
+  trace = va_arg (*args, load_balance_trace_t *);
+
+  return format (s, "BIER-load-balance: index %d", trace->lb_index);
+}
+
+/**
+ * BIER load-balance node function: runs the shared load_balance_inline
+ * loop with the is_l2 argument clear.
+ */
+static uword
+bier_load_balance (vlib_main_t * vm,
+                  vlib_node_runtime_t * node,
+                  vlib_frame_t * frame)
+{
+    const int is_l2 = 0;
+
+    return (load_balance_inline(vm, node, frame, is_l2));
+}
+
+/**
+ * @brief Registration of the "bier-load-balance" graph node, which runs
+ * bier_load_balance and is declared a sibling of "mpls-load-balance".
+ */
+VLIB_REGISTER_NODE (bier_load_balance_node) = {
+  .name = "bier-load-balance",
+  .function = bier_load_balance,
+  .format_trace = format_bier_load_balance_trace,
+
+  .vector_size = sizeof (u32),
+  .sibling_of = "mpls-load-balance",
+};
+
+// clang-format on