A Protocol Independent Hierarchical FIB (VPP-352) 02/2502/17
author Neale Ranns <nranns@cisco.com>
Thu, 25 Aug 2016 14:29:12 +0000 (15:29 +0100)
committer Damjan Marion <dmarion.lists@gmail.com>
Wed, 21 Sep 2016 17:37:39 +0000 (17:37 +0000)
Main Enhancements:
 - Protocol Independent FIB API
 - Hierarchical FIB entries. Dynamic recursive route resolution.
 - Extranet Support.
 - Integration of IP and MPLS forwarding.
 - Separation of FIB and Adjacency databases.
 - Data-Plane Object forwarding model.

Change-Id: I52dc815c0d0aa8b493e3cf6b978568f3cc82296c
Signed-off-by: Neale Ranns <nranns@cisco.com>
197 files changed:
.gitignore
plugins/ila-plugin/ila/ila.c
plugins/ila-plugin/ila/ila.h
plugins/lb-plugin/lb/lb.c
plugins/lb-plugin/lb/node.c
plugins/sixrd-plugin/Makefile.am
plugins/sixrd-plugin/sixrd/sixrd.c
plugins/sixrd-plugin/sixrd/sixrd.h
plugins/sixrd-plugin/sixrd/sixrd_dpo.c [new file with mode: 0644]
plugins/sixrd-plugin/sixrd/sixrd_dpo.h [new file with mode: 0644]
plugins/vcgn-plugin/vcgn/vcgn_classify.c
vnet/Makefile.am
vnet/etc/scripts/arp4-mpls [new file with mode: 0644]
vnet/etc/scripts/lfib/ip4-to-mpls [new file with mode: 0644]
vnet/etc/scripts/lfib/mpls-pop-to-mpls [new file with mode: 0644]
vnet/etc/scripts/lfib/mpls-to-ip4 [new file with mode: 0644]
vnet/etc/scripts/lfib/mpls-to-mpls [new file with mode: 0644]
vnet/etc/scripts/mpls-o-ethernet/pg [new file with mode: 0644]
vnet/etc/scripts/mpls-o-ethernet/single.conf [new file with mode: 0644]
vnet/etc/scripts/source_and_port_range_check [new file with mode: 0644]
vnet/vnet/adj/adj.c [new file with mode: 0644]
vnet/vnet/adj/adj.h [new file with mode: 0644]
vnet/vnet/adj/adj_alloc.c [moved from vnet/vnet/ip/adj_alloc.c with 72% similarity]
vnet/vnet/adj/adj_alloc.h [moved from vnet/vnet/ip/adj_alloc.h with 87% similarity]
vnet/vnet/adj/adj_glean.c [new file with mode: 0644]
vnet/vnet/adj/adj_glean.h [new file with mode: 0644]
vnet/vnet/adj/adj_internal.h [new file with mode: 0644]
vnet/vnet/adj/adj_midchain.c [new file with mode: 0644]
vnet/vnet/adj/adj_midchain.h [new file with mode: 0644]
vnet/vnet/adj/adj_nbr.c [new file with mode: 0644]
vnet/vnet/adj/adj_nbr.h [new file with mode: 0644]
vnet/vnet/adj/adj_rewrite.c [new file with mode: 0644]
vnet/vnet/adj/adj_rewrite.h [new file with mode: 0644]
vnet/vnet/adj/adj_types.h [new file with mode: 0644]
vnet/vnet/classify/ip_classify.c
vnet/vnet/classify/vnet_classify.c
vnet/vnet/config.h
vnet/vnet/cop/ip4_whitelist.c
vnet/vnet/cop/ip6_whitelist.c
vnet/vnet/devices/dpdk/cli.c
vnet/vnet/devices/dpdk/node.c
vnet/vnet/devices/ssvm/node.c
vnet/vnet/dhcp/client.c
vnet/vnet/dhcp/proxy_node.c
vnet/vnet/dhcpv6/proxy_node.c
vnet/vnet/dpo/classify_dpo.c [new file with mode: 0644]
vnet/vnet/dpo/classify_dpo.h [new file with mode: 0644]
vnet/vnet/dpo/dpo.c [new file with mode: 0644]
vnet/vnet/dpo/dpo.h [new file with mode: 0644]
vnet/vnet/dpo/drop_dpo.c [new file with mode: 0644]
vnet/vnet/dpo/drop_dpo.h [new file with mode: 0644]
vnet/vnet/dpo/load_balance.c [new file with mode: 0644]
vnet/vnet/dpo/load_balance.h [new file with mode: 0644]
vnet/vnet/dpo/load_balance_map.c [new file with mode: 0644]
vnet/vnet/dpo/load_balance_map.h [new file with mode: 0644]
vnet/vnet/dpo/lookup_dpo.c [new file with mode: 0644]
vnet/vnet/dpo/lookup_dpo.h [new file with mode: 0644]
vnet/vnet/dpo/mpls_label_dpo.c [new file with mode: 0644]
vnet/vnet/dpo/mpls_label_dpo.h [new file with mode: 0644]
vnet/vnet/dpo/punt_dpo.c [new file with mode: 0644]
vnet/vnet/dpo/punt_dpo.h [new file with mode: 0644]
vnet/vnet/dpo/receive_dpo.c [new file with mode: 0644]
vnet/vnet/dpo/receive_dpo.h [new file with mode: 0644]
vnet/vnet/ethernet/arp.c
vnet/vnet/ethernet/ethernet.h
vnet/vnet/ethernet/interface.c
vnet/vnet/fib/fib.c [new file with mode: 0644]
vnet/vnet/fib/fib.h [new file with mode: 0644]
vnet/vnet/fib/fib_attached_export.c [new file with mode: 0644]
vnet/vnet/fib/fib_attached_export.h [new file with mode: 0644]
vnet/vnet/fib/fib_entry.c [new file with mode: 0644]
vnet/vnet/fib/fib_entry.h [new file with mode: 0644]
vnet/vnet/fib/fib_entry_cover.c [new file with mode: 0644]
vnet/vnet/fib/fib_entry_cover.h [new file with mode: 0644]
vnet/vnet/fib/fib_entry_src.c [new file with mode: 0644]
vnet/vnet/fib/fib_entry_src.h [new file with mode: 0644]
vnet/vnet/fib/fib_entry_src_adj.c [new file with mode: 0644]
vnet/vnet/fib/fib_entry_src_api.c [new file with mode: 0644]
vnet/vnet/fib/fib_entry_src_default.c [new file with mode: 0644]
vnet/vnet/fib/fib_entry_src_default_route.c [new file with mode: 0644]
vnet/vnet/fib/fib_entry_src_interface.c [new file with mode: 0644]
vnet/vnet/fib/fib_entry_src_lisp.c [new file with mode: 0644]
vnet/vnet/fib/fib_entry_src_mpls.c [new file with mode: 0644]
vnet/vnet/fib/fib_entry_src_rr.c [new file with mode: 0644]
vnet/vnet/fib/fib_entry_src_special.c [new file with mode: 0644]
vnet/vnet/fib/fib_internal.h [new file with mode: 0644]
vnet/vnet/fib/fib_node.c [new file with mode: 0644]
vnet/vnet/fib/fib_node.h [new file with mode: 0644]
vnet/vnet/fib/fib_node_list.c [new file with mode: 0644]
vnet/vnet/fib/fib_node_list.h [new file with mode: 0644]
vnet/vnet/fib/fib_path.c [new file with mode: 0644]
vnet/vnet/fib/fib_path.h [new file with mode: 0644]
vnet/vnet/fib/fib_path_ext.c [new file with mode: 0644]
vnet/vnet/fib/fib_path_ext.h [new file with mode: 0644]
vnet/vnet/fib/fib_path_list.c [new file with mode: 0644]
vnet/vnet/fib/fib_path_list.h [new file with mode: 0644]
vnet/vnet/fib/fib_table.c [new file with mode: 0644]
vnet/vnet/fib/fib_table.h [new file with mode: 0644]
vnet/vnet/fib/fib_test.c [new file with mode: 0644]
vnet/vnet/fib/fib_types.c [new file with mode: 0644]
vnet/vnet/fib/fib_types.h [new file with mode: 0644]
vnet/vnet/fib/fib_walk.c [new file with mode: 0644]
vnet/vnet/fib/fib_walk.h [new file with mode: 0644]
vnet/vnet/fib/ip4_fib.c [new file with mode: 0644]
vnet/vnet/fib/ip4_fib.h [new file with mode: 0644]
vnet/vnet/fib/ip6_fib.c [new file with mode: 0644]
vnet/vnet/fib/ip6_fib.h [new file with mode: 0644]
vnet/vnet/fib/mpls_fib.c [new file with mode: 0644]
vnet/vnet/fib/mpls_fib.h [new file with mode: 0644]
vnet/vnet/gre/gre.c
vnet/vnet/gre/gre.h
vnet/vnet/gre/interface.c
vnet/vnet/gre/node.c
vnet/vnet/handoff.c
vnet/vnet/handoff.h
vnet/vnet/interface.c
vnet/vnet/interface.h
vnet/vnet/interface_cli.c
vnet/vnet/interface_funcs.h
vnet/vnet/ip/format.h
vnet/vnet/ip/ip4.h
vnet/vnet/ip/ip4_forward.c
vnet/vnet/ip/ip4_mtrie.c
vnet/vnet/ip/ip4_mtrie.h
vnet/vnet/ip/ip4_source_and_port_range_check.c
vnet/vnet/ip/ip4_source_check.c
vnet/vnet/ip/ip4_test.c
vnet/vnet/ip/ip6.h
vnet/vnet/ip/ip6_forward.c
vnet/vnet/ip/ip6_hop_by_hop.c
vnet/vnet/ip/ip6_neighbor.c
vnet/vnet/ip/ip6_packet.h
vnet/vnet/ip/ip_feature_registration.c
vnet/vnet/ip/ip_feature_registration.h
vnet/vnet/ip/ip_source_and_port_range_check.h
vnet/vnet/ip/lookup.c
vnet/vnet/ip/lookup.h
vnet/vnet/ip/ping.c
vnet/vnet/ip/udp.h
vnet/vnet/ipsec-gre/ipsec_gre.c
vnet/vnet/lisp-cp/control.c
vnet/vnet/lisp-cp/control.h
vnet/vnet/lisp-cp/lisp_cp_dpo.c [new file with mode: 0644]
vnet/vnet/lisp-cp/lisp_cp_dpo.h [new file with mode: 0644]
vnet/vnet/lisp-cp/lisp_types.c
vnet/vnet/lisp-cp/lisp_types.h
vnet/vnet/lisp-gpe/interface.c
vnet/vnet/lisp-gpe/ip_forward.c
vnet/vnet/lisp-gpe/lisp_gpe.c
vnet/vnet/lisp-gpe/lisp_gpe.h
vnet/vnet/lisp-gpe/lisp_gpe_adjacency.c [new file with mode: 0644]
vnet/vnet/lisp-gpe/lisp_gpe_adjacency.h [new file with mode: 0644]
vnet/vnet/lisp-gpe/lisp_gpe_sub_interface.c [new file with mode: 0644]
vnet/vnet/lisp-gpe/lisp_gpe_sub_interface.h [new file with mode: 0644]
vnet/vnet/lisp-gpe/lisp_gpe_tunnel.c [new file with mode: 0644]
vnet/vnet/lisp-gpe/lisp_gpe_tunnel.h [new file with mode: 0644]
vnet/vnet/map/map.c
vnet/vnet/map/map.h
vnet/vnet/map/map_dpo.c [new file with mode: 0644]
vnet/vnet/map/map_dpo.h [new file with mode: 0644]
vnet/vnet/mcast/mcast_test.c
vnet/vnet/misc.c
vnet/vnet/mpls-gre/node.c [deleted file]
vnet/vnet/mpls-gre/packet.h [deleted file]
vnet/vnet/mpls/error.def [moved from vnet/vnet/mpls-gre/error.def with 91% similarity]
vnet/vnet/mpls/interface.c [moved from vnet/vnet/mpls-gre/interface.c with 69% similarity]
vnet/vnet/mpls/mpls.c [moved from vnet/vnet/mpls-gre/mpls.c with 74% similarity]
vnet/vnet/mpls/mpls.h [moved from vnet/vnet/mpls-gre/mpls.h with 65% similarity]
vnet/vnet/mpls/mpls_features.c [new file with mode: 0644]
vnet/vnet/mpls/mpls_lookup.c [new file with mode: 0644]
vnet/vnet/mpls/mpls_output.c [new file with mode: 0644]
vnet/vnet/mpls/mpls_types.h [new file with mode: 0644]
vnet/vnet/mpls/node.c [new file with mode: 0644]
vnet/vnet/mpls/packet.h [new file with mode: 0644]
vnet/vnet/mpls/pg.c [moved from vnet/vnet/mpls-gre/pg.c with 98% similarity]
vnet/vnet/mpls/policy_encap.c [moved from vnet/vnet/mpls-gre/policy_encap.c with 99% similarity]
vnet/vnet/pg/stream.c
vnet/vnet/rewrite.c
vnet/vnet/sr/sr.c
vnet/vnet/sr/sr.h
vnet/vnet/sr/sr_replicate.c
vnet/vnet/vxlan-gpe/vxlan_gpe.c
vnet/vnet/vxlan/vxlan.c
vpp-api-test/vat/api_format.c
vpp/app/vpe_cli.c
vpp/stats/stats.c
vpp/vpp-api/api.c
vpp/vpp-api/custom_dump.c
vpp/vpp-api/vpe.api
vpp/vpp-api/vpp_get_metrics.c
vppinfra/vppinfra/bihash_24_8.h
vppinfra/vppinfra/bihash_template.c
vppinfra/vppinfra/bihash_template.h
vppinfra/vppinfra/format.c
vppinfra/vppinfra/format.h
vppinfra/vppinfra/hash.h
vppinfra/vppinfra/vec.h

index a86a50c..05203e7 100644 (file)
@@ -59,12 +59,12 @@ test-driver
 # cscope and ctags
 /cscope.*
 /tags
+ID
+TAGS
 # ggtags
 GPATH
 GRTAGS
 GTAGS
-TAGS
-
 # Generated documentation
 /build-root/docs
 /build-root/.doxygen-bootstrap.ok
index 99d1db8..029dd21 100644 (file)
@@ -16,6 +16,8 @@
 #include <ila/ila.h>
 #include <vnet/plugin/plugin.h>
 #include <vnet/ip/lookup.h>
+#include <vnet/dpo/dpo.h>
+#include <vnet/fib/fib_table.h>
 
 static ila_main_t ila_main;
 
@@ -39,7 +41,6 @@ static char *ila_error_strings[] = {
 };
 
 typedef enum {
-  ILA_ILA2SIR_NEXT_IP6_REWRITE,
   ILA_ILA2SIR_NEXT_DROP,
   ILA_ILA2SIR_N_NEXT,
 } ila_ila2sir_next_t;
@@ -56,6 +57,16 @@ static ila_entry_t ila_sir2ila_default_entry = {
   .dir = ILA_DIR_ILA2SIR, //Will pass the packet with no
 };
 
+/**
+ * @brief Dynamically registered DPO Type for ILA
+ */
+static dpo_type_t ila_dpo_type;
+
+/**
+ * @brief Dynamically registered FIB node type for ILA
+ */
+static fib_node_type_t ila_fib_node_type;
+
 u8 *
 format_half_ip6_address (u8 * s, va_list * va)
 {
@@ -120,28 +131,29 @@ format_ila_entry (u8 * s, va_list * va)
   if (!e)
     {
       return format (s, "%-15s%=40s%=40s%+16s%+18s%+11s", "Type", "SIR Address",
-                    "ILA Address", "Adjacency Index", "Checksum Mode", "Direction");
-
+                    "ILA Address", "Checksum Mode", "Direction", "Next DPO");
     }
   else if (vnm)
     {
-      if (e->ila_adj_index == ~0)
+      if (ip6_address_is_zero(&e->next_hop))
        {
-         return format (s, "%-15U%=40U%=40U%16s%18U%11U",
+         return format (s, "%-15U%=40U%=40U%18U%11U%s",
                         format_ila_type, e->type,
                         format_ip6_address, &e->sir_address,
                         format_ip6_address, &e->ila_address,
-                        "n/a", format_csum_mode, e->csum_mode,
-                        format_ila_direction, e->dir);
+                        format_csum_mode, e->csum_mode,
+                        format_ila_direction, e->dir,
+                        "n/a");
        }
       else
        {
-         return format (s, "%-15U%=40U%=40U%16d%18U%11U",
+         return format (s, "%-15U%=40U%=40U%18U%11U%U",
                         format_ila_type, e->type,
                         format_ip6_address, &e->sir_address,
                         format_ip6_address, &e->ila_address,
-                        e->ila_adj_index, format_csum_mode, e->csum_mode,
-                        format_ila_direction, e->dir);
+                        format_csum_mode, e->csum_mode,
+                        format_ila_direction, e->dir,
+                        format_dpo_id, &e->ila_dpo, 0);
        }
     }
 
@@ -239,8 +251,6 @@ static uword
 ila_ila2sir (vlib_main_t * vm,
             vlib_node_runtime_t * node, vlib_frame_t * frame)
 {
-  ip6_main_t *im = &ip6_main;
-  ip_lookup_main_t *lm = &im->lookup_main;
   u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
   ila_main_t *ilm = &ila_main;
 
@@ -256,10 +266,8 @@ ila_ila2sir (vlib_main_t * vm,
        {
          u32 pi0, pi1;
          vlib_buffer_t *p0, *p1;
-         ip_adjacency_t *adj0, *adj1;
          ila_entry_t *ie0, *ie1;
          ip6_header_t *ip60, *ip61;
-         ila_adj_data_t *ad0, *ad1;
          ip6_address_t *sir_address0, *sir_address1;
 
          {
@@ -287,14 +295,10 @@ ila_ila2sir (vlib_main_t * vm,
          ip61 = vlib_buffer_get_current (p1);
          sir_address0 = &ip60->dst_address;
          sir_address1 = &ip61->dst_address;
-         adj0 =
-           ip_get_adjacency (lm, vnet_buffer (p0)->ip.adj_index[VLIB_TX]);
-         adj1 =
-           ip_get_adjacency (lm, vnet_buffer (p1)->ip.adj_index[VLIB_TX]);
-         ad0 = (ila_adj_data_t *) & adj0->opaque;
-         ad1 = (ila_adj_data_t *) & adj1->opaque;
-         ie0 = pool_elt_at_index (ilm->entries, ad0->entry_index);
-         ie1 = pool_elt_at_index (ilm->entries, ad1->entry_index);
+         ie0 = pool_elt_at_index (ilm->entries,
+                                  vnet_buffer (p0)->ip.adj_index[VLIB_TX]);
+         ie1 = pool_elt_at_index (ilm->entries,
+                                  vnet_buffer (p1)->ip.adj_index[VLIB_TX]);
 
          if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
            {
@@ -321,13 +325,13 @@ ila_ila2sir (vlib_main_t * vm,
          ip61->dst_address.as_u64[0] = sir_address1->as_u64[0];
          ip61->dst_address.as_u64[1] = sir_address1->as_u64[1];
 
-         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = ie0->ila_adj_index;
-         vnet_buffer (p1)->ip.adj_index[VLIB_TX] = ie1->ila_adj_index;
+         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = ie0->ila_dpo.dpoi_index;
+         vnet_buffer (p1)->ip.adj_index[VLIB_TX] = ie1->ila_dpo.dpoi_index;
 
          vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next,
                                           n_left_to_next, pi0, pi1,
-                                          ILA_ILA2SIR_NEXT_IP6_REWRITE,
-                                          ILA_ILA2SIR_NEXT_IP6_REWRITE);
+                                          ie0->ila_dpo.dpoi_next_node,
+                                          ie1->ila_dpo.dpoi_next_node);
        }
 
       /* Single loop */
@@ -335,8 +339,6 @@ ila_ila2sir (vlib_main_t * vm,
        {
          u32 pi0;
          vlib_buffer_t *p0;
-         ip_adjacency_t *adj0;
-         ila_adj_data_t *ad0;
          ila_entry_t *ie0;
          ip6_header_t *ip60;
          ip6_address_t *sir_address0;
@@ -350,10 +352,8 @@ ila_ila2sir (vlib_main_t * vm,
          p0 = vlib_get_buffer (vm, pi0);
          ip60 = vlib_buffer_get_current (p0);
          sir_address0 = &ip60->dst_address;
-         adj0 =
-           ip_get_adjacency (lm, vnet_buffer (p0)->ip.adj_index[VLIB_TX]);
-         ad0 = (ila_adj_data_t *) & adj0->opaque;
-         ie0 = pool_elt_at_index (ilm->entries, ad0->entry_index);
+         ie0 = pool_elt_at_index (ilm->entries,
+                                  vnet_buffer (p0)->ip.adj_index[VLIB_TX]);
 
          if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
            {
@@ -367,11 +367,11 @@ ila_ila2sir (vlib_main_t * vm,
          sir_address0 = (ie0->dir != ILA_DIR_SIR2ILA) ? &ie0->sir_address : sir_address0;
          ip60->dst_address.as_u64[0] = sir_address0->as_u64[0];
          ip60->dst_address.as_u64[1] = sir_address0->as_u64[1];
-         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = ie0->ila_adj_index;
+         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = ie0->ila_dpo.dpoi_index;
 
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, pi0,
-                                          ILA_ILA2SIR_NEXT_IP6_REWRITE);
+                                          ie0->ila_dpo.dpoi_next_node);
        }
       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
     }
@@ -379,16 +379,22 @@ ila_ila2sir (vlib_main_t * vm,
   return frame->n_vectors;
 }
 
+/** *INDENT-OFF* */
 VLIB_REGISTER_NODE (ila_ila2sir_node, static) =
 {
-  .function = ila_ila2sir,.name = "ila-to-sir",.vector_size =
-    sizeof (u32),.format_trace = format_ila_ila2sir_trace,.n_errors =
-    ILA_N_ERROR,.error_strings = ila_error_strings,.n_next_nodes =
-    ILA_ILA2SIR_N_NEXT,.next_nodes =
+  .function = ila_ila2sir,
+  .name = "ila-to-sir",
+  .vector_size = sizeof (u32),
+  .format_trace = format_ila_ila2sir_trace,
+  .n_errors = ILA_N_ERROR,
+  .error_strings = ila_error_strings,
+  .n_next_nodes = ILA_ILA2SIR_N_NEXT,
+  .next_nodes =
   {
-  [ILA_ILA2SIR_NEXT_IP6_REWRITE] = "ip6-rewrite",
-      [ILA_ILA2SIR_NEXT_DROP] = "error-drop"}
-,};
+      [ILA_ILA2SIR_NEXT_DROP] = "error-drop"
+  },
+};
+/** *INDENT-ON* */
 
 typedef enum
 {
@@ -580,28 +586,48 @@ ila_sir2ila (vlib_main_t * vm,
   return frame->n_vectors;
 }
 
+/** *INDENT-OFF* */
 VLIB_REGISTER_NODE (ila_sir2ila_node, static) =
 {
-  .function = ila_sir2ila,.name = "sir-to-ila",.vector_size =
-    sizeof (u32),.format_trace = format_ila_sir2ila_trace,.n_errors =
-    ILA_N_ERROR,.error_strings = ila_error_strings,.n_next_nodes =
-    ILA_SIR2ILA_N_NEXT,.next_nodes =
+  .function = ila_sir2ila,.name = "sir-to-ila",
+  .vector_size = sizeof (u32),
+  .format_trace = format_ila_sir2ila_trace,
+  .n_errors = ILA_N_ERROR,
+  .error_strings = ila_error_strings,
+  .n_next_nodes = ILA_SIR2ILA_N_NEXT,
+  .next_nodes =
   {
-  [ILA_SIR2ILA_NEXT_DROP] = "error-drop"}
-,};
+      [ILA_SIR2ILA_NEXT_DROP] = "error-drop"
+  },
+};
+/** *INDENT-ON* */
 
+/** *INDENT-OFF* */
 VNET_IP6_UNICAST_FEATURE_INIT (ila_sir2ila, static) =
 {
   .node_name = "sir-to-ila",
   .runs_before = ORDER_CONSTRAINTS{"ip6-lookup", 0},
   .feature_index = &ila_main.ila_sir2ila_feature_index,
 };
+/** *INDENT-ON* */
+
+static void
+ila_entry_stack (ila_entry_t *ie)
+{
+    /*
+     * restack on the next-hop's FIB entry
+     */
+    dpo_stack(ila_dpo_type,
+             DPO_PROTO_IP6,
+             &ie->ila_dpo,
+             fib_entry_contribute_ip_forwarding(
+                 ie->next_hop_fib_entry_index));
+}
 
 int
 ila_add_del_entry (ila_add_del_entry_args_t * args)
 {
   ila_main_t *ilm = &ila_main;
-  ip6_main_t *im6 = &ip6_main;
   BVT (clib_bihash_kv) kv, value;
 
   //Sanity check
@@ -642,7 +668,7 @@ ila_add_del_entry (ila_add_del_entry_args_t * args)
       pool_get (ilm->entries, e);
       e->type = args->type;
       e->sir_address = args->sir_address;
-      e->ila_adj_index = args->local_adj_index;
+      e->next_hop = args->next_hop_address;
       e->csum_mode = args->csum_mode;
       e->dir = args->dir;
 
@@ -698,31 +724,56 @@ ila_add_del_entry (ila_add_del_entry_args_t * args)
       BV (clib_bihash_add_del) (&ilm->id_to_entry_table, &kv,
                                1 /* is_add */ );
 
-      if (e->ila_adj_index != ~0)
+      if (!ip6_address_is_zero(&e->next_hop))
        {
-         //This is a local entry - let's create a local adjacency
-         ip_adjacency_t adj;
-         ip6_add_del_route_args_t route_args;
-         ila_adj_data_t *ad;
-
-         //Adjacency
-         memset (&adj, 0, sizeof (adj));
-         adj.explicit_fib_index = ~0;
-         adj.lookup_next_index = ilm->ip6_lookup_next_index;
-         ad = (ila_adj_data_t *) & adj.opaque;
-         ad->entry_index = e - ilm->entries;
-
-         //Route
-         memset (&route_args, 0, sizeof (route_args));
-         route_args.table_index_or_table_id = 0;
-         route_args.flags = IP6_ROUTE_FLAG_ADD;
-         route_args.dst_address = e->ila_address;
-         route_args.dst_address_length = 128;
-         route_args.adj_index = ~0;
-         route_args.add_adj = &adj;
-         route_args.n_add_adj = 1;
-
-         ip6_add_del_route (im6, &route_args);
+         /*
+          * become a child of the FIB entry for the next-hop
+          * so we are informed when its forwarding changes
+          */
+         fib_prefix_t next_hop = {
+             .fp_addr = {
+                 .ip6 = e->next_hop,
+             },
+             .fp_len = 128,
+             .fp_proto = FIB_PROTOCOL_IP6,
+         };
+
+         e->next_hop_fib_entry_index = 
+             fib_table_entry_special_add(0,
+                                         &next_hop,
+                                         FIB_SOURCE_RR,
+                                         FIB_ENTRY_FLAG_NONE,
+                                         ADJ_INDEX_INVALID);
+         e->next_hop_child_index =
+             fib_entry_child_add(e->next_hop_fib_entry_index,
+                                 ila_fib_node_type,
+                                 e - ilm->entries);
+
+         /*
+          * Create a route that results in the ILA entry
+          */
+         dpo_id_t dpo = DPO_NULL;
+         fib_prefix_t pfx = {
+             .fp_addr = {
+                 .ip6 = e->ila_address,
+             },
+             .fp_len = 128,
+             .fp_proto = FIB_PROTOCOL_IP6,
+         };
+
+         dpo_set(&dpo, ila_dpo_type, DPO_PROTO_IP6, e - ilm->entries);
+
+         fib_table_entry_special_dpo_add(0,
+                                         &pfx,
+                                         FIB_SOURCE_PLUGIN_HI,
+                                         FIB_ENTRY_FLAG_EXCLUSIVE,
+                                         &dpo);
+         dpo_reset(&dpo);
+
+         /*
+          * finally stack the ILA entry so it will forward to the next-hop
+          */
+         ila_entry_stack (e);
        }
     }
   else
@@ -740,21 +791,27 @@ ila_add_del_entry (ila_add_del_entry_args_t * args)
 
       e = &ilm->entries[value.value];
 
-      if (e->ila_adj_index != ~0)
+      if (!ip6_address_is_zero(&e->next_hop))
        {
-         //Delete that route - Associated adjacency will be deleted too
-         ip6_add_del_route_args_t route_args;
-         memset (&route_args, 0, sizeof (route_args));
-         route_args.table_index_or_table_id = 0;
-         route_args.flags = IP6_ROUTE_FLAG_DEL;
-         route_args.dst_address = e->ila_address;
-         route_args.dst_address_length = 128;
-         route_args.adj_index = ~0;
-         route_args.add_adj = NULL;
-         route_args.n_add_adj = 0;
-
-         ip6_add_del_route (im6, &route_args);
+         fib_prefix_t pfx = {
+             .fp_addr = {
+                 .ip6 = e->ila_address,
+             },
+             .fp_len = 128,
+             .fp_proto = FIB_PROTOCOL_IP6,
+         };
+
+         fib_table_entry_special_remove(0, &pfx, FIB_SOURCE_PLUGIN_HI);
+         /*
+          * remove this ILA entry as child of the FIB entry for the next-hop
+          */
+         fib_entry_child_remove(e->next_hop_fib_entry_index,
+                                e->next_hop_child_index);
+         fib_table_entry_delete_index(e->next_hop_fib_entry_index,
+                                      FIB_SOURCE_RR);
+         e->next_hop_fib_entry_index = FIB_NODE_INDEX_INVALID;
        }
+      dpo_reset (&e->ila_dpo);
 
       BV (clib_bihash_add_del) (&ilm->id_to_entry_table, &kv,
                                0 /* is_add */ );
@@ -796,24 +853,103 @@ vlib_plugin_register (vlib_main_t * vm, vnet_plugin_handoff_t * h,
   return error;
 }
 
-u8 *ila_format_adjacency(u8 * s, va_list * va)
+u8 *format_ila_dpo (u8 * s, va_list * va)
 {
+  index_t index = va_arg (*va, index_t);
+  CLIB_UNUSED(u32 indent) = va_arg (*va, u32);
   ila_main_t *ilm = &ila_main;
-  __attribute((unused)) ip_lookup_main_t *lm = va_arg (*va, ip_lookup_main_t *);
-  ip_adjacency_t *adj = va_arg (*va, ip_adjacency_t *);
-  ila_adj_data_t * ad = (ila_adj_data_t *) & adj->opaque;
-  ila_entry_t *ie = pool_elt_at_index (ilm->entries, ad->entry_index);
-  return format(s, "idx:%d sir:%U", ad->entry_index, format_ip6_address, &ie->sir_address);
+  ila_entry_t *ie = pool_elt_at_index (ilm->entries, index);
+  return format(s, "ILA: idx:%d sir:%U",
+               index,
+               format_ip6_address, &ie->sir_address);
+}
+
+/**
+ * @brief no-op lock function.
+ * The lifetime of the ILA entry is managed by the control plane
+ */
+static void
+ila_dpo_lock (dpo_id_t *dpo)
+{
 }
 
+/**
+ * @brief no-op unlock function.
+ * The lifetime of the ILA entry is managed by the control plane
+ */
+static void
+ila_dpo_unlock (dpo_id_t *dpo)
+{
+}
+
+const static dpo_vft_t ila_vft = {
+    .dv_lock = ila_dpo_lock,
+    .dv_unlock = ila_dpo_unlock,
+    .dv_format = format_ila_dpo,
+};
+const static char* const ila_ip6_nodes[] =
+{
+    "ila-to-sir",
+    NULL,
+};
+const static char* const * const ila_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP6]  = ila_ip6_nodes,
+};
+
+static fib_node_t *
+ila_fib_node_get_node (fib_node_index_t index)
+{
+  ila_main_t *ilm = &ila_main;
+  ila_entry_t *ie = pool_elt_at_index (ilm->entries, index);
+
+  return (&ie->ila_fib_node);
+}
+
+/**
+ * @brief no-op last-lock-gone function.
+ * The lifetime of the ILA entry is managed by the control plane
+ */
+static void
+ila_fib_node_last_lock_gone (fib_node_t *node)
+{
+}
+
+static ila_entry_t *
+ila_entry_from_fib_node (fib_node_t *node)
+{
+    return ((ila_entry_t*)(((char*)node) -
+                          STRUCT_OFFSET_OF(ila_entry_t, ila_fib_node)));
+}
+
+/**
+ * @brief
+ * Callback function invoked when the forwarding changes for the ILA next-hop
+ */
+static fib_node_back_walk_rc_t
+ila_fib_node_back_walk_notify (fib_node_t *node,
+                              fib_node_back_walk_ctx_t *ctx)
+{
+    ila_entry_stack(ila_entry_from_fib_node(node));
+
+    return (FIB_NODE_BACK_WALK_CONTINUE);
+}
+
+/*
+ * ILA's FIB graph node virtual function table
+ */
+static const fib_node_vft_t ila_fib_node_vft = {
+    .fnv_get = ila_fib_node_get_node,
+    .fnv_last_lock = ila_fib_node_last_lock_gone,
+    .fnv_back_walk = ila_fib_node_back_walk_notify,
+};
+
 clib_error_t *
 ila_init (vlib_main_t * vm)
 {
   ila_main_t *ilm = &ila_main;
   ilm->entries = NULL;
 
-  ASSERT (sizeof (ila_adj_data_t) < IP_ADJACENCY_OPAQUE_SZ);
-
   ilm->lookup_table_nbuckets = ILA_TABLE_DEFAULT_HASH_NUM_BUCKETS;
   ilm->lookup_table_nbuckets = 1 << max_log2 (ilm->lookup_table_nbuckets);
   ilm->lookup_table_size = ILA_TABLE_DEFAULT_HASH_MEMORY_SIZE;
@@ -822,15 +958,12 @@ ila_init (vlib_main_t * vm)
                         "ila id to entry index table",
                         ilm->lookup_table_nbuckets, ilm->lookup_table_size);
 
+  ila_dpo_type = dpo_register_new_type(&ila_vft, ila_nodes);
+  ila_fib_node_type = fib_node_register_new_type(&ila_fib_node_vft);
+
   return NULL;
 }
 
-VNET_IP6_REGISTER_ADJACENCY(ila2sir) = {
-  .node_name = "ila-to-sir",
-  .fn = ila_format_adjacency,
-  .next_index = &ila_main.ip6_lookup_next_index
-};
-
 VLIB_INIT_FUNCTION (ila_init);
 
 static clib_error_t *
@@ -839,9 +972,7 @@ ila_entry_command_fn (vlib_main_t * vm,
 {
   unformat_input_t _line_input, *line_input = &_line_input;
   ila_add_del_entry_args_t args = { 0 };
-  ip6_address_t next_hop;
   u8 next_hop_set = 0;
-  ip6_main_t *im6 = &ip6_main;
   int ret;
 
   args.type = ILA_TYPE_IID;
@@ -856,32 +987,27 @@ ila_entry_command_fn (vlib_main_t * vm,
     {
       if (unformat (line_input, "type %U", unformat_ila_type, &args.type))
        ;
-      else
-       if (unformat
-           (line_input, "sir-address %U", unformat_ip6_address,
-            &args.sir_address))
-       ;
-      else
-       if (unformat
-           (line_input, "locator %U", unformat_half_ip6_address,
-            &args.locator))
+      else if (unformat
+              (line_input, "sir-address %U", unformat_ip6_address,
+               &args.sir_address))
        ;
-      else if (unformat (line_input, "adj-index %u", &args.local_adj_index))
+      else if (unformat
+              (line_input, "locator %U", unformat_half_ip6_address,
+               &args.locator))
        ;
-      else
-       if (unformat
-           (line_input, "csum-mode %U", unformat_ila_csum_mode,
-            &args.csum_mode))
+      else if (unformat
+              (line_input, "csum-mode %U", unformat_ila_csum_mode,
+               &args.csum_mode))
        ;
       else if (unformat (line_input, "vnid %x", &args.vnid))
        ;
-      else
-       if (unformat
-           (line_input, "next-hop %U", unformat_ip6_address, &next_hop))
-       next_hop_set = 1;
+      else if (unformat
+              (line_input, "next-hop %U", unformat_ip6_address,
+               &args.next_hop_address))
+       ;
       else if (unformat
              (line_input, "direction %U", unformat_ila_direction, &args.dir))
-           ;
+        next_hop_set = 1;
       else if (unformat (line_input, "del"))
        args.is_del = 1;
       else
@@ -891,26 +1017,8 @@ ila_entry_command_fn (vlib_main_t * vm,
 
   unformat_free (line_input);
 
-  if (next_hop_set)
-    {
-      if (args.local_adj_index != ~0)
-       return clib_error_return (0,
-                                 "Specified both next hop and adjacency index");
-
-      u32 ai = ip6_get_route (im6, 0, 0, &next_hop, 128);
-      if (ai == 0)
-       return clib_error_return (0, "No route to next-hop %U",
-                                 format_ip6_address, &next_hop);
-
-      ip_lookup_main_t *lm6 = &ip6_main.lookup_main;
-      ip_adjacency_t *adj6 = ip_get_adjacency (lm6, ai);
-      if (adj6->lookup_next_index != IP_LOOKUP_NEXT_REWRITE)
-       {
-         return clib_error_return (0,
-                                   "Next-Hop route has to be a rewrite route");
-       }
-      args.local_adj_index = ai;
-    }
+  if (!next_hop_set)
+      return clib_error_return (0, "Specified a next hop");
 
   if ((ret = ila_add_del_entry (&args)))
     return clib_error_return (0, "ila_add_del_entry returned error %d", ret);
index b800fdd..657511f 100644 (file)
@@ -18,6 +18,7 @@
 
 #include <vnet/vnet.h>
 #include <vnet/ip/ip.h>
+#include <vnet/fib/fib_node.h>
 
 #include <vppinfra/bihash_24_8.h>
 #include <vppinfra/bihash_template.h>
@@ -59,17 +60,32 @@ typedef enum {
 } ila_direction_t;
 
 typedef struct {
+  /**
+   * Fib Node base class
+   */
+  fib_node_t ila_fib_node;
   ila_type_t type;
   ip6_address_t sir_address;
   ip6_address_t ila_address;
-  u32 ila_adj_index;
+  ip6_address_t next_hop;
   ila_csum_mode_t csum_mode;
   ila_direction_t dir;
-} ila_entry_t;
 
-typedef struct {
-  u32 entry_index;
-} ila_adj_data_t;
+  /**
+   * The FIB entry index for the next-hop
+   */
+  fib_node_index_t next_hop_fib_entry_index;
+
+  /**
+   * The child index on the FIB entry
+   */
+  u32 next_hop_child_index;
+
+  /**
+   * The next DPO in the graph to follow
+   */
+  dpo_id_t ila_dpo;
+} ila_entry_t;
 
 typedef struct {
   ila_entry_t *entries;                //Pool of ILA entries
@@ -87,6 +103,7 @@ typedef struct {
 typedef struct {
   ila_type_t type;
   ip6_address_t sir_address;
+  ip6_address_t next_hop_address;
   u64 locator;
   u32 vnid;
   u32 local_adj_index;
index e91bdf0..140c221 100644 (file)
@@ -537,54 +537,54 @@ int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n)
 
 int lb_as_lookup_bypass(u32 vip_index, ip46_address_t *address, u8 is_disable)
 {
-  lb_get_writer_lock();
-  lb_main_t *lbm = &lb_main;
-  u32 as_index;
-  lb_as_t *as;
-  lb_vip_t *vip;
-
-  if (!(vip = lb_vip_get_by_index(vip_index)) ||
-      lb_as_find_index_vip(vip, address, &as_index)) {
-    lb_put_writer_lock();
-    return VNET_API_ERROR_NO_SUCH_ENTRY;
-  }
-
-  as = &lbm->ass[as_index];
-
-  if (is_disable) {
-    as->adj_index = ~0;
-  } else if (lb_vip_is_gre4(vip)) {
-    uword *p = ip4_get_route (&ip4_main, 0, 0, as->address.ip4.as_u8, 32);
-    if (p == 0) {
-      lb_put_writer_lock();
-      return VNET_API_ERROR_NO_SUCH_ENTRY;
-    }
-    u32 ai = (u32)p[0];
-    ip_lookup_main_t *lm4 = &ip4_main.lookup_main;
-    ip_adjacency_t *adj4 = ip_get_adjacency (lm4, ai);
-    if (adj4->lookup_next_index != IP_LOOKUP_NEXT_REWRITE) {
-      lb_put_writer_lock();
-      return VNET_API_ERROR_INCORRECT_ADJACENCY_TYPE;
-    }
-
-    as->adj_index = ai;
-  } else {
-    u32 ai = ip6_get_route (&ip6_main, 0, 0, &as->address.ip6, 128);
-    if (ai == 0) {
-      lb_put_writer_lock();
-      return VNET_API_ERROR_NO_SUCH_ENTRY;
-    }
-
-    ip_lookup_main_t *lm6 = &ip6_main.lookup_main;
-    ip_adjacency_t *adj6 = ip_get_adjacency (lm6, ai);
-    if (adj6->lookup_next_index != IP_LOOKUP_NEXT_REWRITE) {
-      lb_put_writer_lock();
-      return VNET_API_ERROR_INCORRECT_ADJACENCY_TYPE;
-    }
-
-    as->adj_index = ai;
-  }
-  lb_put_writer_lock();
+  /* lb_get_writer_lock(); */
+  /* lb_main_t *lbm = &lb_main; */
+  /* u32 as_index; */
+  /* lb_as_t *as; */
+  /* lb_vip_t *vip; */
+
+  /* if (!(vip = lb_vip_get_by_index(vip_index)) || */
+  /*     lb_as_find_index_vip(vip, address, &as_index)) { */
+  /*   lb_put_writer_lock(); */
+  /*   return VNET_API_ERROR_NO_SUCH_ENTRY; */
+  /* } */
+
+  /* as = &lbm->ass[as_index]; */
+
+  /* if (is_disable) { */
+  /*   as->adj_index = ~0; */
+  /* } else if (lb_vip_is_gre4(vip)) { */
+  /*   uword *p = ip4_get_route (&ip4_main, 0, 0, as->address.ip4.as_u8, 32); */
+  /*   if (p == 0) { */
+  /*     lb_put_writer_lock(); */
+  /*     return VNET_API_ERROR_NO_SUCH_ENTRY; */
+  /*   } */
+  /*   u32 ai = (u32)p[0]; */
+  /*   ip_lookup_main_t *lm4 = &ip4_main.lookup_main; */
+  /*   ip_adjacency_t *adj4 = ip_get_adjacency (lm4, ai); */
+  /*   if (adj4->lookup_next_index != IP_LOOKUP_NEXT_REWRITE) { */
+  /*     lb_put_writer_lock(); */
+  /*     return VNET_API_ERROR_INCORRECT_ADJACENCY_TYPE; */
+  /*   } */
+
+  /*   as->adj_index = ai; */
+  /* } else { */
+  /*   u32 ai = ip6_get_route (&ip6_main, 0, 0, &as->address.ip6, 128); */
+  /*   if (ai == 0) { */
+  /*     lb_put_writer_lock(); */
+  /*     return VNET_API_ERROR_NO_SUCH_ENTRY; */
+  /*   } */
+
+  /*   ip_lookup_main_t *lm6 = &ip6_main.lookup_main; */
+  /*   ip_adjacency_t *adj6 = ip_get_adjacency (lm6, ai); */
+  /*   if (adj6->lookup_next_index != IP_LOOKUP_NEXT_REWRITE) { */
+  /*     lb_put_writer_lock(); */
+  /*     return VNET_API_ERROR_INCORRECT_ADJACENCY_TYPE; */
+  /*   } */
+
+  /*   as->adj_index = ai; */
+  /* } */
+  /* lb_put_writer_lock(); */
   return 0;
 }
 
@@ -594,41 +594,41 @@ int lb_as_lookup_bypass(u32 vip_index, ip46_address_t *address, u8 is_disable)
  */
 static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip)
 {
-  ip_adjacency_t adj;
-  //Adjacency
-  memset (&adj, 0, sizeof (adj));
-  adj.explicit_fib_index = ~0;
-  lb_adj_data_t *ad = (lb_adj_data_t *) &adj.opaque;
-  ad->vip_index = vip - lbm->vips;
-
-  ASSERT (lbm->writer_lock[0]); //This must be called with the lock owned
-  u32 lookup_next_index = lbm->ip_lookup_next_index[vip->type];
-
-  if (lb_vip_is_ip4(vip)) {
-    adj.lookup_next_index = lookup_next_index;
-    ip4_add_del_route_args_t route_args = {};
-    ip4_main_t *im4 = &ip4_main;
-    route_args.table_index_or_table_id = 0;
-    route_args.flags = IP4_ROUTE_FLAG_ADD;
-    route_args.dst_address = vip->prefix.ip4;
-    route_args.dst_address_length = vip->plen - 96;
-    route_args.adj_index = ~0;
-    route_args.add_adj = &adj;
-    route_args.n_add_adj = 1;
-    ip4_add_del_route (im4, &route_args);
-  } else {
-    adj.lookup_next_index = lookup_next_index;
-    ip6_add_del_route_args_t route_args = {};
-    ip6_main_t *im6 = &ip6_main;
-    route_args.table_index_or_table_id = 0;
-    route_args.flags = IP6_ROUTE_FLAG_ADD;
-    route_args.dst_address = vip->prefix.ip6;
-    route_args.dst_address_length = vip->plen;
-    route_args.adj_index = ~0;
-    route_args.add_adj = &adj;
-    route_args.n_add_adj = 1;
-    ip6_add_del_route (im6, &route_args);
-  }
+  /* ip_adjacency_t adj; */
+  /* //Adjacency */
+  /* memset (&adj, 0, sizeof (adj)); */
+  /* adj.explicit_fib_index = ~0; */
+  /* lb_adj_data_t *ad = (lb_adj_data_t *) &adj.opaque; */
+  /* ad->vip_index = vip - lbm->vips; */
+
+  /* ASSERT (lbm->writer_lock[0]); //This must be called with the lock owned */
+  /* u32 lookup_next_index = lbm->ip_lookup_next_index[vip->type]; */
+
+  /* if (lb_vip_is_ip4(vip)) { */
+  /*   adj.lookup_next_index = lookup_next_index; */
+  /*   ip4_add_del_route_args_t route_args = {}; */
+  /*   ip4_main_t *im4 = &ip4_main; */
+  /*   route_args.table_index_or_table_id = 0; */
+  /*   route_args.flags = IP4_ROUTE_FLAG_ADD; */
+  /*   route_args.dst_address = vip->prefix.ip4; */
+  /*   route_args.dst_address_length = vip->plen - 96; */
+  /*   route_args.adj_index = ~0; */
+  /*   route_args.add_adj = &adj; */
+  /*   route_args.n_add_adj = 1; */
+  /*   ip4_add_del_route (im4, &route_args); */
+  /* } else { */
+  /*   adj.lookup_next_index = lookup_next_index; */
+  /*   ip6_add_del_route_args_t route_args = {}; */
+  /*   ip6_main_t *im6 = &ip6_main; */
+  /*   route_args.table_index_or_table_id = 0; */
+  /*   route_args.flags = IP6_ROUTE_FLAG_ADD; */
+  /*   route_args.dst_address = vip->prefix.ip6; */
+  /*   route_args.dst_address_length = vip->plen; */
+  /*   route_args.adj_index = ~0; */
+  /*   route_args.add_adj = &adj; */
+  /*   route_args.n_add_adj = 1; */
+  /*   ip6_add_del_route (im6, &route_args); */
+  /* } */
 }
 
 /**
@@ -636,30 +636,30 @@ static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip)
  */
 static void lb_vip_del_adjacency(lb_main_t *lbm, lb_vip_t *vip)
 {
-  ASSERT (lbm->writer_lock[0]); //This must be called with the lock owned
-  if (lb_vip_is_ip4(vip)) {
-    ip4_main_t *im4 = &ip4_main;
-    ip4_add_del_route_args_t route_args = {};
-    route_args.table_index_or_table_id = 0;
-    route_args.flags = IP4_ROUTE_FLAG_DEL;
-    route_args.dst_address = vip->prefix.ip4;
-    route_args.dst_address_length = vip->plen - 96;
-    route_args.adj_index = ~0;
-    route_args.add_adj = NULL;
-    route_args.n_add_adj = 0;
-    ip4_add_del_route (im4, &route_args);
-  } else {
-    ip6_main_t *im6 = &ip6_main;
-    ip6_add_del_route_args_t route_args = {};
-    route_args.table_index_or_table_id = 0;
-    route_args.flags = IP6_ROUTE_FLAG_DEL;
-    route_args.dst_address = vip->prefix.ip6;
-    route_args.dst_address_length = vip->plen;
-    route_args.adj_index = ~0;
-    route_args.add_adj = NULL;
-    route_args.n_add_adj = 0;
-    ip6_add_del_route (im6, &route_args);
-  }
+  /* ASSERT (lbm->writer_lock[0]); //This must be called with the lock owned */
+  /* if (lb_vip_is_ip4(vip)) { */
+  /*   ip4_main_t *im4 = &ip4_main; */
+  /*   ip4_add_del_route_args_t route_args = {}; */
+  /*   route_args.table_index_or_table_id = 0; */
+  /*   route_args.flags = IP4_ROUTE_FLAG_DEL; */
+  /*   route_args.dst_address = vip->prefix.ip4; */
+  /*   route_args.dst_address_length = vip->plen - 96; */
+  /*   route_args.adj_index = ~0; */
+  /*   route_args.add_adj = NULL; */
+  /*   route_args.n_add_adj = 0; */
+  /*   ip4_add_del_route (im4, &route_args); */
+  /* } else { */
+  /*   ip6_main_t *im6 = &ip6_main; */
+  /*   ip6_add_del_route_args_t route_args = {}; */
+  /*   route_args.table_index_or_table_id = 0; */
+  /*   route_args.flags = IP6_ROUTE_FLAG_DEL; */
+  /*   route_args.dst_address = vip->prefix.ip6; */
+  /*   route_args.dst_address_length = vip->plen; */
+  /*   route_args.adj_index = ~0; */
+  /*   route_args.add_adj = NULL; */
+  /*   route_args.n_add_adj = 0; */
+  /*   ip6_add_del_route (im6, &route_args); */
+  /* } */
 }
 
 int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u32 new_length, u32 *vip_index)
index 8df462c..4f71616 100644 (file)
@@ -48,15 +48,15 @@ typedef struct {
   u32 as_index;
 } lb_trace_t;
 
-u8 *lb_format_adjacency(u8 * s, va_list * va)
-{
-  lb_main_t *lbm = &lb_main;
-  __attribute((unused)) ip_lookup_main_t *lm = va_arg (*va, ip_lookup_main_t *);
-  ip_adjacency_t *adj = va_arg (*va, ip_adjacency_t *);
-  lb_adj_data_t *ad = (lb_adj_data_t *) &adj->opaque;
-  __attribute__((unused)) lb_vip_t *vip = pool_elt_at_index (lbm->vips, ad->vip_index);
-  return format(s, "vip idx:%d", ad->vip_index);
-}
+/* u8 *lb_format_adjacency(u8 * s, va_list * va) */
+/* { */
+/*   lb_main_t *lbm = &lb_main; */
+/*   __attribute((unused)) ip_lookup_main_t *lm = va_arg (*va, ip_lookup_main_t *); */
+/*   ip_adjacency_t *adj = va_arg (*va, ip_adjacency_t *); */
+/*   lb_adj_data_t *ad = (lb_adj_data_t *) &adj->opaque; */
+/*   __attribute__((unused)) lb_vip_t *vip = pool_elt_at_index (lbm->vips, ad->vip_index); */
+/*   return format(s, "vip idx:%d", ad->vip_index); */
+/* } */
 
 u8 *
 format_lb_trace (u8 * s, va_list * args)
@@ -319,11 +319,11 @@ VLIB_REGISTER_NODE (lb6_gre6_node) =
   },
 };
 
-VNET_IP6_REGISTER_ADJACENCY(lb6_gre6) = {
-  .node_name = "lb6-gre6",
-  .fn = lb_format_adjacency,
-  .next_index = &lb_main.ip_lookup_next_index[LB_VIP_TYPE_IP6_GRE6]
-};
+/* VNET_IP6_REGISTER_ADJACENCY(lb6_gre6) = { */
+/*   .node_name = "lb6-gre6", */
+/*   .fn = lb_format_adjacency, */
+/*   .next_index = &lb_main.ip_lookup_next_index[LB_VIP_TYPE_IP6_GRE6] */
+/* }; */
 
 VLIB_REGISTER_NODE (lb6_gre4_node) =
 {
@@ -344,11 +344,11 @@ VLIB_REGISTER_NODE (lb6_gre4_node) =
   },
 };
 
-VNET_IP6_REGISTER_ADJACENCY(lb6_gre4) = {
-  .node_name = "lb6-gre4",
-  .fn = lb_format_adjacency,
-  .next_index = &lb_main.ip_lookup_next_index[LB_VIP_TYPE_IP6_GRE4]
-};
+/* VNET_IP6_REGISTER_ADJACENCY(lb6_gre4) = { */
+/*   .node_name = "lb6-gre4", */
+/*   .fn = lb_format_adjacency, */
+/*   .next_index = &lb_main.ip_lookup_next_index[LB_VIP_TYPE_IP6_GRE4] */
+/* }; */
 
 VLIB_REGISTER_NODE (lb4_gre6_node) =
 {
@@ -369,11 +369,11 @@ VLIB_REGISTER_NODE (lb4_gre6_node) =
   },
 };
 
-VNET_IP4_REGISTER_ADJACENCY(lb4_gre6) = {
-  .node_name = "lb4-gre6",
-  .fn = lb_format_adjacency,
-  .next_index = &lb_main.ip_lookup_next_index[LB_VIP_TYPE_IP4_GRE6]
-};
+/* VNET_IP4_REGISTER_ADJACENCY(lb4_gre6) = { */
+/*   .node_name = "lb4-gre6", */
+/*   .fn = lb_format_adjacency, */
+/*   .next_index = &lb_main.ip_lookup_next_index[LB_VIP_TYPE_IP4_GRE6] */
+/* }; */
 
 VLIB_REGISTER_NODE (lb4_gre4_node) =
 {
@@ -394,8 +394,8 @@ VLIB_REGISTER_NODE (lb4_gre4_node) =
   },
 };
 
-VNET_IP4_REGISTER_ADJACENCY(lb4_gre4) = {
-  .node_name = "lb4-gre4",
-  .fn = lb_format_adjacency,
-  .next_index = &lb_main.ip_lookup_next_index[LB_VIP_TYPE_IP4_GRE4]
-};
+/* VNET_IP4_REGISTER_ADJACENCY(lb4_gre4) = { */
+/*   .node_name = "lb4-gre4", */
+/*   .fn = lb_format_adjacency, */
+/*   .next_index = &lb_main.ip_lookup_next_index[LB_VIP_TYPE_IP4_GRE4] */
+/* }; */
index 71c8da4..eb0d806 100644 (file)
@@ -18,11 +18,13 @@ AM_LDFLAGS = -module -shared -avoid-version
 
 libsixrd_plugin_la_SOURCES =                   \
        sixrd/sixrd.c                           \
+       sixrd/sixrd_dpo.c                       \
        sixrd/ip4_sixrd.c                       \
        sixrd/ip6_sixrd.c
 
 noinst_HEADERS =                               \
-       sixrd/sixrd.h
+       sixrd/sixrd.h                           \
+       sixrd/sixrd_dpo.h
 
 BUILT_SOURCES =
 
index e842d49..65d353a 100644 (file)
 #include "sixrd.h"
 #include <vnet/plugin/plugin.h>
 
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/ip6_fib.h>
+#include <vnet/adj/adj.h>
+
 /*
  * This code supports the following sixrd modes:
  * 
 
 int
 sixrd_create_domain (ip6_address_t *ip6_prefix,
-                   u8 ip6_prefix_len,
-                   ip4_address_t *ip4_prefix,
-                   u8 ip4_prefix_len,
-                   ip4_address_t *ip4_src,
-                   u32 *sixrd_domain_index,
+                    u8 ip6_prefix_len,
+                    ip4_address_t *ip4_prefix,
+                    u8 ip4_prefix_len,
+                    ip4_address_t *ip4_src,
+                    u32 *sixrd_domain_index,
                     u16 mtu)
 {
+  dpo_id_t dpo_v6 = DPO_NULL, dpo_v4 = DPO_NULL;
   sixrd_main_t *mm = &sixrd_main;
-  ip4_main_t *im4 = &ip4_main;
-  ip6_main_t *im6 = &ip6_main;
+  fib_node_index_t fei;
   sixrd_domain_t *d;
-  ip_adjacency_t adj;
-  ip4_add_del_route_args_t args4;
-  ip6_add_del_route_args_t args6;
-  u32 *p;
 
   /* Get domain index */
   pool_get_aligned(mm->domains, d, CLIB_CACHE_LINE_BYTES);
@@ -61,55 +61,79 @@ sixrd_create_domain (ip6_address_t *ip6_prefix,
   if (ip4_prefix_len < 32)
     d->shift = 64 - ip6_prefix_len + (32 - ip4_prefix_len);
     
-  /* Init IP adjacency */
-  memset(&adj, 0, sizeof(adj));
-  adj.explicit_fib_index = ~0;
-  p = (u32 *)&adj.rewrite_data[0];
-  *p = (u32) (*sixrd_domain_index);
-
-  /* Create ip6 adjacency */
-  memset(&args6, 0, sizeof(args6));
-  args6.table_index_or_table_id = 0;
-  args6.flags = IP6_ROUTE_FLAG_ADD;
-  args6.dst_address.as_u64[0] = ip6_prefix->as_u64[0];
-  args6.dst_address.as_u64[1] = ip6_prefix->as_u64[1];
-  args6.dst_address_length = ip6_prefix_len;
-  args6.adj_index = ~0;
-  args6.add_adj = &adj;
-  args6.n_add_adj = 1;
-  adj.lookup_next_index = mm->ip6_lookup_next_index;
-  ip6_add_del_route(im6, &args6);
-
-  /* Multiple SIXRD domains may share same source IPv4 TEP */
-  uword *q = ip4_get_route(im4, 0, 0, (u8 *)ip4_src, 32);
-  if (q) {
-    u32 ai = q[0];
-    ip_lookup_main_t *lm4 = &ip4_main.lookup_main;
-    ip_adjacency_t *adj4 = ip_get_adjacency(lm4, ai);
-    if (adj4->lookup_next_index != mm->ip4_lookup_next_index) {
-      clib_warning("BR source address already assigned: %U", format_ip4_address, ip4_src);
-      pool_put(mm->domains, d);
-      return -1;
-    }
-    /* Shared source */
-    p = (u32 *)&adj4->rewrite_data[0];
-    p[0] = ~0;
-
-    /* Add refcount, so we don't accidentially delete the route underneath someone */
-    p[1]++;
-  } else {
-    /* Create ip4 adjacency. */
-    memset(&args4, 0, sizeof(args4));
-    args4.table_index_or_table_id = 0;
-    args4.flags = IP4_ROUTE_FLAG_ADD;
-    args4.dst_address.as_u32 = ip4_src->as_u32;
-    args4.dst_address_length = 32;
-    args4.adj_index = ~0;
-    args4.add_adj = &adj;
-    args4.n_add_adj = 1;
-    adj.lookup_next_index = mm->ip4_lookup_next_index;
-    ip4_add_del_route(im4, &args4);
+  /* Create IPv6 route/adjacency */
+  fib_prefix_t pfx6 = {
+      .fp_proto = FIB_PROTOCOL_IP6,
+      .fp_len = d->ip6_prefix_len,
+      .fp_addr = {
+         .ip6 = d->ip6_prefix,
+      },
+  };
+  sixrd_dpo_create(FIB_PROTOCOL_IP6,
+                  *sixrd_domain_index,
+                  &dpo_v6);
+  fib_table_entry_special_dpo_add(0, &pfx6,
+                                 FIB_SOURCE_SIXRD,
+                                 FIB_ENTRY_FLAG_EXCLUSIVE,
+                                 &dpo_v6);
+  dpo_reset (&dpo_v6);
+
+  /*
+   * Multiple SIXRD domains may share the same source IPv4 TEP.
+   * In that case the IPv4 /32 route already exists and is SixRD sourced.
+   * Find the DPO (if any) already contributed and modify it
+   */
+  fib_prefix_t pfx4 = {
+      .fp_proto = FIB_PROTOCOL_IP4,
+      .fp_len = 32,
+      .fp_addr = {
+         .ip4 = d->ip4_src,
+      },
+  };
+  fei = fib_table_lookup_exact_match(0, &pfx4);
+
+  if (FIB_NODE_INDEX_INVALID != fei)
+  {
+      dpo_id_t dpo = DPO_NULL;
+
+      if (fib_entry_get_dpo_for_source (fei, FIB_SOURCE_SIXRD, &dpo))
+      {
+         /*
+          * modify the existing adj to indicate it's shared
+          * skip to route add.
+          * It is locked to pair with the unlock below.
+          */
+         const dpo_id_t *sd_dpo;
+         sixrd_dpo_t *sd;
+
+         ASSERT(DPO_LOAD_BALANCE == dpo.dpoi_type);
+
+         sd_dpo = load_balance_get_bucket(dpo.dpoi_index, 0);
+         sd = sixrd_dpo_get (sd_dpo->dpoi_index);
+
+         sd->sd_domain = ~0;
+         dpo_copy (&dpo_v4, sd_dpo);
+         dpo_reset (&dpo);
+
+         goto route_add;
+      }
   }
+  /* first time addition of the route */
+  sixrd_dpo_create(FIB_PROTOCOL_IP4,
+                  *sixrd_domain_index,
+                  &dpo_v4);
+
+route_add:
+  /*
+   * Create ip4 route. This is a reference counted add. If the prefix
+   * already exists and is SixRD sourced, it is now SixRD sourced n+1 times
+   * and will need to be removed n+1 times.
+   */
+  fib_table_entry_special_dpo_add(0, &pfx4,
+                                 FIB_SOURCE_SIXRD,
+                                 FIB_ENTRY_FLAG_EXCLUSIVE,
+                                 &dpo_v4);
+  dpo_reset (&dpo_v4);
 
   return 0;
 }
@@ -121,57 +145,33 @@ int
 sixrd_delete_domain (u32 sixrd_domain_index)
 {
   sixrd_main_t *mm = &sixrd_main;
-  ip4_main_t *im4 = &ip4_main;
-  ip6_main_t *im6 = &ip6_main;
   sixrd_domain_t *d;
-  ip_adjacency_t adj;
-  ip4_add_del_route_args_t args4;
-  ip6_add_del_route_args_t args6;
 
   if (pool_is_free_index(mm->domains, sixrd_domain_index)) {
-    clib_warning("SIXRD domain delete: domain does not exist: %d", sixrd_domain_index);
+    clib_warning("SIXRD domain delete: domain does not exist: %d",
+                sixrd_domain_index);
     return -1;
   }
 
   d = pool_elt_at_index(mm->domains, sixrd_domain_index);
 
-  memset(&adj, 0, sizeof(adj));
-  adj.explicit_fib_index = ~0;
-
-  /* Delete ip6 adjacency */
-  memset(&args6, 0, sizeof (args6));
-  args6.table_index_or_table_id = 0;
-  args6.flags = IP6_ROUTE_FLAG_DEL;
-  args6.dst_address.as_u64[0] = d->ip6_prefix.as_u64[0];
-  args6.dst_address.as_u64[1] = d->ip6_prefix.as_u64[1];
-  args6.dst_address_length = d->ip6_prefix_len;
-  args6.adj_index = 0;
-  args6.add_adj = &adj;
-  args6.n_add_adj = 0;
-  ip6_add_del_route(im6, &args6);
-
-  /* Delete ip4 adjacency */
-  uword *q = ip4_get_route(im4, 0, 0, (u8 *)&d->ip4_src, 32);
-  if (q) {
-    u32 ai = q[0];
-    ip_lookup_main_t *lm4 = &ip4_main.lookup_main;
-    ip_adjacency_t *adj4 = ip_get_adjacency(lm4, ai);
-
-    u32 *p = (u32 *)&adj4->rewrite_data[0];
-    /* Delete route when no other domains use this source */
-    if (p[1] == 0) {
-      memset(&args4, 0, sizeof(args4));
-      args4.table_index_or_table_id = 0;
-      args4.flags = IP4_ROUTE_FLAG_DEL;
-      args4.dst_address.as_u32 = d->ip4_prefix.as_u32;
-      args4.dst_address_length = d->ip4_prefix_len;
-      args4.adj_index = 0;
-      args4.add_adj = &adj;
-      args4.n_add_adj = 0;
-      ip4_add_del_route(im4, &args4);
-    }
-    p[1]--;
-  }
+  fib_prefix_t pfx = {
+      .fp_proto = FIB_PROTOCOL_IP4,
+      .fp_len = 32,
+      .fp_addr = {
+         .ip4 = d->ip4_src,
+      },
+  };
+  fib_table_entry_special_remove(0, &pfx, FIB_SOURCE_SIXRD);
+
+  fib_prefix_t pfx6 = {
+      .fp_proto = FIB_PROTOCOL_IP6,
+      .fp_len = d->ip6_prefix_len,
+      .fp_addr = {
+         .ip6 = d->ip6_prefix,
+      },
+  };
+  fib_table_entry_special_remove(0, &pfx6, FIB_SOURCE_SIXRD);
 
   pool_put(mm->domains, d);
 
@@ -361,19 +361,9 @@ vlib_plugin_register (vlib_main_t * vm, vnet_plugin_handoff_t * h,
 
 static clib_error_t * sixrd_init (vlib_main_t * vm)
 {
-  clib_error_t * error = 0;
-  sixrd_main_t *mm = &sixrd_main;
+  sixrd_dpo_module_init ();
 
-  vlib_node_t * ip6_lookup_node = vlib_get_node_by_name(vm, (u8 *)"ip6-lookup");
-  vlib_node_t * ip4_lookup_node = vlib_get_node_by_name(vm, (u8 *)"ip4-lookup");
-  vlib_node_t * ip6_sixrd_node = vlib_get_node_by_name(vm, (u8 *)"ip6-sixrd");
-  vlib_node_t * ip4_sixrd_node = vlib_get_node_by_name(vm, (u8 *)"ip4-sixrd");
-  ASSERT(ip6_lookup_node && ip4_lookup_node && ip6_sixrd_node && ip4_sixrd_node);
-
-  mm->ip6_lookup_next_index = vlib_node_add_next(vm, ip6_lookup_node->index, ip6_sixrd_node->index);
-  mm->ip4_lookup_next_index = vlib_node_add_next(vm, ip4_lookup_node->index, ip4_sixrd_node->index);
-
-  return error;
+  return (NULL);
 }
 
 VLIB_INIT_FUNCTION (sixrd_init);
index 2f0912f..56714c9 100644 (file)
@@ -17,6 +17,9 @@
 #include <vppinfra/error.h>
 #include <vnet/vnet.h>
 #include <vnet/ip/ip.h>
+#include <vnet/fib/ip6_fib.h>
+
+#include "sixrd_dpo.h"
 
 int sixrd_create_domain(ip6_address_t *ip6_prefix, u8 ip6_prefix_len,
                        ip4_address_t *ip4_prefix, u8 ip4_prefix_len,
@@ -44,9 +47,6 @@ typedef struct {
   /* convenience */
   vlib_main_t *vlib_main;
   vnet_main_t *vnet_main;
-
-  u32 ip4_lookup_next_index;
-  u32 ip6_lookup_next_index;
 } sixrd_main_t;
 
 #define foreach_sixrd_error                            \
@@ -99,16 +99,16 @@ sixrd_get_addr (sixrd_domain_t *d, u64 dal)
  * Get the SIXRD domain from an IPv6 lookup adjacency.
  */
 static_always_inline sixrd_domain_t *
-ip6_sixrd_get_domain (u32 adj_index, u32 *sixrd_domain_index)
+ip6_sixrd_get_domain (u32 sdi, u32 *sixrd_domain_index)
 {
   sixrd_main_t *mm = &sixrd_main;
-  ip_lookup_main_t *lm = &ip6_main.lookup_main;
-  ip_adjacency_t *adj = ip_get_adjacency(lm, adj_index);
-  ASSERT(adj);
-  uword *p = (uword *)adj->rewrite_data;
-  ASSERT(p);
-  *sixrd_domain_index = p[0];
-  return pool_elt_at_index(mm->domains, p[0]);
+  sixrd_dpo_t *sd;
+
+  sd = sixrd_dpo_get(sdi);
+
+  ASSERT(sd);
+  *sixrd_domain_index = sd->sd_domain;
+  return pool_elt_at_index(mm->domains, *sixrd_domain_index);
 }
 
 /*
@@ -117,28 +117,25 @@ ip6_sixrd_get_domain (u32 adj_index, u32 *sixrd_domain_index)
  * The IPv6 address is used otherwise.
  */
 static_always_inline sixrd_domain_t *
-ip4_sixrd_get_domain (u32 adj_index, ip6_address_t *addr,
+ip4_sixrd_get_domain (u32 sdi, ip6_address_t *addr,
                      u32 *sixrd_domain_index, u8 *error)
 {
   sixrd_main_t *mm = &sixrd_main;
-  ip6_main_t *im6 = &ip6_main;
-  ip_lookup_main_t *lm4 = &ip4_main.lookup_main;
-  ip_lookup_main_t *lm6 = &ip6_main.lookup_main;
-  ip_adjacency_t *adj = ip_get_adjacency(lm4, adj_index);
-  ASSERT(adj);
-  uword *p = (uword *)adj->rewrite_data;
-  ASSERT(p);
-  *sixrd_domain_index = p[0];
-  if (p[0] != ~0)
-    return pool_elt_at_index(mm->domains, p[0]);
-
-  u32 ai = ip6_fib_lookup_with_table(im6, 0, addr);
-  ip_adjacency_t *adj6 = ip_get_adjacency (lm6, ai);
-  if (PREDICT_TRUE(adj6->lookup_next_index == mm->ip6_lookup_next_index)) {
-    uword *p = (uword *)adj6->rewrite_data;
-    *sixrd_domain_index = p[0];
+  sixrd_dpo_t *sd;
+
+  sd = sixrd_dpo_get(sdi);
+  *sixrd_domain_index = sd->sd_domain;
+  if (*sixrd_domain_index != ~0)
     return pool_elt_at_index(mm->domains, *sixrd_domain_index);
-  }
+
+  u32 lbi = ip6_fib_table_fwding_lookup(&ip6_main, 0, addr);
+  const dpo_id_t *dpo = load_balance_get_bucket(lbi, 0);
+  if (PREDICT_TRUE(dpo->dpoi_type == sixrd_dpo_type))
+    {
+      sd = sixrd_dpo_get(dpo->dpoi_index);
+      *sixrd_domain_index = sd->sd_domain;
+      return pool_elt_at_index(mm->domains, *sixrd_domain_index);
+    }
   *error = SIXRD_ERROR_NO_DOMAIN;
   return NULL;
 }
diff --git a/plugins/sixrd-plugin/sixrd/sixrd_dpo.c b/plugins/sixrd-plugin/sixrd/sixrd_dpo.c
new file mode 100644 (file)
index 0000000..88a0793
--- /dev/null
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sixrd_dpo.h"
+#include <vnet/ip/ip.h>
+
+/**
+ * pool of all SIXRD DPOs
+ */
+sixrd_dpo_t *sixrd_dpo_pool;
+
+/**
+ * The registered SIXRD DPO type
+ */
+dpo_type_t sixrd_dpo_type;
+
+static sixrd_dpo_t *
+sixrd_dpo_alloc (void)
+{
+    sixrd_dpo_t *sd;
+
+    pool_get_aligned(sixrd_dpo_pool, sd, CLIB_CACHE_LINE_BYTES);
+    memset(sd, 0, sizeof(*sd));
+
+    return (sd);
+}
+
+static index_t
+sixrd_dpo_get_index (sixrd_dpo_t *sd)
+{
+    return (sd - sixrd_dpo_pool);
+}
+
+void
+sixrd_dpo_create (dpo_proto_t dproto,
+               u32 domain_index,
+               dpo_id_t *dpo)
+{
+    sixrd_dpo_t *sd;
+
+    sd = sixrd_dpo_alloc();
+    sd->sd_domain = domain_index;
+    sd->sd_proto = dproto;
+
+    dpo_set(dpo,
+           sixrd_dpo_type,
+           dproto,
+           sixrd_dpo_get_index(sd));
+}
+
+u8*
+format_sixrd_dpo (u8 *s, va_list *args)
+{
+    index_t index = va_arg (*args, index_t);
+    CLIB_UNUSED(u32 indent) = va_arg (*args, u32);
+    sixrd_dpo_t *sd;
+
+    sd = sixrd_dpo_get(index);
+
+    return (format(s, "sixrd:[%d]:%U domain:%d",
+                  index,
+                   format_dpo_proto, sd->sd_proto,
+                  sd->sd_domain));
+}
+
+
+static void
+sixrd_dpo_lock (dpo_id_t *dpo)
+{
+    sixrd_dpo_t *sd;
+
+    sd = sixrd_dpo_get(dpo->dpoi_index);
+
+    sd->sd_locks++;
+}
+
+static void
+sixrd_dpo_unlock (dpo_id_t *dpo)
+{
+    sixrd_dpo_t *sd;
+
+    sd = sixrd_dpo_get(dpo->dpoi_index);
+
+    sd->sd_locks--;
+
+    if (0 == sd->sd_locks)
+    {
+       pool_put(sixrd_dpo_pool, sd);
+    }
+}
+
+const static dpo_vft_t sd_vft = {
+    .dv_lock = sixrd_dpo_lock,
+    .dv_unlock = sixrd_dpo_unlock,
+    .dv_format = format_sixrd_dpo,
+};
+
+const static char* const sixrd_ip4_nodes[] =
+{
+    "ip4-sixrd",
+    NULL,
+};
+const static char* const sixrd_ip6_nodes[] =
+{
+    "ip6-sixrd",
+    NULL,
+};
+
+const static char* const * const sixrd_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = sixrd_ip4_nodes,
+    [DPO_PROTO_IP6]  = sixrd_ip6_nodes,
+    [DPO_PROTO_MPLS] = NULL,
+};
+
+void
+sixrd_dpo_module_init (void)
+{
+    sixrd_dpo_type = dpo_register_new_type(&sd_vft, sixrd_nodes);
+}
diff --git a/plugins/sixrd-plugin/sixrd/sixrd_dpo.h b/plugins/sixrd-plugin/sixrd/sixrd_dpo.h
new file mode 100644 (file)
index 0000000..1714228
--- /dev/null
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SIXRD_DPO_H__
+#define __SIXRD_DPO_H__
+
+#include <vnet/vnet.h>
+#include <vnet/dpo/dpo.h>
+
+/**
+ * A representation of a 6RD DPO
+ */
+typedef struct sixrd_dpo_t
+{
+    /**
+     * The data-plane protocol
+     */
+    dpo_proto_t sd_proto;
+
+    /**
+     * the SIXRD domain index
+     */
+    u32 sd_domain;
+
+    /**
+     * Number of locks/users of the DPO
+     */
+    u16 sd_locks;
+} sixrd_dpo_t;
+
+extern void sixrd_dpo_create (dpo_proto_t dproto,
+                             u32 domain_index,
+                             dpo_id_t *dpo);
+
+/*
+ * Encapsulation violation for fast data-path access
+ */
+extern sixrd_dpo_t *sixrd_dpo_pool;
+extern dpo_type_t sixrd_dpo_type;
+
+static inline sixrd_dpo_t *
+sixrd_dpo_get (index_t index)
+{
+    return (pool_elt_at_index(sixrd_dpo_pool, index));
+}
+
+extern void sixrd_dpo_module_init(void);
+
+#endif
index 18cc4ba..b9917df 100644 (file)
@@ -384,7 +384,7 @@ VLIB_REGISTER_NODE (vcgn_classify_node) = {
   .next_nodes = {
     [VCGN_CLASSIFY_NEXT_IP4_INPUT]      = "ip4-input",
     [VCGN_CLASSIFY_NEXT_IP6_INPUT]      = "ip6-input",
-    [VCGN_CLASSIFY_NEXT_MPLS_INPUT]     = "mpls-gre-input",
+    [VCGN_CLASSIFY_NEXT_MPLS_INPUT]     = "mpls-input",
     [VCGN_CLASSIFY_NEXT_ETHERNET_INPUT] = "ethernet-input",
        [VCGN_CLASSIFY_NEXT_UDP_INSIDE]     = "vcgn-v4-udp-i2o",
        [VCGN_CLASSIFY_NEXT_UDP_OUTSIDE]    = "vcgn-v4-udp-o2i",
index 1c47c65..41568e0 100644 (file)
@@ -13,7 +13,7 @@
 
 AUTOMAKE_OPTIONS = foreign subdir-objects
 
-AM_CFLAGS = -Wall @DPDK@ @IPSEC@ @IPV6SR@
+AM_CFLAGS = -Wall -Werror @DPDK@ @IPSEC@ @IPV6SR@
 
 libvnet_la_SOURCES =
 libvnetplugin_la_SOURCES =
@@ -264,7 +264,6 @@ nobase_include_HEADERS +=                   \
 # Layer 3 protocol: IP v4/v6
 ########################################
 libvnet_la_SOURCES +=                          \
- vnet/ip/adj_alloc.c                           \
  vnet/ip/format.c                              \
  vnet/ip/icmp4.c                               \
  vnet/ip/icmp6.c                               \
@@ -296,7 +295,6 @@ libvnet_la_SOURCES +=                               \
  vnet/ip/ip_frag.c
 
 nobase_include_HEADERS +=                      \
- vnet/ip/adj_alloc.h                           \
  vnet/ip/format.h                              \
  vnet/ip/icmp46_packet.h                       \
  vnet/ip/icmp4.h                               \
@@ -369,13 +367,15 @@ nobase_include_HEADERS +=                 \
 ########################################
 libvnet_la_SOURCES +=                           \
  vnet/map/map.c                                 \
+ vnet/map/map_dpo.c                             \
  vnet/map/ip4_map.c                             \
  vnet/map/ip6_map.c                             \
  vnet/map/ip4_map_t.c                           \
  vnet/map/ip6_map_t.c
 
 nobase_include_HEADERS +=                       \
- vnet/map/map.h
+ vnet/map/map.h                                 \
+ vnet/map/map_dpo.h
 
 if ENABLE_TESTS
 TESTS += test_map
@@ -422,16 +422,20 @@ nobase_include_HEADERS +=                 \
 # Tunnel protocol: gre+mpls
 ########################################
 libvnet_la_SOURCES +=                          \
- vnet/mpls-gre/mpls.c                          \
- vnet/mpls-gre/node.c                          \
- vnet/mpls-gre/interface.c                     \
- vnet/mpls-gre/policy_encap.c                  \
- vnet/mpls-gre/pg.c
+ vnet/mpls/mpls.c                              \
+ vnet/mpls/mpls_lookup.c                       \
+ vnet/mpls/mpls_output.c                       \
+ vnet/mpls/mpls_features.c                     \
+ vnet/mpls/node.c                              \
+ vnet/mpls/interface.c                         \
+ vnet/mpls/policy_encap.c                      \
+ vnet/mpls/pg.c
  
 nobase_include_HEADERS +=                      \
- vnet/mpls-gre/mpls.h                          \
- vnet/mpls-gre/packet.h                                \
- vnet/mpls-gre/error.def
+ vnet/mpls/mpls.h                              \
+ vnet/mpls/mpls_types.h                                \
+ vnet/mpls/packet.h                            \
+ vnet/mpls/error.def
 
 
 ########################################
@@ -466,6 +470,7 @@ nobase_include_HEADERS +=                   \
 
 libvnet_la_SOURCES +=                          \
  vnet/lisp-cp/lisp_types.c                     \
+ vnet/lisp-cp/lisp_cp_dpo.c                    \
  vnet/lisp-cp/control.c                                \
  vnet/lisp-cp/gid_dictionary.c                 \
  vnet/lisp-cp/lisp_msg_serdes.c                        \
@@ -513,6 +518,9 @@ endif
 
 libvnet_la_SOURCES +=                          \
  vnet/lisp-gpe/lisp_gpe.c                      \
+ vnet/lisp-gpe/lisp_gpe_sub_interface.c         \
+ vnet/lisp-gpe/lisp_gpe_adjacency.c             \
+ vnet/lisp-gpe/lisp_gpe_tunnel.c                \
  vnet/lisp-gpe/interface.c                     \
  vnet/lisp-gpe/ip_forward.c                    \
  vnet/lisp-gpe/decap.c                         
@@ -719,6 +727,90 @@ nobase_include_HEADERS +=                  \
   vnet/unix/tuntap.h                           \
   vnet/unix/tapcli.h
 
+########################################
+# FIB
+########################################
+
+libvnet_la_SOURCES +=                          \
+  vnet/fib/fib.c                                \
+  vnet/fib/fib_test.c                           \
+  vnet/fib/ip4_fib.c                            \
+  vnet/fib/ip6_fib.c                            \
+  vnet/fib/mpls_fib.c                           \
+  vnet/fib/fib_table.c                          \
+  vnet/fib/fib_walk.c                           \
+  vnet/fib/fib_types.c                          \
+  vnet/fib/fib_node.c                           \
+  vnet/fib/fib_node_list.c                      \
+  vnet/fib/fib_entry.c                          \
+  vnet/fib/fib_entry_src.c                      \
+  vnet/fib/fib_entry_src_rr.c                   \
+  vnet/fib/fib_entry_src_interface.c            \
+  vnet/fib/fib_entry_src_default_route.c        \
+  vnet/fib/fib_entry_src_special.c              \
+  vnet/fib/fib_entry_src_api.c                  \
+  vnet/fib/fib_entry_src_adj.c                  \
+  vnet/fib/fib_entry_src_mpls.c                 \
+  vnet/fib/fib_entry_src_lisp.c                 \
+  vnet/fib/fib_entry_cover.c                    \
+  vnet/fib/fib_path_list.c                      \
+  vnet/fib/fib_path.c                          \
+  vnet/fib/fib_path_ext.c                      \
+  vnet/fib/fib_attached_export.c
+
+nobase_include_HEADERS +=                      \
+  vnet/fib/fib.h                               \
+  vnet/fib/ip4_fib.h                           \
+  vnet/fib/ip6_fib.h                           \
+  vnet/fib/fib_types.h                         \
+  vnet/fib/fib_table.h                         \
+  vnet/fib/fib_node.h                          \
+  vnet/fib/fib_node_list.h                     \
+  vnet/fib/fib_entry.h                         
+
+########################################
+# ADJ
+########################################
+
+libvnet_la_SOURCES +=                          \
+  vnet/adj/adj_alloc.c                         \
+  vnet/adj/adj_nbr.c                           \
+  vnet/adj/adj_rewrite.c                       \
+  vnet/adj/adj_glean.c                         \
+  vnet/adj/adj_midchain.c                      \
+  vnet/adj/adj.c                            
+
+nobase_include_HEADERS +=                      \
+  vnet/adj/adj.h                               \
+  vnet/adj/adj_types.h                         \
+  vnet/adj/adj_rewrite.h                       \
+  vnet/adj/adj_glean.h                         \
+  vnet/adj/adj_nbr.h                           
+
+########################################
+# Data-Plane Objects
+########################################
+
+libvnet_la_SOURCES +=                          \
+  vnet/dpo/dpo.c                                \
+  vnet/dpo/drop_dpo.c                           \
+  vnet/dpo/punt_dpo.c                           \
+  vnet/dpo/receive_dpo.c                        \
+  vnet/dpo/load_balance.c                      \
+  vnet/dpo/load_balance_map.c                  \
+  vnet/dpo/lookup_dpo.c                        \
+  vnet/dpo/classify_dpo.c                      \
+  vnet/dpo/mpls_label_dpo.c
+
+nobase_include_HEADERS +=                      \
+  vnet/dpo/load_balance.h                       \
+  vnet/dpo/drop_dpo.h                           \
+  vnet/dpo/lookup_dpo.h                         \
+  vnet/dpo/punt_dpo.h                           \
+  vnet/dpo/classify_dpo.h                       \
+  vnet/dpo/receive_dpo.h                        \
+  vnet/dpo/dpo.h
+
 ########################################
 # Plugin client library
 ########################################
diff --git a/vnet/etc/scripts/arp4-mpls b/vnet/etc/scripts/arp4-mpls
new file mode 100644 (file)
index 0000000..d3d39f3
--- /dev/null
@@ -0,0 +1,24 @@
+packet-generator new {
+  name x
+  limit 1
+  node ip4-input
+  size 64-64
+  no-recycle
+  data {
+    ICMP: 1.0.0.2 -> 2.2.2.2
+    ICMP echo_request
+    incrementing 100
+  }
+}
+
+loop create
+loop create
+set int state loop0 up
+set int state loop1 up
+
+set int ip address loop0 1.0.0.1/24
+set int ip address loop1 2.0.0.1/24
+
+ip route add 2.2.2.2/32 via 2.0.0.2 loop1 out-label 33
+
+trace add pg-input 100
diff --git a/vnet/etc/scripts/lfib/ip4-to-mpls b/vnet/etc/scripts/lfib/ip4-to-mpls
new file mode 100644 (file)
index 0000000..8575379
--- /dev/null
@@ -0,0 +1,26 @@
+packet-generator new {
+  name x
+  limit 1
+  node ip4-input
+  size 64-64
+  no-recycle
+  data {
+    ICMP: 1.0.0.2 -> 2.2.2.2
+    ICMP echo_request
+    incrementing 100
+  }
+}
+
+loop create
+loop create
+set int state loop0 up
+set int state loop1 up
+
+set int ip address loop0 1.0.0.1/24
+set int ip address loop1 2.0.0.1/24
+
+set ip arp static loop1 2.0.0.2 dead.beef.babe
+set int mpls loop1 enable
+ip route add 2.2.2.2/32 via 2.0.0.2 loop1 out-label 33
+
+trace add pg-input 100
diff --git a/vnet/etc/scripts/lfib/mpls-pop-to-mpls b/vnet/etc/scripts/lfib/mpls-pop-to-mpls
new file mode 100644 (file)
index 0000000..2818ac1
--- /dev/null
@@ -0,0 +1,28 @@
+packet-generator new {
+  name x
+  limit 1
+  node mpls-input
+  size 72-72
+  no-recycle
+  data {
+   hex 0x0001e0ff0001f1ff4500004000000000400177ba010000020202020208007a6e000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f2021222324252627
+  }
+}
+
+loop create
+loop create
+set int state loop0 up
+set int state loop1 up
+
+set int ip address loop0 1.0.0.1/24
+set int ip address loop1 2.0.0.1/24
+
+set ip arp static loop1 2.0.0.2 dead.beef.babe
+set int mpls loop1 enable
+
+ip route add 2.2.2.2/32 via 2.0.0.2 loop1 out-label 33
+
+mpls local-label add 30 non-eos mpls-lookup-in-table 0
+mpls local-label add 31 2.2.2.2/32 
+
+trace add pg-input 100
diff --git a/vnet/etc/scripts/lfib/mpls-to-ip4 b/vnet/etc/scripts/lfib/mpls-to-ip4
new file mode 100644 (file)
index 0000000..24e235e
--- /dev/null
@@ -0,0 +1,27 @@
+packet-generator new {
+  name x
+  limit 1
+  node mpls-input
+  size 68-68
+  no-recycle
+  data {
+   hex 0x0001e1ff4500004000000000400177ba010000020202020208007a6e000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f2021222324252627
+  }
+}
+
+loop create
+loop create
+set int state loop0 up
+set int state loop1 up
+
+set int ip address loop0 1.0.0.1/24
+set int ip address loop1 2.0.0.1/24
+
+set ip arp static loop1 2.0.0.2 dead.beef.babe
+set int mpls loop1 enable
+
+ip route add 2.2.2.2/32 via 2.0.0.2 loop1 out-label 33
+
+mpls local-label add 30 eos ip4-lookup-in-table 0
+
+trace add pg-input 100
diff --git a/vnet/etc/scripts/lfib/mpls-to-mpls b/vnet/etc/scripts/lfib/mpls-to-mpls
new file mode 100644 (file)
index 0000000..497dbab
--- /dev/null
@@ -0,0 +1,26 @@
+packet-generator new {
+  name x
+  limit 1
+  node mpls-input
+  size 68-68
+  no-recycle
+  data {
+   hex 0x0001e1ff4500004000000000400177ba010000020200000208007a6e000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f2021222324252627
+  }
+}
+
+loop create
+loop create
+set int state loop0 up
+set int state loop1 up
+
+set int ip address loop0 1.0.0.1/24
+set int ip address loop1 2.0.0.1/24
+
+set ip arp static loop1 2.0.0.2 dead.beef.babe
+set int mpls loop1 enable
+
+ip route add 2.2.2.2/32 via 2.0.0.2 loop1 out-label 33
+mpls local-label add 30 2.2.2.2/32 
+
+trace add pg-input 100
diff --git a/vnet/etc/scripts/mpls-o-ethernet/pg b/vnet/etc/scripts/mpls-o-ethernet/pg
new file mode 100644 (file)
index 0000000..ba5397f
--- /dev/null
@@ -0,0 +1,10 @@
+packet-generator new {
+  name x
+  limit 1
+  node mpls-ethernet-input
+  size 68-68
+  no-recycle
+  data {
+   hex 0x0001e1ff4500004000000000400177ba010000020200000208007a6e000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f2021222324252627
+  }
+}
\ No newline at end of file
diff --git a/vnet/etc/scripts/mpls-o-ethernet/single.conf b/vnet/etc/scripts/mpls-o-ethernet/single.conf
new file mode 100644 (file)
index 0000000..2a25d35
--- /dev/null
@@ -0,0 +1,17 @@
+comment { single node configuration }
+
+loop create
+loop create
+set int state loop0 up
+set int state loop1 up
+
+set int ip address loop0 1.0.0.1/24
+set int ip address loop1 2.0.0.1/24
+
+
+ip route add 2.2.2.2/32 via 2.0.0.2 loop1
+
+mpls encap add label 30 fib 0 dest 2.2.2.2
+mpls decap add label 30 fib 0
+
+create mpls ethernet tunnel dst 00:50:56:b7:05:cb adj 2.2.2.2/32 tx-intfc loop1 fib-id 0
diff --git a/vnet/etc/scripts/source_and_port_range_check b/vnet/etc/scripts/source_and_port_range_check
new file mode 100644 (file)
index 0000000..dce227b
--- /dev/null
@@ -0,0 +1,63 @@
+
+create loop int
+
+set int state loop0 up
+set int ip addr loop0 10.10.10.10/32
+
+packet-generator new {
+  name deny-from-default-route
+  limit 1
+  node ip4-input
+  size 64-64
+  no-recycle
+  data {
+    UDP: 1.2.3.4 -> 5.6.7.8
+    UDP: 3000 -> 3001
+    length 128 checksum 0 incrementing 1
+  }
+}
+
+packet-generator new {
+  name allow
+  limit 1
+  node ip4-input
+  size 64-64
+  no-recycle
+  data {
+    UDP: 1.1.1.1 -> 5.6.7.8
+    UDP: 3000 -> 3001
+    length 128 checksum 0 incrementing 1
+  }
+}
+
+packet-generator new {
+  name deny-from-port-range
+  limit 1
+  node ip4-input
+  size 64-64
+  no-recycle
+  data {
+    UDP: 1.1.1.1 -> 5.6.7.8
+    UDP: 6000 -> 6001
+    length 128 checksum 0 incrementing 1
+  }
+}
+
+set ip source-and-port-range-check 1.1.1.0/24 range 2000 - 3000 vrf 99
+
+set interface ip source-and-port-range-check pg0 udp-out-vrf 99
+
+ show ip source-and-port-range-check vrf 99 1.1.1.1
+
+set ip source-and-port-range-check 1.1.1.0/24 range 4000 - 5000 vrf 99
+
+set ip source-and-port-range-check 1.1.2.0/24 range 4000 - 5000 vrf 99
+
+show ip source-and-port-range-check vrf 99 1.1.1.1
+show ip source-and-port-range-check vrf 99 1.1.2.1
+
+set ip source-and-port-range-check 1.1.2.0/24 range 4000 - 5000 vrf 99 del
+
+show ip source-and-port-range-check vrf 99 1.1.2.1
+
+tr add pg-input 100
diff --git a/vnet/vnet/adj/adj.c b/vnet/vnet/adj/adj.c
new file mode 100644 (file)
index 0000000..b552fdb
--- /dev/null
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/adj/adj.h>
+#include <vnet/adj/adj_alloc.h>
+#include <vnet/adj/adj_internal.h>
+#include <vnet/adj/adj_glean.h>
+#include <vnet/adj/adj_midchain.h>
+#include <vnet/fib/fib_node_list.h>
+
+/*
+ * Special Adj with index zero. We need to define this since the v4 mtrie
+ * assumes an index of 0 implies the ply is empty. Therefore all 'real'
+ * adjs need a non-zero index.
+ */
+static ip_adjacency_t *special_v4_miss_adj_with_index_zero;
+
+/* Adjacency packet/byte counters indexed by adjacency index. */
+vlib_combined_counter_main_t adjacency_counters;
+
+always_inline void
+adj_poison (ip_adjacency_t * adj)
+{
+    if (CLIB_DEBUG > 0)
+    {
+       u32 save_handle = adj->heap_handle;;
+
+       memset (adj, 0xfe, sizeof (adj[0]));
+
+       adj->heap_handle = save_handle;
+    }
+}
+
+ip_adjacency_t *
+adj_alloc (fib_protocol_t proto)
+{
+    ip_adjacency_t *adj;
+
+    adj = aa_alloc();
+
+    adj_poison(adj);
+
+    /* Make sure certain fields are always initialized. */
+    /* Validate adjacency counters. */
+    vlib_validate_combined_counter(&adjacency_counters,
+                                   adj->heap_handle);
+
+    adj->rewrite_header.sw_if_index = ~0;
+    adj->mcast_group_index = ~0;
+    adj->saved_lookup_next_index = 0;
+    adj->n_adj = 1;
+
+    fib_node_init(&adj->ia_node,
+                  FIB_NODE_TYPE_ADJ);
+    adj->ia_nh_proto = proto;
+
+    return (adj);
+}
+
+static int
+adj_index_is_special (adj_index_t adj_index)
+{
+    if (ADJ_INDEX_INVALID == adj_index)
+       return (!0);
+
+    return (0);
+}
+
+/**
+ * @brief Pretty print helper function for formatting specific adjacencies.
+ * @param s - input string to format
+ * @param args - other args passed to format function such as:
+ *                 - vnet_main_t
+ *                 - adj_index
+ *                 - format_ip_adjacency_flags_t
+ */
+u8 *
+format_ip_adjacency (u8 * s, va_list * args)
+{
+  vnet_main_t * vnm = va_arg (*args, vnet_main_t *);
+  u32 adj_index = va_arg (*args, u32);
+  format_ip_adjacency_flags_t fiaf = va_arg (*args, format_ip_adjacency_flags_t);
+  ip_adjacency_t * adj = adj_get(adj_index);
+  
+  switch (adj->lookup_next_index)
+  {
+  case IP_LOOKUP_NEXT_REWRITE:
+      s = format (s, "%U", format_adj_nbr, adj_index, 0);
+      break;
+  case IP_LOOKUP_NEXT_ARP:
+      s = format (s, "%U", format_adj_nbr_incomplete, adj_index, 0);
+      break;
+  case IP_LOOKUP_NEXT_GLEAN:
+      s = format (s, " %U",
+                 format_vnet_sw_interface_name,
+                 vnm,
+                 vnet_get_sw_interface(vnm,
+                                       adj->rewrite_header.sw_if_index));
+      break;
+
+  case IP_LOOKUP_NEXT_MIDCHAIN:
+      s = format (s, "%U", format_adj_midchain, adj_index, 2);
+      break;
+  default:
+      break;
+  }
+  s = format (s, " index:%d", adj_index);
+
+  if (fiaf & FORMAT_IP_ADJACENCY_DETAIL)
+  {
+      s = format (s, " locks:%d", adj->ia_node.fn_locks);
+      s = format(s, "\nchildren:\n ");
+      s = fib_node_children_format(adj->ia_node.fn_children, s);
+  }
+
+  return s;
+}
+
+/*
+ * adj_last_lock_gone
+ *
+ * last lock/reference to the adj has gone, we no longer need it.
+ */
+static void
+adj_last_lock_gone (ip_adjacency_t *adj)
+{
+    ASSERT(0 == fib_node_list_get_size(adj->ia_node.fn_children));
+    ADJ_DBG(adj, "last-lock-gone");
+
+    switch (adj->lookup_next_index)
+    {
+    case IP_LOOKUP_NEXT_MIDCHAIN:
+        dpo_reset(&adj->sub_type.midchain.next_dpo);
+        /* FALL THROUGH */
+    case IP_LOOKUP_NEXT_ARP:
+    case IP_LOOKUP_NEXT_REWRITE:
+       /*
+        * complete and incomplete nbr adjs
+        */
+       adj_nbr_remove(adj->ia_nh_proto,
+                      adj->ia_link,
+                      &adj->sub_type.nbr.next_hop,
+                      adj->rewrite_header.sw_if_index);
+       break;
+    case IP_LOOKUP_NEXT_GLEAN:
+       adj_glean_remove(adj->ia_nh_proto,
+                        adj->rewrite_header.sw_if_index);
+       break;
+    default:
+       /*
+        * type not stored in any DB from which we need to remove it
+        */
+       break;
+    }
+
+    fib_node_deinit(&adj->ia_node);
+    aa_free(adj);
+}
+
+void
+adj_lock (adj_index_t adj_index)
+{
+    ip_adjacency_t *adj;
+
+    if (adj_index_is_special(adj_index))
+    {
+       return;
+    }
+
+    adj = adj_get(adj_index);
+    ASSERT(adj);
+    ASSERT(adj->heap_handle!=0);
+
+    ADJ_DBG(adj, "lock");
+    fib_node_lock(&adj->ia_node);
+}
+
+void
+adj_unlock (adj_index_t adj_index)
+{
+    ip_adjacency_t *adj;
+
+    if (adj_index_is_special(adj_index))
+    {
+       return;
+    }
+
+    adj = adj_get(adj_index);
+    ASSERT(adj);
+    ASSERT(adj->heap_handle!=0);
+
+    ADJ_DBG(adj, "unlock");
+    ASSERT(adj);
+    ASSERT(adj->heap_handle!=0);
+
+    fib_node_unlock(&adj->ia_node);
+}
+
+u32
+adj_child_add (adj_index_t adj_index,
+              fib_node_type_t child_type,
+              fib_node_index_t child_index)
+{
+    ASSERT(ADJ_INDEX_INVALID != adj_index);
+    if (adj_index_is_special(adj_index))
+    {
+       return (~0);
+    }
+
+    return (fib_node_child_add(FIB_NODE_TYPE_ADJ,
+                               adj_index,
+                               child_type,
+                               child_index));
+}
+
+void
+adj_child_remove (adj_index_t adj_index,
+                 u32 sibling_index)
+{
+    if (adj_index_is_special(adj_index))
+    {
+       return;
+    }
+
+    fib_node_child_remove(FIB_NODE_TYPE_ADJ,
+                          adj_index,
+                          sibling_index);
+}
+
+static fib_node_t *
+adj_get_node (fib_node_index_t index)
+{
+    ip_adjacency_t *adj;
+
+    adj = adj_get(index);
+
+    return (&adj->ia_node);
+}
+
+#define ADJ_FROM_NODE(_node)                                           \
+    ((ip_adjacency_t*)((char*)_node - STRUCT_OFFSET_OF(ip_adjacency_t, ia_node)))
+
+static void
+adj_node_last_lock_gone (fib_node_t *node)
+{
+    adj_last_lock_gone(ADJ_FROM_NODE(node));
+}
+
+static fib_node_back_walk_rc_t
+adj_back_walk_notify (fib_node_t *node,
+                     fib_node_back_walk_ctx_t *ctx)
+{
+    /*
+     * What's going on? I am at the end! No back-walk is expected to reach an adj.
+     */
+    ASSERT(0);
+
+    return (FIB_NODE_BACK_WALK_CONTINUE);
+}
+
+/*
+ * Adjacency's graph node virtual function table
+ */
+static const fib_node_vft_t adj_vft = {
+    .fnv_get = adj_get_node,
+    .fnv_last_lock = adj_node_last_lock_gone,
+    .fnv_back_walk = adj_back_walk_notify,
+};
+
+static clib_error_t *
+adj_module_init (vlib_main_t * vm)
+{
+    fib_node_register_type(FIB_NODE_TYPE_ADJ, &adj_vft);
+
+    adj_nbr_module_init();
+    adj_glean_module_init();
+    adj_midchain_module_init();
+
+    /*
+     * 4 special adjs for v4 and v6 resp.
+     */
+    aa_bootstrap(8);
+    special_v4_miss_adj_with_index_zero = adj_alloc(FIB_PROTOCOL_IP4);
+
+    return (NULL);
+}
+
+VLIB_INIT_FUNCTION (adj_module_init);
+
+/* 
+ * DEPRECATED: DO NOT USE
+ *
+ * Create new block of given number of contiguous adjacencies.
+ */
+ip_adjacency_t *
+ip_add_adjacency (ip_lookup_main_t * lm,
+                 ip_adjacency_t * copy_adj,
+                 u32 n_adj,
+                 u32 * adj_index_return)
+{
+  ip_adjacency_t * adj;
+  u32 ai, i, handle;
+
+  ASSERT(1==n_adj);
+
+  adj = aa_alloc ();
+  handle = ai = adj->heap_handle;
+
+  /* Validate adjacency counters. */
+  vlib_validate_combined_counter (&adjacency_counters, ai + n_adj - 1);
+
+  for (i = 0; i < n_adj; i++)
+    {
+      /* Make sure certain fields are always initialized. */
+      adj[i].rewrite_header.sw_if_index = ~0;
+      adj[i].mcast_group_index = ~0;
+      adj[i].saved_lookup_next_index = 0;
+
+      if (copy_adj)
+       adj[i] = copy_adj[i];
+
+      adj[i].heap_handle = handle;
+      adj[i].n_adj = n_adj;
+
+      /* Zero possibly stale counters for re-used adjacencies. */
+      vlib_zero_combined_counter (&adjacency_counters, ai + i);
+    }
+
+  *adj_index_return = ai;
+  return adj;
+}
diff --git a/vnet/vnet/adj/adj.h b/vnet/vnet/adj/adj.h
new file mode 100644 (file)
index 0000000..3a12364
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * An adjacency is a representation of an attached L3 peer.
+ *
+ * Adjacency Sub-types:
+ *   - neighbour: a representation of an attached L3 peer.
+ *                Key:{addr,interface,link/ether-type}
+ *           SHARED
+ *   - glean: used to drive ARP/ND for packets destined to a local sub-net.
+ *            'glean' means use the packet's destination address as the target
+ *            address in the ARP packet.
+ *          UNSHARED. Only one per-interface.
+ *   - midchain: a neighbour adj on a virtual/tunnel interface.
+ *   - rewrite: an adj with no key, but with a rewrite string.
+ *
+ * The API to create and update the adjacency is very sub-type specific. This
+ * is intentional as it encourages the user to carefully consider which adjacency
+ * sub-type they are really using, and hence assign it data in the appropriate
+ * sub-type space in the union of sub-types. This prevents the adj becoming a
+ * disorganised dumping ground for 'my feature needs a u16 somewhere' data. It
+ * is important to enforce this approach as space in the adjacency is at a premium,
+ * as we need it to fit in 1 cache line.
+ *
+ * The API is also based around an index to an adjacency, not a raw pointer. This
+ * is so the user doesn't suffer the same limp-inducing firearm injuries that
+ * the author suffered, as the adjacencies can realloc.
+ */
+
+#ifndef __ADJ_H__
+#define __ADJ_H__
+
+#include <vnet/ip/lookup.h>
+#include <vnet/adj/adj_types.h>
+#include <vnet/adj/adj_nbr.h>
+#include <vnet/adj/adj_rewrite.h>
+#include <vnet/adj/adj_glean.h>
+
+/**
+ * @brief
+ *   Take a reference counting lock on the adjacency
+ */
+extern void adj_lock(adj_index_t adj_index);
+/**
+ * @brief
+ *   Release a reference counting lock on the adjacency
+ */
+extern void adj_unlock(adj_index_t adj_index);
+
+/**
+ * @brief
+ *  Add a child dependent to an adjacency. The child will
+ *  thus be informed via its registered back-walk function
+ *  when the adjacency state changes.
+ */
+extern u32 adj_child_add(adj_index_t adj_index,
+                        fib_node_type_t type,
+                        fib_node_index_t child_index);
+/**
+ * @brief
+ *  Remove a child dependent
+ */
+extern void adj_child_remove(adj_index_t adj_index,
+                            u32 sibling_index);
+
+/**
+ * @brief
+ * The global adjacency heap. Exposed for fast/inline data-plane access
+ */
+extern ip_adjacency_t *adj_heap;
+
+/**
+ * @brief 
+ * Adjacency packet counters
+ */
+extern vlib_combined_counter_main_t adjacency_counters;
+
+/**
+ * @brief
+ * Get a pointer to an adjacency object from its index
+ */
+static inline ip_adjacency_t *
+adj_get (adj_index_t adj_index)
+{
+    return (vec_elt_at_index(adj_heap, adj_index));
+}
+
+#endif
similarity index 72%
rename from vnet/vnet/ip/adj_alloc.c
rename to vnet/vnet/adj/adj_alloc.c
index 3ae7a19..5cc8cf6 100644 (file)
  * limitations under the License.
  */
 
-#include <vnet/ip/adj_alloc.h>
+#include <vnet/adj/adj_alloc.h>
 #include <vnet/ip/ip.h>
 
+/*
+ * the single adj heap
+ */
+ip_adjacency_t *adj_heap;
+
 /* 
  * any operation which could cause the adj vector to be reallocated
  * must have a worker thread barrier
  */
-
 static inline int will_reallocate (ip_adjacency_t * adjs, u32 n)
 {
   uword aligned_header_bytes, new_data_bytes;
@@ -45,13 +49,14 @@ static inline int will_reallocate (ip_adjacency_t * adjs, u32 n)
 }
 
 ip_adjacency_t * 
-aa_alloc (ip_adjacency_t * adjs, ip_adjacency_t **blockp, u32 n)
+aa_alloc (void)
 {
   vlib_main_t * vm = &vlib_global_main;
-  aa_header_t * ah = aa_header (adjs);
+  aa_header_t * ah = aa_header (adj_heap);
   ip_adjacency_t * adj_block;
   u32 freelist_length;
   int need_barrier_sync = 0;
+  u32 n = 1;
   
   ASSERT(os_get_cpu_number() == 0);
   ASSERT (clib_mem_is_heap_object (_vec_find(ah)));
@@ -59,14 +64,14 @@ aa_alloc (ip_adjacency_t * adjs, ip_adjacency_t **blockp, u32 n)
   /* If we don't have a freelist of size N, fresh allocation is required */
   if (vec_len (ah->free_indices_by_size) <= n)
     {
-      if (will_reallocate (adjs, n))
+      if (will_reallocate (adj_heap, n))
         {
           need_barrier_sync = 1;
           vlib_worker_thread_barrier_sync (vm);
         }
       /* Workers wont look at the freelists... */
       vec_validate (ah->free_indices_by_size, n);
-      vec_add2_ha (adjs, adj_block, n, aa_aligned_header_bytes, 
+      vec_add2_ha (adj_heap, adj_block, n, aa_aligned_header_bytes, 
                    CLIB_CACHE_LINE_BYTES);
       if (need_barrier_sync)
         vlib_worker_thread_barrier_release (vm);
@@ -77,17 +82,17 @@ aa_alloc (ip_adjacency_t * adjs, ip_adjacency_t **blockp, u32 n)
     {
       u32 index = ah->free_indices_by_size[n][freelist_length-1];
 
-      adj_block = &adjs[index];
+      adj_block = &adj_heap[index];
       _vec_len(ah->free_indices_by_size[n]) -= 1;
       goto out;
     }
   /* Allocate a new block of size N */
-  if (will_reallocate (adjs, n))
+  if (will_reallocate (adj_heap, n))
     {
       need_barrier_sync = 1;
       vlib_worker_thread_barrier_sync (vm);
     }
-  vec_add2_ha (adjs, adj_block, n, aa_aligned_header_bytes, 
+  vec_add2_ha (adj_heap, adj_block, n, aa_aligned_header_bytes, 
                CLIB_CACHE_LINE_BYTES);
   
   if (need_barrier_sync)
@@ -95,40 +100,45 @@ aa_alloc (ip_adjacency_t * adjs, ip_adjacency_t **blockp, u32 n)
 
  out:
   memset (adj_block, 0, n * (sizeof(*adj_block)));
-  adj_block->heap_handle = adj_block - adjs;
+  adj_block->heap_handle = adj_block - adj_heap;
   adj_block->n_adj = n;
-  *blockp = adj_block;
-  return adjs;
+
+  /*
+   * the adj heap may have realloc'd. recache.
+   */
+  ip4_main.lookup_main.adjacency_heap = adj_heap;
+  ip6_main.lookup_main.adjacency_heap = adj_heap;
+
+  return (adj_block);
 }
 
-void aa_free (ip_adjacency_t * adjs, ip_adjacency_t * adj)
+void aa_free (ip_adjacency_t * adj)
 {
-  aa_header_t * ah = aa_header (adjs);
+  aa_header_t * ah = aa_header (adj_heap);
   
-  ASSERT (adjs && adj && (adj->heap_handle < vec_len (adjs)));
-  ASSERT (adj->n_adj < vec_len (ah->free_indices_by_size));
+  ASSERT (adj_heap && adj && (adj->heap_handle < vec_len (adj_heap)));
   ASSERT (adj->heap_handle != 0);
   
   vec_add1 (ah->free_indices_by_size[adj->n_adj], adj->heap_handle);
   adj->heap_handle = 0;
 }
 
-ip_adjacency_t * aa_bootstrap (ip_adjacency_t * adjs, u32 n)
+void aa_bootstrap (u32 n)
 {
   ip_adjacency_t * adj_block;
   aa_header_t * ah;
   int i;
 
-  vec_add2_ha (adjs, adj_block, n, aa_aligned_header_bytes, 
+  vec_add2_ha (adj_heap, adj_block, n, aa_aligned_header_bytes, 
                CLIB_CACHE_LINE_BYTES);
 
   memset (adj_block, 0, n * sizeof(*adj_block));
-  ah = aa_header (adjs);
+  ah = aa_header (adj_heap);
   memset (ah, 0, sizeof (*ah));
 
   vec_validate (ah->free_indices_by_size, 1);
 
-  for (i = 0 ; i < vec_len (adjs); i++)
+  for (i = 0 ; i < vec_len (adj_heap); i++)
     {
       adj_block->n_adj = 1;
       adj_block->heap_handle = ~0;
@@ -136,24 +146,23 @@ ip_adjacency_t * aa_bootstrap (ip_adjacency_t * adjs, u32 n)
       vec_add1 (ah->free_indices_by_size[1], n - (i+1));
     }
 
-  return adjs;
+  ip4_main.lookup_main.adjacency_heap = adj_heap;
+  ip6_main.lookup_main.adjacency_heap = adj_heap;
 }
 
 u8 * format_adjacency_alloc (u8 * s, va_list * args)
 {
   vnet_main_t * vnm = va_arg (*args, vnet_main_t *);
-  ip_lookup_main_t * lm = va_arg (*args, ip_lookup_main_t *);
-  ip_adjacency_t * adjs = va_arg (*args, ip_adjacency_t *);
   int verbose = va_arg (*args, int);
   ip_adjacency_t * adj;
   u32 inuse = 0, freed = 0;
   u32 on_freelist = 0;
   int i, j;
-  aa_header_t * ah = aa_header (adjs);
+  aa_header_t * ah = aa_header (adj_heap);
 
-  for (i = 0; i < vec_len (adjs); i += adj->n_adj)
+  for (i = 0; i < vec_len (adj_heap); i += adj->n_adj)
     {
-      adj = adjs + i;
+      adj = adj_heap + i;
       if ((i == 0) || adj->heap_handle)
         inuse += adj->n_adj;
       else
@@ -164,19 +173,19 @@ u8 * format_adjacency_alloc (u8 * s, va_list * args)
     {
       for (j = 0; j < vec_len(ah->free_indices_by_size[i]); j++)
         {
-          adj = adjs + ah->free_indices_by_size[i][j];
+          adj = adj_heap + ah->free_indices_by_size[i][j];
           ASSERT(adj->heap_handle == 0);
           on_freelist += adj->n_adj;
         }
     }
       
-  s = format (s, "adjs: %d total, %d in use, %d free, %d on freelists\n",
-              vec_len(adjs), inuse, freed, on_freelist);
+  s = format (s, "adj_heap: %d total, %d in use, %d free, %d on freelists\n",
+              vec_len(adj_heap), inuse, freed, on_freelist);
   if (verbose)
     {
-      for (i = 0; i < vec_len (adjs); i += adj->n_adj)
+      for (i = 0; i < vec_len (adj_heap); i += adj->n_adj)
         {
-          adj = adjs + i;
+          adj = adj_heap + i;
           if ((i == 0) || adj->heap_handle)
             {
               if (adj->n_adj > 1)
@@ -190,7 +199,7 @@ u8 * format_adjacency_alloc (u8 * s, va_list * args)
                     s = format (s, "      ");
 
                   s = format(s, "%U\n", format_ip_adjacency, 
-                         vnm, lm, i+j);
+                            vnm, i+j, FORMAT_IP_ADJACENCY_NONE);
                 }
             }
         }
@@ -200,36 +209,22 @@ u8 * format_adjacency_alloc (u8 * s, va_list * args)
 
 static clib_error_t *
 show_adjacency_alloc_command_fn (vlib_main_t * vm,
-                unformat_input_t * input,
-                vlib_cli_command_t * cmd)
+                                unformat_input_t * input,
+                                vlib_cli_command_t * cmd)
 {
   int verbose = 0;
   vnet_main_t *vnm = vnet_get_main();
-  ip_lookup_main_t *lm = 0;
-  ip_adjacency_t * adjs = 0;
-  int is_ip4 = 1;
   
   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) 
     {
       if (unformat (input, "verbose"))
         verbose = 1;
-      else if (unformat (input, "ip4"))
-        ;
-      else if (unformat (input, "ip6"))
-        is_ip4 = 0;
       else
         return clib_error_return (0, "unknown input `%U'",
                                   format_unformat_error, input);
     }
 
-  if (is_ip4)
-      lm = &ip4_main.lookup_main;
-  else
-      lm = &ip6_main.lookup_main;
-
-  adjs = lm->adjacency_heap;
-
-  vlib_cli_output (vm, "%U", format_adjacency_alloc, vnm, lm, adjs, verbose);
+  vlib_cli_output (vm, "%U", format_adjacency_alloc, vnm, verbose);
 
   return 0;
 }
similarity index 87%
rename from vnet/vnet/ip/adj_alloc.h
rename to vnet/vnet/adj/adj_alloc.h
index a10146c..7d1a3fb 100644 (file)
@@ -16,7 +16,8 @@
 #ifndef __adj_alloc_h__
 #define __adj_alloc_h__
 
-/* 
+/**
+ * @brief
  * Adjacency allocator: heap-like in that the code
  * will dole out contiguous chunks of n items. In the interests of 
  * thread safety, we don't bother about coalescing free blocks of size r
@@ -43,10 +44,9 @@ static inline aa_header_t * aa_header (void * v)
   return vec_aligned_header (v, sizeof (aa_header_t), sizeof (void *));
 }
 
-ip_adjacency_t * 
-aa_alloc (ip_adjacency_t * adjs, ip_adjacency_t **blockp, u32 n);
-void aa_free (ip_adjacency_t * adjs, ip_adjacency_t * adj);
-ip_adjacency_t * aa_bootstrap (ip_adjacency_t * adjs, u32 n);
+extern ip_adjacency_t *aa_alloc(void);
+extern void aa_free (ip_adjacency_t * adj);
+extern void aa_bootstrap (u32 n);
 
 format_function_t format_adj_allocation;
 
diff --git a/vnet/vnet/adj/adj_glean.c b/vnet/vnet/adj/adj_glean.c
new file mode 100644 (file)
index 0000000..6eb6718
--- /dev/null
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/adj/adj.h>
+#include <vnet/adj/adj_alloc.h>
+#include <vnet/adj/adj_internal.h>
+#include <vnet/fib/fib_walk.h>
+
+/*
+ * The 'DB' of all glean adjs.
+ * There is only one glean per-interface per-protocol, so this is a per-interface
+ * vector
+ */
+static adj_index_t *adj_gleans[FIB_PROTOCOL_MAX];
+
+static inline vlib_node_registration_t*
+adj_get_glean_node (fib_protocol_t proto)
+{
+    switch (proto) {
+    case FIB_PROTOCOL_IP4:
+       return (&ip4_glean_node);
+    case FIB_PROTOCOL_IP6:
+       return (&ip6_glean_node);
+    case FIB_PROTOCOL_MPLS:
+       break;
+    }
+    ASSERT(0);
+    return (NULL);
+}
+
+/*
+ * adj_glean_add_or_lock
+ *
+ * The next_hop address here is used for source address selection in the DP.
+ * The glean adj is added to an interface's connected prefix, the next-hop
+ * passed here is the local prefix on the same interface.
+ */
+adj_index_t
+adj_glean_add_or_lock (fib_protocol_t proto,
+                      u32 sw_if_index,
+                      const ip46_address_t *nh_addr)
+{
+    ip_adjacency_t * adj;
+
+    vec_validate_init_empty(adj_gleans[proto], sw_if_index, ADJ_INDEX_INVALID);
+
+    if (ADJ_INDEX_INVALID == adj_gleans[proto][sw_if_index])
+    {
+       adj = adj_alloc(proto);
+
+       adj->lookup_next_index = IP_LOOKUP_NEXT_GLEAN;
+       adj->ia_nh_proto = proto;
+       adj_gleans[proto][sw_if_index] = adj->heap_handle;
+
+       if (NULL != nh_addr)
+       {
+           adj->sub_type.glean.receive_addr = *nh_addr;
+       }
+
+       adj->rewrite_header.data_bytes = 0;
+
+       vnet_rewrite_for_sw_interface(vnet_get_main(),
+                                     adj_fib_proto_2_nd(proto),
+                                     sw_if_index,
+                                     adj_get_glean_node(proto)->index,
+                                     VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST,
+                                     &adj->rewrite_header,
+                                     sizeof (adj->rewrite_data));
+    }
+    else
+    {
+       adj = adj_get(adj_gleans[proto][sw_if_index]);
+    }
+
+    adj_lock(adj->heap_handle);
+
+    return (adj->heap_handle);
+}
+
+void
+adj_glean_remove (fib_protocol_t proto,
+                 u32 sw_if_index)
+{
+    ASSERT(sw_if_index < vec_len(adj_gleans[proto]));
+
+    adj_gleans[proto][sw_if_index] = ADJ_INDEX_INVALID;
+}
+
+static clib_error_t *
+adj_glean_interface_state_change (vnet_main_t * vnm,
+                                 u32 sw_if_index,
+                                 u32 flags)
+{
+    /*
+     * for each glean on the interface trigger a walk back to the children
+     */
+    fib_protocol_t proto;
+    ip_adjacency_t *adj;
+
+
+    for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
+    {
+       if (sw_if_index >= vec_len(adj_gleans[proto]) ||
+           ADJ_INDEX_INVALID == adj_gleans[proto][sw_if_index])
+           continue;
+
+       adj = adj_get(adj_gleans[proto][sw_if_index]);
+
+       fib_node_back_walk_ctx_t bw_ctx = {
+           .fnbw_reason = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP ?
+                           FIB_NODE_BW_REASON_FLAG_INTERFACE_UP :
+                           FIB_NODE_BW_REASON_FLAG_INTERFACE_DOWN),
+       };
+
+       fib_walk_sync(FIB_NODE_TYPE_ADJ, adj->heap_handle, &bw_ctx);
+    }
+
+    return (NULL);
+}
+
+VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION(adj_glean_interface_state_change);
+
+static clib_error_t *
+adj_glean_interface_delete (vnet_main_t * vnm,
+                           u32 sw_if_index,
+                           u32 is_add)
+{
+    /*
+     * for each glean on the interface trigger a walk back to the children
+     */
+    fib_protocol_t proto;
+    ip_adjacency_t *adj;
+
+    if (is_add)
+    {
+       /*
+        * not interested in interface additions. we will not back walk
+        * to resolve paths through newly added interfaces. Why? The control
+        * plane should have the brains to add interfaces first, then routes.
+        * So the case where there are paths with an interface that matches
+        * one just created is the case where the path resolved through an
+        * interface that was deleted, and still has not been removed. There is
+        * NO GUARANTEE that the interface being
+        * added now, even though it may have the same sw_if_index, is the
+        * same interface that the path needs. So tough!
+        * If the control plane wants these routes to resolve it needs to
+        * remove and add them again.
+        */
+       return (NULL);
+    }
+
+    for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
+    {
+       if (sw_if_index >= vec_len(adj_gleans[proto]) ||
+           ADJ_INDEX_INVALID == adj_gleans[proto][sw_if_index])
+           continue;
+
+       adj = adj_get(adj_gleans[proto][sw_if_index]);
+
+       fib_node_back_walk_ctx_t bw_ctx = {
+           .fnbw_reason =  FIB_NODE_BW_REASON_FLAG_INTERFACE_DELETE,
+       };
+
+       fib_walk_sync(FIB_NODE_TYPE_ADJ, adj->heap_handle, &bw_ctx);
+    }
+
+    return (NULL);
+}
+
+VNET_SW_INTERFACE_ADD_DEL_FUNCTION(adj_glean_interface_delete);
+
+u8*
+format_adj_glean (u8* s, va_list *ap)
+{
+    index_t index = va_arg(ap, index_t);
+    CLIB_UNUSED(u32 indent) = va_arg(ap, u32);
+    vnet_main_t * vnm = vnet_get_main();
+    ip_adjacency_t * adj = adj_get(index);
+
+    return (format(s, " glean: %U",
+                   format_vnet_sw_interface_name,
+                   vnm,
+                   vnet_get_sw_interface(vnm,
+                                         adj->rewrite_header.sw_if_index)));
+}
+
+
+static void
+adj_dpo_lock (dpo_id_t *dpo)
+{
+    adj_lock(dpo->dpoi_index);
+}
+static void
+adj_dpo_unlock (dpo_id_t *dpo)
+{
+    adj_unlock(dpo->dpoi_index);
+}
+
+const static dpo_vft_t adj_glean_dpo_vft = {
+    .dv_lock = adj_dpo_lock,
+    .dv_unlock = adj_dpo_unlock,
+    .dv_format = format_adj_glean,
+};
+
+/**
+ * @brief The per-protocol VLIB graph nodes that are assigned to a glean
+ *        object.
+ *
+ * this means that these graph nodes are ones from which a glean is the
+ * parent object in the DPO-graph.
+ */
+const static char* const glean_ip4_nodes[] =
+{
+    "ip4-glean",
+    NULL,
+};
+const static char* const glean_ip6_nodes[] =
+{
+    "ip6-glean",
+    NULL,
+};
+
+const static char* const * const glean_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = glean_ip4_nodes,
+    [DPO_PROTO_IP6]  = glean_ip6_nodes,
+    [DPO_PROTO_MPLS] = NULL,
+};
+
+void
+adj_glean_module_init (void)
+{
+    dpo_register(DPO_ADJACENCY_GLEAN, &adj_glean_dpo_vft, glean_nodes);
+}
diff --git a/vnet/vnet/adj/adj_glean.h b/vnet/vnet/adj/adj_glean.h
new file mode 100644 (file)
index 0000000..ce3534e
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @brief Glean Adjacency
+ *
+ * A glean adjacency represents the need to discover new peers on an
+ * attached link. Packets that hit a glean adjacency will generate an
+ * ARP/ND packet addressed to the packet's destination address.
+ * Note this is different to an incomplete neighbour adjacency, which
+ * does not send ARP/ND requests to the packet's destination address,
+ * but instead to the next-hop address of the adjacency itself.
+ */
+
+#ifndef __ADJ_GLEAN_H__
+#define __ADJ_GLEAN_H__
+
+#include <vnet/adj/adj_types.h>
+
+/**
+ * @brief
+ *  Add (and lock) a new or lock an existing glean adjacency
+ *
+ * @param proto
+ *  The protocol for the neighbours that we wish to glean
+ *
+ * @param sw_if_index
+ *  The interface on which to glean
+ *
+ * @param nh_addr
+ *  the address applied to the interface on which to glean. This is used
+ *  as the source address in packets when the ARP/ND packet is sent
+ */
+extern adj_index_t adj_glean_add_or_lock(fib_protocol_t proto,
+                                        u32 sw_if_index,
+                                        const ip46_address_t *nh_addr);
+
+/**
+ * @brief
+ *  Module initialisation
+ */
+extern void adj_glean_module_init(void);
+
+#endif
diff --git a/vnet/vnet/adj/adj_internal.h b/vnet/vnet/adj/adj_internal.h
new file mode 100644 (file)
index 0000000..79042d1
--- /dev/null
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ADJ_INTERNAL_H__
+#define __ADJ_INTERNAL_H__
+
+#include <vnet/adj/adj.h>
+#include <vnet/ip/ip.h>
+#include <vnet/mpls/mpls.h>
+
+
+/**
+ * big switch to turn on Adjacency debugging
+ */
+#undef ADJ_DEBUG
+
+/*
+ * Debug macro
+ */
+#ifdef ADJ_DEBUG
+#define ADJ_DBG(_adj, _fmt, _args...)                  \
+{                                                      \
+    clib_warning("adj:[%d:%p]:" _fmt,                  \
+                _adj->heap_handle, _adj,               \
+                ##_args);                              \
+}
+#else
+#define ADJ_DBG(_e, _fmt, _args...)
+#endif
+
+static inline vlib_node_registration_t*
+adj_get_rewrite_node (fib_link_t linkt)
+{
+    switch (linkt) {
+    case FIB_LINK_IP4:
+       return (&ip4_rewrite_node);
+    case FIB_LINK_IP6:
+       return (&ip6_rewrite_node);
+    case FIB_LINK_MPLS:
+       return (&mpls_output_node);
+    }
+    ASSERT(0);
+    return (NULL);
+}
+
+static inline vnet_l3_packet_type_t
+adj_fib_link_2_vnet (fib_link_t linkt)
+{
+    switch (linkt)
+    {
+    case FIB_LINK_IP4:
+       return (VNET_L3_PACKET_TYPE_IP4);
+    case FIB_LINK_IP6:
+       return (VNET_L3_PACKET_TYPE_IP6);
+    case FIB_LINK_MPLS:
+       return (VNET_L3_PACKET_TYPE_MPLS_UNICAST);
+    }
+    return (0);
+}
+
+static inline vnet_l3_packet_type_t
+adj_fib_proto_2_nd (fib_protocol_t fp)
+{
+    switch (fp)
+    {
+    case FIB_PROTOCOL_IP4:
+       return (VNET_L3_PACKET_TYPE_ARP);
+    case FIB_PROTOCOL_IP6:
+       return (VNET_L3_PACKET_TYPE_IP6);
+    case FIB_PROTOCOL_MPLS:
+       return (VNET_L3_PACKET_TYPE_MPLS_UNICAST);
+    }
+    return (0);
+}
+
+extern ip_adjacency_t * adj_alloc(fib_protocol_t proto);
+
+extern void adj_nbr_remove(fib_protocol_t nh_proto,
+                          fib_link_t link_type,
+                          const ip46_address_t *nh_addr,
+                          u32 sw_if_index);
+extern void adj_glean_remove(fib_protocol_t proto,
+                            u32 sw_if_index);
+
+#endif
diff --git a/vnet/vnet/adj/adj_midchain.c b/vnet/vnet/adj/adj_midchain.c
new file mode 100644 (file)
index 0000000..4b9b6a4
--- /dev/null
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/adj/adj_nbr.h>
+#include <vnet/adj/adj_internal.h>
+#include <vnet/ethernet/arp_packet.h>
+#include <vnet/dpo/drop_dpo.h>
+#include <vnet/fib/fib_walk.h>
+
+static inline u32
+adj_get_midchain_node (fib_link_t link)
+{
+    switch (link) {
+    case FIB_LINK_IP4:
+       return (ip4_midchain_node.index);
+    case FIB_LINK_IP6:
+       return (ip6_midchain_node.index);
+    case FIB_LINK_MPLS:
+       return (mpls_midchain_node.index);
+    }
+    ASSERT(0);
+    return (0);
+}
+
+/**
+ * adj_nbr_midchain_update_rewrite
+ *
+ * Update the adjacency's rewrite string. A NULL string implies the
+ * rewrite is reset (i.e. when ARP/ND entry is gone).
+ * NB: the adj being updated may be handling traffic in the DP.
+ */
+void
+adj_nbr_midchain_update_rewrite (adj_index_t adj_index,
+                                u32 post_rewrite_node,
+                                u8 *rewrite)
+{
+    ip_adjacency_t *adj;
+
+    ASSERT(ADJ_INDEX_INVALID != adj_index);
+
+    adj = adj_get(adj_index);
+    adj->lookup_next_index = IP_LOOKUP_NEXT_MIDCHAIN;
+    adj->sub_type.midchain.tx_function_node = post_rewrite_node;
+
+    if (NULL != rewrite)
+    {
+       /*
+        * new rewrite provided.
+        * use a dummy rewrite header to get the interface to print into.
+        */
+       ip_adjacency_t dummy;
+        dpo_id_t tmp = DPO_NULL;
+
+       vnet_rewrite_for_tunnel(vnet_get_main(),
+                               adj->rewrite_header.sw_if_index,
+                               adj_get_midchain_node(adj->ia_link),
+                               adj->sub_type.midchain.tx_function_node,
+                               &dummy.rewrite_header,
+                               rewrite,
+                               vec_len(rewrite));
+
+       /*
+        * this is an update of an existing rewrite.
+         * packets are in flight. we'll need to briefly stack on the drop DPO
+         * whilst the rewrite is written, so any packets that see the partial update
+         * are binned.
+         */
+        if (!dpo_id_is_valid(&adj->sub_type.midchain.next_dpo))
+        {
+            /*
+             * not stacked yet. stack on the drop
+             */
+            dpo_stack(DPO_ADJACENCY_MIDCHAIN,
+                      fib_proto_to_dpo(adj->ia_nh_proto),
+                      &adj->sub_type.midchain.next_dpo,
+                      drop_dpo_get(fib_proto_to_dpo(adj->ia_nh_proto)));
+        }
+            
+        dpo_copy(&tmp, &adj->sub_type.midchain.next_dpo);
+        dpo_stack(DPO_ADJACENCY_MIDCHAIN,
+                  fib_proto_to_dpo(adj->ia_nh_proto),
+                  &adj->sub_type.midchain.next_dpo,
+                  drop_dpo_get(fib_proto_to_dpo(adj->ia_nh_proto)));
+
+       CLIB_MEMORY_BARRIER();
+
+       clib_memcpy(&adj->rewrite_header,
+                   &dummy.rewrite_header,
+                   VLIB_BUFFER_PRE_DATA_SIZE);
+
+       CLIB_MEMORY_BARRIER();
+
+        /*
+         * The graph arc used/created here is from the post-rewrite node to the
+         * child's registered node. This is because post adj processing the next
+         * node is the interface's specific node, then the post-rewrite-node (aka
+         * the interface's tx-function) - from there we need to get to the stacked
+         * child's node.
+         */
+        dpo_stack_from_node(adj->sub_type.midchain.tx_function_node,
+                            &adj->sub_type.midchain.next_dpo,
+                            &tmp);
+        dpo_reset(&tmp);
+    }
+    else
+    {
+       ASSERT(0);
+    }
+
+    /*
+     * time for walkies fido.
+     */
+    fib_node_back_walk_ctx_t bw_ctx = {
+       .fnbw_reason = FIB_NODE_BW_REASON_ADJ_UPDATE,
+    };
+
+    fib_walk_sync(FIB_NODE_TYPE_ADJ, adj->heap_handle, &bw_ctx);
+}
+
+/**
+ * adj_nbr_midchain_stack
+ */
+void
+adj_nbr_midchain_stack (adj_index_t adj_index,
+                       const dpo_id_t *next)
+{
+    ip_adjacency_t *adj;
+
+    ASSERT(ADJ_INDEX_INVALID != adj_index);
+
+    adj = adj_get(adj_index);
+
+    ASSERT(IP_LOOKUP_NEXT_MIDCHAIN == adj->lookup_next_index);
+
+    dpo_stack_from_node(adj->sub_type.midchain.tx_function_node,
+                        &adj->sub_type.midchain.next_dpo,
+                        next);
+}
+
+u8*
+format_adj_midchain (u8* s, va_list *ap)
+{
+    index_t index = va_arg(ap, index_t);
+    u32 indent = va_arg(ap, u32);
+    vnet_main_t * vnm = vnet_get_main();
+    ip_adjacency_t * adj = adj_get(index);
+
+    s = format (s, "%U", format_fib_link, adj->ia_link);
+    s = format (s, " via %U ",
+               format_ip46_address, &adj->sub_type.nbr.next_hop);
+    s = format (s, " %U",
+                format_vnet_rewrite,
+                vnm->vlib_main, &adj->rewrite_header,
+                sizeof (adj->rewrite_data), indent);
+    s = format (s, "\n%Ustacked-on:\n%U%U",
+                format_white_space, indent,
+                format_white_space, indent+2,
+                format_dpo_id, &adj->sub_type.midchain.next_dpo, indent+2);
+
+    return (s);
+}
+
+static void
+adj_dpo_lock (dpo_id_t *dpo)
+{
+    adj_lock(dpo->dpoi_index);
+}
+static void
+adj_dpo_unlock (dpo_id_t *dpo)
+{
+    adj_unlock(dpo->dpoi_index);
+}
+
+const static dpo_vft_t adj_midchain_dpo_vft = {
+    .dv_lock = adj_dpo_lock,
+    .dv_unlock = adj_dpo_unlock,
+    .dv_format = format_adj_midchain,
+};
+
+/**
+ * @brief The per-protocol VLIB graph nodes that are assigned to a midchain
+ *        object.
+ *
+ * this means that these graph nodes are ones from which a midchain is the
+ * parent object in the DPO-graph.
+ */
+const static char* const midchain_ip4_nodes[] =
+{
+    "ip4-midchain",
+    NULL,
+};
+const static char* const midchain_ip6_nodes[] =
+{
+    "ip6-midchain",
+    NULL,
+};
+const static char* const midchain_mpls_nodes[] =
+{
+    "mpls-midchain",
+    NULL,
+};
+
+const static char* const * const midchain_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = midchain_ip4_nodes,
+    [DPO_PROTO_IP6]  = midchain_ip6_nodes,
+    [DPO_PROTO_MPLS] = midchain_mpls_nodes,
+};
+
+void
+adj_midchain_module_init (void)
+{
+    dpo_register(DPO_ADJACENCY_MIDCHAIN, &adj_midchain_dpo_vft, midchain_nodes);
+}
diff --git a/vnet/vnet/adj/adj_midchain.h b/vnet/vnet/adj/adj_midchain.h
new file mode 100644 (file)
index 0000000..adf86f1
--- /dev/null
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * Midchain Adjacency sub-type. These adjs represent an L3 peer on a
+ * tunnel interface. The tunnel's adjacency is thus not the end of the chain,
+ * and needs to stack on/link to another chain (or portion of the graph) to
+ * reach the tunnel's destination.
+ */
+
+#ifndef __ADJ_MIDCHAIN_H__
+#define __ADJ_MIDCHAIN_H__
+
+#include <vnet/adj/adj.h>
+
+/**
+ * @brief
+ *  Convert an existing neighbour adjacency into a midchain
+ *
+ * @param adj_index
+ *  The index of the neighbour adjacency.
+ *
+ * @param post_rewrite_node
+ *  The VLIB graph node that provides the post-encap fixup.
+ *  where 'fixup' is e.g., correcting chksum, length, etc.
+ *
+ * @param rewrite
+ *  The rewrite.
+ */
+extern void adj_nbr_midchain_update_rewrite(adj_index_t adj_index,
+                                           u32 post_rewrite_node,
+                                           u8 *rewrite);
+
+/**
+ * @brief
+ *  [re]stack a midchain. 'Stacking' is the act of forming parent-child
+ *  relationships in the data-plane graph.
+ *
+ * @param adj_index
+ *  The index of the midchain to stack
+ *
+ * @param dpo
+ *  The parent DPO to stack onto (i.e. become a child of).
+ */
+extern void adj_nbr_midchain_stack(adj_index_t adj_index,
+                                  const dpo_id_t *dpo);
+
+/**
+ * @brief
+ *  Module initialisation
+ */
+extern void adj_midchain_module_init(void);
+
+/**
+ * @brief
+ * Format a midchain adjacency
+ */
+extern u8* format_adj_midchain(u8* s, va_list *ap);
+
+#endif
diff --git a/vnet/vnet/adj/adj_nbr.c b/vnet/vnet/adj/adj_nbr.c
new file mode 100644 (file)
index 0000000..7da1bec
--- /dev/null
@@ -0,0 +1,835 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/adj/adj_nbr.h>
+#include <vnet/adj/adj_internal.h>
+#include <vnet/ethernet/arp_packet.h>
+#include <vnet/fib/fib_walk.h>
+
+/*
+ * Vector Hash tables of neighbour (traditional) adjacencies
+ *  Key: interface(for the vector index), address (and its proto),
+ *       link-type/ether-type.
+ */
+static BVT(clib_bihash) **adj_nbr_tables[FIB_PROTOCOL_MAX];
+
+// FIXME SIZE APPROPRIATELY. ASK DAVEB.
+#define ADJ_NBR_DEFAULT_HASH_NUM_BUCKETS (64 * 64)
+#define ADJ_NBR_DEFAULT_HASH_MEMORY_SIZE (32<<20)
+
+
+#define ADJ_NBR_SET_KEY(_key, _lt, _nh)         \
+{                                              \
+    _key.key[0] = (_nh)->as_u64[0];            \
+    _key.key[1] = (_nh)->as_u64[1];            \
+    _key.key[2] = (_lt);                       \
+}
+/* True iff a neighbour hash table exists for interface _itf under _proto. */
+#define ADJ_NBR_ITF_OK(_proto, _itf)                   \
+    (((_itf) < vec_len(adj_nbr_tables[(_proto)])) &&   \
+     (NULL != adj_nbr_tables[(_proto)][(_itf)]))
+
+static void
+adj_nbr_insert (fib_protocol_t nh_proto,
+               fib_link_t link_type,
+               const ip46_address_t *nh_addr,
+               u32 sw_if_index,
+               adj_index_t adj_index)
+{
+    BVT(clib_bihash_kv) kv;
+
+    if (sw_if_index >= vec_len(adj_nbr_tables[nh_proto]))
+    {
+       vec_validate(adj_nbr_tables[nh_proto], sw_if_index);
+    }
+    if (NULL == adj_nbr_tables[nh_proto][sw_if_index])
+    {
+       adj_nbr_tables[nh_proto][sw_if_index] =
+           clib_mem_alloc_aligned(sizeof(BVT(clib_bihash)),
+                                  CLIB_CACHE_LINE_BYTES);
+       memset(adj_nbr_tables[nh_proto][sw_if_index],
+              0,
+              sizeof(BVT(clib_bihash)));
+
+       BV(clib_bihash_init) (adj_nbr_tables[nh_proto][sw_if_index],
+                             "Adjacency Neighbour table",
+                             ADJ_NBR_DEFAULT_HASH_NUM_BUCKETS,
+                             ADJ_NBR_DEFAULT_HASH_MEMORY_SIZE);
+    }
+
+    ADJ_NBR_SET_KEY(kv, link_type, nh_addr);
+    kv.value = adj_index;
+
+    BV(clib_bihash_add_del) (adj_nbr_tables[nh_proto][sw_if_index], &kv, 1);
+}
+
+void
+adj_nbr_remove (fib_protocol_t nh_proto,
+               fib_link_t link_type,
+               const ip46_address_t *nh_addr,
+               u32 sw_if_index)
+{
+    BVT(clib_bihash_kv) kv;
+
+    if (!ADJ_NBR_ITF_OK(nh_proto, sw_if_index))
+       return;
+
+    ADJ_NBR_SET_KEY(kv, link_type, nh_addr);
+
+    BV(clib_bihash_add_del) (adj_nbr_tables[nh_proto][sw_if_index], &kv, 0);
+}
+
+static adj_index_t
+adj_nbr_find (fib_protocol_t nh_proto,
+             fib_link_t link_type,
+             const ip46_address_t *nh_addr,
+             u32 sw_if_index)
+{
+    BVT(clib_bihash_kv) kv;
+
+    ADJ_NBR_SET_KEY(kv, link_type, nh_addr);
+
+    if (!ADJ_NBR_ITF_OK(nh_proto, sw_if_index))
+       return (ADJ_INDEX_INVALID);
+
+    if (BV(clib_bihash_search)(adj_nbr_tables[nh_proto][sw_if_index],
+                              &kv, &kv) < 0)
+    {
+       return (ADJ_INDEX_INVALID);
+    }
+    else
+    {
+       return (kv.value);
+    }
+}
+
+static inline vlib_node_registration_t*
+adj_get_nd_node (fib_protocol_t proto)
+{
+    switch (proto) {
+    case FIB_PROTOCOL_IP4:
+       return (&ip4_arp_node);
+    case FIB_PROTOCOL_IP6:
+       return (&ip6_discover_neighbor_node);
+    case FIB_PROTOCOL_MPLS:
+       break;
+    }
+    ASSERT(0);
+    return (NULL);
+}
+
+static void
+adj_ip4_nbr_probe (ip_adjacency_t *adj)
+{
+    vnet_main_t * vnm = vnet_get_main();
+    ip4_main_t * im = &ip4_main;
+    ip_interface_address_t * ia;
+    ethernet_arp_header_t * h;
+    vnet_hw_interface_t * hi;
+    vnet_sw_interface_t * si;
+    ip4_address_t * src;
+    vlib_buffer_t * b;
+    vlib_main_t * vm;
+    u32 bi = 0;
+
+    vm = vlib_get_main();
+
+    si = vnet_get_sw_interface (vnm,
+                               adj->rewrite_header.sw_if_index);
+
+    if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
+    {
+        return;
+    }
+
+    src =
+      ip4_interface_address_matching_destination(im,
+                                                &adj->sub_type.nbr.next_hop.ip4,
+                                                adj->rewrite_header.sw_if_index,
+                                                &ia);
+    if (! src)
+    {
+        return;
+    }
+
+    h = vlib_packet_template_get_packet (vm, &im->ip4_arp_request_packet_template, &bi);
+
+    hi = vnet_get_sup_hw_interface (vnm, adj->rewrite_header.sw_if_index);
+
+    clib_memcpy (h->ip4_over_ethernet[0].ethernet,
+                hi->hw_address,
+                sizeof (h->ip4_over_ethernet[0].ethernet));
+
+    h->ip4_over_ethernet[0].ip4 = src[0];
+    h->ip4_over_ethernet[1].ip4 = adj->sub_type.nbr.next_hop.ip4;
+
+    b = vlib_get_buffer (vm, bi);
+    vnet_buffer (b)->sw_if_index[VLIB_RX] =
+      vnet_buffer (b)->sw_if_index[VLIB_TX] =
+          adj->rewrite_header.sw_if_index;
+
+    /* Add encapsulation string for software interface (e.g. ethernet header). */
+    vnet_rewrite_one_header (adj[0], h, sizeof (ethernet_header_t));
+    vlib_buffer_advance (b, -adj->rewrite_header.data_bytes);
+
+    {
+        vlib_frame_t * f = vlib_get_frame_to_node (vm, hi->output_node_index);
+       u32 * to_next = vlib_frame_vector_args (f);
+       to_next[0] = bi;
+       f->n_vectors = 1;
+       vlib_put_frame_to_node (vm, hi->output_node_index, f);
+    }
+}
+
+static void
+adj_ip6_nbr_probe (ip_adjacency_t *adj)
+{
+    icmp6_neighbor_solicitation_header_t * h;
+    vnet_main_t * vnm = vnet_get_main();
+    ip6_main_t * im = &ip6_main;
+    ip_interface_address_t * ia;
+    ip6_address_t * dst, *src;
+    vnet_hw_interface_t * hi;
+    vnet_sw_interface_t * si;
+    vlib_buffer_t * b;
+    int bogus_length;
+    vlib_main_t * vm;
+    u32 bi = 0;
+
+    vm = vlib_get_main();
+
+    si = vnet_get_sw_interface(vnm, adj->rewrite_header.sw_if_index);
+    dst = &adj->sub_type.nbr.next_hop.ip6;
+
+    if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
+    {
+        return;
+    }
+    src = ip6_interface_address_matching_destination(im, dst,
+                                                    adj->rewrite_header.sw_if_index,
+                                                    &ia);
+    if (! src)
+    {
+       return;
+    }
+
+    h = vlib_packet_template_get_packet(vm,
+                                       &im->discover_neighbor_packet_template,
+                                       &bi);
+
+    hi = vnet_get_sup_hw_interface(vnm, adj->rewrite_header.sw_if_index);
+
+    h->ip.dst_address.as_u8[13] = dst->as_u8[13];
+    h->ip.dst_address.as_u8[14] = dst->as_u8[14];
+    h->ip.dst_address.as_u8[15] = dst->as_u8[15];
+    h->ip.src_address = src[0];
+    h->neighbor.target_address = dst[0];
+
+    clib_memcpy (h->link_layer_option.ethernet_address,
+                hi->hw_address,
+                vec_len(hi->hw_address));
+
+    h->neighbor.icmp.checksum = 
+       ip6_tcp_udp_icmp_compute_checksum(vm, 0, &h->ip, &bogus_length);
+    ASSERT(bogus_length == 0);
+
+    b = vlib_get_buffer (vm, bi);
+    vnet_buffer (b)->sw_if_index[VLIB_RX] =
+       vnet_buffer (b)->sw_if_index[VLIB_TX] =
+          adj->rewrite_header.sw_if_index;
+
+    /* Add encapsulation string for software interface (e.g. ethernet header). */
+    vnet_rewrite_one_header(adj[0], h, sizeof (ethernet_header_t));
+    vlib_buffer_advance(b, -adj->rewrite_header.data_bytes);
+
+    {
+       vlib_frame_t * f = vlib_get_frame_to_node(vm, hi->output_node_index);
+       u32 * to_next = vlib_frame_vector_args(f);
+       to_next[0] = bi;
+       f->n_vectors = 1;
+       vlib_put_frame_to_node(vm, hi->output_node_index, f);
+    }
+}
+
+static ip_adjacency_t*
+adj_nbr_alloc (fib_protocol_t nh_proto,
+              fib_link_t link_type,
+              const ip46_address_t *nh_addr,
+              u32 sw_if_index)
+{
+    ip_adjacency_t *adj;
+
+    adj = adj_alloc(nh_proto);
+
+    adj_nbr_insert(nh_proto, link_type, nh_addr,
+                  sw_if_index,
+                  adj->heap_handle);
+
+    /*
+     * since we just added the ADJ we have no rewrite string for it,
+     * so it's for ARP
+     */
+    adj->lookup_next_index = IP_LOOKUP_NEXT_ARP;
+    adj->sub_type.nbr.next_hop = *nh_addr;
+    adj->ia_link = link_type;
+    adj->ia_nh_proto = nh_proto;
+    memset(&adj->sub_type.midchain.next_dpo, 0,
+           sizeof(adj->sub_type.midchain.next_dpo));
+
+    return (adj);
+}
+
+/*
+ * adj_nbr_add_or_lock
+ *
+ * Add an adjacency for the neighbour requested.
+ *
+ * The key for an adj is:
+ *   - the Next-hops protocol (i.e. v4 or v6)
+ *   - the address of the next-hop
+ *   - the interface the next-hop is reachable through
+ *   - fib_index; this is broken. i will fix it.
+ *     the adj lookup currently occurs in the FIB.
+ */
+adj_index_t
+adj_nbr_add_or_lock (fib_protocol_t nh_proto,
+                    fib_link_t link_type,
+                    const ip46_address_t *nh_addr,
+                    u32 sw_if_index)
+{
+    adj_index_t adj_index;
+    ip_adjacency_t *adj;
+
+    adj_index = adj_nbr_find(nh_proto, link_type, nh_addr, sw_if_index);
+
+    if (ADJ_INDEX_INVALID == adj_index)
+    {
+       adj = adj_nbr_alloc(nh_proto, link_type, nh_addr, sw_if_index);
+
+       /*
+        * If there is no next-hop, this is the 'auto-adj' used on p2p
+        * links instead of a glean.
+        */
+       if (ip46_address_is_zero(nh_addr))
+       {
+           adj->lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
+
+           vnet_rewrite_for_sw_interface(vnet_get_main(),
+                                         adj_fib_link_2_vnet(link_type),
+                                         sw_if_index,
+                                         adj_get_rewrite_node(link_type)->index,
+                                         VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST,
+                                         &adj->rewrite_header,
+                                         sizeof (adj->rewrite_data));
+       }
+       else
+       {
+           vnet_rewrite_for_sw_interface(vnet_get_main(),
+                                         adj_fib_proto_2_nd(nh_proto),
+                                         sw_if_index,
+                                         adj_get_nd_node(nh_proto)->index,
+                                         VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST,
+                                         &adj->rewrite_header,
+                                         sizeof (adj->rewrite_data));
+
+           switch (nh_proto)
+           {
+           case FIB_PROTOCOL_IP4:
+               adj_ip4_nbr_probe(adj);
+               break;
+           case FIB_PROTOCOL_IP6:
+               adj_ip6_nbr_probe(adj);
+               break;
+           case FIB_PROTOCOL_MPLS:
+               break;
+           }
+       }
+    }
+    else
+    {
+       adj = adj_get(adj_index);
+    }
+
+    adj_lock(adj->heap_handle);
+
+    return (adj->heap_handle);
+}
+
+/**
+ * adj_nbr_add_or_lock_w_rewrite
+ *
+ * Find, or create, the neighbour adjacency for the given key, lock it and
+ * then apply the caller's rewrite string via adj_nbr_update_rewrite().
+ */
+adj_index_t
+adj_nbr_add_or_lock_w_rewrite (fib_protocol_t nh_proto,
+                               fib_link_t link_type,
+                               const ip46_address_t *nh_addr,
+                               u32 sw_if_index,
+                               u8 *rewrite)
+{
+    ip_adjacency_t *adj;
+    adj_index_t ai;
+
+    ai = adj_nbr_find(nh_proto, link_type, nh_addr, sw_if_index);
+
+    if (ADJ_INDEX_INVALID != ai)
+    {
+        adj = adj_get(ai);
+    }
+    else
+    {
+        adj = adj_nbr_alloc(nh_proto, link_type, nh_addr, sw_if_index);
+        /* adj_nbr_update_rewrite reads the interface from the rewrite header */
+        adj->rewrite_header.sw_if_index = sw_if_index;
+    }
+
+    adj_lock(adj->heap_handle);
+    adj_nbr_update_rewrite(adj->heap_handle, rewrite);
+
+    return (adj->heap_handle);
+}
+
+/**
+ * adj_nbr_update_rewrite
+ *
+ * Update the adjacency's rewrite string. A NULL string implies the
+ * rewrite is reset (i.e. when the ARP/ND entry is gone).
+ * NB: the adj being updated may be handling traffic in the DP.
+ *
+ * @param adj_index
+ *  The index of the adj to update; must not be ADJ_INDEX_INVALID.
+ *
+ * @param rewrite
+ *  The new rewrite, or NULL to clear the rewrite and return the adj to
+ *  the ARP next.
+ */
+void
+adj_nbr_update_rewrite (adj_index_t adj_index,
+                        u8 *rewrite)
+{
+    ip_adjacency_t *adj;
+
+    ASSERT(ADJ_INDEX_INVALID != adj_index);
+
+    adj = adj_get(adj_index);
+
+    if (NULL != rewrite)
+    {
+        /*
+         * new rewrite provided.
+         * use a dummy rewrite header to get the interface to print into.
+         */
+        ip_adjacency_t dummy;
+
+        vnet_rewrite_for_sw_interface(vnet_get_main(),
+                                      adj_fib_link_2_vnet(adj->ia_link),
+                                      adj->rewrite_header.sw_if_index,
+                                      adj_get_rewrite_node(adj->ia_link)->index,
+                                      rewrite,
+                                      &dummy.rewrite_header,
+                                      sizeof (dummy.rewrite_data));
+
+        if (IP_LOOKUP_NEXT_REWRITE == adj->lookup_next_index)
+        {
+            /*
+             * this is an update of an existing rewrite.
+             * we can't just paste in the new rewrite as that is not atomic.
+             * So we briefly swap the ADJ to ARP type, paste, then swap back.
+             */
+            adj->lookup_next_index = IP_LOOKUP_NEXT_ARP;
+            CLIB_MEMORY_BARRIER();
+        }
+        /*
+         * else
+         *   this is the first time the rewrite is added.
+         *   paste it on then swap the next type.
+         */
+        clib_memcpy(&adj->rewrite_header,
+                    &dummy.rewrite_header,
+                    VLIB_BUFFER_PRE_DATA_SIZE);
+
+        /*
+         * the paste must be complete before the adj is published as a
+         * REWRITE type, for the same reason as the barrier above.
+         */
+        CLIB_MEMORY_BARRIER();
+        adj->lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
+    }
+    else
+    {
+        /*
+         * clear the rewrite.
+         */
+        adj->lookup_next_index = IP_LOOKUP_NEXT_ARP;
+        CLIB_MEMORY_BARRIER();
+
+        adj->rewrite_header.data_bytes = 0;
+    }
+
+    /*
+     * time for walkies fido.
+     * The link type MPLS Adj never has children. So if it is this adj
+     * that is updated, we need to walk from its IP sibling.
+     */
+    if (FIB_LINK_MPLS == adj->ia_link)
+    {
+        adj_index = adj_nbr_find(adj->ia_nh_proto,
+                                 fib_proto_to_link(adj->ia_nh_proto),
+                                 &adj->sub_type.nbr.next_hop,
+                                 adj->rewrite_header.sw_if_index);
+
+        ASSERT(ADJ_INDEX_INVALID != adj_index);
+
+        if (ADJ_INDEX_INVALID == adj_index)
+        {
+            /*
+             * ASSERT compiles out of release images; without a sibling
+             * there is nothing to walk from, so do not walk an invalid
+             * index.
+             */
+            return;
+        }
+    }
+
+    fib_node_back_walk_ctx_t bw_ctx = {
+        .fnbw_reason = FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE,
+        /*
+         * This walk only needs to go back one level, but there is no control here.
+         * the first receiving fib_entry_t will quash the walk
+         */
+    };
+
+    fib_walk_sync(FIB_NODE_TYPE_ADJ, adj_index, &bw_ctx);
+}
+
+/**
+ * Context for counting the entries in the neighbour adjacency tables
+ */
+typedef struct adj_db_count_ctx_t_ {
+    u64 count;
+} adj_db_count_ctx_t;
+
+/**
+ * bihash walk callback; bump the context's counter once per key/value pair
+ */
+static void
+adj_db_count (BVT(clib_bihash_kv) * kvp,
+              void *arg)
+{
+    adj_db_count_ctx_t *counter = arg;
+
+    counter->count += 1;
+}
+
+/**
+ * Count the entries held in every allocated per-interface neighbour table,
+ * for each protocol from FIB_PROTOCOL_IP4 to FIB_PROTOCOL_IP6.
+ */
+u32
+adj_nbr_db_size (void)
+{
+    adj_db_count_ctx_t ctx = {
+        .count = 0,
+    };
+    fib_protocol_t proto;
+    u32 sw_if_index;
+
+    for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
+    {
+        for (sw_if_index = 0;
+             sw_if_index < vec_len(adj_nbr_tables[proto]);
+             sw_if_index++)
+        {
+            /* no table was ever created for this interface */
+            if (NULL == adj_nbr_tables[proto][sw_if_index])
+                continue;
+
+            BV(clib_bihash_foreach_key_value_pair) (
+                adj_nbr_tables[proto][sw_if_index],
+                adj_db_count,
+                &ctx);
+        }
+    }
+    return (ctx.count);
+}
+
+/**
+ * Context for the state change walk of the DB
+ */
+typedef struct adj_nbr_interface_state_change_ctx_t_
+{
+    /**
+     * Flags passed from the vnet notify function
+     */
+    int flags;
+} adj_nbr_interface_state_change_ctx_t;
+
+/**
+ * bihash walk callback; back walk from one adjacency so that the
+ * forwarding entries learn the interface's new admin state.
+ */
+static void
+adj_nbr_interface_state_change_one (BVT(clib_bihash_kv) * kvp,
+                                    void *arg)
+{
+    adj_nbr_interface_state_change_ctx_t *change = arg;
+
+    /* assume down, then correct if the admin-up flag is present */
+    fib_node_back_walk_ctx_t bw_ctx = {
+        .fnbw_reason = FIB_NODE_BW_REASON_FLAG_INTERFACE_DOWN,
+    };
+
+    if (change->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)
+    {
+        bw_ctx.fnbw_reason = FIB_NODE_BW_REASON_FLAG_INTERFACE_UP;
+    }
+
+    fib_walk_sync(FIB_NODE_TYPE_ADJ, kvp->value, &bw_ctx);
+}
+
+/**
+ * Registered to receive interface admin up/down notifications.
+ * Each adjacency on the interface becomes the start of a back walk.
+ */
+static clib_error_t *
+adj_nbr_interface_state_change (vnet_main_t * vnm,
+                                u32 sw_if_index,
+                                u32 flags)
+{
+    adj_nbr_interface_state_change_ctx_t ctx = {
+        .flags = flags,
+    };
+    fib_protocol_t proto;
+
+    for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
+    {
+        if (ADJ_NBR_ITF_OK(proto, sw_if_index))
+        {
+            BV(clib_bihash_foreach_key_value_pair) (
+                adj_nbr_tables[proto][sw_if_index],
+                adj_nbr_interface_state_change_one,
+                &ctx);
+        }
+    }
+
+    return (NULL);
+}
+
+VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION(adj_nbr_interface_state_change);
+
+/**
+ * bihash walk callback; back walk from one adjacency so that the
+ * forwarding entries learn the interface has been deleted.
+ */
+static void
+adj_nbr_interface_delete_one (BVT(clib_bihash_kv) * kvp,
+                              void *arg)
+{
+    fib_node_back_walk_ctx_t delete_ctx = {
+        .fnbw_reason = FIB_NODE_BW_REASON_FLAG_INTERFACE_DELETE,
+    };
+
+    fib_walk_sync(FIB_NODE_TYPE_ADJ, kvp->value, &delete_ctx);
+}
+
+/**
+ * adj_nbr_interface_add_del
+ *
+ * Registered to receive interface add and delete notifications.
+ *
+ * Additions are ignored. We will not back walk to resolve paths through
+ * newly added interfaces: the control plane is expected to add interfaces
+ * first, then routes. A path that matches a just-created interface can
+ * therefore only be one that resolved through a deleted interface and has
+ * not yet been removed, and a matching sw_if_index is NO GUARANTEE that
+ * the new interface is the one that path needs. If the control plane
+ * wants such routes to resolve it must remove and add them again.
+ *
+ * On deletion every adjacency on the interface starts a back walk.
+ */
+static clib_error_t *
+adj_nbr_interface_add_del (vnet_main_t * vnm,
+                           u32 sw_if_index,
+                           u32 is_add)
+{
+    fib_protocol_t proto;
+
+    if (!is_add)
+    {
+        for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
+        {
+            if (ADJ_NBR_ITF_OK(proto, sw_if_index))
+            {
+                BV(clib_bihash_foreach_key_value_pair) (
+                    adj_nbr_tables[proto][sw_if_index],
+                    adj_nbr_interface_delete_one,
+                    NULL);
+            }
+        }
+    }
+
+    return (NULL);
+}
+
+VNET_SW_INTERFACE_ADD_DEL_FUNCTION(adj_nbr_interface_add_del);
+
+
+/**
+ * bihash walk callback; print one adjacency in brief form.
+ * The walk argument is the vlib main to print to.
+ */
+static void
+adj_nbr_show_one (BVT(clib_bihash_kv) * kvp,
+                  void *arg)
+{
+    vlib_main_t *vm = arg;
+
+    vlib_cli_output (vm, "[@%d]  %U",
+                     kvp->value,
+                     format_ip_adjacency,
+                     vnet_get_main(), kvp->value,
+                     FORMAT_IP_ADJACENCY_NONE);
+}
+
+/**
+ * CLI handler for "show adj nbr".
+ *
+ * With an adjacency index the one adjacency is shown in detail. Otherwise
+ * the per-interface neighbour tables are walked and each adjacency is shown
+ * in brief, restricted to a single interface when "sw_if_index <index>" is
+ * given (the option was advertised in the help but previously not parsed).
+ */
+static clib_error_t *
+adj_nbr_show (vlib_main_t * vm,
+              unformat_input_t * input,
+              vlib_cli_command_t * cmd)
+{
+    adj_index_t ai = ADJ_INDEX_INVALID;
+    u32 sw_if_index_filter = ~0;
+
+    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+        if (unformat (input, "sw_if_index %d", &sw_if_index_filter))
+            ;
+        else if (unformat (input, "%d", &ai))
+            ;
+        else
+            break;
+    }
+
+    if (ADJ_INDEX_INVALID != ai)
+    {
+        vlib_cli_output (vm, "[@%d] %U",
+                         ai,
+                         format_ip_adjacency,
+                         vnet_get_main(), ai,
+                         FORMAT_IP_ADJACENCY_DETAIL);
+    }
+    else
+    {
+        fib_protocol_t proto;
+
+        for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
+        {
+            u32 sw_if_index;
+
+            vec_foreach_index(sw_if_index, adj_nbr_tables[proto])
+            {
+                /* honour the optional interface filter */
+                if ((~0 != sw_if_index_filter) &&
+                    (sw_if_index_filter != sw_if_index))
+                    continue;
+
+                if (!ADJ_NBR_ITF_OK(proto, sw_if_index))
+                    continue;
+
+                BV(clib_bihash_foreach_key_value_pair) (
+                    adj_nbr_tables[proto][sw_if_index],
+                    adj_nbr_show_one,
+                    vm);
+            }
+        }
+    }
+
+    return 0;
+}
+
+VLIB_CLI_COMMAND (ip4_show_fib_command, static) = {
+    .path = "show adj nbr",
+    .short_help = "show adj nbr [<adj_index>] [sw_if_index <index>]",
+    .function = adj_nbr_show,
+};
+
+/**
+ * Format an incomplete (ARP next) neighbour adjacency as
+ * "arp-<link>: via <next-hop> <interface>".
+ *
+ * @param s
+ *  The vector to append to
+ *
+ * @param ap
+ *  Pointer to the argument list; holds the adjacency index followed by an
+ *  indent, which is unused here
+ *
+ * @return the extended vector
+ */
+u8*
+format_adj_nbr_incomplete (u8* s, va_list *ap)
+{
+    /* ap is a pointer to the va_list, so it must be dereferenced for va_arg */
+    index_t index = va_arg(*ap, index_t);
+    CLIB_UNUSED(u32 indent) = va_arg(*ap, u32);
+    vnet_main_t * vnm = vnet_get_main();
+    ip_adjacency_t * adj = adj_get(index);
+
+    s = format (s, "arp-%U", format_fib_link, adj->ia_link);
+    /*
+     * NOTE(review): confirm format_ip46_address does not expect a trailing
+     * address-type argument after the address pointer.
+     */
+    s = format (s, ": via %U",
+                format_ip46_address, &adj->sub_type.nbr.next_hop);
+    s = format (s, " %U",
+                format_vnet_sw_interface_name,
+                vnm,
+                vnet_get_sw_interface(vnm,
+                                      adj->rewrite_header.sw_if_index));
+
+    return (s);
+}
+
+/**
+ * Format a complete (REWRITE next) neighbour adjacency as
+ * "<link> via <next-hop> <rewrite>".
+ *
+ * @param s
+ *  The vector to append to
+ *
+ * @param ap
+ *  Pointer to the argument list; holds the adjacency index followed by an
+ *  indent, which is unused here
+ *
+ * @return the extended vector
+ */
+u8*
+format_adj_nbr (u8* s, va_list *ap)
+{
+    /* ap is a pointer to the va_list, so it must be dereferenced for va_arg */
+    index_t index = va_arg(*ap, index_t);
+    CLIB_UNUSED(u32 indent) = va_arg(*ap, u32);
+    vnet_main_t * vnm = vnet_get_main();
+    ip_adjacency_t * adj = adj_get(index);
+
+    s = format (s, "%U", format_fib_link, adj->ia_link);
+    /*
+     * NOTE(review): confirm format_ip46_address does not expect a trailing
+     * address-type argument after the address pointer.
+     */
+    s = format (s, " via %U ",
+                format_ip46_address, &adj->sub_type.nbr.next_hop);
+    s = format (s, "%U",
+                format_vnet_rewrite,
+                vnm->vlib_main, &adj->rewrite_header, sizeof (adj->rewrite_data), 0);
+
+    return (s);
+}
+
+/**
+ * DPO lock callback; calls adj_lock() on the adjacency index the DPO holds.
+ */
+static void
+adj_dpo_lock (dpo_id_t *dpo)
+{
+    adj_lock(dpo->dpoi_index);
+}
+/**
+ * DPO unlock callback; calls adj_unlock() on the adjacency index the DPO holds.
+ */
+static void
+adj_dpo_unlock (dpo_id_t *dpo)
+{
+    adj_unlock(dpo->dpoi_index);
+}
+
+/**
+ * Virtual function table registered for DPO_ADJACENCY in
+ * adj_nbr_module_init().
+ */
+const static dpo_vft_t adj_nbr_dpo_vft = {
+    .dv_lock = adj_dpo_lock,
+    .dv_unlock = adj_dpo_unlock,
+    .dv_format = format_adj_nbr,
+};
+/**
+ * Virtual function table registered for DPO_ADJACENCY_INCOMPLETE in
+ * adj_nbr_module_init(). Differs from the above only in the format function.
+ */
+const static dpo_vft_t adj_nbr_incompl_dpo_vft = {
+    .dv_lock = adj_dpo_lock,
+    .dv_unlock = adj_dpo_unlock,
+    .dv_format = format_adj_nbr_incomplete,
+};
+
+/**
+ * @brief The per-protocol VLIB graph nodes that are assigned to an adjacency
+ *        object.
+ *
+ * this means that these graph nodes are ones from which a nbr is the
+ * parent object in the DPO-graph.
+ *
+ * Each list is NULL terminated. nbr_nodes is registered for DPO_ADJACENCY in
+ * adj_nbr_module_init().
+ */
+const static char* const nbr_ip4_nodes[] =
+{
+    "ip4-rewrite-transit",
+    NULL,
+};
+const static char* const nbr_ip6_nodes[] =
+{
+    "ip6-rewrite",
+    NULL,
+};
+const static char* const nbr_mpls_nodes[] =
+{
+    "mpls-output",
+    NULL,
+};
+const static char* const * const nbr_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = nbr_ip4_nodes,
+    [DPO_PROTO_IP6]  = nbr_ip6_nodes,
+    [DPO_PROTO_MPLS] = nbr_mpls_nodes,
+};
+
+/**
+ * @brief The per-protocol VLIB graph node names for an incomplete adjacency.
+ *
+ * Each list is NULL terminated. nbr_incomplete_nodes is registered for
+ * DPO_ADJACENCY_INCOMPLETE in adj_nbr_module_init().
+ */
+const static char* const nbr_incomplete_ip4_nodes[] =
+{
+    "ip4-arp",
+    NULL,
+};
+const static char* const nbr_incomplete_ip6_nodes[] =
+{
+    "ip6-discover-neighbor",
+    NULL,
+};
+const static char* const nbr_incomplete_mpls_nodes[] =
+{
+    "mpls-adj-incomplete",
+    NULL,
+};
+
+const static char* const * const nbr_incomplete_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = nbr_incomplete_ip4_nodes,
+    [DPO_PROTO_IP6]  = nbr_incomplete_ip6_nodes,
+    [DPO_PROTO_MPLS] = nbr_incomplete_mpls_nodes,
+};
+
+/**
+ * Module initialisation; register the complete and incomplete neighbour
+ * adjacency DPO types, each with its function table and graph node names.
+ */
+void
+adj_nbr_module_init (void)
+{
+    dpo_register(DPO_ADJACENCY,
+                 &adj_nbr_dpo_vft,
+                 nbr_nodes);
+    dpo_register(DPO_ADJACENCY_INCOMPLETE,
+                 &adj_nbr_incompl_dpo_vft,
+                 nbr_incomplete_nodes);
+}
diff --git a/vnet/vnet/adj/adj_nbr.h b/vnet/vnet/adj/adj_nbr.h
new file mode 100644 (file)
index 0000000..331423b
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ * Neighbour Adjacency sub-type. These adjs represent an L3 peer on a
+ * connected link. 
+ */
+
+#ifndef __ADJ_NBR_H__
+#define __ADJ_NBR_H__
+
+#include <vnet/vnet.h>
+#include <vnet/adj/adj_types.h>
+#include <vnet/fib/fib_node.h>
+#include <vnet/dpo/dpo.h>
+
+/**
+ * @brief
+ *  Add (and lock) a new or lock an existing neighbour adjacency
+ *
+ * @param nh_proto
+ *  The protocol for the next-hop address (v4 or v6)
+ *
+ * @param link_type
+ *  A description of the protocol of the packets that will forward
+ *  through this adj. On an ethernet interface this is the MAC header's
+ *  ether-type
+ *
+ * @param nh_addr
+ *  The address of the next-hop/peer to send the packet to
+ *
+ * @param sw_if_index
+ *  The interface on which the peer resides
+ */
+extern adj_index_t adj_nbr_add_or_lock(fib_protocol_t nh_proto,
+                                      fib_link_t link_type,
+                                      const ip46_address_t *nh_addr,
+                                      u32 sw_if_index);
+
+/**
+ * @brief
+ *  Add (and lock) a new, or lock an existing, neighbour adjacency and
+ *  apply the supplied rewrite string to it
+ *
+ * @param nh_proto
+ *  The protocol for the next-hop address (v4 or v6)
+ *
+ * @param link_type
+ *  A description of the protocol of the packets that will forward
+ *  through this adj. On an ethernet interface this is the MAC header's
+ *  ether-type
+ *
+ * @param nh_addr
+ *  The address of the next-hop/peer to send the packet to
+ *
+ * @param sw_if_index
+ *  The interface on which the peer resides
+ *
+ * @param rewrite
+ *  The rewrite to prepend to packets
+ */
+extern adj_index_t adj_nbr_add_or_lock_w_rewrite(fib_protocol_t nh_proto,
+                                                fib_link_t link_type,
+                                                const ip46_address_t *nh_addr,
+                                                u32 sw_if_index,
+                                                u8 *rewrite);
+
+/**
+ * @brief
+ *  Update the rewrite string for an existing adjacency.
+ *
+ * @param adj_index
+ *  The index of the adj to update
+ *
+ * @param rewrite
+ *  The new rewrite
+ */
+extern void adj_nbr_update_rewrite(adj_index_t adj_index,
+                                  u8 *rewrite);
+
+/**
+ * @brief
+ * Format an incomplete neighbour (ARP) adjacency
+ */
+extern u8* format_adj_nbr_incomplete(u8* s, va_list *ap);
+
+/**
+ * @brief
+ * Format a neighbour (REWRITE) adjacency
+ */
+extern u8* format_adj_nbr(u8* s, va_list *ap);
+
+/**
+ * @brief
+ *  Module initialisation
+ */
+extern void adj_nbr_module_init(void);
+
+/**
+ * @brief
+ *  Return the size of the adjacency database. for testing purposes
+ */
+extern u32 adj_nbr_db_size(void);
+
+#endif
diff --git a/vnet/vnet/adj/adj_rewrite.c b/vnet/vnet/adj/adj_rewrite.c
new file mode 100644 (file)
index 0000000..db802e3
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/adj/adj.h>
+#include <vnet/adj/adj_alloc.h>
+#include <vnet/adj/adj_internal.h>
+
+/**
+ * adj_rewrite_add_and_lock
+ *
+ * A rewrite sub-type has the rewrite string provided, but no key, so a new
+ * adjacency is allocated on every call.
+ *
+ * @param nh_proto
+ *  The protocol for the next-hop; recorded in the adjacency
+ *
+ * @param link_type
+ *  The link type of the packets that will forward through this adj
+ *
+ * @param sw_if_index
+ *  The interface the rewrite is built for
+ *
+ * @param rewrite
+ *  The rewrite to prepend to packets; must not be NULL
+ *
+ * @return the index of the new, locked, adjacency
+ */
+adj_index_t
+adj_rewrite_add_and_lock (fib_protocol_t nh_proto,
+                          fib_link_t link_type,
+                          u32 sw_if_index,
+                          u8 *rewrite)
+{
+    ip_adjacency_t *adj;
+
+    ASSERT(NULL != rewrite);
+
+    adj = adj_alloc(nh_proto);
+
+    adj->lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
+    adj->ia_link = link_type;
+    /* record the next-hop protocol, as is done for neighbour adjacencies */
+    adj->ia_nh_proto = nh_proto;
+    adj->rewrite_header.sw_if_index = sw_if_index;
+
+    vnet_rewrite_for_sw_interface(vnet_get_main(),
+                                  adj_fib_link_2_vnet(link_type),
+                                  adj->rewrite_header.sw_if_index,
+                                  adj_get_rewrite_node(link_type)->index,
+                                  rewrite,
+                                  &adj->rewrite_header,
+                                  sizeof (adj->rewrite_data));
+
+    adj_lock(adj->heap_handle);
+
+    return (adj->heap_handle);
+}
diff --git a/vnet/vnet/adj/adj_rewrite.h b/vnet/vnet/adj/adj_rewrite.h
new file mode 100644 (file)
index 0000000..f8df255
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ * A rewrite adjacency has no key, and thus cannot be 'found' from the
+ * FIB resolution code. The client therefore needs to manage these adjacencies
+ */
+
+#ifndef __ADJ_REWRITE_H__
+#define __ADJ_REWRITE_H__
+
+#include <vnet/adj/adj_types.h>
+
+/**
+ * @brief
+ *  Add (and lock) a new rewrite adjacency. A rewrite adjacency has no key,
+ *  so a new one is created on every call.
+ *
+ * @param nh_proto
+ *  The protocol for the next-hop address (v4 or v6)
+ *
+ * @param link_type
+ *  A description of the protocol of the packets that will forward
+ *  through this adj. On an ethernet interface this is the MAC header's
+ *  ether-type
+ *
+ * @param sw_if_index
+ *  The interface on which the peer resides
+ *
+ * @param rewrite
+ *  The rewrite to prepend to packets
+ */
+extern adj_index_t adj_rewrite_add_and_lock(fib_protocol_t nh_proto,
+                                           fib_link_t link_type,
+                                           u32 sw_if_index,
+                                           u8 *rewrite);
+
+#endif
diff --git a/vnet/vnet/adj/adj_types.h b/vnet/vnet/adj/adj_types.h
new file mode 100644 (file)
index 0000000..a723466
--- /dev/null
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ADJ_TYPES_H__
+#define __ADJ_TYPES_H__
+
+#include <vnet/vnet.h>
+
+/**
+ * @brief An index for adjacencies.
+ * Alas 'C' is not typesafe enough to b0rk when a u32 is used instead of
+ * an adj_index_t. However, for us humans, we can glean much more intent
+ * from the declaration
+ *  foo bar(adj_index_t t);
+ * than we can from
+ *  foo bar(u32 t);
+ */
+typedef u32 adj_index_t; 
+
+/**
+ * @brief Invalid ADJ index - used when no adj is known
+ * likewise blazoned capitals INVALID speak volumes where ~0 does not.
+ */
+#define ADJ_INDEX_INVALID ((u32)~0)
+
+#endif
index c44f25e..44973ae 100644 (file)
@@ -15,6 +15,7 @@
 #include <vnet/ip/ip.h>
 #include <vnet/ethernet/ethernet.h>    /* for ethernet_header_t */
 #include <vnet/classify/vnet_classify.h>
+#include <vnet/dpo/classify_dpo.h>
 
 typedef struct {
   u32 next_index;
@@ -63,7 +64,6 @@ ip_classify_inline (vlib_main_t * vm,
   u32 n_left_from, * from, * to_next;
   ip_lookup_next_t next_index;
   vnet_classify_main_t * vcm = &vnet_classify_main;
-  ip_lookup_main_t * lm;
   f64 now = vlib_time_now (vm);
   u32 hits = 0;
   u32 misses = 0;
@@ -71,10 +71,8 @@ ip_classify_inline (vlib_main_t * vm,
   u32 n_next;
 
   if (is_ip4) {
-    lm = &ip4_main.lookup_main;
     n_next = IP4_LOOKUP_N_NEXT;
   } else {
-    lm = &ip6_main.lookup_main;
     n_next = IP6_LOOKUP_N_NEXT;
   }
 
@@ -88,8 +86,8 @@ ip_classify_inline (vlib_main_t * vm,
       vlib_buffer_t * b0, * b1;
       u32 bi0, bi1;
       u8 * h0, * h1;
-      u32 adj_index0, adj_index1;
-      ip_adjacency_t * adj0, * adj1;
+      u32 cd_index0, cd_index1;
+      classify_dpo_t *cd0, * cd1;
       u32 table_index0, table_index1;
       vnet_classify_table_t * t0, * t1;
 
@@ -116,13 +114,13 @@ ip_classify_inline (vlib_main_t * vm,
       h1 = (void *)vlib_buffer_get_current(b1) -
                 ethernet_buffer_header_size(b1);
         
-      adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
-      adj0 = ip_get_adjacency (lm, adj_index0);
-      table_index0 = adj0->classify.table_index;
+      cd_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
+      cd0 = classify_dpo_get(cd_index0);
+      table_index0 = cd0->cd_table_index;
 
-      adj_index1 = vnet_buffer (b1)->ip.adj_index[VLIB_TX];
-      adj1 = ip_get_adjacency (lm, adj_index1);
-      table_index1 = adj1->classify.table_index;
+      cd_index1 = vnet_buffer (b1)->ip.adj_index[VLIB_TX];
+      cd1 = classify_dpo_get(cd_index1);
+      table_index1 = cd1->cd_table_index;
 
       t0 = pool_elt_at_index (vcm->tables, table_index0);
 
@@ -151,8 +149,8 @@ ip_classify_inline (vlib_main_t * vm,
       vlib_buffer_t * b0;
       u32 bi0;
       u8 * h0;
-      u32 adj_index0;
-      ip_adjacency_t * adj0;
+      u32 cd_index0;
+      classify_dpo_t *cd0;
       u32 table_index0;
       vnet_classify_table_t * t0;
 
@@ -161,9 +159,9 @@ ip_classify_inline (vlib_main_t * vm,
       h0 = (void *)vlib_buffer_get_current(b0) -
                 ethernet_buffer_header_size(b0);
         
-      adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
-      adj0 = ip_get_adjacency (lm, adj_index0);
-      table_index0 = adj0->classify.table_index;
+      cd_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
+      cd0 = classify_dpo_get(cd_index0);
+      table_index0 = cd0->cd_table_index;
 
       t0 = pool_elt_at_index (vcm->tables, table_index0);
       vnet_buffer(b0)->l2_classify.hash = 
@@ -192,7 +190,7 @@ ip_classify_inline (vlib_main_t * vm,
        {
           u32 bi0;
          vlib_buffer_t * b0;
-          u32 next0 = IP_LOOKUP_NEXT_MISS;
+          u32 next0 = IP_LOOKUP_NEXT_DROP;
           u32 table_index0;
           vnet_classify_table_t * t0;
           vnet_classify_entry_t * e0;
index 2eee0f5..7716fc9 100644 (file)
@@ -1106,9 +1106,7 @@ uword unformat_l2_output_next_index (unformat_input_t * input, va_list * args)
 }
 
 #define foreach_ip_next                         \
-_(miss, MISS)                                   \
 _(drop, DROP)                                   \
-_(local, LOCAL)                                 \
 _(rewrite, REWRITE)
 
 uword unformat_ip_next_index (unformat_input_t * input, va_list * args)
@@ -2121,7 +2119,7 @@ test_classify_command_fn (vlib_main_t * vm,
                                        memory_size,
                                        0 /* skip */,
                                        3 /* vectors to match */);
-          t->miss_next_index = IP_LOOKUP_NEXT_LOCAL;
+          t->miss_next_index = IP_LOOKUP_NEXT_DROP;
           vlib_cli_output (vm, "Create table %d", t - cm->tables);
         }
       
index d80ff19..b77a779 100644 (file)
@@ -161,6 +161,10 @@ u32 vnet_config_del_feature (vlib_main_t * vm,
                             void *feature_config,
                             u32 n_feature_config_bytes);
 
+u8 *vnet_config_format_features (vlib_main_t * vm,
+                                vnet_config_main_t * cm,
+                                u32 config_index, u8 * s);
+
 #endif /* included_vnet_config_h */
 
 /*
index 5578558..d5121e7 100644 (file)
@@ -13,6 +13,8 @@
  * limitations under the License.
  */
 #include <vnet/cop/cop.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/dpo/load_balance.h>
 
 typedef struct {
   u32 next_index;
@@ -57,9 +59,7 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm,
   u32 n_left_from, * from, * to_next;
   cop_feature_type_t next_index;
   cop_main_t *cm = &cop_main;
-  ip4_main_t * im4 = &ip4_main;
-  ip_lookup_main_t * lm4 = &im4->lookup_main;
-  vlib_combined_counter_main_t * vcm = &im4->lookup_main.adjacency_counters;
+  vlib_combined_counter_main_t * vcm = &load_balance_main.lbm_via_counters;
   u32 cpu_index = vm->cpu_index;
 
   from = vlib_frame_vector_args (frame);
@@ -74,7 +74,7 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm,
                           to_next, n_left_to_next);
 
       while (n_left_from >= 4 && n_left_to_next >= 2)
-       {
+       {
           u32 bi0, bi1;
           vlib_buffer_t * b0, * b1;
           u32 next0, next1;
@@ -82,147 +82,142 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm,
           ip4_header_t * ip0, * ip1;
           cop_config_main_t * ccm0, * ccm1;
           cop_config_data_t * c0, * c1;
-         ip4_fib_mtrie_t * mtrie0, * mtrie1;
-         ip4_fib_mtrie_leaf_t leaf0, leaf1;
-          u32 adj_index0, adj_index1;
-          ip_adjacency_t * adj0, * adj1;
-          
-         /* Prefetch next iteration. */
-         {
-           vlib_buffer_t * p2, * p3;
+         ip4_fib_mtrie_t * mtrie0, * mtrie1;
+         ip4_fib_mtrie_leaf_t leaf0, leaf1;
+          u32 lb_index0, lb_index1;
+          const load_balance_t * lb0, *lb1;
+          const dpo_id_t *dpo0, *dpo1;
+
+         /* Prefetch next iteration. */
+         {
+           vlib_buffer_t * p2, * p3;
             
-           p2 = vlib_get_buffer (vm, from[2]);
-           p3 = vlib_get_buffer (vm, from[3]);
+           p2 = vlib_get_buffer (vm, from[2]);
+           p3 = vlib_get_buffer (vm, from[3]);
             
-           vlib_prefetch_buffer_header (p2, LOAD);
-           vlib_prefetch_buffer_header (p3, LOAD);
+           vlib_prefetch_buffer_header (p2, LOAD);
+           vlib_prefetch_buffer_header (p3, LOAD);
 
-           CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
-           CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
-         }
+           CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+           CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+         }
 
           /* speculatively enqueue b0 and b1 to the current next frame */
-         to_next[0] = bi0 = from[0];
-         to_next[1] = bi1 = from[1];
-         from += 2;
-         to_next += 2;
-         n_left_from -= 2;
-         n_left_to_next -= 2;
-
-         b0 = vlib_get_buffer (vm, bi0);
+         to_next[0] = bi0 = from[0];
+         to_next[1] = bi1 = from[1];
+         from += 2;
+         to_next += 2;
+         n_left_from -= 2;
+         n_left_to_next -= 2;
+
+         b0 = vlib_get_buffer (vm, bi0);
           sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
 
-         ip0 = vlib_buffer_get_current (b0);
+         ip0 = vlib_buffer_get_current (b0);
 
-         ccm0 = cm->cop_config_mains + VNET_COP_IP4;
+         ccm0 = cm->cop_config_mains + VNET_COP_IP4;
 
-         c0 = vnet_get_config_data 
+         c0 = vnet_get_config_data
               (&ccm0->config_main,
                &vnet_buffer (b0)->cop.current_config_index,
                &next0,
                sizeof (c0[0]));
 
-         mtrie0 = &vec_elt_at_index (im4->fibs, c0->fib_index)->mtrie;
+         mtrie0 = &ip4_fib_get (c0->fib_index)->mtrie;
 
-         leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
+         leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
 
-         leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, 
+         leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0,
                                              &ip0->src_address, 0);
 
-         leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, 
+         leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0,
                                              &ip0->src_address, 1);
 
-         leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, 
+         leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0,
                                              &ip0->src_address, 2);
 
-         leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, 
+         leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0,
                                              &ip0->src_address, 3);
 
-         adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+         lb_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
 
-         ASSERT (adj_index0 
-                  == ip4_fib_lookup_with_table (im4, c0->fib_index,
-                                                &ip0->src_address,
-                                                1 /* no_default_route */));
-         adj0 = ip_get_adjacency (lm4, adj_index0);
-          if (PREDICT_FALSE(adj0->lookup_next_index != IP_LOOKUP_NEXT_LOCAL))
+         ASSERT (lb_index0
+                  == ip4_fib_table_lookup_lb (ip4_fib_get(c0->fib_index),
+                                              &ip0->src_address));
+         lb0 = load_balance_get (lb_index0);
+          dpo0 = load_balance_get_bucket_i(lb0, 0);
+
+          if (PREDICT_FALSE(dpo0->dpoi_type != DPO_RECEIVE))
             {
               b0->error = node->errors[IP4_COP_WHITELIST_ERROR_DROPPED];
               next0 = RX_COP_DROP;
             }
 
-         b1 = vlib_get_buffer (vm, bi1);
+         b1 = vlib_get_buffer (vm, bi1);
           sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
 
-         ip1 = vlib_buffer_get_current (b1);
+         ip1 = vlib_buffer_get_current (b1);
 
-         ccm1 = cm->cop_config_mains + VNET_COP_IP4;
+         ccm1 = cm->cop_config_mains + VNET_COP_IP4;
 
-         c1 = vnet_get_config_data 
+         c1 = vnet_get_config_data
               (&ccm1->config_main,
                &vnet_buffer (b1)->cop.current_config_index,
                &next1,
                sizeof (c1[0]));
+         mtrie1 = &ip4_fib_get (c1->fib_index)->mtrie;
 
-         mtrie1 = &vec_elt_at_index (im4->fibs, c1->fib_index)->mtrie;
-
-         leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
+         leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
 
-         leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, 
+         leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1,
                                              &ip1->src_address, 0);
 
-         leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, 
+         leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1,
                                              &ip1->src_address, 1);
 
-         leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, 
+         leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1,
                                              &ip1->src_address, 2);
 
-         leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, 
+         leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1,
                                              &ip1->src_address, 3);
 
-         adj_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
-
-         ASSERT (adj_index1 
-                  == ip4_fib_lookup_with_table (im4, c1->fib_index,
-                                                &ip1->src_address,
-                                                1 /* no_default_route */));
-         adj1 = ip_get_adjacency (lm4, adj_index1);
+         lb_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
+         ASSERT (lb_index1
+                  == ip4_fib_table_lookup_lb (ip4_fib_get(c1->fib_index),
+                                              &ip1->src_address));
+         lb1 = load_balance_get (lb_index1);
+          dpo1 = load_balance_get_bucket_i(lb1, 0);
 
-          vlib_increment_combined_counter 
-              (vcm, cpu_index, adj_index0, 1,
-               vlib_buffer_length_in_chain (vm, b0) 
+          vlib_increment_combined_counter
+              (vcm, cpu_index, lb_index0, 1,
+               vlib_buffer_length_in_chain (vm, b0)
                + sizeof(ethernet_header_t));
 
-          vlib_increment_combined_counter 
-              (vcm, cpu_index, adj_index1, 1,
+          vlib_increment_combined_counter
+              (vcm, cpu_index, lb_index1, 1,
                vlib_buffer_length_in_chain (vm, b1)
                + sizeof(ethernet_header_t));
 
-          if (PREDICT_FALSE(adj0->lookup_next_index != IP_LOOKUP_NEXT_LOCAL))
-            {
-              b0->error = node->errors[IP4_COP_WHITELIST_ERROR_DROPPED];
-              next0 = RX_COP_DROP;
-            }
 
-          if (PREDICT_FALSE(adj1->lookup_next_index != IP_LOOKUP_NEXT_LOCAL))
+          if (PREDICT_FALSE(dpo1->dpoi_type != DPO_RECEIVE))
             {
               b1->error = node->errors[IP4_COP_WHITELIST_ERROR_DROPPED];
               next1 = RX_COP_DROP;
             }
 
-          if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) 
-                            && (b0->flags & VLIB_BUFFER_IS_TRACED))) 
+          if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+                            && (b0->flags & VLIB_BUFFER_IS_TRACED)))
             {
-              ip4_cop_whitelist_trace_t *t = 
+              ip4_cop_whitelist_trace_t *t =
                  vlib_add_trace (vm, node, b0, sizeof (*t));
               t->sw_if_index = sw_if_index0;
               t->next_index = next0;
             }
 
-          if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) 
-                            && (b1->flags & VLIB_BUFFER_IS_TRACED))) 
+          if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+                            && (b1->flags & VLIB_BUFFER_IS_TRACED)))
             {
-              ip4_cop_whitelist_trace_t *t = 
+              ip4_cop_whitelist_trace_t *t =
                  vlib_add_trace (vm, node, b1, sizeof (*t));
               t->sw_if_index = sw_if_index1;
               t->next_index = next1;
@@ -245,8 +240,9 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm,
           cop_config_data_t *c0;
          ip4_fib_mtrie_t * mtrie0;
          ip4_fib_mtrie_leaf_t leaf0;
-          u32 adj_index0;
-          ip_adjacency_t * adj0;
+          u32 lb_index0;
+          const load_balance_t * lb0;
+          const dpo_id_t *dpo0;
 
           /* speculatively enqueue b0 to the current next frame */
          bi0 = from[0];
@@ -269,7 +265,7 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm,
                &next0,
                sizeof (c0[0]));
 
-         mtrie0 = &vec_elt_at_index (im4->fibs, c0->fib_index)->mtrie;
+         mtrie0 = &ip4_fib_get (c0->fib_index)->mtrie;
 
          leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
 
@@ -285,20 +281,21 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm,
          leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, 
                                              &ip0->src_address, 3);
 
-         adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+         lb_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+
+         ASSERT (lb_index0 
+                  == ip4_fib_table_lookup_lb (ip4_fib_get(c0->fib_index),
+                                             &ip0->src_address));
 
-         ASSERT (adj_index0 
-                  == ip4_fib_lookup_with_table (im4, c0->fib_index,
-                                                &ip0->src_address,
-                                                1 /* no_default_route */));
-         adj0 = ip_get_adjacency (lm4, adj_index0);
+         lb0 = load_balance_get (lb_index0);
+          dpo0 = load_balance_get_bucket_i(lb0, 0);
 
           vlib_increment_combined_counter 
-              (vcm, cpu_index, adj_index0, 1,
+              (vcm, cpu_index, lb_index0, 1,
                vlib_buffer_length_in_chain (vm, b0) 
                + sizeof(ethernet_header_t));
 
-          if (PREDICT_FALSE(adj0->lookup_next_index != IP_LOOKUP_NEXT_LOCAL))
+          if (PREDICT_FALSE(dpo0->dpoi_type != DPO_RECEIVE))
             {
               b0->error = node->errors[IP4_COP_WHITELIST_ERROR_DROPPED];
               next0 = RX_COP_DROP;
index 4a8f33f..c2e16cc 100644 (file)
@@ -13,6 +13,8 @@
  * limitations under the License.
  */
 #include <vnet/cop/cop.h>
+#include <vnet/fib/ip6_fib.h>
+#include <vnet/dpo/load_balance.h>
 
 typedef struct {
   u32 next_index;
@@ -58,8 +60,7 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm,
   cop_feature_type_t next_index;
   cop_main_t *cm = &cop_main;
   ip6_main_t * im6 = &ip6_main;
-  ip_lookup_main_t * lm6 = &im6->lookup_main;
-  vlib_combined_counter_main_t * vcm = &im6->lookup_main.adjacency_counters;
+  vlib_combined_counter_main_t * vcm = &load_balance_main.lbm_via_counters;
   u32 cpu_index = vm->cpu_index;
 
   from = vlib_frame_vector_args (frame);
@@ -82,9 +83,10 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm,
           ip6_header_t * ip0, * ip1;
           cop_config_main_t * ccm0, * ccm1;
           cop_config_data_t * c0, * c1;
-          u32 adj_index0, adj_index1;
-          ip_adjacency_t * adj0, * adj1;
-          
+          u32 lb_index0, lb_index1;
+          const load_balance_t * lb0, *lb1;
+          const dpo_id_t *dpo0, *dpo1;
+         
          /* Prefetch next iteration. */
          {
            vlib_buffer_t * p2, * p3;
@@ -120,10 +122,12 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm,
                &next0,
                sizeof (c0[0]));
 
-          adj_index0 = ip6_fib_lookup_with_table (im6, c0->fib_index, 
-                                                  &ip0->src_address);
-         adj0 = ip_get_adjacency (lm6, adj_index0);
-          if (PREDICT_FALSE(adj0->lookup_next_index != IP_LOOKUP_NEXT_LOCAL))
+          lb_index0 = ip6_fib_table_fwding_lookup (im6, c0->fib_index, 
+                                                   &ip0->src_address);
+         lb0 = load_balance_get (lb_index0);
+          dpo0 = load_balance_get_bucket_i(lb0, 0);
+
+          if (PREDICT_FALSE(dpo0->dpoi_type != DPO_RECEIVE))
             {
               b0->error = node->errors[IP6_COP_WHITELIST_ERROR_DROPPED];
               next0 = RX_COP_DROP;
@@ -142,28 +146,23 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm,
                &next1,
                sizeof (c1[0]));
 
-          adj_index1 = ip6_fib_lookup_with_table (im6, c1->fib_index, 
-                                                  &ip1->src_address);
+          lb_index1 = ip6_fib_table_fwding_lookup (im6, c1->fib_index, 
+                                                   &ip1->src_address);
 
-         adj1 = ip_get_adjacency (lm6, adj_index1);
+         lb1 = load_balance_get (lb_index1);
+          dpo1 = load_balance_get_bucket_i(lb1, 0);
 
           vlib_increment_combined_counter 
-              (vcm, cpu_index, adj_index0, 1,
+              (vcm, cpu_index, lb_index0, 1,
                vlib_buffer_length_in_chain (vm, b0) 
                + sizeof(ethernet_header_t));
 
           vlib_increment_combined_counter 
-              (vcm, cpu_index, adj_index1, 1,
+              (vcm, cpu_index, lb_index1, 1,
                vlib_buffer_length_in_chain (vm, b1)
                + sizeof(ethernet_header_t));
 
-          if (PREDICT_FALSE(adj0->lookup_next_index != IP_LOOKUP_NEXT_LOCAL))
-            {
-              b0->error = node->errors[IP6_COP_WHITELIST_ERROR_DROPPED];
-              next0 = RX_COP_DROP;
-            }
-
-          if (PREDICT_FALSE(adj1->lookup_next_index != IP_LOOKUP_NEXT_LOCAL))
+          if (PREDICT_FALSE(dpo1->dpoi_type != DPO_RECEIVE))
             {
               b1->error = node->errors[IP6_COP_WHITELIST_ERROR_DROPPED];
               next1 = RX_COP_DROP;
@@ -202,8 +201,9 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm,
           ip6_header_t * ip0;
           cop_config_main_t *ccm0;
           cop_config_data_t *c0;
-          u32 adj_index0;
-          ip_adjacency_t * adj0;
+          u32 lb_index0;
+          const load_balance_t * lb0;
+          const dpo_id_t *dpo0;
 
           /* speculatively enqueue b0 to the current next frame */
          bi0 = from[0];
@@ -226,17 +226,18 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm,
                &next0,
                sizeof (c0[0]));
 
-          adj_index0 = ip6_fib_lookup_with_table (im6, c0->fib_index, 
-                                                  &ip0->src_address);
+          lb_index0 = ip6_fib_table_fwding_lookup (im6, c0->fib_index, 
+                                                   &ip0->src_address);
 
-         adj0 = ip_get_adjacency (lm6, adj_index0);
+         lb0 = load_balance_get (lb_index0);
+          dpo0 = load_balance_get_bucket_i(lb0, 0);
 
           vlib_increment_combined_counter 
-              (vcm, cpu_index, adj_index0, 1,
+              (vcm, cpu_index, lb_index0, 1,
                vlib_buffer_length_in_chain (vm, b0) 
                + sizeof(ethernet_header_t));
 
-          if (PREDICT_FALSE(adj0->lookup_next_index != IP_LOOKUP_NEXT_LOCAL))
+          if (PREDICT_FALSE(dpo0->dpoi_type != DPO_RECEIVE))
             {
               b0->error = node->errors[IP6_COP_WHITELIST_ERROR_DROPPED];
               next0 = RX_COP_DROP;
index 2ffb958..9e8fed4 100644 (file)
@@ -21,7 +21,7 @@
 #include <vnet/ethernet/ethernet.h>
 #include <vnet/devices/dpdk/dpdk.h>
 #include <vnet/classify/vnet_classify.h>
-#include <vnet/mpls-gre/packet.h>
+#include <vnet/mpls/packet.h>
 
 #include "dpdk_priv.h"
 
index a9e286e..63e7e55 100644 (file)
@@ -21,7 +21,7 @@
 #include <vnet/ethernet/ethernet.h>
 #include <vnet/devices/dpdk/dpdk.h>
 #include <vnet/classify/vnet_classify.h>
-#include <vnet/mpls-gre/packet.h>
+#include <vnet/mpls/packet.h>
 #include <vnet/handoff.h>
 
 #include "dpdk_priv.h"
@@ -687,7 +687,7 @@ poll_rate_limit (dpdk_main_t * dm)
 
     <em>Next Nodes:</em>
     - Static arcs to: error-drop, ethernet-input,
-      ip4-input-no-checksum, ip6-input, mpls-gre-input
+      ip4-input-no-checksum, ip6-input, mpls-input
     - per-interface redirection, controlled by
       <code>xd->per_interface_next_index</code>
 */
@@ -791,7 +791,7 @@ VLIB_REGISTER_NODE (dpdk_input_node) = {
     [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
     [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
     [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input",
-    [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input",
+    [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-input",
   },
 };
 
@@ -805,7 +805,6 @@ VLIB_NODE_FUNCTION_MULTIARCH_CLONE(dpdk_input_efd)
 CLIB_MULTIARCH_SELECT_FN(dpdk_input);
 CLIB_MULTIARCH_SELECT_FN(dpdk_input_rss);
 CLIB_MULTIARCH_SELECT_FN(dpdk_input_efd);
-/* *INDENT-ON* */
 
 /*
  * Override the next nodes for the dpdk input nodes.
@@ -876,11 +875,3 @@ efd_config (u32 enabled,
   set_efd_bitmap (&tm->efd.mpls_exp_bitmap, mpls_exp, mpls_op);
   set_efd_bitmap (&tm->efd.vlan_cos_bitmap, vlan_cos, vlan_op);
 }
-
-/*
- * fd.io coding-style-patch-verification: ON
- *
- * Local Variables:
- * eval: (c-set-style "gnu")
- * End:
- */
index e7d9792..e613cc9 100644 (file)
@@ -330,7 +330,7 @@ VLIB_REGISTER_NODE (ssvm_eth_input_node) = {
         [SSVM_ETH_INPUT_NEXT_ETHERNET_INPUT] = "ethernet-input",
         [SSVM_ETH_INPUT_NEXT_IP4_INPUT] = "ip4-input",
         [SSVM_ETH_INPUT_NEXT_IP6_INPUT] = "ip6-input",
-        [SSVM_ETH_INPUT_NEXT_MPLS_INPUT] = "mpls-gre-input",
+        [SSVM_ETH_INPUT_NEXT_MPLS_INPUT] = "mpls-input",
   },
 };
 
index 5916cfd..ffe6e8d 100644 (file)
  */
 #include <vlib/vlib.h>
 #include <vnet/dhcp/proxy.h>
+#include <vnet/fib/fib_table.h>
 
 dhcp_client_main_t dhcp_client_main;
 static u8 * format_dhcp_client_state (u8 * s, va_list * va);
 static vlib_node_registration_t dhcp_client_process_node;
 
-void __attribute__((weak))
-api_config_default_ip_route (u8 is_ipv6, u8 is_add, u32 vrf_id,
-                             u32 sw_if_index, u8 *next_hop_addr)
-{
-  /* dummy function */
-  return;
-}
-
 static void 
 dhcp_client_acquire_address (dhcp_client_main_t * dcm, dhcp_client_t * c)
 {
@@ -214,14 +207,34 @@ int dhcp_client_for_us (u32 bi, vlib_buffer_t * b,
 
           /*
            * Configure default IP route:
-           *  - vrf_id is 0 by default.
            */
           if (c->router_address.as_u32)
-             api_config_default_ip_route (0 /* is_ipv6 */,
-                                           1 /* is_add */,
-                                           0 /* vrf_id */,
-                                           c->sw_if_index,
-                                           (u8 *)&c->router_address);
+           {
+             fib_prefix_t all_0s =
+             {
+                 .fp_len = 0,
+                 .fp_addr.ip4.as_u32 = 0x0,
+                 .fp_proto = FIB_PROTOCOL_IP4,
+             };
+             ip46_address_t nh =
+             {
+                 .ip4 = c->router_address,
+             };
+
+             fib_table_entry_path_add (fib_table_get_index_for_sw_if_index(
+                                          FIB_PROTOCOL_IP4,
+                                          c->sw_if_index),
+                                       &all_0s,
+                                       FIB_SOURCE_DHCP,
+                                       FIB_ENTRY_FLAG_NONE,
+                                       FIB_PROTOCOL_IP4,
+                                       &nh,
+                                       c->sw_if_index,
+                                       ~0,
+                                       1,
+                                       MPLS_LABEL_INVALID,
+                                       FIB_ROUTE_PATH_FLAG_NONE);
+           }
 
           /*
            * Call the user's event callback to report DHCP information
@@ -496,11 +509,29 @@ dhcp_bound_state (dhcp_client_main_t * dcm, dhcp_client_t * c, f64 now)
   if (now > c->lease_expires)
     {
       if (c->router_address.as_u32)
-        api_config_default_ip_route (0 /* is_ipv6 */,
-                                     0 /* is_add */,
-                                     0 /* vrf_id */,
-                                     c->sw_if_index,
-                                     (u8 *)&c->router_address);
+        {
+         fib_prefix_t all_0s =
+         {
+             .fp_len = 0,
+             .fp_addr.ip4.as_u32 = 0x0,
+             .fp_proto = FIB_PROTOCOL_IP4,
+         };
+         ip46_address_t nh = {
+             .ip4 = c->router_address,
+         };
+
+         fib_table_entry_path_remove(fib_table_get_index_for_sw_if_index(
+                                         FIB_PROTOCOL_IP4,
+                                         c->sw_if_index),
+                                     &all_0s,
+                                     FIB_SOURCE_DHCP,
+                                     FIB_PROTOCOL_IP4,
+                                     &nh,
+                                     c->sw_if_index,
+                                     ~0,
+                                     1,
+                                     FIB_ROUTE_PATH_FLAG_NONE);
+       }
 
       dhcp_client_release_address (dcm, c);
       c->state = DHCP_DISCOVER;
@@ -689,7 +720,7 @@ show_dhcp_client_command_fn (vlib_main_t * vm,
       p = hash_get (dcm->client_by_sw_if_index, sw_if_index);
       if (p == 0)
         return clib_error_return (0, "dhcp client not configured");
-      c = pool_elt_at_index (dcm->clients, sw_if_index);
+      c = pool_elt_at_index (dcm->clients, p[0]);
       vlib_cli_output (vm, "%U", format_dhcp_client, dcm, c, verbose);
       return 0;
     }
@@ -715,6 +746,18 @@ int dhcp_client_add_del (dhcp_client_add_del_args_t * a)
   vlib_main_t * vm = dcm->vlib_main;
   dhcp_client_t * c;
   uword * p;
+  fib_prefix_t all_1s =
+  {
+      .fp_len = 32,
+      .fp_addr.ip4.as_u32 = 0xffffffff,
+      .fp_proto = FIB_PROTOCOL_IP4,
+  };
+  fib_prefix_t all_0s =
+  {
+      .fp_len = 0,
+      .fp_addr.ip4.as_u32 = 0x0,
+      .fp_proto = FIB_PROTOCOL_IP4,
+  };
 
   p = hash_get (dcm->client_by_sw_if_index, a->sw_if_index);
 
@@ -738,6 +781,22 @@ int dhcp_client_add_del (dhcp_client_add_del_args_t * a)
       } while (c->transaction_id == 0);
       set_l2_rewrite (dcm, c);
       hash_set (dcm->client_by_sw_if_index, a->sw_if_index, c - dcm->clients);
+
+      /* this add is ref counted by FIB so we can add for each itf */
+      fib_table_entry_special_add(fib_table_get_index_for_sw_if_index(
+                                     FIB_PROTOCOL_IP4,
+                                     c->sw_if_index),
+                                 &all_1s,
+                                 FIB_SOURCE_DHCP,
+                                 FIB_ENTRY_FLAG_LOCAL,
+                                 ADJ_INDEX_INVALID);
+
+     /*
+       * enable the interface to RX IPv4 packets
+       * this is also ref counted
+       */
+      ip4_sw_interface_enable_disable (c->sw_if_index, 1);
+
       vlib_process_signal_event (vm, dhcp_client_process_node.index, 
                                  EVENT_DHCP_CLIENT_WAKEUP, c - dcm->clients);
     }
@@ -745,12 +804,32 @@ int dhcp_client_add_del (dhcp_client_add_del_args_t * a)
     {
       c = pool_elt_at_index (dcm->clients, p[0]);
 
+      fib_table_entry_special_remove(fib_table_get_index_for_sw_if_index(
+                                        FIB_PROTOCOL_IP4,
+                                        c->sw_if_index),
+                                    &all_1s,
+                                    FIB_SOURCE_DHCP);
+
       if (c->router_address.as_u32)
-        api_config_default_ip_route (0 /* is_ipv6 */,
-                                     0 /* is_add */,
-                                     0 /* vrf_id */,
-                                     c->sw_if_index,
-                                     (u8 *)&c->router_address);
+      {
+         ip46_address_t nh = {
+             .ip4 = c->router_address,
+         };
+
+         fib_table_entry_path_remove(fib_table_get_index_for_sw_if_index(
+                                         FIB_PROTOCOL_IP4,
+                                         c->sw_if_index),
+                                     &all_0s,
+                                     FIB_SOURCE_DHCP,
+                                     FIB_PROTOCOL_IP4,
+                                     &nh,
+                                     c->sw_if_index,
+                                     ~0,
+                                     1,
+                                     FIB_ROUTE_PATH_FLAG_NONE);
+      }
+      ip4_sw_interface_enable_disable (c->sw_if_index, 0);
+
       vec_free (c->option_55_data);
       vec_free (c->hostname);
       vec_free (c->client_identifier);
index 2073b3f..7018fc3 100644 (file)
@@ -18,6 +18,7 @@
 #include <vlib/vlib.h>
 #include <vnet/pg/pg.h>
 #include <vnet/dhcp/proxy.h>
+#include <vnet/fib/ip4_fib.h>
 
 static char * dhcp_proxy_error_strings[] = {
 #define dhcp_proxy_error(n,s) s,
@@ -225,7 +226,7 @@ dhcp_proxy_to_server_input (vlib_main_t * vm,
               
               fib_index = im->fib_index_by_sw_if_index 
                 [vnet_buffer(b0)->sw_if_index[VLIB_RX]];
-             fib = vec_elt_at_index (im->fibs, fib_index);
+             fib = ip4_fib_get (fib_index);
              fib_id = fib->table_id;
 
               end = b0->data + b0->current_data + b0->current_length;
@@ -699,9 +700,7 @@ int dhcp_proxy_set_server_2 (ip4_address_t *addr, ip4_address_t *src_address,
                              int insert_option_82, int is_del)
 {
   dhcp_proxy_main_t * dpm = &dhcp_proxy_main;
-  ip4_main_t * im = &ip4_main;
   dhcp_server_t * server = 0;
-  ip4_fib_t *rx_fib, *server_fib;
   u32 server_index = 0;
   u32 rx_fib_index = 0;
 
@@ -711,18 +710,11 @@ int dhcp_proxy_set_server_2 (ip4_address_t *addr, ip4_address_t *src_address,
   if (src_address->as_u32 == 0)
     return VNET_API_ERROR_INVALID_SRC_ADDRESS;
 
-  rx_fib = find_ip4_fib_by_table_index_or_id 
-    (&ip4_main, rx_fib_id, IP4_ROUTE_FLAG_TABLE_ID);
-    
-  if (rx_fib == 0)
-    return VNET_API_ERROR_NO_SUCH_INNER_FIB;
-  
-  server_fib = find_ip4_fib_by_table_index_or_id 
-    (&ip4_main, server_fib_id, IP4_ROUTE_FLAG_TABLE_ID);
-    
-  if (server_fib == 0)
-    return VNET_API_ERROR_NO_SUCH_FIB;
-  
+  rx_fib_index = fib_table_find_or_create_and_lock(FIB_PROTOCOL_IP4,
+                                                   rx_fib_id);
+  server_index = fib_table_find_or_create_and_lock(FIB_PROTOCOL_IP4,
+                                                   server_fib_id);
+
   if (rx_fib_id == 0)
     {
       server = pool_elt_at_index (dpm->dhcp_servers, 0);
@@ -735,8 +727,6 @@ int dhcp_proxy_set_server_2 (ip4_address_t *addr, ip4_address_t *src_address,
       goto initialize_it;
     }
 
-  rx_fib_index = rx_fib - im->fibs;
-
   if (is_del)
     {
       if (rx_fib_index >= vec_len(dpm->dhcp_server_index_by_rx_fib_index))
@@ -768,7 +758,7 @@ int dhcp_proxy_set_server_2 (ip4_address_t *addr, ip4_address_t *src_address,
  initialize_it:
 
   server->dhcp_server.as_u32 = addr->as_u32;
-  server->server_fib_index = server_fib - im->fibs;
+  server->server_fib_index = server_index;
   server->dhcp_src_address.as_u32 = src_address->as_u32;
   server->insert_option_82 = insert_option_82;
   server->valid = 1;
@@ -883,14 +873,12 @@ u8 * format_dhcp_proxy_server (u8 * s, va_list * args)
       return s;
     }
 
-  server_fib = find_ip4_fib_by_table_index_or_id 
-    (&ip4_main, server->server_fib_index, IP4_ROUTE_FLAG_FIB_INDEX);
+  server_fib = ip4_fib_get(server->server_fib_index);
 
   if (server_fib)
     server_fib_id = server_fib->table_id;
 
-  rx_fib = find_ip4_fib_by_table_index_or_id 
-    (&ip4_main, rx_fib_index, IP4_ROUTE_FLAG_FIB_INDEX);
+  rx_fib = ip4_fib_get(rx_fib_index);
 
   if (rx_fib)
     rx_fib_id = rx_fib->table_id;
index 4dc746f..323bdf9 100644 (file)
@@ -18,6 +18,7 @@
 #include <vlib/vlib.h>
 #include <vnet/pg/pg.h>
 #include <vnet/dhcpv6/proxy.h>
+#include <vnet/fib/ip6_fib.h>
 
 static char * dhcpv6_proxy_error_strings[] = {
 #define dhcpv6_proxy_error(n,s) s,
@@ -323,7 +324,7 @@ dhcpv6_proxy_to_server_input (vlib_main_t * vm,
 
           fib_index = im->fib_index_by_sw_if_index 
               [vnet_buffer(b0)->sw_if_index[VLIB_RX]];
-          fib = vec_elt_at_index (im->fibs, fib_index);
+          fib = ip6_fib_get (fib_index);
           fib_id = fib->table_id;
 
           p_vss = hash_get (dpm->vss_index_by_vrf_id,
@@ -573,7 +574,7 @@ dhcpv6_proxy_to_client_input (vlib_main_t * vm,
 
       svr_fib_index = im->fib_index_by_sw_if_index
           [vnet_buffer(b0)->sw_if_index[VLIB_RX]];
-      svr_fib = vec_elt_at_index (im->fibs, svr_fib_index);
+      svr_fib = ip6_fib_get (svr_fib_index);
       svr_fib_id = svr_fib->table_id;
 
       if (svr_fib_id != dpm->server_fib_index ||
@@ -831,8 +832,7 @@ u8 * format_dhcpv6_proxy_server (u8 * s, va_list * args)
       return s;
     }
 
-  f = find_ip6_fib_by_table_index_or_id (&ip6_main, dm->server_fib_index,
-                                         IP6_ROUTE_FLAG_FIB_INDEX);
+  f = ip6_fib_get (dm->server_fib_index);
   if (f)
     fib_id = f->table_id;
 
diff --git a/vnet/vnet/dpo/classify_dpo.c b/vnet/vnet/dpo/classify_dpo.c
new file mode 100644 (file)
index 0000000..3b7b98f
--- /dev/null
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/dpo/classify_dpo.h>
+#include <vnet/mpls/mpls.h>
+
+/*
+ * pool of all classify DPOs
+ */
+classify_dpo_t *classify_dpo_pool;
+
+static classify_dpo_t *
+classify_dpo_alloc (void)
+{
+    classify_dpo_t *cd;
+
+    pool_get_aligned(classify_dpo_pool, cd, CLIB_CACHE_LINE_BYTES);
+    memset(cd, 0, sizeof(*cd));
+
+    return (cd);
+}
+
+static index_t
+classify_dpo_get_index (classify_dpo_t *cd)
+{
+    return (cd - classify_dpo_pool);
+}
+
+index_t
+classify_dpo_create (fib_protocol_t proto,
+                     u32 classify_table_index)
+{
+    classify_dpo_t *cd;
+
+    cd = classify_dpo_alloc();
+    cd->cd_proto = proto;
+    cd->cd_table_index = classify_table_index;
+
+    return (classify_dpo_get_index(cd));
+}
+
+u8*
+format_classify_dpo (u8 *s, va_list *args)
+{
+    index_t index = va_arg (*args, index_t);
+    CLIB_UNUSED(u32 indent) = va_arg (*args, u32);
+    classify_dpo_t *cd;
+
+    cd = classify_dpo_get(index);
+
+    return (format(s, "classify:[%d]:table:%d",
+                  index, cd->cd_table_index));
+}
+
+static void
+classify_dpo_lock (dpo_id_t *dpo)
+{
+    classify_dpo_t *cd;
+
+    cd = classify_dpo_get(dpo->dpoi_index);
+
+    cd->cd_locks++;
+}
+
+static void
+classify_dpo_unlock (dpo_id_t *dpo)
+{
+    classify_dpo_t *cd;
+
+    cd = classify_dpo_get(dpo->dpoi_index);
+
+    cd->cd_locks--;
+
+    if (0 == cd->cd_locks)
+    {
+       pool_put(classify_dpo_pool, cd);
+    }
+}
+
+const static dpo_vft_t cd_vft = {
+    .dv_lock = classify_dpo_lock,
+    .dv_unlock = classify_dpo_unlock,
+    .dv_format = format_classify_dpo,
+};
+
+const static char* const classify_ip4_nodes[] =
+{
+    "ip4-classify",
+    NULL,
+};
+const static char* const classify_ip6_nodes[] =
+{
+    "ip6-classify",
+    NULL,
+};
+const static char* const * const classify_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = classify_ip4_nodes,
+    [DPO_PROTO_IP6]  = classify_ip6_nodes,
+    [DPO_PROTO_MPLS] = NULL,
+};
+
+void
+classify_dpo_module_init (void)
+{
+    dpo_register(DPO_CLASSIFY, &cd_vft, classify_nodes);
+}
diff --git a/vnet/vnet/dpo/classify_dpo.h b/vnet/vnet/dpo/classify_dpo.h
new file mode 100644 (file)
index 0000000..cd35c3c
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CLASSIFY_DPO_H__
+#define __CLASSIFY_DPO_H__
+
+#include <vnet/vnet.h>
+#include <vnet/mpls/packet.h>
+#include <vnet/dpo/dpo.h>
+
+/**
+ * A data-path object that refers to a classify table, by index
+ */
+typedef struct classify_dpo_t
+{
+    /**
+     * The FIB protocol passed to classify_dpo_create()
+     */
+    fib_protocol_t cd_proto;
+
+    /**
+     * The index of the classify table passed to classify_dpo_create()
+     */
+    u32 cd_table_index;
+
+    /**
+     * Number of locks/users of this classify DPO
+     */
+    u16 cd_locks;
+} classify_dpo_t;
+
+extern index_t classify_dpo_create(fib_protocol_t proto,
+                                   u32 classify_table_index);
+
+extern u8* format_classify_dpo(u8 *s, va_list *args);
+
+/*
+ * Encapsulation violation for fast data-path access
+ */
+extern classify_dpo_t *classify_dpo_pool;
+
+/**
+ * @brief Get the classify DPO at the given index in classify_dpo_pool.
+ */
+static inline classify_dpo_t *
+classify_dpo_get (index_t index)
+{
+    classify_dpo_t *cd = pool_elt_at_index(classify_dpo_pool, index);
+
+    return (cd);
+}
+
+extern void classify_dpo_module_init(void);
+
+#endif
diff --git a/vnet/vnet/dpo/dpo.c b/vnet/vnet/dpo/dpo.c
new file mode 100644 (file)
index 0000000..5eff52b
--- /dev/null
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ * A Data-Path Object is an object that represents actions that are
+ * applied to packets as they are switched through VPP.
+ *
+ * The DPO is a base class that is specialised by other objects to provide
+ * concrete actions.
+ *
+ * The VLIB graph nodes are a graph of types, the DPO graph is a graph of instances.
+ */
+
+#include <vnet/dpo/dpo.h>
+#include <vnet/ip/lookup.h>
+#include <vnet/ip/format.h>
+#include <vnet/adj/adj.h>
+
+#include <vnet/dpo/load_balance.h>
+#include <vnet/dpo/mpls_label_dpo.h>
+#include <vnet/dpo/lookup_dpo.h>
+#include <vnet/dpo/drop_dpo.h>
+#include <vnet/dpo/receive_dpo.h>
+#include <vnet/dpo/punt_dpo.h>
+#include <vnet/dpo/classify_dpo.h>
+
+/**
+ * Array of char* names for the DPO types and protos
+ */
+static const char* dpo_type_names[] = DPO_TYPES;
+static const char* dpo_proto_names[] = DPO_PROTOS;
+
+/**
+ * @brief Vector of virtual function tables for the DPO types
+ *
+ * This is a vector so we can dynamically register new DPO types in plugins.
+ */
+static dpo_vft_t *dpo_vfts;
+
+/**
+ * @brief Vector of graph node names associated with each DPO type and protocol.
+ *
+ *   dpo_nodes[child_type][child_proto][node_X] = node_name;
+ * i.e.
+ *   dpo_nodes[DPO_LOAD_BALANCE][DPO_PROTO_IP4][0] = "ip4-lookup"
+ *   dpo_nodes[DPO_LOAD_BALANCE][DPO_PROTO_IP4][1] = "ip4-load-balance"
+ *
+ * This is a vector so we can dynamically register new DPO types in plugins.
+ */
+static const char* const * const ** dpo_nodes;
+
+/**
+ * @brief Vector of edge indices from parent DPO nodes to child
+ *
+ * dpo_edges[child_type][child_proto][parent_type] = edge_index
+ *
+ * This array is derived on demand from the dpo_nodes above. Note that
+ * the third dimension in dpo_nodes is lost, hence, the edge index from each
+ * node MUST be the same.
+ *
+ * Note that this array is child type specific, not child instance specific.
+ */
+static u32 ***dpo_edges;
+
+/**
+ * @brief The DPO type value that can be assigned to the next dynamic
+ *        type registration.
+ */
+static dpo_type_t dpo_dynamic = DPO_LAST;
+
+/**
+ * @brief Format a DPO type as its name.
+ *
+ * Only the types enumerated in dpo_type_t have an entry in dpo_type_names.
+ * Types allocated by dpo_register_new_type() start at DPO_LAST and so lie
+ * beyond the end of that array; those are formatted as "unknown" rather
+ * than reading outside the array.
+ */
+u8 *
+format_dpo_type (u8 * s, va_list * args)
+{
+    dpo_type_t type = va_arg (*args, int);
+    const u32 n_names = sizeof(dpo_type_names) / sizeof(dpo_type_names[0]);
+
+    if (type < n_names && NULL != dpo_type_names[type])
+    {
+        return (format(s, "%s", dpo_type_names[type]));
+    }
+
+    return (format(s, "unknown"));
+}
+
+/**
+ * @brief Format a DPO ID.
+ *
+ * Takes a dpo_id_t pointer and an indent from the argument list. The output
+ * is the DPO's next-node index followed by the output of the format
+ * function registered for the DPO's type. If the type has no format
+ * function, then "unset" is printed for DPO_FIRST and "unknown" otherwise.
+ */
+u8 *
+format_dpo_id (u8 * s, va_list * args)
+{
+    dpo_id_t *dpo = va_arg (*args, dpo_id_t*);
+    u32 indent = va_arg (*args, u32);
+    format_function_t *type_format;
+
+    s = format(s, "[@%d]: ", dpo->dpoi_next_node);
+
+    type_format = dpo_vfts[dpo->dpoi_type].dv_format;
+
+    if (NULL != type_format)
+    {
+        return (format(s, "%U", type_format, dpo->dpoi_index, indent));
+    }
+
+    if (DPO_FIRST == dpo->dpoi_type)
+    {
+        return (format(s, "unset"));
+    }
+
+    return (format(s, "unknown"));
+}
+
+/**
+ * @brief Format a DPO protocol as its name.
+ *
+ * dpo_proto_names only has entries for the enumerated protocols.
+ * DPO_PROTO_NONE (as set by dpo_reset()) lies beyond the end of the array,
+ * and when DPO_PROTO_IP4 is 1 the entry at index 0 is NULL. Both cases are
+ * formatted as "unknown" rather than reading an invalid name.
+ */
+u8 *
+format_dpo_proto (u8 * s, va_list * args)
+{
+    dpo_proto_t proto = va_arg (*args, int);
+    const u32 n_names = sizeof(dpo_proto_names) / sizeof(dpo_proto_names[0]);
+
+    if (proto < n_names && NULL != dpo_proto_names[proto])
+    {
+        return (format(s, "%s", dpo_proto_names[proto]));
+    }
+
+    return (format(s, "unknown"));
+}
+
+/**
+ * @brief Set a DPO ID to refer to the object of the given type and index.
+ *
+ * If the type is DPO_ADJACENCY the type stored is refined from the
+ * adjacency's lookup_next_index: ARP adjacencies are stored as
+ * DPO_ADJACENCY_INCOMPLETE and midchain adjacencies as
+ * DPO_ADJACENCY_MIDCHAIN.
+ *
+ * The new object is locked before the object the DPO ID previously
+ * referred to is unlocked.
+ *
+ * @param dpo
+ *  The DPO ID to set
+ *
+ * @param type
+ *  The dpo_type_t of the DPO
+ *
+ * @param proto
+ *  The dpo_proto_t of the DPO
+ *
+ * @param index
+ *  The type specific index of the DPO
+ */
+void
+dpo_set (dpo_id_t *dpo,
+         dpo_type_t type,
+         dpo_proto_t proto,
+         index_t index)
+{
+    /* remember the previous contents so they can be unlocked at the end */
+    dpo_id_t tmp = *dpo;
+
+    dpo->dpoi_type = type;
+    dpo->dpoi_proto = proto;
+    dpo->dpoi_index = index;
+
+    if (DPO_ADJACENCY == type)
+    {
+        /*
+         * set the adj subtype
+         */
+        ip_adjacency_t *adj;
+
+        adj = adj_get(index);
+
+        switch (adj->lookup_next_index)
+        {
+        case IP_LOOKUP_NEXT_ARP:
+            dpo->dpoi_type = DPO_ADJACENCY_INCOMPLETE;
+            break;
+        case IP_LOOKUP_NEXT_MIDCHAIN:
+            dpo->dpoi_type = DPO_ADJACENCY_MIDCHAIN;
+            break;
+        default:
+            break;
+        }
+    }
+    dpo_lock(dpo);
+    dpo_unlock(&tmp);
+}
+
+/**
+ * @brief Reset a DPO ID to the unset state: type DPO_FIRST, protocol
+ * DPO_PROTO_NONE and index INDEX_INVALID. This is done via dpo_set(), so
+ * the object previously referred to is unlocked.
+ */
+void
+dpo_reset (dpo_id_t *dpo)
+{
+    dpo_set(dpo, DPO_FIRST, DPO_PROTO_NONE, INDEX_INVALID);
+}
+
+/**
+ * \brief
+ * Compare two Data-path objects
+ *
+ * Like memcmp, returns 0 if they match, !0 otherwise. The types are compared
+ * first; the indices are only compared when the types are the same.
+ */
+int
+dpo_cmp (const dpo_id_t *dpo1,
+         const dpo_id_t *dpo2)
+{
+    int type_diff = dpo1->dpoi_type - dpo2->dpoi_type;
+
+    if (0 == type_diff)
+    {
+        return (dpo1->dpoi_index - dpo2->dpoi_index);
+    }
+
+    return (type_diff);
+}
+
+/**
+ * @brief Copy the src DPO ID over the dst DPO ID.
+ *
+ * The object now referred to by dst is locked, then the object dst
+ * previously referred to is unlocked.
+ */
+void
+dpo_copy (dpo_id_t *dst,
+          const dpo_id_t *src)
+{
+    /* keep the old contents of the destination for the unlock below */
+    dpo_id_t old = *dst;
+
+    /*
+     * overwrite the destination with one u64 store, so that, w.r.t. any
+     * packets in flight, the update is atomic.
+     */
+    *((u64*)dst) = *(u64*)src;
+
+    dpo_lock(dst);
+    dpo_unlock(&old);
+}
+
+/**
+ * @brief Return 1 if the DPO is any of the adjacency types, 0 otherwise.
+ */
+int
+dpo_is_adj (const dpo_id_t *dpo)
+{
+    switch (dpo->dpoi_type)
+    {
+    case DPO_ADJACENCY:
+    case DPO_ADJACENCY_INCOMPLETE:
+    case DPO_ADJACENCY_MIDCHAIN:
+    case DPO_ADJACENCY_GLEAN:
+        return (1);
+    default:
+        return (0);
+    }
+}
+
+/**
+ * @brief Record the virtual function table (by value) and the per-protocol
+ * graph node names (by reference) for a DPO type. Both vectors are grown
+ * as needed to hold an entry for the type.
+ */
+void
+dpo_register (dpo_type_t type,
+              const dpo_vft_t *vft,
+              const char * const * const * nodes)
+{
+    vec_validate(dpo_nodes, type);
+    vec_validate(dpo_vfts, type);
+
+    dpo_nodes[type] = nodes;
+    dpo_vfts[type] = *vft;
+}
+
+/**
+ * @brief Allocate the next dynamic DPO type value and register the
+ * virtual function table and node names against it.
+ *
+ * @return The newly allocated dpo_type_t
+ */
+dpo_type_t
+dpo_register_new_type (const dpo_vft_t *vft,
+                       const char * const * const * nodes)
+{
+    dpo_type_t new_type = dpo_dynamic;
+
+    dpo_dynamic++;
+    dpo_register(new_type, vft, nodes);
+
+    return (new_type);
+}
+
+/**
+ * @brief Lock the object a DPO ID refers to, using the lock function
+ * registered for its type. DPO IDs that are not valid are ignored.
+ */
+void
+dpo_lock (dpo_id_t *dpo)
+{
+    if (dpo_id_is_valid(dpo))
+    {
+        dpo_vfts[dpo->dpoi_type].dv_lock(dpo);
+    }
+}
+
+/**
+ * @brief Unlock the object a DPO ID refers to, using the unlock function
+ * registered for its type. DPO IDs that are not valid are ignored.
+ */
+void
+dpo_unlock (dpo_id_t *dpo)
+{
+    if (dpo_id_is_valid(dpo))
+    {
+        dpo_vfts[dpo->dpoi_type].dv_unlock(dpo);
+    }
+}
+
+
+/**
+ * @brief Get, creating on first use, the VLIB graph edge index used to
+ * send packets from the nodes of a child DPO type to the nodes of the
+ * parent DPO.
+ *
+ * The result is cached in dpo_edges[child_type][child_proto][parent_type].
+ * On a cache miss an arc is added from each node registered for the child's
+ * type and protocol to each node registered for the parent's type and
+ * protocol. Only one edge index is cached, so each arc added is ASSERTed to
+ * have the same index.
+ *
+ * @param child_type
+ *  The type of the child DPO
+ *
+ * @param child_proto
+ *  The data-path protocol of the child DPO
+ *
+ * @param parent_dpo
+ *  The parent DPO; its type and protocol select the nodes the arcs go to.
+ *
+ * @return The edge index
+ */
+static u32
+dpo_get_next_node (dpo_type_t child_type,
+                   dpo_proto_t child_proto,
+                   const dpo_id_t *parent_dpo)
+{
+    dpo_proto_t parent_proto;
+    dpo_type_t parent_type;
+
+    parent_type = parent_dpo->dpoi_type;
+    parent_proto = parent_dpo->dpoi_proto;
+
+    vec_validate(dpo_edges, child_type);
+    vec_validate(dpo_edges[child_type], child_proto);
+    vec_validate_init_empty(dpo_edges[child_type][child_proto],
+                            parent_type, ~0);
+
+    /*
+     * if the edge index has not yet been created for this node to node transition
+     */
+    if (~0 == dpo_edges[child_type][child_proto][parent_type])
+    {
+        vlib_node_t *parent_node, *child_node;
+        vlib_main_t *vm;
+        u32 edge, pp, cc;
+
+        vm = vlib_get_main();
+
+        ASSERT(NULL != dpo_nodes[child_type]);
+        ASSERT(NULL != dpo_nodes[child_type][child_proto]);
+        ASSERT(NULL != dpo_nodes[parent_type]);
+        ASSERT(NULL != dpo_nodes[parent_type][parent_proto]);
+
+        cc = 0;
+
+        /*
+         * create a graph arc from each of the child's registered node types,
+         * to each of the parent's.
+         */
+        while (NULL != dpo_nodes[child_type][child_proto][cc])
+        {
+            child_node =
+                vlib_get_node_by_name(vm,
+                                      (u8*) dpo_nodes[child_type][child_proto][cc]);
+            ASSERT(NULL != child_node);
+
+            pp = 0;
+
+            /*
+             * the parent's node list is selected by the parent's protocol,
+             * both for the termination test and for the name lookup.
+             */
+            while (NULL != dpo_nodes[parent_type][parent_proto][pp])
+            {
+                parent_node =
+                    vlib_get_node_by_name(vm,
+                                          (u8*) dpo_nodes[parent_type][parent_proto][pp]);
+                ASSERT(NULL != parent_node);
+
+                edge = vlib_node_add_next(vm,
+                                          child_node->index,
+                                          parent_node->index);
+
+                if (~0 == dpo_edges[child_type][child_proto][parent_type])
+                {
+                    dpo_edges[child_type][child_proto][parent_type] = edge;
+                }
+                else
+                {
+                    ASSERT(dpo_edges[child_type][child_proto][parent_type] == edge);
+                }
+                pp++;
+            }
+            cc++;
+        }
+    }
+
+    return (dpo_edges[child_type][child_proto][parent_type]);
+}
+
+/**
+ * @brief Stack one DPO object on another, and thus establish a child parent
+ * relationship. The VLIB graph edge to use is passed by the caller.
+ *
+ * @param edge
+ *  The edge index stored as the next node of the stacked DPO
+ *
+ * @param dpo
+ *  The DPO ID that is overwritten with the parent plus the edge
+ *
+ * @param parent
+ *  The parent DPO to stack onto
+ */
+static void
+dpo_stack_i (u32 edge,
+             dpo_id_t *dpo,
+             const dpo_id_t *parent)
+{
+    /*
+     * build the new value in a temporary: a copy of the parent to which the
+     * next node is added. The destination can then be updated with one copy.
+     */
+    dpo_id_t stacked = DPO_NULL;
+
+    dpo_copy(&stacked, parent);
+    stacked.dpoi_next_node = edge;
+
+    /*
+     * this update is atomic.
+     */
+    dpo_copy(dpo, &stacked);
+
+    /*
+     * release the lock held by the temporary
+     */
+    dpo_reset(&stacked);
+}
+
+/**
+ * @brief Stack one DPO object on another, and thus establish a child-parent
+ * relationship. The VLIB graph edge used is the one found (or created) for
+ * the child type and protocol passed and the parent's type.
+ */
+void
+dpo_stack (dpo_type_t child_type,
+           dpo_proto_t child_proto,
+           dpo_id_t *dpo,
+           const dpo_id_t *parent)
+{
+    u32 edge = dpo_get_next_node(child_type, child_proto, parent);
+
+    dpo_stack_i(edge, dpo, parent);
+}
+
+/**
+ * @brief Stack one DPO object on another, and thus establish a child parent
+ * relationship. A VLIB graph arc is added from the child node passed to the
+ * first node registered for the parent's type and protocol.
+ */
+void
+dpo_stack_from_node (u32 child_node_index,
+                     dpo_id_t *dpo,
+                     const dpo_id_t *parent)
+{
+    dpo_type_t parent_type = parent->dpoi_type;
+    dpo_proto_t parent_proto = parent->dpoi_proto;
+    vlib_main_t *vm = vlib_get_main();
+    const char *parent_node_name;
+    vlib_node_t *parent_node;
+    u32 edge;
+
+    ASSERT(NULL != dpo_nodes[parent_type]);
+    ASSERT(NULL != dpo_nodes[parent_type][parent_proto]);
+
+    /* only the first of the parent's registered nodes is used */
+    parent_node_name = dpo_nodes[parent_type][parent_proto][0];
+    parent_node = vlib_get_node_by_name(vm, (u8*) parent_node_name);
+
+    edge = vlib_node_add_next(vm, child_node_index, parent_node->index);
+
+    dpo_stack_i(edge, dpo, parent);
+}
+
+/**
+ * @brief Initialise the DPO module by calling the init function of each of
+ * the DPO types listed below. Always returns NULL.
+ */
+static clib_error_t *
+dpo_module_init (vlib_main_t * vm)
+{
+    drop_dpo_module_init();
+    punt_dpo_module_init();
+    receive_dpo_module_init();
+    load_balance_module_init();
+    mpls_label_dpo_module_init();
+    classify_dpo_module_init();
+    lookup_dpo_module_init();
+
+    return (NULL);
+}
+
+VLIB_INIT_FUNCTION(dpo_module_init);
diff --git a/vnet/vnet/dpo/dpo.h b/vnet/vnet/dpo/dpo.h
new file mode 100644 (file)
index 0000000..8c22f00
--- /dev/null
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ * A Data-Path Object is an object that represents actions that are
+ * applied to packets as they are switched through VPP's data-path.
+ *
+ * The DPO can be considered to be like a base class that is specialised
+ * by other objects to provide concrete actions.
+ *
+ * The VLIB graph nodes are a graph of DPO types, the DPO graph is a graph of
+ * instances.
+ */
+
+#ifndef __DPO_H__
+#define __DPO_H__
+
+#include <vnet/vnet.h>
+
+/**
+ * @brief An index for adjacencies.
+ * Alas 'C' is not typesafe enough to b0rk when a u32 is used instead of
+ * an index_t. However, for us humans, we can glean much more intent
+ * from the declaration
+ *  foo bar(index_t t);
+ * than we can from
+ *  foo bar(u32 t);
+ */
+typedef u32 index_t;
+
+/**
+ * @brief Invalid index - used when no index is known
+ * blazoned capitals INVALID speak volumes where ~0 does not.
+ */
+#define INDEX_INVALID ((index_t)(~0))
+
+/**
+ * @brief Data path protocol.
+ * Actions performed on packets in the data-plane can be described and represented
+ * by protocol independent objects, i.e. ADJACENCY, but the specific actions
+ * required during ADJACENCY processing can be protocol dependent. For example,
+ * the adjacency rewrite node performs an ip4 checksum calculation, ip6 and MPLS
+ * do not, all 3 perform a TTL decrement. The VLIB graph nodes are thus protocol
+ * dependent, and thus each graph edge/arc is too.
+ * When programming a DPO's next node arc from child to parent it is thus required
+ * to know the parent's data-path protocol so the correct arc index can be used.
+ */
+typedef enum dpo_proto_t_
+{
+    /*
+     * In debug images the first protocol value is 1, not 0.
+     * NOTE(review): presumably so that a zero-filled protocol is not
+     * mistaken for IP4 - confirm.
+     */
+#if CLIB_DEBUG > 0
+    DPO_PROTO_IP4 = 1,
+#else
+    DPO_PROTO_IP4 = 0,
+#endif
+    DPO_PROTO_IP6,
+    DPO_PROTO_MPLS,
+} __attribute__((packed)) dpo_proto_t;
+
+/*
+ * DPO_PROTO_NUM is one more than the highest protocol value, so an array of
+ * this size can be indexed by any dpo_proto_t.
+ * DPO_PROTO_NONE is a value outside of that range.
+ */
+#define DPO_PROTO_NUM (DPO_PROTO_MPLS+1)
+#define DPO_PROTO_NONE (DPO_PROTO_NUM+1)
+
+#define DPO_PROTOS {           \
+    [DPO_PROTO_IP4]  = "ip4",  \
+    [DPO_PROTO_IP6]  = "ip6",  \
+    [DPO_PROTO_MPLS] = "mpls", \
+}
+
+/**
+ * @brief Common types of data-path objects
+ * New types can be dynamically added using dpo_register_new_type()
+ */
+typedef enum dpo_type_t_ {
+    /**
+     * The first value (zero, since no explicit value is given) is reserved.
+     * dpo_id_is_valid() treats a DPO of this type as not initialised, so a
+     * zero-filled dpo_id_t is recognised as unset.
+     */
+    DPO_FIRST,
+    DPO_DROP,
+    DPO_PUNT,
+    /**
+     * @brief load-balancing over a choice of [un]equal cost paths
+     */
+    DPO_LOAD_BALANCE,
+    DPO_ADJACENCY,
+    DPO_ADJACENCY_INCOMPLETE,
+    DPO_ADJACENCY_MIDCHAIN,
+    DPO_ADJACENCY_GLEAN,
+    DPO_RECEIVE,
+    DPO_LOOKUP,
+    DPO_LISP_CP,
+    DPO_CLASSIFY,
+    DPO_MPLS_LABEL,
+    /**
+     * One more than the last enumerated type.
+     */
+    DPO_LAST,
+} __attribute__((packed)) dpo_type_t;
+
+#define DPO_TYPE_NUM DPO_LAST
+
+/**
+ * @brief An initialiser for an array of names, indexed by dpo_type_t,
+ * of the DPO types enumerated above.
+ */
+#define DPO_TYPES {                                             \
+    [DPO_FIRST] = "dpo-invalid",                                \
+    [DPO_DROP] = "dpo-drop",                                    \
+    [DPO_PUNT] = "dpo-punt",                                    \
+    [DPO_ADJACENCY] = "dpo-adjacency",                          \
+    [DPO_ADJACENCY_INCOMPLETE] = "dpo-adjacency-incomplete",    \
+    [DPO_ADJACENCY_MIDCHAIN] = "dpo-adjacency-midchain",        \
+    [DPO_ADJACENCY_GLEAN] = "dpo-glean",                        \
+    [DPO_RECEIVE] = "dpo-receive",                              \
+    [DPO_LOOKUP] = "dpo-lookup",                                \
+    [DPO_LOAD_BALANCE] = "dpo-load-balance",                    \
+    [DPO_LISP_CP] = "dpo-lisp-cp",                              \
+    [DPO_CLASSIFY] = "dpo-classify",                            \
+    [DPO_MPLS_LABEL] = "dpo-mpls-label",                        \
+}
+
+/**
+ * @brief The identity of a DPO is a combination of its type and its
+ * instance number/index of objects of that type
+ */
+typedef struct dpo_id_t_ {
+    /**
+     * the type
+     */
+    dpo_type_t dpoi_type;
+    /**
+     * the data-path protocol of the type.
+     */
+    dpo_proto_t dpoi_proto;
+    /**
+     * The next VLIB node to follow.
+     */
+    u16 dpoi_next_node;
+    /**
+     * the index of objects of that type
+     */
+    index_t dpoi_index;
+} __attribute__ ((aligned(sizeof(u64)))) dpo_id_t;
+
+_Static_assert(sizeof(dpo_id_t) <= sizeof(u64),
+              "DPO ID is greater than sizeof u64 "
+              "atomic updates need to be revisited");
+
+/**
+ * @brief An initialiser for DPOs declared on the stack.
+ */
+#define DPO_NULL {0}
+
+/**
+ * @brief Return true if the DPO object is valid, i.e. has been initialised.
+ * A DPO is valid if its type is not DPO_FIRST and its index is not
+ * INDEX_INVALID.
+ */
+static inline int
+dpo_id_is_valid (const dpo_id_t *dpoi)
+{
+    if (DPO_FIRST == dpoi->dpoi_type)
+    {
+        return (0);
+    }
+
+    return (INDEX_INVALID != dpoi->dpoi_index);
+}
+
+/**
+ * @brief
+ *  Take a reference counting lock on the DPO
+ */
+extern void dpo_lock(dpo_id_t *dpo);
+
+/**
+ * @brief
+ *  Release a reference counting lock on the DPO
+ */
+extern void dpo_unlock(dpo_id_t *dpo);
+
+/**
+ * @brief Set/create a DPO ID
+ * The DPO will be locked.
+ *
+ * @param dpo
+ *  The DPO object to configure
+ *
+ * @param type
+ *  The dpo_type_t of the DPO
+ *
+ * @param proto
+ *  The dpo_proto_t of the DPO
+ *
+ * @param index
+ *  The type specific index of the DPO
+ */
+extern void dpo_set(dpo_id_t *dpo,
+                   dpo_type_t type,
+                   dpo_proto_t proto,
+                   index_t index);
+
+/**
+ * @brief reset a DPO ID
+ * The DPO will be unlocked.
+ *
+ * @param dpo
+ *  The DPO object to reset
+ */
+extern void dpo_reset(dpo_id_t *dpo);
+
+/**
+ * @brief compare two DPOs for equality
+ */
+extern int dpo_cmp(const dpo_id_t *dpo1,
+                  const dpo_id_t *dpo2);
+
+/**
+ * @brief
+ *  atomic copy a data-plane object.
+ * This is safe to use when the dst DPO is currently switching packets
+ */
+extern void dpo_copy(dpo_id_t *dst,
+                    const dpo_id_t *src);
+
+/**
+ * @brief Return TRUE if the DPO is any type of adjacency
+ */
+extern int dpo_is_adj(const dpo_id_t *dpo);
+
+/**
+ * @brief Format a dpo_id_t object
+ */
+extern u8 *format_dpo_id(u8 * s, va_list * args);
+
+/**
+ * @brief Format a DPO type
+ */
+extern u8 *format_dpo_type(u8 * s, va_list * args);
+
+/**
+ * @brief format a DPO protocol
+ */
+extern u8 *format_dpo_proto(u8 * s, va_list * args);
+
+/**
+ * @brief
+ *  Set and stack a DPO.
+ *  The DPO passed is set to the parent DPO and the necessary
+ *  VLIB graph arcs are created. The child_type and child_proto
+ *  are used to get the VLIB nodes from which the arcs are added.
+ *
+ * @param child_type
+ *  Child DPO type.
+ *
+ * @param child_proto
+ *  Child DPO proto
+ *
+ * @param dpo
+ *  This is the DPO to stack and set.
+ *
+ * @param parent_dpo
+ *  The parent DPO to stack onto.
+ */
+extern void dpo_stack(dpo_type_t child_type,
+                      dpo_proto_t child_proto,
+                      dpo_id_t *dpo,
+                      const dpo_id_t *parent_dpo);
+
+/**
+ * @brief
+ *  Set and stack a DPO.
+ *  The DPO passed is set to the parent DPO and the necessary
+ *  VLIB graph arcs are created, from the child_node passed.
+ *
+ * @param child_node
+ *  The VLIB graph node index to create an arc from to the parent
+ *
+ * @param dpo
+ *  This is the DPO to stack and set.
+ *
+ * @param parent
+ *  The parent DPO to stack onto.
+ */
+extern void dpo_stack_from_node(u32 child_node,
+                                dpo_id_t *dpo,
+                                const dpo_id_t *parent);
+
+/**
+ * @brief  A lock function registered for a DPO type
+ */
+typedef void (*dpo_lock_fn_t)(dpo_id_t *dpo);
+
+/**
+ * @brief An unlock function registered for a DPO type
+ */
+typedef void (*dpo_unlock_fn_t)(dpo_id_t *dpo);
+
+/**
+ * @brief A virtual function table registered for a DPO type
+ */
+typedef struct dpo_vft_t_
+{
+    /**
+     * A reference counting lock function
+     */
+    dpo_lock_fn_t dv_lock;
+    /**
+     * A reference counting unlock function
+     */
+    dpo_unlock_fn_t dv_unlock;
+    /**
+     * A format function. This may be NULL; format_dpo_id() checks for that.
+     */
+    format_function_t *dv_format;
+} dpo_vft_t;
+
+
+/**
+ * @brief For a given DPO type Register:
+ *   - a virtual function table
+ *   - a NULL terminated array of graph nodes from which that object type
+ *     will originate packets, i.e. the nodes in which the object type will be
+ *     the parent DPO in the DP graph. The nodes are per-data-path protocol
+ *     (see above).
+ *
+ * @param type
+ *  The type being registered.
+ *
+ * @param vft
+ *  The virtual function table to register for the type.
+ *
+ * @param nodes
+ *  The string description of the per-protocol VLIB graph nodes.
+ */
+void dpo_register(dpo_type_t type,
+                 const dpo_vft_t *vft,
+                  const char * const * const * nodes);
+
+/**
+ * @brief Create and register a new DPO type.
+ *
+ * This can be used by plugins to create new DPO types that are not listed
+ * in dpo_type_t enum
+ *
+ * @param vft
+ *  The virtual function table to register for the type.
+ *
+ * @param nodes
+ *  The string description of the per-protocol VLIB graph nodes.
+ *
+ * @return The new dpo_type_t
+ */
+dpo_type_t dpo_register_new_type(const dpo_vft_t *vft,
+                                 const char * const * const * nodes);
+
+#endif
diff --git a/vnet/vnet/dpo/drop_dpo.c b/vnet/vnet/dpo/drop_dpo.c
new file mode 100644 (file)
index 0000000..62f5648
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ * The data-path object representing dropping the packet
+ */
+
+#include <vnet/dpo/dpo.h>
+
+static dpo_id_t drop_dpos[DPO_PROTO_NUM];
+
+/**
+ * @brief Get the drop DPO for a data-path protocol.
+ *
+ * The per-protocol DPO ID is (re)set on each call, to type DPO_DROP with an
+ * index of 1.
+ * NOTE(review): the index looks to be a placeholder whose only purpose is to
+ * not be INDEX_INVALID - confirm.
+ */
+const dpo_id_t *
+drop_dpo_get (dpo_proto_t proto)
+{
+    dpo_id_t *drop = &drop_dpos[proto];
+
+    dpo_set(drop, DPO_DROP, proto, 1);
+
+    return (drop);
+}
+
+/**
+ * @brief Return non-zero if the DPO is of type DPO_DROP.
+ */
+int
+dpo_is_drop (const dpo_id_t *dpo)
+{
+    int is_drop = (DPO_DROP == dpo->dpoi_type);
+
+    return (is_drop);
+}
+
+/**
+ * @brief The lock function for the drop DPO. It does nothing.
+ */
+static void
+drop_dpo_lock (dpo_id_t *dpo)
+{
+    /*
+     * No lock count is maintained on the drop DPO; it is more trouble
+     * than it is worth. There always needs to be one around, so there is
+     * no point in managing its lifetime.
+     */
+}
+/**
+ * @brief The unlock function for the drop DPO. It does nothing, matching
+ * the lock function.
+ */
+static void
+drop_dpo_unlock (dpo_id_t *dpo)
+{
+}
+
+/**
+ * @brief Format function for the drop DPO.
+ *
+ * The index and indent arguments are taken from the argument list, so that
+ * the list is consumed as for the other DPO format functions, but neither
+ * is used. The argument list is passed by pointer, hence it is dereferenced
+ * for va_arg.
+ */
+static u8*
+format_drop_dpo (u8 *s, va_list *ap)
+{
+    CLIB_UNUSED(index_t index) = va_arg(*ap, index_t);
+    CLIB_UNUSED(u32 indent) = va_arg(*ap, u32);
+
+    return (format(s, "dpo-drop"));
+}
+
+/**
+ * The virtual function table registered for the drop DPO type
+ */
+static const dpo_vft_t drop_vft = {
+    .dv_lock = drop_dpo_lock,
+    .dv_unlock = drop_dpo_unlock,
+    .dv_format = format_drop_dpo,
+};
+
+/**
+ * @brief The per-protocol VLIB graph nodes that are assigned to a drop
+ *        object.
+ *
+ * This means that these graph nodes are the ones from which a drop is the
+ * parent object in the DPO-graph. Each list is NULL terminated.
+ */
+static const char* const drop_ip4_nodes[] =
+{
+    "ip4-drop",
+    NULL,
+};
+static const char* const drop_ip6_nodes[] =
+{
+    "ip6-drop",
+    NULL,
+};
+static const char* const drop_mpls_nodes[] =
+{
+    "mpls-drop",
+    NULL,
+};
+static const char* const * const drop_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = drop_ip4_nodes,
+    [DPO_PROTO_IP6]  = drop_ip6_nodes,
+    [DPO_PROTO_MPLS] = drop_mpls_nodes,
+};
+
+/**
+ * @brief Register the drop DPO type, with its virtual function table and
+ * per-protocol graph node names, with the DPO infrastructure.
+ */
+void
+drop_dpo_module_init (void)
+{
+    dpo_register(DPO_DROP, &drop_vft, drop_nodes);
+}
diff --git a/vnet/vnet/dpo/drop_dpo.h b/vnet/vnet/dpo/drop_dpo.h
new file mode 100644 (file)
index 0000000..e7bd8f5
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ * A Data-Path Object is an object that represents actions that are
+ * applied to packets as they are switched through VPP.
+ *
+ * The DPO is a base class that is specialised by other objects to provide
+ * concrete actions.
+ *
+ * The VLIB graph nodes are a graph of types, the DPO graph is a graph of instances.
+ */
+
+#ifndef __DROP_DPO_H__
+#define __DROP_DPO_H__
+
+#include <vnet/dpo/dpo.h>
+
+extern int dpo_is_drop(const dpo_id_t *dpo);
+
+extern const dpo_id_t *drop_dpo_get(dpo_proto_t proto);
+
+extern void drop_dpo_module_init(void);
+
+#endif
diff --git a/vnet/vnet/dpo/load_balance.c b/vnet/vnet/dpo/load_balance.c
new file mode 100644 (file)
index 0000000..963ff0b
--- /dev/null
@@ -0,0 +1,760 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/ip/lookup.h>
+#include <vnet/dpo/load_balance.h>
+#include <vnet/dpo/load_balance_map.h>
+#include <vnet/dpo/drop_dpo.h>
+#include <vppinfra/math.h>              /* for fabs */
+#include <vnet/adj/adj.h>
+#include <vnet/adj/adj_alloc.h>
+#include <vnet/adj/adj_internal.h>
+
+/*
+ * distribution error tolerance for load-balancing
+ */
+const f64 multipath_next_hop_error_tolerance = 0.1;
+
+#undef LB_DEBUG
+
+#ifdef LB_DEBUG
+#define LB_DBG(_lb, _fmt, _args...)                                     \
+{                                                                       \
+    u8* _tmp =NULL;                                                     \
+    clib_warning("lb:[%s]:" _fmt,                                       \
+                 load_balance_format(load_balance_get_index((_lb)),     \
+                                     0, _tmp),                          \
+                 ##_args);                                              \
+    vec_free(_tmp);                                                     \
+}
+#else
+#define LB_DBG(_p, _fmt, _args...)
+#endif
+
+
+/**
+ * Pool of all DPOs. It's not static so the DP can have fast access
+ */
+load_balance_t *load_balance_pool;
+
+/**
+ * The one instance of load-balance main
+ */
+load_balance_main_t load_balance_main;
+
+/**
+ * @brief Accessor for the distribution error tolerance used when
+ *        normalising multipath weights into buckets.
+ */
+f64
+load_balance_get_multipath_tolerance (void)
+{
+    const f64 tolerance = multipath_next_hop_error_tolerance;
+
+    return (tolerance);
+}
+
+/**
+ * @brief Convert a load-balance pointer into its index in load_balance_pool.
+ */
+static inline index_t
+load_balance_get_index (const load_balance_t *lb)
+{
+    const load_balance_t *pool_base = load_balance_pool;
+
+    return (lb - pool_base);
+}
+
+/**
+ * @brief Return the bucket array currently in use by the load-balance:
+ *        the inline array while the bucket count fits inline, otherwise
+ *        the out-of-line vector.
+ */
+static inline dpo_id_t*
+load_balance_get_buckets (load_balance_t *lb)
+{
+    return (LB_HAS_INLINE_BUCKETS(lb) ?
+            lb->lb_buckets_inline :
+            lb->lb_buckets);
+}
+
+/**
+ * @brief Allocate a zeroed load-balance object from the pool.
+ *
+ * The new object has no load-balance map, and its 'to' and 'via' combined
+ * counters are validated and zeroed at the object's pool index.
+ *
+ * @return the new load-balance object.
+ */
+static load_balance_t *
+load_balance_alloc_i (void)
+{
+    load_balance_t *lb;
+    index_t lbi;
+
+    pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES);
+    memset(lb, 0, sizeof(*lb));
+    lb->lb_map = INDEX_INVALID;
+
+    /* the pool index doubles as the counter index */
+    lbi = load_balance_get_index(lb);
+
+    vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters), lbi);
+    vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters), lbi);
+    vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters), lbi);
+    vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters), lbi);
+
+    return (lb);
+}
+
+/**
+ * @brief Append a textual description of a load-balance object to a vector.
+ *
+ * Prints the index, bucket count, lock count and 'to' counters, the 'via'
+ * counters when their packet count is non-zero, the load-balance map when
+ * there is one, and then one line per bucket.
+ *
+ * @param lbi    index of the load-balance to describe
+ * @param flags  formatting flags; not consulted by this implementation
+ * @param indent number of columns by which the output lines are indented
+ * @param s      vector to append to
+ * @return the (possibly re-allocated) vector
+ */
+static u8*
+load_balance_format (index_t lbi,
+                     load_balance_format_flags_t flags,
+                     u32 indent,
+                     u8 *s)
+{
+    vlib_counter_t to_stats, via_stats;
+    dpo_id_t *bucket_array;
+    load_balance_t *lb;
+    u32 bucket;
+
+    lb = load_balance_get(lbi);
+    bucket_array = load_balance_get_buckets(lb);
+    vlib_get_combined_counter(&(load_balance_main.lbm_to_counters),
+                              lbi, &to_stats);
+    vlib_get_combined_counter(&(load_balance_main.lbm_via_counters),
+                              lbi, &via_stats);
+
+    /* the summary line */
+    s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE);
+    s = format(s, "[index:%d buckets:%d ", lbi, lb->lb_n_buckets);
+    s = format(s, "locks:%d ", lb->lb_locks);
+    s = format(s, "to:[%Ld:%Ld]", to_stats.packets, to_stats.bytes);
+    if (via_stats.packets != 0)
+    {
+        s = format(s, " via:[%Ld:%Ld]",
+                   via_stats.packets, via_stats.bytes);
+    }
+    s = format(s, "]");
+
+    /* the map, when this load-balance has one */
+    if (lb->lb_map != INDEX_INVALID)
+    {
+        s = format(s, "\n%U%U",
+                   format_white_space, indent+4,
+                   format_load_balance_map, lb->lb_map, indent+4);
+    }
+
+    /* one line for each bucket */
+    for (bucket = 0; bucket < lb->lb_n_buckets; bucket++)
+    {
+        s = format(s, "\n%U[%d] %U",
+                   format_white_space, indent+2,
+                   bucket,
+                   format_dpo_id,
+                   &bucket_array[bucket], indent+6);
+    }
+
+    return (s);
+}
+
+/**
+ * @brief format() callback for a load-balance.
+ *
+ * Consumes two variadic arguments, in order: the load-balance index and
+ * the load_balance_format_flags_t. Output is not indented.
+ */
+u8*
+format_load_balance (u8 * s, va_list * args)
+{
+    index_t index;
+    load_balance_format_flags_t fmt_flags;
+
+    index = va_arg(args, index_t);
+    fmt_flags = va_arg(args, load_balance_format_flags_t);
+
+    return (load_balance_format(index, fmt_flags, 0, s));
+}
+/**
+ * @brief The DPO virtual function table's format callback.
+ *
+ * Consumes two variadic arguments, in order: the load-balance index and
+ * the indent. Always formats with LOAD_BALANCE_FORMAT_DETAIL.
+ */
+static u8*
+format_load_balance_dpo (u8 * s, va_list * args)
+{
+    index_t index;
+    u32 n_columns;
+
+    index = va_arg(args, index_t);
+    n_columns = va_arg(args, u32);
+
+    return (load_balance_format(index, LOAD_BALANCE_FORMAT_DETAIL, n_columns, s));
+}
+
+
+/**
+ * @brief Allocate a load-balance and set its bucket count, protocol and
+ *        flow-hash configuration.
+ *
+ * When the bucket count does not fit in the inline array, the out-of-line
+ * bucket vector is allocated here. The buckets themselves are left unset.
+ */
+static load_balance_t *
+load_balance_create_i (u32 num_buckets,
+                       dpo_proto_t lb_proto,
+                       flow_hash_config_t fhc)
+{
+    load_balance_t *lb = load_balance_alloc_i();
+
+    lb->lb_proto = lb_proto;
+    lb->lb_hash_config = fhc;
+    lb->lb_n_buckets = num_buckets;
+    lb->lb_n_buckets_minus_1 = num_buckets-1;
+
+    if (!LB_HAS_INLINE_BUCKETS(lb))
+    {
+        /* too many buckets for the inline array */
+        vec_validate_aligned(lb->lb_buckets,
+                             lb->lb_n_buckets - 1,
+                             CLIB_CACHE_LINE_BYTES);
+    }
+
+    LB_DBG(lb, "create");
+
+    return (lb);
+}
+
+/**
+ * @brief Create a load-balance object and return its pool index.
+ *
+ * @param n_buckets number of buckets the object is created with
+ * @param lb_proto  protocol of the packets that will traverse the object
+ * @param fhc       flow-hash configuration stored in the object
+ */
+index_t
+load_balance_create (u32 n_buckets,
+                     dpo_proto_t lb_proto,
+                     flow_hash_config_t fhc)
+{
+    load_balance_t *lb;
+
+    lb = load_balance_create_i(n_buckets, lb_proto, fhc);
+
+    return (load_balance_get_index(lb));
+}
+
+/**
+ * @brief Stack the given bucket of the supplied bucket array onto 'next',
+ *        as a child of type DPO_LOAD_BALANCE and the load-balance's protocol.
+ */
+static inline void
+load_balance_set_bucket_i (load_balance_t *lb,
+                           u32 bucket,
+                           dpo_id_t *buckets,
+                           const dpo_id_t *next)
+{
+    dpo_id_t *slot = &buckets[bucket];
+
+    dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, slot, next);
+}
+
+/**
+ * @brief Set one bucket of a load-balance to forward via 'next'.
+ *
+ * @param lbi    index of the load-balance
+ * @param bucket bucket to set; asserted to be below the bucket count
+ * @param next   the DPO the bucket is stacked on
+ */
+void
+load_balance_set_bucket (index_t lbi,
+                         u32 bucket,
+                         const dpo_id_t *next)
+{
+    load_balance_t *lb = load_balance_get(lbi);
+
+    ASSERT(bucket < lb->lb_n_buckets);
+
+    load_balance_set_bucket_i(lb,
+                              bucket,
+                              load_balance_get_buckets(lb),
+                              next);
+}
+
+/**
+ * @brief Test whether a DPO is a load-balance whose only bucket is a drop.
+ *
+ * @return the result of dpo_is_drop() on bucket 0 when the DPO is a
+ *         load-balance with exactly one bucket, otherwise 0.
+ */
+int
+load_balance_is_drop (const dpo_id_t *dpo)
+{
+    load_balance_t *lb;
+
+    if (dpo->dpoi_type != DPO_LOAD_BALANCE)
+    {
+        return (0);
+    }
+
+    lb = load_balance_get(dpo->dpoi_index);
+
+    if (lb->lb_n_buckets != 1)
+    {
+        return (0);
+    }
+
+    return (dpo_is_drop(load_balance_get_bucket_i(lb, 0)));
+}
+
+/**
+ * @brief Return the DPO held in the given bucket of a load-balance.
+ *
+ * @param lbi    index of the load-balance
+ * @param bucket bucket number
+ */
+const dpo_id_t *
+load_balance_get_bucket (index_t lbi,
+                         u32 bucket)
+{
+    return (load_balance_get_bucket_i(load_balance_get(lbi), bucket));
+}
+
+/**
+ * @brief Three-way comparison of two paths by weight.
+ *
+ * The weights are u32, so they are compared directly. Casting them to int
+ * and subtracting can overflow, and gives the wrong sign for weights above
+ * INT_MAX.
+ *
+ * @return -1, 0 or +1 when n1's weight is respectively less than, equal to
+ *         or greater than n2's.
+ */
+static int
+next_hop_sort_by_weight (const load_balance_path_t * n1,
+                         const load_balance_path_t * n2)
+{
+    return ((n1->path_weight > n2->path_weight) -
+            (n1->path_weight < n2->path_weight));
+}
+
+/**
+ * @brief Normalise the weights of a set of paths into bucket counts.
+ *
+ * A weight-sorted copy of raw_next_hops is written to *normalized_next_hops
+ * (the raw vector itself is only read). In the copy each path's weight is
+ * replaced by the number of buckets that path should occupy.
+ *
+ * @param raw_next_hops        the paths with the weights supplied by the client
+ * @param normalized_next_hops in/out; the vector to (re)use for the result
+ * @param sum_weight_in        out; the sum of the weights used for the
+ *                             normalisation (1 and 2 on the one- and equal
+ *                             two-path fast paths)
+ * @param multipath_next_hop_error_tolerance the accepted average error per
+ *                             bucket. NOTE: this parameter shadows the
+ *                             file-scope constant of the same name.
+ *
+ * @return the total number of buckets. With ASSERT compiled out and an empty
+ *         input, 0 is returned and neither output parameter is written.
+ */
+u32
+ip_multipath_normalize_next_hops (load_balance_path_t * raw_next_hops,
+                                  load_balance_path_t ** normalized_next_hops,
+                                  u32 *sum_weight_in,
+                                  f64 multipath_next_hop_error_tolerance)
+{
+    load_balance_path_t * nhs;
+    uword n_nhs, n_adj, n_adj_left, i, sum_weight;
+    f64 norm, error;
+
+    n_nhs = vec_len (raw_next_hops);
+    ASSERT (n_nhs > 0);
+    if (n_nhs == 0)
+        return 0;
+
+    /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */
+    nhs = *normalized_next_hops;
+    vec_validate (nhs, 2*n_nhs - 1);
+
+    /* Fast path: 1 next hop in block. */
+    n_adj = n_nhs;
+    if (n_nhs == 1)
+    {
+        nhs[0] = raw_next_hops[0];
+        nhs[0].path_weight = 1;
+        _vec_len (nhs) = 1;
+        sum_weight = 1;
+        goto done;
+    }
+
+    else if (n_nhs == 2)
+    {
+        /* cmp is 1 when the first path has the smaller weight */
+        int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;
+
+        /* Fast sort: the path with the larger weight is placed first. */
+        nhs[0] = raw_next_hops[cmp];
+        nhs[1] = raw_next_hops[cmp ^ 1];
+
+        /* Fast path: equal cost multipath with 2 next hops. */
+        if (nhs[0].path_weight == nhs[1].path_weight)
+        {
+            nhs[0].path_weight = nhs[1].path_weight = 1;
+            _vec_len (nhs) = 2;
+            sum_weight = 2;
+            goto done;
+        }
+    }
+    else
+    {
+        /*
+         * NOTE(review): qsort with this comparator orders by increasing
+         * weight, the opposite of the two-path case above - confirm whether
+         * that difference is intended.
+         */
+        clib_memcpy (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
+        qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
+    }
+
+    /* Find total weight to normalize weights. */
+    sum_weight = 0;
+    for (i = 0; i < n_nhs; i++)
+        sum_weight += nhs[i].path_weight;
+
+    /* In the unlikely case that all weights are given as 0, set them all to 1. */
+    if (sum_weight == 0)
+    {
+        for (i = 0; i < n_nhs; i++)
+            nhs[i].path_weight = 1;
+        sum_weight = n_nhs;
+    }
+
+    /* Save copies of all next hop weights to avoid being overwritten in loop below. */
+    for (i = 0; i < n_nhs; i++)
+        nhs[n_nhs + i].path_weight = nhs[i].path_weight;
+
+    /* Try larger and larger power of 2 sized bucket counts until we find
+       one where the accumulated rounding error is within the tolerance. */
+    for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
+    {
+        error = 0;
+
+        norm = n_adj / ((f64) sum_weight);
+        n_adj_left = n_adj;
+        for (i = 0; i < n_nhs; i++)
+        {
+            f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */
+            word n = flt_round_nearest (nf);
+
+            /* never hand out more buckets than remain */
+            n = n > n_adj_left ? n_adj_left : n;
+            n_adj_left -= n;
+            error += fabs (nf - n);
+            nhs[i].path_weight = n;
+        }
+
+        /* any buckets left over after rounding go to the first path */
+        nhs[0].path_weight += n_adj_left;
+
+        /* Is the average error per bucket within the tolerance for this number of buckets? */
+        if (error <= multipath_next_hop_error_tolerance*n_adj)
+        {
+            /* i == n_nhs here: trim the vector to the paths, dropping the
+               saved copy of the weights. Paths whose normalised weight is
+               zero are kept in the vector. */
+            _vec_len (nhs) = i;
+            break;
+        }
+    }
+
+done:
+    /* Save vector for next call. */
+    *normalized_next_hops = nhs;
+    *sum_weight_in = sum_weight;
+    return n_adj;
+}
+
+/**
+ * @brief Ensure there is at least one path to load-balance over.
+ *
+ * A non-empty path vector is returned untouched. An empty one gains a
+ * single path of weight 1 that is a copy of the drop DPO for the protocol.
+ *
+ * @return the (possibly newly allocated) path vector.
+ */
+static load_balance_path_t *
+load_balance_multipath_next_hop_fixup (load_balance_path_t *nhs,
+                                       dpo_proto_t drop_proto)
+{
+    load_balance_path_t *drop_path;
+
+    if (0 != vec_len(nhs))
+    {
+        return (nhs);
+    }
+
+    /*
+     * the load-balance needs something to forward via, so use the drop
+     */
+    vec_add2(nhs, drop_path, 1);
+
+    drop_path->path_weight = 1;
+    dpo_copy(&drop_path->path_dpo, drop_dpo_get(drop_proto));
+
+    return (nhs);
+}
+
+/**
+ * @brief Fill a bucket array from a set of paths with normalised weights.
+ *
+ * Each path occupies as many consecutive buckets as its weight.
+ *
+ * @param lb        the load-balance that owns the buckets
+ * @param nhs       the paths, with normalised weights
+ * @param buckets   the bucket array to write; it must have room for
+ *                  n_buckets entries
+ * @param n_buckets the number of buckets, used only to assert that the
+ *                  writes stay in range
+ */
+static void
+load_balance_fill_buckets (load_balance_t *lb,
+                           load_balance_path_t *nhs,
+                           dpo_id_t *buckets,
+                           u32 n_buckets)
+{
+    load_balance_path_t * nh;
+    /*
+     * u32 to match path_weight and n_buckets. A narrower counter can never
+     * reach a weight of 65536 or more, so the inner loop would not end.
+     */
+    u32 ii, bucket;
+
+    bucket = 0;
+
+    /*
+     * the next-hops have normalised weights. that means their sum is the number
+     * of buckets we need to fill.
+     */
+    vec_foreach (nh, nhs)
+    {
+        for (ii = 0; ii < nh->path_weight; ii++)
+        {
+            ASSERT(bucket < n_buckets);
+            load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
+        }
+    }
+}
+
+/**
+ * @brief Record a new bucket count, together with the count minus one
+ *        that is kept alongside it.
+ */
+static inline void
+load_balance_set_n_buckets (load_balance_t *lb,
+                            u32 n_buckets)
+{
+    const u32 last_bucket = n_buckets-1;
+
+    lb->lb_n_buckets = n_buckets;
+    lb->lb_n_buckets_minus_1 = last_bucket;
+}
+
+/**
+ * @brief Replace the set of paths that a load-balance distributes over.
+ *
+ * The raw paths are normalised into a power-of-2 number of buckets and the
+ * buckets written. The writes are ordered, with memory barriers, so that the
+ * bucket count visible in the object never exceeds the number of buckets
+ * that are populated.
+ *
+ * @param dpo           the load-balance to update; asserted to be of type
+ *                      DPO_LOAD_BALANCE
+ * @param raw_next_hops the new paths. When empty, one drop path is used.
+ * @param flags         LOAD_BALANCE_FLAG_USES_MAP requests a load-balance map
+ *                      for the new paths
+ */
+void
+load_balance_multipath_update (const dpo_id_t *dpo,
+                               load_balance_path_t * raw_next_hops,
+                               load_balance_flags_t flags)
+{
+    u32 sum_of_weights,n_buckets, ii;
+    load_balance_path_t * nh, * nhs;
+    index_t lbmi, old_lbmi;
+    load_balance_t *lb;
+    dpo_id_t *tmp_dpo;
+
+    nhs = NULL;
+
+    ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
+    lb = load_balance_get(dpo->dpoi_index);
+    raw_next_hops =
+        load_balance_multipath_next_hop_fixup(raw_next_hops,
+                                              lb->lb_proto);
+    n_buckets =
+        ip_multipath_normalize_next_hops(raw_next_hops,
+                                         &nhs,
+                                         &sum_of_weights,
+                                         multipath_next_hop_error_tolerance);
+
+    ASSERT (n_buckets >= vec_len (raw_next_hops));
+
+    /*
+     * Save the old load-balance map used, and get a new one if required.
+     */
+    old_lbmi = lb->lb_map;
+    if (flags & LOAD_BALANCE_FLAG_USES_MAP)
+    {
+        lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs);
+    }
+    else
+    {
+        lbmi = INDEX_INVALID;
+    }
+
+    if (0 == lb->lb_n_buckets)
+    {
+        /*
+         * first time initialisation. no packets inflight, so we can write
+         * at leisure.
+         */
+        load_balance_set_n_buckets(lb, n_buckets);
+
+        if (!LB_HAS_INLINE_BUCKETS(lb))
+            vec_validate_aligned(lb->lb_buckets,
+                                 lb->lb_n_buckets - 1,
+                                 CLIB_CACHE_LINE_BYTES);
+
+        load_balance_fill_buckets(lb, nhs,
+                                  load_balance_get_buckets(lb),
+                                  n_buckets);
+        lb->lb_map = lbmi;
+    }
+    else
+    {
+        /*
+         * This is a modification of an existing load-balance.
+         * We need to ensure that packets inflight see a consistent state, that
+         * is the number of reported buckets the LB has (read from
+         * lb_n_buckets_minus_1) is not more than it actually has. So if the
+         * number of buckets is increasing, we must update the bucket array first,
+         * then the reported number. vice-versa if the number of buckets goes down.
+         */
+        if (n_buckets == lb->lb_n_buckets)
+        {
+            /*
+             * no change in the number of buckets. we can simply fill what
+             * is new over what is old.
+             */
+            load_balance_fill_buckets(lb, nhs,
+                                      load_balance_get_buckets(lb),
+                                      n_buckets);
+            lb->lb_map = lbmi;
+        }
+        else if (n_buckets > lb->lb_n_buckets)
+        {
+            /*
+             * we have more buckets. the old load-balance map (if there is one)
+             * will remain valid, i.e. mapping to indices within range, so we
+             * update it last.
+             */
+            if (n_buckets > LB_NUM_INLINE_BUCKETS &&
+                lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS)
+            {
+                /*
+                 * the new increased number of buckets is crossing the threshold
+                 * from the inline storage to out-line. Alloc the outline buckets
+                 * first, then fixup the number. then reset the inlines.
+                 */
+                ASSERT(NULL == lb->lb_buckets);
+                vec_validate_aligned(lb->lb_buckets,
+                                     n_buckets - 1,
+                                     CLIB_CACHE_LINE_BYTES);
+
+                load_balance_fill_buckets(lb, nhs,
+                                          lb->lb_buckets,
+                                          n_buckets);
+                CLIB_MEMORY_BARRIER();
+                load_balance_set_n_buckets(lb, n_buckets);
+
+                CLIB_MEMORY_BARRIER();
+
+                for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++)
+                {
+                    dpo_reset(&lb->lb_buckets_inline[ii]);
+                }
+            }
+            else if (n_buckets <= LB_NUM_INLINE_BUCKETS)
+            {
+                /*
+                 * we are not crossing the threshold and both the old and the
+                 * new buckets are inline. The inline array always has room,
+                 * so we can write the new on the old.
+                 */
+                load_balance_fill_buckets(lb, nhs,
+                                          load_balance_get_buckets(lb),
+                                          n_buckets);
+                CLIB_MEMORY_BARRIER();
+                load_balance_set_n_buckets(lb, n_buckets);
+            }
+            else
+            {
+                /*
+                 * we are not crossing the threshold, both old and new are
+                 * out-line. The existing vector was sized for the old, smaller
+                 * number of buckets, so filling it in place would write beyond
+                 * its end. Build a new vector, publish it, then the new
+                 * number, and only then release the old buckets (as is done
+                 * for the out-line vector when shrinking to inline).
+                 */
+                dpo_id_t *new_buckets, *old_buckets;
+
+                new_buckets = NULL;
+                old_buckets = lb->lb_buckets;
+
+                vec_validate_aligned(new_buckets,
+                                     n_buckets - 1,
+                                     CLIB_CACHE_LINE_BYTES);
+
+                load_balance_fill_buckets(lb, nhs,
+                                          new_buckets,
+                                          n_buckets);
+                CLIB_MEMORY_BARRIER();
+                lb->lb_buckets = new_buckets;
+                CLIB_MEMORY_BARRIER();
+                load_balance_set_n_buckets(lb, n_buckets);
+
+                vec_foreach(tmp_dpo, old_buckets)
+                {
+                    dpo_reset(tmp_dpo);
+                }
+                vec_free(old_buckets);
+            }
+
+            /*
+             * buckets fixed. ready for the MAP update.
+             */
+            lb->lb_map = lbmi;
+        }
+        else
+        {
+            /*
+             * bucket size shrinkage.
+             * Any map we have will be based on the old
+             * larger number of buckets, so will be translating to indices
+             * out of range. So the new MAP must be installed first.
+             */
+            lb->lb_map = lbmi;
+            CLIB_MEMORY_BARRIER();
+
+
+            if (n_buckets <= LB_NUM_INLINE_BUCKETS &&
+                lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS)
+            {
+                /*
+                 * the new decreased number of buckets is crossing the threshold
+                 * from out-line storage to inline:
+                 *   1 - Fill the inline buckets,
+                 *   2 - fixup the number (and this point the inline buckets are
+                 *       used).
+                 *   3 - free the outline buckets
+                 */
+                load_balance_fill_buckets(lb, nhs,
+                                          lb->lb_buckets_inline,
+                                          n_buckets);
+                CLIB_MEMORY_BARRIER();
+                load_balance_set_n_buckets(lb, n_buckets);
+                CLIB_MEMORY_BARRIER();
+
+                vec_foreach(tmp_dpo, lb->lb_buckets)
+                {
+                    dpo_reset(tmp_dpo);
+                }
+                vec_free(lb->lb_buckets);
+            }
+            else
+            {
+                /*
+                 * not crossing the threshold.
+                 *  1 - update the number to the smaller size
+                 *  2 - write the new buckets
+                 *  3 - reset those no longer used.
+                 */
+                dpo_id_t *buckets;
+                u32 old_n_buckets;
+
+                old_n_buckets = lb->lb_n_buckets;
+                buckets = load_balance_get_buckets(lb);
+
+                load_balance_set_n_buckets(lb, n_buckets);
+                CLIB_MEMORY_BARRIER();
+
+                load_balance_fill_buckets(lb, nhs,
+                                          buckets,
+                                          n_buckets);
+
+                /*
+                 * the buckets no longer used are those from the new count up
+                 * to the old count. Starting anywhere later leaves stale
+                 * buckets holding locks on their children.
+                 */
+                for (ii = n_buckets; ii < old_n_buckets; ii++)
+                {
+                    dpo_reset(&buckets[ii]);
+                }
+            }
+        }
+    }
+
+    /*
+     * the normalised paths were a working copy; release the DPOs they hold
+     * and then the vector itself.
+     */
+    vec_foreach (nh, nhs)
+    {
+        dpo_reset(&nh->path_dpo);
+    }
+    vec_free(nhs);
+
+    load_balance_map_unlock(old_lbmi);
+}
+
+/**
+ * @brief DPO virtual function: take one more lock on the load-balance
+ *        the DPO refers to.
+ */
+static void
+load_balance_lock (dpo_id_t *dpo)
+{
+    load_balance_t *lb = load_balance_get(dpo->dpoi_index);
+
+    lb->lb_locks += 1;
+}
+
+/**
+ * @brief Release everything a load-balance holds and return it to the pool.
+ *
+ * Resets every bucket, frees the out-of-line bucket vector when one is in
+ * use, and releases the lock held on the load-balance map.
+ */
+static void
+load_balance_destroy (load_balance_t *lb)
+{
+    dpo_id_t *buckets;
+    u32 i;
+
+    buckets = load_balance_get_buckets(lb);
+
+    for (i = 0; i < lb->lb_n_buckets; i++)
+    {
+        dpo_reset(&buckets[i]);
+    }
+
+    LB_DBG(lb, "destroy");
+    if (!LB_HAS_INLINE_BUCKETS(lb))
+    {
+        vec_free(lb->lb_buckets);
+    }
+
+    /*
+     * load_balance_multipath_update() holds a lock on the map it stores in
+     * lb_map; without this unlock the map would outlive its last user.
+     * The unlock is passed INDEX_INVALID there too, so a load-balance
+     * without a map needs no special case.
+     */
+    load_balance_map_unlock(lb->lb_map);
+
+    pool_put(load_balance_pool, lb);
+}
+
+/**
+ * @brief DPO virtual function: drop one lock on the load-balance the DPO
+ *        refers to, destroying the object when the last lock is released.
+ */
+static void
+load_balance_unlock (dpo_id_t *dpo)
+{
+    load_balance_t *lb = load_balance_get(dpo->dpoi_index);
+
+    lb->lb_locks -= 1;
+
+    if (lb->lb_locks != 0)
+    {
+        return;
+    }
+
+    load_balance_destroy(lb);
+}
+
+/**
+ * @brief The virtual function table registered for the load-balance DPO type.
+ */
+static const dpo_vft_t lb_vft = {
+    .dv_format = format_load_balance_dpo,
+    .dv_lock   = load_balance_lock,
+    .dv_unlock = load_balance_unlock,
+};
+
+/**
+ * @brief The per-protocol VLIB graph nodes that are assigned to a load-balance
+ *        object.
+ *
+ * These are the graph nodes from which a load-balance is the parent object
+ * in the DPO-graph.
+ *
+ * Not every load-balance node is listed (the *-lookup nodes, for example,
+ * are not). Instead we rely on those nodes being set up as siblings
+ * through the correct use of the .sibling_of field.
+ */
+static const char * const load_balance_ip4_nodes[] = {
+    "ip4-load-balance",
+    NULL,
+};
+static const char * const load_balance_ip6_nodes[] = {
+    "ip6-load-balance",
+    NULL,
+};
+static const char * const load_balance_mpls_nodes[] = {
+    "mpls-load-balance",
+    NULL,
+};
+static const char * const * const load_balance_nodes[DPO_PROTO_NUM] = {
+    [DPO_PROTO_IP4]  = load_balance_ip4_nodes,
+    [DPO_PROTO_IP6]  = load_balance_ip6_nodes,
+    [DPO_PROTO_MPLS] = load_balance_mpls_nodes,
+};
+
+/**
+ * @brief Register the load-balance DPO type, with its virtual function table
+ *        and graph nodes, then initialise the load-balance map module.
+ */
+void
+load_balance_module_init (void)
+{
+    dpo_register(DPO_LOAD_BALANCE,
+                 &lb_vft,
+                 load_balance_nodes);
+
+    load_balance_map_module_init();
+}
+
+/**
+ * @brief CLI handler for "show load-balance [<index>]".
+ *
+ * With an index, shows that load-balance in detail, or reports that there
+ * is no such object. Without one, shows every load-balance in the pool.
+ *
+ * @return always 0 (no error).
+ */
+static clib_error_t *
+load_balance_show (vlib_main_t * vm,
+                   unformat_input_t * input,
+                   vlib_cli_command_t * cmd)
+{
+    index_t lbi = INDEX_INVALID;
+
+    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+        if (unformat (input, "%d", &lbi))
+            ;
+        else
+            break;
+    }
+
+    if (INDEX_INVALID != lbi)
+    {
+        /*
+         * the index comes straight from the user; do not hand one that is
+         * out of range, or that has been freed, to the pool accessor.
+         */
+        if (pool_is_free_index(load_balance_pool, lbi))
+        {
+            vlib_cli_output (vm, "no such load-balance:%d", lbi);
+        }
+        else
+        {
+            vlib_cli_output (vm, "%U", format_load_balance, lbi,
+                             LOAD_BALANCE_FORMAT_DETAIL);
+        }
+    }
+    else
+    {
+        load_balance_t *lb;
+
+        pool_foreach(lb, load_balance_pool,
+        ({
+            vlib_cli_output (vm, "%U", format_load_balance,
+                             load_balance_get_index(lb),
+                             LOAD_BALANCE_FORMAT_NONE);
+        }));
+    }
+
+    return 0;
+}
+
+/**
+ * CLI registration: "show load-balance", optionally followed by the index
+ * of one load-balance, is handled by load_balance_show().
+ */
+VLIB_CLI_COMMAND (load_balance_show_command, static) = {
+    .path = "show load-balance",
+    .short_help = "show load-balance [<index>]",
+    .function = load_balance_show,
+};
diff --git a/vnet/vnet/dpo/load_balance.h b/vnet/vnet/dpo/load_balance.h
new file mode 100644 (file)
index 0000000..d630a2c
--- /dev/null
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * \brief
+ * The load-balance object represents an ECMP choice. The buckets of a load
+ * balance object point to the sub-graph after the choice is made.
+ * The load-balance object is also the object type returned from a FIB table
+ * lookup. As such it needs to represent the case where there is only one
+ * choice. It may
+ * seem like overkill to use a load-balance object in this case, but the reason
+ * is for performance. If the load-balance object were not the result of the FIB
+ * lookup, then some other object would be. In the case where there was ECMP,
+ * this other object would need a load-balance as a parent and would hence
+ * just add an unnecessary indirection.
+ *
+ * It is also the object in the DP that represents a via-fib-entry in a recursive
+ * route.
+ *
+ */
+
+#ifndef __LOAD_BALANCE_H__
+#define __LOAD_BALANCE_H__
+
+#include <vlib/vlib.h>
+#include <vnet/ip/lookup.h>
+#include <vnet/dpo/dpo.h>
+#include <vnet/fib/fib_types.h>
+
+/**
+ * Load-balance main: the state shared by all load-balance objects.
+ *
+ * Both sets of combined (packet and byte) counters are indexed by the
+ * load-balance's pool index.
+ */
+typedef struct load_balance_main_t_
+{
+    /* shown as "to:" when a load-balance is formatted */
+    vlib_combined_counter_main_t lbm_to_counters;
+    /* shown as "via:" when a load-balance is formatted */
+    vlib_combined_counter_main_t lbm_via_counters;
+} load_balance_main_t;
+
+extern load_balance_main_t load_balance_main;
+
+/**
+ * The number of buckets that a load-balance object can have and still
+ * fit in one cache-line
+ */
+#define LB_NUM_INLINE_BUCKETS 4
+
+/**
+ * @brief One path from an [EU]CMP set that the client wants to add to a
+ * load-balance object
+ */
+typedef struct load_balance_path_t_ {
+    /**
+     * ID of the Data-path object.
+     */
+    dpo_id_t path_dpo;
+
+    /**
+     * The index of the FIB path
+     */
+    fib_node_index_t path_index;
+
+    /**
+     * weight for the path. In the normalised copy of a path set this is
+     * the number of buckets the path occupies.
+     */
+    u32 path_weight;
+} load_balance_path_t;
+
+/**
+ * The FIB DPO provides;
+ *  - load-balancing over the next DPOs in the chain/graph
+ *  - per-route counters
+ */
+typedef struct load_balance_t_ {
+    /**
+     * number of buckets in the load-balance. always a power of 2.
+     */
+    u16 lb_n_buckets;
+    /**
+     * number of buckets in the load-balance - 1. used in the switch path
+     * as part of the hash calculation.
+     */
+    u16 lb_n_buckets_minus_1;
+
+    /**
+     * The protocol of packets that traverse this LB.
+     * needed in combination with the flow hash config to determine how to hash.
+     * u8.
+     */
+    dpo_proto_t lb_proto;
+
+    /**
+     * The number of locks, which is approximately the number of users,
+     * of this load-balance.
+     * Load-balance objects of via-entries are heavily shared by recursives,
+     * so the lock count is a u32.
+     */
+    u32 lb_locks;
+
+    /**
+     * index of the load-balance map, INVALID if this LB does not use one
+     */
+    index_t lb_map;
+
+    /**
+     * the hash config to use when selecting a bucket. this is a u16
+     */
+    flow_hash_config_t lb_hash_config;
+
+    /**
+     * Vector of buckets containing the next DPOs. Only used when there are
+     * more than LB_NUM_INLINE_BUCKETS buckets, in which case it holds at
+     * least lb_n_buckets entries.
+     */
+    dpo_id_t *lb_buckets;
+
+    /**
+     * The rest of the cache line is used for buckets. In the common case
+     * where there are no more than LB_NUM_INLINE_BUCKETS buckets, the
+     * buckets are on the same cacheline and we save ourselves a pointer
+     * dereference in the data-path.
+     */
+    dpo_id_t lb_buckets_inline[LB_NUM_INLINE_BUCKETS];
+} load_balance_t;
+
+_Static_assert(sizeof(load_balance_t) <= CLIB_CACHE_LINE_BYTES,
+              "A load_balance object size exceeds one cachline");
+
+/**
+ * Flags controlling load-balance formatting/display.
+ * NONE is passed when listing every load-balance, DETAIL when showing one.
+ */
+typedef enum load_balance_format_flags_t_ {
+    LOAD_BALANCE_FORMAT_NONE,
+    LOAD_BALANCE_FORMAT_DETAIL = (1 << 0),
+} load_balance_format_flags_t;
+
+/**
+ * Flags controlling load-balance creation and modification
+ */
+typedef enum load_balance_flags_t_ {
+    LOAD_BALANCE_FLAG_NONE = 0,
+    /* a load-balance map is acquired for the paths on a multipath update */
+    LOAD_BALANCE_FLAG_USES_MAP = (1 << 0),
+} load_balance_flags_t;
+
+extern index_t load_balance_create(u32 num_buckets,
+                                  dpo_proto_t lb_proto,
+                                  flow_hash_config_t fhc);
+extern void load_balance_multipath_update(
+    const dpo_id_t *dpo,
+    load_balance_path_t * raw_next_hops,
+    load_balance_flags_t flags);
+
+extern void load_balance_set_bucket(index_t lbi,
+                                   u32 bucket,
+                                   const dpo_id_t *next);
+
+extern u8* format_load_balance(u8 * s, va_list * args);
+
+extern const dpo_id_t *load_balance_get_bucket(index_t lbi,
+                                              u32 bucket);
+extern int load_balance_is_drop(const dpo_id_t *dpo);
+
+extern f64 load_balance_get_multipath_tolerance(void);
+
+/**
+ * The encapsulation breakages are for fast DP access
+ */
+extern load_balance_t *load_balance_pool;
+/**
+ * @brief Fetch the load-balance object at the given pool index.
+ */
+static inline load_balance_t*
+load_balance_get (index_t lbi)
+{
+    load_balance_t *lb;
+
+    lb = pool_elt_at_index(load_balance_pool, lbi);
+
+    return (lb);
+}
+
+/**
+ * True when the load-balance's bucket count is small enough (no more than
+ * LB_NUM_INLINE_BUCKETS) for its buckets to be held in lb_buckets_inline.
+ */
+#define LB_HAS_INLINE_BUCKETS(_lb)             \
+    ((_lb)->lb_n_buckets <= LB_NUM_INLINE_BUCKETS)
+
+/**
+ * @brief Return the DPO in the given bucket, read from the inline array or
+ *        from the out-of-line vector according to the bucket count.
+ *
+ * The bucket number is asserted to be below the bucket count.
+ */
+static inline const dpo_id_t *
+load_balance_get_bucket_i (const load_balance_t *lb,
+                          u32 bucket)
+{
+    const dpo_id_t *bucket_array;
+
+    ASSERT(bucket < lb->lb_n_buckets);
+
+    /* inline buckets are expected to be the common case */
+    if (PREDICT_TRUE(LB_HAS_INLINE_BUCKETS(lb)))
+    {
+       bucket_array = lb->lb_buckets_inline;
+    }
+    else
+    {
+       bucket_array = lb->lb_buckets;
+    }
+
+    return (&bucket_array[bucket]);
+}
+
+extern void load_balance_module_init(void);
+
+#endif
diff --git a/vnet/vnet/dpo/load_balance_map.c b/vnet/vnet/dpo/load_balance_map.c
new file mode 100644 (file)
index 0000000..f08801f
--- /dev/null
@@ -0,0 +1,566 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ */
+#include <vnet/fib/fib_path.h>
+#include <vnet/fib/fib_node_list.h>
+#include <vnet/dpo/load_balance_map.h>
+#include <vnet/dpo/load_balance.h>
+
+/**
+ * A hash-table of load-balance maps by path index.
+ * this provides the fast lookup of the LB map when a path goes down
+ */
+static uword *lb_maps_by_path_index;
+
+/**
+ * A hash-table of load-balance maps by set of paths.
+ * This provides the LB map sharing.
+ * LB maps do not necessarily use all the paths in the list, since
+ * the entry that is requesting the map, may not have an out-going
+ * label for each of the paths.
+ */
+static uword *load_balance_map_db;
+
+typedef enum load_balance_map_path_flags_t_
+{
+    LOAD_BALANCE_MAP_PATH_UP     = (1 << 0),
+    LOAD_BALANCE_MAP_PATH_USABLE = (1 << 1),
+} __attribute__ ((packed)) load_balance_map_path_flags_t;
+
+typedef struct load_balance_map_path_t_ {
+    /**
+     * Index of the path
+     */
+    fib_node_index_t lbmp_index;
+
+    /**
+     * Sibling Index in the list of all maps with this path index
+     */
+    fib_node_index_t lbmp_sibling;
+
+    /**
+     * the normalised weight of the path
+     */
+    u32 lbmp_weight;
+
+    /**
+     * The state of the path
+     */
+    load_balance_map_path_flags_t lbmp_flags;
+} load_balance_map_path_t;
+
+/**
+ * The global pool of LB maps
+ */
+load_balance_map_t *load_balance_map_pool;
+
+/*
+ * Debug macro
+ */
+#ifdef FIB_DEBUG
+#define LOAD_BALANCE_MAP_DBG(_pl, _fmt, _args...)       \
+    {                                                   \
+        clib_warning("lbm: FIXME" _fmt,                 \
+                     ##_args);                          \
+    }
+#else
+#define LOAD_BALANCE_MAP_DBG(_pl, _fmt, _args...)
+#endif
+
+static index_t
+load_balance_map_get_index (load_balance_map_t *lbm)
+{
+    return (lbm - load_balance_map_pool);
+}
+
+/**
+ * @brief Format a load-balance map: its index, its bucket count and the
+ * bucket-index to bucket-index translation it holds.
+ *
+ * Consumes two arguments: the map's pool index (index_t) and the indent
+ * (u32) applied to the two continuation lines.
+ *
+ * NOTE(review): this takes 'va_list ap' by value, whereas the other "%U"
+ * formatters in this change (e.g. format_lookup_dpo) take 'va_list *'.
+ * Confirm the by-value form is safe on every supported target; if it is
+ * not, change it together with the prototype in load_balance_map.h.
+ */
+u8*
+format_load_balance_map (u8 *s, va_list ap)
+{
+    index_t lbmi = va_arg(ap, index_t);
+    u32 indent = va_arg(ap, u32);
+    load_balance_map_t *lbm;
+    u32 n_buckets, ii;
+
+    lbm = load_balance_map_get(lbmi);
+    n_buckets = vec_len(lbm->lbm_buckets);
+
+    s = format(s, "load-balance-map: index:%d buckets:%d", lbmi, n_buckets);
+    /* first row: the bucket positions */
+    s = format(s, "\n%U index:", format_white_space, indent+2);
+    for (ii = 0; ii < n_buckets; ii++)
+    {
+        s = format(s, "%5d", ii);
+    }
+    /* second row: the bucket each position is mapped to */
+    s = format(s, "\n%U   map:", format_white_space, indent+2);
+    for (ii = 0; ii < n_buckets; ii++)
+    {
+        s = format(s, "%5d", lbm->lbm_buckets[ii]);
+    }
+
+    return (s);
+}
+
+
+static uword
+load_balance_map_hash (load_balance_map_t *lbm)
+{
+    u32 old_lbm_hash, new_lbm_hash, hash;
+    load_balance_map_path_t *lb_path;
+
+    new_lbm_hash = old_lbm_hash = vec_len(lbm->lbm_paths);
+
+    vec_foreach (lb_path, lbm->lbm_paths)
+    {
+        hash = lb_path->lbmp_index;
+        hash_mix32(hash, old_lbm_hash, new_lbm_hash);
+    }
+
+    return (new_lbm_hash);
+}
+
+/**
+ * @brief Encode a pool index as a DB hash key. The result, 1 + 2*index,
+ * is always odd.
+ */
+always_inline uword
+load_balance_map_db_hash_key_from_index (uword index)
+{
+    return 1 + 2*index;
+}
+
+/**
+ * @brief Test the low bit of a DB hash key. Returns non-zero when the
+ * key is odd.
+ */
+always_inline uword
+load_balance_map_db_hash_key_is_index (uword key)
+{
+    return key & 1;
+}
+
+/**
+ * @brief Decode an odd DB hash key into an index by halving it; the
+ * integer division discards the low bit. Asserts that the key is odd.
+ */
+always_inline uword
+load_balance_map_db_hash_key_2_index (uword key)
+{
+    ASSERT (load_balance_map_db_hash_key_is_index (key));
+    return key / 2;
+}
+
+/**
+ * @brief Resolve a DB hash key to the load-balance map it refers to.
+ *
+ * An odd key is decoded to a pool index and the map is fetched from the
+ * pool. Any other key is interpreted as a pointer to a load_balance_map_t.
+ */
+static load_balance_map_t*
+load_balance_map_db_get_from_hash_key (uword key)
+{
+    load_balance_map_t *lbm;
+
+    if (load_balance_map_db_hash_key_is_index (key))
+    {
+        index_t lbm_index;
+
+        lbm_index = load_balance_map_db_hash_key_2_index(key);
+        lbm = load_balance_map_get(lbm_index);
+    }
+    else
+    {
+        /* not an encoded index; the key is the map's address */
+        lbm = uword_to_pointer (key, load_balance_map_t *);
+    }
+
+    return (lbm);
+}
+
+static uword
+load_balance_map_db_hash_key_sum (hash_t * h,
+                                  uword key)
+{
+    load_balance_map_t *lbm;
+
+    lbm = load_balance_map_db_get_from_hash_key(key);
+
+    return (load_balance_map_hash(lbm));
+}
+
+/**
+ * @brief Key-equality callback for the map DB.
+ *
+ * Two keys are equal when the maps they resolve to hold the same path
+ * indices in the same order. This is the same data that
+ * load_balance_map_hash() mixes, so equal keys always hash alike.
+ * The path vectors are compared directly, rather than comparing the two
+ * hashes, so that distinct path sets whose 32-bit hashes collide are not
+ * mistaken for the same map.
+ *
+ * @param h    the hash table (unused)
+ * @param key1 first key, an encoded index or a map pointer
+ * @param key2 second key, an encoded index or a map pointer
+ * @return non-zero if the keys refer to maps over the same paths
+ */
+static uword
+load_balance_map_db_hash_key_equal (hash_t * h,
+                                    uword key1,
+                                    uword key2)
+{
+    load_balance_map_t *lbm1, *lbm2;
+    u32 ii;
+
+    lbm1 = load_balance_map_db_get_from_hash_key(key1);
+    lbm2 = load_balance_map_db_get_from_hash_key(key2);
+
+    if (vec_len(lbm1->lbm_paths) != vec_len(lbm2->lbm_paths))
+    {
+        return (0);
+    }
+
+    vec_foreach_index(ii, lbm1->lbm_paths)
+    {
+        if (lbm1->lbm_paths[ii].lbmp_index !=
+            lbm2->lbm_paths[ii].lbmp_index)
+        {
+            return (0);
+        }
+    }
+
+    return (1);
+}
+
+static index_t
+load_balance_map_db_find (load_balance_map_t *lbm)
+{
+    uword *p;
+
+    p = hash_get(load_balance_map_db, lbm);
+
+    if (NULL != p)
+    {
+        return p[0];
+    }
+
+    return (FIB_NODE_INDEX_INVALID);
+}
+
+static void
+load_balance_map_db_insert (load_balance_map_t *lbm)
+{
+    load_balance_map_path_t *lbmp;
+    fib_node_list_t list;
+    uword *p;
+
+    ASSERT(FIB_NODE_INDEX_INVALID == load_balance_map_db_find(lbm));
+
+    /*
+     * insert into the DB based on the set of paths.
+     */
+    hash_set (load_balance_map_db,
+              load_balance_map_db_hash_key_from_index(
+                  load_balance_map_get_index(lbm)),
+              load_balance_map_get_index(lbm));
+
+    /*
+     * insert into each per-path list.
+     */
+    vec_foreach(lbmp, lbm->lbm_paths)
+    {
+        p = hash_get(lb_maps_by_path_index, lbmp->lbmp_index);
+
+        if (NULL == p)
+        {
+            list = fib_node_list_create();
+            hash_set(lb_maps_by_path_index, lbmp->lbmp_index, list);
+        }
+        else
+        {
+            list = p[0];
+        }
+
+        lbmp->lbmp_sibling =
+            fib_node_list_push_front(list,
+                                     0, FIB_NODE_TYPE_FIRST,
+                                     load_balance_map_get_index(lbm));
+    }
+
+    LOAD_BALANCE_MAP_DBG(lbm, "DB-inserted");
+}
+
+static void
+load_balance_map_db_remove (load_balance_map_t *lbm)
+{
+    load_balance_map_path_t *lbmp;
+    uword *p;
+
+    ASSERT(FIB_NODE_INDEX_INVALID != load_balance_map_db_find(lbm));
+
+    hash_unset(load_balance_map_db,
+               load_balance_map_db_hash_key_from_index(
+                   load_balance_map_get_index(lbm)));
+
+    /*
+     * remove from each per-path list.
+     */
+    vec_foreach(lbmp, lbm->lbm_paths)
+    {
+        p = hash_get(lb_maps_by_path_index, lbmp->lbmp_index);
+
+        ASSERT(NULL != p);
+
+        fib_node_list_remove(p[0], lbmp->lbmp_sibling);
+    }
+
+    LOAD_BALANCE_MAP_DBG(lbm, "DB-removed");
+}
+
+/**
+ * @brief from the paths that are usable, fill the Map.
+ */
+static void
+load_balance_map_fill (load_balance_map_t *lbm)
+{
+    load_balance_map_path_t *lbmp;
+    u32 n_buckets, bucket, ii, jj;
+    u16 *tmp_buckets;
+
+    tmp_buckets = NULL;
+    n_buckets = vec_len(lbm->lbm_buckets);
+
+    /*
+     * run through the set of paths once, and build a vector of the
+     * indices that are usable. we do this in a scratch space, since we
+     * need to refer to it multiple times as we build the real buckets.
+     */
+    vec_validate(tmp_buckets, n_buckets-1);
+
+    bucket = jj = 0;
+    vec_foreach (lbmp, lbm->lbm_paths)
+    {
+        if (fib_path_is_resolved(lbmp->lbmp_index))
+        {
+            for (ii = 0; ii < lbmp->lbmp_weight; ii++)
+            {
+                tmp_buckets[jj++] = bucket++;
+            }
+        }
+        else 
+        {
+            bucket += lbmp->lbmp_weight;
+        }
+    }
+    _vec_len(tmp_buckets) = jj;
+
+    /*
+     * If the number of temporaries written is as many as we need, implying
+     * all paths were up, then we can simply copy the scratch area over the
+     * actual buckets' memory
+     */
+    if (jj == n_buckets)
+    {
+        memcpy(lbm->lbm_buckets,
+               tmp_buckets,
+               sizeof(lbm->lbm_buckets[0]) * n_buckets);
+    }
+    else
+    {
+        /*
+         * one or more paths are down.
+         */
+        if (0 == vec_len(tmp_buckets))
+        {
+            /*
+             * if the scratch area is empty, then no paths are usable.
+             * they will all drop. so use them all, lest we account drops
+             * against only one.
+             */
+            for (bucket = 0; bucket < n_buckets; bucket++)
+            {
+                lbm->lbm_buckets[bucket] = bucket;
+            }
+        }
+        else
+        {
+            bucket = jj = 0;
+            vec_foreach (lbmp, lbm->lbm_paths)
+            {
+                if (fib_path_is_resolved(lbmp->lbmp_index))
+                {
+                    for (ii = 0; ii < lbmp->lbmp_weight; ii++)
+                    {
+                        lbm->lbm_buckets[bucket] = bucket;
+                        bucket++;
+                    }
+                }
+                else
+                {
+                    /*
+                     * path is unusable
+                     * cycle through the scratch space selecting a index.
+                     * this means we load balance, in the intended ratio,
+                     * over the paths that are still usable.
+                     */
+                    for (ii = 0; ii < lbmp->lbmp_weight; ii++)
+                    {
+                        lbm->lbm_buckets[bucket] = tmp_buckets[jj];
+                        jj = (jj + 1) % vec_len(tmp_buckets);
+                        bucket++;
+                    }
+                }
+            }
+       }
+    }
+
+    vec_free(tmp_buckets);
+}
+
+static load_balance_map_t*
+load_balance_map_alloc (const load_balance_path_t *paths)
+{
+    load_balance_map_t *lbm;
+    u32 ii;
+
+    pool_get_aligned(load_balance_map_pool, lbm, CLIB_CACHE_LINE_BYTES);
+    memset(lbm, 0, sizeof(*lbm));
+
+    vec_validate(lbm->lbm_paths, vec_len(paths)-1);
+
+    vec_foreach_index(ii, paths)
+    {
+        lbm->lbm_paths[ii].lbmp_index  = paths[ii].path_index;
+        lbm->lbm_paths[ii].lbmp_weight = paths[ii].path_weight;
+    }
+
+    return (lbm);
+}
+
+static load_balance_map_t *
+load_balance_map_init (load_balance_map_t *lbm,
+                       u32 n_buckets,
+                       u32 sum_of_weights)
+{
+    lbm->lbm_sum_of_norm_weights = sum_of_weights;
+    vec_validate(lbm->lbm_buckets, n_buckets-1);
+
+    load_balance_map_db_insert(lbm);
+
+    load_balance_map_fill(lbm);
+
+    return (lbm);
+}
+
+/**
+ * @brief Find the map that represents this set of paths, creating it if
+ * there is none, and take a lock on it.
+ *
+ * @param n_buckets      number of buckets given to a newly created map
+ * @param sum_of_weights value cached in a newly created map
+ * @param paths          vector of paths; their indices and weights are
+ *                       copied
+ * @return the pool index of the map, on which a lock has been taken
+ */
+index_t
+load_balance_map_add_or_lock (u32 n_buckets,
+                              u32 sum_of_weights,
+                              const load_balance_path_t *paths)
+{
+    load_balance_map_t *tmp, *lbm;
+    index_t lbmi;
+
+    /*
+     * build a candidate map from the paths. it serves as the search key
+     * for the DB and, if nothing matches, becomes the new map.
+     */
+    tmp = load_balance_map_alloc(paths);
+
+    lbmi = load_balance_map_db_find(tmp);
+
+    if (INDEX_INVALID == lbmi)
+    {
+        lbm = load_balance_map_init(tmp, n_buckets, sum_of_weights);
+    }
+    else
+    {
+        lbm = load_balance_map_get(lbmi);
+
+        /*
+         * an existing map is shared, so the candidate was only needed
+         * as a search key. release its path vector and return it to the
+         * pool, otherwise both are leaked on every shared lookup.
+         */
+        vec_free(tmp->lbm_paths);
+        pool_put(load_balance_map_pool, tmp);
+    }
+
+    lbm->lbm_locks++;
+
+    return (load_balance_map_get_index(lbm));
+}
+
+void
+load_balance_map_lock (index_t lbmi)
+{
+    load_balance_map_t *lbm;
+
+    lbm = load_balance_map_get(lbmi);
+
+    lbm->lbm_locks++;
+}
+
+void
+load_balance_map_unlock (index_t lbmi)
+{
+    load_balance_map_t *lbm;
+
+    if (INDEX_INVALID == lbmi)
+    {
+        return;
+    }
+
+    lbm = load_balance_map_get(lbmi);
+
+    lbm->lbm_locks--;
+
+    if (0 == lbm->lbm_locks)
+    {
+        load_balance_map_db_remove(lbm);
+        vec_free(lbm->lbm_paths);
+        vec_free(lbm->lbm_buckets);
+        pool_put(load_balance_map_pool, lbm);
+    }
+}
+
+static int
+load_balance_map_path_state_change_walk (fib_node_ptr_t *fptr,
+                                         void *ctx)
+{
+    load_balance_map_t *lbm;
+
+    lbm = load_balance_map_get(fptr->fnp_index);
+
+    load_balance_map_fill(lbm);
+
+    return (!0);
+}
+
+/**
+ * @brief the state of a path has changed (it has no doubt gone down).
+ * This is the trigger to perform a PIC edge cutover and update the maps
+ * to exclude this path.
+ */
+void
+load_balance_map_path_state_change (fib_node_index_t path_index)
+{
+    uword *p;
+
+    /*
+     * re-stripe the buckets for each affected MAP
+     */
+    p = hash_get(lb_maps_by_path_index, path_index);
+
+    /* no map uses this path; nothing to update */
+    if (NULL == p)
+        return;
+
+    fib_node_list_walk(p[0], load_balance_map_path_state_change_walk, NULL);
+}
+
+/**
+ * @brief Create the two hash tables this module uses: the map DB, which
+ * is given the module's own key-sum and key-equal callbacks, and the
+ * table of per-path map lists keyed by path index.
+ */
+void
+load_balance_map_module_init (void)
+{
+    load_balance_map_db =
+        hash_create2 (/* elts */ 0,
+                      /* user */ 0,
+                      /* value_bytes */ sizeof (index_t),
+                      load_balance_map_db_hash_key_sum,
+                      load_balance_map_db_hash_key_equal,
+                      /* format pair/arg */
+                      0, 0);
+
+    lb_maps_by_path_index = hash_create(0, sizeof(fib_node_list_t));
+}
+
+static clib_error_t *
+load_balance_map_show (vlib_main_t * vm,
+                       unformat_input_t * input,
+                       vlib_cli_command_t * cmd)
+{
+    index_t lbmi = INDEX_INVALID;
+
+    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+        if (unformat (input, "%d", &lbmi))
+            ;
+        else
+            break;
+    }
+
+    if (INDEX_INVALID != lbmi)
+    {
+        vlib_cli_output (vm, "%U", format_load_balance_map, lbmi, 0);
+    }
+    else
+    {
+        load_balance_map_t *lbm;
+
+        pool_foreach(lbm, load_balance_map_pool,
+        ({
+            vlib_cli_output (vm, "%U", format_load_balance_map,
+                             load_balance_map_get_index(lbm), 0);
+        }));
+    }
+
+    return 0;
+}
+
+VLIB_CLI_COMMAND (load_balance_map_show_command, static) = {
+    .path = "show load-balance-map",
+    .short_help = "show load-balance-map [<index>]",
+    .function = load_balance_map_show,
+};
diff --git a/vnet/vnet/dpo/load_balance_map.h b/vnet/vnet/dpo/load_balance_map.h
new file mode 100644 (file)
index 0000000..f080e97
--- /dev/null
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ */
+
+#ifndef __LOAD_BALANCE_MAP_H__
+#define __LOAD_BALANCE_MAP_H__
+
+#include <vlib/vlib.h>
+#include <vnet/fib/fib_types.h>
+#include <vnet/dpo/load_balance.h>
+
+struct load_balance_map_path_t_;
+
+/**
+ * A load-balance map: a bucket-index to bucket-index translation over a
+ * set of paths, shared between users by means of a lock count.
+ */
+typedef struct load_balance_map_t_ {
+    /**
+     * The buckets of the map that provide the index to index translation.
+     * In the first cacheline.
+     */
+    u16 *lbm_buckets;
+
+    /**
+     * the vector of paths this MAP represents
+     */
+    struct load_balance_map_path_t_ *lbm_paths;
+
+    /**
+     * the sum of the normalised weights. cache for convenience
+     */
+    u32 lbm_sum_of_norm_weights;
+
+    /**
+     * Number of locks. Maps are shared by a large number of recursive fib_entry_ts
+     */
+    u32 lbm_locks;
+} load_balance_map_t;
+
+extern index_t load_balance_map_add_or_lock(u32 n_buckets,
+                                            u32 sum_of_weights,
+                                            const load_balance_path_t *norm_paths);
+
+extern void load_balance_map_lock(index_t lmbi);
+extern void load_balance_map_unlock(index_t lbmi);
+
+extern void load_balance_map_path_state_change(fib_node_index_t path_index);
+
+extern u8* format_load_balance_map(u8 *s, va_list ap);
+
+/**
+ * The encapsulation breakages are for fast DP access
+ */
+extern load_balance_map_t *load_balance_map_pool;
+
+static inline load_balance_map_t*
+load_balance_map_get (index_t lbmi)
+{
+    return (pool_elt_at_index(load_balance_map_pool, lbmi));
+}
+
+
+extern void load_balance_map_module_init(void);
+
+#endif
diff --git a/vnet/vnet/dpo/lookup_dpo.c b/vnet/vnet/dpo/lookup_dpo.c
new file mode 100644 (file)
index 0000000..0bfc065
--- /dev/null
@@ -0,0 +1,802 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/dpo/lookup_dpo.h>
+#include <vnet/dpo/load_balance.h>
+#include <vnet/mpls/mpls.h>
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/fib/ip6_fib.h>
+#include <vnet/fib/mpls_fib.h>
+
+static const char *const lookup_input_names[] = LOOKUP_INPUTS;
+
+/**
+ * @brief Enumeration of the lookup subtypes
+ */
+typedef enum lookup_sub_type_t_
+{
+    LOOKUP_SUB_TYPE_SRC,
+    LOOKUP_SUB_TYPE_DST,
+    LOOKUP_SUB_TYPE_DST_TABLE_FROM_INTERFACE,
+} lookup_sub_type_t;
+#define LOOKUP_SUB_TYPE_NUM (LOOKUP_SUB_TYPE_DST_TABLE_FROM_INTERFACE+1)
+
+#define FOR_EACH_LOOKUP_SUB_TYPE(_st)                                   \
+    for (_st = LOOKUP_SUB_TYPE_IP4_SRC; _st < LOOKUP_SUB_TYPE_NUM; _st++)
+
+/**
+ * @brief pool of all lookup DPOs
+ */
+lookup_dpo_t *lookup_dpo_pool;
+
+/**
+ * @brief An array of registered DPO type values for the sub-types
+ */
+static dpo_type_t lookup_dpo_sub_types[LOOKUP_SUB_TYPE_NUM];
+
+static lookup_dpo_t *
+lookup_dpo_alloc (void)
+{
+    lookup_dpo_t *lkd;
+
+    pool_get_aligned(lookup_dpo_pool, lkd, CLIB_CACHE_LINE_BYTES);
+
+    return (lkd);
+}
+
+static index_t
+lookup_dpo_get_index (lookup_dpo_t *lkd)
+{
+    return (lkd - lookup_dpo_pool);
+}
+
+static void
+lookup_dpo_add_or_lock_i (fib_node_index_t fib_index,
+                          dpo_proto_t proto,
+                          lookup_input_t input,
+                          lookup_table_t table_config,
+                          dpo_id_t *dpo)
+{
+    lookup_dpo_t *lkd;
+    dpo_type_t type;
+
+    lkd = lookup_dpo_alloc();
+    lkd->lkd_fib_index = fib_index;
+    lkd->lkd_proto = proto;
+    lkd->lkd_input = input;
+    lkd->lkd_table = table_config;
+
+    /*
+     * use the input type to select the lookup sub-type
+     */
+    type = 0;
+
+    switch (input)
+    {
+    case LOOKUP_INPUT_SRC_ADDR:
+        type = lookup_dpo_sub_types[LOOKUP_SUB_TYPE_SRC];
+        break;
+    case LOOKUP_INPUT_DST_ADDR:
+        switch (table_config)
+        {
+        case LOOKUP_TABLE_FROM_INPUT_INTERFACE:
+            type = lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST_TABLE_FROM_INTERFACE];
+            break;
+        case LOOKUP_TABLE_FROM_CONFIG:
+            type = lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST];
+            break;
+        }
+    }
+
+    if (0 == type)
+    {
+        dpo_reset(dpo);
+    }
+    else
+    {
+        dpo_set(dpo, type, proto, lookup_dpo_get_index(lkd));
+    }
+}
+
+void
+lookup_dpo_add_or_lock_w_fib_index (fib_node_index_t fib_index,
+                                    dpo_proto_t proto,
+                                    lookup_input_t input,
+                                    lookup_table_t table_config,
+                                    dpo_id_t *dpo)
+{
+    if (LOOKUP_TABLE_FROM_CONFIG == table_config)
+    {
+       fib_table_lock(fib_index, dpo_proto_to_fib(proto));
+    }
+    lookup_dpo_add_or_lock_i(fib_index, proto, input, table_config, dpo);
+}
+
+void
+lookup_dpo_add_or_lock_w_table_id (u32 table_id,
+                                   dpo_proto_t proto,
+                                   lookup_input_t input,
+                                   lookup_table_t table_config,
+                                   dpo_id_t *dpo)
+{
+    fib_node_index_t fib_index = FIB_NODE_INDEX_INVALID;
+
+    if (LOOKUP_TABLE_FROM_CONFIG == table_config)
+    {
+       fib_index =
+           fib_table_find_or_create_and_lock(dpo_proto_to_fib(proto),
+                                             table_id);
+    }
+
+    ASSERT(FIB_NODE_INDEX_INVALID != fib_index);
+    lookup_dpo_add_or_lock_i(fib_index, proto, input, table_config, dpo);    
+}
+
+u8*
+format_lookup_dpo (u8 *s, va_list *args)
+{
+    index_t index = va_arg (*args, index_t);
+    lookup_dpo_t *lkd;
+
+    lkd = lookup_dpo_get(index);
+
+    if (LOOKUP_TABLE_FROM_INPUT_INTERFACE == lkd->lkd_table)
+    {
+        s = format(s, "%s lookup in interface's %U table",
+                   lookup_input_names[lkd->lkd_input],
+                   format_dpo_proto, lkd->lkd_proto);
+    }
+    else
+    {
+       s = format(s, "%s lookup in %U",
+                  lookup_input_names[lkd->lkd_input],
+                  format_fib_table_name, lkd->lkd_fib_index,
+                  dpo_proto_to_fib(lkd->lkd_proto));
+    }
+    return (s);
+}
+
+static void
+lookup_dpo_lock (dpo_id_t *dpo)
+{
+    lookup_dpo_t *lkd;
+
+    lkd = lookup_dpo_get(dpo->dpoi_index);
+
+    lkd->lkd_locks++;
+}
+
+static void
+lookup_dpo_unlock (dpo_id_t *dpo)
+{
+    lookup_dpo_t *lkd;
+
+    lkd = lookup_dpo_get(dpo->dpoi_index);
+
+    lkd->lkd_locks--;
+
+    if (0 == lkd->lkd_locks)
+    {
+        if (LOOKUP_TABLE_FROM_CONFIG == lkd->lkd_table)
+        {
+           fib_table_unlock(lkd->lkd_fib_index,
+                            dpo_proto_to_fib(lkd->lkd_proto));
+        }
+        pool_put(lookup_dpo_pool, lkd);
+    }
+}
+
+/**
+ * @brief Look up one IPv4 address in the mtrie of the given FIB.
+ *
+ * Walks the mtrie one step per address byte, falls back to the mtrie's
+ * default leaf when the walk ends on the empty leaf, and writes the
+ * index carried by the resulting leaf to src_adj_index0[0].
+ *
+ * @param src_fib_index0 index of the IPv4 FIB to search
+ * @param addr0          the address to look up
+ * @param src_adj_index0 [out] index extracted from the matching leaf
+ */
+always_inline void
+ip4_src_fib_lookup_one (u32 src_fib_index0,
+                        const ip4_address_t * addr0,
+                        u32 * src_adj_index0)
+{
+    ip4_fib_mtrie_leaf_t leaf0;
+    ip4_fib_mtrie_t * mtrie0;
+
+    mtrie0 = &ip4_fib_get (src_fib_index0)->mtrie;
+
+    leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
+    leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 0);
+    leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 1);
+    leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 2);
+    leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 3);
+
+    /* Handle default route. */
+    leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
+    src_adj_index0[0] = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+}
+
+always_inline void
+ip4_src_fib_lookup_two (u32 src_fib_index0,
+                        u32 src_fib_index1,
+                        const ip4_address_t * addr0,
+                        const ip4_address_t * addr1,
+                        u32 * src_adj_index0,
+                        u32 * src_adj_index1)
+{
+    ip4_fib_mtrie_leaf_t leaf0, leaf1;
+    ip4_fib_mtrie_t * mtrie0, * mtrie1;
+
+    mtrie0 = &ip4_fib_get (src_fib_index0)->mtrie;
+    mtrie1 = &ip4_fib_get (src_fib_index1)->mtrie;
+
+    leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
+
+    leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 0);
+    leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, addr1, 0);
+
+    leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 1);
+    leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, addr1, 1);
+
+    leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 2);
+    leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, addr1, 2);
+
+    leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 3);
+    leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, addr1, 3);
+
+    /* Handle default route. */
+    leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
+    leaf1 = (leaf1 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie1->default_leaf : leaf1);
+    src_adj_index0[0] = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+    src_adj_index1[0] = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
+}
+
+/**
+ * @brief Lookup trace  data
+ */
+typedef struct lookup_trace_t_
+{
+    union {
+       ip46_address_t addr;
+       mpls_unicast_header_t hdr;
+    };
+    fib_node_index_t fib_index;
+    index_t lbi;
+} lookup_trace_t;
+
+
+always_inline uword
+lookup_dpo_ip4_inline (vlib_main_t * vm,
+                       vlib_node_runtime_t * node,
+                       vlib_frame_t * from_frame,
+                       int input_src_addr,
+                       int table_from_interface)
+{
+    u32 n_left_from, next_index, * from, * to_next;
+    u32 cpu_index = os_get_cpu_number();
+    vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters;
+
+    from = vlib_frame_vector_args (from_frame);
+    n_left_from = from_frame->n_vectors;
+
+    next_index = node->cached_next_index;
+
+    while (n_left_from > 0)
+    {
+        u32 n_left_to_next;
+
+        vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+        /* while (n_left_from >= 4 && n_left_to_next >= 2) */
+        /*   } */
+
+        while (n_left_from > 0 && n_left_to_next > 0)
+        {
+            u32 bi0, lkdi0, lbi0, fib_index0,  next0;
+            const ip4_address_t *input_addr;
+            const load_balance_t *lb0;
+            const lookup_dpo_t * lkd0;
+            const ip4_header_t * ip0;
+            const dpo_id_t *dpo0;
+            vlib_buffer_t * b0;
+
+            bi0 = from[0];
+            to_next[0] = bi0;
+            from += 1;
+            to_next += 1;
+            n_left_from -= 1;
+            n_left_to_next -= 1;
+
+            b0 = vlib_get_buffer (vm, bi0);
+            ip0 = vlib_buffer_get_current (b0);
+
+            /* dst lookup was done by ip4 lookup */
+            lkdi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX];
+            lkd0 = lookup_dpo_get(lkdi0);
+
+            /*
+             * choose between a lookup using the fib index in the DPO
+             * or getting the FIB index from the interface.
+             */
+            if (table_from_interface)
+            {
+                fib_index0 = 
+                    ip4_fib_table_get_index_for_sw_if_index(
+                        vnet_buffer(b0)->sw_if_index[VLIB_RX]);
+            }
+            else
+            {
+                fib_index0 = lkd0->lkd_fib_index;
+            }
+
+            /*
+             * choose between a source or destination address lookup in the table
+             */
+            if (input_src_addr)
+            {
+                input_addr = &ip0->src_address;
+            }
+            else
+            {
+                input_addr = &ip0->dst_address;
+            }
+
+            /* do lookup */
+            ip4_src_fib_lookup_one (fib_index0, input_addr, &lbi0);
+            lb0 = load_balance_get(lbi0);
+            dpo0 = load_balance_get_bucket_i(lb0, 0);
+
+            next0 = dpo0->dpoi_next_node;
+            vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+
+            vlib_increment_combined_counter
+                (cm, cpu_index, lbi0, 1,
+                 vlib_buffer_length_in_chain (vm, b0));
+
+            if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) 
+            {
+                lookup_trace_t *tr = vlib_add_trace (vm, node, 
+                                                     b0, sizeof (*tr));
+                tr->fib_index = fib_index0;
+                tr->lbi = lbi0;
+                tr->addr.ip4 = *input_addr;
+            }
+
+            vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next,
+                                            n_left_to_next, bi0, next0);
+        }
+        vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+    return from_frame->n_vectors;
+}
+
+static u8 *
+format_lookup_trace (u8 * s, va_list * args)
+{
+    CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+    CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+    lookup_trace_t * t = va_arg (*args, lookup_trace_t *);
+    uword indent = format_get_indent (s);
+    s = format (s, "%U fib-index:%d addr:%U load-balance:%d",
+                format_white_space, indent,
+                t->fib_index,
+                format_ip46_address, &t->addr, IP46_TYPE_ANY,
+                t->lbi);
+    return s;
+}
+
+always_inline uword
+lookup_ip4_dst (vlib_main_t * vm,
+                vlib_node_runtime_t * node,
+                vlib_frame_t * from_frame)
+{
+    return (lookup_dpo_ip4_inline(vm, node, from_frame, 0, 0));
+}
+
+VLIB_REGISTER_NODE (lookup_ip4_dst_node) = {
+    .function = lookup_ip4_dst,
+    .name = "lookup-ip4-dst",
+    .vector_size = sizeof (u32),
+    .sibling_of = "ip4-lookup",
+    .format_trace = format_lookup_trace,
+};
+VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip4_dst_node, lookup_ip4_dst)
+
+always_inline uword
+lookup_ip4_dst_itf (vlib_main_t * vm,
+                    vlib_node_runtime_t * node,
+                    vlib_frame_t * from_frame)
+{
+    return (lookup_dpo_ip4_inline(vm, node, from_frame, 0, 1));
+}
+
+VLIB_REGISTER_NODE (lookup_ip4_dst_itf_node) = {
+    .function = lookup_ip4_dst_itf,
+    .name = "lookup-ip4-dst-itf",
+    .vector_size = sizeof (u32),
+    .sibling_of = "ip4-lookup",
+    .format_trace = format_lookup_trace,
+};
+VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip4_dst_itf_node, lookup_ip4_dst_itf)
+
+always_inline uword
+lookup_ip4_src (vlib_main_t * vm,
+                vlib_node_runtime_t * node,
+                vlib_frame_t * from_frame)
+{
+    return (lookup_dpo_ip4_inline(vm, node, from_frame, 1, 0));
+}
+
+VLIB_REGISTER_NODE (lookup_ip4_src_node) = {
+    .function = lookup_ip4_src,
+    .name = "lookup-ip4-src",
+    .vector_size = sizeof (u32),
+    .format_trace = format_lookup_trace,
+    .sibling_of = "ip4-lookup",
+};
+VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip4_src_node, lookup_ip4_src)
+
+/**
+ * @brief Shared worker for the "lookup-ip6-src" and "lookup-ip6-dst" nodes.
+ *
+ * For each buffer in the frame:
+ *  - the index of the lookup DPO is read from ip.adj_index[VLIB_TX];
+ *  - the FIB index stored in that DPO selects the IPv6 table;
+ *  - the packet's source or destination address is looked up in it;
+ *  - bucket 0 of the resulting load-balance supplies the next node and the
+ *    new ip.adj_index[VLIB_TX] (no flow hash is computed here);
+ *  - the load-balance's "to" combined counter is incremented.
+ *
+ * @param vm             vlib main
+ * @param node           this node's runtime
+ * @param from_frame     frame of buffer indices to process
+ * @param input_src_addr non-zero: look up the source address,
+ *                       zero: look up the destination address
+ * @return the number of vectors in from_frame
+ */
+always_inline uword
+lookup_dpo_ip6_inline (vlib_main_t * vm,
+                       vlib_node_runtime_t * node,
+                       vlib_frame_t * from_frame,
+                       int input_src_addr)
+{
+    vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters;
+    u32 n_left_from, next_index, * from, * to_next;
+    u32 cpu_index = os_get_cpu_number();
+
+    from = vlib_frame_vector_args (from_frame);
+    n_left_from = from_frame->n_vectors;
+
+    next_index = node->cached_next_index;
+
+    while (n_left_from > 0)
+    {
+        u32 n_left_to_next;
+
+        vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+        /* placeholder: no multi-buffer loop, buffers are handled one by one */
+        /* while (n_left_from >= 4 && n_left_to_next >= 2) */
+        /*   { */
+        /*   } */
+
+        while (n_left_from > 0 && n_left_to_next > 0)
+        {
+            u32 bi0, lkdi0, lbi0, fib_index0, next0;
+            const ip6_address_t *input_addr0;
+            const load_balance_t *lb0;
+            const lookup_dpo_t * lkd0;
+            const ip6_header_t * ip0;
+            const dpo_id_t *dpo0;
+            vlib_buffer_t * b0;
+
+            /* speculatively enqueue the buffer to the current next frame */
+            bi0 = from[0];
+            to_next[0] = bi0;
+            from += 1;
+            to_next += 1;
+            n_left_from -= 1;
+            n_left_to_next -= 1;
+
+            b0 = vlib_get_buffer (vm, bi0);
+            ip0 = vlib_buffer_get_current (b0);
+
+            /* dst lookup was done by ip6 lookup */
+            lkdi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX];
+            lkd0 = lookup_dpo_get(lkdi0);
+            fib_index0 = lkd0->lkd_fib_index;
+
+            /*
+             * choose between a source or destination address lookup in the table
+             */
+            if (input_src_addr)
+            {
+                input_addr0 = &ip0->src_address;
+            }
+            else
+            {
+                input_addr0 = &ip0->dst_address;
+            }
+
+            /* do the lookup on the chosen (src or dst) address */
+            lbi0 = ip6_fib_table_fwding_lookup(&ip6_main,
+                                               fib_index0,
+                                               input_addr0);
+            lb0 = load_balance_get(lbi0);
+            /* always bucket 0 of the load-balance */
+            dpo0 = load_balance_get_bucket_i(lb0, 0);
+
+            next0 = dpo0->dpoi_next_node;
+            vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+
+            vlib_increment_combined_counter
+                (cm, cpu_index, lbi0, 1,
+                 vlib_buffer_length_in_chain (vm, b0));
+
+            if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) 
+            {
+                lookup_trace_t *tr = vlib_add_trace (vm, node, 
+                                                     b0, sizeof (*tr));
+                tr->fib_index = fib_index0;
+                tr->lbi = lbi0;
+                tr->addr.ip6 = *input_addr0;
+            }
+            vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next,
+                                            n_left_to_next, bi0, next0);
+        }
+        vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+    return from_frame->n_vectors;
+}
+
+/**
+ * @brief Node function for "lookup-ip6-dst": runs the shared IPv6 worker
+ * keyed on the packet's destination address.
+ */
+always_inline uword
+lookup_ip6_dst (vlib_main_t * vm,
+                vlib_node_runtime_t * node,
+                vlib_frame_t * from_frame)
+{
+    /* 0 => the worker uses the destination address as the lookup key */
+    const int input_src_addr = 0;
+
+    return lookup_dpo_ip6_inline(vm, node, from_frame, input_src_addr);
+}
+
+/* Graph node registration; declared as a sibling of "ip6-lookup". */
+VLIB_REGISTER_NODE (lookup_ip6_dst_node) = {
+    .name = "lookup-ip6-dst",
+    .sibling_of = "ip6-lookup",
+    .function = lookup_ip6_dst,
+    .format_trace = format_lookup_trace,
+    .vector_size = sizeof (u32),
+};
+VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip6_dst_node, lookup_ip6_dst)
+
+/**
+ * @brief Node function for "lookup-ip6-src": runs the shared IPv6 worker
+ * keyed on the packet's source address.
+ */
+always_inline uword
+lookup_ip6_src (vlib_main_t * vm,
+                vlib_node_runtime_t * node,
+                vlib_frame_t * from_frame)
+{
+    /* 1 => the worker uses the source address as the lookup key */
+    const int input_src_addr = 1;
+
+    return lookup_dpo_ip6_inline(vm, node, from_frame, input_src_addr);
+}
+
+/* Graph node registration; declared as a sibling of "ip6-lookup". */
+VLIB_REGISTER_NODE (lookup_ip6_src_node) = {
+    .name = "lookup-ip6-src",
+    .sibling_of = "ip6-lookup",
+    .function = lookup_ip6_src,
+    .format_trace = format_lookup_trace,
+    .vector_size = sizeof (u32),
+};
+VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip6_src_node, lookup_ip6_src)
+
+/**
+ * @brief Shared worker for the "lookup-mpls-dst" and "lookup-mpls-dst-itf"
+ * nodes.
+ *
+ * For each buffer in the frame:
+ *  - the index of the lookup DPO is read from ip.adj_index[VLIB_TX];
+ *  - the MPLS FIB index is taken either from the RX interface or from the
+ *    lookup DPO, depending on table_from_interface;
+ *  - the MPLS header at the buffer's current position is looked up in it;
+ *  - bucket 0 of the resulting load-balance supplies the next node and the
+ *    new ip.adj_index[VLIB_TX];
+ *  - the load-balance's "to" combined counter is incremented.
+ *
+ * @param vm                   vlib main
+ * @param node                 this node's runtime
+ * @param from_frame           frame of buffer indices to process
+ * @param table_from_interface non-zero: use the MPLS table bound to the
+ *                             buffer's RX interface; zero: use the FIB index
+ *                             stored in the lookup DPO
+ * @return the number of vectors in from_frame
+ */
+always_inline uword
+lookup_dpo_mpls_inline (vlib_main_t * vm,
+                       vlib_node_runtime_t * node,
+                       vlib_frame_t * from_frame,
+                       int table_from_interface)
+{
+    u32 n_left_from, next_index, * from, * to_next;
+    u32 cpu_index = os_get_cpu_number();
+    vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters;
+
+    from = vlib_frame_vector_args (from_frame);
+    n_left_from = from_frame->n_vectors;
+
+    next_index = node->cached_next_index;
+
+    while (n_left_from > 0)
+    {
+        u32 n_left_to_next;
+
+        vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+        /* placeholder: no multi-buffer loop, buffers are handled one by one */
+        /* while (n_left_from >= 4 && n_left_to_next >= 2) */
+        /*   { */
+        /*   } */
+
+        while (n_left_from > 0 && n_left_to_next > 0)
+        {
+            u32 bi0, lkdi0, lbi0, fib_index0,  next0;
+            const mpls_unicast_header_t * hdr0;
+            const load_balance_t *lb0;
+            const lookup_dpo_t * lkd0;
+            const dpo_id_t *dpo0;
+            vlib_buffer_t * b0;
+
+            /* speculatively enqueue the buffer to the current next frame */
+            bi0 = from[0];
+            to_next[0] = bi0;
+            from += 1;
+            to_next += 1;
+            n_left_from -= 1;
+            n_left_to_next -= 1;
+
+            b0 = vlib_get_buffer (vm, bi0);
+            hdr0 = vlib_buffer_get_current (b0);
+
+            /* dst lookup was done by mpls lookup */
+            lkdi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX];
+            lkd0 = lookup_dpo_get(lkdi0);
+
+            /*
+             * choose between a lookup using the fib index in the DPO
+             * or getting the FIB index from the interface.
+             */
+            if (table_from_interface)
+            {
+                fib_index0 = 
+                    mpls_fib_table_get_index_for_sw_if_index(
+                        vnet_buffer(b0)->sw_if_index[VLIB_RX]);
+            }
+            else
+            {
+                fib_index0 = lkd0->lkd_fib_index;
+            }
+
+            /* do lookup */
+            lbi0 = mpls_fib_table_forwarding_lookup (fib_index0, hdr0);
+            lb0  = load_balance_get(lbi0);
+            /* always bucket 0 of the load-balance */
+            dpo0 = load_balance_get_bucket_i(lb0, 0);
+
+            next0 = dpo0->dpoi_next_node;
+            vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+
+            vlib_increment_combined_counter
+                (cm, cpu_index, lbi0, 1,
+                 vlib_buffer_length_in_chain (vm, b0));
+
+           if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) 
+            {
+                lookup_trace_t *tr = vlib_add_trace (vm, node, 
+                                                     b0, sizeof (*tr));
+                tr->fib_index = fib_index0;
+                tr->lbi = lbi0;
+                /* header is stored as read from the packet */
+                tr->hdr = *hdr0;
+            }
+
+           vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next,
+                                            n_left_to_next, bi0, next0);
+        }
+        vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+    return from_frame->n_vectors;
+}
+
+/**
+ * @brief Trace formatter for the MPLS lookup nodes.
+ *
+ * Consumes (vlib_main_t *, vlib_node_t *, lookup_trace_t *) from the
+ * argument list and appends the FIB index, the traced MPLS header
+ * (byte-swapped with clib_net_to_host_u32 first) and the load-balance index.
+ */
+static u8 *
+format_lookup_mpls_trace (u8 * s, va_list * args)
+{
+    CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+    CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+    lookup_trace_t * trace = va_arg (*args, lookup_trace_t *);
+    mpls_unicast_header_t host_hdr;
+    uword indent;
+
+    indent = format_get_indent (s);
+    host_hdr.label_exp_s_ttl =
+        clib_net_to_host_u32(trace->hdr.label_exp_s_ttl);
+
+    return format (s, "%U fib-index:%d hdr:%U load-balance:%d",
+                   format_white_space, indent,
+                   trace->fib_index,
+                   format_mpls_header, host_hdr,
+                   trace->lbi);
+}
+
+/**
+ * @brief Node function for "lookup-mpls-dst": runs the shared MPLS worker
+ * using the FIB index stored in the lookup DPO.
+ */
+always_inline uword
+lookup_mpls_dst (vlib_main_t * vm,
+                vlib_node_runtime_t * node,
+                vlib_frame_t * from_frame)
+{
+    /* 0 => the worker takes the FIB index from the lookup DPO */
+    const int table_from_interface = 0;
+
+    return lookup_dpo_mpls_inline(vm, node, from_frame, table_from_interface);
+}
+
+/* Graph node registration; declared as a sibling of "mpls-lookup". */
+VLIB_REGISTER_NODE (lookup_mpls_dst_node) = {
+    .name = "lookup-mpls-dst",
+    .sibling_of = "mpls-lookup",
+    .function = lookup_mpls_dst,
+    .format_trace = format_lookup_mpls_trace,
+    .vector_size = sizeof (u32),
+    .n_next_nodes = 0,
+};
+VLIB_NODE_FUNCTION_MULTIARCH (lookup_mpls_dst_node, lookup_mpls_dst)
+
+/**
+ * @brief Node function for "lookup-mpls-dst-itf": runs the shared MPLS
+ * worker using the MPLS table of the buffer's RX interface.
+ */
+always_inline uword
+lookup_mpls_dst_itf (vlib_main_t * vm,
+                    vlib_node_runtime_t * node,
+                    vlib_frame_t * from_frame)
+{
+    /* 1 => the worker derives the FIB index from the RX interface */
+    const int table_from_interface = 1;
+
+    return lookup_dpo_mpls_inline(vm, node, from_frame, table_from_interface);
+}
+
+/* Graph node registration; declared as a sibling of "mpls-lookup". */
+VLIB_REGISTER_NODE (lookup_mpls_dst_itf_node) = {
+    .name = "lookup-mpls-dst-itf",
+    .sibling_of = "mpls-lookup",
+    .function = lookup_mpls_dst_itf,
+    .format_trace = format_lookup_mpls_trace,
+    .vector_size = sizeof (u32),
+    .n_next_nodes = 0,
+};
+VLIB_NODE_FUNCTION_MULTIARCH (lookup_mpls_dst_itf_node, lookup_mpls_dst_itf)
+
+/**
+ * @brief Virtual function table registered for the lookup DPO type and for
+ * each of its sub-types: lock, unlock and format callbacks.
+ */
+const static dpo_vft_t lkd_vft = {
+    .dv_lock = lookup_dpo_lock,
+    .dv_unlock = lookup_dpo_unlock,
+    .dv_format = format_lookup_dpo,
+};
+
+/**
+ * @brief Per-protocol, NULL-terminated lists of graph node names for the
+ * source-address lookup sub-type. There is no MPLS entry.
+ */
+const static char* const lookup_src_ip4_nodes[] =
+{
+    "lookup-ip4-src",
+    NULL,
+};
+const static char* const lookup_src_ip6_nodes[] =
+{
+    "lookup-ip6-src",
+    NULL,
+};
+const static char* const * const lookup_src_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = lookup_src_ip4_nodes,
+    [DPO_PROTO_IP6]  = lookup_src_ip6_nodes,
+    [DPO_PROTO_MPLS] = NULL,
+};
+
+/**
+ * @brief Per-protocol, NULL-terminated lists of graph node names for the
+ * destination-address lookup sub-type.
+ */
+const static char* const lookup_dst_ip4_nodes[] =
+{
+    "lookup-ip4-dst",
+    NULL,
+};
+const static char* const lookup_dst_ip6_nodes[] =
+{
+    "lookup-ip6-dst",
+    NULL,
+};
+const static char* const lookup_dst_mpls_nodes[] =
+{
+    "lookup-mpls-dst",
+    NULL,
+};
+const static char* const * const lookup_dst_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = lookup_dst_ip4_nodes,
+    [DPO_PROTO_IP6]  = lookup_dst_ip6_nodes,
+    [DPO_PROTO_MPLS] = lookup_dst_mpls_nodes,
+};
+
+/**
+ * @brief Per-protocol, NULL-terminated lists of graph node names for the
+ * destination lookup in the table of the input interface.
+ *
+ * NOTE(review): registrations for "lookup-ip4-dst-itf" and
+ * "lookup-mpls-dst-itf" exist in this file, but none for
+ * "lookup-ip6-dst-itf" is visible nearby - confirm that node exists.
+ */
+const static char* const lookup_dst_from_interface_ip4_nodes[] =
+{
+    "lookup-ip4-dst-itf",
+    NULL,
+};
+const static char* const lookup_dst_from_interface_ip6_nodes[] =
+{
+    "lookup-ip6-dst-itf",
+    NULL,
+};
+const static char* const lookup_dst_from_interface_mpls_nodes[] =
+{
+    "lookup-mpls-dst-itf",
+    NULL,
+};
+const static char* const * const lookup_dst_from_interface_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = lookup_dst_from_interface_ip4_nodes,
+    [DPO_PROTO_IP6]  = lookup_dst_from_interface_ip6_nodes,
+    [DPO_PROTO_MPLS] = lookup_dst_from_interface_mpls_nodes,
+};
+
+
+/**
+ * @brief Register the lookup DPO type and its sub-types with the DPO
+ * infrastructure.
+ */
+void
+lookup_dpo_module_init (void)
+{
+    dpo_register(DPO_LOOKUP, &lkd_vft, NULL);
+
+    /*
+     * There are various sorts of lookup; src or dst addr v4 /v6 etc.
+     * there isn't an object type for each (there is only the lookup_dpo_t),
+     * but, for performance reasons, there is a data plane function, and hence
+     * VLIB node for each. VLIB graph node construction is based on DPO types
+     * so we create sub-types.
+     */
+    lookup_dpo_sub_types[LOOKUP_SUB_TYPE_SRC] =
+        dpo_register_new_type(&lkd_vft, lookup_src_nodes);
+    lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST] =
+        dpo_register_new_type(&lkd_vft, lookup_dst_nodes);
+    /*
+     * The table-from-interface sub-type must be bound to the "-itf" nodes,
+     * which derive the FIB index from the RX interface, not to the plain
+     * dst nodes, which use the FIB index stored in the DPO.
+     * NOTE(review): confirm a "lookup-ip6-dst-itf" node is registered
+     * before relying on this sub-type for IPv6.
+     */
+    lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST_TABLE_FROM_INTERFACE] =
+        dpo_register_new_type(&lkd_vft, lookup_dst_from_interface_nodes);
+}
diff --git a/vnet/vnet/dpo/lookup_dpo.h b/vnet/vnet/dpo/lookup_dpo.h
new file mode 100644 (file)
index 0000000..ff28338
--- /dev/null
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LOOKUP_DPO_H__
+#define __LOOKUP_DPO_H__
+
+#include <vnet/vnet.h>
+#include <vnet/fib/fib_types.h>
+#include <vnet/dpo/dpo.h>
+
+/**
+ * Switch to use the packet's source or destination address for lookup
+ */
+typedef enum lookup_input_t_ {
+    LOOKUP_INPUT_SRC_ADDR,
+    LOOKUP_INPUT_DST_ADDR,
+} __attribute__ ((packed)) lookup_input_t;
+
+#define LOOKUP_INPUTS {                         \
+    [LOOKUP_INPUT_SRC_ADDR] = "src-address",    \
+    [LOOKUP_INPUT_DST_ADDR] = "dst-address",    \
+}
+
+/**
+ * Switch to use the table index passed, or the table of the input interface
+ */
+typedef enum lookup_table_t_ {
+    LOOKUP_TABLE_FROM_INPUT_INTERFACE,
+    LOOKUP_TABLE_FROM_CONFIG,
+} __attribute__ ((packed)) lookup_table_t;
+
+/*
+ * Display names, indexed by lookup_table_t.
+ */
+#define LOOKUP_TABLES {                                            \
+    [LOOKUP_TABLE_FROM_INPUT_INTERFACE] = "table-input-interface", \
+    [LOOKUP_TABLE_FROM_CONFIG]          = "table-configured",      \
+}
+
+/**
+ * A data-path object describing a further FIB lookup to perform on the
+ * packet: which table, which protocol and which address to use.
+ */
+typedef struct lookup_dpo_t
+{
+    /**
+     * The FIB, or interface from which to get a FIB, in which to perform
+     * the next lookup;
+     */
+    fib_node_index_t lkd_fib_index;
+
+    /**
+     * The protocol of the FIB for the lookup, and hence
+     * the protocol of the packet
+     */
+    dpo_proto_t lkd_proto;
+
+    /**
+     * Switch to use src or dst address
+     */
+    lookup_input_t lkd_input;
+
+    /**
+     * Switch to use the table index passed, or the table of the input interface
+     */
+    lookup_table_t lkd_table;
+
+    /**
+     * Number of locks
+     */
+    u16 lkd_locks;
+} lookup_dpo_t;
+
+extern void lookup_dpo_add_or_lock_w_fib_index(fib_node_index_t fib_index,
+                                               dpo_proto_t proto,
+                                               lookup_input_t input,
+                                               lookup_table_t table,
+                                               dpo_id_t *dpo);
+extern void lookup_dpo_add_or_lock_w_table_id(u32 table_id,
+                                              dpo_proto_t proto,
+                                              lookup_input_t input,
+                                              lookup_table_t table,
+                                              dpo_id_t *dpo);
+
+extern u8* format_lookup_dpo(u8 *s, va_list *args);
+
+/*
+ * Encapsulation violation for fast data-path access
+ */
+extern lookup_dpo_t *lookup_dpo_pool;
+
+/**
+ * @brief Return the lookup DPO stored at the given index of
+ * lookup_dpo_pool.
+ */
+static inline lookup_dpo_t *
+lookup_dpo_get (index_t index)
+{
+    lookup_dpo_t *lkd;
+
+    lkd = pool_elt_at_index(lookup_dpo_pool, index);
+
+    return lkd;
+}
+
+extern void lookup_dpo_module_init(void);
+
+#endif
diff --git a/vnet/vnet/dpo/mpls_label_dpo.c b/vnet/vnet/dpo/mpls_label_dpo.c
new file mode 100644 (file)
index 0000000..0ec840e
--- /dev/null
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/dpo/mpls_label_dpo.h>
+#include <vnet/mpls/mpls.h>
+
+/*
+ * pool of all MPLS Label DPOs
+ */
+mpls_label_dpo_t *mpls_label_dpo_pool;
+
+/**
+ * @brief Take a cache-line aligned element from the MPLS label DPO pool,
+ * zero it and reset its child DPO.
+ */
+static mpls_label_dpo_t *
+mpls_label_dpo_alloc (void)
+{
+    mpls_label_dpo_t *new_mld;
+
+    pool_get_aligned(mpls_label_dpo_pool, new_mld, CLIB_CACHE_LINE_BYTES);
+    memset(new_mld, 0, sizeof(*new_mld));
+    dpo_reset(&new_mld->mld_dpo);
+
+    return new_mld;
+}
+
+/**
+ * @brief Convert a pointer to a pool element into its index in
+ * mpls_label_dpo_pool.
+ */
+static index_t
+mpls_label_dpo_get_index (mpls_label_dpo_t *mld)
+{
+    index_t mldi = mld - mpls_label_dpo_pool;
+
+    return mldi;
+}
+
+/**
+ * @brief Create an MPLS label DPO stacked on the given child DPO.
+ *
+ * @param label label value to impose
+ * @param eos   end-of-stack bit
+ * @param ttl   TTL to write into the header
+ * @param exp   EXP bits to write into the header
+ * @param dpo   the DPO this label object is stacked on
+ * @return the pool index of the new label DPO
+ */
+index_t
+mpls_label_dpo_create (mpls_label_t label,
+                       mpls_eos_bit_t eos,
+                       u8 ttl,
+                       u8 exp,
+                      const dpo_id_t *dpo)
+{
+    mpls_label_dpo_t *mld = mpls_label_dpo_alloc();
+    u32 *hdr_word = &mld->mld_hdr.label_exp_s_ttl;
+
+    /* assemble the header fields */
+    vnet_mpls_uc_set_label(hdr_word, label);
+    vnet_mpls_uc_set_ttl(hdr_word, ttl);
+    vnet_mpls_uc_set_exp(hdr_word, exp);
+    vnet_mpls_uc_set_s(hdr_word, eos);
+
+    /*
+     * store the header in network byte order: the data-plane copies it
+     * onto packets unmodified
+     */
+    *hdr_word = clib_host_to_net_u32(*hdr_word);
+
+    dpo_stack(DPO_MPLS_LABEL, DPO_PROTO_MPLS, &mld->mld_dpo, dpo);
+
+    return mpls_label_dpo_get_index(mld);
+}
+
+/**
+ * @brief Format an MPLS label DPO.
+ *
+ * Consumes (index_t index, u32 indent) from the argument list and prints
+ * the object's index, its header (byte-swapped with clib_net_to_host_u32
+ * first) and, on the next line, the child DPO.
+ */
+u8*
+format_mpls_label_dpo (u8 *s, va_list *args)
+{
+    index_t mldi = va_arg (*args, index_t);
+    u32 indent = va_arg (*args, u32);
+    mpls_label_dpo_t *mld = mpls_label_dpo_get(mldi);
+    mpls_unicast_header_t host_hdr;
+
+    host_hdr.label_exp_s_ttl =
+        clib_net_to_host_u32(mld->mld_hdr.label_exp_s_ttl);
+
+    s = format(s, "mpls-label:[%d]:%U\n%U%U",
+               mldi,
+               format_mpls_header, host_hdr,
+               format_white_space, indent,
+               format_dpo_id, &mld->mld_dpo, indent+2);
+
+    return s;
+}
+
+/**
+ * @brief DPO lock callback: take one more reference on the label object
+ * the DPO id refers to.
+ */
+static void
+mpls_label_dpo_lock (dpo_id_t *dpo)
+{
+    mpls_label_dpo_t *mld = mpls_label_dpo_get(dpo->dpoi_index);
+
+    mld->mld_locks++;
+}
+
+/**
+ * @brief DPO unlock callback: drop one reference; when the count reaches
+ * zero, reset the child DPO and return the object to the pool.
+ */
+static void
+mpls_label_dpo_unlock (dpo_id_t *dpo)
+{
+    mpls_label_dpo_t *mld = mpls_label_dpo_get(dpo->dpoi_index);
+
+    mld->mld_locks--;
+
+    if (0 != mld->mld_locks)
+    {
+        /* still referenced */
+        return;
+    }
+
+    dpo_reset(&mld->mld_dpo);
+    pool_put(mpls_label_dpo_pool, mld);
+}
+
+/**
+ * @brief A struct to hold tracing information for the MPLS label imposition
+ * node.
+ */
+typedef struct mpls_label_imposition_trace_t_
+{
+    /**
+     * The MPLS header imposed
+     */
+    mpls_unicast_header_t hdr;
+} mpls_label_imposition_trace_t;
+
+/**
+ * @brief Node function for "mpls-label-imposition".
+ *
+ * For each buffer in the frame: fetch the label DPO whose index is in
+ * ip.adj_index[VLIB_TX], prepend a copy of its pre-built MPLS header to
+ * the packet, then hand the buffer to the label DPO's child (next node and
+ * index taken from mld_dpo).
+ *
+ * @param vm         vlib main
+ * @param node       this node's runtime
+ * @param from_frame frame of buffer indices to process
+ * @return the number of vectors in from_frame
+ */
+always_inline uword
+mpls_label_imposition (vlib_main_t * vm,
+                       vlib_node_runtime_t * node,
+                       vlib_frame_t * from_frame)
+{
+    u32 n_left_from, next_index, * from, * to_next;
+
+    from = vlib_frame_vector_args (from_frame);
+    n_left_from = from_frame->n_vectors;
+
+    next_index = node->cached_next_index;
+
+    while (n_left_from > 0)
+    {
+        u32 n_left_to_next;
+
+        vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+        while (n_left_from > 0 && n_left_to_next > 0)
+        {
+            mpls_unicast_header_t *hdr0;
+            mpls_label_dpo_t *mld0;
+            vlib_buffer_t * b0;
+            u32 bi0, mldi0;
+            u32 next0;
+
+            /* speculatively enqueue the buffer to the current next frame */
+            bi0 = from[0];
+            to_next[0] = bi0;
+            from += 1;
+            to_next += 1;
+            n_left_from -= 1;
+            n_left_to_next -= 1;
+
+            b0 = vlib_get_buffer (vm, bi0);
+
+            /*
+             * the label DPO's index is expected in adj_index[VLIB_TX],
+             * presumably written by the node that sent the packet here
+             */
+            mldi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX];
+            mld0 = mpls_label_dpo_get(mldi0);
+
+            /* Paint the MPLS header */
+            vlib_buffer_advance(b0, -sizeof(*hdr0));
+            hdr0 = vlib_buffer_get_current(b0);
+
+            // FIXME.
+            // need to copy the TTL from the correct place.
+            // for IPvX imposition from the IP header
+            // so we need a dedicated ipx-to-mpls-label-imp-node
+            // for mpls switch and stack another solution is required.
+            *hdr0 = mld0->mld_hdr;
+
+            next0 = mld0->mld_dpo.dpoi_next_node;
+            vnet_buffer(b0)->ip.adj_index[VLIB_TX] = mld0->mld_dpo.dpoi_index;
+
+            if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) 
+            {
+                mpls_label_imposition_trace_t *tr =
+                    vlib_add_trace (vm, node, b0, sizeof (*tr));
+                tr->hdr = *hdr0;
+            }
+
+            vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next,
+                                            n_left_to_next, bi0, next0);
+        }
+        vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+    return from_frame->n_vectors;
+}
+
+/**
+ * @brief Trace formatter for the "mpls-label-imposition" node.
+ *
+ * Consumes (vlib_main_t *, vlib_node_t *, mpls_label_imposition_trace_t *)
+ * from the argument list and prints the traced header after byte-swapping
+ * it with clib_net_to_host_u32.
+ */
+static u8 *
+format_mpls_label_imposition_trace (u8 * s, va_list * args)
+{
+    CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+    CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+    mpls_label_imposition_trace_t * trace =
+        va_arg (*args, mpls_label_imposition_trace_t *);
+    uword indent = format_get_indent (s);
+    mpls_unicast_header_t host_hdr;
+
+    host_hdr.label_exp_s_ttl =
+        clib_net_to_host_u32(trace->hdr.label_exp_s_ttl);
+
+    return format (s, "%Umpls-header:%U",
+                   format_white_space, indent,
+                   format_mpls_header, host_hdr);
+}
+
+/*
+ * Graph node registration for label imposition. Only one static next node
+ * ("error-drop") is declared here; the data-path takes the actual next
+ * index from the label DPO's child (mld_dpo.dpoi_next_node).
+ */
+VLIB_REGISTER_NODE (mpls_label_imposition_node) = {
+    .function = mpls_label_imposition,
+    .name = "mpls-label-imposition",
+    .vector_size = sizeof (u32),
+
+    .format_trace = format_mpls_label_imposition_trace,
+    .n_next_nodes = 1,
+    .next_nodes = {
+        [0] = "error-drop",
+    }
+};
+VLIB_NODE_FUNCTION_MULTIARCH (mpls_label_imposition_node, mpls_label_imposition)
+
+const static dpo_vft_t mld_vft = {
+    .dv_lock = mpls_label_dpo_lock,
+    .dv_unlock = mpls_label_dpo_unlock,
+    .dv_format = format_mpls_label_dpo,
+};
+
+const static char* const mpls_label_imp_ip4_nodes[] =
+{
+    "mpls-label-imposition",
+    NULL,
+};
+const static char* const mpls_label_imp_ip6_nodes[] =
+{
+    "mpls-label-imposition",
+    NULL,
+};
+const static char* const mpls_label_imp_mpls_nodes[] =
+{
+    "mpls-label-imposition",
+    NULL,
+};
+const static char* const * const mpls_label_imp_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = mpls_label_imp_ip4_nodes,
+    [DPO_PROTO_IP6]  = mpls_label_imp_ip6_nodes,
+    [DPO_PROTO_MPLS] = mpls_label_imp_mpls_nodes,
+};
+
+
+/**
+ * @brief Register the MPLS label DPO type, its virtual function table and
+ * its per-protocol graph nodes with the DPO infrastructure.
+ */
+void
+mpls_label_dpo_module_init (void)
+{
+    dpo_register(DPO_MPLS_LABEL, &mld_vft, mpls_label_imp_nodes);
+}
diff --git a/vnet/vnet/dpo/mpls_label_dpo.h b/vnet/vnet/dpo/mpls_label_dpo.h
new file mode 100644 (file)
index 0000000..47ee344
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __MPLS_LABEL_DPO_H__
+#define __MPLS_LABEL_DPO_H__
+
+#include <vnet/vnet.h>
+#include <vnet/mpls/packet.h>
+#include <vnet/dpo/dpo.h>
+
+/**
+ * A representation of an MPLS label for imposition in the data-path
+ */
+typedef struct mpls_label_dpo_t
+{
+    /**
+     * The MPLS label header to impose
+     */
+    mpls_unicast_header_t mld_hdr;
+
+    /**
+     * Next DPO in the graph
+     */
+    dpo_id_t mld_dpo;
+
+    /**
+     * Number of locks/users of the label
+     */
+    u16 mld_locks;
+} mpls_label_dpo_t;
+
+extern index_t mpls_label_dpo_create(mpls_label_t label,
+                                     mpls_eos_bit_t eos,
+                                     u8 ttl,
+                                     u8 exp,
+                                    const dpo_id_t *dpo);
+
+extern u8* format_mpls_label_dpo(u8 *s, va_list *args);
+
+
+/*
+ * Encapsulation violation for fast data-path access
+ */
+extern mpls_label_dpo_t *mpls_label_dpo_pool;
+
+/**
+ * @brief Return the MPLS label DPO stored at the given index of
+ * mpls_label_dpo_pool.
+ */
+static inline mpls_label_dpo_t *
+mpls_label_dpo_get (index_t index)
+{
+    mpls_label_dpo_t *mld;
+
+    mld = pool_elt_at_index(mpls_label_dpo_pool, index);
+
+    return mld;
+}
+
+extern void mpls_label_dpo_module_init(void);
+
+#endif
diff --git a/vnet/vnet/dpo/punt_dpo.c b/vnet/vnet/dpo/punt_dpo.c
new file mode 100644 (file)
index 0000000..e27a8ff
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ * The data-path object representing punting the packet
+ */
+
+#include <vnet/dpo/dpo.h>
+
+static dpo_id_t punt_dpos[DPO_PROTO_NUM];
+
+/**
+ * @brief Return the punt DPO for the given protocol.
+ *
+ * The per-protocol static entry is (re)initialised with dpo_set on every
+ * call before its address is returned.
+ */
+const dpo_id_t *
+punt_dpo_get (dpo_proto_t proto)
+{
+    dpo_id_t *punt = &punt_dpos[proto];
+
+    dpo_set(punt, DPO_PUNT, proto, 1);
+
+    return punt;
+}
+
+/**
+ * @brief Test whether a DPO id is of the punt type.
+ * @return non-zero if dpo->dpoi_type is DPO_PUNT, zero otherwise
+ */
+int
+dpo_is_punt (const dpo_id_t *dpo)
+{
+    const int is_punt = (DPO_PUNT == dpo->dpoi_type);
+
+    return is_punt;
+}
+
+/**
+ * @brief DPO lock callback for the punt object; intentionally a no-op.
+ */
+static void
+punt_dpo_lock (dpo_id_t *dpo)
+{
+    /*
+     * Not maintaining a lock count on the punt;
+     * it is more trouble than it's worth.
+     * There always needs to be one around, so there is no point in
+     * managing its lifetime.
+     */
+}
+/**
+ * @brief DPO unlock callback for the punt object; intentionally a no-op,
+ * matching punt_dpo_lock.
+ */
+static void
+punt_dpo_unlock (dpo_id_t *dpo)
+{
+}
+
+/**
+ * @brief Format the punt DPO.
+ *
+ * Consumes (index_t index, u32 indent) from the argument list, both
+ * unused, and appends the fixed string "dpo-punt".
+ *
+ * @param s  vector to append to
+ * @param ap pointer to the caller's va_list
+ * @return the extended vector
+ */
+static u8*
+format_punt_dpo (u8 *s, va_list *ap)
+{
+    /* ap is a pointer to the va_list: dereference it for va_arg */
+    CLIB_UNUSED(index_t index) = va_arg(*ap, index_t);
+    CLIB_UNUSED(u32 indent) = va_arg(*ap, u32);
+
+    return (format(s, "dpo-punt"));
+}
+
+const static dpo_vft_t punt_vft = {
+    .dv_lock   = punt_dpo_lock,
+    .dv_unlock = punt_dpo_unlock,
+    .dv_format = format_punt_dpo,
+};
+
+/**
+ * @brief The per-protocol VLIB graph nodes that are assigned to a punt
+ *        object.
+ *
+ * this means that these graph nodes are ones from which a punt is the
+ * parent object in the DPO-graph.
+ */
+const static char* const punt_ip4_nodes[] =
+{
+    "ip4-punt",
+    NULL,
+};
+const static char* const punt_ip6_nodes[] =
+{
+    "ip6-punt",
+    NULL,
+};
+const static char* const punt_mpls_nodes[] =
+{
+    "mpls-punt",
+    NULL,
+};
+const static char* const * const punt_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = punt_ip4_nodes,
+    [DPO_PROTO_IP6]  = punt_ip6_nodes,
+    [DPO_PROTO_MPLS] = punt_mpls_nodes,
+};
+
+/**
+ * @brief Register the punt DPO type, its virtual function table and its
+ * per-protocol graph nodes with the DPO infrastructure.
+ */
+void
+punt_dpo_module_init (void)
+{
+    dpo_register(DPO_PUNT, &punt_vft, punt_nodes);
+}
diff --git a/vnet/vnet/dpo/punt_dpo.h b/vnet/vnet/dpo/punt_dpo.h
new file mode 100644 (file)
index 0000000..370547c
--- /dev/null
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief A DPO to punt packets to the Control-plane
+ */
+
+#ifndef __PUNT_DPO_H__
+#define __PUNT_DPO_H__
+
+#include <vnet/dpo/dpo.h>
+
+extern int dpo_is_punt(const dpo_id_t *dpo);
+
+extern const dpo_id_t *punt_dpo_get(dpo_proto_t proto);
+
+extern void punt_dpo_module_init(void);
+
+#endif
diff --git a/vnet/vnet/dpo/receive_dpo.c b/vnet/vnet/dpo/receive_dpo.c
new file mode 100644 (file)
index 0000000..ee7d82b
--- /dev/null
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ * The data-path object representing receiving the packet, i.e. it's for-us
+ */
+#include <vlib/vlib.h>
+#include <vnet/ip/ip.h>
+#include <vnet/dpo/receive_dpo.h>
+
+/**
+ * @brief pool of all receive DPOs
+ */
+receive_dpo_t *receive_dpo_pool;
+
+/**
+ * @brief Take a cache-line aligned element from the receive DPO pool and
+ * zero it.
+ */
+static receive_dpo_t *
+receive_dpo_alloc (void)
+{
+    receive_dpo_t *new_rd;
+
+    pool_get_aligned(receive_dpo_pool, new_rd, CLIB_CACHE_LINE_BYTES);
+    memset(new_rd, 0, sizeof(*new_rd));
+
+    return new_rd;
+}
+
+/**
+ * @brief Resolve a DPO id to the receive DPO it indexes; asserts that the
+ * id is of type DPO_RECEIVE.
+ */
+static receive_dpo_t *
+receive_dpo_get_from_dpo (const dpo_id_t *dpo)
+{
+    index_t rdi;
+
+    ASSERT(DPO_RECEIVE == dpo->dpoi_type);
+    rdi = dpo->dpoi_index;
+
+    return receive_dpo_get(rdi);
+}
+
+
+/*
+ * receive_dpo_add_or_lock
+ *
+ * Allocate a receive DPO for the interface and point the caller's DPO id
+ * at it.
+ *
+ * The next_hop address here is used for source address selection in the DP.
+ * The local adj is added to an interface's receive prefix, the next-hop
+ * passed here is the local prefix on the same interface.
+ *
+ * proto       - protocol stored in the DPO id
+ * sw_if_index - interface recorded in the receive DPO
+ * nh_addr     - address recorded in the receive DPO; may be NULL, in which
+ *               case the stored address stays zeroed
+ * dpo         - DPO id to set
+ */
+void
+receive_dpo_add_or_lock (dpo_proto_t proto,
+                         u32 sw_if_index,
+                         const ip46_address_t *nh_addr,
+                         dpo_id_t *dpo)
+{
+    receive_dpo_t *rd = receive_dpo_alloc();
+    index_t rdi = rd - receive_dpo_pool;
+
+    if (NULL != nh_addr)
+    {
+        rd->rd_addr = *nh_addr;
+    }
+    rd->rd_sw_if_index = sw_if_index;
+
+    dpo_set(dpo, DPO_RECEIVE, proto, rdi);
+}
+
+/**
+ * @brief DPO lock callback: take one more reference on the receive object
+ * the DPO id refers to.
+ */
+static void
+receive_dpo_lock (dpo_id_t *dpo)
+{
+    receive_dpo_t *rd = receive_dpo_get_from_dpo(dpo);
+
+    rd->rd_locks++;
+}
+
+/**
+ * @brief DPO unlock callback: drop one reference and return the object to
+ * the pool when the count reaches zero.
+ */
+static void
+receive_dpo_unlock (dpo_id_t *dpo)
+{
+    receive_dpo_t *rd = receive_dpo_get_from_dpo(dpo);
+
+    rd->rd_locks--;
+
+    if (0 != rd->rd_locks)
+    {
+        /* still referenced */
+        return;
+    }
+
+    pool_put(receive_dpo_pool, rd);
+}
+
+/**
+ * @brief Format a receive DPO.
+ *
+ * Consumes (index_t index, u32 indent) from the argument list. When the
+ * object has an interface (rd_sw_if_index != ~0) the address and interface
+ * name are printed, otherwise just "dpo-receive".
+ *
+ * @param s  vector to append to
+ * @param ap pointer to the caller's va_list
+ * @return the extended vector
+ */
+static u8*
+format_receive_dpo (u8 *s, va_list *ap)
+{
+    /* ap is a pointer to the va_list: dereference it for va_arg */
+    index_t index = va_arg(*ap, index_t);
+    CLIB_UNUSED(u32 indent) = va_arg(*ap, u32);
+    vnet_main_t * vnm = vnet_get_main();
+    receive_dpo_t *rd;
+
+    rd = receive_dpo_get(index);
+
+    if (~0 != rd->rd_sw_if_index)
+    {
+        return (format(s, "dpo-receive: %U on %U",
+                       format_ip46_address, &rd->rd_addr, IP46_TYPE_ANY,
+                       format_vnet_sw_interface_name, vnm,
+                       vnet_get_sw_interface(vnm, rd->rd_sw_if_index)));
+    }
+    else
+    {
+        return (format(s, "dpo-receive"));
+    }
+}
+
+const static dpo_vft_t receive_vft = {
+    .dv_lock = receive_dpo_lock,
+    .dv_unlock = receive_dpo_unlock,
+    .dv_format = format_receive_dpo,
+};
+
+/**
+ * @brief The per-protocol VLIB graph nodes that are assigned to a receive
+ *        object.
+ *
+ * this means that these graph nodes are ones from which a receive is the
+ * parent object in the DPO-graph.
+ */
+const static char* const receive_ip4_nodes[] =
+{
+    "ip4-local",
+    NULL,
+};
+const static char* const receive_ip6_nodes[] =
+{
+    "ip6-local",
+    NULL,
+};
+
+const static char* const * const receive_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = receive_ip4_nodes,
+    [DPO_PROTO_IP6]  = receive_ip6_nodes,
+    [DPO_PROTO_MPLS] = NULL,
+};
+
+/**
+ * @brief Register the receive DPO type, its virtual function table and its
+ * per-protocol graph nodes with the DPO infrastructure.
+ */
+void
+receive_dpo_module_init (void)
+{
+    dpo_register(DPO_RECEIVE, &receive_vft, receive_nodes);
+}
diff --git a/vnet/vnet/dpo/receive_dpo.h b/vnet/vnet/dpo/receive_dpo.h
new file mode 100644 (file)
index 0000000..2420fd7
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ * The data-path object representing receiving the packet, i.e. it's for-us
+ */
+
+#ifndef __RECEIVE_DPO_H__
+#define __RECEIVE_DPO_H__
+
+#include <vnet/dpo/dpo.h>
+#include <vnet/ip/ip6.h>
+
+/**
+ * The receive ("for-us") data-path object.
+ */
+typedef struct receive_dpo_t_
+{
+    /**
+     * The Software interface index on which traffic is received
+     */
+    u32 rd_sw_if_index;
+
+    /**
+     * The address on the receive interface. Packets are destined to this
+     * address
+     */
+    ip46_address_t rd_addr;
+
+    /**
+     * Number of locks.
+     */
+    u16 rd_locks;
+} receive_dpo_t;
+
+extern void receive_dpo_add_or_lock (dpo_proto_t proto,
+                                     u32 sw_if_index,
+                                     const ip46_address_t *nh_addr,
+                                     dpo_id_t *dpo);
+
+extern void receive_dpo_module_init(void);
+
+/**
+ * @brief pool of all receive DPOs.
+ *
+ * Declaration only; the pool itself is defined in receive_dpo.c. It is
+ * exposed here so that receive_dpo_get can be inlined in the data-path.
+ */
+extern receive_dpo_t *receive_dpo_pool;
+
+/**
+ * @brief Return the receive DPO stored at the given index of
+ * receive_dpo_pool.
+ */
+static inline receive_dpo_t *
+receive_dpo_get (index_t index)
+{
+    receive_dpo_t *rd;
+
+    rd = pool_elt_at_index(receive_dpo_pool, index);
+
+    return rd;
+}
+
+#endif
index 56df480..d08764a 100644 (file)
@@ -21,6 +21,9 @@
 #include <vnet/ethernet/arp_packet.h>
 #include <vnet/l2/l2_input.h>
 #include <vppinfra/mhash.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/adj/adj.h>
+#include <vnet/mpls/mpls.h>
 
 /**
  * @file
@@ -36,24 +39,44 @@ void vl_api_rpc_call_main_thread (void *fp, u8 * data, u32 data_length);
 typedef struct
 {
   u32 sw_if_index;
-  u32 fib_index;
   ip4_address_t ip4_address;
-} ethernet_arp_ip4_key_t;
 
-typedef struct
-{
-  ethernet_arp_ip4_key_t key;
   u8 ethernet_address[6];
 
   u16 flags;
-#define ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC (1 << 0)
-#define ETHERNET_ARP_IP4_ENTRY_FLAG_GLEAN  (2 << 0)
+#define ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC  (1 << 0)
+#define ETHERNET_ARP_IP4_ENTRY_FLAG_DYNAMIC (1 << 1)
 
   u64 cpu_time_last_updated;
-
-  u32 *adjacencies;
+  adj_index_t adj_index[FIB_LINK_NUM];
 } ethernet_arp_ip4_entry_t;
 
+/**
+ * @brief administrative and operational state flags on an interface
+ */
+typedef enum ethernet_arp_interface_flags_t_
+{
+  ETHERNET_ARP_INTERFACE_UP = (1 << 1),	/* distinct non-zero bit so it can be set and tested */
+  ETHERNET_ARP_INTERFACE_MPLS_ENABLE = (1 << 0),	/* value unchanged */
+} ethernet_arp_interface_flags_t;
+
+/**
+ * @brief Per-interface ARP configuration and state
+ */
+typedef struct ethernet_arp_interface_t_
+{
+    /**
+     * Hash table of ARP entries.
+     * Since this hash table is per-interface, the key is only the IPv4 address.
+     */
+  uword *arp_entries;
+
+    /**
+     * Flags for administrative and operational state
+     */
+  ethernet_arp_interface_flags_t flags;
+} ethernet_arp_interface_t;
+
 typedef struct
 {
   u32 lo_addr;
@@ -87,18 +110,43 @@ typedef struct
 
   ethernet_arp_ip4_entry_t *ip4_entry_pool;
 
-  mhash_t ip4_entry_by_key;
-
   /* ARP attack mitigation */
   u32 arp_delete_rotor;
   u32 limit_arp_cache_size;
 
+  /** Per interface state */
+  ethernet_arp_interface_t *ethernet_arp_by_sw_if_index;
+
   /* Proxy arp vector */
   ethernet_proxy_arp_t *proxy_arps;
 } ethernet_arp_main_t;
 
 static ethernet_arp_main_t ethernet_arp_main;
 
+
+typedef enum arp_ether_type_t_
+{
+  ARP_ETHER_TYPE_IP4 = (1 << 0),
+  ARP_ETHER_TYPE_MPLS = (1 << 1),
+} arp_ether_type_t;
+#define ARP_ETHER_TYPE_BOTH (ARP_ETHER_TYPE_MPLS | ARP_ETHER_TYPE_IP4)
+
+typedef struct
+{
+  u32 sw_if_index;		/* interface the request applies to */
+  ethernet_arp_ip4_over_ethernet_address_t a;	/* IPv4 address and MAC of the entry */
+  int is_static;		/* consumed by the set path */
+  int flags;			/* ETHERNET_ARP_ARGS_* bits below */
+#define ETHERNET_ARP_ARGS_REMOVE (1<<0)	/* set by vnet_arp_unset_ip4_over_ethernet */
+#define ETHERNET_ARP_ARGS_FLUSH  (1<<1)	/* set by vnet_arp_flush_ip4_over_ethernet */
+#define ETHERNET_ARP_ARGS_POPULATE  (1<<2)	/* set by vnet_arp_populate_ip4_over_ethernet */
+  arp_ether_type_t ether_type;	/* ARP_ETHER_TYPE_* selection passed with the request */
+} vnet_arp_set_ip4_over_ethernet_rpc_args_t;
+
+static void
+set_ip4_over_ethernet_rpc_callback (vnet_arp_set_ip4_over_ethernet_rpc_args_t
+                                   * a);
+
 static u8 *
 format_ethernet_arp_hardware_type (u8 * s, va_list * va)
 {
@@ -229,27 +277,23 @@ format_ethernet_arp_ip4_entry (u8 * s, va_list * va)
   vnet_main_t *vnm = va_arg (*va, vnet_main_t *);
   ethernet_arp_ip4_entry_t *e = va_arg (*va, ethernet_arp_ip4_entry_t *);
   vnet_sw_interface_t *si;
-  ip4_fib_t *fib;
   u8 *flags = 0;
 
   if (!e)
-    return format (s, "%=12s%=6s%=16s%=6s%=20s%=24s", "Time", "FIB", "IP4",
+    return format (s, "%=12s%=16s%=6s%=20s%=24s", "Time", "IP4",
                   "Flags", "Ethernet", "Interface");
 
-  fib = find_ip4_fib_by_table_index_or_id (&ip4_main, e->key.fib_index,
-                                          IP4_ROUTE_FLAG_FIB_INDEX);
-  si = vnet_get_sw_interface (vnm, e->key.sw_if_index);
-
-  if (e->flags & ETHERNET_ARP_IP4_ENTRY_FLAG_GLEAN)
-    flags = format (flags, "G");
+  si = vnet_get_sw_interface (vnm, e->sw_if_index);
 
   if (e->flags & ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC)
     flags = format (flags, "S");
 
-  s = format (s, "%=12U%=6u%=16U%=6s%=20U%=24U",
+  if (e->flags & ETHERNET_ARP_IP4_ENTRY_FLAG_DYNAMIC)
+    flags = format (flags, "D");
+
+  s = format (s, "%=12U%=16U%=6s%=20U%=24U",
              format_vlib_cpu_time, vnm->vlib_main, e->cpu_time_last_updated,
-             fib->table_id,
-             format_ip4_address, &e->key.ip4_address,
+             format_ip4_address, &e->ip4_address,
              flags ? (char *) flags : "",
              format_ethernet_address, e->ethernet_address,
              format_vnet_sw_interface_name, vnm, si);
@@ -294,207 +338,126 @@ format_arp_term_input_trace (u8 * s, va_list * va)
   return s;
 }
 
-clib_error_t *
-ethernet_arp_sw_interface_up_down (vnet_main_t * vnm,
-                                  u32 sw_if_index, u32 flags)
+static void
+arp_mk_complete (ethernet_arp_interface_t * eai,
+                ethernet_arp_ip4_entry_t * e, arp_ether_type_t et)
 {
-  ethernet_arp_main_t *am = &ethernet_arp_main;
-  ethernet_arp_ip4_entry_t *e;
-  u32 i;
-  u32 *to_add_del = 0;
+  fib_prefix_t pfx = {
+    .fp_len = 32,
+    .fp_proto = FIB_PROTOCOL_IP4,
+    .fp_addr = {
+               .ip4 = e->ip4_address,
+               },
+  };
+  u32 fib_index;
 
-  /* *INDENT-OFF* */
- pool_foreach (e, am->ip4_entry_pool, ({
-    if (e->key.sw_if_index == sw_if_index)
-       vec_add1 (to_add_del, e - am->ip4_entry_pool);
-  }));
- /* *INDENT-ON* */
+  fib_index = ip4_fib_table_get_index_for_sw_if_index (e->sw_if_index);
 
-  for (i = 0; i < vec_len (to_add_del); i++)
+  if (et & ARP_ETHER_TYPE_IP4)
     {
-      ethernet_arp_ip4_over_ethernet_address_t arp_add;
-      e = pool_elt_at_index (am->ip4_entry_pool, to_add_del[i]);
-
-      clib_memcpy (&arp_add.ethernet, e->ethernet_address, 6);
-      arp_add.ip4.as_u32 = e->key.ip4_address.as_u32;
-
-      if (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)
+      if (ADJ_INDEX_INVALID == e->adj_index[FIB_LINK_IP4])
        {
-         vnet_arp_set_ip4_over_ethernet (vnm,
-                                         e->key.sw_if_index,
-                                         e->key.fib_index, &arp_add,
-                                         e->flags &
-                                         ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC);
+         e->adj_index[FIB_LINK_IP4] =
+           adj_nbr_add_or_lock_w_rewrite (FIB_PROTOCOL_IP4,
+                                          FIB_LINK_IP4,
+                                          &pfx.fp_addr,
+                                          e->sw_if_index,
+                                          e->ethernet_address);
+         ASSERT (ADJ_INDEX_INVALID != e->adj_index[FIB_LINK_IP4]);
+
+         fib_table_entry_update_one_path (fib_index,
+                                          &pfx,
+                                          FIB_SOURCE_ADJ,
+                                          FIB_ENTRY_FLAG_ATTACHED,
+                                          FIB_PROTOCOL_IP4,
+                                          &pfx.fp_addr,
+                                          e->sw_if_index,
+                                          ~0,
+                                          1,
+                                          MPLS_LABEL_INVALID,
+                                          FIB_ROUTE_PATH_FLAG_NONE);
        }
-      else if ((e->flags & ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC) == 0)
+      else
        {
-         vnet_arp_unset_ip4_over_ethernet (vnm,
-                                           e->key.sw_if_index,
-                                           e->key.fib_index, &arp_add);
+         adj_nbr_update_rewrite (e->adj_index[FIB_LINK_IP4],
+                                 e->ethernet_address);
+       }
+    }
+  if ((et & ARP_ETHER_TYPE_MPLS) &&
+      eai->flags & ETHERNET_ARP_INTERFACE_MPLS_ENABLE)
+    {
+      if (ADJ_INDEX_INVALID == e->adj_index[FIB_LINK_MPLS])
+       {
+         e->adj_index[FIB_LINK_MPLS] =
+           adj_nbr_add_or_lock_w_rewrite (FIB_PROTOCOL_IP4,
+                                          FIB_LINK_MPLS,
+                                          &pfx.fp_addr,
+                                          e->sw_if_index,
+                                          e->ethernet_address);
+         ASSERT (ADJ_INDEX_INVALID != e->adj_index[FIB_LINK_MPLS]);
+       }
+      else
+       {
+         adj_nbr_update_rewrite (e->adj_index[FIB_LINK_MPLS],
+                                 e->ethernet_address);
        }
     }
-
-  vec_free (to_add_del);
-  return 0;
-}
-
-VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ethernet_arp_sw_interface_up_down);
-
-static int
-vnet_arp_set_ip4_over_ethernet_internal (vnet_main_t * vnm,
-                                        u32 sw_if_index,
-                                        u32 fib_index,
-                                        void *a_arg, int is_static);
-
-static int
-vnet_arp_unset_ip4_over_ethernet_internal (vnet_main_t * vnm,
-                                          u32 sw_if_index,
-                                          u32 fib_index, void *a_arg);
-
-typedef struct
-{
-  u32 sw_if_index;
-  u32 fib_index;
-  ethernet_arp_ip4_over_ethernet_address_t a;
-  int is_static;
-  int is_remove;               /* set is_remove=1 to clear arp entry */
-} vnet_arp_set_ip4_over_ethernet_rpc_args_t;
-
-static void set_ip4_over_ethernet_rpc_callback
-  (vnet_arp_set_ip4_over_ethernet_rpc_args_t * a)
-{
-  vnet_main_t *vm = vnet_get_main ();
-  ASSERT (os_get_cpu_number () == 0);
-
-  if (a->is_remove)
-    vnet_arp_unset_ip4_over_ethernet_internal (vm,
-                                              a->sw_if_index,
-                                              a->fib_index, &(a->a));
-  else
-    vnet_arp_set_ip4_over_ethernet_internal (vm,
-                                            a->sw_if_index,
-                                            a->fib_index,
-                                            &(a->a), a->is_static);
-}
-
-int
-vnet_arp_set_ip4_over_ethernet (vnet_main_t * vnm,
-                               u32 sw_if_index,
-                               u32 fib_index, void *a_arg, int is_static)
-{
-  ethernet_arp_ip4_over_ethernet_address_t *a = a_arg;
-  vnet_arp_set_ip4_over_ethernet_rpc_args_t args;
-
-  args.sw_if_index = sw_if_index;
-  args.fib_index = fib_index;
-  args.is_static = is_static;
-  args.is_remove = 0;
-  clib_memcpy (&args.a, a, sizeof (*a));
-
-  vl_api_rpc_call_main_thread (set_ip4_over_ethernet_rpc_callback,
-                              (u8 *) & args, sizeof (args));
-  return 0;
 }
 
 int
 vnet_arp_set_ip4_over_ethernet_internal (vnet_main_t * vnm,
-                                        u32 sw_if_index,
-                                        u32 fib_index,
-                                        void *a_arg, int is_static)
+                                        vnet_arp_set_ip4_over_ethernet_rpc_args_t
+                                        * args)
 {
-  ethernet_arp_ip4_key_t k;
   ethernet_arp_ip4_entry_t *e = 0;
   ethernet_arp_main_t *am = &ethernet_arp_main;
-  ethernet_arp_ip4_over_ethernet_address_t *a = a_arg;
+  ethernet_arp_ip4_over_ethernet_address_t *a = &args->a;
   vlib_main_t *vm = vlib_get_main ();
-  ip4_main_t *im = &ip4_main;
-  ip_lookup_main_t *lm = &im->lookup_main;
   int make_new_arp_cache_entry = 1;
   uword *p;
-  ip4_add_del_route_args_t args;
-  ip_adjacency_t adj, *existing_adj;
   pending_resolution_t *pr, *mc;
+  ethernet_arp_interface_t *arp_int;
+  fib_link_t link;
+  int is_static = args->is_static;
+  u32 sw_if_index = args->sw_if_index;
 
-  u32 next_index;
-  u32 adj_index;
-
-  fib_index = (fib_index != (u32) ~ 0)
-    ? fib_index : im->fib_index_by_sw_if_index[sw_if_index];
+  vec_validate (am->ethernet_arp_by_sw_if_index, sw_if_index);
 
-  k.sw_if_index = sw_if_index;
-  k.ip4_address = a->ip4;
-  k.fib_index = fib_index;
+  arp_int = &am->ethernet_arp_by_sw_if_index[sw_if_index];
 
-  p = mhash_get (&am->ip4_entry_by_key, &k);
-  if (p)
+  if (NULL != arp_int->arp_entries)
     {
-      e = pool_elt_at_index (am->ip4_entry_pool, p[0]);
+      p = hash_get (arp_int->arp_entries, a->ip4.as_u32);
+      if (p)
+       {
+         e = pool_elt_at_index (am->ip4_entry_pool, p[0]);
 
-      /* Refuse to over-write static arp. */
-      if (!is_static && (e->flags & ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC))
-       return -2;
-      make_new_arp_cache_entry = 0;
+         /* Refuse to over-write static arp. */
+         if (!is_static && (e->flags & ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC))
+           return -2;
+         make_new_arp_cache_entry = 0;
+       }
     }
 
-  /* Note: always install the route. It might have been deleted */
-  memset (&adj, 0, sizeof (adj));
-  adj.lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
-  adj.n_adj = 1;               /*  otherwise signature compare fails */
+  if (make_new_arp_cache_entry)
+    {
+      pool_get (am->ip4_entry_pool, e);
 
-  vnet_rewrite_for_sw_interface (vnm, VNET_L3_PACKET_TYPE_IP4, sw_if_index, ip4_rewrite_node.index, a->ethernet,       /* destination address */
-                                &adj.rewrite_header,
-                                sizeof (adj.rewrite_data));
+      if (NULL == arp_int->arp_entries)
+       {
+         arp_int->arp_entries = hash_create (0, sizeof (u32));
+         if (mpls_sw_interface_is_enabled (sw_if_index))
+           arp_int->flags |= ETHERNET_ARP_INTERFACE_MPLS_ENABLE;
+       }
 
-  /* result of this lookup should be next-hop adjacency */
-  adj_index = ip4_fib_lookup_with_table (im, fib_index, &a->ip4, 0);
-  existing_adj = ip_get_adjacency (lm, adj_index);
+      hash_set (arp_int->arp_entries, a->ip4.as_u32, e - am->ip4_entry_pool);
 
-  if (existing_adj->lookup_next_index == IP_LOOKUP_NEXT_ARP &&
-      existing_adj->arp.next_hop.ip4.as_u32 == a->ip4.as_u32)
-    {
-      u32 *ai;
-      u32 *adjs = vec_dup (e->adjacencies);
-      /* Update all adj assigned to this arp entry */
-      vec_foreach (ai, adjs)
+      e->sw_if_index = sw_if_index;
+      e->ip4_address = a->ip4;
+      FOR_EACH_FIB_LINK (link)
       {
-       int i;
-       ip_adjacency_t *uadj = ip_get_adjacency (lm, *ai);
-       for (i = 0; i < uadj->n_adj; i++)
-         if (uadj[i].lookup_next_index == IP_LOOKUP_NEXT_ARP &&
-             uadj[i].arp.next_hop.ip4.as_u32 == a->ip4.as_u32)
-           ip_update_adjacency (lm, *ai + i, &adj);
+       e->adj_index[link] = ADJ_INDEX_INVALID;
       }
-      vec_free (adjs);
-    }
-  else
-    {
-      /* Check that new adjacency actually isn't exactly the same as
-       *  what is already there. If we over-write the adjacency with
-       *  exactly the same info, its technically a new adjacency with
-       *  new counters, but to user it appears as counters reset.
-       */
-      if (vnet_ip_adjacency_share_compare (&adj, existing_adj) == 0)
-       {
-         /* create new adj */
-         args.table_index_or_table_id = fib_index;
-         args.flags =
-           IP4_ROUTE_FLAG_FIB_INDEX | IP4_ROUTE_FLAG_ADD |
-           IP4_ROUTE_FLAG_NEIGHBOR;
-         args.dst_address = a->ip4;
-         args.dst_address_length = 32;
-         args.adj_index = ~0;
-         args.add_adj = &adj;
-         args.n_add_adj = 1;
-         ip4_add_del_route (im, &args);
-       }
-    }
-
-  if (make_new_arp_cache_entry)
-    {
-      pool_get (am->ip4_entry_pool, e);
-      mhash_set (&am->ip4_entry_by_key, &k, e - am->ip4_entry_pool,
-                /* old value */ 0);
-      e->key = k;
     }
 
   /* Update time stamp and ethernet address. */
@@ -503,11 +466,16 @@ vnet_arp_set_ip4_over_ethernet_internal (vnet_main_t * vnm,
   e->cpu_time_last_updated = clib_cpu_time_now ();
   if (is_static)
     e->flags |= ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC;
+  else
+    e->flags |= ETHERNET_ARP_IP4_ENTRY_FLAG_DYNAMIC;
+
+  arp_mk_complete (arp_int, e, ARP_ETHER_TYPE_BOTH);
 
   /* Customer(s) waiting for this address to be resolved? */
   p = hash_get (am->pending_resolutions_by_address, a->ip4.as_u32);
   if (p)
     {
+      u32 next_index;
       next_index = p[0];
 
       while (next_index != (u32) ~ 0)
@@ -526,6 +494,7 @@ vnet_arp_set_ip4_over_ethernet_internal (vnet_main_t * vnm,
   p = hash_get (am->mac_changes_by_address, a->ip4.as_u32);
   if (p)
     {
+      u32 next_index;
       next_index = p[0];
 
       while (next_index != (u32) ~ 0)
@@ -688,6 +657,7 @@ typedef enum
   _ (l2_address_mismatch, "ARP hw addr does not match L2 frame src addr") \
   _ (missing_interface_address, "ARP missing interface address") \
   _ (gratuitous_arp, "ARP probe or announcement dropped") \
+  _ (interface_no_table, "Interface is not mapped to an IP table") \
 
 typedef enum
 {
@@ -697,29 +667,6 @@ typedef enum
     ETHERNET_ARP_N_ERROR,
 } ethernet_arp_input_error_t;
 
-/* get first interface address */
-ip4_address_t *
-ip4_interface_first_address (ip4_main_t * im, u32 sw_if_index,
-                            ip_interface_address_t ** result_ia)
-{
-  ip_lookup_main_t *lm = &im->lookup_main;
-  ip_interface_address_t *ia = 0;
-  ip4_address_t *result = 0;
-
-  /* *INDENT-OFF* */
-  foreach_ip_interface_address (lm, ia, sw_if_index,
-                               1 /* honor unnumbered */ ,
-  ({
-    ip4_address_t * a =
-      ip_interface_address_get_address (lm, ia);
-    result = a; break;
-  }));
-  /* *INDENT-ON* */
-
-  if (result_ia)
-    *result_ia = result ? ia : 0;
-  return result;
-}
 
 static void
 unset_random_arp_entry (void)
@@ -747,16 +694,14 @@ unset_random_arp_entry (void)
   e = pool_elt_at_index (am->ip4_entry_pool, index);
 
   clib_memcpy (&delme.ethernet, e->ethernet_address, 6);
-  delme.ip4.as_u32 = e->key.ip4_address.as_u32;
+  delme.ip4.as_u32 = e->ip4_address.as_u32;
 
-  vnet_arp_unset_ip4_over_ethernet (vnm, e->key.sw_if_index,
-                                   e->key.fib_index, &delme);
+  vnet_arp_unset_ip4_over_ethernet (vnm, e->sw_if_index, &delme);
 }
 
 static void
 arp_unnumbered (vlib_buffer_t * p0,
-               u32 pi0,
-               ethernet_header_t * eth0, ip_interface_address_t * ifa0)
+               u32 pi0, ethernet_header_t * eth0, u32 sw_if_index)
 {
   vlib_main_t *vm = vlib_get_main ();
   vnet_main_t *vnm = vnet_get_main ();
@@ -777,7 +722,7 @@ arp_unnumbered (vlib_buffer_t * p0,
   clib_memcpy (dst_mac_address, eth0->dst_address, sizeof (dst_mac_address));
 
   /* Figure out which sw_if_index supplied the address */
-  unnum_src_sw_if_index = ifa0->sw_if_index;
+  unnum_src_sw_if_index = sw_if_index;
 
   /* Track down all users of the unnumbered source */
   /* *INDENT-OFF* */
@@ -928,13 +873,14 @@ arp_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
          vnet_hw_interface_t *hw_if0;
          ethernet_arp_header_t *arp0;
          ethernet_header_t *eth0;
-         ip_interface_address_t *ifa0;
          ip_adjacency_t *adj0;
-         ip4_address_t *if_addr0;
-         ip4_address_t proxy_src;
-         u32 pi0, error0, next0, sw_if_index0;
-         u8 is_request0, src_is_local0, dst_is_local0, is_unnum0;
+         ip4_address_t *if_addr0, proxy_src;
+         u32 pi0, error0, next0, sw_if_index0, conn_sw_if_index0, fib_index0;
+         u8 is_request0, dst_is_local0, is_unnum0;
          ethernet_proxy_arp_t *pa;
+         fib_node_index_t dst_fei, src_fei;
+         fib_prefix_t pfx0;
+         fib_entry_flag_t src_flags, dst_flags;
 
          pi0 = from[0];
          to_next[0] = pi0;
@@ -942,6 +888,7 @@ arp_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;
+         pa = 0;
 
          p0 = vlib_get_buffer (vm, pi0);
          arp0 = vlib_buffer_get_current (p0);
@@ -963,43 +910,56 @@ arp_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
          sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
 
          if (error0)
-           goto drop1;
+           goto drop2;
 
          /* Check that IP address is local and matches incoming interface. */
-         if_addr0 =
-           ip4_interface_address_matching_destination (im4,
-                                                       &arp0->
-                                                       ip4_over_ethernet[1].
-                                                       ip4, sw_if_index0,
-                                                       &ifa0);
-         if (!if_addr0)
+         fib_index0 = ip4_fib_table_get_index_for_sw_if_index (sw_if_index0);
+         if (~0 == fib_index0)
+           {
+             error0 = ETHERNET_ARP_ERROR_interface_no_table;
+             goto drop2;
+
+           }
+         dst_fei = ip4_fib_table_lookup (ip4_fib_get (fib_index0),
+                                         &arp0->ip4_over_ethernet[1].ip4,
+                                         32);
+         dst_flags = fib_entry_get_flags (dst_fei);
+
+         conn_sw_if_index0 = fib_entry_get_resolving_interface (dst_fei);
+
+         if (!(FIB_ENTRY_FLAG_CONNECTED & dst_flags))
            {
              error0 = ETHERNET_ARP_ERROR_l3_dst_address_not_local;
              goto drop1;
            }
 
          /* Honor unnumbered interface, if any */
-         is_unnum0 = sw_if_index0 != ifa0->sw_if_index;
+         is_unnum0 = sw_if_index0 != conn_sw_if_index0;
 
          /* Source must also be local to subnet of matching interface address. */
-         if (!ip4_destination_matches_interface
-             (im4, &arp0->ip4_over_ethernet[0].ip4, ifa0))
+         src_fei = ip4_fib_table_lookup (ip4_fib_get (fib_index0),
+                                         &arp0->ip4_over_ethernet[0].ip4,
+                                         32);
+         src_flags = fib_entry_get_flags (src_fei);
+
+         if (!((FIB_ENTRY_FLAG_ATTACHED & src_flags) ||
+               (FIB_ENTRY_FLAG_CONNECTED & src_flags)) ||
+             sw_if_index0 != fib_entry_get_resolving_interface (src_fei))
            {
              error0 = ETHERNET_ARP_ERROR_l3_src_address_not_local;
-             goto drop1;
+             goto drop2;
            }
 
          /* Reject requests/replies with our local interface address. */
-         src_is_local0 =
-           if_addr0->as_u32 == arp0->ip4_over_ethernet[0].ip4.as_u32;
-         if (src_is_local0)
+         if (FIB_ENTRY_FLAG_LOCAL & src_flags)
            {
              error0 = ETHERNET_ARP_ERROR_l3_src_address_is_local;
-             goto drop1;
+             goto drop2;
            }
 
-         dst_is_local0 =
-           if_addr0->as_u32 == arp0->ip4_over_ethernet[1].ip4.as_u32;
+         dst_is_local0 = (FIB_ENTRY_FLAG_LOCAL & dst_flags);
+         fib_entry_get_prefix (dst_fei, &pfx0);
+         if_addr0 = &pfx0.fp_addr.ip4;
 
          /* Fill in ethernet header. */
          eth0 = ethernet_buffer_get_header (p0);
@@ -1023,7 +983,6 @@ arp_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
                unset_random_arp_entry ();
 
              vnet_arp_set_ip4_over_ethernet (vnm, sw_if_index0,
-                                             (u32) ~ 0 /* default fib */ ,
                                              &arp0->ip4_over_ethernet[0],
                                              0 /* is_static */ );
              error0 = ETHERNET_ARP_ERROR_l3_src_address_learned;
@@ -1064,21 +1023,25 @@ arp_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
          clib_memcpy (eth0->src_address, hw_if0->hw_address, 6);
 
          /* Figure out how much to rewind current data from adjacency. */
-         if (ifa0)
+         /* get the adj from the destination's covering connected */
+         if (NULL == pa)
            {
-             adj0 = ip_get_adjacency (&ip4_main.lookup_main,
-                                      ifa0->neighbor_probe_adj_index);
-             if (adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP)
+             adj0 =
+               adj_get (fib_entry_get_adj_for_source
+                        (ip4_fib_table_lookup
+                         (ip4_fib_get (fib_index0),
+                          &arp0->ip4_over_ethernet[1].ip4, 31),
+                         FIB_SOURCE_INTERFACE));
+             if (adj0->lookup_next_index != IP_LOOKUP_NEXT_GLEAN)
                {
                  error0 = ETHERNET_ARP_ERROR_missing_interface_address;
                  goto drop2;
                }
              if (is_unnum0)
-               arp_unnumbered (p0, pi0, eth0, ifa0);
+               arp_unnumbered (p0, pi0, eth0, conn_sw_if_index0);
              else
                vlib_buffer_advance (p0, -adj0->rewrite_header.data_bytes);
            }
-
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, pi0, next0);
 
@@ -1128,8 +1091,8 @@ arp_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
                     * $$$ is the answer ever anything other than
                     * vlib_buffer_reset(..)?
                     */
-                   ifa0 = 0;
                    if_addr0 = &proxy_src;
+                   is_unnum0 = 0;
                    vlib_buffer_reset (p0);
                    n_proxy_arp_replies_sent++;
                    goto send_reply;
@@ -1192,10 +1155,9 @@ ip4_arp_entry_sort (void *a1, void *a2)
   int cmp;
   vnet_main_t *vnm = vnet_get_main ();
 
-  cmp = vnet_sw_interface_compare
-    (vnm, e1->key.sw_if_index, e2->key.sw_if_index);
+  cmp = vnet_sw_interface_compare (vnm, e1->sw_if_index, e2->sw_if_index);
   if (!cmp)
-    cmp = ip4_address_compare (&e1->key.ip4_address, &e2->key.ip4_address);
+    cmp = ip4_address_compare (&e1->ip4_address, &e2->ip4_address);
   return cmp;
 }
 
@@ -1228,7 +1190,7 @@ show_ip4_arp (vlib_main_t * vm,
       vlib_cli_output (vm, "%U", format_ethernet_arp_ip4_entry, vnm, 0);
       vec_foreach (e, es)
       {
-       if (sw_if_index != ~0 && e->key.sw_if_index != sw_if_index)
+       if (sw_if_index != ~0 && e->sw_if_index != sw_if_index)
          continue;
        vlib_cli_output (vm, "%U", format_ethernet_arp_ip4_entry, vnm, e);
       }
@@ -1346,91 +1308,196 @@ ip4_set_arp_limit (u32 arp_limit)
   return 0;
 }
 
+/**
+ * @brief Control Plane hook to remove an ARP entry; always returns 0.
+ */
+int
+vnet_arp_unset_ip4_over_ethernet (vnet_main_t * vnm,
+                                 u32 sw_if_index, void *a_arg)
+{
+  ethernet_arp_ip4_over_ethernet_address_t *a = a_arg;
+  vnet_arp_set_ip4_over_ethernet_rpc_args_t args;
+  memset (&args, 0, sizeof (args));	/* args is byte-copied below: no stray is_static/padding */
+  args.sw_if_index = sw_if_index;
+  args.flags = ETHERNET_ARP_ARGS_REMOVE;
+  args.ether_type = ARP_ETHER_TYPE_IP4;
+  clib_memcpy (&args.a, a, sizeof (*a));
+
+  vl_api_rpc_call_main_thread (set_ip4_over_ethernet_rpc_callback,
+                              (u8 *) & args, sizeof (args));
+  return 0;
+}
+
+/**
+ * @brief Internally generated event to flush the ARP cache on an
+ * interface state change event.
+ * A flush will remove dynamic ARP entries, and for statics remove the MAC
+ * address from the corresponding adjacencies.
+ */
+static int
+vnet_arp_flush_ip4_over_ethernet (vnet_main_t * vnm,
+                                 u32 sw_if_index,
+                                 arp_ether_type_t et, void *a_arg)
+{
+  ethernet_arp_ip4_over_ethernet_address_t *a = a_arg;
+  vnet_arp_set_ip4_over_ethernet_rpc_args_t args;
+  memset (&args, 0, sizeof (args));	/* args is byte-copied below: no stray is_static/padding */
+  args.sw_if_index = sw_if_index;
+  args.flags = ETHERNET_ARP_ARGS_FLUSH;
+  args.ether_type = et;
+  clib_memcpy (&args.a, a, sizeof (*a));
+
+  vl_api_rpc_call_main_thread (set_ip4_over_ethernet_rpc_callback,
+                              (u8 *) & args, sizeof (args));
+  return 0;
+}
+
+/**
+ * @brief Internally generated event to populate the ARP cache on an
+ * interface state change event.
+ * For static entries this will re-source the adjacencies.
+ *
+ * @param sw_if_index The interface on which the ARP entries are acted
+ * @param et The ether type of those ARP entries.
+ */
+static int
+vnet_arp_populate_ip4_over_ethernet (vnet_main_t * vnm,
+                                    u32 sw_if_index,
+                                    arp_ether_type_t et, void *a_arg)
+{
+  ethernet_arp_ip4_over_ethernet_address_t *a = a_arg;
+  vnet_arp_set_ip4_over_ethernet_rpc_args_t args;
+  memset (&args, 0, sizeof (args));	/* args is byte-copied below: no stray is_static/padding */
+  args.sw_if_index = sw_if_index;
+  args.flags = ETHERNET_ARP_ARGS_POPULATE;
+  args.ether_type = et;
+  clib_memcpy (&args.a, a, sizeof (*a));
+
+  vl_api_rpc_call_main_thread (set_ip4_over_ethernet_rpc_callback,
+                              (u8 *) & args, sizeof (args));
+  return 0;
+}
+
+/*
+ * arp_add_del_interface_address
+ * Callback when an interface address is added or deleted. Only a delete
+ * acts: ARP entries on sw_if_index covered by the prefix are flushed.
+ */
 static void
-arp_ip4_entry_del_adj (ethernet_arp_ip4_entry_t * e, u32 adj_index)
+arp_add_del_interface_address (ip4_main_t * im,
+                              uword opaque,
+                              u32 sw_if_index,
+                              ip4_address_t * address,
+                              u32 address_length,
+                              u32 if_address_index, u32 is_del)
 {
-  int done = 0;
-  int i;
+  /*
+   * Flush the ARP cache of all entries covered by the address
+   * that is being removed.
+   */
+  ethernet_arp_main_t *am = &ethernet_arp_main;
+  ethernet_arp_ip4_entry_t *e;
 
-  while (!done)
+  if (vec_len (am->ethernet_arp_by_sw_if_index) <= sw_if_index)
+    return;			/* index is past the per-interface vector: no ARP state */
+
+  if (is_del)
     {
-      vec_foreach_index (i, e->adjacencies)
-       if (vec_elt (e->adjacencies, i) == adj_index)
+      ethernet_arp_interface_t *eai;
+      u32 i, *to_delete = 0;
+      hash_pair_t *pair;
+
+      eai = &am->ethernet_arp_by_sw_if_index[sw_if_index];
+      /* Collect the matching pool indices first; they are flushed after the walk. */
+      hash_foreach_pair (pair, eai->arp_entries, (
+                                                  {
+                                                  e =
+                                                  pool_elt_at_index
+                                                  (am->ip4_entry_pool,
+                                                   pair->value[0]);
+                                                  if
+                                                  (ip4_destination_matches_route
+                                                   (im, &e->ip4_address,
+                                                    address, address_length))
+                                                  {
+                                                  vec_add1 (to_delete,
+                                                            e -
+                                                            am->ip4_entry_pool);}
+                                                  }
+                        ));
+
+      for (i = 0; i < vec_len (to_delete); i++)
         {
-         vec_del1 (e->adjacencies, i);
-         continue;
+         ethernet_arp_ip4_over_ethernet_address_t delme;
+         e = pool_elt_at_index (am->ip4_entry_pool, to_delete[i]);
+
+         clib_memcpy (&delme.ethernet, e->ethernet_address, 6);
+         delme.ip4.as_u32 = e->ip4_address.as_u32;
+
+         vnet_arp_flush_ip4_over_ethernet (vnet_get_main (),
+                                           e->sw_if_index,
+                                           ARP_ETHER_TYPE_BOTH, &delme);
         }
-      done = 1;
+
+      vec_free (to_delete);
     }
 }
 
 static void
-arp_ip4_entry_add_adj (ethernet_arp_ip4_entry_t * e, u32 adj_index)
+ethernet_arp_sw_interface_mpls_state_change (u32 sw_if_index, u32 is_enable)
 {
-  int i;
-  vec_foreach_index (i, e->adjacencies)
-    if (vec_elt (e->adjacencies, i) == adj_index)
+  ethernet_arp_main_t *am = &ethernet_arp_main;
+  ethernet_arp_ip4_entry_t *e;
+  ethernet_arp_interface_t *eai;
+  u32 i, *to_update = 0;
+  hash_pair_t *pair;
+
+  if (vec_len (am->ethernet_arp_by_sw_if_index) < sw_if_index)
     return;
-  vec_add1 (e->adjacencies, adj_index);
-}
 
-static void
-arp_add_del_adj_cb (struct ip_lookup_main_t *lm,
-                   u32 adj_index, ip_adjacency_t * adj, u32 is_del)
-{
-  ethernet_arp_main_t *am = &ethernet_arp_main;
-  ip4_main_t *im = &ip4_main;
-  ethernet_arp_ip4_key_t k;
-  ethernet_arp_ip4_entry_t *e = 0;
-  uword *p;
-  u32 ai;
+  eai = &am->ethernet_arp_by_sw_if_index[sw_if_index];
+
+  if (is_enable)
+    eai->flags |= ETHERNET_ARP_INTERFACE_MPLS_ENABLE;
+  else
+    eai->flags &= ~ETHERNET_ARP_INTERFACE_MPLS_ENABLE;
+
+  hash_foreach_pair (pair, eai->arp_entries, (
+                                              {
+                                              vec_add1 (to_update,
+                                                        pair->value[0]);
+                                              }
+                    ));
 
-  for (ai = adj->heap_handle; ai < adj->heap_handle + adj->n_adj; ai++)
+  for (i = 0; i < vec_len (to_update); i++)
     {
-      adj = ip_get_adjacency (lm, ai);
-      if (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP
-         && adj->arp.next_hop.ip4.as_u32)
+      ethernet_arp_ip4_over_ethernet_address_t updateme;
+      e = pool_elt_at_index (am->ip4_entry_pool, to_update[i]);
+
+      clib_memcpy (&updateme.ethernet, e->ethernet_address, 6);
+      updateme.ip4.as_u32 = e->ip4_address.as_u32;
+
+      if (is_enable)
        {
-         k.sw_if_index = adj->rewrite_header.sw_if_index;
-         k.ip4_address.as_u32 = adj->arp.next_hop.ip4.as_u32;
-         k.fib_index =
-           im->fib_index_by_sw_if_index[adj->rewrite_header.sw_if_index];
-         p = mhash_get (&am->ip4_entry_by_key, &k);
-         if (p)
-           e = pool_elt_at_index (am->ip4_entry_pool, p[0]);
+         vnet_arp_populate_ip4_over_ethernet (vnet_get_main (),
+                                              e->sw_if_index,
+                                              ARP_ETHER_TYPE_MPLS,
+                                              &updateme);
        }
       else
        continue;
 
-      if (is_del)
-       {
-         if (!e)
-           clib_warning ("Adjacency contains unknown ARP next hop %U (del)",
-                         format_ip46_address, &adj->arp.next_hop,
-                         IP46_TYPE_IP4);
-         else
-           arp_ip4_entry_del_adj (e, adj->heap_handle);
-       }
-      else                     /* add */
-       {
-         if (!e)
-           clib_warning ("Adjacency contains unknown ARP next hop %U (add)",
-                         format_ip46_address, &adj->arp.next_hop,
-                         IP46_TYPE_IP4);
-         else
-           arp_ip4_entry_add_adj (e, adj->heap_handle);
-       }
     }
+  vec_free (to_update);
 }
 
 static clib_error_t *
 ethernet_arp_init (vlib_main_t * vm)
 {
   ethernet_arp_main_t *am = &ethernet_arp_main;
-  pg_node_t *pn;
-  clib_error_t *error;
   ip4_main_t *im = &ip4_main;
-  ip_lookup_main_t *lm = &im->lookup_main;
+  clib_error_t *error;
+  pg_node_t *pn;
 
   if ((error = vlib_call_init_function (vm, ethernet_init)))
     return error;
@@ -1445,10 +1512,6 @@ ethernet_arp_init (vlib_main_t * vm)
   foreach_ethernet_arp_opcode;
 #undef _
 
-  mhash_init (&am->ip4_entry_by_key,
-             /* value size */ sizeof (uword),
-             /* key size */ sizeof (ethernet_arp_ip4_key_t));
-
   /* $$$ configurable */
   am->limit_arp_cache_size = 50000;
 
@@ -1468,100 +1531,239 @@ ethernet_arp_init (vlib_main_t * vm)
 #undef _
   }
 
-  ip_register_add_del_adjacency_callback (lm, arp_add_del_adj_cb);
+  ip4_add_del_interface_address_callback_t cb;
+  cb.function = arp_add_del_interface_address;
+  cb.function_opaque = 0;
+  vec_add1 (im->add_del_interface_address_callbacks, cb);
+
+  vec_add1 (mpls_main.mpls_interface_state_change_callbacks,
+           ethernet_arp_sw_interface_mpls_state_change);
 
   return 0;
 }
 
 VLIB_INIT_FUNCTION (ethernet_arp_init);
 
-int
-vnet_arp_unset_ip4_over_ethernet (vnet_main_t * vnm,
-                                 u32 sw_if_index, u32 fib_index, void *a_arg)
+static void
+arp_mk_incomplete (ethernet_arp_interface_t * eai,
+                  ethernet_arp_ip4_entry_t * e, arp_ether_type_t et)
 {
-  ethernet_arp_ip4_over_ethernet_address_t *a = a_arg;
-  vnet_arp_set_ip4_over_ethernet_rpc_args_t args;
+  fib_prefix_t pfx = {
+    .fp_len = 32,
+    .fp_proto = FIB_PROTOCOL_IP4,
+    .fp_addr = {
+               .ip4 = e->ip4_address,
+               },
+  };
+  u32 fib_index;
 
-  args.sw_if_index = sw_if_index;
-  args.fib_index = fib_index;
-  args.is_remove = 1;
-  clib_memcpy (&args.a, a, sizeof (*a));
+  fib_index = ip4_fib_table_get_index_for_sw_if_index (e->sw_if_index);
 
-  vl_api_rpc_call_main_thread (set_ip4_over_ethernet_rpc_callback,
-                              (u8 *) & args, sizeof (args));
-  return 0;
+  if ((ARP_ETHER_TYPE_IP4 & et) &&
+      (ADJ_INDEX_INVALID != e->adj_index[FIB_LINK_IP4]))
+    {
+      /*
+       * revert the adj this ARP entry sourced to incomplete
+       */
+      adj_nbr_update_rewrite (e->adj_index[FIB_LINK_IP4], NULL);
+
+      /*
+       * remove the FIB entry the ARP entry sourced
+       */
+      fib_table_entry_delete (fib_index, &pfx, FIB_SOURCE_ADJ);
+
+      /*
+       * Unlock the adj now that the ARP entry is no longer a source
+       */
+      adj_unlock (e->adj_index[FIB_LINK_IP4]);
+      e->adj_index[FIB_LINK_IP4] = ADJ_INDEX_INVALID;
+    }
+  if ((ARP_ETHER_TYPE_MPLS & et) &&
+      (ADJ_INDEX_INVALID != e->adj_index[FIB_LINK_MPLS]))
+    {
+      /*
+       * revert the adj this ARP entry sourced to incomplete
+       */
+      adj_nbr_update_rewrite (e->adj_index[FIB_LINK_MPLS], NULL);
+
+      /*
+       * Unlock the adj now that the ARP entry is no longer a source
+       */
+      adj_unlock (e->adj_index[FIB_LINK_MPLS]);
+      e->adj_index[FIB_LINK_MPLS] = ADJ_INDEX_INVALID;
+    }
+}
+
+static void
+arp_entry_free (ethernet_arp_interface_t * eai, ethernet_arp_ip4_entry_t * e)
+{
+  ethernet_arp_main_t *am = &ethernet_arp_main;
+
+  hash_unset (eai->arp_entries, e->ip4_address.as_u32);
+  pool_put (am->ip4_entry_pool, e);
+}
+
+static ethernet_arp_ip4_entry_t *
+arp_entry_find (ethernet_arp_interface_t * eai, const ip4_address_t * addr)
+{
+  ethernet_arp_main_t *am = &ethernet_arp_main;
+  ethernet_arp_ip4_entry_t *e = NULL;
+  uword *p;
+
+  if (NULL != eai->arp_entries)
+    {
+      p = hash_get (eai->arp_entries, addr->as_u32);
+      if (!p)
+       return (NULL);
+
+      e = pool_elt_at_index (am->ip4_entry_pool, p[0]);
+    }
+
+  return (e);
 }
 
 static inline int
 vnet_arp_unset_ip4_over_ethernet_internal (vnet_main_t * vnm,
-                                          u32 sw_if_index,
-                                          u32 fib_index, void *a_arg)
+                                          vnet_arp_set_ip4_over_ethernet_rpc_args_t
+                                          * args)
 {
-  ethernet_arp_ip4_entry_t *e;
   ethernet_arp_main_t *am = &ethernet_arp_main;
-  ethernet_arp_ip4_over_ethernet_address_t *a = a_arg;
-  ethernet_arp_ip4_key_t k;
-  uword *p;
-  ip4_add_del_route_args_t args;
-  ip4_main_t *im = &ip4_main;
-  ip_lookup_main_t *lm = &im->lookup_main;
-  u32 adj_index;
-  ip_adjacency_t *adj;
-
-  k.sw_if_index = sw_if_index;
-  k.ip4_address = a->ip4;
-  k.fib_index = fib_index;
-  p = mhash_get (&am->ip4_entry_by_key, &k);
-  if (!p)
-    return -1;
+  ethernet_arp_ip4_entry_t *e;
+  ethernet_arp_interface_t *eai;
 
-  memset (&args, 0, sizeof (args));
+  eai = &am->ethernet_arp_by_sw_if_index[args->sw_if_index];
 
-  /*
-   * Make sure that the route actually exists before we try to delete it,
-   * and make sure that it's a rewrite adjacency.
-   *
-   * If we point 1-N unnumbered interfaces at a loopback interface and
-   * shut down the loopback before shutting down 1-N unnumbered
-   * interfaces, the ARP cache will still have an entry,
-   * but the route will have disappeared.
-   *
-   * See also ip4_del_interface_routes (...)
-   *            -> ip4_delete_matching_routes (...).
-   */
+  e = arp_entry_find (eai, &args->a.ip4);
+
+  if (NULL != e)
+    {
+      arp_mk_incomplete (eai, e, ARP_ETHER_TYPE_BOTH);
+      arp_entry_free (eai, e);
+    }
+
+  return 0;
+}
+
+static int
+vnet_arp_flush_ip4_over_ethernet_internal (vnet_main_t * vnm,
+                                          vnet_arp_set_ip4_over_ethernet_rpc_args_t
+                                          * args)
+{
+  ethernet_arp_main_t *am = &ethernet_arp_main;
+  ethernet_arp_ip4_entry_t *e;
+  ethernet_arp_interface_t *eai;
+
+  eai = &am->ethernet_arp_by_sw_if_index[args->sw_if_index];
 
-  adj_index = ip4_fib_lookup_with_table
-    (im, fib_index, &a->ip4, 1 /* disable default route */ );
+  e = arp_entry_find (eai, &args->a.ip4);
 
-  /* Miss adj? Forget it... */
-  if (adj_index != lm->miss_adj_index)
+  if (NULL != e)
     {
-      adj = ip_get_adjacency (lm, adj_index);
+      arp_mk_incomplete (eai, e, args->ether_type);
+
       /*
-       * Stupid control-plane trick:
-       * admin down an interface (removes arp routes from fib),
-       * bring the interface back up (does not reinstall them)
-       * then remove the arp cache entry (yuck). When that happens,
-       * the adj we find here will be the interface subnet ARP adj.
+       * The difference between flush and unset, is that an unset
+       * means delete for static and dynamic entries. A flush
+       * means delete only for dynamic. Flushing is what the DP
+       * does in response to interface events. unset is only done
+       * by the control plane.
        */
-      if (adj->lookup_next_index == IP_LOOKUP_NEXT_REWRITE)
+      if ((e->flags & ETHERNET_ARP_IP4_ENTRY_FLAG_DYNAMIC) &&
+         (args->ether_type & ARP_ETHER_TYPE_IP4))
        {
-         args.table_index_or_table_id = fib_index;
-         args.flags = IP4_ROUTE_FLAG_FIB_INDEX | IP4_ROUTE_FLAG_DEL
-           | IP4_ROUTE_FLAG_NEIGHBOR;
-         args.dst_address = a->ip4;
-         args.dst_address_length = 32;
-         ip4_add_del_route (im, &args);
-         ip4_maybe_remap_adjacencies (im, fib_index, args.flags);
+         arp_entry_free (eai, e);
        }
     }
+  return (0);
+}
+
+static int
+vnet_arp_populate_ip4_over_ethernet_internal (vnet_main_t * vnm,
+                                             vnet_arp_set_ip4_over_ethernet_rpc_args_t
+                                             * args)
+{
+  ethernet_arp_main_t *am = &ethernet_arp_main;
+  ethernet_arp_ip4_entry_t *e;
+  ethernet_arp_interface_t *eai;
+
+  eai = &am->ethernet_arp_by_sw_if_index[args->sw_if_index];
+
+  e = arp_entry_find (eai, &args->a.ip4);
+
+  if (NULL != e)
+    {
+      arp_mk_complete (eai, e, args->ether_type);
+    }
+  return (0);
+}
+
+static void
+set_ip4_over_ethernet_rpc_callback (vnet_arp_set_ip4_over_ethernet_rpc_args_t
+                                   * a)
+{
+  vnet_main_t *vm = vnet_get_main ();
+  ASSERT (os_get_cpu_number () == 0);
+
+  if (a->flags & ETHERNET_ARP_ARGS_REMOVE)
+    vnet_arp_unset_ip4_over_ethernet_internal (vm, a);
+  else if (a->flags & ETHERNET_ARP_ARGS_FLUSH)
+    vnet_arp_flush_ip4_over_ethernet_internal (vm, a);
+  else if (a->flags & ETHERNET_ARP_ARGS_POPULATE)
+    vnet_arp_populate_ip4_over_ethernet_internal (vm, a);
+  else
+    vnet_arp_set_ip4_over_ethernet_internal (vm, a);
+}
+
+/**
+ * @brief Invoked when the interface's admin state changes
+ */
+static clib_error_t *
+ethernet_arp_sw_interface_up_down (vnet_main_t * vnm,
+                                  u32 sw_if_index, u32 flags)
+{
+  ethernet_arp_main_t *am = &ethernet_arp_main;
+  ethernet_arp_ip4_entry_t *e;
+  u32 i, *to_delete = 0;
+
+  /* *INDENT-OFF* */
+  pool_foreach (e, am->ip4_entry_pool,
+  ({
+    if (e->sw_if_index == sw_if_index)
+      {
+       vec_add1 (to_delete, e - am->ip4_entry_pool);
+      }
+  }));
+  /* *INDENT-ON* */
+
+  for (i = 0; i < vec_len (to_delete); i++)
+    {
+      ethernet_arp_ip4_over_ethernet_address_t delme;
+      e = pool_elt_at_index (am->ip4_entry_pool, to_delete[i]);
+
+      clib_memcpy (&delme.ethernet, e->ethernet_address, 6);
+      delme.ip4.as_u32 = e->ip4_address.as_u32;
+
+      if (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)
+       {
+         vnet_arp_populate_ip4_over_ethernet (vnm, e->sw_if_index,
+                                              ARP_ETHER_TYPE_BOTH, &delme);
+       }
+      else
+       {
+         vnet_arp_flush_ip4_over_ethernet (vnm, e->sw_if_index,
+                                           ARP_ETHER_TYPE_BOTH, &delme);
+       }
+
+    }
+  vec_free (to_delete);
+
 
-  e = pool_elt_at_index (am->ip4_entry_pool, p[0]);
-  mhash_unset (&am->ip4_entry_by_key, &e->key, 0);
-  pool_put (am->ip4_entry_pool, e);
   return 0;
 }
 
+VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ethernet_arp_sw_interface_up_down);
+
+
 static void
 increment_ip4_and_mac_address (ethernet_arp_ip4_over_ethernet_address_t * a)
 {
@@ -1585,6 +1787,24 @@ increment_ip4_and_mac_address (ethernet_arp_ip4_over_ethernet_address_t * a)
     }
 }
 
+int
+vnet_arp_set_ip4_over_ethernet (vnet_main_t * vnm,
+                               u32 sw_if_index, void *a_arg, int is_static)
+{
+  ethernet_arp_ip4_over_ethernet_address_t *a = a_arg;
+  vnet_arp_set_ip4_over_ethernet_rpc_args_t args;
+
+  args.sw_if_index = sw_if_index;
+  args.is_static = is_static;
+  args.flags = 0;
+  args.ether_type = ARP_ETHER_TYPE_IP4;
+  clib_memcpy (&args.a, a, sizeof (*a));
+
+  vl_api_rpc_call_main_thread (set_ip4_over_ethernet_rpc_callback,
+                              (u8 *) & args, sizeof (args));
+  return 0;
+}
+
 int
 vnet_proxy_arp_add_del (ip4_address_t * lo_addr,
                        ip4_address_t * hi_addr, u32 fib_index, int is_del)
@@ -1660,57 +1880,6 @@ vnet_proxy_arp_fib_reset (u32 fib_id)
   return 0;
 }
 
-u32
-vnet_arp_glean_add (u32 fib_index, void *next_hop_arg)
-{
-  ethernet_arp_main_t *am = &ethernet_arp_main;
-  ip4_main_t *im = &ip4_main;
-  ip_lookup_main_t *lm = &im->lookup_main;
-  ip4_address_t *next_hop = next_hop_arg;
-  ip_adjacency_t add_adj, *adj;
-  ip4_add_del_route_args_t args;
-  ethernet_arp_ip4_entry_t *e;
-  ethernet_arp_ip4_key_t k;
-  u32 adj_index;
-
-  adj_index = ip4_fib_lookup_with_table (im, fib_index, next_hop, 0);
-  adj = ip_get_adjacency (lm, adj_index);
-
-  if (!adj || adj->lookup_next_index != IP_LOOKUP_NEXT_ARP)
-    return ~0;
-
-  if (adj->arp.next_hop.ip4.as_u32 != 0)
-    return adj_index;
-
-  k.sw_if_index = adj->rewrite_header.sw_if_index;
-  k.fib_index = fib_index;
-  k.ip4_address.as_u32 = next_hop->as_u32;
-
-  if (mhash_get (&am->ip4_entry_by_key, &k))
-    return adj_index;
-
-  pool_get (am->ip4_entry_pool, e);
-  mhash_set (&am->ip4_entry_by_key, &k, e - am->ip4_entry_pool,
-            /* old value */ 0);
-  e->key = k;
-  e->cpu_time_last_updated = clib_cpu_time_now ();
-  e->flags = ETHERNET_ARP_IP4_ENTRY_FLAG_GLEAN;
-
-  memset (&args, 0, sizeof (args));
-  clib_memcpy (&add_adj, adj, sizeof (add_adj));
-  ip46_address_set_ip4 (&add_adj.arp.next_hop, next_hop);      /* install neighbor /32 route */
-  args.table_index_or_table_id = fib_index;
-  args.flags =
-    IP4_ROUTE_FLAG_FIB_INDEX | IP4_ROUTE_FLAG_ADD | IP4_ROUTE_FLAG_NEIGHBOR;
-  args.dst_address.as_u32 = next_hop->as_u32;
-  args.dst_address_length = 32;
-  args.adj_index = ~0;
-  args.add_adj = &add_adj;
-  args.n_add_adj = 1;
-  ip4_add_del_route (im, &args);
-  return ip4_fib_lookup_with_table (im, fib_index, next_hop, 0);
-}
-
 static clib_error_t *
 ip_arp_add_del_command_fn (vlib_main_t * vm,
                           unformat_input_t * input, vlib_cli_command_t * cmd)
@@ -1784,7 +1953,7 @@ ip_arp_add_del_command_fn (vlib_main_t * vm,
                 1 /* type */ , 0 /* data */ );
 
              vnet_arp_set_ip4_over_ethernet
-               (vnm, sw_if_index, fib_index, &addr, is_static);
+               (vnm, sw_if_index, &addr, is_static);
 
              vlib_process_wait_for_event (vm);
              event_type = vlib_process_get_events (vm, &event_data);
@@ -1793,8 +1962,7 @@ ip_arp_add_del_command_fn (vlib_main_t * vm,
                clib_warning ("event type %d unexpected", event_type);
            }
          else
-           vnet_arp_unset_ip4_over_ethernet
-             (vnm, sw_if_index, fib_index, &addr);
+           vnet_arp_unset_ip4_over_ethernet (vnm, sw_if_index, &addr);
 
          increment_ip4_and_mac_address (&addr);
        }
index 8a1369c..3b2ef87 100644 (file)
@@ -398,13 +398,11 @@ void ethernet_set_rx_redirect (vnet_main_t * vnm, vnet_hw_interface_t * hi,
 
 int
 vnet_arp_set_ip4_over_ethernet (vnet_main_t * vnm,
-                               u32 sw_if_index,
-                               u32 fib_index, void *a_arg, int is_static);
+                               u32 sw_if_index, void *a_arg, int is_static);
 
 int
 vnet_arp_unset_ip4_over_ethernet (vnet_main_t * vnm,
-                                 u32 sw_if_index, u32 fib_index,
-                                 void *a_arg);
+                                 u32 sw_if_index, void *a_arg);
 
 int vnet_proxy_arp_fib_reset (u32 fib_id);
 
@@ -538,8 +536,6 @@ int vnet_add_del_ip4_arp_change_event (vnet_main_t * vnm,
                                       uword type_opaque,
                                       uword data, int is_add);
 
-u32 vnet_arp_glean_add (u32 fib_index, void *next_hop_arg);
-
 extern vlib_node_registration_t ethernet_input_node;
 
 #endif /* included_ethernet_h */
index 0b19b51..f2e2ca0 100644 (file)
 #include <vnet/pg/pg.h>
 #include <vnet/ethernet/ethernet.h>
 #include <vnet/l2/l2_input.h>
+#include <vnet/srp/srp.h>
+#include <vnet/lisp-gpe/lisp_gpe.h>
+#include <vnet/devices/af_packet/af_packet.h>
+
+int
+vnet_sw_interface_is_p2p (vnet_main_t * vnm, u32 sw_if_index)
+{
+  // FIXME - use flags on the HW itf
+  vnet_hw_interface_t *hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
+  return (!(hw->hw_class_index == ethernet_hw_interface_class.index ||
+           hw->hw_class_index == af_packet_device_class.index ||
+           hw->hw_class_index == lisp_gpe_hw_class.index ||
+           hw->hw_class_index == srp_hw_interface_class.index));
+}
 
 /**
  * @file
diff --git a/vnet/vnet/fib/fib.c b/vnet/vnet/fib/fib.c
new file mode 100644 (file)
index 0000000..413f93e
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/fib/fib_entry_src.h>
+#include <vnet/fib/fib_entry.h>
+#include <vnet/fib/fib_path.h>
+#include <vnet/fib/fib_walk.h>
+#include <vnet/fib/fib_path_list.h>
+
+static clib_error_t *
+fib_module_init (vlib_main_t * vm)
+{
+    clib_error_t * error;
+
+    if ((error = vlib_call_init_function (vm, dpo_module_init)))
+       return (error);
+    if ((error = vlib_call_init_function (vm, adj_module_init)))
+       return (error);
+
+    fib_entry_module_init();
+    fib_entry_src_module_init();
+    fib_path_module_init();
+    fib_path_list_module_init();
+    fib_walk_module_init();
+
+    return (NULL);
+}
+
+VLIB_INIT_FUNCTION (fib_module_init);
diff --git a/vnet/vnet/fib/fib.h b/vnet/vnet/fib/fib.h
new file mode 100644 (file)
index 0000000..7cf1d13
--- /dev/null
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * \brief
+ * An IP v4/6 independent FIB.
+ *
+ * The main functions provided by the FIB are as follows;
+ *
+ *  - source priorities
+ *
+ *   A route can be added to the FIB by more than one entity or source. Sources
+ * include, but are not limited to, API, CLI, LISP, MAP, etc (for the full list
+ * see fib_entry.h). Each source provides the forwarding information (FI) that
+ * it has determined as required for that route. Since each source determines the
+ * FI using different best path and loop prevention algorithms, it is not
+ * correct for the FI of multiple sources to be combined. Instead the FIB must
+ * choose to use the FI from only one source. This choice is based on a static
+ * priority assignment. For example:
+ * IF a prefix is added as a result of interface configuration:
+ *    set interface address 192.168.1.1/24 GigE0
+ * and then it is also added from the CLI
+ *    ip route 192.168.1.1/32 via 2.2.2.2/32
+ * then the 'interface' source will prevail, and the route will remain as
+ * 'local'.
+ * The requirement of the FIB is to always install the FI from the winning
+ * source and thus to maintain the FI added by losing sources so it can be
+ * installed should the winning source be withdrawn.
+ *
+ *  - adj-fib maintenance
+ *
+ *   When ARP or ND discover a neighbour on a link an adjacency forms for the
+ * address of that neighbour. It is also required to insert, in the
+ * appropriate FIB table (the one corresponding to the VRF for the link), a
+ * route for that neighbour. This entry is often referred to as an adj-fib.
+ * Adj-fibs have a dedicated source; 'ADJ'.
+ * The priority of the ADJ source is lower than most. This is so the following
+ * config;
+ *    set interface address 192.168.1.1/32 GigE0
+ *    ip arp 192.168.1.2 GigE0 dead.dead.dead
+ *    ip route add 192.168.1.2 via 10.10.10.10 GigE1
+ * will forward traffic for 192.168.1.2 via GigE1. That is the route added
+ * by the control plane is favoured over the adjacency discovered by ARP.
+ * The control plane, with its associated authentication, is considered the
+ * authoritative source.
+ * To counter the nefarious addition of adj-fib, through the nefarious injection
+ * of adjacencies, the FIB is also required to ensure that only adj-fibs whose
+ * less specific covering prefix is connected are installed in forwarding. This
+ * requires the use of 'cover tracking', where a route maintains a dependency
+ * relationship with the route that is its less specific cover. When this cover
+ * changes (i.e. there is a new covering route) or the forwarding information
+ * of the cover changes, then the covered route is notified.
+ *
+ * Overlapping sub-nets are not supported, so no adj-fib has multiple paths.
+ * The control plane is expected to remove a prefix configured for an interface
+ * before the interface changes VRF.
+ * So while the following config is accepted:
+ *    set interface address 192.168.1.1/32 GigE0
+ *    ip arp 192.168.1.2 GigE0 dead.dead.dead
+ *    set interface ip table GigE0 2
+ * it does not result in the desired behaviour.
+ *
+ *  - attached export.
+ *
+ * Further to adj-fib maintenance above consider the following config:
+ *    set interface address 192.168.1.1/24 GigE0
+ *    ip route add table 2 192.168.1.0/24 GigE0
+ * Traffic destined for 192.168.1.2 in table 2 will generate an ARP request
+ * on GigE0. However, since GigE0 is in table 0, all adj-fibs will be added in
+ * FIB 0. Hence all hosts in the sub-net are unreachable from table 2. To resolve
+ * this, all adj-fib and local prefixes are exported (i.e. copied) from the 
+ * 'export' table 0, to the 'import' table 2. There can be many import tables
+ * for a single export table.
+ *
+ *  - recursive route resolution
+ *
+ *   A recursive route is of the form:
+ *       1.1.1.1/32 via 10.10.10.10
+ * i.e. a route for which no egress interface is provided. In order to forward
+ * traffic to 1.1.1.1/32 the FIB must therefore first determine how to forward
+ * traffic to 10.10.10.10/32. This is recursive resolution.
+ * Recursive resolution, just like normal resolution, proceeds via a longest
+ * prefix match for the 'via-address' 10.10.10.10. Note it is only possible
+ * to add routes via an address (i.e. a /32 or /128) not via a shorter mask
+ * prefix. There is no use case for the latter.
+ * Since recursive resolution proceeds via a longest prefix match, the entry
+ * in the FIB that will resolve the recursive route, termed the via-entry, may
+ * change as other routes are added to the FIB. Consider the recursive
+ * route shown above, and this non-recursive route:
+ *       10.10.10.0/24 via 192.168.16.1 GigE0
+ * The entry for 10.10.10.0/24 is thus the resolving via-entry. If this entry is
+ * modified, to say;
+ *       10.10.10.0/24 via 192.16.1.3 GigE0
+ * Then packets for 1.1.1.1/32 must also be sent to the new next-hop.
+ * Now consider the addition of;
+ *       10.10.10.0/28 via 192.168.16.2 GigE0
+ * The more specific /28 is a better longest prefix match and thus becomes the
+ * via-entry. Removal of the /28 means the resolution will revert to the /24.
+ * Tracking the changes in recursive resolution is a requirement of
+ * the FIB. When the forwarding information of the via-entry changes a back-walk
+ * is used to update dependent recursive routes. When new routes are added to
+ * the table the cover tracking feature provides the necessary notifications to
+ * the via-entry routes.
+ * The adjacency constructed for 1.1.1.1/32 will be a recursive adjacency
+ * whose next adjacency will be contributed from the via-entry. Maintaining
+ * the validity of this recursive adjacency is a requirement of the FIB.
+ *
+ *  - recursive loop avoidance
+ *
+ * Consider this set of routes:
+ *     1.1.1.1/32 via 2.2.2.2
+ *     2.2.2.2/32 via 3.3.3.3
+ *     3.3.3.3/32 via 1.1.1.1
+ * this is termed a recursion loop - all of the routes in the loop are
+ * unresolved in so far as they do not have a resolving adjacency, but each
+ * is resolved because the via-entry is known. It is important here to note
+ * the distinction between the control-plane objects and the data-plane objects
+ * (more details in the implementation section). The control plane objects must
+ * allow the loop to form (i.e. the graph becomes cyclic), however, the
+ * data-plane absolutely must not allow the loop to form, otherwise the packet
+ * would loop indefinitely and never egress the device - meltdown would follow.
+ * The control plane must allow the loop to form, because when the loop breaks,
+ * all members of the loop need to be updated. Forming the loop allows the
+ * dependencies to be correctly setup to allow this to happen.
+ * There is no limit to the depth of recursion supported by VPP so:
+ *    9.9.9.100/32 via 9.9.9.99
+ *    9.9.9.99/32  via 9.9.9.98
+ *    9.9.9.98/32  via 9.9.9.97
+ *      ... turtles, turtles, turtles ...
+ *    9.9.9.1/32 via 10.10.10.10 Gig0
+ * is supported to as many layers of turtles as are desired; however, when
+ * back-walking a graph (in this case from 9.9.9.1/32 up toward 9.9.9.100/32)
+ * a FIB needs to differentiate the case where the recursion is deep versus
+ * the case where the recursion is looped. A simple method, employed by VPP FIB,
+ * is to limit the number of steps. VPP FIB limit is 16. Typical BGP scenarios
+ * in the wild do not exceed 3 (BGP Inter-AS option C).
+ * 
+ * - Fast Convergence
+ * 
+ * After a network topology change, the 'convergence' time, is the time taken
+ * for the router to complete a transition to forward traffic using the new
+ * topology. The convergence time is therefore a summation of the time to;
+ *  - detect the failure.
+ *  - calculate the new 'best path' information
+ *  - download the new best paths to the data-plane.
+ *  - install those best paths in data-plane forwarding.
+ * The last two points are of relevance to VPP architecture. The download API is
+ * binary and batch, details are not discussed here. There is no HW component to
+ * programme, installation time is bounded by the memory allocation and table
+ * lookup and insert access times.
+ *
+ * 'Fast' convergence refers to a set of technologies that a FIB can employ to
+ * completely or partially restore forwarding whilst the convergence actions
+ * listed above are ongoing. Fast convergence technologies are further
+ * sub-divided into Prefix Independent Convergence (PIC) and Loop Free
+ * Alternate path Fast re-route (LFA-FRR or sometimes called IP-FRR) which
+ * affect recursive and non-recursive routes respectively.
+ *
+ * LFA-FRR
+ *
+ * Consider the network topology below:
+ *
+ *          C
+ *        /   \
+ *  X -- A --- B - Y
+ *       |     |
+ *       D     F
+ *        \   /
+ *          E
+ *
+ * all links are equal cost, traffic is passing from X to Y. the best path is
+ * X-A-B-Y. There are two alternative paths, one via C and one via E. An
+ * alternate path is considered to be loop free if no other router on that path
+ * would forward the traffic back to the sender. Consider router C, its best
+ * path to Y is via B, so if A were to send traffic destined to Y to C, then C
+ * would forward that traffic to B - this is a loop-free alternate path. In
+ * contrast consider router D. D's shortest path to Y is via A, so if A were to
+ * send traffic destined to Y via D, then D would send it back to A; this is
+ * not a loop-free alternate path. There are several points of note;
+ *   - we are considering the pre-failure routing topology
+ *   - any equal-cost multi-path between A and B is also a LFA path.
+ *   - in order for A to calculate LFA paths it must be aware of the best-path
+ *     to Y from the perspective of D. These calculations are thus limited to
+ *     routing protocols that have a full view of the network topology, i.e.
+ *     link-state DB protocols like OSPF or an SDN controller. LFA protected
+ *     prefixes are thus non-recursive.
+ *
+ * LFA is specified as a 1 to 1 redundancy; a primary path has only one LFA
+ * (a.k.a. backup) path. To my knowledge this limitation is one of complexity
+ * in the calculation of and capacity planning using a 1-n redundancy. 
+ *
+ * In the event that the link A-B fails, the alternate path via C can be used.
+ * In order to provide 'fast' failover in the event of a failure, the control
+ * plane will download both the primary and the backup path to the FIB. It is
+ * then a requirement of the FIB to perform the failover (a.k.a cutover) from
+ * the primary to the backup path as quickly as possible, and particularly
+ * without any other control-plane intervention. The expectation is cutover is
+ * less than 50 milli-seconds - a value allegedly from the VOIP QoS. Note that
+ * cutover time still includes the fault detection time, which in a virtualised
+ * environment could be the dominant factor. Failure detection can be either a
+ * link down, which will affect multiple paths on a multi-access interface, or
+ * via a specific path heartbeat (i.e. BFD). 
+ * At this time VPP does not support LFA, that is it does not support the
+ * installation of a primary and backup path[s] for a route. However, it does
+ * support ECMP, and VPP FIB is designed to quickly remove failed paths from
+ * the ECMP set, however, it does not insert shared objects specific to the
+ * protected resource into the forwarding object graph, since this would incur
+ * a forwarding/performance cost. Failover time is thus route number dependent.
+ * Details are provided in the implementation section below.
+ *
+ * PIC
+ *
+ * PIC refers to the concept that the convergence time should be independent of
+ * the number of prefixes/routes that are affected by the failure. PIC is
+ * therefore most appropriate when considering networks with a large number of
+ * prefixes, i.e. BGP networks and thus recursive prefixes. There are several
+ * flavours of PIC covering different locations of protection and failure
+ * scenarios. An outline is given below, see the literature for more details:
+ *
+ * Y/16 - CE1 -- PE1---\
+ *                | \   P1---\
+ *                |  \        PE3 -- CE3 - X/16
+ *                |   - P2---/
+ * Y/16 - CE2 -- PE2---/
+ *
+ * CE = customer edge, PE = provider edge. external-BGP runs between customer
+ * and provider, internal-BGP runs between provider and provider.
+ *
+ * 1) iBGP PIC-core: consider traffic from CE1 to X/16 via CE3. On PE1 there
+ *    are routes;
+ *       X/16 (and hundreds of thousands of others like it)
+ *         via PE3
+ *    and
+ *      PE3/32 (its loopback address)
+ *        via 10.0.0.1 Link0 (this is P1)
+ *        via 10.1.1.1 Link1 (this is P2)
+ * the failure is the loss of link0 or link1
+ * As in all PIC scenarios, in order to provide prefix independent convergence
+ * it must be that the route for X/16 (and all other routes via PE3) do not
+ * need to be updated in the FIB. The FIB therefore needs to update a single
+ * object that is shared by all routes - once this shared object is updated,
+ * then all routes using it will be instantly updated to use the new forwarding
+ * information. In this case the shared object is the resolving route via PE3.
+ * Once the route via PE3 is updated via IGP (OSPF) convergence, then all
+ * recursive routes that resolve through it are also updated. VPP FIB
+ * implements this scenario via a recursive-adjacency. X/16 and its sibling
+ * routes share a recursive-adjacency that links to/points at/stacks on the
+ * normal adjacency contributed by the route for PE3. Once this shared
+ * recursive adj is re-linked then all routes are switched to using the new
+ * forwarding information. This is shown below;
+ *
+ * pre-failure;
+ *   X/16 --> R-ADJ-1 --> ADJ-1-PE3 (multi-path via P1 and P2)
+ *
+ * post-failure:
+ *   X/16 --> R-ADJ-1 --> ADJ-2-PE3 (single path via P1)
+ *
+ * note that R-ADJ-1 (the recursive adj) remains in the forwarding graph,
+ * therefore X/16 (and all its siblings) is not updated.
+ * X/16 and its siblings share the recursive adj since they share the same
+ * path-list. It is the path-list object that contributes the recursive-adj
+ * (see next section for more details)
+ *
+ *
+ * 2) iBGP PIC-edge; Traffic from CE3 to Y/16. On PE3 there are routes;
+ *      Y/16  (and hundreds of thousands of others like it)
+ *        via PE1
+ *        via PE2 
+ *  and
+ *     PE1/32 (PE1's loopback address)
+ *       via 10.0.2.2 Link0 (this is P1)
+ *     PE2/32 (PE2's loopback address)
+ *       via 10.0.3.3 Link1 (this is P2)
+ *
+ * the failure is the loss of reachability to PE2. this could be either the
+ * loss of the link P2-PE2 or the loss of the node PE2. This is detected either
+ * by the withdrawal of the PE2's loopback route or by some form of failure
+ * detection (i.e. BFD).
+ * VPP FIB again provides PIC via the use of the shared recursive-adj. Y/16 and
+ * its siblings will again share a path-list for the list {PE1,PE2}, this
+ * path-list will contribute a multi-path-recursive-adj, i.e. a multi-path-adj
+ * with each choice therein being another adj;
+ *
+ *  Y/16 -> RM-ADJ --> ADJ1 (for PE1)
+ *                 --> ADJ2 (for PE2)
+ *
+ * when the route for PE2 is withdrawn then the multi-path-recursive-adjacency
+ * is updated to be;
+ *
+ * Y/16 --> RM-ADJ --> ADJ1 (for PE1)
+ *                 --> ADJ1 (for PE1)
+ *
+ * that is both choices in the ECMP set are the same and thus all traffic is
+ * forwarded to PE1. Eventually the control plane will download a route update
+ * for Y/16 to be via PE1 only. At that time the situation will be:
+ *
+ * Y/16 -> R-ADJ --> ADJ1 (for PE1)
+ *
+ * In the scenario above we assumed that PE1 and PE2 are ECMP for Y/16. eBGP
+ * PIC core is also specified for the case where one PE is primary and the other
+ * backup - VPP FIB does not support that case at this time.
+ *
+ * 3) eBGP PIC Edge; Traffic from CE3 to Y/16. On PE1 there are routes;
+ *      Y/16 (and hundreds of thousands of others like it)
+ *         via CE1 (primary)
+ *         via PE2 (backup)
+ *   and
+ *     CE1 (this is an adj-fib)
+ *       via 11.0.0.1 Link0 (this is CE1) << this is an adj-fib
+ *     PE2 (PE2's loopback address)
+ *       via 10.0.5.5 Link1 (this is link PE1-PE2)
+ * the failure is the loss of link0 to CE1. The failure can be detected by FIB
+ * either as a link down event or by the control plane withdrawing the connected
+ * prefix on the link0 (say 10.0.5.4/30). The latter works because the resolving
+ * entry is an adj-fib, so removing the connected will withdraw the adj-fib, and
+ * hence the recursive path becomes unresolved. The former is faster,
+ * particularly in the case of Inter-AS option A where there are many VLAN
+ * sub-interfaces on the PE-CE link, one for each VRF, and so the control plane
+ * must remove the connected prefix for each sub-interface to trigger PIC in
+ * each VRF. Note though that total PIC cutover time will depend on VRF scale
+ * with either trigger.
+ * Primary and backup paths in this eBGP PIC-edge scenario are calculated by
+ * BGP. Each peer is configured to always advertise its best external path to
+ * its iBGP peers. Backup paths therefore send traffic from the PE back into the
+ * core to an alternate PE. A PE may have multiple external paths, i.e. multiple
+ * directly connected CEs, it may also have multiple backup PEs, however there
+ * is no correlation between the two, so unlike LFA-FRR, the redundancy model is
+ * N-M; N primary paths are backed-up by M backup paths - only when all primary
+ * paths fail, then the cutover is performed onto the M backup paths. Note that
+ * PE2 must be suitably configured to forward traffic on its external path that
+ * was received from PE1. VPP FIB does not support external-internal-BGP (eiBGP)
+ * load-balancing.
+ *
+ * As with LFA-FRR the use of primary and backup paths is not currently
+ * supported, however, the use of a recursive-multi-path-adj, and a suitably
+ * constrained hashing algorithm to choose from the primary or backup path sets,
+ * would again provide the necessary shared object and hence the prefix scale
+ * independent cutover.
+ *
+ * Astute readers will recognise that both of the eBGP PIC scenarios refer only
+ * to a BGP free core.
+ *
+ * Fast convergence implementation options come in two flavours:
+ *  1) Insert switches into the data-path. The switch represents the protected
+ *     resource. If the switch is 'on' the primary path is taken, otherwise
+ *     the backup path is taken. Testing the switch in the data-path comes with
+ *     an associated performance cost. A given packet may encounter more than
+ *     one protected resource as it is forwarded. This approach minimises
+ *     cutover times as packets will be forwarded on the backup path as soon
+ *     as the protected resource is detected to be down and the single switch
+ *     is tripped. However, it comes at a performance cost, which increases
+ *     with each shared resource a packet encounters in the data-path.
+ *     This approach is thus best suited to LFA-FRR where the protected routes
+ *     are non-recursive (i.e. encounter few shared resources) and the
+ *     expectation on cutover times is more stringent (<50msecs).
+ *  2) Update shared objects. Identify objects in the data-path, that are
+ *     required to be present whether or not fast convergence is required (i.e.
+ *     adjacencies) that can be shared by multiple routes. Create a dependency
+ *     between these objects at the protected resource. When the protected
+ *     resource fails, each of the shared objects is updated in a way that all
+ *     users of it see a consistent change. This approach incurs no performance
+ *     penalty as the data-path structure is unchanged, however, the cutover
+ *     times are longer as more work is required when the resource fails. This
+ *     scheme is thus more appropriate to recursive prefixes (where the packet
+ *     will encounter multiple protected resources) and to fast-convergence
+ *     technologies where the cutover times are less stringent (i.e. PIC).
+ *
+ * Implementation:
+ * ---------------
+ *
+ * Due to the requirements outlined above, not all routes known to FIB
+ * (e.g. adj-fibs) are installed in forwarding. However, should circumstances
+ * change, those routes will need to be added. This adds the requirement that
+ * a FIB maintains two tables per-VRF, per-AF (where a 'table' is indexed by
+ * prefix); the forwarding and non-forwarding tables.
+ *
+ * For DP speed in VPP we want the lookup in the forwarding table to directly 
+ * result in the ADJ. So the two tables; one contains all the routes (a 
+ * lookup therein yields a fib_entry_t), the other contains only the forwarding 
+ * routes (a lookup therein yields an ip_adjacency_t). The latter is used by the
+ * DP. 
+ * This trades memory for forwarding performance. A good trade-off in VPP's
+ * expected operating environments.
+ *
+ * Note these tables are keyed only by the prefix (and since there are two
+ * per-VRF, implicitly by the VRF too). The key for an adjacency is the
+ * tuple:{next-hop, address (and its AF), interface, link/ether-type}.
+ * consider this curious, but allowed, config;
+ *
+ *   set int ip addr 10.0.0.1/24 Gig0
+ *   set ip arp Gig0 10.0.0.2 dead.dead.dead
+ *   # a host in that sub-net is routed via a better next hop (say it avoids a
+ *   # big L2 domain)
+ *   ip route add 10.0.0.2 Gig1 192.168.1.1
+ *   # this recursive should go via Gig1
+ *   ip route add 1.1.1.1/32 via 10.0.0.2
+ *   # this non-recursive should go via Gig0
+ *   ip route add 2.2.2.2/32 via Gig0 10.0.0.2
+ *
+ * for the last route, the lookup for the path (via {Gig0, 10.0.0.2}) in the
+ * prefix table would not yield the correct result. To fix this we need a
+ * separate table for the adjacencies.
+ *
+ *  - FIB data structures;
+ *
+ * fib_entry_t:
+ *   - a representation of a route.
+ *     - has a prefix.
+ *    - it maintains an array of path-lists that have been contributed by the
+ *      different sources
+ *    - install an adjacency in the forwarding table contributed by the best
+ *      source's path-list.
+ *
+ * fib_path_list_t:
+ *   - a list of paths
+ *   - path-lists may be shared between FIB entries. The path-lists are thus
+ *     kept in a DB. The key is the combined description of the paths. We share
+ *     path-lists  when it will aid convergence to do so. Adding path-lists to
+ *     this DB that are never shared, or are not shared by prefixes that are
+ *     not subject to PIC, will increase the size of the DB unnecessarily and
+ *     may lead to increased search times due to hash collisions.
+ *   - the path-list contributes the appropriate adj for the entry in the 
+ *     forwarding table. The adj can be 'normal', multi-path or recursive,
+ *     depending on the number of paths and their types.
+ *   - since path-lists are shared there is only one instance of the multi-path 
+ *     adj that they [may] create. As such multi-path adjacencies do not need a
+ *     separate DB.
+ * The path-list with recursive paths and the recursive adjacency that it
+ * contributes forms the backbone of the fast convergence architecture (as 
+ * described previously). 
+ *
+ * fib_path_t:
+ *   - a description of how to forward the traffic (i.e. via {Gig1, K}).
+ *   - the path describes the intent on how to forward. This differs from how 
+ *     the path resolves. I.e. it might not be resolved at all (since the
+ *     interface is deleted or down).
+ *   - paths have different types, most notably recursive or non-recursive.
+ *   - a fib_path_t will contribute the appropriate adjacency object. It is from
+ *     these contributions that the DP graph/chain for the route is built.
+ *   - if the path is recursive and a recursion loop is detected, then the path
+ *     will contribute the special DROP adjacency. This way, whilst the control
+ *     plane graph is looped, the data-plane graph does not.
+ *
+ * we build a graph of these objects;
+ *
+ *  fib_entry_t -> fib_path_list_t -> fib_path_t -> ...
+ *
+ * for recursive paths:
+ *
+ *   fib_path_t -> fib_entry_t -> ....
+ *
+ * for non-recursive paths
+ *
+ *  fib_path_t -> ip_adjacency_t -> interface
+ *
+ * These objects, which constitute the 'control plane' part of the FIB are used
+ * to represent the resolution of a route. As a whole this is referred to as the
+ * control plane graph. There is a separate DP graph to represent the forwarding
+ * of a packet. In the DP graph each object represents an action that is applied
+ * to a packet as it traverses the graph. For example, a lookup of an IP address
+ * in the forwarding table could result in the following graph:
+ *
+ *    recursive-adj --> multi-path-adj --> interface_A
+ *                                     --> interface_B
+ *
+ * A packet traversing this FIB DP graph would thus also traverse a VPP node
+ * graph of:
+ *
+ *    ipX_recursive --> ipX_rewrite --> interface_A_tx --> etc
+ *
+ * The taxonomy of objects in a FIB graph is as follows, consider;
+ *
+ *   A -->  
+ *   B --> D
+ *   C -->
+ *
+ * Where A,B and C are (for example) routes that resolve through D. 
+ *  parent; D is the parent of A, B, and C.
+ *  children: A, B, and C are children of D. 
+ *  sibling: A, B and C are siblings of one another.
+ *
+ * All shared objects in the FIB are reference counted. Users of these objects
+ * are thus expected to use the add_lock/unlock semantics (as one would
+ * normally use malloc/free).
+ *
+ * WALKS
+ *
+ * It is necessary to walk/traverse the graph forwards (entry to interface) to
+ * perform a collapse or build a recursive adj and backwards (interface
+ * to entry) to perform updates, i.e. when interface state changes or when
+ * recursive route resolution updates occur.
+ * A forward walk follows simply by navigating an object's parent pointer to
+ * access its parent object. For objects with multiple parents (e.g. a 
+ * path-list), each parent is walked in turn.
+ * To support back-walks direct dependencies are maintained between objects,
+ * i.e. in the relationship, {A, B, C} --> D, then object D will maintain a list
+ * of 'pointers' to its children {A, B, C}. Bare C-language pointers are not 
+ * allowed, so a pointer is described in terms of an object type (i.e. entry,
+ * path-list, etc) and index - this allows the object to be retrieved from the
+ * appropriate pool. A list is maintained to achieve fast convergence at scale.
+ * When there are millions of recursive prefixes, it is very inefficient to
+ * blindly walk the tables looking for entries that were affected by a given
+ * topology change. The lowest hanging fruit when optimising is to remove
+ * actions that are not required, so all back-walks only traverse objects that
+ * are directly affected by the change.
+ *
+ * PIC Core and fast-reroute rely on FIB reacting quickly to an interface
+ * state change to update the multi-path-adjacencies that use this interface.
+ * An example graph is shown below:
+ *
+ *    E_a -->
+ *    E_b --> PL_2 --> P_a --> Interface_A
+ *    ...          --> P_c -\
+ *    E_k -->                \
+ *                            Interface_K
+ *                            /
+ *    E_l -->                /
+ *    E_m --> PL_1 --> P_d -/ 
+ *    ...          --> P_f --> Interface_F
+ *    E_z -->
+ *
+ * E  = fib_entry_t
+ * PL = fib_path_list_t
+ * P  = fib_path_t 
+ * The subscripts are arbitrary and serve only to distinguish object instances.
+ * This CP graph results in the following DP graph:
+ *
+ *     M-ADJ-2 --> Interface_A
+ *             \
+ *              -> Interface_K
+ *             / 
+ *     M-ADJ-1 --> Interface_F
+ *
+ * M-ADJ = multi-path-adjacency.
+ *
+ * When interface K goes down a back-walk is started over its dependants in the
+ * control plane graph. This back-walk will reach PL_1 and PL_2 and result in
+ * the calculation of new adjacencies that have interface K removed. The walk
+ * will continue to the entry objects and thus the forwarding table is updated
+ * for each prefix with the new adjacency. The DP graph then becomes:
+ *
+ *    ADJ-3 --> Interface_A
+ *
+ *    ADJ-4 --> Interface_F
+ * 
+ * The eBGP PIC scenarios described above relied on the update of a path-list's
+ * recursive-adjacency to provide the shared point of cutover. This is shown
+ * below
+ *
+ *    E_a -->
+ *    E_b --> PL_2 --> P_a --> E_44 --> PL_a --> P_b --> Interface_A
+ *    ...          --> P_c -\
+ *    E_k -->                \
+ *                            \
+ *                           E_1 --> PL_k -> P_k --> Interface_K
+ *                            /
+ *    E_l -->                /
+ *    E_m --> PL_1 --> P_d -/ 
+ *    ...          --> P_f --> E_55 --> PL_e --> P_e --> Interface_E
+ *    E_z -->
+ *
+ * The failure scenario is the removal of entry E_1 and thus the paths P_c and
+ * P_d become unresolved. To achieve PIC the two shared recursive path-lists,
+ * PL_1 and PL_2 must be updated to remove E_1 from the recursive-multi-path-
+ * adjacencies that they contribute, before any entry E_a to E_z is updated.
+ * This means that as the update propagates backwards (right to left) in the
+ * graph it must do so breadth first not depth first. Note this approach leads
+ * to convergence times that are dependent on the number of path-lists and so
+ * the number of combinations of egress PEs - this is desirable as this
+ * scale is considerably lower than the number of prefixes.
+ *
+ * If we consider another section of the graph that is similar to the one
+ * shown above where there is another prefix E_2 in a similar position to E_1
+ * and so also has many dependent children. It is reasonable to expect that a
+ * particular network failure may simultaneously render E_1 and E_2 unreachable.
+ * This means that the update to withdraw E_2 is downloaded immediately after the
+ * update to withdraw E_1. It is a requirement on the FIB to not spend large
+ * amounts of time in a back-walk whilst processing the update for E_1, i.e. the
+ * back-walk must not reach as far as E_a and its siblings. Therefore, after the
+ * back-walk has traversed one generation (breadth first) to update all the
+ * path-lists it should be suspended/back-grounded and further updates allowed
+ * to be handled. Once the update queue is empty, the suspended walks can be
+ * resumed. Note that in the case that multiple updates affect the same entry
+ * (say E_1) then this will trigger multiple similar walks, these are merged,
+ * so each child is updated only once.
+ * In the presence of more layers of recursion PIC is still a desirable
+ * feature. Consider an extension to the diagram above, where more recursive
+ * routes (E_100 -> E_200) are added as children of E_a:
+ *
+ * E_100 -->
+ * E_101 --> PL_3 --> P_j-\
+ * ...                     \
+ * E_199 -->               E_a -->
+ *                         E_b --> PL_2 --> P_a --> E_44 --> ...etc..
+ *                         ...          --> P_c -\
+ *                         E_k                    \
+ *                                                E_1 --> ...etc..
+ *                                                 /
+ *                         E_l -->                /
+ *                         E_m --> PL_1 --> P_d -/ 
+ *                         ...          --> P_e --> E_55 --> ...etc..
+ *                         E_z -->
+ *
+ * To achieve PIC for the routes E_100->E_199, PL_3 needs to be updated before
+ * E_b -> E_z, a breadth first traversal at each level would not achieve this.
+ * Instead the walk must proceed intelligently. Children on PL_2 are sorted so
+ * those Entry objects that themselves have children appear first in the list,
+ * those without later. When an entry object is walked that has children, a
+ * walk of its children is pushed to the front of the background queue. The back
+ * ground queue is a priority queue. As the breadth first traversal proceeds
+ * across the dependent entry object E_a to E_k, when the first entry that does
+ * not have children is reached (E_b), the walk is suspended and placed at the
+ * back of the queue. Following this prioritisation method shared path-list
+ * updates are performed before all non-resolving entry objects.
+ * The CPU/core/thread that handles the updates is the same thread that handles
+ * the back-walks. Handling updates has a higher priority than making walk
+ * progress, so a walk is required to be interruptible/suspendable when new
+ * updates are available.
+ * !!! TODO - this section describes how walks should be not how they are !!!
+ *
+ * In the diagram above E_100 is an IP route, however, VPP has no restrictions
+ * on the type of object that can be a dependent of a FIB entry. Children of
+ * a FIB entry can be (and are) GRE & VXLAN tunnel endpoints, L2VPN LSPs etc.
+ * By including all object types into the graph and extending the back-walk, we
+ * can thus deliver fast convergence to technologies that overlay on an IP
+ * network.
+ *
+ * If having read all the above carefully you are still thinking;  'i don't need
+ * all this %&$* i have a route only I know about and I just need to jam it in',
+ * then fib_table_entry_special_add() is your only friend.
+ */
+
+#ifndef __FIB_H__
+#define __FIB_H__
+
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/fib_entry.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/fib/ip6_fib.h>
+
+#endif
diff --git a/vnet/vnet/fib/fib_attached_export.c b/vnet/vnet/fib/fib_attached_export.c
new file mode 100644 (file)
index 0000000..afc953a
--- /dev/null
@@ -0,0 +1,524 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/fib/fib_entry.h>
+#include <vnet/fib/fib_table.h>
+
+#include "fib_attached_export.h"
+#include "fib_entry_cover.h"
+#include "fib_entry_src.h"
+
+/**
+ * A description of the need to import routes from the export table
+ */
+typedef struct fib_ae_import_t_
+{
+    /**
+     * The entry in the export table that this importer
+     * is importing covered entries from
+     */
+    fib_node_index_t faei_export_entry;
+
+    /**
+     * The attached entry in the import table
+     */
+    fib_node_index_t faei_import_entry;
+    /**
+     * The sibling index on the cover, as returned by
+     * fib_entry_cover_track(). ~0 when the export entry is not tracked
+     */
+    u32 faei_export_sibling;
+
+    /**
+     * The index of the exporter tracker. Not set if the
+     * export entry is not valid for export
+     */
+    fib_node_index_t faei_exporter;
+
+    /**
+     * A vector/list of imported entry indices. Each entry in this
+     * list is locked for as long as it remains imported
+     */
+    fib_node_index_t *faei_importeds;
+
+    /**
+     * The FIB index and prefix we are tracking
+     */
+    fib_node_index_t faei_export_fib;
+    fib_prefix_t faei_prefix;
+
+    /**
+     * The FIB index we are importing into
+     */
+    fib_node_index_t faei_import_fib;
+} fib_ae_import_t;
+
+/**
+ * A description of the need to export routes to one or more export tables
+ */
+typedef struct fib_ae_export_t_ {
+    /**
+     * The vector/list of import tracker indices, i.e. indices into
+     * the import tracker pool
+     */
+    fib_node_index_t *faee_importers;
+
+    /**
+     * The connected entry this export is acting on behalf of
+     */
+    fib_node_index_t faee_ei;
+
+    /**
+     * Reference counting locks. One lock is taken per importer; the
+     * tracker is returned to its pool when the count drops to zero
+     */
+    u32 faee_locks;
+} fib_ae_export_t;
+
+/*
+ * memory pools for the importers and exporters. Trackers are referred to
+ * by their index in these pools (see fe_import and fe_export on the entry).
+ */
+static fib_ae_import_t *fib_ae_import_pool;
+static fib_ae_export_t *fib_ae_export_pool;
+
+/**
+ * \brief Find, or create, the export tracker for an entry and take a lock
+ *        on it.
+ *
+ * \param connected index of the entry that is exporting
+ * \return the (locked) export tracker
+ */
+static fib_ae_export_t *
+fib_entry_ae_add_or_lock (fib_node_index_t connected)
+{
+    fib_entry_t *fib_entry = fib_entry_get(connected);
+    fib_ae_export_t *tracker;
+
+    if (FIB_NODE_INDEX_INVALID != fib_entry->fe_export)
+    {
+        /*
+         * the entry is already exporting. re-use its tracker
+         */
+        tracker = pool_elt_at_index(fib_ae_export_pool, fib_entry->fe_export);
+    }
+    else
+    {
+        /*
+         * first exporter for this entry. allocate a zeroed tracker
+         * and link it with the entry in both directions
+         */
+        pool_get(fib_ae_export_pool, tracker);
+        memset(tracker, 0, sizeof(*tracker));
+
+        tracker->faee_ei = connected;
+        fib_entry->fe_export = (tracker - fib_ae_export_pool);
+    }
+
+    tracker->faee_locks += 1;
+
+    return (tracker);
+}
+
+/**
+ * \brief Remove a previously imported entry from the importer's FIB.
+ *        Does nothing if the entry is not in the importer's list.
+ *
+ * \param import      the import tracker
+ * \param entry_index index of the entry (in the export table) to remove
+ */
+static void
+fib_entry_import_remove (fib_ae_import_t *import,
+                         fib_node_index_t entry_index)
+{
+    fib_prefix_t pfx;
+    u32 pos;
+
+    /*
+     * locate the entry in the list of those imported
+     */
+    pos = vec_search(import->faei_importeds, entry_index);
+
+    if (pos >= vec_len(import->faei_importeds))
+    {
+        /*
+         * never imported, so there is nothing to undo
+         */
+        return;
+    }
+
+    /*
+     * withdraw the AE source from the prefix in the import table,
+     * then release the lock taken when the entry was imported
+     */
+    fib_entry_get_prefix(entry_index, &pfx);
+
+    fib_table_entry_special_remove(import->faei_import_fib,
+                                   &pfx,
+                                   FIB_SOURCE_AE);
+
+    fib_entry_unlock(entry_index);
+    vec_del1(import->faei_importeds, pos);
+}
+
+/**
+ * \brief Import an entry from the export table into the importer's FIB.
+ *
+ * The entry is skipped if it is already imported, if its prefix is the
+ * same as the import entry's prefix, or if it contributes no valid
+ * forwarding.
+ *
+ * \param import      the import tracker
+ * \param entry_index index of the entry (in the export table) to import
+ */
+static void
+fib_entry_import_add (fib_ae_import_t *import,
+                      fib_node_index_t entry_index)
+{
+    const dpo_id_t *fwd;
+    fib_prefix_t pfx;
+    u32 ii;
+
+    /*
+     * sourcing a prefix in a table is reference counted, so an
+     * exported entry must be added at most once
+     */
+    for (ii = 0; ii < vec_len(import->faei_importeds); ii++)
+    {
+        if (entry_index == import->faei_importeds[ii])
+        {
+            return;
+        }
+    }
+
+    fib_entry_get_prefix(entry_index, &pfx);
+
+    /*
+     * the import entry's own prefix is not imported
+     */
+    if (0 == fib_prefix_cmp(&pfx, &import->faei_prefix))
+    {
+        return;
+    }
+
+    fwd = fib_entry_contribute_ip_forwarding(entry_index);
+
+    /*
+     * an entry without valid forwarding is not imported now. when it
+     * gains forwarding it will export itself
+     */
+    if (!dpo_id_is_valid(fwd))
+    {
+        return;
+    }
+
+    /*
+     * first import of this entry. add it to the import FIB, using the
+     * first bucket of the load-balance it contributes, and remember it
+     */
+    fib_table_entry_special_dpo_add(import->faei_import_fib,
+                                    &pfx,
+                                    FIB_SOURCE_AE,
+                                    FIB_ENTRY_FLAG_EXCLUSIVE,
+                                    load_balance_get_bucket(fwd->dpoi_index, 0));
+
+    fib_entry_lock(entry_index);
+    vec_add1(import->faei_importeds, entry_index);
+}
+
+/**
+ * Cover-walk callback: import one covered entry.
+ *
+ * \param cover   the cover being walked (unused)
+ * \param covered index of the covered entry to import
+ * \param ctx     the fib_ae_import_t to import into
+ * \return always 0
+ */
+static int
+fib_entry_covered_walk_import (fib_entry_t *cover,
+                               fib_node_index_t covered,
+                               void *ctx)
+{
+    fib_entry_import_add((fib_ae_import_t *) ctx, covered);
+
+    return (0);
+}
+
+/*
+ * fib_ae_export_import_add
+ *
+ * Attach an importer to an exporter: record the exporter on the import
+ * tracker, then import every entry covered by the exporting entry.
+ */
+static void
+fib_ae_export_import_add (fib_ae_export_t *export,
+                          fib_ae_import_t *import)
+{
+    import->faei_exporter = (export - fib_ae_export_pool);
+
+    fib_entry_cover_walk(fib_entry_get(export->faee_ei),
+                         fib_entry_covered_walk_import,
+                         import);
+}
+
+/**
+ * \brief Create an import tracker for an attached entry and import the
+ *        entries covered by the matching entry in the export FIB.
+ *
+ * The entry's prefix is looked up in the export FIB. If there is an exact
+ * match and it is attached, its covered entries are imported into the
+ * entry's own FIB. If there is no exact match, nothing is imported and the
+ * result of a non-exact lookup is tracked instead.
+ *
+ * \param fib_entry  the attached entry in the import table
+ * \param export_fib index of the FIB to import from
+ */
+void
+fib_attached_export_import (fib_entry_t *fib_entry,
+                            fib_node_index_t export_fib)
+{
+    fib_ae_import_t *import;
+    fib_node_index_t fei;
+
+    /*
+     * save the entry's index so the entry can be re-fetched once the
+     * imports have been added
+     */
+    fei = fib_entry_get_index(fib_entry);
+
+    pool_get(fib_ae_import_pool, import);
+
+    import->faei_import_fib = fib_entry->fe_fib_index;
+    import->faei_export_fib = export_fib;
+    import->faei_prefix = fib_entry->fe_prefix;
+    import->faei_import_entry = fei;
+    import->faei_export_sibling = ~0;
+
+    /*
+     * the tracker is not zeroed after pool_get, so give the remaining
+     * fields defined values. without this a recycled tracker could carry
+     * a stale exporter index into the purge when the export entry turns
+     * out not to be attached.
+     */
+    import->faei_exporter = FIB_NODE_INDEX_INVALID;
+    import->faei_importeds = NULL;
+
+    /*
+     * do an exact match in the export table
+     */
+    import->faei_export_entry =
+        fib_table_lookup_exact_match(import->faei_export_fib,
+                                     &import->faei_prefix);
+
+    if (FIB_NODE_INDEX_INVALID == import->faei_export_entry)
+    {
+        /*
+         * no exact matching entry in the export table. can't be good.
+         * track the next best thing
+         */
+        import->faei_export_entry =
+            fib_table_lookup(import->faei_export_fib,
+                             &import->faei_prefix);
+    }
+    else
+    {
+        /*
+         * found the entry in the export table. import the
+         * prefixes that it covers.
+         * only if the prefix found in the export FIB really is
+         * attached do we want to import its covered
+         */
+        if (FIB_ENTRY_FLAG_ATTACHED &
+            fib_entry_get_flags_i(fib_entry_get(import->faei_export_entry)))
+        {
+            fib_ae_export_t *export;
+
+            export = fib_entry_ae_add_or_lock(import->faei_export_entry);
+            vec_add1(export->faee_importers, (import - fib_ae_import_pool));
+            fib_ae_export_import_add(export, import);
+        }
+    }
+
+    /*
+     * NOTE(review): importing adds entries to the import FIB. this assumes
+     * that can allocate new fib_entry_t objects and so move the entry
+     * pool, which would leave the pointer passed in dangling. re-fetch it
+     * from the saved index before it is used again.
+     */
+    fib_entry = fib_entry_get(fei);
+
+    /*
+     * track the entry in the export table so we can update appropriately
+     * when it changes
+     */
+    import->faei_export_sibling =
+        fib_entry_cover_track(fib_entry_get(import->faei_export_entry),
+                              fei);
+
+    fib_entry->fe_import = (import - fib_ae_import_pool);
+}
+
+/**
+ * \brief All the imported entries need to be purged
+ *
+ * Removes every entry this importer added to its FIB, stops tracking the
+ * export entry, detaches the importer from its exporter (freeing the
+ * exporter when its last lock is released) and frees the import tracker.
+ * Does nothing if the entry has no import tracker.
+ *
+ * \param fib_entry the attached entry in the import table
+ */
+void
+fib_attached_export_purge (fib_entry_t *fib_entry)
+{
+    if (FIB_NODE_INDEX_INVALID != fib_entry->fe_import)
+    {
+        fib_node_index_t *import_index;
+        fib_entry_t *export_entry;
+        fib_ae_import_t *import;
+        fib_ae_export_t *export;
+        u32 index;
+
+        import = pool_elt_at_index(fib_ae_import_pool,
+                                   fib_entry->fe_import);
+
+        /*
+         * remove each imported entry
+         */
+        vec_foreach(import_index, import->faei_importeds)
+        {
+            fib_prefix_t prefix;
+
+            fib_entry_get_prefix(*import_index, &prefix);
+
+            fib_table_entry_delete(import->faei_import_fib,
+                                   &prefix,
+                                   FIB_SOURCE_AE);
+            fib_entry_unlock(*import_index);
+        }
+        vec_free(import->faei_importeds);
+
+        /*
+         * stop tracking the export entry
+         */
+        if (~0 != import->faei_export_sibling)
+        {
+            fib_entry_cover_untrack(fib_entry_get(import->faei_export_entry),
+                                    import->faei_export_sibling);
+        }
+        import->faei_export_sibling = ~0;
+
+        /*
+         * remove this import tracker from the export's list,
+         * if it is attached to one. It won't be in the case the tracked
+         * export entry is not an attached exact match.
+         */
+        if (FIB_NODE_INDEX_INVALID != import->faei_exporter)
+        {
+            export_entry = fib_entry_get(import->faei_export_entry);
+            ASSERT(FIB_NODE_INDEX_INVALID != export_entry->fe_export);
+            export = pool_elt_at_index(fib_ae_export_pool,
+                                       export_entry->fe_export);
+
+            index = vec_search(export->faee_importers,
+                               (import - fib_ae_import_pool));
+
+            ASSERT(index < vec_len(export->faee_importers));
+            vec_del1(export->faee_importers, index);
+
+            /*
+             * free the exporter if there are no longer importers
+             */
+            if (0 == --export->faee_locks)
+            {
+                /*
+                 * vec_del1 only shortens the vector. release its memory
+                 * here, since the tracker is zeroed when it is next
+                 * allocated and the pointer would otherwise be lost.
+                 */
+                vec_free(export->faee_importers);
+                pool_put(fib_ae_export_pool, export);
+                export_entry->fe_export = FIB_NODE_INDEX_INVALID;
+            }
+        }
+
+        /*
+         * free the import tracker
+         */
+        pool_put(fib_ae_import_pool, import);
+        fib_entry->fe_import = FIB_NODE_INDEX_INVALID;
+    }
+}
+
+/**
+ * \brief An entry has become covered by 'cover'. If the cover is
+ *        exporting, import the new covered entry into each importer.
+ *
+ * \param cover   the covering entry
+ * \param covered index of the newly covered entry
+ */
+void
+fib_attached_export_covered_added (fib_entry_t *cover,
+                                   fib_node_index_t covered)
+{
+    fib_ae_export_t *exporter;
+    fib_ae_import_t *importer;
+    u32 ii;
+
+    if (FIB_NODE_INDEX_INVALID == cover->fe_export)
+    {
+        /*
+         * the covering prefix is not exporting to other tables
+         */
+        return;
+    }
+
+    exporter = pool_elt_at_index(fib_ae_export_pool, cover->fe_export);
+
+    /*
+     * hand the covered entry to every importer of this cover
+     */
+    for (ii = 0; ii < vec_len(exporter->faee_importers); ii++)
+    {
+        importer = pool_elt_at_index(fib_ae_import_pool,
+                                     exporter->faee_importers[ii]);
+
+        fib_entry_import_add(importer, covered);
+    }
+}
+
+/**
+ * \brief An entry is no longer covered by 'cover'. If the cover is
+ *        exporting, remove the covered entry from each importer.
+ *
+ * \param cover   the covering entry
+ * \param covered index of the entry that is no longer covered
+ */
+void
+fib_attached_export_covered_removed (fib_entry_t *cover,
+                                     fib_node_index_t covered)
+{
+    fib_ae_export_t *exporter;
+    fib_ae_import_t *importer;
+    u32 ii;
+
+    if (FIB_NODE_INDEX_INVALID == cover->fe_export)
+    {
+        /*
+         * the covering prefix is not exporting to other tables
+         */
+        return;
+    }
+
+    exporter = pool_elt_at_index(fib_ae_export_pool, cover->fe_export);
+
+    /*
+     * withdraw the covered entry from every importer of this cover
+     */
+    for (ii = 0; ii < vec_len(exporter->faee_importers); ii++)
+    {
+        importer = pool_elt_at_index(fib_ae_import_pool,
+                                     exporter->faee_importers[ii]);
+
+        fib_entry_import_remove(importer, covered);
+    }
+}
+
+/**
+ * \brief Re-evaluate what an importing entry imports: purge everything
+ *        imported so far, then import afresh from the same export FIB.
+ *        Does nothing if the entry has no import tracker.
+ */
+static void
+fib_attached_export_cover_modified_i (fib_entry_t *fib_entry)
+{
+    fib_ae_import_t *tracker;
+    u32 saved_export_fib;
+
+    if (FIB_NODE_INDEX_INVALID == fib_entry->fe_import)
+    {
+        return;
+    }
+
+    /*
+     * the purge frees the import tracker, so copy out the export FIB
+     * index before it goes
+     */
+    tracker = pool_elt_at_index(fib_ae_import_pool, fib_entry->fe_import);
+    saved_export_fib = tracker->faei_export_fib;
+
+    /*
+     * keep it simple: drop all that was imported, then start over
+     */
+    fib_attached_export_purge(fib_entry);
+    fib_attached_export_import(fib_entry, saved_export_fib);
+}
+
+/**
+ * \brief If this entry is tracking a cover (in another table)
+ *        then that cover has changed. Re-evaluate the import by purging
+ *        what was imported and importing again. No-op if the entry has
+ *        no import tracker.
+ */
+void
+fib_attached_export_cover_change (fib_entry_t *fib_entry)
+{
+    fib_attached_export_cover_modified_i(fib_entry);
+}
+
+/**
+ * \brief If this entry is tracking a cover (in another table)
+ *        then that cover has been updated. Re-evaluate the import by
+ *        purging what was imported and importing again. No-op if the
+ *        entry has no import tracker.
+ */
+void
+fib_attached_export_cover_update (fib_entry_t *fib_entry)
+{
+    fib_attached_export_cover_modified_i(fib_entry);
+}
+
+/**
+ * \brief Append a description of an import tracker to a format string.
+ *
+ * \param import_index index of the import tracker; nothing is appended
+ *                     when this is FIB_NODE_INDEX_INVALID
+ * \param s            the string to append to
+ * \return the (possibly extended) string
+ */
+u8*
+fib_ae_import_format (fib_node_index_t import_index,
+                      u8* s)
+{
+    fib_node_index_t *index;
+    fib_ae_import_t *import;
+
+    if (FIB_NODE_INDEX_INVALID == import_index)
+    {
+        return (s);
+    }
+
+    import = pool_elt_at_index(fib_ae_import_pool, import_index);
+
+    /*
+     * print the index we were given rather than a pointer difference,
+     * whose type does not match the %d conversion
+     */
+    s = format(s, "\n  Attached-Import:%d:[", import_index);
+    s = format(s, "export-prefix:%U ", format_fib_prefix, &import->faei_prefix);
+    s = format(s, "export-entry:%d ", import->faei_export_entry);
+    s = format(s, "export-sibling:%d ", import->faei_export_sibling);
+    s = format(s, "exporter:%d ", import->faei_exporter);
+    s = format(s, "export-fib:%d ", import->faei_export_fib);
+
+    s = format(s, "import-entry:%d ", import->faei_import_entry);
+    s = format(s, "import-fib:%d ", import->faei_import_fib);
+
+    s = format(s, "importeds:[");
+    vec_foreach(index, import->faei_importeds)
+    {
+        s = format(s, "%d, ", *index);
+    }
+    s = format(s, "]]");
+
+    return (s);
+}
+
+/**
+ * \brief Append a description of an export tracker to a format string.
+ *
+ * \param export_index index of the export tracker; nothing is appended
+ *                     when this is FIB_NODE_INDEX_INVALID
+ * \param s            the string to append to
+ * \return the (possibly extended) string
+ */
+u8*
+fib_ae_export_format (fib_node_index_t export_index, u8*s)
+{
+    fib_node_index_t *index;
+    fib_ae_export_t *export;
+
+    if (FIB_NODE_INDEX_INVALID == export_index)
+    {
+        return (s);
+    }
+
+    export = pool_elt_at_index(fib_ae_export_pool, export_index);
+
+    /*
+     * print the index we were given rather than a pointer difference,
+     * whose type does not match the %d conversion
+     */
+    s = format(s, "\n  Attached-Export:%d:[", export_index);
+    s = format(s, "export-entry:%d ", export->faee_ei);
+
+    s = format(s, "importers:[");
+    vec_foreach(index, export->faee_importers)
+    {
+        s = format(s, "%d, ", *index);
+    }
+    s = format(s, "]]");
+
+    return (s);
+}
diff --git a/vnet/vnet/fib/fib_attached_export.h b/vnet/vnet/fib/fib_attached_export.h
new file mode 100644 (file)
index 0000000..ee68481
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * FIB attached export
+ *
+ * what's it all about?
+ * say one does this:
+ *    set int ip table Gig0 2
+ *    set int ip addr  Gig0 10.0.0.1/24
+ * Gig0 is in table 2 with a connected address.
+ * Now we add a route matching said connected in a different table
+ *    ip route add table 3 10.0.0.0/24 via Gig0
+ * How do we expect traffic in table 3 to be forwarded? Clearly out of
+ * Gig0. It's an attached route, hence we are saying that we can ARP for
+ * hosts in the attached subnet, and we can. But the responses to any ARP
+ * requests we send will be received on Gig0, and since Gig0 is in table 2,
+ * the adj-fibs will be installed in table 2. So traffic in table 3 will
+ * never hit an adj-fib, will hence always hit the glean, and will thus be
+ * effectively dropped.
+ * How do we fix this? Attached Export !! All more specific entries in table 2
+ * that track and are covered by the connected are automatically exported into
+ * table 3. Now table 3 also has adj-fibs (and the local) so traffic to hosts
+ * is restored.
+ */
+
+#ifndef __FIB_ATTACHED_EXPORT_H__
+#define __FIB_ATTACHED_EXPORT_H__
+
+#include <vnet/fib/fib_types.h>
+
+/*
+ * Invoked by fib_entry.c when an entry gains FIB_ENTRY_FLAG_IMPORT.
+ * export_fib is the index of the FIB that the entry's resolving interface
+ * is bound to.
+ */
+extern void fib_attached_export_import(fib_entry_t *fib_entry,
+                                      fib_node_index_t export_fib);
+                                      
+/*
+ * Invoked by fib_entry.c when an entry loses FIB_ENTRY_FLAG_IMPORT.
+ */
+extern void fib_attached_export_purge(fib_entry_t *fib_entry);
+
+/*
+ * NOTE(review): presumably notifications that the entry 'covered' has
+ * started/stopped being covered by 'cover' - confirm against the
+ * implementation in fib_attached_export.c.
+ */
+extern void fib_attached_export_covered_added(fib_entry_t *cover,
+                                             fib_node_index_t covered);
+extern void fib_attached_export_covered_removed(fib_entry_t *cover,
+                                               fib_node_index_t covered);
+/*
+ * NOTE(review): presumably called when the entry's cover is replaced by a
+ * different entry - confirm against the implementation.
+ */
+extern void fib_attached_export_cover_change(fib_entry_t *fib_entry);
+/*
+ * The entry's cover has been updated; the import is re-evaluated.
+ */
+extern void fib_attached_export_cover_update(fib_entry_t *fib_entry);
+
+/*
+ * Append a description of the import/export object at the given pool index
+ * to the vector s. Nothing is appended for FIB_NODE_INDEX_INVALID.
+ */
+extern u8* fib_ae_import_format(fib_node_index_t import_index, u8*s);
+extern u8* fib_ae_export_format(fib_node_index_t export_index, u8*s);
+
+#endif
diff --git a/vnet/vnet/fib/fib_entry.c b/vnet/vnet/fib/fib_entry.c
new file mode 100644 (file)
index 0000000..8b63f0d
--- /dev/null
@@ -0,0 +1,1493 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/ip/format.h>
+#include <vnet/ip/lookup.h>
+#include <vnet/adj/adj.h>
+#include <vnet/dpo/load_balance.h>
+#include <vnet/dpo/drop_dpo.h>
+
+#include <vnet/fib/fib_entry.h>
+#include <vnet/fib/fib_walk.h>
+#include <vnet/fib/fib_entry_src.h>
+#include <vnet/fib/fib_entry_cover.h>
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/fib_internal.h>
+#include <vnet/fib/fib_attached_export.h>
+#include <vnet/fib/fib_path_ext.h>
+
+/*
+ * Array of strings/names for the FIB sources
+ */
+static const char *fib_source_names[] = FIB_SOURCES;
+static const char *fib_attribute_names[] = FIB_ENTRY_ATTRIBUTES;
+
+/*
+ * Pool for all fib_entries
+ */
+static fib_entry_t *fib_entry_pool;
+
+/**
+ * @brief Fetch the FIB entry stored at the given index of the entry pool.
+ */
+fib_entry_t *
+fib_entry_get (fib_node_index_t index)
+{
+    fib_entry_t *fib_entry;
+
+    fib_entry = pool_elt_at_index(fib_entry_pool, index);
+
+    return (fib_entry);
+}
+
+/**
+ * @brief Fetch the entry at the given index, viewed as a generic fib_node_t.
+ * Installed as the fnv_get member of the entry's node VFT.
+ */
+static fib_node_t *
+fib_entry_get_node (fib_node_index_t index)
+{
+    fib_entry_t *fib_entry = fib_entry_get(index);
+
+    return ((fib_node_t*)fib_entry);
+}
+
+/**
+ * @brief Convert an entry pointer into its index, i.e. its offset from the
+ * start of the entry pool.
+ */
+fib_node_index_t
+fib_entry_get_index (const fib_entry_t * fib_entry)
+{
+    fib_node_index_t fei;
+
+    fei = fib_entry - fib_entry_pool;
+
+    return (fei);
+}
+
+/**
+ * @brief The FIB protocol recorded in the entry's prefix.
+ */
+static fib_protocol_t
+fib_entry_get_proto (const fib_entry_t * fib_entry)
+{
+    const fib_prefix_t *pfx = &fib_entry->fe_prefix;
+
+    return (pfx->fp_proto);
+}
+
+/**
+ * @brief Map the forwarding chain type a client asked for onto the chain
+ * type this entry should really hand out.
+ *
+ * Only a request for the MPLS EOS chain is ever altered; every other chain
+ * type is returned untouched.
+ */
+static fib_forward_chain_type_t
+fib_entry_chain_type_fixup (const fib_entry_t *entry,
+                           fib_forward_chain_type_t fct)
+{
+    fib_forward_chain_type_t dfct;
+
+    if (FIB_FORW_CHAIN_TYPE_MPLS_EOS != fct)
+    {
+        return (fct);
+    }
+
+    /*
+     * The EOS chain is tricky: the adjacency to link to cannot be chosen
+     * without knowing the payload protocol the packet will have once the
+     * label is popped.
+     */
+    dfct = fib_entry_get_default_chain_type(entry);
+
+    if (FIB_FORW_CHAIN_TYPE_MPLS_EOS != dfct)
+    {
+        /*
+         * Give them what this entry is by default. i.e. for a v6 entry the
+         * local label should be carrying v6 traffic, and for a non-EOS
+         * label entry there are more labels, so a non-eos chain is wanted.
+         */
+        return (dfct);
+    }
+
+    /*
+     * An eos-MPLS label entry: use the payload-protocol that was stashed
+     * in the prefix for just this purpose.
+     */
+    return (fib_proto_to_forw_chain_type(entry->fe_prefix.fp_payload_proto));
+}
+
+/**
+ * @brief The forwarding chain type an entry uses by default, derived from
+ * the protocol of its prefix.
+ */
+fib_forward_chain_type_t
+fib_entry_get_default_chain_type (const fib_entry_t *fib_entry)
+{
+    if (FIB_PROTOCOL_IP6 == fib_entry->fe_prefix.fp_proto)
+    {
+        return (FIB_FORW_CHAIN_TYPE_UNICAST_IP6);
+    }
+
+    if (FIB_PROTOCOL_MPLS == fib_entry->fe_prefix.fp_proto)
+    {
+        if (MPLS_EOS != fib_entry->fe_prefix.fp_eos)
+        {
+            return (FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS);
+        }
+        /*
+         * An eos-MPLS label entry: the chain type follows the payload
+         * protocol stashed in the prefix.
+         */
+        return (fib_proto_to_forw_chain_type(fib_entry->fe_prefix.fp_payload_proto));
+    }
+
+    /*
+     * IPv4, and any protocol value not matched above.
+     */
+    return (FIB_FORW_CHAIN_TYPE_UNICAST_IP4);
+}
+
+/**
+ * @brief format() callback that renders a FIB entry.
+ *
+ * The va_list carries, in order:
+ *   - fib_node_index_t : the index of the entry to show
+ *   - int              : the level of detail
+ *
+ * The prefix is always printed. At FIB_ENTRY_FORMAT_DETAIL and above the
+ * per-source data, the covered-entry tracking and the attached import/export
+ * state are added. The forwarding section prints only the default chain,
+ * unless the level is FIB_ENTRY_FORMAT_DETAIL2 or above, in which case every
+ * chain and the list of children are printed too.
+ */
+u8 *
+format_fib_entry (u8 * s, va_list * args)
+{
+    fib_forward_chain_type_t fct;
+    fib_entry_attribute_t attr;
+    fib_path_ext_t *path_ext;
+    fib_entry_t *fib_entry;
+    fib_entry_src_t *src;
+    fib_node_index_t fei;
+    fib_source_t source;
+    u32 n_covered;
+    int level;
+
+    fei = va_arg (*args, fib_node_index_t);
+    level = va_arg (*args, int);
+    fib_entry = fib_entry_get(fei);
+
+    s = format (s, "%U", format_fib_prefix, &fib_entry->fe_prefix);
+
+    if (level >= FIB_ENTRY_FORMAT_DETAIL)
+    {
+       s = format (s, " fib:%d", fib_entry->fe_fib_index);
+       s = format (s, " index:%d", fib_entry_get_index(fib_entry));
+       s = format (s, " locks:%d", fib_entry->fe_node.fn_locks);
+
+       /* one section per source: name, source data, flags, paths, extensions */
+       FOR_EACH_SRC_ADDED(fib_entry, src, source,
+        ({
+           s = format (s, "\n  src:%s ",
+                       fib_source_names[source]);
+           s = fib_entry_src_format(fib_entry, source, s);
+           s = format (s, " refs:%d ", src->fes_ref_count);
+           if (FIB_ENTRY_FLAG_NONE != src->fes_entry_flags) {
+               s = format(s, "flags:");
+               FOR_EACH_FIB_ATTRIBUTE(attr) {
+                   if ((1<<attr) & src->fes_entry_flags) {
+                       s = format (s, "%s,", fib_attribute_names[attr]);
+                   }
+               }
+           }
+           s = format (s, "\n");
+           if (FIB_NODE_INDEX_INVALID != src->fes_pl)
+           {
+               s = fib_path_list_format(src->fes_pl, s);
+           }
+           if (NULL != src->fes_path_exts)
+           {
+               s = format(s, "    Extensions:");
+               vec_foreach(path_ext, src->fes_path_exts)
+               {
+                   s = format(s, "\n     %U", format_fib_path_ext, path_ext);
+               }
+           }
+       }));
+    
+       n_covered = fib_entry_cover_get_size(fib_entry);
+       if (n_covered > 0) {
+           s = format(s, "\n tracking %d covered: ", n_covered);
+           s = fib_entry_cover_list_format(fib_entry, s);
+       }
+       /* both of these append nothing when the index is invalid */
+       s = fib_ae_import_format(fib_entry->fe_import, s);
+       s = fib_ae_export_format(fib_entry->fe_export, s);
+
+       s = format (s, "\n forwarding: ");
+    }
+    else
+    {
+       s = format (s, "\n");
+    }
+
+    fct = fib_entry_get_default_chain_type(fib_entry);
+
+    /* no valid DPO on the default chain: report it and stop here */
+    if (!dpo_id_is_valid(&fib_entry->fe_lb[fct]))
+    {
+       s = format (s, "  UNRESOLVED\n");
+       return (s);
+    }
+    else
+    {
+        if (level >= FIB_ENTRY_FORMAT_DETAIL2)
+        {
+
+            /* most detailed level: show every chain, not just the default */
+            FOR_EACH_FIB_FORW_CHAIN(fct)
+            {
+                s = format(s, "  %U-chain\n  %U",
+                           format_fib_forw_chain_type, fct,
+                           format_dpo_id,
+                           &fib_entry->fe_lb[fct],
+                           2);
+                s = format(s, "\n");
+            }
+        }
+        else
+        {
+           s = format(s, "  %U-chain\n  %U",
+                      format_fib_forw_chain_type, fct,
+                       format_dpo_id,
+                       &fib_entry->fe_lb[fct],
+                       2);
+            s = format(s, "\n");
+        }
+    }
+
+    if (level >= FIB_ENTRY_FORMAT_DETAIL2)
+    {
+        s = format(s, "\nchildren:");
+        s = fib_node_children_format(fib_entry->fe_node.fn_children, s);
+    }
+
+    return (s);
+}
+
+/**
+ * @brief Down-cast a generic FIB node to the FIB entry it represents.
+ * In debug images the node's type is checked first.
+ */
+static fib_entry_t*
+fib_entry_from_fib_node (fib_node_t *node)
+{
+    fib_entry_t *fib_entry;
+
+#if CLIB_DEBUG > 0
+    ASSERT(FIB_NODE_TYPE_ENTRY == node->fn_type);
+#endif
+    fib_entry = (fib_entry_t*)node;
+
+    return (fib_entry);
+}
+
+/**
+ * @brief Node VFT callback (fnv_last_lock): tear the entry down.
+ *
+ * Resets the DPO of every forwarding chain, de-initialises the node and
+ * returns the entry to the pool.
+ */
+static void
+fib_entry_last_lock_gone (fib_node_t *node)
+{
+    fib_entry_t *fib_entry = fib_entry_from_fib_node(node);
+    fib_forward_chain_type_t fct;
+
+    FOR_EACH_FIB_FORW_CHAIN(fct)
+    {
+        dpo_reset(&fib_entry->fe_lb[fct]);
+    }
+
+    FIB_ENTRY_DBG(fib_entry, "last-lock");
+
+    fib_node_deinit(&fib_entry->fe_node);
+
+    // FIXME -RR Backwalk
+    pool_put(fib_entry_pool, fib_entry);
+}
+
+/**
+ * @brief The entry's best source, or NULL when the entry has no sources.
+ *
+ * The enum of sources is deliberately arranged in priority order, and the
+ * best source is taken to be the first element of the source vector.
+ */
+static fib_entry_src_t*
+fib_entry_get_best_src_i (const fib_entry_t *fib_entry)
+{
+    if (0 == vec_len(fib_entry->fe_srcs))
+    {
+        return (NULL);
+    }
+
+    return (vec_elt_at_index(fib_entry->fe_srcs, 0));
+}
+
+/**
+ * @brief NULL-tolerant accessor for a source object's source ID.
+ * FIB_SOURCE_MAX is returned when there is no source object.
+ */
+static fib_source_t
+fib_entry_src_get_source (const fib_entry_src_t *esrc)
+{
+    if (NULL == esrc)
+    {
+        return (FIB_SOURCE_MAX);
+    }
+
+    return (esrc->fes_src);
+}
+
+/**
+ * @brief NULL-tolerant accessor for a source object's entry flags.
+ * FIB_ENTRY_FLAG_NONE is returned when there is no source object.
+ */
+static fib_entry_flag_t
+fib_entry_src_get_flags (const fib_entry_src_t *esrc)
+{
+    if (NULL == esrc)
+    {
+        return (FIB_ENTRY_FLAG_NONE);
+    }
+
+    return (esrc->fes_entry_flags);
+}
+
+/**
+ * @brief Index based wrapper around fib_entry_get_flags_i.
+ */
+fib_entry_flag_t
+fib_entry_get_flags (fib_node_index_t fib_entry_index)
+{
+    fib_entry_t *fib_entry = fib_entry_get(fib_entry_index);
+
+    return (fib_entry_get_flags_i(fib_entry));
+}
+
+/*
+ * fib_entry_back_walk_notify
+ *
+ * A back walk has reached this entry. Re-activate the entry's best source
+ * for the walk reasons that require it, then, for everything other than an
+ * adjacency update, continue the walk to this entry's children as a
+ * re-evaluate. The walk is always allowed to continue.
+ */
+static fib_node_back_walk_rc_t
+fib_entry_back_walk_notify (fib_node_t *node,
+                           fib_node_back_walk_ctx_t *ctx)
+{
+    /*
+     * the set of walk reasons for which the best source is re-activated
+     */
+    const int reactivate_reasons = (FIB_NODE_BW_REASON_FLAG_EVALUATE       |
+                                    FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE     |
+                                    FIB_NODE_BW_REASON_FLAG_INTERFACE_UP   |
+                                    FIB_NODE_BW_REASON_FLAG_INTERFACE_DOWN |
+                                    FIB_NODE_BW_REASON_FLAG_INTERFACE_DELETE);
+    fib_entry_t *fib_entry;
+
+    fib_entry = fib_entry_from_fib_node(node);
+
+    if (reactivate_reasons & ctx->fnbw_reason)
+    {
+        fib_entry_src_action_reactivate(fib_entry,
+                                        fib_entry_get_best_source(
+                                            fib_entry_get_index(fib_entry)));
+    }
+
+    if (FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE & ctx->fnbw_reason)
+    {
+        /*
+         * ADJ updates (complete<->incomplete) do not need to propagate to
+         * recursive entries.
+         * The only reason they are needed as far back as here is that the
+         * adj and the incomplete adj are different DPO types, so the LBs
+         * need to re-stack.
+         */
+        return (FIB_NODE_BACK_WALK_CONTINUE);
+    }
+
+    /*
+     * all other walk types can be reclassified to a re-evaluate to
+     * all recursive dependents.
+     * By reclassifying we ensure that should any of these walk types meet
+     * they can be merged.
+     */
+    ctx->fnbw_reason = FIB_NODE_BW_REASON_FLAG_EVALUATE;
+
+    /*
+     * propagate the backwalk further
+     */
+    fib_walk_sync(FIB_NODE_TYPE_ENTRY,
+                  fib_entry_get_index(fib_entry),
+                  ctx);
+
+    return (FIB_NODE_BACK_WALK_CONTINUE);
+}
+
+/*
+ * The FIB entry's graph node virtual function table
+ */
+static const fib_node_vft_t fib_entry_vft = {
+    .fnv_get = fib_entry_get_node,
+    .fnv_last_lock = fib_entry_last_lock_gone,
+    .fnv_back_walk = fib_entry_back_walk_notify,
+};
+
+/*
+ * fib_entry_contribute_forwarding
+ *
+ * Get and lock the forwarding information (DPO) contributed by the FIB
+ * entry for the requested chain type. The result is copied into 'dpo'.
+ */
+void
+fib_entry_contribute_forwarding (fib_node_index_t fib_entry_index,
+                                fib_forward_chain_type_t type,
+                                dpo_id_t *dpo)
+{
+    fib_entry_t *fib_entry;
+    dpo_id_t *lb;
+
+    fib_entry = fib_entry_get(fib_entry_index);
+
+    /*
+     * the chain type asked for is not necessarily the one to hand out
+     */
+    type = fib_entry_chain_type_fixup(fib_entry, type);
+    lb = &fib_entry->fe_lb[type];
+
+    if (!dpo_id_is_valid(lb))
+    {
+        /*
+         * on-demand create eos/non-eos.
+         * There is no on-demand delete because:
+         *   - memory versus complexity & reliability:
+         *      leaving unrequired [n]eos LB arounds wastes memory, cleaning
+         *      then up on the right trigger is more code. i favour the latter.
+         */
+        fib_entry_src_mk_lb(fib_entry,
+                            fib_entry_get_best_src_i(fib_entry),
+                            type,
+                            lb);
+    }
+
+    dpo_copy(dpo, lb);
+}
+
+/**
+ * @brief The DPO stored on the entry's default forwarding chain.
+ * The pointer refers to the entry's own storage; nothing is copied.
+ */
+const dpo_id_t *
+fib_entry_contribute_ip_forwarding (fib_node_index_t fib_entry_index)
+{
+    fib_forward_chain_type_t fct;
+    fib_entry_t *fib_entry;
+
+    fib_entry = fib_entry_get(fib_entry_index);
+    fct = fib_entry_get_default_chain_type(fib_entry);
+
+    return (&fib_entry->fe_lb[fct]);
+}
+
+/**
+ * @brief The adjacency in the first bucket of the entry's default-chain
+ * load-balance.
+ *
+ * @param fib_entry_index Index of the entry to inspect.
+ * @return The adjacency's index, or ADJ_INDEX_INVALID when the entry has no
+ *         valid forwarding DPO or its first bucket is not an adjacency.
+ */
+adj_index_t
+fib_entry_get_adj (fib_node_index_t fib_entry_index)
+{
+    const dpo_id_t *dpo;
+
+    dpo = fib_entry_contribute_ip_forwarding(fib_entry_index);
+
+    /*
+     * an unresolved entry has no load-balance. its DPO index must not be
+     * used to fetch a bucket.
+     */
+    if (dpo_id_is_valid(dpo))
+    {
+        dpo = load_balance_get_bucket(dpo->dpoi_index, 0);
+
+        if (dpo_is_adj(dpo))
+        {
+            return (dpo->dpoi_index);
+        }
+    }
+    return (ADJ_INDEX_INVALID);
+}
+
+/**
+ * @brief The index held in the entry's fe_parent field.
+ */
+fib_node_index_t
+fib_entry_get_path_list (fib_node_index_t fib_entry_index)
+{
+    const fib_entry_t *fib_entry = fib_entry_get(fib_entry_index);
+
+    return (fib_entry->fe_parent);
+}
+
+/**
+ * @brief Placeholder: the entry is never consulted and 0 is always returned.
+ */
+u32
+fib_entry_get_fib_table_id(fib_node_index_t fib_entry_index)
+{
+    const u32 table_id = 0;
+
+    return (table_id);
+}
+
+/**
+ * @brief Add a child node to the FIB entry.
+ *
+ * @param fib_entry_index Index of the entry that gains the child.
+ * @param child_type      The node type of the child.
+ * @param child_index     The index of the child.
+ * @return The value returned by fib_node_child_add; this is what is later
+ *         passed to fib_entry_child_remove as the sibling index.
+ */
+u32
+fib_entry_child_add (fib_node_index_t fib_entry_index,
+                    fib_node_type_t child_type,
+                    fib_node_index_t child_index)
+{
+    return (fib_node_child_add(FIB_NODE_TYPE_ENTRY,
+                               fib_entry_index,
+                               child_type,
+                               child_index));
+}
+
+/**
+ * @brief Remove a child from the FIB entry. The child is identified by its
+ * sibling index.
+ */
+void
+fib_entry_child_remove (fib_node_index_t fib_entry_index,
+                       u32 sibling_index)
+{
+    fib_node_child_remove(FIB_NODE_TYPE_ENTRY, fib_entry_index, sibling_index);
+}
+
+/**
+ * @brief Take a new entry from the pool and initialise it.
+ *
+ * The entry is zeroed, set up as a FIB node of type entry, given the FIB
+ * index and a copy of the prefix, and has its import, export and covered
+ * indices and all its forwarding chains set to their invalid/reset state.
+ * The new entry's index is written to *fib_entry_index.
+ */
+static fib_entry_t *
+fib_entry_alloc (u32 fib_index,
+                const fib_prefix_t *prefix,
+                fib_node_index_t *fib_entry_index)
+{
+    fib_forward_chain_type_t fct;
+    fib_entry_t *fib_entry;
+
+    pool_get(fib_entry_pool, fib_entry);
+    memset(fib_entry, 0, sizeof(*fib_entry));
+    fib_node_init(&fib_entry->fe_node, FIB_NODE_TYPE_ENTRY);
+
+    fib_entry->fe_covered = FIB_NODE_INDEX_INVALID;
+    fib_entry->fe_import = FIB_NODE_INDEX_INVALID;
+    fib_entry->fe_export = FIB_NODE_INDEX_INVALID;
+
+    fib_entry->fe_fib_index = fib_index;
+    fib_entry->fe_prefix = *prefix;
+
+    if (FIB_PROTOCOL_MPLS == fib_entry->fe_prefix.fp_proto)
+    {
+        /*
+         * MPLS entries get a fixed prefix length and must state a
+         * payload protocol
+         */
+        fib_entry->fe_prefix.fp_len = 21;
+        ASSERT(DPO_PROTO_NONE != fib_entry->fe_prefix.fp_payload_proto);
+    }
+
+    FOR_EACH_FIB_FORW_CHAIN(fct)
+    {
+        dpo_reset(&fib_entry->fe_lb[fct]);
+    }
+
+    *fib_entry_index = fib_entry_get_index(fib_entry);
+
+    FIB_ENTRY_DBG(fib_entry, "alloc");
+
+    return (fib_entry);
+}
+
+/**
+ * @brief React to a change in the entry's flags.
+ *
+ * old_flags are compared against the entry's current flags. Gaining
+ * FIB_ENTRY_FLAG_IMPORT starts an attached import; losing it purges the
+ * import. Changes to FIB_ENTRY_FLAG_ATTACHED are detected but not yet
+ * acted upon.
+ */
+static void
+fib_entry_post_flag_update_actions (fib_entry_t *fib_entry,
+                                   fib_source_t source,
+                                   fib_entry_flag_t old_flags)
+{
+    int was_import, is_import;
+    int was_attached, is_attached;
+
+    /*
+     * handle changes to attached export for import entries
+     */
+    was_import = (FIB_ENTRY_FLAG_IMPORT & old_flags);
+    is_import  = (FIB_ENTRY_FLAG_IMPORT & fib_entry_get_flags_i(fib_entry));
+
+    if (is_import && !was_import)
+    {
+        /*
+         * the import flag has been gained.
+         *
+         * there is an assumption here that the entry resolves via only
+         * one interface and that it is the cross VRF interface.
+         */
+        fib_node_index_t export_fib;
+        u32 sw_if_index;
+
+        sw_if_index = fib_path_list_get_resolving_interface(fib_entry->fe_parent);
+        export_fib = fib_table_get_index_for_sw_if_index(
+                         fib_entry_get_proto(fib_entry),
+                         sw_if_index);
+
+        fib_attached_export_import(fib_entry, export_fib);
+    }
+    else if (!is_import && was_import)
+    {
+        /*
+         * the import flag has been lost.
+         */
+        fib_attached_export_purge(fib_entry);
+    }
+    /*
+     * else
+     *   no change. nothing to do.
+     */
+
+    /*
+     * handle changes to attached export for export entries
+     */
+    was_attached = (FIB_ENTRY_FLAG_ATTACHED & old_flags);
+    is_attached  = (FIB_ENTRY_FLAG_ATTACHED & fib_entry_get_flags_i(fib_entry));
+
+    if (is_attached && !was_attached)
+    {
+        /*
+         * transition to attached. time to export
+         */
+        // FIXME
+    }
+    // else FIXME
+}
+
+/**
+ * @brief Actions taken once a source's forwarding has been installed:
+ * first handle any change of flags, then tell the source it is installed.
+ */
+static void
+fib_entry_post_install_actions (fib_entry_t *fib_entry,
+                               fib_source_t source,
+                               fib_entry_flag_t old_flags)
+{
+    /* flag transitions are processed before the source is notified */
+    fib_entry_post_flag_update_actions(fib_entry, source, old_flags);
+
+    fib_entry_src_action_installed(fib_entry, source);
+}
+
+/**
+ * @brief Create a new FIB entry for the prefix, sourced by 'source' with the
+ * given (non-empty) vector of paths.
+ *
+ * @return The index of the new entry.
+ */
+fib_node_index_t
+fib_entry_create (u32 fib_index,
+                 const fib_prefix_t *prefix,
+                 fib_source_t source,
+                 fib_entry_flag_t flags,
+                 const fib_route_path_t *paths)
+{
+    fib_node_index_t fib_entry_index;
+    fib_entry_t *fib_entry;
+    const dpo_id_t *drop;
+
+    ASSERT(0 < vec_len(paths));
+
+    fib_entry = fib_entry_alloc(fib_index, prefix, &fib_entry_index);
+
+    /*
+     * this is a brand new entry, so there is exactly one source and hence
+     * no need to work out which source wins. The source is added with the
+     * drop DPO of the entry's protocol, then given its paths.
+     */
+    drop = drop_dpo_get(fib_proto_to_dpo(fib_entry_get_proto(fib_entry)));
+
+    fib_entry = fib_entry_src_action_add(fib_entry, source, flags, drop);
+    fib_entry_src_action_path_swap(fib_entry, source, flags, paths);
+
+    /*
+     * the pool may have been realloc'd in the meantime; fetch the
+     * pointer afresh.
+     */
+    fib_entry = fib_entry_get(fib_entry_index);
+    fib_entry_src_action_activate(fib_entry, source);
+
+    fib_entry_post_install_actions(fib_entry, source, FIB_ENTRY_FLAG_NONE);
+
+    return (fib_entry_index);
+}
+
+/**
+ * @brief Create a new FIB entry for the prefix whose source contributes the
+ * given DPO rather than a set of paths.
+ *
+ * @return The index of the new entry.
+ */
+fib_node_index_t
+fib_entry_create_special (u32 fib_index,
+                         const fib_prefix_t *prefix,
+                         fib_source_t source,
+                         fib_entry_flag_t flags,
+                         const dpo_id_t *dpo)
+{
+    fib_node_index_t fib_entry_index;
+    fib_entry_t *fib_entry;
+
+    /*
+     * allocate and initialise the new entry
+     */
+    fib_entry = fib_entry_alloc(fib_index, prefix, &fib_entry_index);
+
+    /*
+     * add the one and only source, then make it the active one
+     */
+    fib_entry = fib_entry_src_action_add(fib_entry, source, flags, dpo);
+    fib_entry_src_action_activate(fib_entry, source);
+
+    fib_entry_post_install_actions(fib_entry, source, FIB_ENTRY_FLAG_NONE);
+
+    return (fib_entry_index);
+}
+
+/**
+ * @brief Actions taken after the forwarding of an existing entry changed:
+ * back-walk the children, notify the covered prefixes, then run the
+ * post-install actions.
+ */
+static void
+fib_entry_post_update_actions (fib_entry_t *fib_entry,
+                              fib_source_t source,
+                              fib_entry_flag_t old_flags)
+{
+    fib_node_back_walk_ctx_t bw_ctx = {
+        .fnbw_reason = FIB_NODE_BW_REASON_FLAG_EVALUATE,
+    };
+    fib_node_index_t fei;
+
+    /*
+     * first the children: a back-walk asking them to re-evaluate
+     */
+    fei = fib_entry_get_index(fib_entry);
+    fib_walk_sync(FIB_NODE_TYPE_ENTRY, fei, &bw_ctx);
+
+    /*
+     * next the prefixes this entry covers
+     */
+    fib_entry_cover_update_notify(fib_entry);
+
+    fib_entry_post_install_actions(fib_entry, source, old_flags);
+}
+
+/**
+ * @brief Add (or update) a special source, one that contributes a DPO, on
+ * an existing entry.
+ *
+ * The entry's forwarding is changed only if the source is, or becomes, the
+ * entry's best source.
+ */
+void
+fib_entry_special_add (fib_node_index_t fib_entry_index,
+                      fib_source_t source,
+                      fib_entry_flag_t flags,
+                      const dpo_id_t *dpo)
+{
+    fib_source_t best_source;
+    fib_entry_flag_t bflags;
+    fib_entry_t *fib_entry;
+    fib_entry_src_t *bsrc;
+
+    fib_entry = fib_entry_get(fib_entry_index);
+
+    /*
+     * remember the best source, and its flags, as they were before the add
+     */
+    bsrc = fib_entry_get_best_src_i(fib_entry);
+    best_source = fib_entry_src_get_source(bsrc);
+    bflags = fib_entry_src_get_flags(bsrc);
+
+    fib_entry = fib_entry_src_action_add(fib_entry, source, flags, dpo);
+
+    if (source > best_source)
+    {
+        /*
+         * the new source loses. nothing more to do here.
+         * the data from the source has been saved by the add.
+         */
+        return;
+    }
+
+    /*
+     * either the source is a new winner, in which case the previous best
+     * is deactivated, or it is the source already in use (best_source ==
+     * source) and it is deactivated and re-activated so the updated
+     * forwarding is installed.
+     */
+    fib_entry_src_action_deactivate(fib_entry, best_source);
+    fib_entry_src_action_activate(fib_entry, source);
+
+    fib_entry_post_update_actions(fib_entry, source, bflags);
+}
+
+/**
+ * @brief Add a single path, from the given source, to an existing entry.
+ *
+ * The entry's forwarding is changed only if the source is, or becomes, the
+ * entry's best source.
+ */
+void
+fib_entry_path_add (fib_node_index_t fib_entry_index,
+                   fib_source_t source,
+                   fib_entry_flag_t flags,
+                   const fib_route_path_t *rpath)
+{
+    fib_source_t best_source;
+    fib_entry_flag_t bflags;
+    fib_entry_t *fib_entry;
+    fib_entry_src_t *bsrc;
+
+    ASSERT(1 == vec_len(rpath));
+
+    fib_entry = fib_entry_get(fib_entry_index);
+    ASSERT(NULL != fib_entry);
+
+    /*
+     * remember the best source, and its flags, as they were before the add
+     */
+    bsrc = fib_entry_get_best_src_i(fib_entry);
+    best_source = fib_entry_src_get_source(bsrc);
+    bflags = fib_entry_src_get_flags(bsrc);
+
+    fib_entry = fib_entry_src_action_path_add(fib_entry, source, flags, rpath);
+
+    if (source > best_source)
+    {
+        /*
+         * the new source loses. nothing more to do here.
+         * the data from the source has been saved by the add.
+         */
+        return;
+    }
+
+    /*
+     * either the source is a new winner, in which case the previous best
+     * is deactivated, or it is the source already in use (best_source ==
+     * source) and it is deactivated and re-activated so the updated
+     * forwarding is installed.
+     */
+    fib_entry_src_action_deactivate(fib_entry, best_source);
+    fib_entry_src_action_activate(fib_entry, source);
+
+    fib_entry_post_update_actions(fib_entry, source, bflags);
+}
+
+/*
+ * fib_entry_path_remove
+ *
+ * remove a single path, contributed by the given source, from the entry.
+ *
+ * returns FIB_ENTRY_SRC_FLAG_ADDED if the entry still has at least one
+ * source after the removal, FIB_ENTRY_SRC_FLAG_NONE if the last source was
+ * removed and the entry's forwarding has been uninstalled.
+ */
+fib_entry_src_flag_t
+fib_entry_path_remove (fib_node_index_t fib_entry_index,
+                      fib_source_t source,
+                      const fib_route_path_t *rpath)
+{
+    fib_entry_src_flag_t sflag;
+    fib_source_t best_source;
+    fib_entry_flag_t bflags;
+    fib_entry_t *fib_entry;
+    fib_entry_src_t *bsrc;
+
+    ASSERT(1 == vec_len(rpath));
+
+    fib_entry = fib_entry_get(fib_entry_index);
+    ASSERT(NULL != fib_entry);
+
+    /*
+     * remember the best source, and its flags, as they were before the
+     * removal
+     */
+    bsrc = fib_entry_get_best_src_i(fib_entry);
+    best_source = fib_entry_src_get_source(bsrc);
+    bflags = fib_entry_src_get_flags(bsrc);
+
+    sflag = fib_entry_src_action_path_remove(fib_entry, source, rpath);
+
+    if (source < best_source )
+    {
+       /*
+        * Que! removing a path from a source that is better than the
+        * one this entry is using.
+        * Images without asserts must not carry on and run the update
+        * actions for a source the entry is not using; bail out as
+        * fib_entry_special_remove does.
+        */
+       ASSERT(0);
+       return (FIB_ENTRY_SRC_FLAG_ADDED);
+    }
+    else if (source > best_source )
+    {
+       /*
+        * the source is not the best. nothing to do.
+        */
+       return (FIB_ENTRY_SRC_FLAG_ADDED);
+    }
+    else
+    {
+       /*
+        * removing a path from the path-list we were using.
+        */
+       if (!(FIB_ENTRY_SRC_FLAG_ADDED & sflag))
+       {
+           /*
+            * the last path from the source was removed.
+            * fallback to lower source
+            */
+           bsrc = fib_entry_get_best_src_i(fib_entry);
+           best_source = fib_entry_src_get_source(bsrc);
+
+           if (FIB_SOURCE_MAX == best_source) {
+               /*
+                * no more sources left. this entry is toast.
+                */
+               fib_entry_src_action_uninstall(fib_entry);
+               fib_entry_post_flag_update_actions(fib_entry, source, bflags);
+
+               return (FIB_ENTRY_SRC_FLAG_NONE);
+           }
+           else
+           {
+               fib_entry_src_action_activate(fib_entry, best_source);
+               source = best_source;
+           }
+       }
+       else
+       {
+           /*
+            * re-install the new forwarding information
+            */
+           fib_entry_src_action_deactivate(fib_entry, source);
+           fib_entry_src_action_activate(fib_entry, source);
+       }
+    }
+
+    fib_entry_post_update_actions(fib_entry, source, bflags);
+
+    /*
+     * still have sources
+     */
+    return (FIB_ENTRY_SRC_FLAG_ADDED);
+}
+
+/*
+ * fib_entry_special_remove
+ *
+ * remove a special source from the entry.
+ *
+ * returns FIB_ENTRY_SRC_FLAG_NONE when the entry's last source was removed
+ * and its forwarding uninstalled, FIB_ENTRY_SRC_FLAG_ADDED otherwise.
+ */
+fib_entry_src_flag_t
+fib_entry_special_remove (fib_node_index_t fib_entry_index,
+                         fib_source_t source)
+{
+    fib_entry_src_flag_t sflag;
+    fib_source_t best_source;
+    fib_entry_flag_t bflags;
+    fib_entry_t *fib_entry;
+    fib_entry_src_t *bsrc;
+
+    fib_entry = fib_entry_get(fib_entry_index);
+    ASSERT(NULL != fib_entry);
+
+    /*
+     * remember the best source, and its flags, as they were before the
+     * removal
+     */
+    bsrc = fib_entry_get_best_src_i(fib_entry);
+    best_source = fib_entry_src_get_source(bsrc);
+    bflags = fib_entry_src_get_flags(bsrc);
+
+    sflag = fib_entry_src_action_remove(fib_entry, source);
+
+    if (source != best_source)
+    {
+        /*
+         * Either the source is better than the one this entry is using,
+         * which can only mean it is a source this prefix does not have,
+         * or the source is not the best. Nothing to do in both cases.
+         */
+        return (FIB_ENTRY_SRC_FLAG_ADDED);
+    }
+
+    if (FIB_ENTRY_SRC_FLAG_ADDED & sflag)
+    {
+        /*
+         * the source is still present. re-install its forwarding
+         * information.
+         */
+        fib_entry_src_action_reactivate(fib_entry, source);
+    }
+    else
+    {
+        /*
+         * the source was removed. use the next best.
+         */
+        bsrc = fib_entry_get_best_src_i(fib_entry);
+        best_source = fib_entry_src_get_source(bsrc);
+
+        if (FIB_SOURCE_MAX == best_source)
+        {
+            /*
+             * no more sources left. this entry is toast.
+             */
+            fib_entry_src_action_uninstall(fib_entry);
+            fib_entry_post_flag_update_actions(fib_entry, source, bflags);
+
+            return (FIB_ENTRY_SRC_FLAG_NONE);
+        }
+
+        fib_entry_src_action_activate(fib_entry, best_source);
+        source = best_source;
+    }
+
+    fib_entry_post_update_actions(fib_entry, source, bflags);
+
+    /*
+     * still have sources
+     */
+    return (FIB_ENTRY_SRC_FLAG_ADDED);
+}
+
+/**
+ * fib_entry_delete
+ *
+ * Withdraw every path @a source provided to the entry. This is
+ * delegated, unchanged, to fib_entry_special_remove.
+ */
+fib_entry_src_flag_t
+fib_entry_delete (fib_node_index_t fib_entry_index, fib_source_t source)
+{
+    return fib_entry_special_remove(fib_entry_index, source);
+}
+
+/**
+ * fib_entry_update
+ *
+ * The source has provided a new set of paths that will replace the old.
+ */
+void
+fib_entry_update (fib_node_index_t fib_entry_index,
+                 fib_source_t source,
+                 fib_entry_flag_t flags,
+                 const fib_route_path_t *paths)
+{
+    fib_source_t best_source;
+    fib_entry_flag_t bflags;
+    fib_entry_t *fib_entry;
+    fib_entry_src_t *bsrc;
+
+    fib_entry = fib_entry_get(fib_entry_index);
+    ASSERT(NULL != fib_entry);
+    /* snapshot the best source and its flags before the paths are swapped */
+    bsrc = fib_entry_get_best_src_i(fib_entry);
+    best_source = fib_entry_src_get_source(bsrc);
+    bflags = fib_entry_src_get_flags(bsrc);
+
+    fib_entry_src_action_path_swap(fib_entry,
+                                  source,
+                                  flags,
+                                  paths);
+    /*
+     * handle possible realloc's by refetching the pointer
+     */
+    fib_entry = fib_entry_get(fib_entry_index);
+
+    /*
+     * compare the updated source against the source that was best
+     * before the swap (a lower value is a higher priority) to decide
+     * whether the entry's forwarding needs to change.
+     */
+    if (source < best_source)
+    {
+       /*
+        * we have a new winning source.
+        */
+       fib_entry_src_action_deactivate(fib_entry, best_source);
+       fib_entry_src_action_activate(fib_entry, source);
+    }
+    else if (source > best_source) {
+       /*
+        * the new source loses. nothing to do here.
+        * the data from the source is saved in the path-list created
+        */
+       return;
+    }
+    else
+    {
+       /*
+        * the new source is one this entry already has.
+        * But the path-list was updated, which will contribute new forwarding,
+        * so install it.
+        */
+       fib_entry_src_action_deactivate(fib_entry, source);
+       fib_entry_src_action_activate(fib_entry, source);
+    }
+
+    fib_entry_post_update_actions(fib_entry, source, bflags);
+}
+
+
+/*
+ * fib_entry_cover_changed
+ *
+ * this entry is tracking its cover and that cover has changed.
+ */
+void
+fib_entry_cover_changed (fib_node_index_t fib_entry_index)
+{
+    fib_entry_src_cover_res_t res = {
+       .install = !0,
+       .bw_reason = FIB_NODE_BW_REASON_FLAG_NONE,
+    };
+    fib_source_t source, best_source;
+    fib_entry_flag_t bflags;
+    fib_entry_t *fib_entry;
+    fib_entry_src_t *esrc;
+    u32 index;
+
+    bflags = FIB_ENTRY_FLAG_NONE;
+    best_source = FIB_SOURCE_FIRST;
+    fib_entry = fib_entry_get(fib_entry_index);
+
+    fib_attached_export_cover_change(fib_entry);
+
+    /*
+     * propagate the notification to each of the added sources
+     */
+    index = 0;
+    FOR_EACH_SRC_ADDED(fib_entry, esrc, source,
+    ({
+       if (0 == index)
+       {
+           /*
+            * only the best source (the first visited) gets to set the result
+            */
+           res = fib_entry_src_action_cover_change(fib_entry, source);
+            bflags = fib_entry_src_get_flags(esrc);
+            best_source = fib_entry_src_get_source(esrc);
+       }
+       else
+       {
+           fib_entry_src_action_cover_change(fib_entry, source);
+       }
+       index++;
+    }));
+    /* res is the first source's result, or the initial defaults if none was visited */
+    if (res.install)
+    {
+       fib_entry_src_action_reactivate(fib_entry,
+                                       fib_entry_src_get_source(
+                                           fib_entry_get_best_src_i(fib_entry)));
+        fib_entry_post_install_actions(fib_entry, best_source, bflags);
+    }
+    else
+    {
+       fib_entry_src_action_uninstall(fib_entry);
+    }
+
+    if (FIB_NODE_BW_REASON_FLAG_NONE != res.bw_reason)
+    {
+       /*
+        * time for walkies fido: back-walk from this entry with that reason.
+        */
+       fib_node_back_walk_ctx_t bw_ctx = {
+           .fnbw_reason = res.bw_reason,
+        };
+
+       fib_walk_sync(FIB_NODE_TYPE_ENTRY, fib_entry_index, &bw_ctx);
+    }
+}
+
+/*
+ * fib_entry_cover_updated
+ *
+ * this entry is tracking its cover and that cover has been updated
+ * (i.e. its forwarding information has changed).
+ */
+void
+fib_entry_cover_updated (fib_node_index_t fib_entry_index)
+{
+    fib_entry_src_cover_res_t res = {
+       .install = !0,
+       .bw_reason = FIB_NODE_BW_REASON_FLAG_NONE,
+    };
+    fib_source_t source, best_source;
+    fib_entry_flag_t bflags;
+    fib_entry_t *fib_entry;
+    fib_entry_src_t *esrc;
+    u32 index;
+
+    bflags = FIB_ENTRY_FLAG_NONE;
+    best_source = FIB_SOURCE_FIRST;
+    fib_entry = fib_entry_get(fib_entry_index);
+
+    fib_attached_export_cover_update(fib_entry);
+
+    /*
+     * propagate the notification to each of the added sources
+     */
+    index = 0;
+    FOR_EACH_SRC_ADDED(fib_entry, esrc, source,
+    ({
+       if (0 == index)
+       {
+           /*
+            * only the best source (the first visited) gets to set the result
+            */
+           res = fib_entry_src_action_cover_update(fib_entry, source);
+            bflags = fib_entry_src_get_flags(esrc);
+            best_source = fib_entry_src_get_source(esrc);
+       }
+       else
+       {
+           fib_entry_src_action_cover_update(fib_entry, source);
+       }
+       index++;
+    }));
+    /* res is the first source's result, or the initial defaults if none was visited */
+    if (res.install)
+    {
+       fib_entry_src_action_reactivate(fib_entry,
+                                       fib_entry_src_get_source(
+                                           fib_entry_get_best_src_i(fib_entry)));
+        fib_entry_post_install_actions(fib_entry, best_source, bflags);
+    }
+    else
+    {
+       fib_entry_src_action_uninstall(fib_entry);
+    }
+
+    if (FIB_NODE_BW_REASON_FLAG_NONE != res.bw_reason)
+    {
+       /*
+        * time for walkies fido: back-walk from this entry with that reason.
+        */
+       fib_node_back_walk_ctx_t bw_ctx = {
+           .fnbw_reason = res.bw_reason,
+        };
+
+       fib_walk_sync(FIB_NODE_TYPE_ENTRY, fib_entry_index, &bw_ctx);
+    }
+}
+
+/*
+ * fib_entry_recursive_loop_detect
+ *
+ * Append this entry to the vector of entries visited so far and ask the
+ * entry's parent path-list to continue the loop detection. If the
+ * path-list's looped state changed as a result, each valid load-balance
+ * of the entry is rebuilt in place.
+ * Returns the path-list's verdict, or 0 if the entry has no parent.
+ */
+int
+fib_entry_recursive_loop_detect (fib_node_index_t entry_index,
+                                fib_node_index_t **entry_indicies)
+{
+    fib_forward_chain_type_t fct;
+    fib_node_index_t *visited;
+    int looped_before, looped_now;
+    fib_entry_t *fib_entry;
+
+    fib_entry = fib_entry_get(entry_index);
+
+    if (FIB_NODE_INDEX_INVALID == fib_entry->fe_parent)
+    {
+       /*
+        * the entry is currently not linked to a path-list. this happens
+        * when it is this entry that is re-linking path-lists and has thus
+        * broken the loop
+        */
+       return (0);
+    }
+    /* work on a local copy of the vector pointer and write it back after */
+    visited = *entry_indicies;
+    vec_add1(visited, entry_index);
+    looped_before = fib_path_list_is_looped(fib_entry->fe_parent);
+    looped_now = fib_path_list_recursive_loop_detect(fib_entry->fe_parent,
+                                                     &visited);
+    *entry_indicies = visited;
+
+    if (!!looped_before == !!looped_now)
+    {
+       return (looped_now);
+    }
+
+    /* looped state flipped: re-evaluate the forwarding, modifying in place */
+    FOR_EACH_FIB_FORW_CHAIN(fct)
+    {
+       if (dpo_id_is_valid(&fib_entry->fe_lb[fct]))
+           fib_entry_src_mk_lb(fib_entry, fib_entry_get_best_src_i(fib_entry),
+                               fct, &fib_entry->fe_lb[fct]);
+    }
+    return (looped_now);
+}
+
+/* Return the resolving interface reported by the entry's parent path-list. */
+u32
+fib_entry_get_resolving_interface (fib_node_index_t entry_index)
+{
+    const fib_entry_t *entry = fib_entry_get(entry_index);
+
+    /* the answer comes from the path-list, the entry only names it */
+    return (fib_path_list_get_resolving_interface(entry->fe_parent));
+}
+
+/*
+ * Report which source is currently the best one on the entry.
+ */
+fib_source_t
+fib_entry_get_best_source (fib_node_index_t entry_index)
+{
+    fib_entry_t *entry = fib_entry_get(entry_index);
+
+    /* translate the best source's info block to its fib_source_t */
+    return (fib_entry_src_get_source(fib_entry_get_best_src_i(entry)));
+}
+
+static int
+fib_ip4_address_compare (ip4_address_t * a1,
+                         ip4_address_t * a2)
+{
+    /*
+     * Three-way compare, in host byte order: returns -1, 0 or 1.
+     * The addresses are unsigned, so a simple subtraction won't cut it.
+     * Equal addresses return 0 so the caller can tie-break on prefix length.
+     */
+    u32 h1 = clib_net_to_host_u32(a1->data_u32);
+    u32 h2 = clib_net_to_host_u32(a2->data_u32);
+    return ((h1 > h2) - (h1 < h2));
+}
+
+/* Compare two IPv6 addresses 16 bits at a time; the first difference wins. */
+static int
+fib_ip6_address_compare (ip6_address_t * a1, ip6_address_t * a2)
+{
+    int word, delta = 0;
+
+    for (word = 0; 0 == delta && word < ARRAY_LEN (a1->as_u16); word++)
+    {
+        /* host byte order, so the numeric difference gives the ordering */
+        delta = (clib_net_to_host_u16 (a1->as_u16[word]) -
+                 clib_net_to_host_u16 (a2->as_u16[word]));
+    }
+    return (delta);
+}
+
+/*
+ * Order two entries by prefix: on the address first (for MPLS, on the
+ * label and then the EOS bit), and on the prefix length when those tie.
+ */
+static int
+fib_entry_cmp (fib_node_index_t fib_entry_index1,
+              fib_node_index_t fib_entry_index2)
+{
+    fib_prefix_t *pfx1, *pfx2;
+    int cmp = 0;
+
+    pfx1 = &fib_entry_get(fib_entry_index1)->fe_prefix;
+    pfx2 = &fib_entry_get(fib_entry_index2)->fe_prefix;
+
+    /* the protocol of the first entry selects the comparison */
+    switch (pfx1->fp_proto)
+    {
+    case FIB_PROTOCOL_IP4:
+        cmp = fib_ip4_address_compare(&pfx1->fp_addr.ip4, &pfx2->fp_addr.ip4);
+        break;
+    case FIB_PROTOCOL_IP6:
+        cmp = fib_ip6_address_compare(&pfx1->fp_addr.ip6, &pfx2->fp_addr.ip6);
+        break;
+    case FIB_PROTOCOL_MPLS:
+        cmp = (pfx1->fp_label - pfx2->fp_label);
+        if (0 == cmp)
+        {
+            cmp = (pfx1->fp_eos - pfx2->fp_eos);
+        }
+        break;
+    }
+    if (0 != cmp)
+        return (cmp);
+    return (pfx1->fp_len - pfx2->fp_len);
+}
+
+/* Sort adaptor: i1 and i2 point at the fib_node_index_t elements to compare. */
+int
+fib_entry_cmp_for_sort (void *i1, void *i2)
+{
+    fib_node_index_t *lhs = i1;
+    fib_node_index_t *rhs = i2;
+    return (fib_entry_cmp(*lhs, *rhs));
+}
+
+/* Lock the entry, via the fib_node_t it embeds as its base class. */
+void
+fib_entry_lock (fib_node_index_t fib_entry_index)
+{
+    fib_entry_t *entry = fib_entry_get(fib_entry_index);
+
+    /* the lock is applied to the embedded node */
+    fib_node_lock(&entry->fe_node);
+}
+
+/* Unlock the entry, via the fib_node_t it embeds as its base class. */
+void
+fib_entry_unlock (fib_node_index_t fib_entry_index)
+{
+    fib_entry_t *entry = fib_entry_get(fib_entry_index);
+
+    /* the unlock is applied to the embedded node */
+    fib_node_unlock(&entry->fe_node);
+}
+
+void
+fib_entry_module_init (void)
+{   /* register fib_entry_vft for nodes of type FIB_NODE_TYPE_ENTRY */
+    fib_node_register_type (FIB_NODE_TYPE_ENTRY, &fib_entry_vft);
+}
+
+/* Copy the entry's prefix into the caller-provided 'pfx'. */
+void
+fib_entry_get_prefix (fib_node_index_t fib_entry_index,
+                     fib_prefix_t *pfx)
+{
+    const fib_entry_t *entry = fib_entry_get(fib_entry_index);
+
+    *pfx = entry->fe_prefix;
+}
+
+/*
+ * Return the index of the FIB table the entry is in.
+ */
+u32
+fib_entry_get_fib_index (fib_node_index_t fib_entry_index)
+{
+    const fib_entry_t *entry = fib_entry_get(fib_entry_index);
+    return (entry->fe_fib_index);
+}
+
+u32
+fib_entry_pool_size (void)
+{   /* declared in fib_entry.h as being for testing purposes */
+    return (pool_elts(fib_entry_pool));
+}
+
+/*
+ * show_fib_entry_command
+ *
+ * Handler for the "show fib entry" CLI. If the input holds an index,
+ * show that entry in detail (or report it invalid); otherwise show
+ * every entry in the pool in brief.
+ */
+static clib_error_t *
+show_fib_entry_command (vlib_main_t * vm,
+                       unformat_input_t * input,
+                       vlib_cli_command_t * cmd)
+{
+    fib_node_index_t fei;
+
+    if (!unformat (input, "%d", &fei))
+    {
+       /* no index given: show all */
+       vlib_cli_output (vm, "FIB Entries:");
+       pool_foreach_index(fei, fib_entry_pool,
+        ({
+           vlib_cli_output (vm, "%d@%U",
+                            fei,
+                            format_fib_entry, fei,
+                            FIB_ENTRY_FORMAT_BRIEF);
+       }));
+    }
+    else if (pool_is_free_index(fib_entry_pool, fei))
+    {
+       vlib_cli_output (vm, "entry %d invalid", fei);
+    }
+    else
+    {
+       /* show one in detail */
+       vlib_cli_output (vm, "%d@%U",
+                        fei,
+                        format_fib_entry, fei,
+                        FIB_ENTRY_FORMAT_DETAIL2);
+    }
+
+    return (NULL);
+}
+
+VLIB_CLI_COMMAND (show_fib_entry, static) = {
+  .path = "show fib entry",
+  .function = show_fib_entry_command,
+  .short_help = "show fib entry [<index>]", /* the handler accepts an optional index */
+};
diff --git a/vnet/vnet/fib/fib_entry.h b/vnet/vnet/fib/fib_entry.h
new file mode 100644 (file)
index 0000000..ac22c17
--- /dev/null
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FIB_ENTRY_H__
+#define __FIB_ENTRY_H__
+
+#include <vnet/fib/fib_node.h>
+#include <vnet/adj/adj.h>
+#include <vnet/ip/ip.h>
+#include <vnet/dpo/dpo.h>
+
+/**
+ * The different sources that can create a route.
+ * The sources are defined here in their relative priority order.
+ * The lower the value the higher the priority
+ */
+typedef enum fib_source_t_ {
+    /**
+     * Marker. Add new values after this one.
+     */
+    FIB_SOURCE_FIRST,
+    /**
+     * Special sources. These are for entries that are added to all
+     * FIBs by default, and should never be over-ridden (hence they
+     * are the highest priority)
+     */
+    FIB_SOURCE_SPECIAL = FIB_SOURCE_FIRST,
+    /**
+     * Classify. A route that links directly to a classify adj
+     */
+    FIB_SOURCE_CLASSIFY,
+    /**
+     * Route added as a result of interface configuration.
+     * this will also come from the API/CLI, but the distinction is
+     * that it is from configuration on an interface, not an 'ip route' command
+     */
+    FIB_SOURCE_INTERFACE,
+    /**
+     * A high priority source a plugin can use
+     */
+    FIB_SOURCE_PLUGIN_HI,
+    /**
+     * From the control plane API
+     */
+    FIB_SOURCE_API,
+    /**
+     * From the CLI.
+     */
+    FIB_SOURCE_CLI,
+    /**
+     * LISP
+     */
+    FIB_SOURCE_LISP,
+    /**
+     * SRv6
+     */
+    FIB_SOURCE_SR,
+    /**
+     * IPv[46] Mapping
+     */
+    FIB_SOURCE_MAP,
+    /**
+     * SIXRD
+     */
+    FIB_SOURCE_SIXRD,
+    /**
+     * DHCP
+     */
+    FIB_SOURCE_DHCP,
+    /**
+     * Adjacency source.
+     * routes created as a result of ARP/ND entries. This is lower priority
+     * than the API/CLI. This is on purpose. trust me.
+     */
+    FIB_SOURCE_ADJ,
+    /**
+     * MPLS label. The prefix has been assigned a local label. This source
+     * never provides forwarding information, instead it acts as a place-holder
+     * so the association of label to prefix can be maintained
+     */
+    FIB_SOURCE_MPLS,
+    /**
+     * Attached Export source.
+     * routes created as a result of attached export. routes thus sourced
+     * will be present in the export tables
+     */
+    FIB_SOURCE_AE,
+    /**
+     * Recursive resolution source.
+     * Used to install an entry that is the resolution target of another.
+     */
+    FIB_SOURCE_RR,
+    /**
+     * The default route source.
+     * The default route is always added to the FIB table (like the
+     * special sources) but we need to be able to over-ride it with
+     * 'ip route' sources when provided
+     */
+    FIB_SOURCE_DEFAULT_ROUTE,
+    /**
+     * Marker. add new entries before this one.
+     */
+    FIB_SOURCE_LAST = FIB_SOURCE_DEFAULT_ROUTE,
+} __attribute__ ((packed)) fib_source_t;
+
+_Static_assert (sizeof(fib_source_t) == 1,
+               "FIB too many sources");
+
+/**
+ * The number of sources, i.e. one more than the last valid fib_source_t
+ */
+#define FIB_SOURCE_MAX (FIB_SOURCE_LAST+1)
+
+#define FIB_SOURCES { /* a name string per fib_source_t, in enum order */ \
+    [FIB_SOURCE_SPECIAL] = "special",                  \
+    [FIB_SOURCE_CLASSIFY] = "classify",                \
+    [FIB_SOURCE_INTERFACE] = "interface",              \
+    [FIB_SOURCE_PLUGIN_HI] = "plugin-hi",              \
+    [FIB_SOURCE_API] = "API",                          \
+    [FIB_SOURCE_CLI] = "CLI",                          \
+    [FIB_SOURCE_LISP] = "LISP",                        \
+    [FIB_SOURCE_SR] = "SR",                            \
+    [FIB_SOURCE_MAP] = "MAP",                          \
+    [FIB_SOURCE_SIXRD] = "SixRD",                      \
+    [FIB_SOURCE_DHCP] = "DHCP",                        \
+    [FIB_SOURCE_ADJ] = "adjacency",                    \
+    [FIB_SOURCE_MPLS] = "mpls",                        \
+    [FIB_SOURCE_AE] = "attached_export",               \
+    [FIB_SOURCE_RR] = "recursive-resolution",          \
+    [FIB_SOURCE_DEFAULT_ROUTE] = "default-route" }
+
+#define FOR_EACH_FIB_SOURCE(_item) /* lowest value, i.e. highest priority, first */ \
+    for (_item = FIB_SOURCE_FIRST; _item < FIB_SOURCE_MAX; _item++)
+
+/**
+ * The attributes a FIB entry can have.
+ * Each value is a bit position; the matching bit-mask values
+ * are defined in fib_entry_flag_t.
+ */
+typedef enum fib_entry_attribute_t_ {
+    /**
+     * Marker. Add new values after this one.
+     */
+    FIB_ENTRY_ATTRIBUTE_FIRST,
+    /**
+     * Connected. The prefix is configured on an interface.
+     */
+    FIB_ENTRY_ATTRIBUTE_CONNECTED = FIB_ENTRY_ATTRIBUTE_FIRST,
+    /**
+     * Attached. The prefix is attached to an interface.
+     */
+    FIB_ENTRY_ATTRIBUTE_ATTACHED,
+    /**
+     * The route is an explicit drop.
+     */
+    FIB_ENTRY_ATTRIBUTE_DROP,
+    /**
+     * The route is exclusive. The client creating the route is
+     * providing an exclusive adjacency.
+     */
+    FIB_ENTRY_ATTRIBUTE_EXCLUSIVE,
+    /**
+     * The route is attached cross tables and thus imports covered
+     * prefixes from the other table.
+     */
+    FIB_ENTRY_ATTRIBUTE_IMPORT,
+    /**
+     * The prefix/address is local to this device
+     */
+    FIB_ENTRY_ATTRIBUTE_LOCAL,
+    /**
+     * Marker. add new entries before this one.
+     */
+    FIB_ENTRY_ATTRIBUTE_LAST = FIB_ENTRY_ATTRIBUTE_LOCAL,
+} fib_entry_attribute_t;
+
+/**
+ * The number of entry attributes
+ */
+#define FIB_ENTRY_ATTRIBUTE_MAX (FIB_ENTRY_ATTRIBUTE_LAST+1)
+
+#define FIB_ENTRY_ATTRIBUTES {                         \
+    [FIB_ENTRY_ATTRIBUTE_CONNECTED] = "connected",     \
+    [FIB_ENTRY_ATTRIBUTE_ATTACHED]  = "attached",      \
+    [FIB_ENTRY_ATTRIBUTE_IMPORT]    = "import",                \
+    [FIB_ENTRY_ATTRIBUTE_DROP]      = "drop",          \
+    [FIB_ENTRY_ATTRIBUTE_EXCLUSIVE] = "exclusive",      \
+    [FIB_ENTRY_ATTRIBUTE_LOCAL]     = "local",         \
+}
+
+#define FOR_EACH_FIB_ATTRIBUTE(_item)                  \
+    for (_item = FIB_ENTRY_ATTRIBUTE_FIRST;            \
+        _item < FIB_ENTRY_ATTRIBUTE_MAX;               \
+        _item++)
+
+typedef enum fib_entry_flag_t_ { /* one bit per fib_entry_attribute_t */
+    FIB_ENTRY_FLAG_NONE      = 0,
+    FIB_ENTRY_FLAG_CONNECTED = (1 << FIB_ENTRY_ATTRIBUTE_CONNECTED),
+    FIB_ENTRY_FLAG_ATTACHED  = (1 << FIB_ENTRY_ATTRIBUTE_ATTACHED),
+    FIB_ENTRY_FLAG_DROP      = (1 << FIB_ENTRY_ATTRIBUTE_DROP),
+    FIB_ENTRY_FLAG_EXCLUSIVE = (1 << FIB_ENTRY_ATTRIBUTE_EXCLUSIVE),
+    FIB_ENTRY_FLAG_LOCAL     = (1 << FIB_ENTRY_ATTRIBUTE_LOCAL),
+    FIB_ENTRY_FLAG_IMPORT    = (1 << FIB_ENTRY_ATTRIBUTE_IMPORT),
+} fib_entry_flag_t;
+
+/**
+ * Attributes of the per-source data; bit positions for fib_entry_src_flag_t
+ */
+typedef enum fib_entry_src_attribute_t_ {
+    /**
+     * Marker. Add new values after this one.
+     */
+    FIB_ENTRY_SRC_ATTRIBUTE_FIRST,
+    /**
+     * the source has been added to the entry
+     */
+    FIB_ENTRY_SRC_ATTRIBUTE_ADDED = FIB_ENTRY_SRC_ATTRIBUTE_FIRST,
+    /**
+     * the source is active/best
+     */
+    FIB_ENTRY_SRC_ATTRIBUTE_ACTIVE,
+    /**
+     * Marker. add new entries before this one.
+     */
+    FIB_ENTRY_SRC_ATTRIBUTE_LAST = FIB_ENTRY_SRC_ATTRIBUTE_ACTIVE,
+} fib_entry_src_attribute_t;
+
+#define FIB_ENTRY_SRC_ATTRIBUTE_MAX (FIB_ENTRY_SRC_ATTRIBUTE_LAST+1)
+
+#define FIB_ENTRY_SRC_ATTRIBUTES {              \
+    [FIB_ENTRY_SRC_ATTRIBUTE_ADDED]  = "added",         \
+    [FIB_ENTRY_SRC_ATTRIBUTE_ACTIVE] = "active", \
+}
+
+typedef enum fib_entry_src_flag_t_ {
+    FIB_ENTRY_SRC_FLAG_NONE   = 0,
+    FIB_ENTRY_SRC_FLAG_ADDED  = (1 << FIB_ENTRY_SRC_ATTRIBUTE_ADDED),
+    FIB_ENTRY_SRC_FLAG_ACTIVE = (1 << FIB_ENTRY_SRC_ATTRIBUTE_ACTIVE),
+} __attribute__ ((packed)) fib_entry_src_flag_t;
+
+/*
+ * Keep the size of the flags field to at most 2 bytes, so it can be
+ * placed next to the reference count (a u8 in fib_entry_src_t)
+ */
+_Static_assert (sizeof(fib_entry_src_flag_t) <= 2,
+               "FIB entry flags field size too big");
+
+/**
+ * Information related to the source of a FIB entry
+ */
+typedef struct fib_entry_src_t_ {
+    /**
+     * The path-list created by the source
+     */
+    fib_node_index_t fes_pl;
+    /**
+     * Which source this info block is for
+     */
+    fib_source_t fes_src;
+    /**
+     * Flags on the source
+     */
+    fib_entry_src_flag_t fes_flags;
+    /**
+     * Flags the source contributes to the entry
+     */
+    fib_entry_flag_t fes_entry_flags;
+
+    /**
+     * 1 byte ref count. This is not the number of users of the Entry
+     * (which is itself not large, due to path-list sharing), but the number
+     * of times a given source has been added. Which is even fewer
+     */
+    u8 fes_ref_count;
+
+    /**
+     * A vector of path extensions
+     */
+    struct fib_path_ext_t_ *fes_path_exts;
+    
+    /**
+     * Source specific info
+     */
+    union {
+       struct {
+           /**
+            * the index of the FIB entry that is the covering entry
+            */
+           fib_node_index_t fesr_cover;
+           /**
+            * This source's index in the cover's list
+            */
+           u32 fesr_sibling;
+       } rr;
+       struct {
+           /**
+            * the index of the FIB entry that is the covering entry
+            */
+           fib_node_index_t fesa_cover;
+           /**
+            * This source's index in the cover's list
+            */
+           u32 fesa_sibling;
+       } adj;
+       struct {
+           /**
+            * the index of the FIB entry that is the covering entry
+            */
+           fib_node_index_t fesi_cover;
+           /**
+            * This source's index in the cover's list
+            */
+           u32 fesi_sibling;
+       } interface;
+       struct {
+           /**
+            * The MPLS local label associated with the prefix.
+            */
+           mpls_label_t fesm_label;
+
+           /**
+            * the indices of the LFIB entries created
+            */
+           fib_node_index_t fesm_lfes[2];
+       } mpls;
+       struct {
+           /**
+            * The source FIB index.
+            */
+            fib_node_index_t fesl_fib_index;
+       } lisp;
+    };
+} fib_entry_src_t;
+
+/**
+ * An entry in a FIB table.
+ *
+ * This entry represents a route added to the FIB that is stored
+ * in one of the FIB tables.
+ */
+typedef struct fib_entry_t_ {
+    /**
+     * Base class. The entry's node representation in the graph.
+     */
+    fib_node_t fe_node;
+    /**
+     * The prefix of the route
+     */
+    fib_prefix_t fe_prefix;
+    /**
+     * The index of the FIB table this entry is in
+     */
+    u32 fe_fib_index;
+    /**
+     * The load-balance used for forwarding.
+     *
+     * We don't share the EOS and non-EOS even in case when they could be
+     * because:
+     *   - complexity & reliability v. memory
+     *       determining the conditions where sharing is possible is non-trivial.
+     *   - separate LBs means we can get the EOS bit right in the MPLS label DPO
+     *     and so save a few clock cycles in the DP imposition node since we can
+     *     paint the header straight on without the need to check the packet
+     *     type to derive the EOS bit value.
+     */
+    dpo_id_t fe_lb[FIB_FORW_CHAIN_NUM];
+    /**
+     * Vector of source infos.
+     * Most entries will only have 1 source. So we optimise for memory usage,
+     * which is preferable since we have many entries.
+     */
+    fib_entry_src_t *fe_srcs;
+    /**
+     * the path-list for which this entry is a child. This is also the path-list
+     * that is contributing forwarding for this entry.
+     */
+    fib_node_index_t fe_parent;
+    /**
+     * index of this entry in the parent's child list.
+     * This is set when this entry is added as a child, but can also
+     * be changed by the parent as it manages its list.
+     */
+    u32 fe_sibling;
+    /**
+     * Dependency list of covered entries.
+     * these are more specific entries that are interested in changes
+     * to their respective cover
+     */
+    fib_node_list_t fe_covered;
+    /**
+     * exporter and importer (NOTE(review): presumably for attached export - confirm)
+     */
+    fib_node_index_t fe_export;
+    fib_node_index_t fe_import;
+} fib_entry_t;
+
+#define FOR_EACH_FIB_ENTRY_FLAG(_item) /* NOTE(review): FIB_ENTRY_FLAG_FIRST/_MAX are not defined in this header */ \
+    for (_item = FIB_ENTRY_FLAG_FIRST; _item < FIB_ENTRY_FLAG_MAX; _item++)
+
+#define FIB_ENTRY_FORMAT_BRIEF   (0x0)
+#define FIB_ENTRY_FORMAT_DETAIL  (0x1)
+#define FIB_ENTRY_FORMAT_DETAIL2 (0x2)
+
+extern u8 *format_fib_entry (u8 * s, va_list * args);
+
+extern fib_node_index_t fib_entry_create_special(u32 fib_index,
+                                                const fib_prefix_t *prefix,
+                                                fib_source_t source,
+                                                fib_entry_flag_t flags,
+                                                const dpo_id_t *dpo);
+
+extern fib_node_index_t fib_entry_create (u32 fib_index,
+                                         const fib_prefix_t *prefix,
+                                         fib_source_t source,
+                                         fib_entry_flag_t flags,
+                                         const fib_route_path_t *paths);
+extern void fib_entry_update (fib_node_index_t fib_entry_index,
+                             fib_source_t source,
+                             fib_entry_flag_t flags,
+                             const fib_route_path_t *paths);
+
+extern void fib_entry_path_add(fib_node_index_t fib_entry_index,
+                              fib_source_t source,
+                              fib_entry_flag_t flags,
+                              const fib_route_path_t *rpath);
+extern void fib_entry_special_add(fib_node_index_t fib_entry_index,
+                                 fib_source_t source,
+                                 fib_entry_flag_t flags,
+                                 const dpo_id_t *dpo);
+extern fib_entry_src_flag_t fib_entry_special_remove(fib_node_index_t fib_entry_index,
+                                                    fib_source_t source);
+
+extern fib_entry_src_flag_t fib_entry_path_remove(fib_node_index_t fib_entry_index,
+                                                 fib_source_t source,
+                                                 const fib_route_path_t *rpath);
+extern fib_entry_src_flag_t fib_entry_delete(fib_node_index_t fib_entry_index,
+                                            fib_source_t source);
+
+extern void fib_entry_contribute_forwarding(
+    fib_node_index_t fib_entry_index,
+    fib_forward_chain_type_t type,
+    dpo_id_t *dpo);
+extern const dpo_id_t * fib_entry_contribute_ip_forwarding(
+    fib_node_index_t fib_entry_index);
+extern adj_index_t fib_entry_get_adj_for_source(
+    fib_node_index_t fib_entry_index,
+    fib_source_t source);
+extern const int fib_entry_get_dpo_for_source (
+    fib_node_index_t fib_entry_index,
+    fib_source_t source,
+    dpo_id_t *dpo);
+
+extern adj_index_t fib_entry_get_adj(fib_node_index_t fib_entry_index);
+
+extern int fib_entry_cmp_for_sort(void *i1, void *i2);
+
+extern void fib_entry_cover_changed(fib_node_index_t fib_entry);
+extern void fib_entry_cover_updated(fib_node_index_t fib_entry);
+extern int fib_entry_recursive_loop_detect(fib_node_index_t entry_index,
+                                          fib_node_index_t **entry_indicies);
+
+extern void fib_entry_lock(fib_node_index_t fib_entry_index);
+extern void fib_entry_unlock(fib_node_index_t fib_entry_index);
+
+extern u32 fib_entry_child_add(fib_node_index_t fib_entry_index,
+                              fib_node_type_t type,
+                              fib_node_index_t child_index);
+extern void fib_entry_child_remove(fib_node_index_t fib_entry_index,
+                                  u32 sibling_index);
+extern u32 fib_entry_get_resolving_interface(fib_node_index_t fib_entry_index);
+
+extern void fib_entry_get_prefix(fib_node_index_t fib_entry_index,
+                                fib_prefix_t *pfx);
+extern u32 fib_entry_get_fib_index(fib_node_index_t fib_entry_index);
+extern void fib_entry_set_source_data(fib_node_index_t fib_entry_index,
+                                      fib_source_t source,
+                                      const void *data);
+extern const void* fib_entry_get_source_data(fib_node_index_t fib_entry_index,
+                                             fib_source_t source);
+
+extern fib_entry_flag_t fib_entry_get_flags(fib_node_index_t fib_entry_index);
+extern fib_source_t fib_entry_get_best_source(fib_node_index_t fib_entry_index);
+extern int fib_entry_is_sourced(fib_node_index_t fib_entry_index,
+                                fib_source_t source);
+
+extern fib_node_index_t fib_entry_get_path_list(fib_node_index_t fib_entry_index);
+extern u32 fib_entry_get_fib_table_id(fib_node_index_t fib_entry_index);
+
+extern void fib_entry_module_init(void);
+
+/*
+ * unsafe... beware the raw pointer.
+ */
+extern fib_node_index_t fib_entry_get_index(const fib_entry_t * fib_entry);
+extern fib_entry_t * fib_entry_get(fib_node_index_t fib_entry_index);
+
+/*
+ * for testing purposes.
+ */
+extern u32 fib_entry_pool_size(void);
+
+#endif
diff --git a/vnet/vnet/fib/fib_entry_cover.c b/vnet/vnet/fib/fib_entry_cover.c
new file mode 100644 (file)
index 0000000..06b5b91
--- /dev/null
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/fib/fib_entry_cover.h>
+#include <vnet/fib/fib_entry_src.h>
+#include <vnet/fib/fib_node_list.h>
+
+/**
+ * Add 'covered' to the list of entries tracking 'cover'. The list is
+ * created the first time an entry is tracked.
+ *
+ * @param cover   the covering entry
+ * @param covered index of the entry that tracks the cover; must not be the
+ *                cover itself (asserted)
+ * @return the value returned by fib_node_list_push_front(); this is the
+ *         handle fib_entry_cover_untrack() expects
+ */
+u32
+fib_entry_cover_track (fib_entry_t* cover,
+                      fib_node_index_t covered)
+{
+    u32 tracked_index;
+
+    FIB_ENTRY_DBG(cover, "cover-track %d", covered);
+
+    ASSERT(covered != fib_entry_get_index(cover));
+
+    /* lazily allocate the list of covered entries */
+    if (cover->fe_covered == FIB_NODE_INDEX_INVALID)
+    {
+        cover->fe_covered = fib_node_list_create();
+    }
+
+    tracked_index = fib_node_list_push_front(cover->fe_covered,
+                                             0,
+                                             FIB_NODE_TYPE_ENTRY,
+                                             covered);
+
+    return (tracked_index);
+}
+
+/**
+ * Stop tracking a cover. Does nothing when the cover has no list of
+ * covered entries; destroys the list once its last element is removed.
+ *
+ * @param cover         the covering entry
+ * @param tracked_index the handle returned by fib_entry_cover_track()
+ */
+void
+fib_entry_cover_untrack (fib_entry_t* cover,
+                        u32 tracked_index)
+{
+    FIB_ENTRY_DBG(cover, "cover-untrack @ %d", tracked_index);
+
+    if (cover->fe_covered != FIB_NODE_INDEX_INVALID)
+    {
+        fib_node_list_remove(cover->fe_covered, tracked_index);
+
+        /* an empty list is not kept around */
+        if (fib_node_list_get_size(cover->fe_covered) == 0)
+        {
+            fib_node_list_destroy(&cover->fe_covered);
+        }
+    }
+}
+
+/**
+ * Internal struct to hold user supplied parameters for the cover walk
+ */
+typedef struct fib_enty_cover_walk_ctx_t_ {
+    /* the cover whose covered entries are being walked */
+    fib_entry_t *cover;
+    /* the user's callback, invoked once per covered entry */
+    fib_entry_covered_walk_t walk;
+    /* opaque user context handed back to the callback */
+    void *ctx;
+} fib_enty_cover_walk_ctx_t;
+
+/**
+ * Adapter between the node-list walk and the user's covered-entry walk.
+ * The user callback's return value is discarded; the list walk is always
+ * told to continue.
+ */
+static int
+fib_entry_cover_walk_node_ptr (fib_node_ptr_t *depend,
+                              void *args)
+{
+    fib_enty_cover_walk_ctx_t *walk_ctx = args;
+    fib_entry_t *cover = walk_ctx->cover;
+
+    walk_ctx->walk(cover, depend->fnp_index, walk_ctx->ctx);
+
+    /* non-zero => keep walking */
+    return (1);
+}
+
+/**
+ * Invoke 'walk' for each entry tracking 'cover'. A cover with no list of
+ * covered entries is not walked.
+ *
+ * @param cover the covering entry
+ * @param walk  callback invoked with (cover, covered index, args)
+ * @param args  opaque context passed through to the callback
+ */
+void
+fib_entry_cover_walk (fib_entry_t *cover,
+                     fib_entry_covered_walk_t walk,
+                     void *args)
+{
+    fib_enty_cover_walk_ctx_t walk_ctx;
+
+    if (FIB_NODE_INDEX_INVALID == cover->fe_covered)
+    {
+        return;
+    }
+
+    walk_ctx.cover = cover;
+    walk_ctx.walk = walk;
+    walk_ctx.ctx = args;
+
+    fib_node_list_walk(cover->fe_covered,
+                       fib_entry_cover_walk_node_ptr,
+                       &walk_ctx);
+}
+
+/**
+ * @return the number of entries tracking 'cover'; 0 when the cover has no
+ *         list of covered entries.
+ */
+u32
+fib_entry_cover_get_size (fib_entry_t *cover)
+{
+    if (FIB_NODE_INDEX_INVALID == cover->fe_covered)
+    {
+        return (0);
+    }
+
+    return (fib_node_list_get_size(cover->fe_covered));
+}
+
+/**
+ * Context for formatting the list of covered entries
+ */
+typedef struct fib_entry_cover_list_format_ctx_t_ {
+    /* the output being built; replaced by the result of each format() call */
+    u8 *s;
+} fib_entry_cover_list_format_ctx_t;
+
+/**
+ * Covered-entry walk callback: append the covered entry's index, followed
+ * by ", ", to the string held in the format context.
+ */
+static int
+fib_entry_covered_list_format_one (fib_entry_t *cover,
+                                  fib_node_index_t covered,
+                                  void *args)
+{
+    fib_entry_cover_list_format_ctx_t *fmt_ctx = args;
+    u8 *out = fmt_ctx->s;
+
+    out = format(out, "%d, ", covered);
+    fmt_ctx->s = out;
+
+    /* non-zero => keep walking */
+    return (1);
+}
+
+/**
+ * Append the indices of all entries covered by 'fib_entry' to 's'.
+ *
+ * @param fib_entry the covering entry
+ * @param s         the string to append to
+ * @return the (possibly different) string pointer after the appends
+ */
+u8*
+fib_entry_cover_list_format (fib_entry_t *fib_entry,
+                            u8 *s)
+{
+    fib_entry_cover_list_format_ctx_t fmt_ctx;
+
+    fmt_ctx.s = s;
+
+    fib_entry_cover_walk(fib_entry,
+                         fib_entry_covered_list_format_one,
+                         &fmt_ctx);
+
+    return (fmt_ctx.s);
+}
+
+/**
+ * Covered-entry walk callback used when the cover relationship may have
+ * changed.
+ *
+ * The 3 entries involved here are:
+ *   cover     - the least specific. It will cover both the others
+ *   new_cover - the entry just inserted below the cover; its index is
+ *               carried in 'args'
+ *   covered   - the entry that was tracking the cover.
+ *
+ * 'covered' is notified when no new cover was supplied, or when the new
+ * cover's prefix covers its prefix.
+ */
+static int
+fib_entry_cover_change_one (fib_entry_t *cover,
+                           fib_node_index_t covered,
+                           void *args)
+{
+    fib_prefix_t pfx_covered, pfx_new_cover;
+    fib_node_index_t new_cover;
+
+    new_cover = pointer_to_uword(args);
+
+    if (FIB_NODE_INDEX_INVALID == new_cover)
+    {
+        /*
+         * nothing has been inserted, which implies the cover was removed.
+         * 'cover' is thus the new cover.
+         */
+        fib_entry_cover_changed(covered);
+        return (1);
+    }
+
+    if (new_cover == covered)
+    {
+        /* the newly inserted entry is itself in the list; skip it */
+        return (1);
+    }
+
+    /* determine if new_cover is a cover for covered */
+    fib_entry_get_prefix(covered, &pfx_covered);
+    fib_entry_get_prefix(new_cover, &pfx_new_cover);
+
+    if (fib_prefix_is_cover(&pfx_new_cover, &pfx_covered))
+    {
+        fib_entry_cover_changed(covered);
+    }
+
+    /* non-zero => keep walking */
+    return (1);
+}
+
+/**
+ * Walk the entries tracking the cover and let each decide whether its
+ * cover has changed.
+ *
+ * @param cover_index index of the cover whose covered entries are walked
+ * @param covered     index handed to each callback as the newly inserted
+ *                    entry; FIB_NODE_INDEX_INVALID makes every covered
+ *                    entry get notified
+ */
+void
+fib_entry_cover_change_notify (fib_node_index_t cover_index,
+                              fib_node_index_t covered)
+{
+    void *walk_arg = uword_to_pointer(covered, void*);
+
+    fib_entry_cover_walk(fib_entry_get(cover_index),
+                         fib_entry_cover_change_one,
+                         walk_arg);
+}
+
+/**
+ * Covered-entry walk callback: tell the covered entry that its cover has
+ * been updated. 'cover' and 'args' are unused.
+ */
+static int
+fib_entry_cover_update_one (fib_entry_t *cover,
+                           fib_node_index_t covered,
+                           void *args)
+{
+    const int keep_walking = 1;
+
+    fib_entry_cover_updated(covered);
+
+    return (keep_walking);
+}
+
+/**
+ * Notify every entry tracking 'fib_entry' that this cover has been updated.
+ */
+void
+fib_entry_cover_update_notify (fib_entry_t *fib_entry)
+{
+    void *no_ctx = NULL;
+
+    fib_entry_cover_walk(fib_entry, fib_entry_cover_update_one, no_ctx);
+}
diff --git a/vnet/vnet/fib/fib_entry_cover.h b/vnet/vnet/fib/fib_entry_cover.h
new file mode 100644 (file)
index 0000000..fbbbc21
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FIB_ENTRY_COVER_H__
+#define __FIB_ENTRY_COVER_H__
+
+#include "fib_entry.h"
+
+/**
+ * callback function used when walking the covered entries.
+ *
+ * @param cover   the cover being walked
+ * @param covered index of one entry tracking that cover
+ * @param ctx     the opaque context given to fib_entry_cover_walk()
+ * @return the callbacks in fib_entry_cover.c return 1; the walk itself
+ *         ignores the value and always visits every covered entry
+ */
+typedef int (*fib_entry_covered_walk_t)(fib_entry_t *cover,
+                                       fib_node_index_t covered,
+                                       void *ctx);
+
+extern u32 fib_entry_cover_track(fib_entry_t *cover,
+                                fib_node_index_t covered);
+
+extern void fib_entry_cover_untrack(fib_entry_t *cover,
+                                   u32 tracked_index);
+
+extern void fib_entry_cover_walk(fib_entry_t *cover,
+                                fib_entry_covered_walk_t walk,
+                                void *ctx);
+
+extern void fib_entry_cover_change_notify(fib_node_index_t cover_index,
+                                         fib_node_index_t covered_index);
+extern void fib_entry_cover_update_notify(fib_entry_t *cover);
+
+extern u32 fib_entry_cover_get_size(fib_entry_t *cover);
+
+extern u8* fib_entry_cover_list_format(fib_entry_t *fib_entry,
+                                      u8 *s);
+
+#endif
diff --git a/vnet/vnet/fib/fib_entry_src.c b/vnet/vnet/fib/fib_entry_src.c
new file mode 100644 (file)
index 0000000..f7d84e5
--- /dev/null
@@ -0,0 +1,1278 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/adj/adj.h>
+#include <vnet/dpo/load_balance.h>
+#include <vnet/dpo/mpls_label_dpo.h>
+#include <vnet/dpo/drop_dpo.h>
+
+#include "fib_entry_src.h"
+#include "fib_table.h"
+#include "fib_path_ext.h"
+
+/*
+ * per-source type vft, indexed by fib_source_t and filled in by
+ * fib_entry_src_register(). The table has static storage, so the slots of
+ * sources that never register hold NULL function pointers; the users in
+ * this file NULL-check a member before invoking it.
+ */
+static fib_entry_src_vft_t fib_entry_src_vft[FIB_SOURCE_MAX];
+
+/**
+ * @return the protocol of the entry's prefix
+ */
+static fib_protocol_t
+fib_entry_get_proto (const fib_entry_t * fib_entry)
+{
+    const fib_prefix_t *pfx = &fib_entry->fe_prefix;
+
+    return (pfx->fp_proto);
+}
+
+/**
+ * Register the virtual function table for a source. The table is copied,
+ * so the caller's vft need not outlive the call.
+ *
+ * @param source the source being registered; must be less than
+ *               FIB_SOURCE_MAX (asserted) since it indexes the vft table
+ * @param vft    the functions to use for that source; must not be NULL
+ *               (asserted)
+ */
+void
+fib_entry_src_register (fib_source_t source,
+                       const fib_entry_src_vft_t *vft)
+{
+    /* an out of range source would write beyond the end of the table */
+    ASSERT(source < FIB_SOURCE_MAX);
+    ASSERT(NULL != vft);
+
+    fib_entry_src_vft[source] = *vft;
+}
+
+/**
+ * Comparison function for sorting an entry's sources: orders by ascending
+ * fes_src value.
+ *
+ * @return negative, zero or positive when v1's source is respectively
+ *         less than, equal to or greater than v2's
+ */
+static int
+fib_entry_src_cmp_for_sort (void * v1,
+                           void * v2)
+{
+    fib_entry_src_t *lhs = v1;
+    fib_entry_src_t *rhs = v2;
+
+    return (lhs->fes_src - rhs->fes_src);
+}
+
+/**
+ * Create the per-source state for 'source' on the entry. The new state
+ * starts with no path-list and no flags, is passed to the source's
+ * fesv_init function if there is one, and is then added to the entry's
+ * source vector, which is re-sorted.
+ */
+void
+fib_entry_src_action_init (fib_entry_t *fib_entry,
+                          fib_source_t source)
+
+{
+    const fib_entry_src_vft_t *vft = &fib_entry_src_vft[source];
+    fib_entry_src_t new_src = {
+        .fes_src = source,
+        .fes_flags = FIB_ENTRY_SRC_FLAG_NONE,
+        .fes_pl = FIB_NODE_INDEX_INVALID,
+    };
+
+    if (NULL != vft->fesv_init)
+    {
+        vft->fesv_init(&new_src);
+    }
+
+    /* keep the vector of sources sorted */
+    vec_add1(fib_entry->fe_srcs, new_src);
+    vec_sort_with_function(fib_entry->fe_srcs,
+                           fib_entry_src_cmp_for_sort);
+}
+
+/**
+ * Find the per-source state for 'source' on the entry.
+ *
+ * @param fib_entry the entry to search
+ * @param source    the source to look for
+ * @param index     if non-NULL and the source is found, set to the
+ *                  source's position in the entry's source vector;
+ *                  untouched otherwise
+ * @return the source's state, or NULL if the entry does not have it
+ */
+static fib_entry_src_t *
+fib_entry_src_find (const fib_entry_t *fib_entry,
+                   fib_source_t source,
+                   u32 *index)
+
+{
+    u32 pos;
+
+    for (pos = 0; pos < vec_len(fib_entry->fe_srcs); pos++)
+    {
+        fib_entry_src_t *candidate = &fib_entry->fe_srcs[pos];
+
+        if (candidate->fes_src != source)
+        {
+            continue;
+        }
+
+        if (NULL != index)
+        {
+            *index = pos;
+        }
+        return (candidate);
+    }
+
+    return (NULL);
+}
+
+/**
+ * @return non-zero if the entry has state for 'source', 0 otherwise
+ */
+int
+fib_entry_is_sourced (fib_node_index_t fib_entry_index,
+                      fib_source_t source)
+{
+    const fib_entry_t *fib_entry = fib_entry_get(fib_entry_index);
+    const fib_entry_src_t *esrc = fib_entry_src_find(fib_entry, source, NULL);
+
+    return (esrc != NULL);
+}
+
+/**
+ * Find the per-source state for 'source' on the entry, creating it first
+ * if the entry does not yet have it.
+ *
+ * @param fib_entry the entry to search/extend
+ * @param source    the source required
+ * @param index     if non-NULL, set to the source's position in the
+ *                  entry's source vector (previously this argument was
+ *                  accepted but never written)
+ * @return the source's state
+ */
+static fib_entry_src_t *
+fib_entry_src_find_or_create (fib_entry_t *fib_entry,
+                             fib_source_t source,
+                             u32 *index)
+{
+    fib_entry_src_t *esrc;
+
+    esrc = fib_entry_src_find(fib_entry, source, index);
+
+    if (NULL == esrc)
+    {
+        /*
+         * init appends to, and re-sorts, the source vector; so the
+         * new source must be looked up again to get its address
+         */
+        fib_entry_src_action_init(fib_entry, source);
+        esrc = fib_entry_src_find(fib_entry, source, index);
+    }
+
+    return (esrc);
+}
+
+/**
+ * Destroy the per-source state for 'source' on the entry. The source must
+ * be present (asserted). The source's fesv_deinit function is invoked if
+ * there is one, the source's path extensions are freed and the source is
+ * removed from the entry's source vector.
+ */
+void
+fib_entry_src_action_deinit (fib_entry_t *fib_entry,
+                            fib_source_t source)
+
+{
+    fib_entry_src_t *esrc;
+    u32 index = ~0;
+
+    esrc = fib_entry_src_find(fib_entry, source, &index);
+
+    ASSERT(NULL != esrc);
+
+    if (NULL != fib_entry_src_vft[source].fesv_deinit)
+    {
+        fib_entry_src_vft[source].fesv_deinit(esrc);
+    }
+
+    vec_free(esrc->fes_path_exts);
+    vec_del1(fib_entry->fe_srcs, index);
+
+    /*
+     * vec_del1 fills the hole with the last element, so the vector may
+     * no longer be in the order fib_entry_src_action_init established.
+     * restore it.
+     */
+    vec_sort_with_function(fib_entry->fe_srcs,
+                           fib_entry_src_cmp_for_sort);
+}
+
+/**
+ * Ask the source how the entry should react to a change of its cover.
+ *
+ * @return the result of the source's fesv_cover_change function, or, when
+ *         the source has none, "install" with no back-walk reason
+ */
+fib_entry_src_cover_res_t
+fib_entry_src_action_cover_change (fib_entry_t *fib_entry,
+                                  fib_source_t source)
+{
+    const fib_entry_src_vft_t *vft = &fib_entry_src_vft[source];
+    fib_entry_src_cover_res_t res = {
+        .install = !0,
+        .bw_reason = FIB_NODE_BW_REASON_FLAG_NONE,
+    };
+
+    if (NULL != vft->fesv_cover_change)
+    {
+        res = vft->fesv_cover_change(
+                  fib_entry_src_find(fib_entry, source, NULL),
+                  fib_entry);
+    }
+
+    return (res);
+}
+
+/**
+ * Ask the source how the entry should react to an update of its cover.
+ *
+ * @return the result of the source's fesv_cover_update function, or, when
+ *         the source has none, "install" with no back-walk reason
+ */
+fib_entry_src_cover_res_t
+fib_entry_src_action_cover_update (fib_entry_t *fib_entry,
+                                  fib_source_t source)
+{
+    const fib_entry_src_vft_t *vft = &fib_entry_src_vft[source];
+    fib_entry_src_cover_res_t res = {
+        .install = !0,
+        .bw_reason = FIB_NODE_BW_REASON_FLAG_NONE,
+    };
+
+    if (NULL != vft->fesv_cover_update)
+    {
+        res = vft->fesv_cover_update(
+                  fib_entry_src_find(fib_entry, source, NULL),
+                  fib_entry);
+    }
+
+    return (res);
+}
+
+/**
+ * Context used whilst walking a path-list to collect the forwarding
+ * contributed by each of its paths.
+ */
+typedef struct fib_entry_src_collect_forwarding_ctx_t_
+{
+    /* vector of next-hops accumulated so far, one per contributing path */
+    load_balance_path_t * next_hops;
+    /* the entry the load-balance is being built for */
+    const fib_entry_t *fib_entry;
+    /* the source whose path extensions are consulted */
+    const fib_entry_src_t *esrc;
+    /* the forwarding chain type being built */
+    fib_forward_chain_type_t fct;
+    /* set to 1 if any resolved path visited was recursive */
+    int is_recursive;
+} fib_entry_src_collect_forwarding_ctx_t;
+
+/**
+ * @brief Determine whether this FIB entry should use a load-balance MAP
+ * to support PIC edge fast convergence
+ *
+ * We'll use a LB map if the path-list has recursive paths.
+ * recursive paths implies BGP, and hence scale.
+ *
+ * @return LOAD_BALANCE_FLAG_USES_MAP when the walk saw a recursive path,
+ *         LOAD_BALANCE_FLAG_NONE otherwise
+ */
+load_balance_flags_t
+fib_entry_calc_lb_flags (fib_entry_src_collect_forwarding_ctx_t *ctx)
+{
+    return (ctx->is_recursive ?
+            LOAD_BALANCE_FLAG_USES_MAP :
+            LOAD_BALANCE_FLAG_NONE);
+}
+
+/**
+ * @return non-zero if 'label' may be used as an out-going label; that is
+ *         it is a real label, one of the IPv4/IPv6 explicit NULLs or the
+ *         implicit NULL. 0 otherwise.
+ */
+static int
+fib_entry_src_valid_out_label (mpls_label_t label)
+{
+    if (MPLS_LABEL_IS_REAL(label))
+    {
+        return (1);
+    }
+    if (MPLS_IETF_IPV4_EXPLICIT_NULL_LABEL == label)
+    {
+        return (1);
+    }
+    if (MPLS_IETF_IPV6_EXPLICIT_NULL_LABEL == label)
+    {
+        return (1);
+    }
+
+    return (MPLS_IETF_IMPLICIT_NULL_LABEL == label);
+}
+
+/**
+ * Path-list walk callback: collect the forwarding contributed by one path
+ * into the context's vector of next-hops.
+ *
+ * Unresolved paths contribute nothing. A path with a matching path
+ * extension carrying a valid out-going label contributes the stacked
+ * extension; otherwise what is contributed depends on the chain type.
+ *
+ * @param pl_index   the path-list being walked (unused)
+ * @param path_index the path being visited
+ * @param arg        a fib_entry_src_collect_forwarding_ctx_t
+ * @return always non-zero, so that the walk continues
+ */
+static int
+fib_entry_src_collect_forwarding (fib_node_index_t pl_index,
+                                  fib_node_index_t path_index,
+                                  void *arg)
+{
+    fib_entry_src_collect_forwarding_ctx_t *ctx;
+    fib_path_ext_t *path_ext, *matched_ext;
+
+    ctx = arg;
+
+    /*
+     * if the path is not resolved, don't include it.
+     */
+    if (!fib_path_is_resolved(path_index))
+    {
+        return (!0);
+    }
+
+    if (fib_path_is_recursive(path_index))
+    {
+        ctx->is_recursive = 1;
+    }
+
+    /*
+     * get the matching path-extension for the path being visited.
+     * the match is recorded explicitly; when the loop completes without
+     * a match the iterator points one past the end of the vector and
+     * must not be dereferenced.
+     */
+    matched_ext = NULL;
+    vec_foreach(path_ext, ctx->esrc->fes_path_exts)
+    {
+        if (path_ext->fpe_path_index == path_index)
+        {
+            matched_ext = path_ext;
+            break;
+        }
+    }
+
+    if (NULL != matched_ext &&
+        fib_entry_src_valid_out_label(matched_ext->fpe_label))
+    {
+        /*
+         * found a matching extension. stack it to obtain the forwarding
+         * info for this path.
+         */
+        ctx->next_hops = fib_path_ext_stack(matched_ext,
+                                            ctx->fct,
+                                            ctx->next_hops);
+    }
+    else
+    {
+        load_balance_path_t *nh;
+
+        /*
+         * no extension => no out-going label for this path. that's OK
+         * in the case of an IP or EOS chain, but not for non-EOS
+         */
+        switch (ctx->fct)
+        {
+        case FIB_FORW_CHAIN_TYPE_UNICAST_IP4:
+        case FIB_FORW_CHAIN_TYPE_UNICAST_IP6:
+            /*
+             * EOS traffic with no label to stack, we need the IP Adj
+             */
+            vec_add2(ctx->next_hops, nh, 1);
+
+            nh->path_index = path_index;
+            nh->path_weight = fib_path_get_weight(path_index);
+            fib_path_contribute_forwarding(path_index, ctx->fct, &nh->path_dpo);
+
+            break;
+        case FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS:
+            /*
+             * without a label only exclusive and deag paths contribute
+             * to the non-EOS chain
+             */
+            if (fib_path_is_exclusive(path_index) ||
+                fib_path_is_deag(path_index))
+            {
+                vec_add2(ctx->next_hops, nh, 1);
+
+                nh->path_index = path_index;
+                nh->path_weight = fib_path_get_weight(path_index);
+                fib_path_contribute_forwarding(path_index,
+                                               FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS,
+                                               &nh->path_dpo);
+            }
+            break;
+        case FIB_FORW_CHAIN_TYPE_MPLS_EOS:
+            ASSERT(0);
+            break;
+        }
+    }
+
+    return (!0);
+}
+
+/**
+ * Build, or update, the load-balance for the entry from the paths of the
+ * given source.
+ *
+ * @param fib_entry the entry the load-balance is for
+ * @param esrc      the source providing the path-list and extensions
+ * @param fct       the forwarding chain type to build
+ * @param dpo_lb    in/out; if not yet valid a load-balance is created and
+ *                  stored here, otherwise the one it refers to is updated
+ */
+void
+fib_entry_src_mk_lb (fib_entry_t *fib_entry,
+                    const fib_entry_src_t *esrc,
+                    fib_forward_chain_type_t fct,
+                    dpo_id_t *dpo_lb)
+{
+    dpo_proto_t lb_proto;
+
+    /*
+     * If the entry has path extensions then we construct a load-balance
+     * by stacking the extensions on the forwarding chains of the paths.
+     * Otherwise we use the load-balance of the path-list
+     */
+    fib_entry_src_collect_forwarding_ctx_t ctx = {
+        .esrc = esrc,
+        .fib_entry = fib_entry,
+        .next_hops = NULL,
+        .is_recursive = 0,
+        .fct = fct,
+    };
+
+    lb_proto = fib_proto_to_dpo(fib_entry_get_proto(fib_entry));
+
+    fib_path_list_walk(esrc->fes_pl,
+                       fib_entry_src_collect_forwarding,
+                       &ctx);
+
+    if (esrc->fes_entry_flags & FIB_ENTRY_FLAG_EXCLUSIVE)
+    {
+        /*
+         * the client provided the DPO that the entry should link to.
+         * all entries must link to a LB, so if it is an LB already
+         * then we can use it.
+         */
+        if ((1 == vec_len(ctx.next_hops)) &&
+            (DPO_LOAD_BALANCE == ctx.next_hops[0].path_dpo.dpoi_type))
+        {
+            dpo_copy(dpo_lb, &ctx.next_hops[0].path_dpo);
+            dpo_reset(&ctx.next_hops[0].path_dpo);
+
+            /*
+             * the vector of collected next-hops is local to this function
+             * and is not passed on in this case; release it.
+             */
+            vec_free(ctx.next_hops);
+            return;
+        }
+    }
+
+    if (!dpo_id_is_valid(dpo_lb))
+    {
+        /*
+         * first time create
+         */
+        flow_hash_config_t fhc;
+
+        fhc = fib_table_get_flow_hash_config(fib_entry->fe_fib_index,
+                                             dpo_proto_to_fib(lb_proto));
+        dpo_set(dpo_lb,
+                DPO_LOAD_BALANCE,
+                lb_proto,
+                load_balance_create(0, lb_proto, fhc));
+    }
+
+    /*
+     * NOTE(review): whether load_balance_multipath_update takes ownership
+     * of ctx.next_hops cannot be determined from here - confirm the
+     * vector is released on this path too.
+     */
+    load_balance_multipath_update(dpo_lb,
+                                  ctx.next_hops,
+                                  fib_entry_calc_lb_flags(&ctx));
+}
+
+/**
+ * Install the forwarding chain for the given source into the forwarding
+ * tables.
+ *
+ * The load-balance for the entry's default chain type is (re)built and
+ * written to the data-plane table. If the default chain is IPv4 or IPv6
+ * unicast, any MPLS chain types that already exist are rebuilt too.
+ */
+void
+fib_entry_src_action_install (fib_entry_t *fib_entry,
+                             fib_source_t source)
+{
+    fib_forward_chain_type_t fct;
+    fib_entry_src_t *esrc;
+
+    fct = fib_entry_get_default_chain_type(fib_entry);
+    esrc = fib_entry_src_find(fib_entry, source, NULL);
+
+    fib_entry_src_mk_lb(fib_entry, esrc, fct, &fib_entry->fe_lb[fct]);
+
+    /*
+     * log the index of the load-balance; the dpo_id_t itself is a struct
+     * and cannot be consumed by a %d conversion
+     */
+    FIB_ENTRY_DBG(fib_entry, "install: %d",
+                  fib_entry->fe_lb[fct].dpoi_index);
+
+    /*
+     * insert the load-balance into the data-plane forwarding trie
+     */
+    fib_table_fwding_dpo_update(fib_entry->fe_fib_index,
+                                &fib_entry->fe_prefix,
+                                &fib_entry->fe_lb[fct]);
+
+    if (FIB_FORW_CHAIN_TYPE_UNICAST_IP4 == fct ||
+        FIB_FORW_CHAIN_TYPE_UNICAST_IP6 == fct)
+    {
+        for (fct = FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS;
+             fct <= FIB_FORW_CHAIN_TYPE_MPLS_EOS;
+             fct++)
+        {
+            /*
+             * if any of the other chain types are already created they will need
+             * updating too
+             */
+            if (dpo_id_is_valid(&fib_entry->fe_lb[fct]))
+            {
+                fib_entry_src_mk_lb(fib_entry,
+                                    esrc,
+                                    fct,
+                                    &fib_entry->fe_lb[fct]);
+            }
+        }
+    }
+}
+
+/**
+ * Remove the entry's default-chain load-balance from the data-plane
+ * forwarding table and reset it. Does nothing if that load-balance is
+ * not valid.
+ */
+void
+fib_entry_src_action_uninstall (fib_entry_t *fib_entry)
+{
+    fib_forward_chain_type_t fct;
+
+    fct = fib_entry_get_default_chain_type(fib_entry);
+    /*
+     * uninstall the forwarding chain for the given source from the
+     * forwarding tables
+     */
+    /*
+     * NOTE(review): fe_adj_index is not used anywhere else in this file,
+     * which works with fe_lb[] - confirm the member still exists, else
+     * this will not compile when FIB_ENTRY_DBG is enabled.
+     */
+    FIB_ENTRY_DBG(fib_entry, "uninstall: %d",
+                 fib_entry->fe_adj_index);
+
+    if (dpo_id_is_valid(&fib_entry->fe_lb[fct]))
+    {
+       fib_table_fwding_dpo_remove(
+           fib_entry->fe_fib_index,
+           &fib_entry->fe_prefix,
+           &fib_entry->fe_lb[fct]);
+
+       dpo_reset(&fib_entry->fe_lb[fct]);
+    }
+}
+
+/**
+ * Run recursive loop detection starting from the given path-list. The
+ * vector of entry indices used by the detection is owned, and freed, here.
+ */
+static void
+fib_entry_recursive_loop_detect_i (fib_node_index_t path_list_index)
+{
+    fib_node_index_t *visited_entries;
+
+    visited_entries = NULL;
+    fib_path_list_recursive_loop_detect(path_list_index, &visited_entries);
+    vec_free(visited_entries);
+}
+
+/**
+ * Make 'source' the active source of the entry.
+ *
+ * The source must already be added and not be active (asserted). The
+ * source is flagged active and its fesv_activate function, if any, decides
+ * whether the entry may be installed. The entry then becomes a child of
+ * the source's path-list, loop detection is run and the entry is either
+ * installed or uninstalled.
+ */
+void
+fib_entry_src_action_activate (fib_entry_t *fib_entry,
+                              fib_source_t source)
+
+{
+    int houston_we_are_go_for_install;
+    fib_entry_src_t *esrc;
+
+    esrc = fib_entry_src_find(fib_entry, source, NULL);
+
+    ASSERT(!(esrc->fes_flags & FIB_ENTRY_SRC_FLAG_ACTIVE));
+    ASSERT(esrc->fes_flags & FIB_ENTRY_SRC_FLAG_ADDED);
+
+    esrc->fes_flags |= FIB_ENTRY_SRC_FLAG_ACTIVE;
+
+    if (NULL != fib_entry_src_vft[source].fesv_activate)
+    {
+       houston_we_are_go_for_install =
+           fib_entry_src_vft[source].fesv_activate(esrc, fib_entry);
+    }
+    else
+    {
+       /*
+        * the source is not providing an activate function, we'll assume
+        * therefore it has no objection to installing the entry
+        */
+       houston_we_are_go_for_install = !0;
+    }
+
+    /*
+     * link to the path-list provided by the source, and go check
+     * if that forms any loops in the graph.
+     */
+    fib_entry->fe_parent = esrc->fes_pl;
+    fib_entry->fe_sibling =
+       fib_path_list_child_add(fib_entry->fe_parent,
+                               FIB_NODE_TYPE_ENTRY,
+                               fib_entry_get_index(fib_entry));
+
+    fib_entry_recursive_loop_detect_i(fib_entry->fe_parent);
+
+    FIB_ENTRY_DBG(fib_entry, "activate: %d",
+                 fib_entry->fe_parent);
+
+    /* act on the source's verdict */
+    if (0 != houston_we_are_go_for_install)
+    {
+       fib_entry_src_action_install(fib_entry, source);
+    }
+    else
+    {
+       fib_entry_src_action_uninstall(fib_entry);
+    }
+}
+
+/**
+ * Stop 'source' being the active source of the entry.
+ *
+ * The source must be active (asserted). Its fesv_deactivate function, if
+ * any, is invoked, the active flag is cleared and the entry is unlinked
+ * from its parent path-list. This function does not uninstall the
+ * entry's forwarding.
+ */
+void
+fib_entry_src_action_deactivate (fib_entry_t *fib_entry,
+                                fib_source_t source)
+
+{
+    fib_node_index_t path_list_index;
+    fib_entry_src_t *esrc;
+
+    esrc = fib_entry_src_find(fib_entry, source, NULL);
+
+    ASSERT(esrc->fes_flags & FIB_ENTRY_SRC_FLAG_ACTIVE);
+
+    if (NULL != fib_entry_src_vft[source].fesv_deactivate)
+    {
+       fib_entry_src_vft[source].fesv_deactivate(esrc, fib_entry);
+    }
+
+    esrc->fes_flags &= ~FIB_ENTRY_SRC_FLAG_ACTIVE;
+
+    FIB_ENTRY_DBG(fib_entry, "deactivate: %d", fib_entry->fe_parent);
+
+    /*
+     * un-link from an old path-list. Check for any loops this will clear
+     */
+    path_list_index = fib_entry->fe_parent;
+    fib_entry->fe_parent = FIB_NODE_INDEX_INVALID;
+
+    /* the detection runs whilst the entry is still a child of the list */
+    fib_entry_recursive_loop_detect_i(path_list_index);
+
+    /*
+     * this will unlock the path-list, so it may be invalid thereafter.
+     */
+    fib_path_list_child_remove(path_list_index, fib_entry->fe_sibling);
+    fib_entry->fe_sibling = FIB_NODE_INDEX_INVALID;
+}
+
+/**
+ * Tell every source on the entry that provides an fesv_fwd_update
+ * function that the entry's forwarding has been updated.
+ *
+ * @param fib_entry the entry whose forwarding changed
+ * @param source    passed unchanged to each source's fesv_fwd_update
+ */
+static void
+fib_entry_src_action_fwd_update (const fib_entry_t *fib_entry,
+                                fib_source_t source)
+{
+    fib_entry_src_t *esrc;
+
+    vec_foreach(esrc, fib_entry->fe_srcs)
+    {
+        const fib_entry_src_vft_t *vft = &fib_entry_src_vft[esrc->fes_src];
+
+        if (NULL == vft->fesv_fwd_update)
+        {
+            continue;
+        }
+
+        vft->fesv_fwd_update(esrc, fib_entry, source);
+    }
+}
+
+/**
+ * Re-evaluate the entry against its active source.
+ *
+ * The source must be active (asserted). If the source's path-list is no
+ * longer the entry's parent, the entry is moved from the old path-list to
+ * the new one, with loop detection run on both. The entry is then
+ * re-installed and the sources are told of the forwarding update.
+ */
+void
+fib_entry_src_action_reactivate (fib_entry_t *fib_entry,
+                                fib_source_t source)
+{
+    fib_node_index_t path_list_index;
+    fib_entry_src_t *esrc;
+
+    esrc = fib_entry_src_find(fib_entry, source, NULL);
+
+    ASSERT(esrc->fes_flags & FIB_ENTRY_SRC_FLAG_ACTIVE);
+
+    FIB_ENTRY_DBG(fib_entry, "reactivate: %d to %d",
+                 fib_entry->fe_parent,
+                 esrc->fes_pl);
+
+    if (fib_entry->fe_parent != esrc->fes_pl)
+    {
+       /*
+        * un-link from an old path-list. Check for any loops this will clear
+        */
+       path_list_index = fib_entry->fe_parent;
+       fib_entry->fe_parent = FIB_NODE_INDEX_INVALID;
+
+       /*
+        * temporary lock so it doesn't get deleted when this entry is no
+        * longer a child.
+        */
+       fib_path_list_lock(path_list_index);
+
+       /*
+        * this entry is no longer a child. after unlinking check if any loops
+        * were broken
+        */
+       fib_path_list_child_remove(path_list_index,
+                                  fib_entry->fe_sibling);
+
+       fib_entry_recursive_loop_detect_i(path_list_index);
+
+       /*
+        * link to the path-list provided by the source, and go check
+        * if that forms any loops in the graph.
+        */
+       fib_entry->fe_parent = esrc->fes_pl;
+       fib_entry->fe_sibling =
+           fib_path_list_child_add(fib_entry->fe_parent,
+                                   FIB_NODE_TYPE_ENTRY,
+                                   fib_entry_get_index(fib_entry));
+
+       fib_entry_recursive_loop_detect_i(fib_entry->fe_parent);
+       /* release the temporary lock taken on the old path-list above */
+       fib_path_list_unlock(path_list_index);
+    }
+    fib_entry_src_action_install(fib_entry, source);
+    fib_entry_src_action_fwd_update(fib_entry, source);
+}
+
+/**
+ * Inform 'source', via its fesv_installed function if it has one, that
+ * the entry has been installed; then notify all sources of the forwarding
+ * update.
+ */
+void
+fib_entry_src_action_installed (const fib_entry_t *fib_entry,
+                               fib_source_t source)
+{
+    fib_entry_src_t *esrc = fib_entry_src_find(fib_entry, source, NULL);
+    const fib_entry_src_vft_t *vft = &fib_entry_src_vft[source];
+
+    if (NULL != vft->fesv_installed)
+    {
+        vft->fesv_installed(esrc, fib_entry);
+    }
+
+    fib_entry_src_action_fwd_update(fib_entry, source);
+}
+
+/*
+ * fib_entry_src_action_add
+ *
+ * Add a reference from 'source' to the entry. Only the first reference
+ * (the 0->1 transition) does any work: the source's flags are recorded,
+ * its fesv_add function, if any, is invoked and locks are taken on the
+ * source's path-list and on the entry.
+ *
+ * Adding a source can result in a new fib_entry being created, which
+ * can in turn mean the pool is realloc'd and thus the entry passed as
+ * an argument is also realloc'd
+ * @return the original entry
+ */
+fib_entry_t *
+fib_entry_src_action_add (fib_entry_t *fib_entry,
+                         fib_source_t source,
+                         fib_entry_flag_t flags,
+                         const dpo_id_t *dpo)
+{
+    fib_node_index_t fib_entry_index;
+    fib_entry_src_t *esrc;
+
+    esrc = fib_entry_src_find_or_create(fib_entry, source, NULL);
+
+    esrc->fes_ref_count++;
+
+    if (1 != esrc->fes_ref_count)
+    {
+        /*
+         * we only want to add the source on the 0->1 transition
+         */
+        return (fib_entry);
+    }
+
+    esrc->fes_entry_flags = flags;
+
+    /*
+     * save variable so we can recover from a fib_entry realloc.
+     */
+    fib_entry_index = fib_entry_get_index(fib_entry);
+
+    if (NULL != fib_entry_src_vft[source].fesv_add)
+    {
+       fib_entry_src_vft[source].fesv_add(esrc,
+                                          fib_entry,
+                                          flags,
+                                          fib_entry_get_proto(fib_entry),
+                                          dpo);
+    }
+
+    /* re-fetch the entry; the pointer held before the add may be stale */
+    fib_entry = fib_entry_get(fib_entry_index);
+
+    /*
+     * NOTE(review): esrc is not re-fetched here; that assumes fesv_add
+     * does not add or remove sources on this entry - confirm.
+     */
+    esrc->fes_flags |= FIB_ENTRY_SRC_FLAG_ADDED;
+
+    fib_path_list_lock(esrc->fes_pl);
+
+    /*
+     * the source owns a lock on the entry
+     */
+    fib_entry_lock(fib_entry_get_index(fib_entry));
+
+    return (fib_entry);
+}
+
+/**
+ * Remove a reference from 'source' to the entry. Only the last reference
+ * (the 1->0 transition) removes the source: it is deactivated if active,
+ * its fesv_remove function, if any, is invoked, the locks on its path-list
+ * and on the entry are released and the source's state is destroyed.
+ *
+ * @return FIB_ENTRY_SRC_FLAG_ACTIVE if the entry does not have the source;
+ *         the source's current flags if references remain; otherwise the
+ *         flags the source had before it was deactivated, with the ADDED
+ *         flag cleared
+ */
+fib_entry_src_flag_t
+fib_entry_src_action_remove (fib_entry_t *fib_entry,
+                            fib_source_t source)
+
+{
+    fib_node_index_t old_path_list;
+    fib_entry_src_flag_t sflags;
+    fib_entry_src_t *esrc;
+
+    esrc = fib_entry_src_find(fib_entry, source, NULL);
+
+    if (NULL == esrc)
+       return (FIB_ENTRY_SRC_FLAG_ACTIVE);
+
+    esrc->fes_ref_count--;
+    /* snapshot taken before any deactivation below */
+    sflags = esrc->fes_flags;
+
+    if (0 != esrc->fes_ref_count)
+    {
+        /*
+         * only remove the source on the 1->0 transition
+         */
+        return (sflags);
+    }
+
+    if (esrc->fes_flags & FIB_ENTRY_SRC_FLAG_ACTIVE)
+    {
+       fib_entry_src_action_deactivate(fib_entry, source);
+    }
+
+    /* remember the path-list before the source's remove function runs */
+    old_path_list = esrc->fes_pl;
+
+    if (NULL != fib_entry_src_vft[source].fesv_remove)
+    {
+       fib_entry_src_vft[source].fesv_remove(esrc);
+    }
+
+    fib_path_list_unlock(old_path_list);
+    /*
+     * NOTE(review): fib_entry is still used by the deinit below; this
+     * assumes the unlock here does not release the last reference on the
+     * entry - confirm against the callers.
+     */
+    fib_entry_unlock(fib_entry_get_index(fib_entry));
+
+    sflags &= ~FIB_ENTRY_SRC_FLAG_ADDED;
+    fib_entry_src_action_deinit(fib_entry, source);
+
+    return (sflags);
+}
+
+/**
+ * @return non-zero if the path is a recursive path whose next-hop is the
+ *         route's own address; that is:
+ *          - the next-hop is not all zeros, and
+ *          - no interface is given (frp_sw_if_index is ~0), and
+ *          - the next-hop is the same as the prefix's address
+ */
+static inline int
+fib_route_recurses_via_self (const fib_prefix_t *prefix,
+                            const fib_route_path_t *rpath)
+{
+    if (ip46_address_is_zero(&rpath->frp_addr))
+    {
+        return (0);
+    }
+    if (~0 != rpath->frp_sw_if_index)
+    {
+        return (0);
+    }
+
+    return (0 == ip46_address_cmp(&rpath->frp_addr, &prefix->fp_addr));
+}
+
+/*
+ * fib_route_attached_cross_table
+ *
+ * Return true if the route is attached via an interface that
+ * is not in the same table as the route; that is:
+ *  - the next-hop is all zeros
+ *  - a valid interface is given
+ *  - the entry's fib index is not equal to the interface's fib index
+ */
+static inline int
+fib_route_attached_cross_table (const fib_entry_t *fib_entry,
+                               const fib_route_path_t *rpath)
+{
+    if (!ip46_address_is_zero(&rpath->frp_addr))
+    {
+        return (0);
+    }
+    if (~0 == rpath->frp_sw_if_index)
+    {
+        return (0);
+    }
+
+    return (fib_entry->fe_fib_index !=
+            fib_table_get_index_for_sw_if_index(fib_entry_get_proto(fib_entry),
+                                                rpath->frp_sw_if_index));
+}
+
+/*
+ * fib_path_is_attached
+ *
+ * Return true if the path is attached; that is:
+ *  - the next-hop is all zeros
+ *  - a valid interface is given
+ */
+static inline int
+fib_path_is_attached (const fib_route_path_t *rpath)
+{
+    return (ip46_address_is_zero(&rpath->frp_addr) &&
+            (~0 != rpath->frp_sw_if_index));
+}
+
+/**
+ * Translate entry flags into the equivalent path-list flags. Only the
+ * DROP, LOCAL and EXCLUSIVE entry flags have a path-list equivalent; all
+ * other entry flags are ignored.
+ */
+fib_path_list_flags_t
+fib_entry_src_flags_2_path_list_flags (fib_entry_flag_t eflags)
+{
+    fib_path_list_flags_t pl_flags;
+
+    pl_flags = FIB_PATH_LIST_FLAG_NONE;
+
+    if (eflags & FIB_ENTRY_FLAG_EXCLUSIVE)
+    {
+        pl_flags |= FIB_PATH_LIST_FLAG_EXCLUSIVE;
+    }
+    if (eflags & FIB_ENTRY_FLAG_LOCAL)
+    {
+        pl_flags |= FIB_PATH_LIST_FLAG_LOCAL;
+    }
+    if (eflags & FIB_ENTRY_FLAG_DROP)
+    {
+        pl_flags |= FIB_PATH_LIST_FLAG_DROP;
+    }
+
+    return (pl_flags);
+}
+
+/**
+ * Update the path-list flags and the source's entry flags for the path
+ * being added:
+ *  - DROP is set on the path-list flags if the route recurses via itself,
+ *    and cleared otherwise
+ *  - for the API and CLI sources only, ATTACHED is set on the source's
+ *    entry flags if the path is attached, and cleared otherwise
+ *  - IMPORT is set on the source's entry flags if the path is attached
+ *    via an interface in another table, and cleared otherwise
+ */
+static void
+fib_entry_flags_update (const fib_entry_t *fib_entry,
+                       const fib_route_path_t *rpath,
+                       fib_path_list_flags_t *pl_flags,
+                       fib_entry_src_t *esrc)
+{
+    /*
+     * don't allow the addition of a recursive looped path for prefix
+     * via itself.
+     */
+    if (!fib_route_recurses_via_self(&fib_entry->fe_prefix, rpath))
+    {
+        *pl_flags &= ~FIB_PATH_LIST_FLAG_DROP;
+    }
+    else
+    {
+        /*
+         * force the install of a drop path-list.
+         * we want the entry to have some path-list, mainly so
+         * the dodgy path can be removed when the source stops
+         * providing it.
+         */
+        *pl_flags |= FIB_PATH_LIST_FLAG_DROP;
+    }
+
+    if ((FIB_SOURCE_API == esrc->fes_src) ||
+        (FIB_SOURCE_CLI == esrc->fes_src))
+    {
+        if (!fib_path_is_attached(rpath))
+        {
+            esrc->fes_entry_flags &= ~FIB_ENTRY_FLAG_ATTACHED;
+        }
+        else
+        {
+            esrc->fes_entry_flags |= FIB_ENTRY_FLAG_ATTACHED;
+        }
+    }
+
+    if (!fib_route_attached_cross_table(fib_entry, rpath))
+    {
+        esrc->fes_entry_flags &= ~FIB_ENTRY_FLAG_IMPORT;
+    }
+    else
+    {
+        esrc->fes_entry_flags |= FIB_ENTRY_FLAG_IMPORT;
+    }
+}
+
+/*
+ * fib_entry_src_path_ext_append
+ *
+ * append a path extension to the source's list. Nothing is appended if
+ * the path does not carry a label.
+ */
+static void
+fib_entry_src_path_ext_append (fib_entry_src_t *esrc,
+                              const fib_route_path_t *rpath)
+{
+    fib_path_ext_t *new_ext;
+
+    if (MPLS_LABEL_INVALID == rpath->frp_label)
+    {
+        return;
+    }
+
+    vec_add2(esrc->fes_path_exts, new_ext, 1);
+    fib_path_ext_init(new_ext, esrc->fes_pl, rpath);
+}
+
+/*
+ * fib_entry_src_path_ext_insert
+ *
+ * Insert, sorted, a path extension into the source's list.
+ * It is not strictly necessary to sort the path extensions, since each
+ * extension has the path index to which it resolves. However, by being
+ * sorted the load-balance produced has a deterministic order, not an order
+ * based on the sequence of extension additions. This is a considerable
+ * benefit.
+ *
+ * Nothing is inserted when the path carries no valid MPLS label.
+ */
+static void
+fib_entry_src_path_ext_insert (fib_entry_src_t *esrc,
+                               const fib_route_path_t *rpath)
+{
+    if (0 == vec_len(esrc->fes_path_exts))
+    {
+        /*
+         * empty list; there is no order to maintain. A void function
+         * may not 'return' an expression, so call then return.
+         */
+        fib_entry_src_path_ext_append(esrc, rpath);
+        return;
+    }
+
+    if (MPLS_LABEL_INVALID != rpath->frp_label)
+    {
+        /*
+         * NOTE(review): path_ext is not zeroed before it is initialised;
+         * this relies on fib_path_ext_init() writing every member - confirm.
+         */
+        fib_path_ext_t path_ext;
+        int i = 0;
+
+        fib_path_ext_init(&path_ext, esrc->fes_pl, rpath);
+
+        /*
+         * find the first existing extension that does not compare
+         * less-than the new path; the new extension goes in front of it.
+         */
+        while (i < vec_len(esrc->fes_path_exts) &&
+               (fib_path_ext_cmp(&esrc->fes_path_exts[i], rpath) < 0))
+        {
+            i++;
+        }
+
+        vec_insert_elts(esrc->fes_path_exts, &path_ext, 1, i);
+    }
+}
+
+/*
+ * fib_entry_src_action_path_add
+ *
+ * Add one path to the path-list that 'source' contributes to the entry.
+ * If the source is not yet present on the entry it is first added with a
+ * drop DPO.
+ *
+ * Adding a source can result in a new fib_entry being created, which
+ * can in turn mean the pool is realloc'd and thus the entry passed as
+ * an argument is also realloc'd. Callers must use the returned pointer.
+ *
+ * @param fib_entry the entry to modify
+ * @param source    the source adding the path
+ * @param flags     entry flags, used only if the source must be added
+ * @param rpath     the path to add
+ * @return the entry (re-fetched from its index)
+ */
+fib_entry_t*
+fib_entry_src_action_path_add (fib_entry_t *fib_entry,
+                              fib_source_t source,
+                              fib_entry_flag_t flags,
+                              const fib_route_path_t *rpath)
+{
+    fib_node_index_t old_path_list, fib_entry_index;
+    fib_path_list_flags_t pl_flags;
+    fib_path_ext_t *path_ext;
+    fib_entry_src_t *esrc;
+
+    /*
+     * save the entry's index so we can recover from a fib_entry realloc.
+     */
+    fib_entry_index = fib_entry_get_index(fib_entry);
+
+    esrc = fib_entry_src_find(fib_entry, source, NULL);
+    if (NULL == esrc)
+    {
+       /* first contribution from this source; add it with a drop DPO */
+       fib_entry =
+            fib_entry_src_action_add(fib_entry,
+                                     source,
+                                     flags,
+                                     drop_dpo_get(
+                                         fib_proto_to_dpo(
+                                             fib_entry_get_proto(fib_entry))));
+       esrc = fib_entry_src_find(fib_entry, source, NULL);
+    }
+
+    /*
+     * we are no doubt modifying a path-list. If the path-list
+     * is shared, and hence not modifiable, then the index returned
+     * will be for a different path-list. This FIB entry needs
+     * to maintain its lock appropriately.
+     */
+    old_path_list = esrc->fes_pl;
+
+    ASSERT(NULL != fib_entry_src_vft[source].fesv_path_add);
+
+    /* start from the entry's current flags, then adjust for this path */
+    pl_flags = fib_entry_src_flags_2_path_list_flags(fib_entry_get_flags_i(fib_entry));
+    fib_entry_flags_update(fib_entry, rpath, &pl_flags, esrc);
+
+    fib_entry_src_vft[source].fesv_path_add(esrc, fib_entry, pl_flags, rpath);
+    /* re-fetch the entry pointer from the saved index */
+    fib_entry = fib_entry_get(fib_entry_index);
+
+    /*
+     * re-resolve all the path-extensions with the new path-list
+     */
+    vec_foreach(path_ext, esrc->fes_path_exts)
+    {
+       fib_path_ext_resolve(path_ext, esrc->fes_pl);
+    }
+    /*
+     * if the path has a label we need to add a path extension
+     */
+    fib_entry_src_path_ext_insert(esrc, rpath);
+
+    /* take the lock on the new path-list before releasing the old one */
+    fib_path_list_lock(esrc->fes_pl);
+    fib_path_list_unlock(old_path_list);
+
+    return (fib_entry);
+}
+
+/*
+ * fib_entry_src_action_path_swap
+ *
+ * The source is providing new paths to replace the old ones.
+ * If the source is not yet present on the entry it is first added with a
+ * drop DPO.
+ *
+ * Adding a source can result in a new fib_entry being created, which
+ * can in turn mean the pool is realloc'd and thus the entry passed as
+ * an argument is also realloc'd. Callers must use the returned pointer.
+ *
+ * @param fib_entry the entry to modify
+ * @param source    the source providing the paths
+ * @param flags     entry flags, used only if the source must be added
+ * @param rpaths    vector of the replacement paths
+ * @return the entry (re-fetched from its index)
+ */
+fib_entry_t*
+fib_entry_src_action_path_swap (fib_entry_t *fib_entry,
+                               fib_source_t source,
+                               fib_entry_flag_t flags,
+                               const fib_route_path_t *rpaths)
+{
+    fib_node_index_t old_path_list, fib_entry_index;
+    fib_path_list_flags_t pl_flags;
+    const fib_route_path_t *rpath;
+    fib_entry_src_t *esrc;
+
+    esrc = fib_entry_src_find(fib_entry, source, NULL);
+
+    /*
+     * save the entry's index so we can recover from a fib_entry realloc.
+     */
+    fib_entry_index = fib_entry_get_index(fib_entry);
+
+    if (NULL == esrc)
+    {
+       /* first contribution from this source; add it with a drop DPO */
+       fib_entry = fib_entry_src_action_add(fib_entry,
+                                            source,
+                                            flags,
+                                             drop_dpo_get(
+                                                 fib_proto_to_dpo(
+                                                     fib_entry_get_proto(fib_entry))));
+       esrc = fib_entry_src_find(fib_entry, source, NULL);
+    }
+
+    /*
+     * swapping paths may create a new path-list (or may use an existing shared)
+     * but we are certainly getting a different one. This FIB entry needs
+     * to maintain its lock appropriately.
+     */
+    old_path_list = esrc->fes_pl;
+
+    ASSERT(NULL != fib_entry_src_vft[source].fesv_path_swap);
+
+    /*
+     * start from the entry's current flags and apply the per-path update
+     * for each new path in turn; each path sets or clears the same flags,
+     * so the last path in the vector determines the final values.
+     */
+    pl_flags = fib_entry_src_flags_2_path_list_flags(
+                  fib_entry_get_flags_i(fib_entry));
+    vec_foreach(rpath, rpaths)
+    {
+       fib_entry_flags_update(fib_entry, rpath, &pl_flags, esrc);
+    }
+
+    fib_entry_src_vft[source].fesv_path_swap(esrc,
+                                            fib_entry,
+                                            pl_flags,
+                                            rpaths);
+
+    /*
+     * the old path extensions described the old paths; discard them and
+     * build a new list, in the order of the paths given.
+     */
+    vec_free(esrc->fes_path_exts);
+    vec_foreach(rpath, rpaths)
+    {
+       fib_entry_src_path_ext_append(esrc, rpath);
+    }
+
+    /* re-fetch the entry pointer from the saved index */
+    fib_entry = fib_entry_get(fib_entry_index);
+
+    /* take the lock on the new path-list before releasing the old one */
+    fib_path_list_lock(esrc->fes_pl);
+    fib_path_list_unlock(old_path_list);
+
+    return (fib_entry);
+}
+
+/*
+ * fib_entry_src_action_path_remove
+ *
+ * Remove one path from the path-list that 'source' contributes to the
+ * entry. The source must already be present, and added, on the entry.
+ * When no path-list remains the source itself is removed from the entry.
+ *
+ * @param fib_entry the entry to modify
+ * @param source    the source removing the path
+ * @param rpath     the path to remove
+ * @return FIB_ENTRY_SRC_FLAG_ADDED if the source still has a path-list,
+ *         FIB_ENTRY_SRC_FLAG_NONE if the source was removed.
+ */
+fib_entry_src_flag_t
+fib_entry_src_action_path_remove (fib_entry_t *fib_entry,
+                                  fib_source_t source,
+                                  const fib_route_path_t *rpath)
+{
+    fib_path_list_flags_t pl_flags;
+    fib_node_index_t old_path_list;
+    fib_path_ext_t *path_ext;
+    fib_entry_src_t *esrc;
+
+    esrc = fib_entry_src_find(fib_entry, source, NULL);
+
+    ASSERT(NULL != esrc);
+    ASSERT(esrc->fes_flags & FIB_ENTRY_SRC_FLAG_ADDED);
+
+    /*
+     * we are no doubt modifying a path-list. If the path-list
+     * is shared, and hence not modifiable, then the index returned
+     * will be for a different path-list. This FIB entry needs
+     * to maintain its lock appropriately.
+     */
+    old_path_list = esrc->fes_pl;
+
+    ASSERT(NULL != fib_entry_src_vft[source].fesv_path_remove);
+
+    pl_flags = fib_entry_src_flags_2_path_list_flags(fib_entry_get_flags_i(fib_entry));
+    fib_entry_flags_update(fib_entry, rpath, &pl_flags, esrc);
+
+    fib_entry_src_vft[source].fesv_path_remove(esrc, pl_flags, rpath);
+
+    /*
+     * find the matching path extension and remove it
+     */
+    vec_foreach(path_ext, esrc->fes_path_exts)
+    {
+        if (!fib_path_ext_cmp(path_ext, rpath))
+        {
+            /*
+             * delete the element moving the remaining elements down 1 position.
+             * this preserves the sorted order.
+             */
+            vec_delete(esrc->fes_path_exts, 1, (path_ext - esrc->fes_path_exts));
+            break;
+        }
+    }
+    /*
+     * re-resolve all the path-extensions with the new path-list
+     */
+    vec_foreach(path_ext, esrc->fes_path_exts)
+    {
+        fib_path_ext_resolve(path_ext, esrc->fes_pl);
+    }
+
+    if (FIB_NODE_INDEX_INVALID != esrc->fes_pl)
+    {
+        /*
+         * lock the new path-list before unlocking the old, as the path
+         * add and swap actions do. Should the source have left the same
+         * path-list in place, this ordering never drops its last lock.
+         */
+        fib_path_list_lock(esrc->fes_pl);
+        fib_path_list_unlock(old_path_list);
+
+        return (FIB_ENTRY_SRC_FLAG_ADDED);
+    }
+
+    /*
+     * no more paths left from this source. release the old path-list
+     * and remove the source from the entry.
+     */
+    fib_path_list_unlock(old_path_list);
+    fib_entry_src_action_remove(fib_entry, source);
+
+    return (FIB_ENTRY_SRC_FLAG_NONE);
+}
+
+/**
+ * Append the source-specific description of 'source' on the entry to s.
+ *
+ * s is returned unchanged when the source is not present on the entry or
+ * when the source has not registered a format function.
+ *
+ * @param fib_entry the entry whose source is to be formatted
+ * @param source    the source to format
+ * @param s         the string (vector) to append to
+ * @return the, possibly extended, string
+ */
+u8*
+fib_entry_src_format (fib_entry_t *fib_entry,
+                      fib_source_t source,
+                      u8* s)
+{
+    fib_entry_src_t *esrc;
+
+    esrc = fib_entry_src_find(fib_entry, source, NULL);
+
+    /*
+     * the per-source format functions dereference the source they are
+     * given, so never hand them a NULL one.
+     */
+    if ((NULL != esrc) &&
+        (NULL != fib_entry_src_vft[source].fesv_format))
+    {
+        return (fib_entry_src_vft[source].fesv_format(esrc, s));
+    }
+    return (s);
+}
+
+/**
+ * Get the adjacency contributed by one source's path-list on an entry,
+ * for the entry's default forwarding chain type.
+ *
+ * @param fib_entry_index index of the entry; may be FIB_NODE_INDEX_INVALID
+ * @param source          the source whose path-list is consulted
+ * @return the adjacency index, or ADJ_INDEX_INVALID if the entry index is
+ *         invalid, the source is absent, or the source has no path-list.
+ */
+adj_index_t
+fib_entry_get_adj_for_source (fib_node_index_t fib_entry_index,
+                              fib_source_t source)
+{
+    fib_entry_src_t *src;
+    fib_entry_t *entry;
+
+    if (FIB_NODE_INDEX_INVALID == fib_entry_index)
+    {
+        return (ADJ_INDEX_INVALID);
+    }
+
+    entry = fib_entry_get(fib_entry_index);
+    src = fib_entry_src_find(entry, source, NULL);
+
+    if ((NULL == src) ||
+        (FIB_NODE_INDEX_INVALID == src->fes_pl))
+    {
+        return (ADJ_INDEX_INVALID);
+    }
+
+    return (fib_path_list_get_adj(src->fes_pl,
+                                  fib_entry_get_default_chain_type(entry)));
+}
+
+/**
+ * Get the forwarding contributed by one source's path-list on an entry,
+ * for the entry's default forwarding chain type.
+ *
+ * @param fib_entry_index index of the entry; may be FIB_NODE_INDEX_INVALID
+ * @param source          the source whose path-list is consulted
+ * @param dpo             written by the path-list when the source has one;
+ *                        otherwise left untouched
+ * @return the result of dpo_id_is_valid() on the contributed DPO, or 0 if
+ *         the entry index is invalid, the source is absent, or the source
+ *         has no path-list.
+ */
+const int
+fib_entry_get_dpo_for_source (fib_node_index_t fib_entry_index,
+                              fib_source_t source,
+                              dpo_id_t *dpo)
+{
+    fib_entry_src_t *src;
+    fib_entry_t *entry;
+
+    if (FIB_NODE_INDEX_INVALID == fib_entry_index)
+    {
+        return (0);
+    }
+
+    entry = fib_entry_get(fib_entry_index);
+    src = fib_entry_src_find(entry, source, NULL);
+
+    if ((NULL == src) ||
+        (FIB_NODE_INDEX_INVALID == src->fes_pl))
+    {
+        return (0);
+    }
+
+    fib_path_list_contribute_forwarding(src->fes_pl,
+                                        fib_entry_get_default_chain_type(entry),
+                                        dpo);
+
+    return (dpo_id_is_valid(dpo));
+}
+
+/**
+ * Get the entry's flags: those of the first source in its source vector.
+ * The vector of sources is deliberately arranged in priority order.
+ *
+ * @param fib_entry the entry to query
+ * @return the first source's entry flags, or FIB_ENTRY_FLAG_NONE when the
+ *         entry has no sources.
+ */
+fib_entry_flag_t
+fib_entry_get_flags_i (const fib_entry_t *fib_entry)
+{
+    fib_entry_src_t *first_src;
+
+    if (0 == vec_len(fib_entry->fe_srcs))
+    {
+        return (FIB_ENTRY_FLAG_NONE);
+    }
+
+    first_src = vec_elt_at_index(fib_entry->fe_srcs, 0);
+
+    return (first_src->fes_entry_flags);
+}
+
+/**
+ * Hand source-specific opaque data to a source on an entry.
+ * Does nothing if the source is not present on the entry or has not
+ * registered a set-data function.
+ *
+ * @param fib_entry_index index of the entry
+ * @param source          the source that owns the data
+ * @param data            the opaque data, passed through to the source
+ */
+void
+fib_entry_set_source_data (fib_node_index_t fib_entry_index,
+                           fib_source_t source,
+                           const void *data)
+{
+    fib_entry_src_t *src;
+    fib_entry_t *entry;
+
+    entry = fib_entry_get(fib_entry_index);
+    src = fib_entry_src_find(entry, source, NULL);
+
+    if (NULL == src)
+    {
+        return;
+    }
+    if (NULL == fib_entry_src_vft[source].fesv_set_data)
+    {
+        return;
+    }
+
+    fib_entry_src_vft[source].fesv_set_data(src, entry, data);
+}
+
+/**
+ * Fetch source-specific opaque data from a source on an entry.
+ *
+ * @param fib_entry_index index of the entry
+ * @param source          the source that owns the data
+ * @return whatever the source's get-data function returns, or NULL if the
+ *         source is not present on the entry or has no get-data function.
+ */
+const void*
+fib_entry_get_source_data (fib_node_index_t fib_entry_index,
+                           fib_source_t source)
+{
+    fib_entry_src_t *src;
+    fib_entry_t *entry;
+
+    entry = fib_entry_get(fib_entry_index);
+    src = fib_entry_src_find(entry, source, NULL);
+
+    if (NULL == src)
+    {
+        return (NULL);
+    }
+    if (NULL == fib_entry_src_vft[source].fesv_get_data)
+    {
+        return (NULL);
+    }
+
+    return (fib_entry_src_vft[source].fesv_get_data(src, entry));
+}
+
+/**
+ * Initialise the FIB entry source module by invoking each per-source
+ * registration function in turn.
+ *
+ * NOTE(review): fib_entry_src_default_register() is declared in
+ * fib_entry_src.h but is not called here - confirm this is intentional.
+ */
+void
+fib_entry_src_module_init (void)
+{
+    fib_entry_src_rr_register();
+    fib_entry_src_interface_register();
+    fib_entry_src_default_route_register();
+    fib_entry_src_special_register();
+    fib_entry_src_api_register();
+    fib_entry_src_adj_register();
+    fib_entry_src_mpls_register();
+    fib_entry_src_lisp_register();
+}
diff --git a/vnet/vnet/fib/fib_entry_src.h b/vnet/vnet/fib/fib_entry_src.h
new file mode 100644 (file)
index 0000000..d70aabc
--- /dev/null
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FIB_ENTRY_SRC_H__
+#define __FIB_ENTRY_SRC_H__
+
+#include "fib_entry.h"
+#include "fib_path_list.h"
+#include "fib_internal.h"
+
+/**
+ * Debug macro
+ */
+#ifdef FIB_DEBUG
+#define FIB_ENTRY_DBG(_e, _fmt, _args...)              \
+{                                                      \
+    u8*__tmp = NULL;                                   \
+    __tmp = format(__tmp, "e:[%d:%U",                  \
+                  fib_entry_get_index(_e),             \
+                  format_ip46_address,                 \
+                  &_e->fe_prefix.fp_addr,              \
+                  IP46_TYPE_ANY);                      \
+    __tmp = format(__tmp, "/%d]:",                     \
+                  _e->fe_prefix.fp_len);               \
+    __tmp = format(__tmp, _fmt, ##_args);              \
+    clib_warning("%s", __tmp);                         \
+    vec_free(__tmp);                                   \
+}
+#else
+#define FIB_ENTRY_DBG(_e, _fmt, _args...)
+#endif
+
+/**
+ * Source initialisation function.
+ * Given the source so it can set its source-specific state.
+ */
+typedef void (*fib_entry_src_init_t)(fib_entry_src_t *src);
+
+/**
+ * Source deinitialisation function.
+ */
+typedef void (*fib_entry_src_deinit_t)(fib_entry_src_t *src);
+
+/**
+ * Source activation. Called when the source is the new best source on the entry.
+ * Return non-zero if the entry can now install, 0 otherwise.
+ */
+typedef int (*fib_entry_src_activate_t)(fib_entry_src_t *src,
+                                        const fib_entry_t *fib_entry);
+
+/**
+ * Source deactivation.
+ * Called when the source is no longer the best source on the entry.
+ */
+typedef void (*fib_entry_src_deactivate_t)(fib_entry_src_t *src,
+                                          const fib_entry_t *fib_entry);
+
+/**
+ * Source add.
+ * Called when the source is added to the entry.
+ */
+typedef void (*fib_entry_src_add_t)(fib_entry_src_t *src,
+                                   const fib_entry_t *entry,
+                                   fib_entry_flag_t flags,
+                                   fib_protocol_t proto,
+                                   const dpo_id_t *dpo);
+
+/**
+ * Source remove.
+ */
+typedef void (*fib_entry_src_remove_t)(fib_entry_src_t *src);
+
+/**
+ * Result from a cover update/change.
+ */
+typedef struct fib_entry_src_cover_res_t_ {
+    /* non-zero if the entry can install, 0 otherwise */
+    u16 install;
+    /* the back-walk reason flags the source returns with the result */
+    fib_node_bw_reason_flag_t bw_reason;
+} fib_entry_src_cover_res_t;
+
+/**
+ * Cover changed. The source should re-evaluate its cover.
+ */
+typedef fib_entry_src_cover_res_t (*fib_entry_src_cover_change_t)(
+    fib_entry_src_t *src,
+    const fib_entry_t *fib_entry);
+
+/**
+ * Cover updated. The cover the source has, has updated (i.e. its forwarding);
+ * the source may need to re-evaluate.
+ */
+typedef fib_entry_src_cover_res_t (*fib_entry_src_cover_update_t)(
+    fib_entry_src_t *src,
+    const fib_entry_t *fib_entry);
+
+/**
+ * Forwarding updated. Notification that the forwarding information for the
+ * entry has been updated. This notification is sent to all sources, not just
+ * the active best; best_source identifies the entry's best source.
+ */
+typedef void (*fib_entry_src_fwd_update_t)(fib_entry_src_t *src,
+                                          const fib_entry_t *fib_entry,
+                                          fib_source_t best_source);
+
+/**
+ * Installed. Notification that the source is now installed as
+ * the entry's forwarding source.
+ */
+typedef void (*fib_entry_src_installed_t)(fib_entry_src_t *src,
+                                         const fib_entry_t *fib_entry);
+
+/**
+ * Format. Append the source-specific description of src to the string s
+ * and return the resulting string.
+ */
+typedef u8* (*fib_entry_src_format_t)(fib_entry_src_t *src,
+                                     u8* s);
+
+/**
+ * Source path add.
+ * The source is adding a new path.
+ */
+typedef void (*fib_entry_src_path_add_t)(fib_entry_src_t *src,
+                                        const fib_entry_t *fib_entry,
+                                        fib_path_list_flags_t pl_flags,
+                                        const fib_route_path_t *path);
+
+/**
+ * Source path remove.
+ * The source is removing a path.
+ */
+typedef void (*fib_entry_src_path_remove_t)(fib_entry_src_t *src,
+                                           fib_path_list_flags_t pl_flags,
+                                           const fib_route_path_t *path);
+
+/**
+ * Source path replace/swap.
+ * The source is providing a new set of paths.
+ */
+typedef void (*fib_entry_src_path_swap_t)(fib_entry_src_t *src,
+                                         const fib_entry_t *fib_entry,
+                                         fib_path_list_flags_t pl_flags,
+                                         const fib_route_path_t *path);
+
+/**
+ * Set source-specific opaque data.
+ */
+typedef void (*fib_entry_src_set_data_t)(fib_entry_src_t *src,
+                                         const fib_entry_t *fib_entry,
+                                         const void *data);
+
+/**
+ * Get source-specific opaque data.
+ */
+typedef const void* (*fib_entry_src_get_data_t)(fib_entry_src_t *src,
+                                                const fib_entry_t *fib_entry);
+
+/**
+ * Virtual function table each FIB entry source will register.
+ * A source need not provide every function.
+ */
+typedef struct fib_entry_src_vft_t_ {
+    /* source initialisation / deinitialisation */
+    fib_entry_src_init_t fesv_init;
+    fib_entry_src_deinit_t fesv_deinit;
+    /* the source became / stopped being the best source on the entry */
+    fib_entry_src_activate_t fesv_activate;
+    fib_entry_src_deactivate_t fesv_deactivate;
+    /* the source was added to / removed from the entry */
+    fib_entry_src_add_t fesv_add;
+    fib_entry_src_remove_t fesv_remove;
+    /* the source is replacing, adding or removing paths */
+    fib_entry_src_path_swap_t fesv_path_swap;
+    fib_entry_src_path_add_t fesv_path_add;
+    fib_entry_src_path_remove_t fesv_path_remove;
+    /* the source's cover changed / was updated */
+    fib_entry_src_cover_change_t fesv_cover_change;
+    fib_entry_src_cover_update_t fesv_cover_update;
+    /* source-specific output */
+    fib_entry_src_format_t fesv_format;
+    /* installed and forwarding-updated notifications */
+    fib_entry_src_installed_t fesv_installed;
+    fib_entry_src_fwd_update_t fesv_fwd_update;
+    /* source-specific opaque data accessors */
+    fib_entry_src_get_data_t fesv_get_data;
+    fib_entry_src_set_data_t fesv_set_data;
+} fib_entry_src_vft_t;
+
+/**
+ * Run 'action' once for each source on the entry that has
+ * FIB_ENTRY_SRC_FLAG_ADDED set.
+ *
+ * @param _entry  pointer to the entry whose sources are walked
+ * @param _src    iterator variable; points at the current source
+ * @param _source variable assigned the current source's fes_src before
+ *                the action runs
+ * @param action  statement(s) to execute. They run inside a do/while(0),
+ *                so a 'break' or 'continue' in the action only leaves
+ *                that inner block; it does not end the walk.
+ */
+#define FOR_EACH_SRC_ADDED(_entry, _src, _source, action)      \
+{                                                              \
+    vec_foreach(_src, (_entry)->fe_srcs)                       \
+    {                                                          \
+       if ((_src)->fes_flags & FIB_ENTRY_SRC_FLAG_ADDED) {     \
+           (_source) = (_src)->fes_src;                        \
+           do {                                                \
+               action;                                         \
+           } while(0);                                         \
+       }                                                       \
+    }                                                          \
+}
+
+/*
+ * Append the source-specific description of 'source' on the entry to s.
+ */
+extern u8* fib_entry_src_format(fib_entry_t *entry,
+                               fib_source_t source,
+                               u8* s);
+
+/*
+ * Register the virtual function table for a source.
+ */
+extern void fib_entry_src_register(fib_source_t source,
+                                  const fib_entry_src_vft_t *vft);
+
+/*
+ * Per-source actions on an entry.
+ */
+extern void fib_entry_src_action_init(fib_entry_t *entry,
+                                     fib_source_t source);
+
+extern void fib_entry_src_action_deinit(fib_entry_t *fib_entry,
+                                       fib_source_t source);
+
+extern fib_entry_src_cover_res_t fib_entry_src_action_cover_change(
+    fib_entry_t *entry,
+    fib_source_t source);
+
+extern fib_entry_src_cover_res_t fib_entry_src_action_cover_update(
+    fib_entry_t *fib_entry,
+    fib_source_t source);
+
+extern void fib_entry_src_action_activate(fib_entry_t *fib_entry,
+                                         fib_source_t source);
+
+extern void fib_entry_src_action_deactivate(fib_entry_t *fib_entry,
+                                           fib_source_t source);
+extern void fib_entry_src_action_reactivate(fib_entry_t *fib_entry,
+                                           fib_source_t source);
+
+extern fib_entry_t* fib_entry_src_action_add(fib_entry_t *fib_entry,
+                                            fib_source_t source,
+                                            fib_entry_flag_t flags,
+                                            const dpo_id_t *dpo);
+
+extern fib_entry_src_flag_t fib_entry_src_action_remove(fib_entry_t *fib_entry,
+                                                       fib_source_t source);
+
+extern void fib_entry_src_action_install(fib_entry_t *fib_entry,
+                                        fib_source_t source);
+
+extern void fib_entry_src_action_uninstall(fib_entry_t *fib_entry);
+
+/*
+ * Path add and swap return the entry, which the caller must use in place
+ * of the pointer passed in, since the entry may have been realloc'd.
+ * Path remove returns FIB_ENTRY_SRC_FLAG_NONE when the source has no
+ * path-list left and was removed, FIB_ENTRY_SRC_FLAG_ADDED otherwise.
+ */
+extern fib_entry_t* fib_entry_src_action_path_add(fib_entry_t *fib_entry,
+                                                 fib_source_t source,
+                                                 fib_entry_flag_t flags,
+                                                 const fib_route_path_t *path);
+
+extern fib_entry_t* fib_entry_src_action_path_swap(fib_entry_t *fib_entry,
+                                                  fib_source_t source,
+                                                  fib_entry_flag_t flags,
+                                                  const fib_route_path_t *path);
+
+extern fib_entry_src_flag_t fib_entry_src_action_path_remove(fib_entry_t *fib_entry,
+                                                            fib_source_t source,
+                                                            const fib_route_path_t *path);
+
+extern void fib_entry_src_action_installed(const fib_entry_t *fib_entry,
+                                          fib_source_t source);
+
+/*
+ * Helpers shared between the entry and its sources.
+ */
+extern fib_forward_chain_type_t fib_entry_get_default_chain_type(
+    const fib_entry_t *fib_entry);
+/* the entry flags of the first (highest priority) source on the entry */
+extern fib_entry_flag_t fib_entry_get_flags_i(const fib_entry_t *fib_entry);
+/* translate entry flags to their path-list flag equivalents */
+extern fib_path_list_flags_t fib_entry_src_flags_2_path_list_flags(
+    fib_entry_flag_t eflags);
+
+extern void fib_entry_src_mk_lb (fib_entry_t *fib_entry,
+                                const fib_entry_src_t *esrc,
+                                fib_forward_chain_type_t fct,
+                                dpo_id_t *dpo_lb);
+
+
+/*
+ * Per-source registration functions. They are declared here to avoid
+ * needing a separate .h file for each source.
+ */
+extern void fib_entry_src_default_register(void);
+extern void fib_entry_src_rr_register(void);
+extern void fib_entry_src_interface_register(void);
+extern void fib_entry_src_default_route_register(void);
+extern void fib_entry_src_special_register(void);
+extern void fib_entry_src_api_register(void);
+extern void fib_entry_src_adj_register(void);
+extern void fib_entry_src_mpls_register(void);
+extern void fib_entry_src_lisp_register(void);
+
+/*
+ * Module initialisation; invokes the per-source registration functions.
+ */
+extern void fib_entry_src_module_init(void);
+
+#endif
diff --git a/vnet/vnet/fib/fib_entry_src_adj.c b/vnet/vnet/fib/fib_entry_src_adj.c
new file mode 100644 (file)
index 0000000..64f82a7
--- /dev/null
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fib_entry.h"
+#include "fib_entry_src.h"
+#include "fib_path_list.h"
+#include "fib_table.h"
+#include "fib_entry_cover.h"
+#include "fib_attached_export.h"
+
+/**
+ * Source initialisation function.
+ * A new ADJ source tracks no cover and so has no sibling index on one.
+ */
+static void
+fib_entry_src_adj_init (fib_entry_src_t *src)
+{
+    src->adj.fesa_sibling = FIB_NODE_INDEX_INVALID;
+    src->adj.fesa_cover = FIB_NODE_INDEX_INVALID;
+}
+
+/*
+ * Source path swap.
+ * Create a path-list from the paths and flags given and store its index
+ * on the source. The entry is not used.
+ */
+static void
+fib_entry_src_adj_path_swap (fib_entry_src_t *src,
+                             const fib_entry_t *entry,
+                             fib_path_list_flags_t pl_flags,
+                             const fib_route_path_t *paths)
+{
+    fib_node_index_t new_pl;
+
+    new_pl = fib_path_list_create(pl_flags, paths);
+
+    src->fes_pl = new_pl;
+}
+
+/*
+ * Source remove.
+ * Forget the source's path-list by resetting its index to invalid.
+ */
+static void
+fib_entry_src_adj_remove (fib_entry_src_t *src)
+{
+    src->fes_pl = FIB_NODE_INDEX_INVALID;
+}
+
+
+/*
+ * Source activate.
+ * Called when the source is the new best source on the entry.
+ *
+ * Finds the entry's covering prefix, registers the entry as tracking it,
+ * and returns non-zero if the cover is attached, in which case the entry
+ * can install.
+ */
+static int
+fib_entry_src_adj_activate (fib_entry_src_t *src,
+                            const fib_entry_t *fib_entry)
+{
+    fib_node_index_t cover_index;
+    fib_entry_t *cover_entry;
+
+    /*
+     * find the covering prefix. become a dependent thereof.
+     * there should always be a cover, though it may be the default route.
+     */
+    cover_index = fib_table_get_less_specific(fib_entry->fe_fib_index,
+                                              &fib_entry->fe_prefix);
+    src->adj.fesa_cover = cover_index;
+
+    ASSERT(FIB_NODE_INDEX_INVALID != cover_index);
+    ASSERT(fib_entry_get_index(fib_entry) != cover_index);
+
+    cover_entry = fib_entry_get(cover_index);
+
+    ASSERT(cover_entry != fib_entry);
+
+    src->adj.fesa_sibling =
+        fib_entry_cover_track(cover_entry,
+                              fib_entry_get_index(fib_entry));
+
+    /*
+     * if the cover is attached then this adj source entry can install,
+     * via the adj. otherwise install a drop.
+     * This prevents ARP/ND entries on interface X that do not belong
+     * on X's subnet from being added to the FIB. To do so would allow
+     * nefarious gratuitous ARP requests to attract traffic to the sender.
+     *
+     * and yes, I really do mean attached and not connected.
+     * this abomination;
+     *   ip route add 10.0.0.0/24 Eth0
+     * is attached. and we want adj-fibs to install on Eth0.
+     */
+    return (fib_entry_get_flags_i(cover_entry) & FIB_ENTRY_FLAG_ATTACHED);
+}
+
+/*
+ * Source deactivate.
+ * Called when the source is no longer the best source on the entry.
+ *
+ * Stops tracking the cover, tells the cover the entry no longer needs
+ * exporting and forgets the cover's index.
+ */
+static void
+fib_entry_src_adj_deactivate (fib_entry_src_t *src,
+                              const fib_entry_t *fib_entry)
+{
+    fib_entry_t *cover_entry;
+
+    ASSERT(FIB_NODE_INDEX_INVALID != src->adj.fesa_cover);
+
+    cover_entry = fib_entry_get(src->adj.fesa_cover);
+
+    /*
+     * remove the dependency on the covering entry
+     */
+    fib_entry_cover_untrack(cover_entry, src->adj.fesa_sibling);
+
+    /*
+     * tell the cover this entry no longer needs exporting
+     */
+    fib_attached_export_covered_removed(cover_entry,
+                                        fib_entry_get_index(fib_entry));
+
+    src->adj.fesa_cover = FIB_NODE_INDEX_INVALID;
+}
+
+/*
+ * Format.
+ * Append "cover:<index>", the index of the cover this source tracks, to s.
+ */
+static u8*
+fib_entry_src_adj_format (fib_entry_src_t *src,
+                        u8* s)
+{
+    return (format(s, "cover:%d", src->adj.fesa_cover));
+}
+
+/*
+ * Installed.
+ * The adj source now rules! Poke our cover to get this entry exported.
+ */
+static void
+fib_entry_src_adj_installed (fib_entry_src_t *src,
+                             const fib_entry_t *fib_entry)
+{
+    fib_entry_t *cover_entry;
+
+    ASSERT(FIB_NODE_INDEX_INVALID != src->adj.fesa_cover);
+
+    cover_entry = fib_entry_get(src->adj.fesa_cover);
+
+    fib_attached_export_covered_added(cover_entry,
+                                      fib_entry_get_index(fib_entry));
+}
+
+/*
+ * fib_entry_src_adj_cover_change
+ *
+ * The cover has changed: drop the dependency on the old cover and
+ * activate against the new one. The result's install value is that
+ * returned by the activation; when it is non-zero the back-walk reason
+ * is EVALUATE, otherwise NONE.
+ */
+static fib_entry_src_cover_res_t
+fib_entry_src_adj_cover_change (fib_entry_src_t *src,
+                                const fib_entry_t *fib_entry)
+{
+    fib_entry_src_cover_res_t result;
+
+    fib_entry_src_adj_deactivate(src, fib_entry);
+
+    result.install = fib_entry_src_adj_activate(src, fib_entry);
+
+    /*
+     * if the ADJ fib can install, ask for an evaluation
+     */
+    result.bw_reason = (result.install ?
+                        FIB_NODE_BW_REASON_FLAG_EVALUATE :
+                        FIB_NODE_BW_REASON_FLAG_NONE);
+
+    return (result);
+}
+
+/*
+ * fib_entry_src_adj_cover_update
+ *
+ * The cover has updated, i.e. its forwarding or flags have changed.
+ * Don't deactivate/activate here, since this prefix is updated during
+ * the cover's walk. Only re-derive whether the entry can install from
+ * the cover's attached flag; the back-walk reason is always NONE.
+ */
+static fib_entry_src_cover_res_t
+fib_entry_src_adj_cover_update (fib_entry_src_t *src,
+                                const fib_entry_t *fib_entry)
+{
+    fib_entry_src_cover_res_t result;
+    fib_entry_t *cover_entry;
+
+    ASSERT(FIB_NODE_INDEX_INVALID != src->adj.fesa_cover);
+
+    cover_entry = fib_entry_get(src->adj.fesa_cover);
+
+    result.bw_reason = FIB_NODE_BW_REASON_FLAG_NONE;
+    result.install = (fib_entry_get_flags_i(cover_entry) &
+                      FIB_ENTRY_FLAG_ATTACHED);
+
+    return (result);
+}
+
+/*
+ * The virtual function table for the ADJ source.
+ * Members not named here are zero-initialised, i.e. NULL; note that no
+ * path add or path remove function is provided, only path swap.
+ */
+static const fib_entry_src_vft_t adj_src_vft = {
+    .fesv_init = fib_entry_src_adj_init,
+    .fesv_path_swap = fib_entry_src_adj_path_swap,
+    .fesv_remove = fib_entry_src_adj_remove,
+    .fesv_activate = fib_entry_src_adj_activate,
+    .fesv_deactivate = fib_entry_src_adj_deactivate,
+    .fesv_format = fib_entry_src_adj_format,
+    .fesv_installed = fib_entry_src_adj_installed,
+    .fesv_cover_change = fib_entry_src_adj_cover_change,
+    .fesv_cover_update = fib_entry_src_adj_cover_update,
+};
+
+/*
+ * Register the ADJ source's virtual function table against FIB_SOURCE_ADJ.
+ */
+void
+fib_entry_src_adj_register (void)
+{
+    fib_entry_src_register(FIB_SOURCE_ADJ, &adj_src_vft);
+}
diff --git a/vnet/vnet/fib/fib_entry_src_api.c b/vnet/vnet/fib/fib_entry_src_api.c
new file mode 100644 (file)
index 0000000..edc8a47
--- /dev/null
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fib_entry.h"
+#include "fib_entry_src.h"
+#include "fib_path_list.h"
+
+/**
+ * Source initialisation function.
+ * Intentionally empty: this source sets no state at init time.
+ */
+static void
+fib_entry_src_api_init (fib_entry_src_t *src)
+{
+}
+
+/**
+ * Source deinitialisation function.
+ * Intentionally empty: there is nothing to release here.
+ */
+static void
+fib_entry_src_api_deinit (fib_entry_src_t *src)
+{
+}
+
+/**
+ * Path swap: replace the source's path-list with a new one built from
+ * the given paths. The SHARED flag is always added to the caller's flags.
+ */
+static void
+fib_entry_src_api_path_swap (fib_entry_src_t *src,
+                            const fib_entry_t *entry,
+                            fib_path_list_flags_t pl_flags,
+                            const fib_route_path_t *paths)
+{
+    const fib_path_list_flags_t shared_flags =
+        (FIB_PATH_LIST_FLAG_SHARED | pl_flags);
+
+    src->fes_pl = fib_path_list_create(shared_flags, paths);
+}
+
+/**
+ * Path add: create the path-list if the source has none yet, otherwise
+ * obtain a new list via fib_path_list_copy_and_path_add() on the current
+ * one. The SHARED flag is always added to the caller's flags.
+ */
+static void
+fib_entry_src_api_path_add (fib_entry_src_t *src,
+                           const fib_entry_t *entry,
+                           fib_path_list_flags_t pl_flags,
+                           const fib_route_path_t *paths)
+{
+    const fib_path_list_flags_t shared_flags =
+        (FIB_PATH_LIST_FLAG_SHARED | pl_flags);
+
+    src->fes_pl = ((FIB_NODE_INDEX_INVALID == src->fes_pl) ?
+                   fib_path_list_create(shared_flags, paths) :
+                   fib_path_list_copy_and_path_add(src->fes_pl,
+                                                   shared_flags,
+                                                   paths));
+}
+
+/**
+ * Path remove: when the source has a path-list, replace it with the
+ * result of fib_path_list_copy_and_path_remove(). A source without a
+ * path-list is left untouched.
+ */
+static void
+fib_entry_src_api_path_remove (fib_entry_src_t *src,
+                              fib_path_list_flags_t pl_flags,
+                              const fib_route_path_t *paths)
+{
+    const fib_path_list_flags_t shared_flags =
+        (FIB_PATH_LIST_FLAG_SHARED | pl_flags);
+
+    if (FIB_NODE_INDEX_INVALID == src->fes_pl)
+    {
+        return;
+    }
+
+    src->fes_pl = fib_path_list_copy_and_path_remove(src->fes_pl,
+                                                     shared_flags,
+                                                     paths);
+}
+
+/**
+ * Source add: for an entry added with flags, create a special path-list
+ * from those flags and the given DPO. With no flags nothing is created.
+ */
+static void
+fib_entry_src_api_add (fib_entry_src_t *src,
+                      const fib_entry_t *entry,
+                      fib_entry_flag_t flags,
+                      fib_protocol_t proto,
+                      const dpo_id_t *dpo)
+{
+    if (FIB_ENTRY_FLAG_NONE == flags)
+    {
+        return;
+    }
+
+    src->fes_pl =
+        fib_path_list_create_special(proto,
+                                     fib_entry_src_flags_2_path_list_flags(flags),
+                                     dpo);
+}
+
+/**
+ * Source remove: forget the source's path-list index.
+ */
+static void
+fib_entry_src_api_remove (fib_entry_src_t *src)
+{
+    src->fes_pl = FIB_NODE_INDEX_INVALID;
+}
+
+/*
+ * The function table shared by the sources registered in this file.
+ */
+static const fib_entry_src_vft_t api_src_vft = {
+    .fesv_init = fib_entry_src_api_init,
+    .fesv_deinit = fib_entry_src_api_deinit,
+    .fesv_add = fib_entry_src_api_add,
+    .fesv_remove = fib_entry_src_api_remove,
+    .fesv_path_swap = fib_entry_src_api_path_swap,
+    .fesv_path_add = fib_entry_src_api_path_add,
+    .fesv_path_remove = fib_entry_src_api_path_remove,
+};
+
+/*
+ * Register the same function table for the PLUGIN_HI, API, CLI and
+ * DHCP sources.
+ */
+void
+fib_entry_src_api_register (void)
+{
+    fib_entry_src_register(FIB_SOURCE_PLUGIN_HI, &api_src_vft);
+    fib_entry_src_register(FIB_SOURCE_API, &api_src_vft);
+    fib_entry_src_register(FIB_SOURCE_CLI, &api_src_vft);
+    fib_entry_src_register(FIB_SOURCE_DHCP, &api_src_vft);
+}
diff --git a/vnet/vnet/fib/fib_entry_src_default.c b/vnet/vnet/fib/fib_entry_src_default.c
new file mode 100644 (file)
index 0000000..9846cf5
--- /dev/null
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fib_entry.h"
+#include "fib_entry_src.h"
+#include "fib_path_list.h"
+
+/**
+ * Source initialisation function.
+ * Default behaviour: takes no action.
+ */
+static void
+fib_entry_src_default_init (fib_entry_src_t *src)
+{
+}
+
+/**
+ * Source deinitialisation function.
+ * Default behaviour: takes no action.
+ */
+static void
+fib_entry_src_default_deinit (fib_entry_src_t *src)
+{
+}
+
+/**
+ * Cover change stub; takes no action.
+ *
+ * NOTE(review): nothing in this file references this function - confirm
+ * whether it is still needed.
+ */
+static void
+fib_entry_src_cover_change (fib_entry_src_t *src)
+{
+}
+
+/**
+ * Path add stub; takes no action.
+ */
+static void
+fib_entry_src_default_path_add (fib_entry_src_t *src,
+                               fib_protocol_t proto,
+                               const ip46_address_t *next_hop,
+                               u32 next_hop_sw_if_index,
+                               u32 next_hop_fib_index,
+                               u32 next_hop_weight)
+{
+}
+
+/**
+ * Path remove stub; takes no action.
+ */
+static void
+fib_entry_src_default_path_remove (fib_entry_src_t *src,
+                                    fib_protocol_t proto,
+                                    const ip46_address_t *next_hop,
+                                    u32 next_hop_sw_if_index,
+                                    u32 next_hop_fib_index,
+                                    u32 next_hop_weight)
+{
+}
+
+
+
+/*
+ * Source activate.
+ * Called when the source is the new best source on the entry.
+ * Default behaviour: takes no action.
+ */
+static void
+fib_entry_src_default_activate (fib_entry_src_t *src,
+                                 const fib_entry_t *fib_entry)
+{
+}
+
+/*
+ * Source deactivate.
+ * Called when the source is no longer the best source on the entry.
+ * Default behaviour: takes no action.
+ */
+static void
+fib_entry_src_default_deactivate (fib_entry_src_t *src,
+                                   const fib_entry_t *fib_entry)
+{
+}
+
+/**
+ * Source add stub; takes no action.
+ */
+static void
+fib_entry_src_default_add (fib_entry_src_t *src,
+                            fib_entry_flag_t flags,
+                            fib_protocol_t proto)
+{
+}
+
+/**
+ * Source remove stub; takes no action.
+ */
+static void
+fib_entry_src_default_remove (fib_entry_src_t *src)                         
+{
+}
+
+/*
+ * The default function table: every handler is a no-op.
+ *
+ * NOTE(review): the add, path_add, path_remove and activate handlers in
+ * this file take different parameters (or return a different type) from
+ * the handlers installed in the same fesv_* slots by the other source
+ * files in this change - confirm this file is compiled and matches
+ * fib_entry_src_vft_t.
+ */
+const static fib_entry_src_vft_t default_src_vft = {
+    .fesv_init = fib_entry_src_default_init,
+    .fesv_deinit = fib_entry_src_default_deinit,
+    .fesv_add = fib_entry_src_default_add,
+    .fesv_remove = fib_entry_src_default_remove,
+    .fesv_path_add = fib_entry_src_default_path_add,
+    .fesv_path_remove = fib_entry_src_default_path_remove,
+    .fesv_activate = fib_entry_src_default_activate,
+    .fesv_deactivate = fib_entry_src_default_deactivate,
+};
+
+/*
+ * Register the default function table against every FIB source.
+ */
+void
+fib_entry_src_default_register (void)
+{
+    fib_source_t src_id;
+
+    FOR_EACH_FIB_SOURCE(src_id)
+    {
+        fib_entry_src_register(src_id, &default_src_vft);
+    }
+}
diff --git a/vnet/vnet/fib/fib_entry_src_default_route.c b/vnet/vnet/fib/fib_entry_src_default_route.c
new file mode 100644 (file)
index 0000000..8615f72
--- /dev/null
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fib_entry.h"
+#include "fib_entry_src.h"
+
+/**
+ * Source initialisation function.
+ * Clears the source's flags.
+ */
+static void
+fib_entry_src_default_route_init (fib_entry_src_t *src)
+{
+    src->fes_flags = FIB_ENTRY_FLAG_NONE;
+}
+
+/**
+ * Source remove: forget the source's path-list index.
+ */
+static void
+fib_entry_src_default_route_remove (fib_entry_src_t *src)
+{
+    src->fes_pl = FIB_NODE_INDEX_INVALID;
+}
+
+/**
+ * Source add: the default route always gets a special path-list created
+ * with the DROP flag; the flags passed by the caller are not consulted.
+ */
+static void
+fib_entry_src_default_route_add (fib_entry_src_t *src,
+                                const fib_entry_t *entry,
+                                fib_entry_flag_t flags,
+                                fib_protocol_t proto,
+                                const dpo_id_t *dpo)
+{
+    const fib_node_index_t drop_pl =
+        fib_path_list_create_special(proto, FIB_PATH_LIST_FLAG_DROP, dpo);
+
+    src->fes_pl = drop_pl;
+}
+
+/*
+ * The function table for the default-route source.
+ */
+static const fib_entry_src_vft_t default_route_src_vft = {
+    .fesv_init = fib_entry_src_default_route_init,
+    .fesv_add = fib_entry_src_default_route_add,
+    .fesv_remove = fib_entry_src_default_route_remove,
+};
+
+/*
+ * Register the default-route function table against
+ * FIB_SOURCE_DEFAULT_ROUTE.
+ */
+void
+fib_entry_src_default_route_register (void)
+{
+    fib_entry_src_register(FIB_SOURCE_DEFAULT_ROUTE, &default_route_src_vft);
+}
+
+
diff --git a/vnet/vnet/fib/fib_entry_src_interface.c b/vnet/vnet/fib/fib_entry_src_interface.c
new file mode 100644 (file)
index 0000000..2fb6167
--- /dev/null
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fib_entry.h"
+#include "fib_entry_src.h"
+#include "fib_path_list.h"
+#include "fib_internal.h"
+#include "fib_table.h"
+#include "fib_entry_cover.h"
+#include "fib_attached_export.h"
+
+/**
+ * Source initialisation function.
+ * The source starts out tracking no cover: both the cover index and the
+ * sibling index are set to invalid.
+ */
+static void
+fib_entry_src_interface_init (fib_entry_src_t *src)
+{
+    src->interface.fesi_cover = FIB_NODE_INDEX_INVALID;
+    src->interface.fesi_sibling = FIB_NODE_INDEX_INVALID;
+}
+
+/**
+ * Path swap for the interface source.
+ *
+ * Creates the source's path-list from the supplied paths. For a source
+ * that is not flagged LOCAL, if the adjacency the path-list resolves to
+ * is a glean, the entry's prefix address is stored in that adjacency.
+ *
+ * src      - the interface source on the entry
+ * entry    - the entry the source belongs to
+ * pl_flags - flags for the new path-list
+ * paths    - the paths from which the path-list is built
+ */
+static void
+fib_entry_src_interface_path_swap (fib_entry_src_t *src,
+                                  const fib_entry_t *entry,
+                                  fib_path_list_flags_t pl_flags,
+                                  const fib_route_path_t *paths)
+{
+    ip_adjacency_t *adj;
+
+    src->fes_pl = fib_path_list_create(pl_flags, paths);
+
+    /*
+     * this is a hack to get the entry's prefix into the glean adjacency
+     * so that it is available for fast retrieval in the switch path.
+     */
+    if (!(FIB_ENTRY_FLAG_LOCAL & src->fes_entry_flags))
+    {
+        adj = adj_get(fib_path_list_get_adj(
+                         src->fes_pl,
+                         fib_entry_get_default_chain_type(entry)));
+
+        /*
+         * sub_type.glean is only meaningful for a glean adjacency, so
+         * the write must be conditional on the adjacency's type.
+         */
+        if (IP_LOOKUP_NEXT_GLEAN == adj->lookup_next_index)
+        {
+            /*
+             * the connected prefix will link to a glean on a non-p2p
+             * interface.
+             */
+            adj->sub_type.glean.receive_addr = entry->fe_prefix.fp_addr;
+        }
+    }
+}
+
+/*
+ * Source activate.
+ * Called when the source is the new best source on the entry.
+ * Always returns non-zero.
+ */
+static int
+fib_entry_src_interface_activate (fib_entry_src_t *src,
+                                 const fib_entry_t *fib_entry)
+{
+    fib_entry_t *cover_entry;
+
+    /*
+     * only a local prefix tracks a cover
+     */
+    if (!(FIB_ENTRY_FLAG_LOCAL & src->fes_entry_flags))
+    {
+        return (!0);
+    }
+
+    /*
+     * Track the covering attached/connected cover. This is so that
+     * during an attached export of the cover, this local prefix is
+     * also exported
+     */
+    src->interface.fesi_cover =
+        fib_table_get_less_specific(fib_entry->fe_fib_index,
+                                    &fib_entry->fe_prefix);
+
+    ASSERT(FIB_NODE_INDEX_INVALID != src->interface.fesi_cover);
+
+    cover_entry = fib_entry_get(src->interface.fesi_cover);
+
+    src->interface.fesi_sibling =
+        fib_entry_cover_track(cover_entry, fib_entry_get_index(fib_entry));
+
+    return (!0);
+}
+
+
+
+/*
+ * Source deactivate.
+ * Called when the source is no longer the best source on the entry.
+ */
+static void
+fib_entry_src_interface_deactivate (fib_entry_src_t *src,
+                                   const fib_entry_t *fib_entry)
+{
+    fib_entry_t *cover_entry;
+
+    /*
+     * nothing to undo when no cover is being tracked
+     */
+    if (FIB_NODE_INDEX_INVALID == src->interface.fesi_cover)
+    {
+        return;
+    }
+
+    /*
+     * remove the dependency on the covering entry
+     */
+    cover_entry = fib_entry_get(src->interface.fesi_cover);
+    fib_entry_cover_untrack(cover_entry, src->interface.fesi_sibling);
+
+    src->interface.fesi_cover = FIB_NODE_INDEX_INVALID;
+}
+
+/**
+ * Cover change for the interface source.
+ *
+ * If the source is tracking a cover and that cover is no longer the
+ * entry's less specific prefix, the source is deactivated and
+ * re-activated so that it tracks the new cover.
+ *
+ * The result always requests install with no back-walk reason.
+ */
+static fib_entry_src_cover_res_t
+fib_entry_src_interface_cover_change (fib_entry_src_t *src,
+                                     const fib_entry_t *fib_entry)
+{
+    fib_entry_src_cover_res_t res = {
+       .install = !0,
+       .bw_reason = FIB_NODE_BW_REASON_FLAG_NONE,
+    };
+
+    if (FIB_NODE_INDEX_INVALID == src->interface.fesi_cover)
+    {
+       /*
+        * not tracking the cover. surprised we got poked?
+        */
+       return (res);
+    }
+
+    /*
+     * this function is called when this entry's cover has a more specific
+     * entry inserted beneath it. That does not necessarily mean that this
+     * entry is covered by the new prefix. check that against the cover
+     * this source tracks (the interface member, not another source's).
+     */
+    if (src->interface.fesi_cover !=
+        fib_table_get_less_specific(fib_entry->fe_fib_index,
+                                    &fib_entry->fe_prefix))
+    {
+       fib_entry_src_interface_deactivate(src, fib_entry);
+       fib_entry_src_interface_activate(src, fib_entry);
+    }
+    return (res);
+}
+
+/**
+ * Called once the interface source has been installed on the entry.
+ * When a cover is tracked, it is told that this covered entry was added.
+ */
+static void
+fib_entry_src_interface_installed (fib_entry_src_t *src,
+                                  const fib_entry_t *fib_entry)
+{
+    fib_entry_t *cover_entry;
+
+    if (FIB_NODE_INDEX_INVALID == src->interface.fesi_cover)
+    {
+        return;
+    }
+
+    /*
+     * The interface source now rules! poke our cover to get exported
+     */
+    cover_entry = fib_entry_get(src->interface.fesi_cover);
+
+    fib_attached_export_covered_added(cover_entry,
+                                      fib_entry_get_index(fib_entry));
+}
+
+/**
+ * Format the source-specific data: appends the tracked cover's index
+ * to s and returns the result of format().
+ */
+static u8*
+fib_entry_src_interface_format (fib_entry_src_t *src,
+                               u8* s)
+{
+    return (format(s, "cover:%d", src->interface.fesi_cover));
+}
+
+/*
+ * The function table for the interface source.
+ *
+ * There is deliberately no cover_update handler: this source is not
+ * concerned about updates to the cover. The cover will decide to
+ * export or not.
+ */
+static const fib_entry_src_vft_t interface_src_vft = {
+    .fesv_init = fib_entry_src_interface_init,
+    .fesv_path_swap = fib_entry_src_interface_path_swap,
+    .fesv_activate = fib_entry_src_interface_activate,
+    .fesv_deactivate = fib_entry_src_interface_deactivate,
+    .fesv_installed = fib_entry_src_interface_installed,
+    .fesv_cover_change = fib_entry_src_interface_cover_change,
+    .fesv_format = fib_entry_src_interface_format,
+};
+
+/*
+ * Register the interface function table against FIB_SOURCE_INTERFACE.
+ */
+void
+fib_entry_src_interface_register (void)
+{
+    fib_entry_src_register(FIB_SOURCE_INTERFACE, &interface_src_vft);    
+}
diff --git a/vnet/vnet/fib/fib_entry_src_lisp.c b/vnet/vnet/fib/fib_entry_src_lisp.c
new file mode 100644 (file)
index 0000000..116c492
--- /dev/null
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fib_entry.h"
+#include "fib_entry_src.h"
+#include "fib_path_list.h"
+
+/**
+ * Source initialisation function.
+ * Intentionally empty: this source sets no state at init time.
+ */
+static void
+fib_entry_src_lisp_init (fib_entry_src_t *src)
+{
+}
+
+/**
+ * Source deinitialisation function.
+ * Intentionally empty: there is nothing to release here.
+ */
+static void
+fib_entry_src_lisp_deinit (fib_entry_src_t *src)
+{
+}
+
+/**
+ * Path swap: replace the source's path-list with a new one built from
+ * the given paths. The SHARED flag is always added to the caller's flags.
+ */
+static void
+fib_entry_src_lisp_path_swap (fib_entry_src_t *src,
+                             const fib_entry_t *entry,
+                             fib_path_list_flags_t pl_flags,
+                             const fib_route_path_t *paths)
+{
+    const fib_path_list_flags_t shared_flags =
+        (FIB_PATH_LIST_FLAG_SHARED | pl_flags);
+
+    src->fes_pl = fib_path_list_create(shared_flags, paths);
+}
+
+/**
+ * Path add: create the path-list if the source has none yet, otherwise
+ * obtain a new list via fib_path_list_copy_and_path_add() on the current
+ * one. The SHARED flag is always added to the caller's flags.
+ */
+static void
+fib_entry_src_lisp_path_add (fib_entry_src_t *src,
+                           const fib_entry_t *entry,
+                           fib_path_list_flags_t pl_flags,
+                           const fib_route_path_t *paths)
+{
+    const fib_path_list_flags_t shared_flags =
+        (FIB_PATH_LIST_FLAG_SHARED | pl_flags);
+
+    src->fes_pl = ((FIB_NODE_INDEX_INVALID == src->fes_pl) ?
+                   fib_path_list_create(shared_flags, paths) :
+                   fib_path_list_copy_and_path_add(src->fes_pl,
+                                                   shared_flags,
+                                                   paths));
+}
+
+/**
+ * Path remove: when the source has a path-list, replace it with the
+ * result of fib_path_list_copy_and_path_remove(). A source without a
+ * path-list is left untouched.
+ */
+static void
+fib_entry_src_lisp_path_remove (fib_entry_src_t *src,
+                              fib_path_list_flags_t pl_flags,
+                              const fib_route_path_t *paths)
+{
+    const fib_path_list_flags_t shared_flags =
+        (FIB_PATH_LIST_FLAG_SHARED | pl_flags);
+
+    if (FIB_NODE_INDEX_INVALID == src->fes_pl)
+    {
+        return;
+    }
+
+    src->fes_pl = fib_path_list_copy_and_path_remove(src->fes_pl,
+                                                     shared_flags,
+                                                     paths);
+}
+
+/**
+ * Source add: for an entry added with flags, create a special path-list
+ * for the given DPO. With no flags nothing is created.
+ *
+ * src   - the LISP source on the entry
+ * entry - the entry the source belongs to
+ * flags - the entry flags the source was added with
+ * proto - the protocol of the path-list to create
+ * dpo   - the DPO handed to the special path-list
+ */
+static void
+fib_entry_src_lisp_add (fib_entry_src_t *src,
+                      const fib_entry_t *entry,
+                      fib_entry_flag_t flags,
+                      fib_protocol_t proto,
+                      const dpo_id_t *dpo)
+{
+    if (FIB_ENTRY_FLAG_NONE != flags)
+    {
+       /*
+        * entry flags and path-list flags are different types; convert
+        * rather than passing the entry flags through unchanged.
+        */
+       src->fes_pl = fib_path_list_create_special(
+                         proto,
+                         fib_entry_src_flags_2_path_list_flags(flags),
+                         dpo);
+    }
+}
+
+/**
+ * Source remove: forget the source's path-list index.
+ */
+static void
+fib_entry_src_lisp_remove (fib_entry_src_t *src)
+{
+    src->fes_pl = FIB_NODE_INDEX_INVALID;
+}
+
+/**
+ * Set the source-specific data: data is read as a u32 and stored as the
+ * source's LISP FIB index.
+ */
+static void
+fib_entry_src_lisp_set_data (fib_entry_src_t *src,
+                             const fib_entry_t *entry,
+                             const void *data)
+{
+    src->lisp.fesl_fib_index = *(u32*)data;
+}
+
+/**
+ * Get the source-specific data: returns a pointer to the LISP FIB index
+ * stored inside the source.
+ */
+static const void*
+fib_entry_src_lisp_get_data (fib_entry_src_t *src,
+                             const fib_entry_t *entry)
+{
+    return (&(src->lisp.fesl_fib_index));
+}
+
+/*
+ * The function table for the LISP source.
+ */
+static const fib_entry_src_vft_t lisp_src_vft = {
+    .fesv_init = fib_entry_src_lisp_init,
+    .fesv_deinit = fib_entry_src_lisp_deinit,
+    .fesv_add = fib_entry_src_lisp_add,
+    .fesv_remove = fib_entry_src_lisp_remove,
+    .fesv_path_add = fib_entry_src_lisp_path_add,
+    .fesv_path_swap = fib_entry_src_lisp_path_swap,
+    .fesv_path_remove = fib_entry_src_lisp_path_remove,
+    .fesv_set_data = fib_entry_src_lisp_set_data,
+    .fesv_get_data = fib_entry_src_lisp_get_data,
+};
+
+/*
+ * Register the LISP function table against FIB_SOURCE_LISP.
+ */
+void
+fib_entry_src_lisp_register (void)
+{
+    fib_entry_src_register(FIB_SOURCE_LISP, &lisp_src_vft);
+}
diff --git a/vnet/vnet/fib/fib_entry_src_mpls.c b/vnet/vnet/fib/fib_entry_src_mpls.c
new file mode 100644 (file)
index 0000000..5145c10
--- /dev/null
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/mpls/mpls_types.h>
+#include <vnet/dpo/drop_dpo.h>
+
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/fib_entry.h>
+#include <vnet/fib/fib_entry_src.h>
+#include <vnet/fib/mpls_fib.h>
+
+/**
+ * Source initialisation function.
+ * Starts the source with no flags, no local label, and no label entry
+ * for either EOS bit.
+ */
+static void
+fib_entry_src_mpls_init (fib_entry_src_t *src)
+{
+    mpls_eos_bit_t eos_bit;
+
+    FOR_EACH_MPLS_EOS_BIT(eos_bit)
+    {
+        src->mpls.fesm_lfes[eos_bit] = FIB_NODE_INDEX_INVALID;
+    }
+
+    src->mpls.fesm_label = MPLS_LABEL_INVALID;
+    src->fes_flags = FIB_ENTRY_FLAG_NONE;
+}
+
+/**
+ * Source deinitialisation function.
+ * Intentionally empty: there is nothing to release here.
+ */
+static void
+fib_entry_src_mpls_deinit (fib_entry_src_t *src)
+{
+}
+
+/**
+ * Source remove: forget the source's path-list index and mark the local
+ * label as invalid.
+ */
+static void
+fib_entry_src_mpls_remove (fib_entry_src_t *src)
+{
+    src->fes_pl = FIB_NODE_INDEX_INVALID;
+    src->mpls.fesm_label = MPLS_LABEL_INVALID;
+}
+
+/**
+ * Source add: the source gets a special path-list created with the DROP
+ * flag and the drop DPO for the protocol. The flags and dpo arguments
+ * are not consulted.
+ */
+static void
+fib_entry_src_mpls_add (fib_entry_src_t *src,
+                        const fib_entry_t *entry,
+                        fib_entry_flag_t flags,
+                        fib_protocol_t proto,
+                        const dpo_id_t *dpo)
+{
+    const dpo_id_t *drop = drop_dpo_get(fib_proto_to_dpo(proto));
+
+    src->fes_pl = fib_path_list_create_special(proto,
+                                               FIB_PATH_LIST_FLAG_DROP,
+                                               drop);
+}
+
+/**
+ * Forwarding update: for each EOS bit, fetch the forwarding the entry
+ * contributes for the matching MPLS chain type and push it into the
+ * label entry held for that EOS bit.
+ */
+static void
+fib_entry_src_mpls_fwd_update (fib_entry_src_t *src,
+                              const fib_entry_t *fib_entry,
+                              fib_source_t best_source)
+{
+    mpls_eos_bit_t eos_bit;
+    dpo_id_t fwd = DPO_NULL;
+
+    FOR_EACH_MPLS_EOS_BIT(eos_bit)
+    {
+        /* EOS and non-EOS label entries use different chain types */
+        fib_entry_contribute_forwarding(fib_entry_get_index(fib_entry),
+                                        (eos_bit ?
+                                         FIB_FORW_CHAIN_TYPE_MPLS_EOS :
+                                         FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS),
+                                        &fwd);
+
+        fib_table_entry_special_dpo_update(src->mpls.fesm_lfes[eos_bit],
+                                           FIB_SOURCE_SPECIAL,
+                                           FIB_ENTRY_FLAG_EXCLUSIVE,
+                                           &fwd);
+    }
+
+    dpo_reset(&fwd);
+}
+
+/**
+ * Set the source-specific data: the MPLS local label for the entry.
+ *
+ * src   - the MPLS source on the entry
+ * entry - the entry the label is bound to
+ * data  - pointer to the mpls_label_t to bind. MPLS_LABEL_INVALID
+ *         removes the label entries and unlocks the default MPLS table.
+ *
+ * For a valid label, one label entry per EOS bit is added to the default
+ * MPLS table, each carrying the forwarding this entry contributes for
+ * the matching chain type.
+ */
+static void
+fib_entry_src_mpls_set_data (fib_entry_src_t *src,
+                             const fib_entry_t *entry,
+                             const void *data)
+{
+    /* holds entry->fe_prefix.fp_proto, which is a FIB protocol */
+    fib_protocol_t payload_proto;
+    fib_node_index_t fei;
+    mpls_label_t label;
+    mpls_eos_bit_t eos;
+
+    /*
+     * post MPLS table alloc and the possible re-alloc of fib entries
+     * the entry pointer will no longer be valid. so save its index
+     */
+    payload_proto = entry->fe_prefix.fp_proto;
+    fei = fib_entry_get_index(entry);
+    label = *(const mpls_label_t*)data;
+
+    if (MPLS_LABEL_INVALID == label)
+    {
+        /*
+         * removing the local label
+         */
+        FOR_EACH_MPLS_EOS_BIT(eos)
+        {
+           fib_table_entry_delete_index(src->mpls.fesm_lfes[eos],
+                                        FIB_SOURCE_SPECIAL);
+        }
+        fib_table_unlock(MPLS_FIB_DEFAULT_TABLE_ID, FIB_PROTOCOL_MPLS);
+        src->mpls.fesm_label = label;
+    }
+    else
+    {
+       fib_prefix_t prefix = {
+           .fp_proto = FIB_PROTOCOL_MPLS,
+           .fp_label = label,
+       };
+       fib_node_index_t fib_index;
+       dpo_id_t dpo = DPO_NULL;
+
+        /*
+         * adding a new local label. make sure the MPLS fib exists.
+         * the table is locked only on the first label; afterwards it
+         * is simply looked up.
+         */
+        if (MPLS_LABEL_INVALID == src->mpls.fesm_label)
+        {
+            fib_index =
+               fib_table_find_or_create_and_lock(FIB_PROTOCOL_MPLS,
+                                                 MPLS_FIB_DEFAULT_TABLE_ID);
+        }
+       else
+       {
+           fib_index = mpls_fib_index_from_table_id(MPLS_FIB_DEFAULT_TABLE_ID);
+       }
+
+        src->mpls.fesm_label = label;
+
+       FOR_EACH_MPLS_EOS_BIT(eos)
+       {
+           prefix.fp_eos = eos;
+           prefix.fp_payload_proto = fib_proto_to_dpo(payload_proto);
+
+           fib_entry_contribute_forwarding(fei,
+                                           (eos ?
+                                            FIB_FORW_CHAIN_TYPE_MPLS_EOS :
+                                            FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS),
+                                           &dpo);
+           src->mpls.fesm_lfes[eos] =
+               fib_table_entry_special_dpo_add(fib_index,
+                                               &prefix,
+                                               FIB_SOURCE_SPECIAL,
+                                               FIB_ENTRY_FLAG_EXCLUSIVE,
+                                               &dpo);
+           dpo_reset(&dpo);
+       }
+    }
+}
+
+/**
+ * Get the source-specific data: returns a pointer to the local label
+ * stored inside the source.
+ */
+static const void *
+fib_entry_src_mpls_get_data (fib_entry_src_t *src,
+                             const fib_entry_t *entry)
+{
+    return (&(src->mpls.fesm_label));
+}
+
+/**
+ * Format the source-specific data: appends the local label to s and
+ * returns the result of format().
+ */
+static u8*
+fib_entry_src_mpls_format (fib_entry_src_t *src,
+                          u8* s)
+{
+    return (format(s, "MPLS local-label:%d", src->mpls.fesm_label));
+}
+
+/*
+ * The function table for the MPLS source.
+ */
+static const fib_entry_src_vft_t mpls_src_vft = {
+    .fesv_init = fib_entry_src_mpls_init,
+    .fesv_deinit = fib_entry_src_mpls_deinit,
+    .fesv_add = fib_entry_src_mpls_add,
+    .fesv_remove = fib_entry_src_mpls_remove,
+    .fesv_fwd_update = fib_entry_src_mpls_fwd_update,
+    .fesv_set_data = fib_entry_src_mpls_set_data,
+    .fesv_get_data = fib_entry_src_mpls_get_data,
+    .fesv_format = fib_entry_src_mpls_format,
+};
+
+/*
+ * Register the MPLS function table against FIB_SOURCE_MPLS.
+ */
+void
+fib_entry_src_mpls_register (void)
+{
+    fib_entry_src_register(FIB_SOURCE_MPLS, &mpls_src_vft);    
+}
+
+
diff --git a/vnet/vnet/fib/fib_entry_src_rr.c b/vnet/vnet/fib/fib_entry_src_rr.c
new file mode 100644 (file)
index 0000000..f6b8960
--- /dev/null
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/ip/format.h>
+#include <vnet/ip/lookup.h>
+#include <vnet/adj/adj.h>
+
+#include "fib_entry_src.h"
+#include "fib_entry_cover.h"
+#include "fib_entry.h"
+#include "fib_table.h"
+
+/*
+ * fib_entry_src_rr_resolve_via_connected
+ *
+ * Resolve via a connected cover: give the source a single-path
+ * path-list whose next-hop is the entry's own address on the cover's
+ * resolving interface, and copy the cover's flags onto the source.
+ */
+static void
+fib_entry_src_rr_resolve_via_connected (fib_entry_src_t *src,
+                                       const fib_entry_t *fib_entry,
+                                       const fib_entry_t *cover)
+{
+    fib_route_path_t *path_vec = NULL;
+    const fib_route_path_t host_path = {
+        .frp_proto = fib_entry->fe_prefix.fp_proto,
+        .frp_addr = fib_entry->fe_prefix.fp_addr,
+        .frp_sw_if_index = fib_entry_get_resolving_interface(
+                               fib_entry_get_index(cover)),
+        .frp_fib_index = ~0,
+        .frp_weight = 1,
+    };
+
+    vec_add1(path_vec, host_path);
+
+    /*
+     * since the cover is connected, the address this entry corresponds
+     * to is a peer (ARP-able for) on the interface to which the cover is
+     * connected. The fact we resolve via the cover, just means this RR
+     * source is the first SRC to use said peer. The ARP source will be along
+     * shortly to over-rule this RR source.
+     */
+    src->fes_pl = fib_path_list_create(FIB_PATH_LIST_FLAG_NONE, path_vec);
+    src->fes_entry_flags = fib_entry_get_flags(fib_entry_get_index(cover));
+
+    vec_free(path_vec);
+}
+
+/**
+ * Source initialisation function.
+ * The source starts out tracking no cover: both the cover index and the
+ * sibling index are set to invalid.
+ */
+static void
+fib_entry_src_rr_init (fib_entry_src_t *src)
+{
+    src->rr.fesr_cover = FIB_NODE_INDEX_INVALID;
+    src->rr.fesr_sibling = FIB_NODE_INDEX_INVALID;
+}
+
+/*
+ * Source activation. Called when the source is the new best source on
+ * the entry. Always returns non-zero, i.e. go for install.
+ */
+static int
+fib_entry_src_rr_activate (fib_entry_src_t *src,
+                          const fib_entry_t *fib_entry)
+{
+    fib_entry_t *cover_entry;
+
+    /*
+     * find the covering prefix. become a dependent thereof.
+     * there should always be a cover, though it may be the default route.
+     */
+    src->rr.fesr_cover = fib_table_get_less_specific(fib_entry->fe_fib_index,
+                                                     &fib_entry->fe_prefix);
+
+    ASSERT(FIB_NODE_INDEX_INVALID != src->rr.fesr_cover);
+
+    cover_entry = fib_entry_get(src->rr.fesr_cover);
+
+    src->rr.fesr_sibling =
+        fib_entry_cover_track(cover_entry, fib_entry_get_index(fib_entry));
+
+    /*
+     * inherit the forwarding from the cover, unless the cover is
+     * attached, in which case install an attached-host path
+     * (like an adj-fib).
+     */
+    if (!(FIB_ENTRY_FLAG_ATTACHED & fib_entry_get_flags_i(cover_entry)))
+    {
+        src->fes_pl = cover_entry->fe_parent;
+    }
+    else
+    {
+        fib_entry_src_rr_resolve_via_connected(src, fib_entry, cover_entry);
+    }
+    fib_path_list_lock(src->fes_pl);
+
+    return (!0);
+}
+
+/**
+ * Source deactivate.
+ * Called when the source is no longer the best source on the entry.
+ * Stops tracking the cover and releases the source's path-list.
+ */
+static void
+fib_entry_src_rr_deactivate (fib_entry_src_t *src,
+                            const fib_entry_t *fib_entry)
+{
+    fib_entry_t *cover_entry;
+
+    ASSERT(FIB_NODE_INDEX_INVALID != src->rr.fesr_cover);
+
+    /*
+     * remove the dependency on the covering entry
+     */
+    cover_entry = fib_entry_get(src->rr.fesr_cover);
+    fib_entry_cover_untrack(cover_entry, src->rr.fesr_sibling);
+    src->rr.fesr_cover = FIB_NODE_INDEX_INVALID;
+
+    /*
+     * drop the lock taken on the path-list and reset the source's
+     * forwarding state
+     */
+    fib_path_list_unlock(src->fes_pl);
+    src->fes_pl = FIB_NODE_INDEX_INVALID;
+    src->fes_entry_flags = FIB_ENTRY_FLAG_NONE;
+}
+
+/**
+ * Cover change for the RR source.
+ *
+ * If the tracked cover is no longer the entry's less specific prefix,
+ * the source is re-activated against the new cover and the result asks
+ * for an EVALUATE back-walk. The result always requests install.
+ */
+static fib_entry_src_cover_res_t
+fib_entry_src_rr_cover_change (fib_entry_src_t *src,
+                              const fib_entry_t *fib_entry)
+{
+    fib_entry_src_cover_res_t result = {
+        .install = !0,
+        .bw_reason = FIB_NODE_BW_REASON_FLAG_NONE,
+    };
+    fib_node_index_t new_cover;
+
+    /*
+     * the source may be added, but it is not active
+     * if it is not tracking the cover.
+     */
+    if (FIB_NODE_INDEX_INVALID == src->rr.fesr_cover)
+    {
+        return (result);
+    }
+
+    /*
+     * this function is called when this entry's cover has a more specific
+     * entry inserted beneath it. That does not necessarily mean that this
+     * entry is covered by the new prefix. check that
+     */
+    new_cover = fib_table_get_less_specific(fib_entry->fe_fib_index,
+                                            &fib_entry->fe_prefix);
+
+    if (new_cover != src->rr.fesr_cover)
+    {
+        fib_entry_src_rr_deactivate(src, fib_entry);
+        fib_entry_src_rr_activate(src, fib_entry);
+
+        /*
+         * dependent children need to re-resolve to the new forwarding info
+         */
+        result.bw_reason = FIB_NODE_BW_REASON_FLAG_EVALUATE;
+    }
+
+    return (result);
+}
+
+/*
+ * fib_entry_src_rr_cover_update
+ *
+ * This entry's cover has updated its forwarding info. This entry
+ * will need to re-inherit.
+ */
+static fib_entry_src_cover_res_t
+fib_entry_src_rr_cover_update (fib_entry_src_t *src,
+                              const fib_entry_t *fib_entry)
+{
+    fib_entry_src_cover_res_t result = {
+        .install = !0,
+        .bw_reason = FIB_NODE_BW_REASON_FLAG_NONE,
+    };
+    fib_node_index_t previous_pl;
+    fib_entry_t *cover_entry;
+
+    /*
+     * the source may be added, but it is not active
+     * if it is not tracking the cover.
+     */
+    if (FIB_NODE_INDEX_INVALID == src->rr.fesr_cover)
+    {
+        return (result);
+    }
+
+    previous_pl = src->fes_pl;
+    cover_entry = fib_entry_get(src->rr.fesr_cover);
+
+    /*
+     * inherit the forwarding from the cover, unless the cover is
+     * attached, in which case install an attached-host path
+     * (like an adj-fib).
+     */
+    if (!(FIB_ENTRY_FLAG_ATTACHED & fib_entry_get_flags_i(cover_entry)))
+    {
+        src->fes_pl = cover_entry->fe_parent;
+    }
+    else
+    {
+        fib_entry_src_rr_resolve_via_connected(src, fib_entry, cover_entry);
+    }
+
+    /*
+     * take the lock on the new path-list before releasing the old one
+     */
+    fib_path_list_lock(src->fes_pl);
+    fib_path_list_unlock(previous_pl);
+
+    /*
+     * dependent children need to re-resolve to the new forwarding info
+     */
+    result.bw_reason = FIB_NODE_BW_REASON_FLAG_EVALUATE;
+
+    return (result);
+}
+
+/**
+ * Format the source-specific data: appends the tracked cover's index
+ * to s and returns the result of format().
+ */
+static u8*
+fib_entry_src_rr_format (fib_entry_src_t *src,
+                        u8* s)
+{
+    return (format(s, "cover:%d", src->rr.fesr_cover));
+}
+
+/*
+ * The function table for the RR source.
+ */
+static const fib_entry_src_vft_t rr_src_vft = {
+    .fesv_init = fib_entry_src_rr_init,
+    .fesv_activate = fib_entry_src_rr_activate,
+    .fesv_deactivate = fib_entry_src_rr_deactivate,
+    .fesv_cover_update = fib_entry_src_rr_cover_update,
+    .fesv_cover_change = fib_entry_src_rr_cover_change,
+    .fesv_format = fib_entry_src_rr_format,
+};
+
+/*
+ * Register the RR function table against FIB_SOURCE_RR.
+ */
+void
+fib_entry_src_rr_register (void)
+{
+    fib_entry_src_register(FIB_SOURCE_RR, &rr_src_vft);    
+}
diff --git a/vnet/vnet/fib/fib_entry_src_special.c b/vnet/vnet/fib/fib_entry_src_special.c
new file mode 100644 (file)
index 0000000..f73e280
--- /dev/null
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fib_entry.h"
+#include "fib_entry_src.h"
+
+/**
+ * Source initialisation Function.
+ * Resets the source's flags to FIB_ENTRY_FLAG_NONE.
+ */
+static void
+fib_entry_src_special_init (fib_entry_src_t *src)
+{
+    src->fes_flags = FIB_ENTRY_FLAG_NONE;
+}
+
+/**
+ * Source deinitialisation Function.
+ * Intentionally empty; this function releases nothing.
+ */
+static void
+fib_entry_src_special_deinit (fib_entry_src_t *src)
+{
+}
+
+/**
+ * Source remove function.
+ * Forgets the source's path-list by resetting its index to invalid.
+ * NOTE(review): the path-list is not unlocked here; presumably the
+ * caller owns any lock on it - confirm.
+ */
+static void
+fib_entry_src_special_remove (fib_entry_src_t *src)
+{
+    src->fes_pl = FIB_NODE_INDEX_INVALID;
+}
+
+/**
+ * Source add function.
+ * Creates a special path-list for the given protocol and DPO, with
+ * path-list flags derived from the entry flags, and stores the
+ * resulting path-list index in the source.
+ *
+ * @param src   the source being added; its fes_pl is overwritten
+ * @param entry not used by this function
+ * @param flags entry flags, converted to path-list flags
+ * @param proto protocol passed to the path-list create
+ * @param dpo   data-path object passed to the path-list create
+ */
+static void
+fib_entry_src_special_add (fib_entry_src_t *src,
+                          const fib_entry_t *entry,
+                          fib_entry_flag_t flags,
+                          fib_protocol_t proto,
+                          const dpo_id_t *dpo)
+{
+    src->fes_pl =
+       fib_path_list_create_special(proto,
+                                    fib_entry_src_flags_2_path_list_flags(flags),
+                                    dpo);
+}
+
+/**
+ * The virtual function table shared by all the sources registered in
+ * fib_entry_src_special_register().
+ * The storage-class specifier is written first; placing it after a
+ * type qualifier is an obsolescent feature of the C language.
+ */
+static const fib_entry_src_vft_t special_src_vft = {
+    .fesv_init = fib_entry_src_special_init,
+    .fesv_deinit = fib_entry_src_special_deinit,
+    .fesv_add = fib_entry_src_special_add,
+    .fesv_remove = fib_entry_src_special_remove,
+};
+
+/**
+ * Register the one 'special' virtual function table against each of
+ * the sources that share its behaviour.
+ */
+void
+fib_entry_src_special_register (void)
+{
+    fib_entry_src_register(FIB_SOURCE_SPECIAL, &special_src_vft);
+    fib_entry_src_register(FIB_SOURCE_MAP, &special_src_vft);
+    fib_entry_src_register(FIB_SOURCE_SIXRD, &special_src_vft);
+    fib_entry_src_register(FIB_SOURCE_CLASSIFY, &special_src_vft);
+    fib_entry_src_register(FIB_SOURCE_SR, &special_src_vft);
+    fib_entry_src_register(FIB_SOURCE_AE, &special_src_vft);
+}
diff --git a/vnet/vnet/fib/fib_internal.h b/vnet/vnet/fib/fib_internal.h
new file mode 100644 (file)
index 0000000..26b349e
--- /dev/null
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FIB_INTERNAL_H__
+#define __FIB_INTERNAL_H__
+
+#include <vnet/ip/ip.h>
+#include <vnet/dpo/dpo.h>
+
+/**
+ * Big train switch; FIB debugs on or off
+ */
+#undef FIB_DEBUG
+
+extern void fib_prefix_from_ip46_addr (const ip46_address_t *addr,
+                                      fib_prefix_t *prf);
+
+extern int fib_route_path_cmp(const fib_route_path_t *rpath1,
+                             const fib_route_path_t *rpath2);
+
+/**
+ * @brief
+ *  Add or update an entry in the FIB's forwarding table.
+ * This is called from the fib_entry code. It is not meant to be used
+ * by the client/source.
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix for the entry to add/update
+ *
+ * @param dpo
+ *  The data-path object to use for forwarding
+ */
+extern void fib_table_fwding_dpo_update(u32 fib_index,
+                                       const fib_prefix_t *prefix,
+                                       const dpo_id_t *dpo);
+/**
+ * @brief
+ *  remove an entry in the FIB's forwarding table
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix for the entry to remove
+ *
+ * @param dpo
+ *  The data-path object to use for forwarding
+ */
+extern void fib_table_fwding_dpo_remove(u32 fib_index,
+                                       const fib_prefix_t *prefix,
+                                       const dpo_id_t *dpo);
+
+
+#endif
diff --git a/vnet/vnet/fib/fib_node.c b/vnet/vnet/fib/fib_node.c
new file mode 100644 (file)
index 0000000..8ac67d2
--- /dev/null
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/fib/fib_node.h>
+#include <vnet/fib/fib_node_list.h>
+
+/*
+ * The per-type vector of virtual function tables.
+ * Indexed by fib_node_type_t and grown on demand when a type is
+ * registered.
+ */
+static fib_node_vft_t *fn_vfts;
+
+/**
+ * The last registered new type.
+ * Starts at FIB_NODE_TYPE_LAST and is pre-incremented for each
+ * dynamically registered type, so the first new type is
+ * FIB_NODE_TYPE_LAST + 1.
+ */
+static fib_node_type_t last_new_type = FIB_NODE_TYPE_LAST;
+
+/*
+ * the node type names.
+ * Built from FIB_NODE_TYPES, so it holds names only for the types that
+ * macro lists; it has no entries for dynamically registered types.
+ */
+static const char *fn_type_names[] = FIB_NODE_TYPES;
+
+/**
+ * fib_node_type_get_name
+ *
+ * Return a printable name for a node type. Never returns NULL and
+ * never reads beyond the end of the VFT vector.
+ *
+ * @param type the node type to name
+ * @return the static name if the type has one, "fixme" for a registered
+ *         type that supplied a format function, "unknown" otherwise.
+ */
+const char*
+fib_node_type_get_name (fib_node_type_t type)
+{
+    /*
+     * the name table has holes (e.g. FIB_NODE_TYPE_FIRST has no name),
+     * so a NULL slot is treated the same as no name at all.
+     */
+    if (type < FIB_NODE_TYPE_LAST &&
+        NULL != fn_type_names[type])
+    {
+        return (fn_type_names[type]);
+    }
+
+    /*
+     * only registered types have a slot in the VFT vector. check the
+     * type is within the vector before looking at its VFT.
+     */
+    if (type < vec_len(fn_vfts) &&
+        NULL != fn_vfts[type].fnv_format)
+    {
+        return ("fixme");
+    }
+
+    return ("unknown");
+}
+
+/**
+ * fib_node_register_type
+ *
+ * Install the virtual function table for the given node type.
+ * The table is copied by value into the per-type vector, which is
+ * grown as required to hold the type.
+ */
+void
+fib_node_register_type (fib_node_type_t type,
+                       const fib_node_vft_t *vft)
+{
+    /*
+     * if a slot for this type already exists it must be unused;
+     * only one registration is allowed per-node type
+     */
+    ASSERT((vec_len(fn_vfts) <= type) ||
+           (NULL == fn_vfts[type].fnv_get));
+
+    /*
+     * the get and last-lock-gone functions are mandatory
+     */
+    ASSERT(NULL != vft->fnv_get);
+    ASSERT(NULL != vft->fnv_last_lock);
+
+    vec_validate(fn_vfts, type);
+    fn_vfts[type] = *vft;
+}
+
+/**
+ * fib_node_register_new_type
+ *
+ * Allocate the next unused node type value and register the given
+ * function table for it.
+ *
+ * @return the newly allocated type
+ */
+fib_node_type_t
+fib_node_register_new_type (const fib_node_vft_t *vft)
+{
+    last_new_type++;
+
+    fib_node_register_type(last_new_type, vft);
+
+    return (last_new_type);
+}
+
+/**
+ * Format one node pointer as {type-name:index}.
+ * The name is obtained from fib_node_type_get_name() rather than by
+ * indexing the name table directly; that table has no slot for the
+ * test type nor for types created with fib_node_register_new_type(),
+ * so a direct index would read beyond its end.
+ */
+static u8*
+fib_node_format (fib_node_ptr_t *fnp, u8*s)
+{
+    return (format(s, "{%s:%d}",
+                   fib_node_type_get_name(fnp->fnp_type),
+                   fnp->fnp_index));
+}
+
+/**
+ * Add a child to a parent node.
+ * The parent is locked on behalf of the child and its child list is
+ * created on first use.
+ *
+ * @return the index of the new sibling in the parent's child list
+ */
+u32
+fib_node_child_add (fib_node_type_t parent_type,
+                    fib_node_index_t parent_index,
+                    fib_node_type_t type,
+                    fib_node_index_t index)
+{
+    fib_node_t *node;
+
+    node = fn_vfts[parent_type].fnv_get(parent_index);
+
+    fib_node_lock(node);
+
+    /*
+     * no children yet; make the list to hold them
+     */
+    if (node->fn_children == FIB_NODE_INDEX_INVALID)
+    {
+        node->fn_children = fib_node_list_create();
+    }
+
+    return (fib_node_list_push_front(node->fn_children, 0, type, index));
+}
+
+/**
+ * Remove a child from a parent node.
+ * The sibling is taken out of the parent's child list, the list is
+ * destroyed once empty, and the lock taken when the child was added
+ * is released.
+ */
+void
+fib_node_child_remove (fib_node_type_t parent_type,
+                       fib_node_index_t parent_index,
+                       fib_node_index_t sibling_index)
+{
+    fib_node_t *node;
+
+    node = fn_vfts[parent_type].fnv_get(parent_index);
+
+    fib_node_list_remove(node->fn_children, sibling_index);
+
+    /*
+     * that was the last child; the list is no longer needed
+     */
+    if (fib_node_list_get_size(node->fn_children) == 0)
+    {
+        fib_node_list_destroy(&node->fn_children);
+    }
+
+    fib_node_unlock(node);
+}
+
+
+/**
+ * Back walk a single node: fetch the node the pointer refers to and
+ * invoke its type's back-walk function with the supplied context.
+ *
+ * @return whatever the type's back-walk function returns
+ */
+fib_node_back_walk_rc_t
+fib_node_back_walk_one (fib_node_ptr_t *ptr,
+                        fib_node_back_walk_ctx_t *ctx)
+{
+    fib_node_back_walk_rc_t rc;
+    fib_node_t *target;
+
+    target = fn_vfts[ptr->fnp_type].fnv_get(ptr->fnp_index);
+    rc = fn_vfts[ptr->fnp_type].fnv_back_walk(target, ctx);
+
+    return (rc);
+}
+
+/**
+ * List walk callback that appends one child to a format vector.
+ *
+ * @param ptr the child to format
+ * @param arg address of the format vector (u8**); updated in place
+ * @return always 1
+ */
+static int
+fib_node_ptr_format_one_child (fib_node_ptr_t *ptr,
+                              void *arg)
+{
+    u8 **out = arg;
+
+    out[0] = fib_node_format(ptr, out[0]);
+
+    return (1);
+}
+
+/**
+ * Format every element of a child list onto the vector s.
+ *
+ * @return the (possibly moved) format vector
+ */
+u8*
+fib_node_children_format (fib_node_list_t list,
+                         u8 *s)
+{
+    u8 **out = &s;
+
+    fib_node_list_walk(list, fib_node_ptr_format_one_child, out);
+
+    return (s);
+}
+
+/**
+ * Initialise a node: no locks, no children and the VFT of its type.
+ *
+ * NOTE(review): fn_vft is set to the address of an element of the
+ * fn_vfts vector, and fib_node_register_type() may grow that vector.
+ * Presumably all types are registered before any node is initialised;
+ * confirm, since a later registration could otherwise move the vector.
+ */
+void
+fib_node_init (fib_node_t *node,
+              fib_node_type_t type)
+{
+#if CLIB_DEBUG > 0
+    /**
+     * The node's type. make sure we are dynamic/down casting correctly
+     */
+    node->fn_type = type;
+#endif
+    node->fn_locks = 0;
+    node->fn_vft = &fn_vfts[type];
+    node->fn_children = FIB_NODE_INDEX_INVALID;
+}
+
+/**
+ * De-initialise a node by destroying its child list.
+ */
+void
+fib_node_deinit (fib_node_t *node)
+{
+    fib_node_list_destroy(&node->fn_children);
+}
+
+/**
+ * Take a lock on the node by incrementing its lock count.
+ */
+void
+fib_node_lock (fib_node_t *node)
+{
+    node->fn_locks++;
+}
+
+/**
+ * Release a lock on the node. When the count reaches zero the node's
+ * type is told, via its VFT, that the last lock has gone.
+ */
+void
+fib_node_unlock (fib_node_t *node)
+{
+    if (0 != --node->fn_locks)
+    {
+        /*
+         * others still hold a lock
+         */
+        return;
+    }
+
+    node->fn_vft->fnv_last_lock(node);
+}
diff --git a/vnet/vnet/fib/fib_node.h b/vnet/vnet/fib/fib_node.h
new file mode 100644 (file)
index 0000000..a05b6f1
--- /dev/null
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FIB_NODE_H__
+#define __FIB_NODE_H__
+
+#include <vnet/fib/fib_types.h>
+
+/**
+ * The types of nodes in a FIB graph.
+ * These are the statically defined types; further types can be
+ * created at run time with fib_node_register_new_type().
+ */
+typedef enum fib_node_type_t_ {
+    /**
+     * Marker. New types after this one.
+     */
+    FIB_NODE_TYPE_FIRST = 0,
+    /**
+     * See the respective fib_*.h files for descriptions of these objects.
+     */
+    FIB_NODE_TYPE_WALK,
+    FIB_NODE_TYPE_ENTRY,
+    FIB_NODE_TYPE_PATH_LIST,
+    FIB_NODE_TYPE_PATH,
+    FIB_NODE_TYPE_ADJ,
+    FIB_NODE_TYPE_MPLS_ENTRY,
+    FIB_NODE_TYPE_LISP_GPE_TUNNEL,
+    FIB_NODE_TYPE_LISP_ADJ,
+    FIB_NODE_TYPE_MPLS_GRE_TUNNEL,
+    FIB_NODE_TYPE_GRE_TUNNEL,
+    /**
+     * Marker. New types before this one. Leave the test type last.
+     * FIB_NODE_TYPE_LAST is an alias of the test type, not a value
+     * one past it.
+     */
+    FIB_NODE_TYPE_TEST,
+    FIB_NODE_TYPE_LAST = FIB_NODE_TYPE_TEST,
+} fib_node_type_t;
+
+#define FIB_NODE_TYPE_MAX (FIB_NODE_TYPE_LAST + 1)
+
+/**
+ * Initialiser for an array of node type names, indexed by
+ * fib_node_type_t. There are no entries for FIB_NODE_TYPE_FIRST or
+ * FIB_NODE_TYPE_TEST, so an array built from this has a NULL at index
+ * FIB_NODE_TYPE_FIRST and ends at FIB_NODE_TYPE_GRE_TUNNEL.
+ */
+#define FIB_NODE_TYPES {                          \
+    [FIB_NODE_TYPE_ENTRY]     = "entry",          \
+    [FIB_NODE_TYPE_WALK]      = "walk",           \
+    [FIB_NODE_TYPE_PATH_LIST] = "path-list",      \
+    [FIB_NODE_TYPE_PATH]      = "path",           \
+    [FIB_NODE_TYPE_MPLS_ENTRY] = "mpls-entry",    \
+    [FIB_NODE_TYPE_ADJ] = "adj",                  \
+    [FIB_NODE_TYPE_LISP_GPE_TUNNEL] = "lisp-gpe-tunnel", \
+    [FIB_NODE_TYPE_LISP_ADJ] = "lisp-adj", \
+    [FIB_NODE_TYPE_MPLS_GRE_TUNNEL] = "mpls-gre-tunnel", \
+    [FIB_NODE_TYPE_GRE_TUNNEL] = "gre-tunnel", \
+}
+
+/**
+ * Reasons for backwalking the FIB object graph
+ */
+typedef enum fib_node_back_walk_reason_t_ {
+    /**
+     * Marker. Add new ones after.
+     */
+    FIB_NODE_BW_REASON_FIRST = 0,
+    /**
+     * Walk to re-resolve the child.
+     * Used when the parent is no longer a valid resolution target
+     */
+    FIB_NODE_BW_REASON_RESOLVE = FIB_NODE_BW_REASON_FIRST,
+    /**
+     * Walk to re-evaluate the forwarding contributed by the parent.
+     * Used when a parent's forwarding changes and the child needs to
+     * incorporate this change in its forwarding.
+     */
+    FIB_NODE_BW_REASON_EVALUATE,
+    /**
+     * A resolving interface has come up
+     */
+    FIB_NODE_BW_REASON_INTERFACE_UP,
+    /**
+     * A resolving interface has gone down
+     */
+    FIB_NODE_BW_REASON_INTERFACE_DOWN,
+    /**
+     * A resolving interface has been deleted.
+     */
+    FIB_NODE_BW_REASON_INTERFACE_DELETE,
+    /**
+     * Walk to re-collapse the multipath adjs when the rewrite of
+     * a unipath adjacency changes
+     */
+    FIB_NODE_BW_REASON_ADJ_UPDATE,
+    /**
+     * Marker. Add new reasons before this one and update it so that it
+     * always aliases the last real reason; anything ranging over
+     * FIRST..LAST then covers every reason.
+     */
+    FIB_NODE_BW_REASON_LAST = FIB_NODE_BW_REASON_ADJ_UPDATE,
+} fib_node_back_walk_reason_t;
+
+/**
+ * Initialiser for an array of back-walk reason names, indexed by
+ * fib_node_back_walk_reason_t. Each designated initialiser must be
+ * followed by a comma for the expansion to be a valid initialiser list.
+ */
+#define FIB_NODE_BW_REASONS {                           \
+    [FIB_NODE_BW_REASON_RESOLVE] = "resolve",           \
+    [FIB_NODE_BW_REASON_EVALUATE] = "evaluate",         \
+    [FIB_NODE_BW_REASON_INTERFACE_UP] = "if-up",        \
+    [FIB_NODE_BW_REASON_INTERFACE_DOWN] = "if-down",    \
+    [FIB_NODE_BW_REASON_INTERFACE_DELETE] = "if-delete", \
+    [FIB_NODE_BW_REASON_ADJ_UPDATE] = "adj-update",     \
+}
+
+/**
+ * Flags enum constructed from the reasons; one bit per
+ * fib_node_back_walk_reason_t value. The enum is packed, and the static
+ * assert below checks that it occupies a single byte.
+ */
+typedef enum fib_node_bw_reason_flag_t_ {
+    FIB_NODE_BW_REASON_FLAG_NONE = 0,
+    FIB_NODE_BW_REASON_FLAG_RESOLVE = (1 << FIB_NODE_BW_REASON_RESOLVE),
+    FIB_NODE_BW_REASON_FLAG_EVALUATE = (1 << FIB_NODE_BW_REASON_EVALUATE),
+    FIB_NODE_BW_REASON_FLAG_INTERFACE_UP = (1 << FIB_NODE_BW_REASON_INTERFACE_UP),
+    FIB_NODE_BW_REASON_FLAG_INTERFACE_DOWN = (1 << FIB_NODE_BW_REASON_INTERFACE_DOWN),
+    FIB_NODE_BW_REASON_FLAG_INTERFACE_DELETE = (1 << FIB_NODE_BW_REASON_INTERFACE_DELETE),
+    FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE = (1 << FIB_NODE_BW_REASON_ADJ_UPDATE),
+} __attribute__ ((packed)) fib_node_bw_reason_flag_t;
+
+_Static_assert(sizeof(fib_node_bw_reason_flag_t) < 2,
+              "BW Reason enum < 2 byte. Consequences for cover_upd_res_t");
+
+/**
+ * Forward declarations
+ */
+struct fib_node_t_;
+
+/**
+ * A representation of one pointer to another node.
+ * To fully qualify a node, one must know its type and its index so it
+ * can be retrieved from the appropriate pool. Direct pointers to nodes
+ * are forbidden, since all nodes are allocated from pools, which are vectors,
+ * and thus subject to realloc at any time.
+ */
+typedef struct fib_node_ptr_t_ {
+    /**
+     * node type
+     */
+    fib_node_type_t fnp_type;
+    /**
+     * node's index
+     */
+    fib_node_index_t fnp_index;
+} fib_node_ptr_t;
+
+/**
+ * @brief A list of FIB nodes.
+ */
+typedef u32 fib_node_list_t;
+
+/**
+ * Context passed between objects during a back walk.
+ */
+typedef struct fib_node_back_walk_ctx_t_ {
+    /**
+     * The reason/trigger for the backwalk
+     */
+    fib_node_bw_reason_flag_t fnbw_reason;
+
+    /**
+     * The number of levels the walk has already traversed.
+     * This value is maintained by the walk infra, to limit the depth of
+     * a walk so it does not run indefinitely in the presence of a
+     * loop/cycle in the graph.
+     */
+    u32 fnbw_depth;
+} fib_node_back_walk_ctx_t;
+
+/**
+ * We consider a depth of 32 to be sufficient to cover all sane
+ * network topologies. Anything more is then an indication that
+ * there is a loop/cycle in the FIB graph.
+ * Note that all object types contribute to 1 to the depth.
+ */
+#define FIB_NODE_GRAPH_MAX_DEPTH ((u32)32)
+
+/**
+ * A callback function for walking a node dependency list
+ */
+typedef int (*fib_node_ptr_walk_t)(fib_node_ptr_t *depend,
+                                  void *ctx);
+
+/**
+ * A list of dependent nodes.
+ * This is currently implemented as a hash_table of fib_node_ptr_t
+ *
+ * NOTE(review): the typedef below is a plain alias of fib_node_ptr_t;
+ * nothing declared here is a hash table. Confirm whether the
+ * description above is still accurate.
+ */
+typedef fib_node_ptr_t fib_node_ptr_list_t;
+
+/**
+ * Return code from a back walk function
+ */
+typedef enum fib_node_back_walk_rc_t_ {
+    FIB_NODE_BACK_WALK_MERGE,
+    FIB_NODE_BACK_WALK_CONTINUE,
+} fib_node_back_walk_rc_t;
+
+/**
+ * Function definition to backwalk a FIB node
+ */
+typedef fib_node_back_walk_rc_t (*fib_node_back_walk_t)(
+    struct fib_node_t_ *node,
+    fib_node_back_walk_ctx_t *ctx);
+
+/**
+ * Function definition to get a FIB node from its index
+ */
+typedef struct fib_node_t_* (*fib_node_get_t)(fib_node_index_t index);
+
+/**
+ * Function definition to inform the FIB node that its last lock has gone.
+ */
+typedef void (*fib_node_last_lock_gone_t)(struct fib_node_t_ *node);
+
+/**
+ * A FIB graph node's virtual function table
+ */
+typedef struct fib_node_vft_t_ {
+    /** Get the node from its index */
+    fib_node_get_t fnv_get;
+    /** Inform the node that its last lock has gone */
+    fib_node_last_lock_gone_t fnv_last_lock;
+    /** Back walk the node */
+    fib_node_back_walk_t fnv_back_walk;
+    /** Format function for the node type */
+    format_function_t *fnv_format;
+} fib_node_vft_t;
+
+/**
+ * A node in the FIB graph
+ *
+ * Objects in the FIB form a graph. 
+ */
+typedef struct fib_node_t_ {
+#if CLIB_DEBUG > 0
+    /**
+     * The node's type. make sure we are dynamic/down casting correctly.
+     * Present in debug images only.
+     */
+    fib_node_type_t fn_type;
+#endif
+    /**
+     * The node's VFT.
+     * we could store the type here instead, and lookup the VFT using that. But
+     * I like this better,
+     */
+    const fib_node_vft_t *fn_vft;
+
+    /**
+     * The list of nodes that depend upon/use/share this node.
+     * This is a list handle (a fib_node_list_t), not a vector.
+     */
+    fib_node_list_t fn_children;
+
+    /**
+     * Number of dependents on this node. This number includes the number
+     * of children
+     */
+    u32 fn_locks;
+} fib_node_t;
+
+/**
+ * @brief
+ *  Register the function table for a given type
+ *
+ * @param ft
+ *  FIB node type
+ *
+ * @param vft
+ * virtual function table
+ */
+extern void fib_node_register_type (fib_node_type_t ft,
+                                   const fib_node_vft_t *vft);
+
+/**
+ * @brief
+ *  Create a new FIB node type and Register the function table for it.
+ *
+ * @param vft
+ * virtual function table
+ *
+ * @return new FIB node type
+ */
+extern fib_node_type_t fib_node_register_new_type (const fib_node_vft_t *vft);
+
+extern void fib_node_init(fib_node_t *node,
+                         fib_node_type_t ft);
+extern void fib_node_deinit(fib_node_t *node);
+
+extern void fib_node_lock(fib_node_t *node);
+extern void fib_node_unlock(fib_node_t *node);
+
+extern u32 fib_node_child_add(fib_node_type_t parent_type,
+                             fib_node_index_t parent_index,
+                             fib_node_type_t child_type,
+                             fib_node_index_t child_index);
+extern void fib_node_child_remove(fib_node_type_t parent_type,
+                                  fib_node_index_t parent_index,
+                                  fib_node_index_t sibling_index);
+
+extern fib_node_back_walk_rc_t fib_node_back_walk_one(fib_node_ptr_t *ptr,
+                                                      fib_node_back_walk_ctx_t *ctx);
+
+extern u8* fib_node_children_format(fib_node_list_t list,
+                                   u8 *s);
+
+extern const char* fib_node_type_get_name(fib_node_type_t type);
+
+/**
+ * @brief
+ *  Test whether a node index refers to a node.
+ *
+ * @return non-zero if the index is anything other than
+ *  FIB_NODE_INDEX_INVALID, zero otherwise.
+ */
+static inline int
+fib_node_index_is_valid (fib_node_index_t ni)
+{
+    return (ni != FIB_NODE_INDEX_INVALID);
+}
+
+#endif
+
diff --git a/vnet/vnet/fib/fib_node_list.c b/vnet/vnet/fib/fib_node_list.c
new file mode 100644 (file)
index 0000000..1d2e75e
--- /dev/null
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief a heterogeneous (w.r.t. FIB node type) list of FIB nodes.
+ * Since we cannot use C pointers, due to memory reallocs, the next/prev
+ * are described as key:{type,index}.
+ */
+
+#include <vnet/fib/fib_node_list.h>
+
+/**
+ * @brief An element in the list
+ */
+typedef struct fib_node_list_elt_t_
+{
+    /**
+     * An opaque identifier set by the FIB node owning this element
+     * that will allow the owner to identify which element it is.
+     */
+    int fnle_owner_id;
+
+    /**
+     * The index of the list this element is in
+     */
+    fib_node_list_t fnle_list;
+
+    /**
+     * The owner of this element
+     */
+    fib_node_ptr_t fnle_owner;
+
+    /**
+     * The next element in the list; an index into the element pool,
+     * or FIB_NODE_INDEX_INVALID if there is none.
+     */
+    u32 fnle_next;
+
+    /**
+     * The previous element in the list; an index into the element pool,
+     * or FIB_NODE_INDEX_INVALID if there is none.
+     */
+    u32 fnle_prev;
+} fib_node_list_elt_t;
+
+/**
+ * @brief A list of FIB nodes
+ */
+typedef struct fib_node_list_head_t_
+{
+    /**
+     * The head element; an index into the element pool, or
+     * FIB_NODE_INDEX_INVALID when the list is empty.
+     */
+    u32 fnlh_head;
+
+    /**
+     * Number of elements in the list
+     */
+    u32 fnlh_n_elts;
+} fib_node_list_head_t;
+
+/**
+ * Pools of list elements and heads
+ */
+static fib_node_list_elt_t *fib_node_list_elt_pool;
+static fib_node_list_head_t *fib_node_list_head_pool;
+
+/**
+ * @brief Convert an element pointer to its index in the element pool.
+ */
+static index_t
+fib_node_list_elt_get_index (fib_node_list_elt_t *elt)
+{
+    return (elt - fib_node_list_elt_pool);
+}
+
+/**
+ * @brief Convert an element pool index to a pointer to the element.
+ */
+static fib_node_list_elt_t *
+fib_node_list_elt_get (index_t fi)
+{
+    return (pool_elt_at_index(fib_node_list_elt_pool, fi));
+}
+
+/**
+ * @brief Convert a list head pointer to its index in the head pool.
+ */
+static index_t
+fib_node_list_head_get_index (fib_node_list_head_t *head)
+{
+    return (head - fib_node_list_head_pool);
+}
+/**
+ * @brief Convert a list handle (head pool index) to a pointer to the
+ * list head.
+ */
+static fib_node_list_head_t *
+fib_node_list_head_get (fib_node_list_t fi)
+{
+    return (pool_elt_at_index(fib_node_list_head_pool, fi));
+}
+
+/**
+ * @brief Allocate an element from the element pool and record the list
+ * it belongs to and the node that owns it. The element is returned
+ * unlinked; both its next and prev are invalid.
+ */
+static fib_node_list_elt_t *
+fib_node_list_elt_create (fib_node_list_head_t *head,
+                          int id,
+                          fib_node_type_t type,
+                          fib_node_index_t index)
+{
+    fib_node_list_elt_t *new_elt;
+
+    pool_get(fib_node_list_elt_pool, new_elt);
+
+    /*
+     * not yet linked to any neighbour
+     */
+    new_elt->fnle_prev = FIB_NODE_INDEX_INVALID;
+    new_elt->fnle_next = FIB_NODE_INDEX_INVALID;
+
+    /*
+     * the owner of the element and the list it lives in
+     */
+    new_elt->fnle_owner.fnp_index = index;
+    new_elt->fnle_owner.fnp_type  = type;
+    new_elt->fnle_owner_id = id;
+    new_elt->fnle_list = fib_node_list_head_get_index(head);
+
+    return (new_elt);
+}
+
+/**
+ * @brief Reset a list head to the empty state: no head element and a
+ * count of zero.
+ */
+static void
+fib_node_list_head_init (fib_node_list_head_t *head)
+{
+    head->fnlh_head = FIB_NODE_INDEX_INVALID;
+    head->fnlh_n_elts = 0;
+}
+
+/**
+ * @brief Create a new, empty node list. The head is allocated from the
+ * list head pool.
+ *
+ * @return the handle of the new list
+ */
+fib_node_list_t
+fib_node_list_create (void)
+{
+    fib_node_list_head_t *new_head;
+
+    pool_get(fib_node_list_head_pool, new_head);
+
+    /*
+     * start out empty
+     */
+    new_head->fnlh_n_elts = 0;
+    new_head->fnlh_head = FIB_NODE_INDEX_INVALID;
+
+    return (fib_node_list_head_get_index(new_head));
+}
+
+/**
+ * @brief Destroy a list and invalidate the caller's handle.
+ * Does nothing if the handle is already invalid. The list is expected
+ * to be empty.
+ */
+void
+fib_node_list_destroy (fib_node_list_t *list)
+{
+    fib_node_list_head_t *head;
+
+    if (FIB_NODE_INDEX_INVALID != *list)
+    {
+        head = fib_node_list_head_get(*list);
+        ASSERT(0 == head->fnlh_n_elts);
+
+        pool_put(fib_node_list_head_pool, head);
+        *list = FIB_NODE_INDEX_INVALID;
+    }
+}
+
+
+/**
+ * @brief Insert an element at the front of the list.
+ *
+ * @return the index of the new element (the 'sibling' index), which is
+ * what the remove and advance functions take.
+ */
+u32
+fib_node_list_push_front (fib_node_list_t list,
+                          int owner_id,
+                          fib_node_type_t type,
+                          fib_node_index_t index)
+{
+    fib_node_list_head_t *head;
+    fib_node_list_elt_t *elt;
+    u32 elt_index;
+
+    head = fib_node_list_head_get(list);
+    elt = fib_node_list_elt_create(head, owner_id, type, index);
+    elt_index = fib_node_list_elt_get_index(elt);
+
+    /*
+     * the new element has nothing before it and the old head, if there
+     * is one, after it.
+     */
+    elt->fnle_prev = FIB_NODE_INDEX_INVALID;
+    elt->fnle_next = head->fnlh_head;
+
+    if (head->fnlh_head != FIB_NODE_INDEX_INVALID)
+    {
+        fib_node_list_elt_get(head->fnlh_head)->fnle_prev = elt_index;
+    }
+
+    head->fnlh_head = elt_index;
+    head->fnlh_n_elts++;
+
+    return (elt_index);
+}
+
+/**
+ * @brief Insert an element at the back of the list.
+ * Not implemented: asserts unconditionally and returns
+ * FIB_NODE_INDEX_INVALID. None of the arguments are used.
+ */
+u32
+fib_node_list_push_back (fib_node_list_t list,
+                        int owner_id,
+                        fib_node_type_t type,
+                        fib_node_index_t index)
+{
+    ASSERT(0);
+    return (FIB_NODE_INDEX_INVALID);
+}
+
+/**
+ * @brief Unlink an element from its neighbours and, if it was the first
+ * element, from the head. The element's own next/prev are left as they
+ * were and the list's element count is not changed.
+ */
+static void
+fib_node_list_extract (fib_node_list_head_t *head,
+                       fib_node_list_elt_t *elt)
+{
+    const u32 after = elt->fnle_next;
+    const u32 before = elt->fnle_prev;
+
+    if (after != FIB_NODE_INDEX_INVALID)
+    {
+        fib_node_list_elt_get(after)->fnle_prev = before;
+    }
+
+    if (before == FIB_NODE_INDEX_INVALID)
+    {
+        /*
+         * nothing before it, so it must be the head
+         */
+        ASSERT (fib_node_list_elt_get_index(elt) == head->fnlh_head);
+        head->fnlh_head = after;
+    }
+    else
+    {
+        fib_node_list_elt_get(before)->fnle_next = after;
+    }
+}
+
+/**
+ * @brief Link elt into the list immediately after prev.
+ * The head argument is not used.
+ */
+static void
+fib_node_list_insert_after (fib_node_list_head_t *head,
+                            fib_node_list_elt_t *prev,
+                            fib_node_list_elt_t *elt)
+{
+    const u32 elt_index = fib_node_list_elt_get_index(elt);
+    const u32 after = prev->fnle_next;
+
+    /*
+     * elt sits between prev and whatever used to follow prev
+     */
+    elt->fnle_next = after;
+    elt->fnle_prev = fib_node_list_elt_get_index(prev);
+
+    if (after != FIB_NODE_INDEX_INVALID)
+    {
+        fib_node_list_elt_get(after)->fnle_prev = elt_index;
+    }
+    prev->fnle_next = elt_index;
+}
+
+/**
+ * @brief Remove the element with index 'sibling' from the list and
+ * return it to the element pool.
+ */
+void
+fib_node_list_remove (fib_node_list_t list,
+                      u32 sibling)
+{
+    fib_node_list_head_t *list_head;
+    fib_node_list_elt_t *victim;
+
+    list_head = fib_node_list_head_get(list);
+    victim = fib_node_list_elt_get(sibling);
+
+    fib_node_list_extract(list_head, victim);
+
+    pool_put(fib_node_list_elt_pool, victim);
+    list_head->fnlh_n_elts--;
+}
+
+/**
+ * @brief Remove an element given only its index; the list it belongs
+ * to is read from the element itself.
+ */
+void
+fib_node_list_elt_remove (u32 sibling)
+{
+    fib_node_list_t owning_list;
+
+    owning_list = fib_node_list_elt_get(sibling)->fnle_list;
+
+    fib_node_list_remove(owning_list, sibling);
+}
+
+/**
+ * @brief Advance the sibling one step (toward the tail) in the list,
+ * i.e. swap it with the element that follows it.
+ *
+ * @return 0 if the sibling is already at the end of the list,
+ * 1 otherwise.
+ */
+int
+fib_node_list_advance (u32 sibling)
+{
+    fib_node_list_elt_t *elt, *follower;
+    fib_node_list_head_t *head;
+
+    elt = fib_node_list_elt_get(sibling);
+    head = fib_node_list_head_get(elt->fnle_list);
+
+    if (FIB_NODE_INDEX_INVALID == elt->fnle_next)
+    {
+        /*
+         * at the end of the list; nowhere to go
+         */
+        return (0);
+    }
+
+    /*
+     * take the element out and put it back in after its follower
+     */
+    follower = fib_node_list_elt_get(elt->fnle_next);
+
+    fib_node_list_extract(head, elt);
+    fib_node_list_insert_after(head, follower, elt);
+
+    return (1);
+}
+
+/**
+ * @brief Get the owner of the element that follows 'sibling'.
+ *
+ * @param sibling index of the element to start from
+ * @param ptr     set to the next element's owner; if there is no next
+ *                element only its index is set, to invalid
+ * @return 1 if there is a next element, 0 otherwise
+ */
+int
+fib_node_list_elt_get_next (u32 sibling,
+                            fib_node_ptr_t *ptr)
+{
+    fib_node_list_elt_t *elt;
+
+    elt = fib_node_list_elt_get(sibling);
+
+    if (FIB_NODE_INDEX_INVALID == elt->fnle_next)
+    {
+        ptr->fnp_index = FIB_NODE_INDEX_INVALID;
+        return (0);
+    }
+
+    *ptr = fib_node_list_elt_get(elt->fnle_next)->fnle_owner;
+
+    return (1);
+}
+
+/**
+ * @brief Get the number of elements in the list. An invalid list
+ * handle is treated as an empty list.
+ */
+u32
+fib_node_list_get_size (fib_node_list_t list)
+{
+    u32 n_elts = 0;
+
+    if (FIB_NODE_INDEX_INVALID != list)
+    {
+        n_elts = fib_node_list_head_get(list)->fnlh_n_elts;
+    }
+
+    return (n_elts);
+}
+
+/**
+ * @brief Get the owner of the first element in the list.
+ *
+ * @param list the list handle; may be invalid
+ * @param ptr  set to the first element's owner; if the list is empty
+ *             only its index is set, to invalid
+ * @return 1 if the list has a first element, 0 otherwise
+ */
+int
+fib_node_list_get_front (fib_node_list_t list,
+                         fib_node_ptr_t *ptr)
+{
+    fib_node_list_head_t *head;
+
+    if (0 != fib_node_list_get_size(list))
+    {
+        head = fib_node_list_head_get(list);
+
+        *ptr = fib_node_list_elt_get(head->fnlh_head)->fnle_owner;
+
+        return (1);
+    }
+
+    ptr->fnp_index = FIB_NODE_INDEX_INVALID;
+
+    return (0);
+}
+
+/**
+ * @brief Walk the list of nodes, invoking fn on the owner of each
+ * element in turn, from the head onward. The index of the following
+ * element is read before fn is invoked, so that fn may remove the
+ * element it is called for. The value returned by fn is not examined.
+ * An invalid list handle is a no-op.
+ */
+void
+fib_node_list_walk (fib_node_list_t list,
+                    fib_node_list_walk_cb_t fn,
+                    void *args)
+{
+    fib_node_list_elt_t *elt;
+    u32 sibling, following;
+
+    if (FIB_NODE_INDEX_INVALID == list)
+    {
+        return;
+    }
+
+    for (sibling = fib_node_list_head_get(list)->fnlh_head;
+         FIB_NODE_INDEX_INVALID != sibling;
+         sibling = following)
+    {
+        elt = fib_node_list_elt_get(sibling);
+        following = elt->fnle_next;
+
+        fn(&elt->fnle_owner, args);
+    }
+}
diff --git a/vnet/vnet/fib/fib_node_list.h b/vnet/vnet/fib/fib_node_list.h
new file mode 100644 (file)
index 0000000..afee3c6
--- /dev/null
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief a heterogeneous (w.r.t. FIB node type) list of FIB nodes.
+ * Since we cannot use C pointers, due to memory reallocs, the next/prev
+ * are described as an index to an element. Each element contains a pointer
+ * (key:{type, index}) to a FIB node.
+ */
+
+#ifndef __FIB_NODE_LIST_H__
+#define __FIB_NODE_LIST_H__
+
+#include <vnet/fib/fib_node.h>
+
+extern fib_node_list_t fib_node_list_create(void);
+extern void fib_node_list_destroy(fib_node_list_t *list);
+
+extern u32 fib_node_list_push_front(fib_node_list_t head,
+                                    int owner_id,
+                                    fib_node_type_t type,
+                                    fib_node_index_t index);
+extern u32 fib_node_list_push_back(fib_node_list_t head,
+                                   int owner_id,
+                                   fib_node_type_t type,
+                                   fib_node_index_t index);
+extern void fib_node_list_remove(fib_node_list_t head,
+                                 u32 sibling);
+extern void fib_node_list_elt_remove(u32 sibling);
+
+extern int fib_node_list_advance(u32 sibling);
+
+extern int fib_node_list_get_front(fib_node_list_t head,
+                                   fib_node_ptr_t *ptr);
+
+extern int fib_node_list_elt_get_next(u32 elt,
+                                      fib_node_ptr_t *ptr);
+
+extern u32 fib_node_list_get_size(fib_node_list_t head);
+
+/**
+ * @brief Callback function invoked during a list walk
+ */
+typedef int (*fib_node_list_walk_cb_t)(fib_node_ptr_t *owner,
+                                       void *args);
+
+extern void fib_node_list_walk(fib_node_list_t head,
+                               fib_node_list_walk_cb_t fn,
+                               void *args);
+#endif
diff --git a/vnet/vnet/fib/fib_path.c b/vnet/vnet/fib/fib_path.c
new file mode 100644 (file)
index 0000000..d2e5e31
--- /dev/null
@@ -0,0 +1,1744 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/format.h>
+#include <vnet/ip/ip.h>
+#include <vnet/dpo/drop_dpo.h>
+#include <vnet/dpo/receive_dpo.h>
+#include <vnet/dpo/load_balance_map.h>
+#include <vnet/dpo/lookup_dpo.h>
+
+#include <vnet/adj/adj.h>
+
+#include "fib_path.h"
+#include "fib_node.h"
+#include "fib_table.h"
+#include "fib_entry.h"
+#include "fib_path_list.h"
+#include "fib_internal.h"
+
+/**
+ * Enumeration of path types
+ */
+typedef enum fib_path_type_t_ {
+    /**
+     * Marker. Add new types after this one.
+     */
+    FIB_PATH_TYPE_FIRST = 0,
+    /**
+     * Attached-nexthop. An interface and a nexthop are known.
+     */
+    FIB_PATH_TYPE_ATTACHED_NEXT_HOP = FIB_PATH_TYPE_FIRST,
+    /**
+     * attached. Only the interface is known.
+     */
+    FIB_PATH_TYPE_ATTACHED,
+    /**
+     * recursive. Only the next-hop is known.
+     */
+    FIB_PATH_TYPE_RECURSIVE,
+    /**
+     * special. nothing is known. so we drop.
+     */
+    FIB_PATH_TYPE_SPECIAL,
+    /**
+     * exclusive. user provided adj.
+     */
+    FIB_PATH_TYPE_EXCLUSIVE,
+    /**
+     * deag. Link to a lookup adj in the next table
+     */
+    FIB_PATH_TYPE_DEAG,
+    /**
+     * receive. it's for-us.
+     */
+    FIB_PATH_TYPE_RECEIVE,
+    /**
+     * Marker. Add new types before this one, then update it.
+     */
+    FIB_PATH_TYPE_LAST = FIB_PATH_TYPE_RECEIVE,
+} __attribute__ ((packed)) fib_path_type_t;
+
+/**
+ * The maximum number of path_types
+ */
+#define FIB_PATH_TYPE_MAX (FIB_PATH_TYPE_LAST + 1)
+
+#define FIB_PATH_TYPES {                                       \
+    [FIB_PATH_TYPE_ATTACHED_NEXT_HOP] = "attached-nexthop",    \
+    [FIB_PATH_TYPE_ATTACHED]          = "attached",            \
+    [FIB_PATH_TYPE_RECURSIVE]         = "recursive",           \
+    [FIB_PATH_TYPE_SPECIAL]           = "special",             \
+    [FIB_PATH_TYPE_EXCLUSIVE]         = "exclusive",           \
+    [FIB_PATH_TYPE_DEAG]              = "deag",                        \
+    [FIB_PATH_TYPE_RECEIVE]           = "receive",             \
+}
+
+#define FOR_EACH_FIB_PATH_TYPE(_item) \
+    for (_item = FIB_PATH_TYPE_FIRST; _item <= FIB_PATH_TYPE_LAST; _item++)
+
+/**
+ * Enumeration of path operational (i.e. derived) attributes
+ */
+typedef enum fib_path_oper_attribute_t_ {
+    /**
+     * Marker. Add new types after this one.
+     */
+    FIB_PATH_OPER_ATTRIBUTE_FIRST = 0,
+    /**
+     * The path forms part of a recursive loop.
+     */
+    FIB_PATH_OPER_ATTRIBUTE_RECURSIVE_LOOP = FIB_PATH_OPER_ATTRIBUTE_FIRST,
+    /**
+     * The path is resolved
+     */
+    FIB_PATH_OPER_ATTRIBUTE_RESOLVED,
+    /**
+     * The path has become a permanent drop.
+     */
+    FIB_PATH_OPER_ATTRIBUTE_DROP,
+    /**
+     * Marker. Add new types before this one, then update it.
+     */
+    FIB_PATH_OPER_ATTRIBUTE_LAST = FIB_PATH_OPER_ATTRIBUTE_DROP,
+} __attribute__ ((packed)) fib_path_oper_attribute_t;
+
+/**
+ * The maximum number of path operational attributes
+ */
+#define FIB_PATH_OPER_ATTRIBUTE_MAX (FIB_PATH_OPER_ATTRIBUTE_LAST + 1)
+
+#define FIB_PATH_OPER_ATTRIBUTES {                                     \
+    [FIB_PATH_OPER_ATTRIBUTE_RECURSIVE_LOOP] = "recursive-loop",       \
+    [FIB_PATH_OPER_ATTRIBUTE_RESOLVED]       = "resolved",             \
+    [FIB_PATH_OPER_ATTRIBUTE_DROP]           = "drop",                 \
+}
+
+#define FOR_EACH_FIB_PATH_OPER_ATTRIBUTE(_item) \
+    for (_item = FIB_PATH_OPER_ATTRIBUTE_FIRST; \
+        _item <= FIB_PATH_OPER_ATTRIBUTE_LAST; \
+        _item++)
+
+/**
+ * Path flags from the attributes
+ */
+typedef enum fib_path_oper_flags_t_ {
+    FIB_PATH_OPER_FLAG_NONE = 0,
+    FIB_PATH_OPER_FLAG_RECURSIVE_LOOP = (1 << FIB_PATH_OPER_ATTRIBUTE_RECURSIVE_LOOP),
+    FIB_PATH_OPER_FLAG_DROP = (1 << FIB_PATH_OPER_ATTRIBUTE_DROP),
+    FIB_PATH_OPER_FLAG_RESOLVED = (1 << FIB_PATH_OPER_ATTRIBUTE_RESOLVED),
+} __attribute__ ((packed)) fib_path_oper_flags_t;
+
+/**
+ * A FIB path
+ */
+typedef struct fib_path_t_ {
+    /**
+     * A path is a node in the FIB graph.
+     */
+    fib_node_t fp_node;
+
+    /**
+     * The index of the path-list to which this path belongs
+     */
+    u32 fp_pl_index;
+
+    /**
+     * This marks the start of the memory area used to hash
+     * the path
+     */
+    STRUCT_MARK(path_hash_start);
+
+    /**
+     * Configuration Flags
+     */
+    fib_path_cfg_flags_t fp_cfg_flags;
+
+    /**
+     * The type of the path. This is the selector for the union
+     */
+    fib_path_type_t fp_type;
+
+    /**
+     * The protocol of the next-hop, i.e. the address family of the
+     * next-hop's address. We can't derive this from the address itself
+     * since the address can be all zeros
+     */
+    fib_protocol_t fp_nh_proto;
+
+    /**
+     * UCMP [unnormalised] weight
+     */
+    u32 fp_weight;
+
+    /**
+     * per-type union of the data required to resolve the path
+     */
+    union {
+       struct {
+           /**
+            * The next-hop
+            */
+           ip46_address_t fp_nh;
+           /**
+            * The interface
+            */
+           u32 fp_interface;
+       } attached_next_hop;
+       struct {
+           /**
+            * The interface
+            */
+           u32 fp_interface;
+       } attached;
+       struct {
+           /**
+            * The next-hop
+            */
+           ip46_address_t fp_nh;
+           /**
+            * The FIB table index in which to find the next-hop.
+            * This needs to be fixed. We should lookup the adjacencies in
+            * a separate table of adjacencies, rather than from the FIB.
+            * Two reasons I can think of:
+            *   - consider:
+            *       int ip addr Gig0 10.0.0.1/24
+            *       ip route 10.0.0.2/32 via Gig1 192.168.1.2
+            *       ip route 1.1.1.1/32 via Gig0 10.0.0.2
+            *     this is perfectly valid.
+            *     Packets addressed to 10.0.0.2 should be sent via Gig1.
+            *     Packets address to 1.1.1.1 should be sent via Gig0.
+            *    when we perform the adj resolution from the FIB for the path
+            *    "via Gig0 10.0.0.2" the lookup will result in the route via Gig1
+            *    and so we will pick up the adj via Gig1 - which was not what the
+            *    operator wanted.
+            *  - we can only return link-type IPv4 and so not the link-type MPLS.
+            *    more on this in a later commit.
+            *
+            * The table ID should only belong to a recursive path and indicate
+            * which FIB should be used to resolve the next-hop.
+            */
+           fib_node_index_t fp_tbl_id;
+       } recursive;
+       struct {
+           /**
+            * The FIB index in which to perform the next lookup
+            */
+           fib_node_index_t fp_tbl_id;
+       } deag;
+       struct {
+       } special;
+       struct {
+           /**
+            * The user provided 'exclusive' DPO
+            */
+           dpo_id_t fp_ex_dpo;
+       } exclusive;
+       struct {
+           /**
+            * The interface on which the local address is configured
+            */
+           u32 fp_interface;
+           /**
+            * The next-hop
+            */
+           ip46_address_t fp_addr;
+       } receive;
+    };
+    STRUCT_MARK(path_hash_end);
+
+    /**
+     * Members in this last section represent information that is
+     * derived during resolution. It should not be copied to new paths
+     * nor compared.
+     */
+
+    /**
+     * Operational Flags
+     */
+    fib_path_oper_flags_t fp_oper_flags;
+
+    /**
+     * the resolving via fib. not part of the union, since it is not part
+     * of the path's hash.
+     */
+    fib_node_index_t fp_via_fib;
+
+    /**
+     * The Data-path objects through which this path resolves for IP.
+     */
+    dpo_id_t fp_dpo;
+
+    /**
+     * the index of this path in the parent's child list.
+     */
+    u32 fp_sibling;
+} fib_path_t;
+
+/*
+ * Array of strings/names for the path types and attributes
+ */
+static const char *fib_path_type_names[] = FIB_PATH_TYPES;
+static const char *fib_path_oper_attribute_names[] = FIB_PATH_OPER_ATTRIBUTES;
+static const char *fib_path_cfg_attribute_names[]  = FIB_PATH_CFG_ATTRIBUTES;
+
+/*
+ * The memory pool from which we allocate all the paths
+ */
+static fib_path_t *fib_path_pool;
+
+/*
+ * Debug macro
+ */
+#ifdef FIB_DEBUG
+#define FIB_PATH_DBG(_p, _fmt, _args...)                       \
+{                                                              \
+    u8 *_tmp = NULL;                                           \
+    _tmp = fib_path_format(fib_path_get_index(_p), _tmp);      \
+    clib_warning("path:[%d:%s]:" _fmt,                         \
+                fib_path_get_index(_p), _tmp,                  \
+                ##_args);                                      \
+    vec_free(_tmp);                                            \
+}
+#else
+#define FIB_PATH_DBG(_p, _fmt, _args...)
+#endif
+
+/*
+ * fib_path_get
+ *
+ * Return the path object stored at the given index of the path pool.
+ */
+static fib_path_t *
+fib_path_get (fib_node_index_t index)
+{
+    fib_path_t *path;
+
+    path = pool_elt_at_index(fib_path_pool, index);
+
+    return (path);
+}
+
+/*
+ * fib_path_get_index
+ *
+ * Return the index of a path, i.e. its offset from the base of the
+ * path pool.
+ */
+static fib_node_index_t
+fib_path_get_index (fib_path_t *path)
+{
+    fib_path_t *pool_base = fib_path_pool;
+
+    return (path - pool_base);
+}
+
+/*
+ * fib_path_get_node
+ *
+ * Return the FIB graph node of the path at the given index. The node
+ * is the first member of fib_path_t, so its address is the path's address.
+ */
+static fib_node_t *
+fib_path_get_node (fib_node_index_t index)
+{
+    fib_path_t *path = fib_path_get(index);
+
+    return (&path->fp_node);
+}
+
+/*
+ * fib_path_from_fib_node
+ *
+ * Convert a FIB graph node back into the path that contains it.
+ * In debug images the node's type is checked to be FIB_NODE_TYPE_PATH.
+ */
+static fib_path_t*
+fib_path_from_fib_node (fib_node_t *node)
+{
+    fib_path_t *path = (fib_path_t*)node;
+
+#if CLIB_DEBUG > 0
+    ASSERT(FIB_NODE_TYPE_PATH == node->fn_type);
+#endif
+
+    return (path);
+}
+
+/**
+ * format_fib_path
+ *
+ * format() callback that renders a path for display.
+ *
+ * @param s    the vector to append to
+ * @param args va_list carrying a single fib_path_t pointer
+ * @return the vector with the path's description appended
+ *
+ * The first line shows the path's index, owning path-list, next-hop
+ * protocol, weight, type and any operational/configuration flags.
+ * What follows depends on the path's type.
+ */
+u8 *
+format_fib_path (u8 * s, va_list * args)
+{
+    fib_path_t *path = va_arg (*args, fib_path_t *);
+    vnet_main_t * vnm = vnet_get_main();
+    fib_path_oper_attribute_t oattr;
+    fib_path_cfg_attribute_t cattr;
+
+    s = format (s, "      index:%d ", fib_path_get_index(path));
+    s = format (s, "pl-index:%d ", path->fp_pl_index);
+    s = format (s, "%U ", format_fib_protocol, path->fp_nh_proto);
+    s = format (s, "weight=%d ", path->fp_weight);
+    s = format (s, "%s: ", fib_path_type_names[path->fp_type]);
+
+    /*
+     * list the names of each operational and configuration flag set
+     */
+    if (FIB_PATH_OPER_FLAG_NONE != path->fp_oper_flags) {
+        s = format(s, " oper-flags:");
+        FOR_EACH_FIB_PATH_OPER_ATTRIBUTE(oattr) {
+            if ((1<<oattr) & path->fp_oper_flags) {
+                s = format (s, "%s,", fib_path_oper_attribute_names[oattr]);
+            }
+        }
+    }
+    if (FIB_PATH_CFG_FLAG_NONE != path->fp_cfg_flags) {
+        s = format(s, " cfg-flags:");
+        FOR_EACH_FIB_PATH_CFG_ATTRIBUTE(cattr) {
+            if ((1<<cattr) & path->fp_cfg_flags) {
+                s = format (s, "%s,", fib_path_cfg_attribute_names[cattr]);
+            }
+        }
+    }
+    s = format(s, "\n       ");
+
+    switch (path->fp_type)
+    {
+    case FIB_PATH_TYPE_ATTACHED_NEXT_HOP:
+        s = format (s, "%U", format_ip46_address,
+                    &path->attached_next_hop.fp_nh,
+                    IP46_TYPE_ANY);
+        if (path->fp_oper_flags & FIB_PATH_OPER_FLAG_DROP)
+        {
+            /*
+             * a path in the drop state is shown by interface index
+             * only; the interface is not looked up.
+             */
+            s = format (s, " if_index:%d", path->attached_next_hop.fp_interface);
+        }
+        else
+        {
+            s = format (s, " %U",
+                        format_vnet_sw_interface_name,
+                        vnm,
+                        vnet_get_sw_interface(
+                            vnm,
+                            path->attached_next_hop.fp_interface));
+            if (vnet_sw_interface_is_p2p(vnm,
+                                         path->attached_next_hop.fp_interface))
+            {
+                s = format (s, " (p2p)");
+            }
+        }
+        if (!dpo_id_is_valid(&path->fp_dpo))
+        {
+            s = format(s, "\n          unresolved");
+        }
+        else
+        {
+            s = format(s, "\n          %U",
+                       format_dpo_id,
+                       &path->fp_dpo, 13);
+        }
+        break;
+    case FIB_PATH_TYPE_ATTACHED:
+        if (path->fp_oper_flags & FIB_PATH_OPER_FLAG_DROP)
+        {
+            /*
+             * read the interface from the 'attached' member of the
+             * union. attached_next_hop.fp_interface follows an address
+             * in its struct and so lives at a different offset.
+             */
+            s = format (s, " if_index:%d", path->attached.fp_interface);
+        }
+        else
+        {
+            s = format (s, " %U",
+                        format_vnet_sw_interface_name,
+                        vnm,
+                        vnet_get_sw_interface(
+                            vnm,
+                            path->attached.fp_interface));
+        }
+        break;
+    case FIB_PATH_TYPE_RECURSIVE:
+        s = format (s, "via %U",
+                    format_ip46_address,
+                    &path->recursive.fp_nh,
+                    IP46_TYPE_ANY);
+        s = format (s, " in fib:%d", path->recursive.fp_tbl_id);
+        s = format (s, " via-fib:%d", path->fp_via_fib);
+        s = format (s, " via-dpo:[%U:%d]",
+                    format_dpo_type, path->fp_dpo.dpoi_type,
+                    path->fp_dpo.dpoi_index);
+
+        break;
+    case FIB_PATH_TYPE_RECEIVE:
+    case FIB_PATH_TYPE_SPECIAL:
+    case FIB_PATH_TYPE_DEAG:
+    case FIB_PATH_TYPE_EXCLUSIVE:
+        /*
+         * these types show only the DPO, and only if there is one.
+         */
+        if (dpo_id_is_valid(&path->fp_dpo))
+        {
+            s = format(s, "%U", format_dpo_id,
+                       &path->fp_dpo, 2);
+        }
+        break;
+    }
+    return (s);
+}
+
+/*
+ * fib_path_format
+ *
+ * Append the description of the path with index 'pi' to the vector 's'
+ * and return the resulting vector.
+ */
+u8 *
+fib_path_format (fib_node_index_t pi, u8 *s)
+{
+    fib_path_t *path = fib_path_get(pi);
+
+    ASSERT(NULL != path);
+
+    s = format (s, "%U", format_fib_path, path);
+
+    return (s);
+}
+
+/*
+ * fib_path_adj_format
+ *
+ * Append to 's' the DPO the path resolves through, or " unresolved" if
+ * the path has no valid DPO. Returns the resulting vector.
+ *
+ * NOTE: the 'indent' argument is not used; the DPO is always formatted
+ * with an indent of 2.
+ */
+u8 *
+fib_path_adj_format (fib_node_index_t pi,
+                     u32 indent,
+                     u8 *s)
+{
+    fib_path_t *path = fib_path_get(pi);
+
+    ASSERT(NULL != path);
+
+    if (dpo_id_is_valid(&path->fp_dpo))
+    {
+        return (format(s, "%U", format_dpo_id,
+                       &path->fp_dpo, 2));
+    }
+
+    return (format(s, " unresolved"));
+}
+
+/*
+ * fib_path_last_lock_gone
+ *
+ * The graph node 'last lock gone' callback for paths.
+ * We don't share paths, we share path lists, so paths are not expected
+ * to be [un]locked. Reaching this function is therefore treated as an
+ * error and asserts.
+ */
+static void
+fib_path_last_lock_gone (fib_node_t *node)
+{
+    ASSERT(0);
+}
+
+/*
+ * fib_path_attached_next_hop_get_adj
+ *
+ * Find or create, and lock, the neighbour adjacency an attached-nexthop
+ * path resolves through, for the given link type.
+ * Returns the index of that adjacency.
+ *
+ * The return type carries no 'const'; a qualifier on a by-value return
+ * has no effect.
+ */
+static adj_index_t
+fib_path_attached_next_hop_get_adj (fib_path_t *path,
+                                    fib_link_t link)
+{
+    vnet_main_t *vnm = vnet_get_main();
+
+    if (vnet_sw_interface_is_p2p(vnm,
+                                 path->attached_next_hop.fp_interface))
+    {
+        /*
+         * if the interface is p2p then the adj for the specific
+         * neighbour on that link will never exist. on p2p links
+         * the subnet address (the attached route) links to the
+         * auto-adj, which is keyed by the zero address. we want
+         * that adj here too.
+         */
+        return (adj_nbr_add_or_lock(path->fp_nh_proto,
+                                    link,
+                                    &zero_addr,
+                                    path->attached_next_hop.fp_interface));
+    }
+
+    /*
+     * not p2p; the adj is keyed by the path's next-hop address.
+     */
+    return (adj_nbr_add_or_lock(path->fp_nh_proto,
+                                link,
+                                &path->attached_next_hop.fp_nh,
+                                path->attached_next_hop.fp_interface));
+}
+
+/*
+ * fib_path_attached_next_hop_set
+ *
+ * Resolve an attached-nexthop path: point the path's DPO at the
+ * adjacency described by its interface and next-hop, and register the
+ * path as a child of that adjacency.
+ */
+static void
+fib_path_attached_next_hop_set (fib_path_t *path)
+{
+    adj_index_t ai;
+
+    /*
+     * a path via an interface that is admin down is not resolved
+     */
+    if (!vnet_sw_interface_is_admin_up(vnet_get_main(),
+                                       path->attached_next_hop.fp_interface))
+    {
+        path->fp_oper_flags &= ~FIB_PATH_OPER_FLAG_RESOLVED;
+    }
+
+    /*
+     * resolve directly via the adjacency for the interface and next-hop
+     */
+    ai = fib_path_attached_next_hop_get_adj(
+             path,
+             fib_proto_to_link(path->fp_nh_proto));
+
+    dpo_set(&path->fp_dpo,
+            DPO_ADJACENCY,
+            fib_proto_to_dpo(path->fp_nh_proto),
+            ai);
+
+    /*
+     * become a child of the adjacency so we receive updates
+     * when its rewrite changes
+     */
+    path->fp_sibling = adj_child_add(path->fp_dpo.dpoi_index,
+                                     FIB_NODE_TYPE_PATH,
+                                     fib_path_get_index(path));
+}
+
+/*
+ * fib_path_recursive_adj_update
+ *
+ * Create or update the DPO a recursive path contributes.
+ *
+ * @param path the recursive path
+ * @param fct  the forwarding chain type requested from the via-entry
+ * @param dpo  [out] receives the forwarding of the via-entry, or the
+ *             drop DPO if the path's recursion constraints are not met
+ *
+ * As a side effect the path's RESOLVED operational flag is set or
+ * cleared according to whether the constraints are met.
+ */
+static void
+fib_path_recursive_adj_update (fib_path_t *path,
+                               fib_forward_chain_type_t fct,
+                               dpo_id_t *dpo)
+{
+    dpo_id_t via_dpo = DPO_NULL;
+
+    /*
+     * get the DPO to resolve through from the via-entry
+     */
+    fib_entry_contribute_forwarding(path->fp_via_fib,
+                                    fct,
+                                    &via_dpo);
+
+
+    /*
+     * hope for the best - clear if restrictions apply.
+     */
+    path->fp_oper_flags |= FIB_PATH_OPER_FLAG_RESOLVED;
+
+    /*
+     * Validate any recursion constraints and over-ride the via
+     * adj if not met
+     */
+    if (path->fp_oper_flags & FIB_PATH_OPER_FLAG_RECURSIVE_LOOP)
+    {
+        path->fp_oper_flags &= ~FIB_PATH_OPER_FLAG_RESOLVED;
+        dpo_copy(&via_dpo, drop_dpo_get(fib_proto_to_dpo(path->fp_nh_proto)));
+    }
+    else if (path->fp_cfg_flags & FIB_PATH_CFG_FLAG_RESOLVE_HOST)
+    {
+        /*
+         * the via FIB must be a host route.
+         * note the via FIB just added will always be a host route
+         * since it is an RR source added host route. So what we need to
+         * check is whether the route has other sources. If it does then
+         * some other source has added it as a host route. If it doesn't
+         * then it was added only here and inherits forwarding from a cover.
+         * the cover is not a host route.
+         * The RR source is the lowest priority source, so we check if it
+         * is the best. if it is there are no other sources.
+         */
+        if (fib_entry_get_best_source(path->fp_via_fib) >= FIB_SOURCE_RR)
+        {
+            path->fp_oper_flags &= ~FIB_PATH_OPER_FLAG_RESOLVED;
+            dpo_copy(&via_dpo, drop_dpo_get(fib_proto_to_dpo(path->fp_nh_proto)));
+
+            /*
+             * PIC edge trigger. let the load-balance maps know
+             */
+            load_balance_map_path_state_change(fib_path_get_index(path));
+        }
+    }
+    else if (path->fp_cfg_flags & FIB_PATH_CFG_FLAG_RESOLVE_ATTACHED)
+    {
+        /*
+         * RR source entries inherit the flags from the cover, so
+         * we can check the via directly
+         */
+        if (!(FIB_ENTRY_FLAG_ATTACHED & fib_entry_get_flags(path->fp_via_fib)))
+        {
+            path->fp_oper_flags &= ~FIB_PATH_OPER_FLAG_RESOLVED;
+            dpo_copy(&via_dpo, drop_dpo_get(fib_proto_to_dpo(path->fp_nh_proto)));
+
+            /*
+             * PIC edge trigger. let the load-balance maps know
+             */
+            load_balance_map_path_state_change(fib_path_get_index(path));
+        }
+    }
+
+    /*
+     * update the path's contributed DPO
+     */
+    dpo_copy(dpo, &via_dpo);
+
+    /*
+     * %U takes a format function followed by that function's arguments;
+     * format_dpo_id takes the DPO and an indent. Show the DPO just
+     * written.
+     */
+    FIB_PATH_DBG(path, "recursive update: %U",
+                 format_dpo_id,
+                 dpo, 2);
+
+    /*
+     * release the local copy of the via-entry's forwarding
+     */
+    dpo_reset(&via_dpo);
+}
+
+/*
+ * fib_path_is_permanent_drop
+ *
+ * Return 1 if the path drops permanently, either because it was
+ * configured with the drop flag or because it has acquired the
+ * operational drop flag. Return 0 otherwise.
+ */
+static int
+fib_path_is_permanent_drop (fib_path_t *path)
+{
+    if (path->fp_cfg_flags & FIB_PATH_CFG_FLAG_DROP)
+    {
+        return (1);
+    }
+    if (path->fp_oper_flags & FIB_PATH_OPER_FLAG_DROP)
+    {
+        return (1);
+    }
+
+    return (0);
+}
+
+/*
+ * fib_path_unresolve
+ *
+ * Remove our dependency on the resolution target.
+ * The type specific relationship with the parent (via-entry or
+ * adjacency) is undone first, then the path's own DPO is reset and the
+ * RESOLVED flag cleared. Paths that permanently drop are left untouched.
+ */
+static void
+fib_path_unresolve (fib_path_t *path)
+{
+    /*
+     * the forced drop path does not need unresolving
+     */
+    if (fib_path_is_permanent_drop(path))
+    {
+       return;
+    }
+
+    switch (path->fp_type)
+    {
+    case FIB_PATH_TYPE_RECURSIVE:
+       /*
+        * only if the path has a via-entry: stop being its child and
+        * remove the RR sourced entry for the next-hop's prefix from
+        * the table.
+        */
+       if (FIB_NODE_INDEX_INVALID != path->fp_via_fib)
+       {
+           fib_prefix_t pfx;
+
+           fib_prefix_from_ip46_addr(&path->recursive.fp_nh, &pfx);
+           fib_entry_child_remove(path->fp_via_fib,
+                                  path->fp_sibling);
+           fib_table_entry_special_remove(path->recursive.fp_tbl_id,
+                                          &pfx,
+                                          FIB_SOURCE_RR);
+           path->fp_via_fib = FIB_NODE_INDEX_INVALID;
+       }
+       break;
+    case FIB_PATH_TYPE_ATTACHED_NEXT_HOP:
+    case FIB_PATH_TYPE_ATTACHED:
+       /*
+        * leave the adjacency's child list and unlock the adjacency
+        */
+       adj_child_remove(path->fp_dpo.dpoi_index,
+                        path->fp_sibling);
+        adj_unlock(path->fp_dpo.dpoi_index);
+        break;
+    case FIB_PATH_TYPE_EXCLUSIVE:
+       /*
+        * let go of the user provided DPO
+        */
+       dpo_reset(&path->exclusive.fp_ex_dpo);
+        break;
+    case FIB_PATH_TYPE_SPECIAL:
+    case FIB_PATH_TYPE_RECEIVE:
+    case FIB_PATH_TYPE_DEAG:
+        /*
+         * these hold only the path's DPO, which is reset below.
+         */
+       break;
+    }
+
+    /*
+     * reset the DPO the path was resolving through and mark the
+     * path as no longer resolved.
+     */
+    dpo_reset(&path->fp_dpo);
+    path->fp_oper_flags &= ~FIB_PATH_OPER_FLAG_RESOLVED;
+
+    return;
+}
+
+/*
+ * fib_path_proto_to_chain_type
+ *
+ * Map a FIB protocol to the forwarding chain type used when a path of
+ * that protocol resolves. IPv4 and IPv6 map to their unicast chains and
+ * MPLS to the non-EOS chain. Any other value yields unicast IPv4.
+ */
+static fib_forward_chain_type_t
+fib_path_proto_to_chain_type (fib_protocol_t proto)
+{
+    fib_forward_chain_type_t fct = FIB_FORW_CHAIN_TYPE_UNICAST_IP4;
+
+    if (FIB_PROTOCOL_IP6 == proto)
+    {
+        fct = FIB_FORW_CHAIN_TYPE_UNICAST_IP6;
+    }
+    else if (FIB_PROTOCOL_MPLS == proto)
+    {
+        fct = FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS;
+    }
+
+    return (fct);
+}
+
+/*
+ * fib_path_back_walk_notify
+ *
+ * A back walk has reached this path.
+ * The path updates its own state according to its type and the reason(s)
+ * for the walk, then passes the walk on to the path-list it belongs to.
+ * Always returns FIB_NODE_BACK_WALK_CONTINUE.
+ */
+static fib_node_back_walk_rc_t
+fib_path_back_walk_notify (fib_node_t *node,
+                          fib_node_back_walk_ctx_t *ctx)
+{
+    fib_path_t *path;
+
+    path = fib_path_from_fib_node(node);
+
+    switch (path->fp_type)
+    {
+    case FIB_PATH_TYPE_RECURSIVE:
+       if (FIB_NODE_BW_REASON_FLAG_EVALUATE & ctx->fnbw_reason)
+       {
+           /*
+            * modify the recursive adjacency to use the new forwarding
+            * of the via-fib.
+            * this update is visible to packets in flight in the DP.
+            */
+           fib_path_recursive_adj_update(
+               path,
+               fib_path_proto_to_chain_type(path->fp_nh_proto),
+               &path->fp_dpo);
+       }
+       break;
+    case FIB_PATH_TYPE_ATTACHED_NEXT_HOP:
+       /*
+        * FIXME: this comment needs revisiting.
+        * ADJ_UPDATE backwalks pass silently through here and up to
+        * the path-list when the multipath adj collapse occurs.
+        * The reason we do this is that the assumption is that VPP
+        * runs in an environment where the Control-Plane is remote
+        * and hence reacts slowly to link up down. In order to remove
+        * this down link from the ECMP set quickly, we back-walk.
+        * VPP also has dedicated CPUs, so we are not stealing resources
+        * from the CP to do so.
+        */
+       if (FIB_NODE_BW_REASON_FLAG_INTERFACE_UP & ctx->fnbw_reason)
+       {
+           path->fp_oper_flags |= FIB_PATH_OPER_FLAG_RESOLVED;
+       }
+       if (FIB_NODE_BW_REASON_FLAG_INTERFACE_DOWN & ctx->fnbw_reason)
+       {
+           path->fp_oper_flags &= ~FIB_PATH_OPER_FLAG_RESOLVED;
+       }
+       if (FIB_NODE_BW_REASON_FLAG_INTERFACE_DELETE & ctx->fnbw_reason)
+       {
+           /*
+            * The interface this path resolves through has been deleted.
+            * This will leave the path in a permanent drop state. The route
+            * needs to be removed and readded (and hence the path-list deleted)
+            * before it can forward again.
+            */
+           fib_path_unresolve(path);
+           path->fp_oper_flags |= FIB_PATH_OPER_FLAG_DROP;
+       }
+        if (FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE & ctx->fnbw_reason)
+       {
+            /*
+             * restack the DPO to pick up the correct DPO sub-type.
+             * the adj is fetched (and locked) again and the path's DPO
+             * set to it; the lock taken by the fetch is then released.
+             */
+            adj_index_t ai;
+
+            ai = fib_path_attached_next_hop_get_adj(
+                     path,
+                     fib_proto_to_link(path->fp_nh_proto));
+
+            dpo_set(&path->fp_dpo, DPO_ADJACENCY,
+                    fib_proto_to_dpo(path->fp_nh_proto),
+                    ai);
+            adj_unlock(ai);
+        }
+       break;
+    case FIB_PATH_TYPE_ATTACHED:
+       /*
+        * FIXME; this could schedule a lower priority walk, since attached
+        * routes are not usually in ECMP configurations so the backwalk to
+        * the FIB entry does not need to be high priority
+        */
+       if (FIB_NODE_BW_REASON_FLAG_INTERFACE_UP & ctx->fnbw_reason)
+       {
+           path->fp_oper_flags |= FIB_PATH_OPER_FLAG_RESOLVED;
+       }
+       if (FIB_NODE_BW_REASON_FLAG_INTERFACE_DOWN & ctx->fnbw_reason)
+       {
+           path->fp_oper_flags &= ~FIB_PATH_OPER_FLAG_RESOLVED;
+       }
+       if (FIB_NODE_BW_REASON_FLAG_INTERFACE_DELETE & ctx->fnbw_reason)
+       {
+           /*
+            * as for the attached-nexthop case; the interface has gone
+            * so the path becomes a permanent drop.
+            */
+           fib_path_unresolve(path);
+           path->fp_oper_flags |= FIB_PATH_OPER_FLAG_DROP;
+       }
+       break;
+    case FIB_PATH_TYPE_DEAG:
+       /*
+        * FIXME When VRF delete is allowed this will need a poke.
+        * Until then, fall through and treat as the types below.
+        */
+    case FIB_PATH_TYPE_SPECIAL:
+    case FIB_PATH_TYPE_RECEIVE:
+    case FIB_PATH_TYPE_EXCLUSIVE:
+       /*
+        * these path types have no parents. so to be
+        * walked from one is unexpected.
+        */
+       ASSERT(0);
+       break;
+    }
+
+    /*
+     * propagate the backwalk further to the path-list
+     */
+    fib_path_list_back_walk(path->fp_pl_index, ctx);
+
+    return (FIB_NODE_BACK_WALK_CONTINUE);
+}
+
+/*
+ * The FIB path's graph node virtual function table:
+ *  - fnv_get       maps a path index to the path's graph node
+ *  - fnv_last_lock is not expected to be called for paths (it asserts)
+ *  - fnv_back_walk handles a back walk that reaches a path
+ */
+static const fib_node_vft_t fib_path_vft = {
+    .fnv_get = fib_path_get_node,
+    .fnv_last_lock = fib_path_last_lock_gone,
+    .fnv_back_walk = fib_path_back_walk_notify,
+};
+
+/*
+ * fib_path_route_flags_to_cfg_flags
+ *
+ * Translate the resolution constraints carried in a route-path's flags
+ * into the equivalent path configuration flags.
+ * Returns FIB_PATH_CFG_FLAG_NONE if the route-path has neither the
+ * resolve-via-host nor the resolve-via-attached flag.
+ */
+static fib_path_cfg_flags_t
+fib_path_route_flags_to_cfg_flags (const fib_route_path_t *rpath)
+{
+    /*
+     * start from the empty flag set. this is a set of flags, so it is
+     * initialised with a flag value and not an attribute enumerator.
+     */
+    fib_path_cfg_flags_t cfg_flags = FIB_PATH_CFG_FLAG_NONE;
+
+    if (rpath->frp_flags & FIB_ROUTE_PATH_RESOLVE_VIA_HOST)
+    {
+        cfg_flags |= FIB_PATH_CFG_FLAG_RESOLVE_HOST;
+    }
+    if (rpath->frp_flags & FIB_ROUTE_PATH_RESOLVE_VIA_ATTACHED)
+    {
+        cfg_flags |= FIB_PATH_CFG_FLAG_RESOLVE_ATTACHED;
+    }
+
+    return (cfg_flags);
+}
+
+/*
+ * fib_path_create
+ *
+ * Allocate a path from the pool and initialise it from a route-path
+ * description. The path's type is deduced from which of the route-path's
+ * interface, address and table index are provided.
+ * Returns the index of the new path.
+ */
+fib_node_index_t
+fib_path_create (fib_node_index_t pl_index,
+                 fib_protocol_t nh_proto,
+                 fib_path_cfg_flags_t flags,
+                 const fib_route_path_t *rpath)
+{
+    fib_path_t *path;
+
+    pool_get(fib_path_pool, path);
+    memset(path, 0, sizeof(*path));
+
+    fib_node_init(&path->fp_node, FIB_NODE_TYPE_PATH);
+    dpo_reset(&path->fp_dpo);
+
+    path->fp_pl_index = pl_index;
+    path->fp_nh_proto = nh_proto;
+    path->fp_weight = rpath->frp_weight;
+    path->fp_via_fib = FIB_NODE_INDEX_INVALID;
+    path->fp_cfg_flags = (flags | fib_path_route_flags_to_cfg_flags(rpath));
+
+    /*
+     * deduce the path's type from the parameters and save what is needed.
+     */
+    if (~0 != rpath->frp_sw_if_index)
+    {
+        /*
+         * an interface was given
+         */
+        if (flags & FIB_PATH_CFG_FLAG_LOCAL)
+        {
+            path->fp_type = FIB_PATH_TYPE_RECEIVE;
+            path->receive.fp_interface = rpath->frp_sw_if_index;
+            path->receive.fp_addr = rpath->frp_addr;
+        }
+        else if (ip46_address_is_zero(&rpath->frp_addr))
+        {
+            path->fp_type = FIB_PATH_TYPE_ATTACHED;
+            path->attached.fp_interface = rpath->frp_sw_if_index;
+        }
+        else
+        {
+            path->fp_type = FIB_PATH_TYPE_ATTACHED_NEXT_HOP;
+            path->attached_next_hop.fp_interface = rpath->frp_sw_if_index;
+            path->attached_next_hop.fp_nh = rpath->frp_addr;
+        }
+    }
+    else if (!ip46_address_is_zero(&rpath->frp_addr))
+    {
+        /*
+         * no interface, but an address
+         */
+        path->fp_type = FIB_PATH_TYPE_RECURSIVE;
+        path->recursive.fp_nh = rpath->frp_addr;
+        path->recursive.fp_tbl_id = rpath->frp_fib_index;
+    }
+    else if (~0 == rpath->frp_fib_index)
+    {
+        /*
+         * no interface, address nor table
+         */
+        path->fp_type = FIB_PATH_TYPE_SPECIAL;
+    }
+    else
+    {
+        /*
+         * only a table
+         */
+        path->fp_type = FIB_PATH_TYPE_DEAG;
+        path->deag.fp_tbl_id = rpath->frp_fib_index;
+    }
+
+    FIB_PATH_DBG(path, "create");
+
+    return (fib_path_get_index(path));
+}
+
+/*
+ * fib_path_create_special
+ *
+ * Create and initialise a new path object that is not described by a
+ * route-path. The path's type is chosen from the flags:
+ *  - DROP  : a special path
+ *  - LOCAL : a receive path with no interface
+ *  - else  : an exclusive path using the caller's DPO, which must not
+ *            be NULL.
+ * The path's weight is 1. Returns the index of the path.
+ */
+fib_node_index_t
+fib_path_create_special (fib_node_index_t pl_index,
+                         fib_protocol_t nh_proto,
+                         fib_path_cfg_flags_t flags,
+                         const dpo_id_t *dpo)
+{
+    fib_path_t *path;
+
+    pool_get(fib_path_pool, path);
+    memset(path, 0, sizeof(*path));
+
+    fib_node_init(&path->fp_node,
+                  FIB_NODE_TYPE_PATH);
+    dpo_reset(&path->fp_dpo);
+
+    path->fp_pl_index = pl_index;
+    path->fp_weight = 1;
+    path->fp_nh_proto = nh_proto;
+    path->fp_via_fib = FIB_NODE_INDEX_INVALID;
+    path->fp_cfg_flags = flags;
+
+    if (FIB_PATH_CFG_FLAG_DROP & flags)
+    {
+        path->fp_type = FIB_PATH_TYPE_SPECIAL;
+    }
+    else if (FIB_PATH_CFG_FLAG_LOCAL & flags)
+    {
+        /*
+         * fp_type selects the union member, so write the interface
+         * through 'receive', the member that matches the type.
+         */
+        path->fp_type = FIB_PATH_TYPE_RECEIVE;
+        path->receive.fp_interface = FIB_NODE_INDEX_INVALID;
+    }
+    else
+    {
+        path->fp_type = FIB_PATH_TYPE_EXCLUSIVE;
+        ASSERT(NULL != dpo);
+        dpo_copy(&path->exclusive.fp_ex_dpo, dpo);
+    }
+
+    return (fib_path_get_index(path));
+}
+
+/*
+ * fib_path_copy
+ *
+ * Create a new path, owned by the given path-list, whose configuration
+ * is a byte-wise copy of an existing path. The derived state (graph
+ * node, operational flags, via-fib and DPO) is re-initialised.
+ * Returns the index of the new path.
+ */
+fib_node_index_t
+fib_path_copy (fib_node_index_t path_index,
+               fib_node_index_t path_list_index)
+{
+    fib_path_t *new_path, *src_path;
+
+    /*
+     * NOTE: the source path is looked up only after the new one has
+     * been allocated; presumably because the allocation can move the
+     * pool's memory.
+     */
+    pool_get(fib_path_pool, new_path);
+    src_path = fib_path_get(path_index);
+    ASSERT(NULL != src_path);
+
+    memcpy(new_path, src_path, sizeof(*new_path));
+
+    FIB_PATH_DBG(new_path, "create-copy:%d", path_index);
+
+    /*
+     * reset the dynamic section
+     */
+    fib_node_init(&new_path->fp_node, FIB_NODE_TYPE_PATH);
+    new_path->fp_pl_index = path_list_index;
+    new_path->fp_oper_flags = FIB_PATH_OPER_FLAG_NONE;
+    new_path->fp_via_fib = FIB_NODE_INDEX_INVALID;
+    memset(&new_path->fp_dpo, 0, sizeof(new_path->fp_dpo));
+    dpo_reset(&new_path->fp_dpo);
+
+    return (fib_path_get_index(new_path));
+}
+
+/*
+ * fib_path_destroy
+ *
+ * Destroy a path that is no longer required: unresolve it, de-initialise
+ * its graph node and return it to the pool.
+ */
+void
+fib_path_destroy (fib_node_index_t path_index)
+{
+    fib_path_t *doomed = fib_path_get(path_index);
+
+    ASSERT(NULL != doomed);
+    FIB_PATH_DBG(doomed, "destroy");
+
+    fib_path_unresolve(doomed);
+    fib_node_deinit(&doomed->fp_node);
+
+    pool_put(fib_path_pool, doomed);
+}
+
+/*
+ * fib_path_hash
+ *
+ * Compute a hash of a path. The hash covers the memory between the
+ * path_hash_start and path_hash_end markers of fib_path_t.
+ */
+uword
+fib_path_hash (fib_node_index_t path_index)
+{
+    fib_path_t *path = fib_path_get(path_index);
+    uword n_bytes;
+
+    n_bytes = (STRUCT_OFFSET_OF(fib_path_t, path_hash_end) -
+               STRUCT_OFFSET_OF(fib_path_t, path_hash_start));
+
+    return (hash_memory(STRUCT_MARK_PTR(path, path_hash_start),
+                        n_bytes,
+                        0));
+}
+
+/*
+ * fib_path_cmp_i
+ *
+ * Compare two paths for equivalence. Returns 0 if the paths are equal and
+ * non-zero otherwise.
+ *
+ * The comparison is made, in order, on the path type, the next-hop
+ * protocol and then the attributes specific to the type. The weight is
+ * not part of the comparison.
+ */
+static int
+fib_path_cmp_i (const fib_path_t *path1,
+               const fib_path_t *path2)
+{
+    int res;
+
+    res = 1;
+
+    /*
+     * paths of different types and protocol are not equal.
+     * different weights only are the same path.
+     *
+     * This must be a single if/else-if/else chain: the per-type comparison
+     * in the final branch is only meaningful when both paths have the same
+     * type, and it must not overwrite a difference already found.
+     */
+    if (path1->fp_type != path2->fp_type)
+    {
+       res = (path1->fp_type - path2->fp_type);
+    }
+    else if (path1->fp_nh_proto != path2->fp_nh_proto)
+    {
+       res = (path1->fp_nh_proto - path2->fp_nh_proto);
+    }
+    else
+    {
+       /*
+        * both paths are of the same type.
+        * consider each type and its attributes in turn.
+        */
+       switch (path1->fp_type)
+       {
+       case FIB_PATH_TYPE_ATTACHED_NEXT_HOP:
+           /* same next-hop address, then same interface */
+           res = ip46_address_cmp(&path1->attached_next_hop.fp_nh,
+                                  &path2->attached_next_hop.fp_nh);
+           if (0 == res) {
+               res = vnet_sw_interface_compare(
+                         vnet_get_main(),
+                         path1->attached_next_hop.fp_interface,
+                         path2->attached_next_hop.fp_interface);
+           }
+           break;
+       case FIB_PATH_TYPE_ATTACHED:
+           res = vnet_sw_interface_compare(
+                     vnet_get_main(),
+                     path1->attached.fp_interface,
+                     path2->attached.fp_interface);
+           break;
+       case FIB_PATH_TYPE_RECURSIVE:
+           /* same next-hop address, then same table */
+           res = ip46_address_cmp(&path1->recursive.fp_nh,
+                                  &path2->recursive.fp_nh);
+           if (0 == res)
+           {
+               res = (path1->recursive.fp_tbl_id - path2->recursive.fp_tbl_id);
+           }
+           break;
+       case FIB_PATH_TYPE_DEAG:
+           res = (path1->deag.fp_tbl_id - path2->deag.fp_tbl_id);
+           break;
+       case FIB_PATH_TYPE_SPECIAL:
+       case FIB_PATH_TYPE_RECEIVE:
+       case FIB_PATH_TYPE_EXCLUSIVE:
+           /* no further attributes are compared for these types */
+           res = 0;
+           break;
+       }
+    }
+    return (res);
+}
+
+/*
+ * fib_path_cmp_for_sort
+ *
+ * Sort comparator. Each argument points at a path index; the two paths
+ * those indices name are compared with fib_path_cmp_i.
+ * As usual 0 means equal.
+ */
+int
+fib_path_cmp_for_sort (void * v1,
+                      void * v2)
+{
+    fib_path_t *lhs = fib_path_get(*(fib_node_index_t *) v1);
+    fib_path_t *rhs = fib_path_get(*(fib_node_index_t *) v2);
+
+    return (fib_path_cmp_i(lhs, rhs));
+}
+
+/*
+ * fib_path_cmp
+ *
+ * Compare the two paths named by the given indices.
+ * Returns the result of fib_path_cmp_i; 0 means the paths are equal.
+ */
+int
+fib_path_cmp (fib_node_index_t pi1,
+             fib_node_index_t pi2)
+{
+    fib_path_t *lhs = fib_path_get(pi1);
+    fib_path_t *rhs = fib_path_get(pi2);
+
+    return (fib_path_cmp_i(lhs, rhs));
+}
+
+/*
+ * fib_path_cmp_w_route_path
+ *
+ * Compare an existing path against a route-path description.
+ * 0 means they match; any other value means they differ.
+ */
+int
+fib_path_cmp_w_route_path (fib_node_index_t path_index,
+                          const fib_route_path_t *rpath)
+{
+    fib_path_t *path = fib_path_get(path_index);
+    int res = 1;
+
+    /*
+     * a different weight is a mismatch as far as this comparison goes.
+     */
+    if (path->fp_weight != rpath->frp_weight)
+    {
+       return (path->fp_weight - rpath->frp_weight);
+    }
+
+    /*
+     * the weights agree. compare the attributes that are relevant to
+     * the type of the existing path.
+     */
+    switch (path->fp_type)
+    {
+    case FIB_PATH_TYPE_ATTACHED_NEXT_HOP:
+       res = ip46_address_cmp(&path->attached_next_hop.fp_nh,
+                              &rpath->frp_addr);
+       if (0 != res)
+       {
+           break;
+       }
+       res = vnet_sw_interface_compare(
+                 vnet_get_main(),
+                 path->attached_next_hop.fp_interface,
+                 rpath->frp_sw_if_index);
+       break;
+    case FIB_PATH_TYPE_ATTACHED:
+       res = vnet_sw_interface_compare(
+                 vnet_get_main(),
+                 path->attached.fp_interface,
+                 rpath->frp_sw_if_index);
+       break;
+    case FIB_PATH_TYPE_RECURSIVE:
+       res = ip46_address_cmp(&path->recursive.fp_nh,
+                              &rpath->frp_addr);
+       if (0 != res)
+       {
+           break;
+       }
+       res = (path->recursive.fp_tbl_id - rpath->frp_fib_index);
+       break;
+    case FIB_PATH_TYPE_DEAG:
+       res = (path->deag.fp_tbl_id - rpath->frp_fib_index);
+       break;
+    case FIB_PATH_TYPE_SPECIAL:
+    case FIB_PATH_TYPE_RECEIVE:
+    case FIB_PATH_TYPE_EXCLUSIVE:
+       res = 0;
+       break;
+    }
+
+    return (res);
+}
+
+/*
+ * fib_path_recursive_loop_detect
+ *
+ * A forward walk of the FIB object graph to detect a cycle/loop. This
+ * walk is initiated when an entry is linking to a new path list or from an old.
+ * The entry vector passed contains all the FIB entries that are children of this
+ * path (it is all the entries encountered on the walk so far). If this vector
+ * contains the entry this path resolves via, then a loop is about to form.
+ * The loop must be allowed to form, since we need the dependencies in place
+ * so that we can track when the loop breaks.
+ * However, we MUST not produce a loop in the forwarding graph (else packets
+ * would loop around the switch path until the loop breaks), so we mark recursive
+ * paths as looped so that they do not contribute forwarding information.
+ * By marking the path as looped, an entry such as;
+ *    X/Y
+ *     via a.a.a.a (looped)
+ *     via b.b.b.b (not looped)
+ * can still forward using the info provided by b.b.b.b only
+ *
+ * Returns the value of fib_path_is_looped() for the path once the walk
+ * is done, i.e. non-zero if the path is marked as looped.
+ */
+int
+fib_path_recursive_loop_detect (fib_node_index_t path_index,
+                               fib_node_index_t **entry_indicies)
+{
+    fib_path_t *path;
+
+    path = fib_path_get(path_index);
+
+    /*
+     * the forced drop path is never looped, cos it is never resolved.
+     */
+    if (fib_path_is_permanent_drop(path))
+    {
+       return (0);
+    }
+
+    switch (path->fp_type)
+    {
+    case FIB_PATH_TYPE_RECURSIVE:
+    {
+       fib_node_index_t *entry_index, *entries;
+       int looped = 0;
+       entries = *entry_indicies;
+
+       /*
+        * is the entry this path resolves via already on the walk?
+        */
+       vec_foreach(entry_index, entries) {
+           if (*entry_index == path->fp_via_fib)
+           {
+               /*
+                * the entry that is about to link to this path-list (or
+                * one of this path-list's children) is the same entry that
+                * this recursive path resolves through. this is a cycle.
+                * abort the walk.
+                */
+               looped = 1;
+               break;
+           }
+       }
+
+       if (looped)
+       {
+           FIB_PATH_DBG(path, "recursive loop formed");
+           path->fp_oper_flags |= FIB_PATH_OPER_FLAG_RECURSIVE_LOOP;
+
+           /*
+            * the loop closes at this path; replace its forwarding
+            * with the drop for the path's protocol.
+            */
+           dpo_copy(&path->fp_dpo,
+                    drop_dpo_get(fib_proto_to_dpo(path->fp_nh_proto)));
+       }
+       else
+       {
+           /*
+            * no loop here yet. keep forward walking the graph.
+            * the flag is set or cleared according to what the rest
+            * of the walk finds.
+            */     
+           if (fib_entry_recursive_loop_detect(path->fp_via_fib, entry_indicies))
+           {
+               FIB_PATH_DBG(path, "recursive loop formed");
+               path->fp_oper_flags |= FIB_PATH_OPER_FLAG_RECURSIVE_LOOP;
+           }
+           else
+           {
+               FIB_PATH_DBG(path, "recursive loop cleared");
+               path->fp_oper_flags &= ~FIB_PATH_OPER_FLAG_RECURSIVE_LOOP;
+           }
+       }
+       break;
+    }
+    case FIB_PATH_TYPE_ATTACHED_NEXT_HOP:
+    case FIB_PATH_TYPE_ATTACHED:
+    case FIB_PATH_TYPE_SPECIAL:
+    case FIB_PATH_TYPE_DEAG:
+    case FIB_PATH_TYPE_RECEIVE:
+    case FIB_PATH_TYPE_EXCLUSIVE:
+       /*
+        * these path types cannot be part of a loop, since they are the leaves
+        * of the graph.
+        */
+       break;
+    }
+
+    return (fib_path_is_looped(path_index));
+}
+
+/*
+ * fib_path_resolve
+ *
+ * Resolve the path: according to the path's type, construct the DPO
+ * (path->fp_dpo) through which the path forwards.
+ *
+ * Returns the value of fib_path_is_resolved() for the path once the
+ * attempt has been made.
+ */
+int
+fib_path_resolve (fib_node_index_t path_index)
+{
+    fib_path_t *path;
+
+    path = fib_path_get(path_index);
+
+    /*
+     * hope for the best.
+     */
+    path->fp_oper_flags |= FIB_PATH_OPER_FLAG_RESOLVED;
+
+    /*
+     * the forced drop path resolves via the drop adj
+     * and is never reported as resolved.
+     */
+    if (fib_path_is_permanent_drop(path))
+    {
+       dpo_copy(&path->fp_dpo,
+                 drop_dpo_get(fib_proto_to_dpo(path->fp_nh_proto)));
+       path->fp_oper_flags &= ~FIB_PATH_OPER_FLAG_RESOLVED;
+       return (fib_path_is_resolved(path_index));
+    }
+
+    switch (path->fp_type)
+    {
+    case FIB_PATH_TYPE_ATTACHED_NEXT_HOP:
+       fib_path_attached_next_hop_set(path);
+       break;
+    case FIB_PATH_TYPE_ATTACHED:
+       /*
+        * path->attached.fp_interface
+        *
+        * an interface that is not admin up leaves the path unresolved.
+        * the adjacency is constructed regardless of the interface state.
+        */
+       if (!vnet_sw_interface_is_admin_up(vnet_get_main(),
+                                          path->attached.fp_interface))
+       {
+           path->fp_oper_flags &= ~FIB_PATH_OPER_FLAG_RESOLVED;
+       }
+       if (vnet_sw_interface_is_p2p(vnet_get_main(),
+                                    path->attached.fp_interface))
+       {
+           /*
+            * point-2-point interfaces do not require a glean, since
+            * there is nothing to ARP. Install a rewrite/nbr adj instead
+            */
+           dpo_set(&path->fp_dpo,
+                   DPO_ADJACENCY,
+                   fib_proto_to_dpo(path->fp_nh_proto),
+                   adj_nbr_add_or_lock(
+                       path->fp_nh_proto,
+                       fib_proto_to_link(path->fp_nh_proto),
+                       &zero_addr,
+                       path->attached.fp_interface));
+       }
+       else
+       {
+           dpo_set(&path->fp_dpo,
+                   DPO_ADJACENCY_GLEAN,
+                   fib_proto_to_dpo(path->fp_nh_proto),
+                   adj_glean_add_or_lock(path->fp_nh_proto,
+                                         path->attached.fp_interface,
+                                         NULL));
+       }
+       /*
+        * become a child of the adjacency so we receive updates
+        * when the interface state changes
+        */
+       path->fp_sibling = adj_child_add(path->fp_dpo.dpoi_index,
+                                        FIB_NODE_TYPE_PATH,
+                                        fib_path_get_index(path));
+
+       break;
+    case FIB_PATH_TYPE_RECURSIVE:
+    {
+       /*
+        * Create a RR source entry in the table for the address
+        * that this path recurses through.
+        * This resolve action is recursive, hence we may create
+        * more paths in the process. more creates mean maybe realloc
+        * of this path.
+        */
+       fib_node_index_t fei;
+       fib_prefix_t pfx;
+
+       ASSERT(FIB_NODE_INDEX_INVALID == path->fp_via_fib);
+
+       fib_prefix_from_ip46_addr(&path->recursive.fp_nh, &pfx);
+
+       fei = fib_table_entry_special_add(path->recursive.fp_tbl_id,
+                                         &pfx,
+                                         FIB_SOURCE_RR,
+                                         FIB_ENTRY_FLAG_NONE,
+                                         ADJ_INDEX_INVALID);
+
+       /*
+        * re-fetch the path; the pointer held before the add may be
+        * stale if the path pool was realloc'd (see above).
+        */
+       path = fib_path_get(path_index);
+       path->fp_via_fib = fei;
+
+       /*
+        * become a dependent child of the entry so the path is 
+        * informed when the forwarding for the entry changes.
+        */
+       path->fp_sibling = fib_entry_child_add(path->fp_via_fib,
+                                              FIB_NODE_TYPE_PATH,
+                                              fib_path_get_index(path));
+
+       /*
+        * create and configure the IP DPO
+        */
+       fib_path_recursive_adj_update(
+           path,
+           fib_path_proto_to_chain_type(path->fp_nh_proto),
+           &path->fp_dpo);
+
+       break;
+    }
+    case FIB_PATH_TYPE_SPECIAL:
+       /*
+        * Resolve via the drop
+        */
+       dpo_copy(&path->fp_dpo,
+                 drop_dpo_get(fib_proto_to_dpo(path->fp_nh_proto)));
+       break;
+    case FIB_PATH_TYPE_DEAG:
+       /*
+        * Resolve via a lookup DPO.
+         * FIXME. control plane should add routes with a table ID
+        */
+       lookup_dpo_add_or_lock_w_fib_index(path->deag.fp_tbl_id,
+                                          fib_proto_to_dpo(path->fp_nh_proto),
+                                          LOOKUP_INPUT_DST_ADDR,
+                                          LOOKUP_TABLE_FROM_CONFIG,
+                                          &path->fp_dpo);
+       break;
+    case FIB_PATH_TYPE_RECEIVE:
+       /*
+        * Resolve via a receive DPO.
+        */
+       receive_dpo_add_or_lock(fib_proto_to_dpo(path->fp_nh_proto),
+                                path->receive.fp_interface,
+                                &path->receive.fp_addr,
+                                &path->fp_dpo);
+       break;
+    case FIB_PATH_TYPE_EXCLUSIVE:
+       /*
+        * Resolve via the user provided DPO
+        */
+       dpo_copy(&path->fp_dpo, &path->exclusive.fp_ex_dpo);
+       break;
+    }
+
+    return (fib_path_is_resolved(path_index));
+}
+
+/*
+ * fib_path_get_resolving_interface
+ *
+ * Return the interface the path resolves through, or ~0 if the path's
+ * type has no such interface. A recursive path defers to the entry it
+ * resolves via.
+ */
+u32
+fib_path_get_resolving_interface (fib_node_index_t path_index)
+{
+    fib_path_t *path = fib_path_get(path_index);
+    u32 sw_if_index = ~0;
+
+    switch (path->fp_type)
+    {
+    case FIB_PATH_TYPE_ATTACHED_NEXT_HOP:
+       sw_if_index = path->attached_next_hop.fp_interface;
+       break;
+    case FIB_PATH_TYPE_ATTACHED:
+       sw_if_index = path->attached.fp_interface;
+       break;
+    case FIB_PATH_TYPE_RECEIVE:
+       sw_if_index = path->receive.fp_interface;
+       break;
+    case FIB_PATH_TYPE_RECURSIVE:
+       sw_if_index = fib_entry_get_resolving_interface(path->fp_via_fib);
+       break;
+    case FIB_PATH_TYPE_SPECIAL:
+    case FIB_PATH_TYPE_DEAG:
+    case FIB_PATH_TYPE_EXCLUSIVE:
+       /* nothing to report for these types */
+       break;
+    }
+
+    return (sw_if_index);
+}
+
+/*
+ * fib_path_get_adj
+ *
+ * Return the index of the adjacency the path has resolved to. The path's
+ * DPO is expected to be an adjacency (asserted); if it is not then
+ * ADJ_INDEX_INVALID is returned.
+ */
+adj_index_t
+fib_path_get_adj (fib_node_index_t path_index)
+{
+    fib_path_t *path = fib_path_get(path_index);
+
+    ASSERT(dpo_is_adj(&path->fp_dpo));
+
+    if (!dpo_is_adj(&path->fp_dpo))
+    {
+       return (ADJ_INDEX_INVALID);
+    }
+
+    return (path->fp_dpo.dpoi_index);
+}
+
+/*
+ * fib_path_get_weight
+ *
+ * Return the weight of the path.
+ */
+int
+fib_path_get_weight (fib_node_index_t path_index)
+{
+    fib_path_t *path = fib_path_get(path_index);
+
+    ASSERT(path);
+
+    return (path->fp_weight);
+}
+
+/*
+ * fib_path_contribute_forwarding
+ *
+ * Fill in 'dpo' with the forwarding the path contributes for the
+ * requested chain type 'fct'.
+ *
+ * If the requested chain type is the one that corresponds to the path's
+ * own next-hop protocol, the DPO stored at resolve time is copied out.
+ * Otherwise what is contributed depends on the path's type.
+ * Requests for the MPLS EOS chain type are not expected (asserted).
+ */
+void
+fib_path_contribute_forwarding (fib_node_index_t path_index,
+                               fib_forward_chain_type_t fct,
+                               dpo_id_t *dpo)
+{
+    fib_path_t *path;
+
+    path = fib_path_get(path_index);
+
+    ASSERT(path);
+    ASSERT(FIB_FORW_CHAIN_TYPE_MPLS_EOS != fct);
+
+    FIB_PATH_DBG(path, "contribute");
+
+    /*
+     * The DPO stored in the path was created when the path was resolved.
+     * This then represents the path's 'native' protocol; IP.
+     * For all others will need to go find something else.
+     */
+    if (fib_path_proto_to_chain_type(path->fp_nh_proto) == fct)
+    {
+       dpo_copy(dpo, &path->fp_dpo);
+    }
+    else {
+       switch (path->fp_type)
+       {
+       case FIB_PATH_TYPE_ATTACHED_NEXT_HOP:
+           switch (fct)
+           {
+           case FIB_FORW_CHAIN_TYPE_UNICAST_IP4:
+           case FIB_FORW_CHAIN_TYPE_UNICAST_IP6:
+           case FIB_FORW_CHAIN_TYPE_MPLS_EOS:
+           case FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS:
+           {
+               adj_index_t ai;
+
+               /*
+                * get an adj whose link type matches the requested
+                * chain type.
+                */
+               ai = fib_path_attached_next_hop_get_adj(
+                        path,
+                        fib_forw_chain_type_to_link_type(fct));
+               dpo_set(dpo, DPO_ADJACENCY,
+                       fib_forw_chain_type_to_dpo_proto(fct), ai);
+               /*
+                * NOTE(review): presumably the adj was returned locked
+                * and dpo_set took its own reference, so the lock
+                * obtained above is released here - confirm.
+                */
+               adj_unlock(ai);
+
+               break;
+           }
+           }
+           break;
+       case FIB_PATH_TYPE_RECURSIVE:
+           switch (fct)
+           {
+           case FIB_FORW_CHAIN_TYPE_MPLS_EOS:
+           case FIB_FORW_CHAIN_TYPE_UNICAST_IP4:
+           case FIB_FORW_CHAIN_TYPE_UNICAST_IP6:
+               /*
+                * Assume that EOS and IP forwarding is the same.
+                * revisit for ieBGP
+                */
+               dpo_copy(dpo, &path->fp_dpo);
+               break;
+           case FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS:
+               fib_path_recursive_adj_update(path, fct, dpo);
+               break;
+           }
+           break;
+       case FIB_PATH_TYPE_DEAG:
+           switch (fct)
+           {
+           case FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS:
+               /*
+                * non-EOS: contribute a lookup in the default MPLS table
+                */
+                lookup_dpo_add_or_lock_w_table_id(MPLS_FIB_DEFAULT_TABLE_ID,
+                                                  DPO_PROTO_MPLS,
+                                                  LOOKUP_INPUT_DST_ADDR,
+                                                  LOOKUP_TABLE_FROM_CONFIG,
+                                                  dpo);
+                break;
+           case FIB_FORW_CHAIN_TYPE_UNICAST_IP4:
+           case FIB_FORW_CHAIN_TYPE_UNICAST_IP6:
+           case FIB_FORW_CHAIN_TYPE_MPLS_EOS:
+               dpo_copy(dpo, &path->fp_dpo);
+               break;          
+            }
+            break;
+       case FIB_PATH_TYPE_EXCLUSIVE:
+           /*
+            * the user provided DPO is contributed whatever the chain type
+            */
+           dpo_copy(dpo, &path->exclusive.fp_ex_dpo);
+           break;
+        case FIB_PATH_TYPE_ATTACHED:
+       case FIB_PATH_TYPE_RECEIVE:
+       case FIB_PATH_TYPE_SPECIAL:
+           /*
+            * these types are not expected to be asked for a chain type
+            * other than their own. 'dpo' is not written in this case.
+            */
+           ASSERT(0);
+            break;
+       }
+
+    }
+}
+
+/*
+ * fib_path_append_nh_for_multipath_hash
+ *
+ * If the path is resolved, append an element describing it (weight,
+ * path index and a copy of its DPO) to the hash_key vector. An unresolved
+ * path adds nothing. The chain type argument is not consulted.
+ *
+ * Returns the (possibly moved) vector.
+ */
+load_balance_path_t *
+fib_path_append_nh_for_multipath_hash (fib_node_index_t path_index,
+                                      fib_forward_chain_type_t fct,
+                                      load_balance_path_t *hash_key)
+{
+    fib_path_t *path = fib_path_get(path_index);
+    load_balance_path_t *mnh;
+
+    ASSERT(path);
+
+    if (!fib_path_is_resolved(path_index))
+    {
+       return (hash_key);
+    }
+
+    vec_add2(hash_key, mnh, 1);
+
+    mnh->path_index = path_index;
+    mnh->path_weight = path->fp_weight;
+    dpo_copy(&mnh->path_dpo, &path->fp_dpo);
+
+    return (hash_key);
+}
+
+/*
+ * fib_path_is_recursive
+ *
+ * Non-zero if the path is of the recursive type.
+ */
+int
+fib_path_is_recursive (fib_node_index_t path_index)
+{
+    fib_path_t *path = fib_path_get(path_index);
+
+    return (path->fp_type == FIB_PATH_TYPE_RECURSIVE);
+}
+
+/*
+ * fib_path_is_exclusive
+ *
+ * Non-zero if the path is of the exclusive type.
+ */
+int
+fib_path_is_exclusive (fib_node_index_t path_index)
+{
+    fib_path_t *path = fib_path_get(path_index);
+
+    return (path->fp_type == FIB_PATH_TYPE_EXCLUSIVE);
+}
+
+/*
+ * fib_path_is_deag
+ *
+ * Non-zero if the path is of the deag type.
+ */
+int
+fib_path_is_deag (fib_node_index_t path_index)
+{
+    fib_path_t *path = fib_path_get(path_index);
+
+    return (path->fp_type == FIB_PATH_TYPE_DEAG);
+}
+
+/*
+ * fib_path_is_resolved
+ *
+ * A path counts as resolved only if all of the following hold: its DPO
+ * is valid, its resolved flag is set, it is not looped and it is not a
+ * permanent drop. Returns 1 if so, 0 otherwise.
+ */
+int
+fib_path_is_resolved (fib_node_index_t path_index)
+{
+    fib_path_t *path = fib_path_get(path_index);
+
+    if (!dpo_id_is_valid(&path->fp_dpo))
+    {
+       return (0);
+    }
+    if (!(path->fp_oper_flags & FIB_PATH_OPER_FLAG_RESOLVED))
+    {
+       return (0);
+    }
+    if (fib_path_is_looped(path_index))
+    {
+       return (0);
+    }
+
+    return (!fib_path_is_permanent_drop(path));
+}
+
+/*
+ * fib_path_is_looped
+ *
+ * Non-zero if the path carries the recursive-loop flag. The value
+ * returned is the masked flag itself, not a normalised 0/1.
+ */
+int
+fib_path_is_looped (fib_node_index_t path_index)
+{
+    fib_path_t *path = fib_path_get(path_index);
+
+    return (path->fp_oper_flags & FIB_PATH_OPER_FLAG_RECURSIVE_LOOP);
+}
+
+/*
+ * fib_path_module_init
+ *
+ * Module initialisation: register the path node type, together with its
+ * virtual function table, with the FIB node infrastructure.
+ */
+void
+fib_path_module_init (void)
+{
+    fib_node_register_type (FIB_NODE_TYPE_PATH, &fib_path_vft);
+}
+
+/*
+ * show_fib_path_command
+ *
+ * CLI handler for "show fib paths": prints a heading followed by every
+ * path currently allocated in the path pool. The input and cmd arguments
+ * are not consulted. Always returns NULL.
+ */
+static clib_error_t *
+show_fib_path_command (vlib_main_t * vm,
+                       unformat_input_t * input,
+                       vlib_cli_command_t * cmd)
+{
+    fib_path_t *path;
+
+    /* the pool walked below holds paths, not path-lists */
+    vlib_cli_output (vm, "FIB Paths");
+    pool_foreach(path, fib_path_pool,
+    ({
+       vlib_cli_output (vm, "%U", format_fib_path, path);
+    }));
+
+    return (NULL);
+}
+
+/*
+ * Registration of the "show fib paths" CLI command, handled by
+ * show_fib_path_command.
+ */
+VLIB_CLI_COMMAND (show_fib_path, static) = {
+  .path = "show fib paths",
+  .function = show_fib_path_command,
+  .short_help = "show fib paths",
+};
diff --git a/vnet/vnet/fib/fib_path.h b/vnet/vnet/fib/fib_path.h
new file mode 100644 (file)
index 0000000..16ca358
--- /dev/null
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Given a route of the form;
+ *   q.r.s.t/Y
+ *     via <interface> <next-hop>
+ *
+ * The prefix is: q.r.s.t/Y
+ * The path is: 'via <interface> <next-hop>'
+ *
+ * The path is the description of where to send the traffic, and the
+ * prefix is a description of which traffic to send.
+ * It is the aim of the FIB to resolve the path, i.e. to find the corresponding
+ * adjacency to match the path's description.
+ */
+
+#ifndef __FIB_PATH_H__
+#define __FIB_PATH_H__
+
+#include <vnet/ip/ip.h>
+#include <vnet/dpo/load_balance.h>
+
+#include <vnet/fib/fib_types.h>
+#include <vnet/adj/adj_types.h>
+
+/**
+ * Enumeration of path configuration attributes.
+ * Each attribute's value is also the bit position of the corresponding
+ * flag in fib_path_cfg_flags_t.
+ */
+typedef enum fib_path_cfg_attribute_t_ {
+    /**
+     * Marker. Add new types after this one.
+     */
+    FIB_PATH_CFG_ATTRIBUTE_FIRST = 0,
+    /**
+     * The path is forced to a drop, whatever the next-hop info says.
+     * something somewhere knows better...
+     */
+    FIB_PATH_CFG_ATTRIBUTE_DROP = FIB_PATH_CFG_ATTRIBUTE_FIRST,
+    /**
+     * The path uses an adj that is exclusive. I.e. it is known only by
+     * the source of the route.
+     */
+    FIB_PATH_CFG_ATTRIBUTE_EXCLUSIVE,
+    /**
+     * Recursion constraint via host
+     */
+    FIB_PATH_CFG_ATTRIBUTE_RESOLVE_HOST,
+    /**
+     * Recursion constraint via attached
+     */
+    FIB_PATH_CFG_ATTRIBUTE_RESOLVE_ATTACHED,
+    /**
+     * The path is a for-us path
+     */
+    FIB_PATH_CFG_ATTRIBUTE_LOCAL,
+    /**
+     * Marker. Add new types before this one, then update it.
+     */
+    FIB_PATH_CFG_ATTRIBUTE_LAST = FIB_PATH_CFG_ATTRIBUTE_LOCAL,
+} __attribute__ ((packed)) fib_path_cfg_attribute_t;
+
+/**
+ * The number of path configuration attributes; one more than the value
+ * of the last attribute.
+ */
+#define FIB_PATH_CFG_ATTRIBUTE_MAX (FIB_PATH_CFG_ATTRIBUTE_LAST + 1)
+
+/**
+ * Initialiser for an array of attribute names, indexed by attribute value.
+ * A new attribute needs a name added here.
+ */
+#define FIB_PATH_CFG_ATTRIBUTES {                      \
+    [FIB_PATH_CFG_ATTRIBUTE_DROP]  = "drop",           \
+    [FIB_PATH_CFG_ATTRIBUTE_EXCLUSIVE] = "exclusive",  \
+    [FIB_PATH_CFG_ATTRIBUTE_RESOLVE_HOST] = "resolve-host", \
+    [FIB_PATH_CFG_ATTRIBUTE_RESOLVE_ATTACHED] = "resolve-attached", \
+    [FIB_PATH_CFG_ATTRIBUTE_LOCAL] = "local",          \
+}
+
+/**
+ * Loop header that steps _item through every attribute, from
+ * FIB_PATH_CFG_ATTRIBUTE_FIRST to FIB_PATH_CFG_ATTRIBUTE_LAST inclusive.
+ */
+#define FOR_EACH_FIB_PATH_CFG_ATTRIBUTE(_item) \
+    for (_item = FIB_PATH_CFG_ATTRIBUTE_FIRST; \
+        _item <= FIB_PATH_CFG_ATTRIBUTE_LAST; \
+        _item++)
+
+/**
+ * Path config flags from the attributes.
+ * Each flag is the single bit at the position given by the value of the
+ * attribute of the same name.
+ */
+typedef enum fib_path_cfg_flags_t_ {
+    FIB_PATH_CFG_FLAG_NONE  = 0,
+    FIB_PATH_CFG_FLAG_DROP  = (1 << FIB_PATH_CFG_ATTRIBUTE_DROP),
+    FIB_PATH_CFG_FLAG_EXCLUSIVE = (1 << FIB_PATH_CFG_ATTRIBUTE_EXCLUSIVE),
+    FIB_PATH_CFG_FLAG_RESOLVE_HOST = (1 << FIB_PATH_CFG_ATTRIBUTE_RESOLVE_HOST),
+    FIB_PATH_CFG_FLAG_RESOLVE_ATTACHED = (1 << FIB_PATH_CFG_ATTRIBUTE_RESOLVE_ATTACHED),
+    FIB_PATH_CFG_FLAG_LOCAL = (1 << FIB_PATH_CFG_ATTRIBUTE_LOCAL),
+} __attribute__ ((packed)) fib_path_cfg_flags_t;
+
+
+/*
+ * Formatting
+ */
+extern u8 *fib_path_format(fib_node_index_t pi, u8 *s);
+extern u8 *fib_path_adj_format(fib_node_index_t pi,
+                              u32 indent,
+                              u8 *s);
+
+extern u8 * format_fib_path(u8 * s, va_list * args);
+
+/*
+ * Creation, copy and destruction. Each create/copy returns the index of
+ * the new path.
+ */
+extern fib_node_index_t fib_path_create(fib_node_index_t pl_index,
+                                       fib_protocol_t nh_proto,
+                                       fib_path_cfg_flags_t flags,
+                                       const fib_route_path_t *path);
+extern fib_node_index_t fib_path_create_special(fib_node_index_t pl_index,
+                                               fib_protocol_t nh_proto,
+                                               fib_path_cfg_flags_t flags,
+                                               const dpo_id_t *dpo);
+extern fib_node_index_t fib_path_copy(fib_node_index_t path_index,
+                                     fib_node_index_t path_list_index);
+extern void fib_path_destroy(fib_node_index_t path_index);
+
+/*
+ * Comparison and hashing. The comparison functions return 0 when the
+ * two things compared are equal.
+ */
+extern int fib_path_cmp(fib_node_index_t path_index1,
+                       fib_node_index_t path_index2);
+extern int fib_path_cmp_for_sort(void * a1, void * a2);
+extern int fib_path_cmp_w_route_path(fib_node_index_t path_index,
+                                    const fib_route_path_t *rpath);
+extern uword fib_path_hash(fib_node_index_t path_index);
+
+/*
+ * Resolution and loop detection
+ */
+extern int fib_path_resolve(fib_node_index_t path_index);
+extern int fib_path_recursive_loop_detect(fib_node_index_t path_index,
+                                         fib_node_index_t **entry_indicies);
+
+/*
+ * Path state and properties
+ */
+extern int fib_path_is_resolved(fib_node_index_t path_index);
+extern int fib_path_is_recursive(fib_node_index_t path_index);
+extern int fib_path_is_exclusive(fib_node_index_t path_index);
+extern int fib_path_is_deag(fib_node_index_t path_index);
+extern int fib_path_is_looped(fib_node_index_t path_index);
+extern adj_index_t fib_path_get_adj(fib_node_index_t path_index);
+extern u32 fib_path_get_resolving_interface(fib_node_index_t path_index);
+extern int fib_path_get_weight(fib_node_index_t path_index);
+
+/*
+ * Forwarding contribution
+ */
+extern load_balance_path_t * fib_path_append_nh_for_multipath_hash(
+    fib_node_index_t path_index,
+    fib_forward_chain_type_t fct,
+    load_balance_path_t *hash_key);
+extern void fib_path_contribute_forwarding(fib_node_index_t path_index,
+                                          fib_forward_chain_type_t type,
+                                          dpo_id_t *dpo);
+
+/*
+ * Module initialisation
+ */
+extern void fib_path_module_init(void);
+
+#endif
diff --git a/vnet/vnet/fib/fib_path_ext.c b/vnet/vnet/fib/fib_path_ext.c
new file mode 100644 (file)
index 0000000..f40c749
--- /dev/null
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/mpls/mpls.h>
+#include <vnet/dpo/mpls_label_dpo.h>
+#include <vnet/dpo/load_balance.h>
+#include <vnet/dpo/drop_dpo.h>
+
+#include "fib_path_ext.h"
+#include "fib_path.h"
+#include "fib_path_list.h"
+#include "fib_internal.h"
+
+/*
+ * format_fib_path_ext
+ *
+ * Format function for a path extension. Takes one variadic argument, a
+ * fib_path_ext_t pointer, and appends the index of the extended path
+ * and the extension's label to s.
+ */
+u8 *
+format_fib_path_ext (u8 * s, va_list * args)
+{
+    fib_path_ext_t *path_ext = va_arg (*args, fib_path_ext_t *);
+
+    return (format(s, "path:%d label:%U",
+                   path_ext->fpe_path_index,
+                   format_mpls_unicast_label,
+                   path_ext->fpe_path.frp_label));
+}
+
+/*
+ * fib_path_ext_cmp
+ *
+ * Compare the route-path description held in the extension against
+ * rpath. Returns whatever fib_route_path_cmp returns for the pair.
+ * NOTE(review): presumably 0 means the descriptions match - confirm.
+ */
+int
+fib_path_ext_cmp (fib_path_ext_t *path_ext,
+                 const fib_route_path_t *rpath)
+{
+    return (fib_route_path_cmp(&path_ext->fpe_path, rpath));
+}
+
+/*
+ * fib_path_ext_match
+ *
+ * Path-list walk callback. ctx is the path extension being resolved.
+ * When the walked path matches the extension's route-path description
+ * the path's index is recorded in the extension and 0 is returned;
+ * otherwise 1 is returned to keep the walk going.
+ */
+static int
+fib_path_ext_match (fib_node_index_t pl_index,
+                   fib_node_index_t path_index,
+                   void *ctx)
+{
+    fib_path_ext_t *path_ext = ctx;
+
+    if (0 != fib_path_cmp_w_route_path(path_index,
+                                       &path_ext->fpe_path))
+    {
+       /* not the path being extended; keep going */
+       return (1);
+    }
+
+    path_ext->fpe_path_index = path_index;
+
+    return (0);
+}
+
+/*
+ * fib_path_ext_resolve
+ *
+ * (Re)bind the extension to a path on the given path-list. The
+ * extension's path index is first invalidated, then the path-list is
+ * walked with fib_path_ext_match, which records the index of a matching
+ * path. If nothing matches the index is left invalid.
+ */
+void
+fib_path_ext_resolve (fib_path_ext_t *path_ext,
+                     fib_node_index_t path_list_index)
+{
+    /*
+     * Find the path on the path list that this is an extension for
+     */
+    path_ext->fpe_path_index = FIB_NODE_INDEX_INVALID;
+    fib_path_list_walk(path_list_index,
+                      fib_path_ext_match,
+                      path_ext);
+}
+
+/*
+ * fib_path_ext_init
+ *
+ * Initialise a path extension: take a copy of the route-path description
+ * and resolve the extension against the given path-list.
+ */
+void
+fib_path_ext_init (fib_path_ext_t *path_ext,
+                  fib_node_index_t path_list_index,
+                  const fib_route_path_t *rpath)
+{
+    path_ext->fpe_path = *rpath;
+    /* fib_path_ext_resolve also starts by invalidating the index */
+    path_ext->fpe_path_index = FIB_NODE_INDEX_INVALID;
+
+    fib_path_ext_resolve(path_ext, path_list_index);
+}
+
+/*
+ * fib_path_ext_stack
+ *
+ * Append to the nhs vector the forwarding that results from applying
+ * this path extension to the path it extends, for the chain type
+ * parent_fct. Unless the label is the implicit-null, an MPLS label DPO
+ * is stacked on the DPO the path contributes.
+ *
+ * Nothing is appended if the extended path is not resolved, if the chain
+ * type is not one of those handled below, or if the path contributes
+ * a drop.
+ *
+ * Returns the (possibly moved) vector.
+ */
+load_balance_path_t *
+fib_path_ext_stack (fib_path_ext_t *path_ext,
+                   fib_forward_chain_type_t parent_fct,
+                   load_balance_path_t *nhs)
+{
+    fib_forward_chain_type_t child_fct;
+    load_balance_path_t *nh;
+
+    if (!fib_path_is_resolved(path_ext->fpe_path_index))
+       return (nhs);
+
+    /*
+     * Since we are stacking this path-extension, it must have a valid out
+     * label. From the chain type requested of us (parent_fct), determine
+     * the chain type we will request from the path (child_fct).
+     */
+    switch (parent_fct)
+    {
+    case FIB_FORW_CHAIN_TYPE_MPLS_EOS:
+       /*
+        * NOTE(review): there is no break after the ASSERT; if the ASSERT
+        * does not stop execution, control continues into the IP cases
+        * below. presumably intentional - confirm.
+        */
+       ASSERT(0);
+    case FIB_FORW_CHAIN_TYPE_UNICAST_IP4:
+    case FIB_FORW_CHAIN_TYPE_UNICAST_IP6:
+       if (MPLS_IETF_IMPLICIT_NULL_LABEL == path_ext->fpe_label)
+       {
+            /*
+             * implicit-null label for the eos or IP chain, need to pick up
+             * the IP adj
+             */
+           child_fct = parent_fct;
+       }
+        else
+        {
+            /*
+             * we have a label to stack. packets will thus be labelled when
+             * they encounter the child, ergo, non-eos.
+             */
+           child_fct = FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS;
+        }
+       break;
+    case FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS:
+        child_fct = parent_fct;
+       break;
+    default:
+        return (nhs);
+       break;
+    }
+
+    dpo_id_t via_dpo = DPO_NULL;
+
+    /*
+     * The next object in the graph after the imposition of the label
+     * will be the DPO contributed by the path through which the packets
+     * are to be sent. We stack the MPLS Label DPO on this path DPO
+     */
+    fib_path_contribute_forwarding(path_ext->fpe_path_index,
+                                  child_fct,
+                                  &via_dpo);
+
+    if (dpo_is_drop(&via_dpo) ||
+       load_balance_is_drop(&via_dpo))
+    {
+       /*
+        * don't stack a path extension on a drop. doing so will create
+        * a LB bucket entry on drop, and we will lose a percentage of traffic.
+        */
+    }
+    else
+    {
+       vec_add2(nhs, nh, 1);
+       nh->path_weight = fib_path_get_weight(path_ext->fpe_path_index);
+       nh->path_index = path_ext->fpe_path_index;
+       dpo_copy(&nh->path_dpo, &via_dpo);
+
+       /*
+        * The label is stackable for this chain type
+        * construct the mpls header that will be imposed in the data-path
+        *
+        * the label DPO is created with the path's DPO (currently held
+        * in nh->path_dpo) passed as its last argument; nh->path_dpo is
+        * then overwritten to refer to the new label DPO.
+        * NOTE(review): 255 and 0 are presumably the TTL and EXP values
+        * for the imposed label - confirm against mpls_label_dpo_create.
+        */
+       if (MPLS_IETF_IMPLICIT_NULL_LABEL != path_ext->fpe_label)
+       {
+           dpo_set(&nh->path_dpo,
+                   DPO_MPLS_LABEL,
+                   DPO_PROTO_MPLS,
+                   mpls_label_dpo_create(path_ext->fpe_label,
+                                          (parent_fct == FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS ?
+                                           MPLS_NON_EOS :
+                                           MPLS_EOS),
+                                          255, 0,
+                                          &nh->path_dpo));
+       }
+    }
+    /*
+     * done with the temporary DPO the path contributed
+     */
+    dpo_reset(&via_dpo);
+
+    return (nhs);
+}
diff --git a/vnet/vnet/fib/fib_path_ext.h b/vnet/vnet/fib/fib_path_ext.h
new file mode 100644 (file)
index 0000000..949b1e2
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FIB_PATH_EXT_H__
+#define __FIB_PATH_EXT_H__
+
+#include <vnet/mpls/mpls.h>
+#include <vnet/fib/fib_types.h>
+
+/**
+ * A path extension is a per-entry addition to the forwarding information
+ * when packets are sent for that entry over that path.
+ *
+ * For example:
+ *    ip route add 1.1.1.1/32 via 10.10.10.10 mpls-label 100
+ *
+ * The out-going MPLS label value 100 is a path-extension. It is a value specific
+ * to the entry 1.1.1.1/32 and valid only when packets are sent via 10.10.10.10.
+ */
+typedef struct fib_path_ext_t_
+{
+    /**
+     * A description of the path that is being extended.
+     * This description is used to match this extension with the [changing]
+     * instance of a fib_path_t that is extended
+     */
+    fib_route_path_t fpe_path;
+    /*
+     * Shorthand: the extension's label is the label stored in the
+     * route-path description above.
+     */
+#define fpe_label fpe_path.frp_label
+
+    /**
+     * The index of the path. This is the global index, not the path's
+     * position in the path-list.
+     */
+    fib_node_index_t fpe_path_index;
+} fib_path_ext_t;
+
+
+/**
+ * Format a path extension. The signature is the vppinfra format-function
+ * shape (u8 *s, va_list *args); presumably intended for use with %U.
+ */
+extern u8 * format_fib_path_ext(u8 * s, va_list * args);
+
+/**
+ * Initialise a path extension from a route-path description.
+ * NOTE(review): the definition is not shown here; presumably this also
+ * resolves the extension against the given path-list - confirm.
+ */
+extern void fib_path_ext_init(fib_path_ext_t *path_ext,
+                             fib_node_index_t path_list_index,
+                             const fib_route_path_t *rpath);
+
+/**
+ * Compare the path description held by the extension with a route-path.
+ * NOTE(review): presumably returns 0 on a match - confirm against the
+ * definition.
+ */
+extern int fib_path_ext_cmp(fib_path_ext_t *path_ext,
+                           const fib_route_path_t *rpath);
+
+/**
+ * (Re)bind the extension to a path in the given path-list.
+ * NOTE(review): presumably this updates fpe_path_index - confirm.
+ */
+extern void fib_path_ext_resolve(fib_path_ext_t *path_ext,
+                                fib_node_index_t path_list_index);
+
+/**
+ * Stack the extension on the forwarding contributed by its path.
+ *
+ * @param path_ext the extension to stack
+ * @param fct      the forwarding chain type being built
+ * @param nhs      vector of load-balance paths to append to
+ *
+ * @return the nhs vector, which may have been re-allocated. Nothing is
+ *         appended when the chain type is not handled or when the path
+ *         contributes a drop.
+ */
+extern load_balance_path_t *fib_path_ext_stack(fib_path_ext_t *path_ext,
+                                               fib_forward_chain_type_t fct,
+                                               load_balance_path_t *nhs);
+
+#endif
+
diff --git a/vnet/vnet/fib/fib_path_list.c b/vnet/vnet/fib/fib_path_list.c
new file mode 100644 (file)
index 0000000..1df7396
--- /dev/null
@@ -0,0 +1,1100 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vppinfra/mhash.h>
+#include <vnet/ip/ip.h>
+#include <vnet/adj/adj.h>
+#include <vnet/dpo/load_balance.h>
+#include <vnet/dpo/load_balance_map.h>
+
+#include <vnet/fib/fib_path_list.h>
+#include <vnet/fib/fib_internal.h>
+#include <vnet/fib/fib_node_list.h>
+#include <vnet/fib/fib_walk.h>
+
+/**
+ * FIB path-list
+ * A representation of the list/set of paths through which a prefix is
+ * reachable
+ */
+typedef struct fib_path_list_t_ {
+    /**
+     * A path-list is a node in the FIB graph.
+     * This is the first member; code in this file casts between
+     * fib_node_t* and fib_path_list_t*.
+     */
+    fib_node_t fpl_node;
+
+    /**
+     * Flags on the path-list
+     */
+    fib_path_list_flags_t fpl_flags;
+
+    /**
+     * The next-hop protocol for the paths in this path list.
+     * Note that fixing the proto here means we don't support a mix of
+     * v4 and v6 paths. ho hum.
+     */
+    fib_protocol_t fpl_nh_proto;
+
+    /**
+     * Vector of path indices for all configured paths.
+     * For shareable path-lists this list MUST not change.
+     */
+    fib_node_index_t *fpl_paths;
+} fib_path_list_t;
+
+/*
+ * Array of strings/names for the path-list attributes
+ */
+static const char *fib_path_list_attr_names[] = FIB_PATH_LIST_ATTRIBUTES;
+
+/*
+ * The memory pool from which we allocate all the path-lists
+ */
+static fib_path_list_t * fib_path_list_pool;
+
+/*
+ * The data-base of shared path-lists
+ */
+static uword *fib_path_list_db;
+
+/*
+ * Debug macro
+ */
+#ifdef FIB_DEBUG
+#define FIB_PATH_LIST_DBG(_pl, _fmt, _args...)           \
+{                                                        \
+    u8 *_tmp = 0;                                        \
+    _tmp = fib_path_list_format(                         \
+       fib_path_list_get_index(_pl), _tmp);              \
+    clib_warning("pl:[%d:%p:%p:%s]:" _fmt,               \
+                fib_path_list_get_index(_pl),            \
+                _pl, _pl->fpl_paths, _tmp,               \
+                ##_args);                                \
+    vec_free(_tmp);                                      \
+}
+#else
+#define FIB_PATH_LIST_DBG(_pl, _fmt, _args...)
+#endif
+
+/**
+ * Fetch the path-list object stored at the given index in the pool.
+ */
+static fib_path_list_t *
+fib_path_list_get (fib_node_index_t index)
+{
+    fib_path_list_t *pl;
+
+    pl = pool_elt_at_index(fib_path_list_pool, index);
+
+    return (pl);
+}
+
+/**
+ * Graph-node accessor: return the path-list at the given index viewed as
+ * a generic FIB node. Installed as the fnv_get function of the VFT.
+ */
+static fib_node_t *
+fib_path_list_get_node (fib_node_index_t index)
+{
+    fib_path_list_t *pl = fib_path_list_get(index);
+
+    return ((fib_node_t*) pl);
+}
+
+/**
+ * Down-cast a generic FIB node to the path-list that contains it.
+ * In debug images the node's type is checked first.
+ */
+static fib_path_list_t*
+fib_path_list_from_fib_node (fib_node_t *node)
+{
+    fib_path_list_t *pl = (fib_path_list_t*) node;
+
+#if CLIB_DEBUG > 0
+    ASSERT(FIB_NODE_TYPE_PATH_LIST == node->fn_type);
+#endif
+
+    return (pl);
+}
+
+/**
+ * Convert a path-list pointer into its index, i.e. its offset from the
+ * base of the path-list pool.
+ */
+static fib_node_index_t
+fib_path_list_get_index (fib_path_list_t *path_list)
+{
+    fib_node_index_t index = path_list - fib_path_list_pool;
+
+    return (index);
+}
+
+/**
+ * Format function for a path-list.
+ * Takes one variadic argument: a fib_path_list_t pointer.
+ * Output is the index, lock count and next-hop protocol, then the names
+ * of any flags that are set, then each path on a line of its own.
+ */
+static u8 *
+format_fib_path_list (u8 * s, va_list * args)
+{
+    fib_path_list_t *pl = va_arg (*args, fib_path_list_t *);
+    fib_path_list_attribute_t attr;
+    u32 ii;
+
+    /* the fixed header */
+    s = format (s, "    index:%u", fib_path_list_get_index(pl));
+    s = format (s, " locks:%u", pl->fpl_node.fn_locks);
+    s = format (s, " proto:%U", format_fib_protocol, pl->fpl_nh_proto);
+
+    /* one name per flag bit that is set */
+    if (FIB_PATH_LIST_FLAG_NONE != pl->fpl_flags)
+    {
+        s = format (s, " flags:");
+        FOR_EACH_PATH_LIST_ATTRIBUTE(attr)
+        {
+            if (pl->fpl_flags & (1<<attr))
+            {
+                s = format (s, "%s,", fib_path_list_attr_names[attr]);
+            }
+        }
+    }
+
+    /* and then each of the paths */
+    vec_foreach_index (ii, pl->fpl_paths)
+    {
+        s = fib_path_format(pl->fpl_paths[ii], s);
+        s = format(s, "\n");
+    }
+
+    return (s);
+}
+
+/**
+ * Append the adjacency description of every path in the path-list to
+ * the vector s.
+ *
+ * @param path_list_index index of the path-list
+ * @param indent          passed through to the per-path formatter
+ * @param s               vector to append to
+ * @return the (possibly re-allocated) vector
+ */
+u8 *
+fib_path_list_adjs_format (fib_node_index_t path_list_index,
+                          u32 indent,
+                          u8 * s)
+{
+    fib_path_list_t *pl = fib_path_list_get(path_list_index);
+    fib_node_index_t *path_index;
+
+    vec_foreach (path_index, pl->fpl_paths)
+    {
+        s = fib_path_adj_format(*path_index, indent, s);
+    }
+
+    return (s);
+}
+
+
+/**
+ * Append a description of the path-list at the given index to s.
+ *
+ * @return the (possibly re-allocated) vector
+ */
+u8 *
+fib_path_list_format (fib_node_index_t path_list_index,
+                     u8 * s)
+{
+    s = format(s, "%U",
+               format_fib_path_list,
+               fib_path_list_get(path_list_index));
+
+    return (s);
+}
+
+/**
+ * Compute the hash of a path-list.
+ * The hash is seeded with the number of paths and then mixed with the
+ * hash of each path in turn, so it depends on the order of the paths.
+ */
+static uword
+fib_path_list_hash (fib_path_list_t *path_list)
+{
+    uword seed, result, one_path_hash;
+    fib_node_index_t *pi;
+
+    ASSERT(path_list);
+
+    seed = vec_len(path_list->fpl_paths);
+    result = seed;
+
+    vec_foreach (pi, path_list->fpl_paths)
+    {
+        one_path_hash = fib_path_hash(*pi);
+#if uword_bits == 64
+        hash_mix64(one_path_hash, seed, result);
+#else
+        hash_mix32(one_path_hash, seed, result);
+#endif
+    }
+
+    return (result);
+}
+
+/**
+ * Encode a path-list index as a DB hash key: the index is doubled and
+ * the low bit is set, which marks the key as an index.
+ */
+always_inline uword
+fib_path_list_db_hash_key_from_index (uword index)
+{
+    return ((index << 1) | 1);
+}
+
+/**
+ * Return non-zero when the DB hash key has its low bit set, i.e. when
+ * it is an encoded index rather than a pointer.
+ */
+always_inline uword
+fib_path_list_db_hash_key_is_index (uword key)
+{
+    const uword low_bit = key & 1;
+
+    return low_bit;
+}
+
+/**
+ * Decode a DB hash key that encodes an index back into that index.
+ */
+always_inline uword
+fib_path_list_db_hash_key_2_index (uword key)
+{
+    ASSERT (fib_path_list_db_hash_key_is_index (key));
+
+    return (key >> 1);
+}
+
+/**
+ * Recover the path-list that a DB hash key refers to.
+ * A key is either an encoded pool index (low bit set) or a raw
+ * path-list pointer.
+ */
+static fib_path_list_t*
+fib_path_list_db_get_from_hash_key (uword key)
+{
+    if (!fib_path_list_db_hash_key_is_index (key))
+    {
+        /* the key is the object's address */
+        return (uword_to_pointer (key, fib_path_list_t *));
+    }
+
+    /* the key is an encoded index into the pool */
+    fib_node_index_t pl_index = fib_path_list_db_hash_key_2_index(key);
+
+    return (fib_path_list_get(pl_index));
+}
+
+/**
+ * Hash-table callback: the hash of a key is the hash of the path-list
+ * the key refers to. The table argument is not used.
+ */
+static uword
+fib_path_list_db_hash_key_sum (hash_t * h,
+                              uword key)
+{
+    return (fib_path_list_hash(
+                fib_path_list_db_get_from_hash_key(key)));
+}
+
+/**
+ * Hash-table callback: decide whether two keys refer to equal
+ * path-lists. The table argument is not used.
+ *
+ * NOTE(review): equality is decided by comparing the two path-list
+ * hashes only; two different path-lists whose hashes collide would be
+ * reported as equal - confirm this is intended.
+ */
+static uword
+fib_path_list_db_hash_key_equal (hash_t * h,
+                                uword key1,
+                                uword key2)
+{
+    uword hash1, hash2;
+
+    hash1 = fib_path_list_hash(fib_path_list_db_get_from_hash_key(key1));
+    hash2 = fib_path_list_hash(fib_path_list_db_get_from_hash_key(key2));
+
+    return (hash1 == hash2);
+}
+
+/**
+ * Search the shared path-list DB for a path-list equal to the one
+ * passed. The lookup key is the path-list's address.
+ *
+ * @return the index stored in the DB for the match, or
+ *         FIB_NODE_INDEX_INVALID if there is none.
+ */
+static fib_node_index_t
+fib_path_list_db_find (fib_path_list_t *path_list)
+{
+    uword *value = hash_get(fib_path_list_db, path_list);
+
+    if (NULL == value)
+    {
+        return (FIB_NODE_INDEX_INVALID);
+    }
+
+    return value[0];
+}
+
+/**
+ * Add the path-list at the given index to the shared path-list DB.
+ * The DB must not already hold an equal path-list (asserted).
+ */
+static void
+fib_path_list_db_insert (fib_node_index_t path_list_index)
+{
+    fib_path_list_t *pl = fib_path_list_get(path_list_index);
+
+    ASSERT(FIB_NODE_INDEX_INVALID == fib_path_list_db_find(pl));
+
+    /* key on the encoded index, store the plain index as the value */
+    hash_set (fib_path_list_db,
+              fib_path_list_db_hash_key_from_index(path_list_index),
+              path_list_index);
+
+    FIB_PATH_LIST_DBG(pl, "DB-inserted");
+}
+
+/**
+ * Remove the path-list at the given index from the shared path-list DB.
+ * The path-list must be present in the DB (asserted).
+ */
+static void
+fib_path_list_db_remove (fib_node_index_t path_list_index)
+{
+    fib_path_list_t *pl = fib_path_list_get(path_list_index);
+
+    ASSERT(FIB_NODE_INDEX_INVALID != fib_path_list_db_find(pl));
+
+    hash_unset(fib_path_list_db,
+               fib_path_list_db_hash_key_from_index(path_list_index));
+
+    FIB_PATH_LIST_DBG(pl, "DB-removed");
+}
+
+/**
+ * Tear down a path-list: destroy each of its paths, free the vector of
+ * path indices, de-initialise the graph node and return the object to
+ * the pool.
+ */
+static void
+fib_path_list_destroy (fib_path_list_t *path_list)
+{
+    u32 ii;
+
+    FIB_PATH_LIST_DBG(path_list, "destroy");
+
+    vec_foreach_index (ii, path_list->fpl_paths)
+    {
+        fib_path_destroy(path_list->fpl_paths[ii]);
+    }
+    vec_free(path_list->fpl_paths);
+
+    fib_node_deinit(&path_list->fpl_node);
+    pool_put(fib_path_list_pool, path_list);
+}
+
+/**
+ * Graph-node callback (fnv_last_lock of the VFT): the node is that of
+ * a path-list which is to be destroyed. A shared path-list is first
+ * removed from the DB.
+ */
+static void
+fib_path_list_last_lock_gone (fib_node_t *node)
+{
+    fib_path_list_t *pl = fib_path_list_from_fib_node(node);
+
+    FIB_PATH_LIST_DBG(pl, "last-lock");
+
+    if (FIB_PATH_LIST_FLAG_SHARED & pl->fpl_flags)
+    {
+        fib_path_list_db_remove(fib_path_list_get_index(pl));
+    }
+
+    fib_path_list_destroy(pl);
+}
+
+/*
+ * fib_path_list_mk_lb
+ *
+ * Build/update the load-balance that this path-list contributes to its
+ * children's forwarding, for the given forwarding chain type. The
+ * result is written to dpo.
+ */
+static void
+fib_path_list_mk_lb (fib_path_list_t *path_list,
+                    fib_forward_chain_type_t type,
+                    dpo_id_t *dpo)
+{
+    load_balance_path_t *nhs = NULL;
+    u32 ii;
+
+    /*
+     * Collect what each path contributes to the multipath set.
+     */
+    vec_foreach_index (ii, path_list->fpl_paths)
+    {
+        nhs = fib_path_append_nh_for_multipath_hash(
+                  path_list->fpl_paths[ii],
+                  type,
+                  nhs);
+    }
+
+    /*
+     * Path-list load-balances, which if used, would be shared and hence
+     * never need a load-balance map.
+     */
+    load_balance_multipath_update(dpo, nhs, LOAD_BALANCE_FLAG_NONE);
+
+    FIB_PATH_LIST_DBG(path_list, "mk lb: %d", dpo->dpoi_index);
+
+    /* the vector was only needed to describe the update */
+    vec_free(nhs);
+}
+
+/*
+ * fib_path_list_back_walk
+ *
+ * Called from one of this path-list's paths to propagate a back walk
+ * to the path-list's children.
+ */
+void
+fib_path_list_back_walk (fib_node_index_t path_list_index,
+                        fib_node_back_walk_ctx_t *ctx)
+{
+    fib_path_list_t *pl = fib_path_list_get(path_list_index);
+
+    if (fib_node_list_get_size(pl->fpl_node.fn_children) > 32)
+    {
+        /*
+         * more than 32 children: hand the walk to the low priority
+         * asynchronous walker
+         */
+        fib_walk_async(FIB_NODE_TYPE_PATH_LIST,
+                       path_list_index,
+                       FIB_WALK_PRIORITY_LOW,
+                       ctx);
+    }
+    else
+    {
+        /*
+         * 32 children or fewer: walk them now, synchronously
+         */
+        fib_walk_sync(FIB_NODE_TYPE_PATH_LIST, path_list_index, ctx);
+    }
+}
+
+/*
+ * fib_path_list_back_walk_notify
+ *
+ * Graph-node callback (fnv_back_walk of the VFT): a back walk has
+ * reached this path-list. This is not expected to be called.
+ */
+static fib_node_back_walk_rc_t
+fib_path_list_back_walk_notify (fib_node_t *node,
+                               fib_node_back_walk_ctx_t *ctx)
+{
+    /*
+     * A path-list is not a direct child of any other node type.
+     * Its paths, which do not change their path-to-list mapping, store
+     * the list they are a member of and invoke the back-walk function
+     * directly. Hence arriving here is a programming error.
+     */
+    ASSERT(0);
+
+    return (FIB_NODE_BACK_WALK_CONTINUE);
+}
+
+/*
+ * The FIB path-list's graph node virtual function table.
+ * It is registered for FIB_NODE_TYPE_PATH_LIST by
+ * fib_path_list_module_init().
+ *  - fnv_get:       map an index to the path-list's fib_node_t
+ *  - fnv_last_lock: destroy the path-list
+ *  - fnv_back_walk: not expected to be invoked; asserts
+ */
+static const fib_node_vft_t fib_path_list_vft = {
+    .fnv_get = fib_path_list_get_node,
+    .fnv_last_lock = fib_path_list_last_lock_gone,
+    .fnv_back_walk = fib_path_list_back_walk_notify,
+};
+
+/**
+ * Allocate a zeroed path-list from the pool and initialise its graph
+ * node.
+ *
+ * @param path_list_index if not NULL, set to the index of the new
+ *                        path-list
+ * @return the new path-list. Taking an element from the pool can move
+ *         the pool, so path-list pointers obtained earlier must be
+ *         re-fetched.
+ */
+static fib_path_list_t *
+fib_path_list_alloc (fib_node_index_t *path_list_index)
+{
+    fib_path_list_t *pl;
+
+    pool_get(fib_path_list_pool, pl);
+    memset(pl, 0, sizeof(*pl));
+
+    fib_node_init(&pl->fpl_node, FIB_NODE_TYPE_PATH_LIST);
+
+    if (path_list_index)
+    {
+        *path_list_index = fib_path_list_get_index(pl);
+    }
+
+    FIB_PATH_LIST_DBG(pl, "alloc");
+
+    return (pl);
+}
+
+/**
+ * Resolve every path in the path-list.
+ *
+ * @return the path-list, re-fetched from the pool; the pointer passed
+ *         in must not be used after this call.
+ */
+static fib_path_list_t *
+fib_path_list_resolve (fib_path_list_t *path_list)
+{
+    fib_node_index_t saved_index, *saved_paths;
+    u32 ii;
+
+    ASSERT(!(path_list->fpl_flags & FIB_PATH_LIST_FLAG_RESOLVED));
+
+    /*
+     * Resolution is recursive: it can create more path-lists, which
+     * can move the pool this path-list lives in. So remember the index
+     * and iterate over a private copy of the paths. This runs once per
+     * path-list, so the copy costs little.
+     */
+    saved_index = fib_path_list_get_index(path_list);
+    saved_paths = vec_dup(path_list->fpl_paths);
+
+    vec_foreach_index (ii, saved_paths)
+    {
+        fib_path_resolve(saved_paths[ii]);
+    }
+    vec_free(saved_paths);
+
+    /* the pool may have moved; fetch the object again */
+    path_list = fib_path_list_get(saved_index);
+
+    FIB_PATH_LIST_DBG(path_list, "resovled");
+
+    return (path_list);
+}
+
+/**
+ * Return the resolving interface of the first path in the path-list
+ * that reports one.
+ *
+ * @return the sw_if_index, or ~0 if no path reports one.
+ */
+u32
+fib_path_list_get_resolving_interface (fib_node_index_t path_list_index)
+{
+    fib_path_list_t *pl = fib_path_list_get(path_list_index);
+    u32 sw_if_index = ~0;
+    u32 ii;
+
+    vec_foreach_index (ii, pl->fpl_paths)
+    {
+        sw_if_index = fib_path_get_resolving_interface(pl->fpl_paths[ii]);
+
+        if (~0 != sw_if_index)
+        {
+            /* first path with a valid interface wins */
+            break;
+        }
+    }
+
+    return (sw_if_index);
+}
+
+/**
+ * Return non-zero if the path-list has the LOOPED flag set.
+ * The value returned is the masked flag, not a normalised 0/1.
+ */
+int
+fib_path_list_is_looped (fib_node_index_t path_list_index)
+{
+    fib_path_list_t *pl = fib_path_list_get(path_list_index);
+
+    return (pl->fpl_flags & FIB_PATH_LIST_FLAG_LOOPED);
+}
+
+/**
+ * Translate path-list flags into the path configuration flags that the
+ * list's paths are created with. Only LOCAL, DROP and EXCLUSIVE are
+ * carried over.
+ */
+static fib_path_cfg_flags_t 
+fib_path_list_flags_2_path_flags (fib_path_list_flags_t plf)
+{
+    fib_path_cfg_flags_t path_flags = FIB_PATH_CFG_FLAG_NONE;
+
+    if (FIB_PATH_LIST_FLAG_EXCLUSIVE & plf)
+    {
+        path_flags |= FIB_PATH_CFG_FLAG_EXCLUSIVE;
+    }
+    if (FIB_PATH_LIST_FLAG_DROP & plf)
+    {
+        path_flags |= FIB_PATH_CFG_FLAG_DROP;
+    }
+    if (FIB_PATH_LIST_FLAG_LOCAL & plf)
+    {
+        path_flags |= FIB_PATH_CFG_FLAG_LOCAL;
+    }
+
+    return (path_flags);
+}
+
+/**
+ * Sanitise the flags requested for a path-list: drop and exclusive
+ * path-lists are never shared, so SHARED is cleared when either of
+ * those flags is present.
+ */
+static fib_path_list_flags_t
+fib_path_list_flags_fixup (fib_path_list_flags_t flags)
+{
+    if (flags & (FIB_PATH_LIST_FLAG_DROP | FIB_PATH_LIST_FLAG_EXCLUSIVE))
+    {
+        flags &= ~FIB_PATH_LIST_FLAG_SHARED;
+    }
+
+    return (flags);
+}
+
+/**
+ * Create a path-list from a vector of route-paths.
+ *
+ * @param flags  flags for the new path-list. SHARED is ignored for drop
+ *               and exclusive path-lists.
+ * @param rpaths vector of route-path descriptions; must hold at least
+ *               one element, the first of which determines the next-hop
+ *               protocol of the path-list.
+ *
+ * @return the index of the path-list to use. When SHARED is requested
+ *         and an equal path-list is already in the DB, the new one is
+ *         destroyed and the index of the existing one is returned.
+ */
+fib_node_index_t
+fib_path_list_create (fib_path_list_flags_t flags,
+                     const fib_route_path_t *rpaths)
+{
+    fib_node_index_t path_list_index, old_path_list_index;
+    fib_path_list_t *path_list;
+    int i;
+
+    flags = fib_path_list_flags_fixup(flags);
+    path_list = fib_path_list_alloc(&path_list_index);
+    path_list->fpl_flags = flags;
+    /*
+     * we'll assume for now all paths are the same next-hop protocol
+     */
+    path_list->fpl_nh_proto = rpaths[0].frp_proto;
+
+    vec_foreach_index(i, rpaths)
+    {
+        vec_add1(path_list->fpl_paths,
+                 fib_path_create(path_list_index,
+                                 path_list->fpl_nh_proto,
+                                 fib_path_list_flags_2_path_flags(flags),
+                                 &rpaths[i]));
+    }
+
+    /*
+     * we sort the paths since the key for the path-list is
+     * the description of the paths it contains, and the hash of that
+     * description depends on the order of the paths. Without the sort
+     * the same set of paths given in a different order would not match
+     * in the DB. This is the same treatment the copy-and-add/remove
+     * functions give to the path-lists they build.
+     */
+    if (vec_len(path_list->fpl_paths) > 1)
+    {
+        vec_sort_with_function(path_list->fpl_paths, fib_path_cmp_for_sort);
+    }
+
+    /*
+     * If a shared path list is requested, consult the DB for a match
+     */
+    if (flags & FIB_PATH_LIST_FLAG_SHARED)
+    {
+        /*
+         * check for a matching path-list in the DB.
+         * If we find one then we can return the existing one and destroy the
+         * new one just created.
+         */
+        old_path_list_index = fib_path_list_db_find(path_list);
+        if (FIB_NODE_INDEX_INVALID != old_path_list_index)
+        {
+            fib_path_list_destroy(path_list);
+
+            path_list_index = old_path_list_index;
+        }
+        else
+        {
+            /*
+             * if there was not a matching path-list, then this
+             * new one will need inserting into the DB and resolving.
+             */
+            fib_path_list_db_insert(path_list_index);
+            path_list = fib_path_list_resolve(path_list);
+        }
+    }
+    else
+    {
+        /*
+         * no shared path list requested. resolve and use the one
+         * just created.
+         */
+        path_list = fib_path_list_resolve(path_list);
+    }
+
+    return (path_list_index);
+}
+
+/**
+ * Create a path-list holding a single special path built from the DPO
+ * passed. The path-list is never looked up in, nor added to, the shared
+ * DB.
+ *
+ * @param nh_proto next-hop protocol of the path-list
+ * @param flags    flags for the path-list, stored as given
+ * @param dpo      the DPO handed to the special path
+ * @return the index of the new path-list
+ */
+fib_node_index_t
+fib_path_list_create_special (fib_protocol_t nh_proto,
+                             fib_path_list_flags_t flags,
+                             const dpo_id_t *dpo)
+{
+    fib_node_index_t new_pl_index, special_path;
+    fib_path_list_t *new_pl;
+
+    new_pl = fib_path_list_alloc(&new_pl_index);
+    new_pl->fpl_nh_proto = nh_proto;
+    new_pl->fpl_flags = flags;
+
+    special_path = fib_path_create_special(
+                       new_pl_index,
+                       new_pl->fpl_nh_proto,
+                       fib_path_list_flags_2_path_flags(flags),
+                       dpo);
+    vec_add1(new_pl->fpl_paths, special_path);
+
+    /*
+     * we don't share path-lists. we can do PIC on them so why bother.
+     */
+    fib_path_list_resolve(new_pl);
+
+    return (new_pl_index);
+}
+
+/*
+ * fib_path_list_copy_and_path_add
+ *
+ * Build a path-list that has all the paths of the original plus the one
+ * path described by rpaths (a vector of exactly one element).
+ * The index returned is either that of the newly built path-list or,
+ * when sharing is requested and an equal path-list is already in the
+ * data-base, that of the existing one.
+ */
+fib_node_index_t
+fib_path_list_copy_and_path_add (fib_node_index_t orig_path_list_index,
+                                fib_path_list_flags_t flags,
+                                const fib_route_path_t *rpaths)
+{
+    fib_node_index_t new_pl_index, db_pl_index, a_path;
+    fib_path_list_t *new_pl, *orig_pl;
+    u32 n_orig, ii;
+
+    ASSERT(1 == vec_len(rpaths));
+
+    /*
+     * the allocation can move the pool, so allocate first and only then
+     * fetch the original.
+     */
+    new_pl = fib_path_list_alloc(&new_pl_index);
+    orig_pl = fib_path_list_get(orig_path_list_index);
+
+    FIB_PATH_LIST_DBG(orig_pl, "copy-add");
+
+    flags = fib_path_list_flags_fixup(flags);
+    new_pl->fpl_flags = flags;
+    new_pl->fpl_nh_proto = orig_pl->fpl_nh_proto;
+
+    /*
+     * room for every original path plus the additional one
+     */
+    n_orig = vec_len(orig_pl->fpl_paths);
+    vec_validate(new_pl->fpl_paths, n_orig);
+
+    for (ii = 0; ii < n_orig; ii++)
+    {
+        a_path = fib_path_copy(orig_pl->fpl_paths[ii], new_pl_index);
+        new_pl->fpl_paths[ii] = a_path;
+    }
+
+    a_path = fib_path_create(new_pl_index,
+                             new_pl->fpl_nh_proto,
+                             fib_path_list_flags_2_path_flags(flags),
+                             rpaths);
+    new_pl->fpl_paths[n_orig] = a_path;
+
+    /*
+     * the key of a path-list is the description of its paths, so the
+     * paths are kept sorted to make that description canonical.
+     */
+    vec_sort_with_function(new_pl->fpl_paths, fib_path_cmp_for_sort);
+
+    FIB_PATH_LIST_DBG(new_pl, "path-added");
+
+    /*
+     * When sharing is requested look in the DB for an equal path-list.
+     */
+    db_pl_index = FIB_NODE_INDEX_INVALID;
+
+    if (new_pl->fpl_flags & FIB_PATH_LIST_FLAG_SHARED)
+    {
+        db_pl_index = fib_path_list_db_find(new_pl);
+    }
+
+    if (FIB_NODE_INDEX_INVALID != db_pl_index)
+    {
+        /*
+         * there is one; use it and discard the one just built.
+         */
+        fib_path_list_destroy(new_pl);
+
+        return (db_pl_index);
+    }
+
+    /*
+     * The new path-list is the one to use. If it is shared it goes into
+     * the DB. In either case it needs resolving.
+     */
+    if (new_pl->fpl_flags & FIB_PATH_LIST_FLAG_SHARED)
+    {
+        fib_path_list_db_insert(new_pl_index);
+    }
+    fib_path_list_resolve(new_pl);
+
+    return (new_pl_index);
+}
+
+/*
+ * fib_path_list_copy_and_path_remove
+ *
+ * Build a path-list that has all the paths of the original except the
+ * one described by rpaths (a vector of exactly one element).
+ * If no paths remain the new path-list is discarded and the index
+ * returned is FIB_NODE_INDEX_INVALID. Otherwise the index returned is
+ * either that of the newly built path-list or, when sharing is requested
+ * and an equal path-list is already in the data-base, that of the
+ * existing one.
+ */
+fib_node_index_t
+fib_path_list_copy_and_path_remove (fib_node_index_t orig_path_list_index,
+                                   fib_path_list_flags_t flags,
+                                   const fib_route_path_t *rpaths)
+{
+    fib_node_index_t new_pl_index, db_pl_index, cmp_path, a_path;
+    fib_path_list_t *new_pl, *orig_pl;
+    u32 n_copied, ii;
+
+    ASSERT(1 == vec_len(rpaths));
+
+    new_pl = fib_path_list_alloc(&new_pl_index);
+
+    flags = fib_path_list_flags_fixup(flags);
+    orig_pl = fib_path_list_get(orig_path_list_index);
+
+    FIB_PATH_LIST_DBG(orig_pl, "copy-remove");
+
+    new_pl->fpl_flags = flags;
+    new_pl->fpl_nh_proto = orig_pl->fpl_nh_proto;
+
+    /*
+     * size the vector of paths once, for one path fewer than the
+     * original has, instead of growing it one path at a time.
+     */
+    if (vec_len(orig_pl->fpl_paths) > 1)
+    {
+        vec_validate(new_pl->fpl_paths, vec_len(orig_pl->fpl_paths) - 2);
+    }
+    n_copied = 0;
+
+    /*
+     * a temporary path built from the description of the path to
+     * remove; the original's paths are compared against it.
+     */
+    cmp_path = fib_path_create(new_pl_index,
+                               new_pl->fpl_nh_proto,
+                               fib_path_list_flags_2_path_flags(flags),
+                               rpaths);
+
+    for (ii = 0; ii < vec_len(orig_pl->fpl_paths); ii++)
+    {
+        fib_node_index_t orig_path = orig_pl->fpl_paths[ii];
+
+        if (0 == fib_path_cmp(cmp_path, orig_path))
+        {
+            /* this is the path being removed; do not copy it */
+            continue;
+        }
+
+        a_path = fib_path_copy(orig_path, new_pl_index);
+
+        if (n_copied < vec_len(new_pl->fpl_paths))
+        {
+            new_pl->fpl_paths[n_copied++] = a_path;
+        }
+        else
+        {
+            /*
+             * unlikely: nothing in the original matched the path to
+             * remove, so there are as many paths as the original has,
+             * one more than the vector was sized for.
+             */
+            vec_add1(new_pl->fpl_paths, a_path);
+        }
+    }
+
+    /*
+     * the comparison object has served its purpose
+     */
+    fib_path_destroy(cmp_path);
+
+    if (0 == vec_len(new_pl->fpl_paths))
+    {
+        /*
+         * no paths left; there is no new path-list
+         */
+        FIB_PATH_LIST_DBG(new_pl, "last-path-removed");
+
+        fib_path_list_destroy(new_pl);
+
+        return (FIB_NODE_INDEX_INVALID);
+    }
+
+    /*
+     * the key of a path-list is the description of its paths, so the
+     * paths are kept sorted to make that description canonical.
+     */
+    vec_sort_with_function(new_pl->fpl_paths, fib_path_cmp_for_sort);
+
+    /*
+     * When sharing is requested look in the DB for an equal path-list.
+     */
+    db_pl_index = FIB_NODE_INDEX_INVALID;
+
+    if (new_pl->fpl_flags & FIB_PATH_LIST_FLAG_SHARED)
+    {
+        db_pl_index = fib_path_list_db_find(new_pl);
+    }
+
+    if (FIB_NODE_INDEX_INVALID != db_pl_index)
+    {
+        /*
+         * there is one; use it and discard the one just built.
+         */
+        fib_path_list_destroy(new_pl);
+
+        return (db_pl_index);
+    }
+
+    /*
+     * The new path-list is the one to use. If it is shared it goes into
+     * the DB. In either case it needs resolving.
+     */
+    if (new_pl->fpl_flags & FIB_PATH_LIST_FLAG_SHARED)
+    {
+        fib_path_list_db_insert(new_pl_index);
+    }
+    fib_path_list_resolve(new_pl);
+
+    return (new_pl_index);
+}
+
+/*
+ * fib_path_list_contribute_forwarding
+ *
+ * Fill in dpo with the load-balance that a user of this path-list should
+ * use for forwarding, for the given forwarding chain type.
+ */
+void
+fib_path_list_contribute_forwarding (fib_node_index_t path_list_index,
+                                    fib_forward_chain_type_t type,
+                                    dpo_id_t *dpo)
+{
+    fib_path_list_mk_lb(fib_path_list_get(path_list_index),
+                        type,
+                        dpo);
+}
+
+/*
+ * fib_path_list_get_adj
+ *
+ * Return the adjacency of the first path in the path-list.
+ * The chain type argument is not used. The path-list must have at
+ * least one path; this is not checked.
+ */
+adj_index_t
+fib_path_list_get_adj (fib_node_index_t path_list_index,
+                      fib_forward_chain_type_t type)
+{
+    fib_path_list_t *pl = fib_path_list_get(path_list_index);
+    fib_node_index_t first_path = pl->fpl_paths[0];
+
+    return (fib_path_get_adj(first_path));
+}
+
+/**
+ * Run recursive-loop detection on each path of the path-list and
+ * set or clear the path-list's LOOPED flag accordingly.
+ *
+ * @param path_list_index index of the path-list to check
+ * @param entry_indicies  pointer to the vector of entries already
+ *                        visited. The vector itself is not modified
+ *                        here; each path is given a private copy.
+ *
+ * @return the sum of the values returned by the per-path detection;
+ *         non-zero means at least one path is looped.
+ */
+int
+fib_path_list_recursive_loop_detect (fib_node_index_t path_list_index,
+                                    fib_node_index_t **entry_indicies)
+{
+    fib_node_index_t *path_index;
+    int is_looped, list_looped;
+    fib_path_list_t *path_list;
+
+    list_looped = 0;
+    path_list = fib_path_list_get(path_list_index);
+
+    vec_foreach (path_index, path_list->fpl_paths)
+    {
+        fib_node_index_t *copy, **copy_ptr;
+
+        /*
+         * we need a copy of the nodes visited so that when we add entries
+         * we explore on the nth path and a looped is detected, those entries
+         * are not again searched for n+1 path and so finding a loop that does
+         * not exist.
+         */
+        copy = vec_dup(*entry_indicies);
+        copy_ptr = &copy;
+
+        is_looped  = fib_path_recursive_loop_detect(*path_index, copy_ptr);
+        list_looped += is_looped;
+
+        /*
+         * the copy is private to this path's exploration, which may have
+         * grown it. Release it, else one vector is leaked per path on
+         * every loop-detection pass.
+         */
+        vec_free(copy);
+    }
+
+    FIB_PATH_LIST_DBG(path_list, "loop-detect: eval:%d", list_looped);
+
+    if (list_looped)
+    {
+        path_list->fpl_flags |= FIB_PATH_LIST_FLAG_LOOPED;
+    }
+    else
+    {
+        path_list->fpl_flags &= ~FIB_PATH_LIST_FLAG_LOOPED;
+    }
+
+    return (list_looped);
+}
+
+/**
+ * Add a child, identified by its node type and index, to the
+ * path-list's graph node.
+ *
+ * @return the value returned by fib_node_child_add(); this is what
+ *         fib_path_list_child_remove() expects to be given.
+ */
+u32
+fib_path_list_child_add (fib_node_index_t path_list_index,
+                        fib_node_type_t child_type,
+                        fib_node_index_t child_index)
+{
+    u32 sibling;
+
+    sibling = fib_node_child_add(FIB_NODE_TYPE_PATH_LIST,
+                                 path_list_index,
+                                 child_type,
+                                 child_index);
+
+    return (sibling);
+}
+
+/**
+ * Remove a child from the path-list's graph node.
+ *
+ * @param path_list_index index of the path-list
+ * @param si              the value passed on to fib_node_child_remove()
+ *                        to identify the child
+ */
+void
+fib_path_list_child_remove (fib_node_index_t path_list_index,
+                           u32 si)
+{
+    fib_node_child_remove(FIB_NODE_TYPE_PATH_LIST, path_list_index, si);
+}
+
+/**
+ * Take a lock on the path-list's graph node.
+ * Passing FIB_NODE_INDEX_INVALID is allowed and does nothing.
+ */
+void
+fib_path_list_lock(fib_node_index_t path_list_index)
+{
+    fib_path_list_t *pl;
+
+    if (FIB_NODE_INDEX_INVALID == path_list_index)
+    {
+        return;
+    }
+
+    pl = fib_path_list_get(path_list_index);
+
+    fib_node_lock(&pl->fpl_node);
+    FIB_PATH_LIST_DBG(pl, "lock");
+}
+
+/**
+ * Release a lock on the path-list's graph node.
+ * Passing FIB_NODE_INDEX_INVALID is allowed and does nothing.
+ */
+void
+fib_path_list_unlock (fib_node_index_t path_list_index)
+{
+    fib_path_list_t *pl;
+
+    if (FIB_NODE_INDEX_INVALID == path_list_index)
+    {
+        return;
+    }
+
+    pl = fib_path_list_get(path_list_index);
+
+    /* trace before the unlock, while the path-list is still in use */
+    FIB_PATH_LIST_DBG(pl, "unlock");
+
+    fib_node_unlock(&pl->fpl_node);
+}
+
+/**
+ * Return the number of path-lists currently allocated from the pool.
+ */
+u32
+fib_path_list_pool_size (void)
+{
+    u32 n_path_lists = pool_elts(fib_path_list_pool);
+
+    return (n_path_lists);
+}
+
+/**
+ * Return the number of path-lists held in the shared path-list DB.
+ */
+u32
+fib_path_list_db_size (void)
+{
+    u32 n_shared = hash_elts(fib_path_list_db);
+
+    return (n_shared);
+}
+
+/**
+ * Invoke func on each path of the path-list, in order.
+ * The walk stops early as soon as func returns zero.
+ *
+ * @param path_list_index index of the path-list to walk
+ * @param func            called with the path-list index, the path
+ *                        index and ctx
+ * @param ctx             opaque context handed to func
+ */
+void
+fib_path_list_walk (fib_node_index_t path_list_index,
+                   fib_path_list_walk_fn_t func,
+                   void *ctx)
+{
+    fib_path_list_t *pl = fib_path_list_get(path_list_index);
+    u32 ii;
+
+    vec_foreach_index (ii, pl->fpl_paths)
+    {
+        if (0 == func(path_list_index, pl->fpl_paths[ii], ctx))
+        {
+            break;
+        }
+    }
+}
+
+
+/**
+ * Module initialisation: register the path-list node type with its
+ * virtual function table and create the shared path-list DB, a hash
+ * table that uses this file's key-hash and key-equality functions.
+ */
+void
+fib_path_list_module_init (void)
+{
+    fib_node_register_type (FIB_NODE_TYPE_PATH_LIST, &fib_path_list_vft);
+
+    fib_path_list_db =
+        hash_create2 (0,                                 /* elts */
+                      0,                                 /* user */
+                      sizeof (fib_node_index_t),         /* value_bytes */
+                      fib_path_list_db_hash_key_sum,
+                      fib_path_list_db_hash_key_equal,
+                      0,                                 /* format pair */
+                      0);                                /* format arg */
+}
+
+/**
+ * CLI handler for "show fib path list".
+ * With a numeric argument the path-list at that index is shown together
+ * with its children, or an error line is printed if the index is not in
+ * use. Without an argument every path-list in the pool is shown.
+ *
+ * @return always NULL (no error)
+ */
+static clib_error_t *
+show_fib_path_list_command (vlib_main_t * vm,
+                           unformat_input_t * input,
+                           vlib_cli_command_t * cmd)
+{
+    fib_path_list_t *path_list;
+    fib_node_index_t pli;
+
+    if (unformat (input, "%d", &pli))
+    {
+        /*
+         * show one in detail
+         */
+        if (!pool_is_free_index(fib_path_list_pool, pli))
+        {
+            path_list = fib_path_list_get(pli);
+            u8 *s = fib_path_list_format(pli, NULL);
+            s = format(s, "children:");
+            s = fib_node_children_format(path_list->fpl_node.fn_children, s);
+            /*
+             * s is a u8 vector and is not NUL-terminated, so it must be
+             * printed with %v. %s expects a C string and would read past
+             * the end of the vector.
+             */
+            vlib_cli_output (vm, "%v", s);
+            vec_free(s);
+        }
+        else
+        {
+            vlib_cli_output (vm, "path list %d invalid", pli);
+        }
+    }
+    else
+    {
+        /*
+         * show all
+         */
+        vlib_cli_output (vm, "FIB Path Lists");
+        pool_foreach(path_list, fib_path_list_pool,
+        ({
+            vlib_cli_output (vm, "%U", format_fib_path_list, path_list);
+        }));
+    }
+    return (NULL);
+}
+
+/*
+ * CLI registration for "show fib path list".
+ * The handler also accepts an optional path-list index, in which case
+ * only that path-list is shown, along with its children.
+ */
+VLIB_CLI_COMMAND (show_fib_path_list, static) = {
+  .path = "show fib path list",
+  .function = show_fib_path_list_command,
+  .short_help = "show fib path list",
+};
diff --git a/vnet/vnet/fib/fib_path_list.h b/vnet/vnet/fib/fib_path_list.h
new file mode 100644 (file)
index 0000000..42e07ab
--- /dev/null
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FIB_PATH_LIST_H__
+#define __FIB_PATH_LIST_H__
+
+#include <vlib/vlib.h>
+#include <vnet/adj/adj.h>
+
+#include "fib_node.h"
+#include "fib_path.h"
+
+/**
+ * Enumeration of path-list flags.
+ * Each attribute is a bit position; the corresponding bit masks are the
+ * FIB_PATH_LIST_FLAG_* values.
+ */
+typedef enum fib_path_list_attribute_t_ {
+    /**
+     * Marker. Add new flags after this one.
+     */
+    FIB_PATH_LIST_ATTRIBUTE_FIRST = 0,
+    /**
+     * This path list is shareable. Shareable path-lists
+     * are inserted into the path-list data-base.
+     * All path-lists are inherently shareable, the reason we share some and
+     * not others is to limit the size of the path-list database. This DB must
+     * be searched for each route update.
+     */
+    FIB_PATH_LIST_ATTRIBUTE_SHARED = FIB_PATH_LIST_ATTRIBUTE_FIRST,
+    /**
+     * explicit drop path-list. Used when the entry source needs to
+     * force a drop, despite the fact the path info is present.
+     */
+    FIB_PATH_LIST_ATTRIBUTE_DROP,
+    /**
+     * explicit local path-list.
+     */
+    FIB_PATH_LIST_ATTRIBUTE_LOCAL,
+    /**
+     * exclusive path-list. Exclusive means the path will resolve via the
+     * exclusive (user provided) adj.
+     */
+    FIB_PATH_LIST_ATTRIBUTE_EXCLUSIVE,
+    /**
+     * resolved path-list
+     */
+    FIB_PATH_LIST_ATTRIBUTE_RESOLVED,
+    /**
+     * looped path-list. one path looped implies the whole list is
+     */
+    FIB_PATH_LIST_ATTRIBUTE_LOOPED,
+    /**
+     * Marker. Add new flags before this one, and then update it.
+     */
+    FIB_PATH_LIST_ATTRIBUTE_LAST = FIB_PATH_LIST_ATTRIBUTE_LOOPED,
+} fib_path_list_attribute_t;
+
+/**
+ * Path-list flags as a bit mask. Each flag is the bit at the position given
+ * by the fib_path_list_attribute_t value of the same name.
+ */
+typedef enum fib_path_list_flags_t_ {
+    FIB_PATH_LIST_FLAG_NONE      = 0,
+    FIB_PATH_LIST_FLAG_SHARED    = (1 << FIB_PATH_LIST_ATTRIBUTE_SHARED),
+    FIB_PATH_LIST_FLAG_DROP      = (1 << FIB_PATH_LIST_ATTRIBUTE_DROP),
+    FIB_PATH_LIST_FLAG_LOCAL     = (1 << FIB_PATH_LIST_ATTRIBUTE_LOCAL),
+    FIB_PATH_LIST_FLAG_EXCLUSIVE = (1 << FIB_PATH_LIST_ATTRIBUTE_EXCLUSIVE),
+    FIB_PATH_LIST_FLAG_RESOLVED  = (1 << FIB_PATH_LIST_ATTRIBUTE_RESOLVED),
+    FIB_PATH_LIST_FLAG_LOOPED    = (1 << FIB_PATH_LIST_ATTRIBUTE_LOOPED),
+} fib_path_list_flags_t;
+
+/**
+ * Initialiser for an array of attribute names indexed by
+ * fib_path_list_attribute_t. Designated initialisers are used, so the order
+ * of the lines below need not follow the order of the enum.
+ */
+#define FIB_PATH_LIST_ATTRIBUTES {                      \
+    [FIB_PATH_LIST_ATTRIBUTE_SHARED]    = "shared",     \
+    [FIB_PATH_LIST_ATTRIBUTE_RESOLVED]  = "resolved",   \
+    [FIB_PATH_LIST_ATTRIBUTE_DROP]      = "drop",       \
+    [FIB_PATH_LIST_ATTRIBUTE_EXCLUSIVE] = "exclusive",   \
+    [FIB_PATH_LIST_ATTRIBUTE_LOCAL]     = "local",      \
+    [FIB_PATH_LIST_ATTRIBUTE_LOOPED]     = "looped",    \
+}
+
+/**
+ * Loop header that iterates _item over every path-list attribute, from
+ * FIB_PATH_LIST_ATTRIBUTE_FIRST to FIB_PATH_LIST_ATTRIBUTE_LAST inclusive.
+ * _item must be a modifiable lvalue; it is assigned and incremented.
+ */
+#define FOR_EACH_PATH_LIST_ATTRIBUTE(_item)            \
+    for (_item = FIB_PATH_LIST_ATTRIBUTE_FIRST;                \
+        _item <= FIB_PATH_LIST_ATTRIBUTE_LAST;         \
+        _item++)
+
+/*
+ * Path-list API. Path-lists are referred to by their fib_node_index_t.
+ * NOTE(review): the descriptions below are inferred from the prototypes
+ * only; confirm against the implementations in fib_path_list.c.
+ */
+
+/*
+ * Creation: from a set of route paths, or a 'special' path-list described
+ * by a DPO. Both return the index of the path-list.
+ */
+extern fib_node_index_t fib_path_list_create(fib_path_list_flags_t flags,
+                                            const fib_route_path_t *paths);
+extern fib_node_index_t fib_path_list_create_special(fib_protocol_t nh_proto,
+                                                    fib_path_list_flags_t flags,
+                                                    const dpo_id_t *dpo);
+
+/*
+ * Copy-and-modify: take an existing path-list index and one route path, and
+ * return a path-list index. Presumably the original is left untouched and
+ * the result is the original with the path added/removed - TODO confirm.
+ */
+extern fib_node_index_t fib_path_list_copy_and_path_add(
+    fib_node_index_t pl_index,
+    fib_path_list_flags_t flags,
+    const fib_route_path_t *path);
+extern fib_node_index_t fib_path_list_copy_and_path_remove(
+    fib_node_index_t pl_index,
+    fib_path_list_flags_t flags,
+    const fib_route_path_t *path);
+/*
+ * Forwarding: the contribution is written to the caller supplied DPO.
+ */
+extern void fib_path_list_contribute_forwarding (fib_node_index_t path_list_index,
+                                                fib_forward_chain_type_t type,
+                                                dpo_id_t *dpo);
+extern index_t fib_path_list_get_adj(fib_node_index_t path_list_index,
+                                    fib_forward_chain_type_t type);
+
+/*
+ * Child management. Note that child_remove takes a 'sibling index';
+ * presumably this is the value returned by child_add - TODO confirm.
+ */
+extern u32 fib_path_list_child_add(fib_node_index_t pl_index,
+                                  fib_node_type_t type,
+                                  fib_node_index_t child_index);
+extern void fib_path_list_child_remove(fib_node_index_t pl_index,
+                                      fib_node_index_t sibling_index);
+extern void fib_path_list_back_walk(fib_node_index_t pl_index,
+                                   fib_node_back_walk_ctx_t *ctx);
+/*
+ * Locking of a path-list by index.
+ */
+extern void fib_path_list_lock(fib_node_index_t pl_index);
+extern void fib_path_list_unlock(fib_node_index_t pl_index);
+/*
+ * Recursive loop detection and queries on the path-list's state.
+ */
+extern int fib_path_list_recursive_loop_detect(fib_node_index_t path_list_index,
+                                              fib_node_index_t **entry_indicies);
+extern u32 fib_path_list_get_resolving_interface(fib_node_index_t path_list_index);
+extern int fib_path_list_is_looped(fib_node_index_t path_list_index);
+/*
+ * Formatting: both take the string to extend as their last argument and
+ * return a string.
+ */
+extern u8 * fib_path_list_format(fib_node_index_t pl_index,
+                                u8 * s);
+extern u8 * fib_path_list_adjs_format(fib_node_index_t pl_index,
+                                     u32 indent,
+                                     u8 * s);
+extern index_t fib_path_list_lb_map_add_or_lock(fib_node_index_t pl_index,
+                                                const fib_node_index_t *pis);
+/**
+ * A callback function type for walking a path-list's paths.
+ * The callback is passed the path-list index, the index of the path being
+ * visited and the caller's context. A zero return value stops the walk;
+ * a non-zero return value continues it.
+ */
+typedef int (*fib_path_list_walk_fn_t)(fib_node_index_t pl_index,
+                                      fib_node_index_t path_index,
+                                      void *ctx);
+
+/**
+ * Walk the paths of a path-list, invoking func on each one with ctx passed
+ * through untouched.
+ */
+extern void fib_path_list_walk(fib_node_index_t pl_index,
+                              fib_path_list_walk_fn_t func,
+                              void *ctx);
+
+/**
+ * One-time initialisation of the path-list module.
+ */
+extern void fib_path_list_module_init(void);
+
+/*
+ * functions for testing.
+ * NOTE(review): presumably these report the number of path-lists in the
+ * pool and the number of entries in the path-list DB - confirm against the
+ * implementations.
+ */
+u32 fib_path_list_pool_size(void);
+u32 fib_path_list_db_size(void);
+
+#endif
diff --git a/vnet/vnet/fib/fib_table.c b/vnet/vnet/fib/fib_table.c
new file mode 100644 (file)
index 0000000..84c8708
--- /dev/null
@@ -0,0 +1,1052 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/dpo/drop_dpo.h>
+
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/fib_entry_cover.h>
+#include <vnet/fib/fib_internal.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/fib/ip6_fib.h>
+#include <vnet/fib/mpls_fib.h>
+
+/**
+ * Fetch the FIB table at the given index from the pool of tables that
+ * belongs to the given protocol. An unknown protocol asserts and yields NULL.
+ */
+fib_table_t *
+fib_table_get (fib_node_index_t index,
+              fib_protocol_t proto)
+{
+    if (FIB_PROTOCOL_IP4 == proto)
+    {
+        return (pool_elt_at_index(ip4_main.fibs, index));
+    }
+    if (FIB_PROTOCOL_IP6 == proto)
+    {
+        return (pool_elt_at_index(ip6_main.fibs, index));
+    }
+    if (FIB_PROTOCOL_MPLS == proto)
+    {
+        return (pool_elt_at_index(mpls_main.fibs, index));
+    }
+
+    /*
+     * not a protocol we know about
+     */
+    ASSERT(0);
+    return (NULL);
+}
+
+/**
+ * Look up the prefix in the given table by dispatching to the lookup
+ * function of the prefix's protocol. Yields FIB_NODE_INDEX_INVALID for a
+ * protocol that is not handled.
+ */
+static inline fib_node_index_t
+fib_table_lookup_i (fib_table_t *fib_table,
+                   const fib_prefix_t *prefix)
+{
+    fib_node_index_t fei = FIB_NODE_INDEX_INVALID;
+
+    switch (prefix->fp_proto)
+    {
+    case FIB_PROTOCOL_IP4:
+        fei = ip4_fib_table_lookup(&fib_table->v4,
+                                   &prefix->fp_addr.ip4,
+                                   prefix->fp_len);
+        break;
+    case FIB_PROTOCOL_IP6:
+        /* the IPv6 lookup is keyed by table index, not table pointer */
+        fei = ip6_fib_table_lookup(fib_table->ft_index,
+                                   &prefix->fp_addr.ip6,
+                                   prefix->fp_len);
+        break;
+    case FIB_PROTOCOL_MPLS:
+        fei = mpls_fib_table_lookup(&fib_table->mpls,
+                                    prefix->fp_label,
+                                    prefix->fp_eos);
+        break;
+    }
+
+    return (fei);
+}
+
+/**
+ * Look up a prefix in the table identified by fib_index and the prefix's
+ * protocol.
+ */
+fib_node_index_t
+fib_table_lookup (u32 fib_index,
+                 const fib_prefix_t *prefix)
+{
+    fib_table_t *fib_table;
+
+    fib_table = fib_table_get(fib_index, prefix->fp_proto);
+
+    return (fib_table_lookup_i(fib_table, prefix));
+}
+
+/**
+ * Exact-match lookup of the prefix in the given table. IPv4 and IPv6 use
+ * their exact-match variants; MPLS uses the same lookup function as the
+ * non-exact lookup. Yields FIB_NODE_INDEX_INVALID for a protocol that is
+ * not handled.
+ */
+static inline fib_node_index_t
+fib_table_lookup_exact_match_i (const fib_table_t *fib_table,
+                               const fib_prefix_t *prefix)
+{
+    fib_node_index_t fei = FIB_NODE_INDEX_INVALID;
+
+    switch (prefix->fp_proto)
+    {
+    case FIB_PROTOCOL_IP4:
+        fei = ip4_fib_table_lookup_exact_match(&fib_table->v4,
+                                               &prefix->fp_addr.ip4,
+                                               prefix->fp_len);
+        break;
+    case FIB_PROTOCOL_IP6:
+        fei = ip6_fib_table_lookup_exact_match(fib_table->ft_index,
+                                               &prefix->fp_addr.ip6,
+                                               prefix->fp_len);
+        break;
+    case FIB_PROTOCOL_MPLS:
+        fei = mpls_fib_table_lookup(&fib_table->mpls,
+                                    prefix->fp_label,
+                                    prefix->fp_eos);
+        break;
+    }
+
+    return (fei);
+}
+
+/**
+ * Exact-match lookup of a prefix in the table identified by fib_index and
+ * the prefix's protocol.
+ */
+fib_node_index_t
+fib_table_lookup_exact_match (u32 fib_index,
+                             const fib_prefix_t *prefix)
+{
+    const fib_table_t *fib_table;
+
+    fib_table = fib_table_get(fib_index, prefix->fp_proto);
+
+    return (fib_table_lookup_exact_match_i(fib_table, prefix));
+}
+
+/**
+ * Find the entry that covers the given prefix in the given table.
+ * MPLS prefixes have no cover; FIB_NODE_INDEX_INVALID is returned for them.
+ */
+static fib_node_index_t
+fib_table_get_less_specific_i (fib_table_t *fib_table,
+                              const fib_prefix_t *prefix)
+{
+    fib_prefix_t cover_pfx;
+
+    if (FIB_PROTOCOL_MPLS == prefix->fp_proto)
+    {
+        return (FIB_NODE_INDEX_INVALID);
+    }
+
+    /*
+     * the table has no tree structure that would give the parent in O(1),
+     * so the cover is found the cheeky way: a lookup of the same address
+     * with a mask one bit shorter. There should always be a cover, though
+     * it may be the default route; a zero length prefix is looked up
+     * unchanged, so the default route's cover is the default route.
+     */
+    cover_pfx = *prefix;
+
+    if (0 != cover_pfx.fp_len)
+    {
+        cover_pfx.fp_len--;
+    }
+
+    return (fib_table_lookup_i(fib_table, &cover_pfx));
+}
+
+/**
+ * Find the entry that covers the given prefix in the table identified by
+ * fib_index and the prefix's protocol.
+ */
+fib_node_index_t
+fib_table_get_less_specific (u32 fib_index,
+                            const fib_prefix_t *prefix)
+{
+    fib_table_t *fib_table;
+
+    fib_table = fib_table_get(fib_index, prefix->fp_proto);
+
+    return (fib_table_get_less_specific_i(fib_table, prefix));
+}
+
+/**
+ * Remove an entry from the table: decrement the table's total route count,
+ * remove the prefix from the protocol specific table and unlock the entry.
+ *
+ * @param fib_table       the table to remove from
+ * @param prefix          the prefix of the entry being removed
+ * @param fib_entry_index the entry being removed
+ */
+static void
+fib_table_entry_remove (fib_table_t *fib_table,
+                       const fib_prefix_t *prefix,
+                       fib_node_index_t fib_entry_index)
+{
+    vlib_smp_unsafe_warning();
+
+    fib_table->ft_total_route_counts--;
+
+    switch (prefix->fp_proto)
+    {
+    case FIB_PROTOCOL_IP4:
+       ip4_fib_table_entry_remove(&fib_table->v4,
+                                  &prefix->fp_addr.ip4,
+                                  prefix->fp_len);
+       break;
+    case FIB_PROTOCOL_IP6:
+       ip6_fib_table_entry_remove(fib_table->ft_index,
+                                  &prefix->fp_addr.ip6,
+                                  prefix->fp_len);
+       break;
+    case FIB_PROTOCOL_MPLS:
+       mpls_fib_table_entry_remove(&fib_table->mpls,
+                                   prefix->fp_label,
+                                   prefix->fp_eos);
+       break;
+    }
+
+    /*
+     * the counterpart of the lock taken in fib_table_entry_insert
+     */
+    fib_entry_unlock(fib_entry_index);
+}
+
+/**
+ * Actions performed once an entry has been inserted into its table: the
+ * entry that covers the new one is told that a more specific entry now
+ * exists beneath it.
+ */
+static void
+fib_table_post_insert_actions (fib_table_t *fib_table,
+                              const fib_prefix_t *prefix,
+                              fib_node_index_t fib_entry_index)
+{
+    fib_node_index_t cover_index;
+
+    if (FIB_PROTOCOL_MPLS == prefix->fp_proto)
+    {
+        /* the MPLS FIB has no cover relationships */
+        return;
+    }
+
+    cover_index = fib_table_get_less_specific_i(fib_table, prefix);
+
+    if (cover_index == fib_entry_index)
+    {
+        /*
+         * the new entry is its own cover; this is the case when the
+         * default route is first added. There is nobody to inform.
+         */
+        return;
+    }
+
+    fib_entry_cover_change_notify(cover_index, fib_entry_index);
+}
+
+/**
+ * Insert an entry into the table: lock the entry, increment the table's
+ * total route count, add the prefix to the protocol specific table and then
+ * run the post-insert actions.
+ *
+ * @param fib_table       the table to insert into
+ * @param prefix          the prefix of the entry being inserted
+ * @param fib_entry_index the entry being inserted
+ */
+static void
+fib_table_entry_insert (fib_table_t *fib_table,
+                       const fib_prefix_t *prefix,
+                       fib_node_index_t fib_entry_index)
+{
+    vlib_smp_unsafe_warning();
+
+    /*
+     * this lock is released by fib_table_entry_remove
+     */
+    fib_entry_lock(fib_entry_index);
+    fib_table->ft_total_route_counts++;
+
+    switch (prefix->fp_proto)
+    {
+    case FIB_PROTOCOL_IP4:
+       ip4_fib_table_entry_insert(&fib_table->v4,
+                                  &prefix->fp_addr.ip4,
+                                  prefix->fp_len,
+                                  fib_entry_index);
+       break;
+    case FIB_PROTOCOL_IP6:
+       ip6_fib_table_entry_insert(fib_table->ft_index,
+                                  &prefix->fp_addr.ip6,
+                                  prefix->fp_len,
+                                  fib_entry_index);
+       break;
+    case FIB_PROTOCOL_MPLS:
+       mpls_fib_table_entry_insert(&fib_table->mpls,
+                                   prefix->fp_label,
+                                   prefix->fp_eos,
+                                   fib_entry_index);
+       break;
+    }
+
+    /*
+     * done after the insert, since the cover is found with a table lookup
+     */
+    fib_table_post_insert_actions(fib_table, prefix, fib_entry_index);
+}
+
+/**
+ * Update the forwarding (data-plane) table of the prefix's protocol so that
+ * the prefix forwards via the given DPO.
+ *
+ * @param fib_index the index of the table to update
+ * @param prefix    the prefix whose forwarding is updated
+ * @param dpo       the DPO the prefix should forward via
+ */
+void
+fib_table_fwding_dpo_update (u32 fib_index,
+                            const fib_prefix_t *prefix,
+                            const dpo_id_t *dpo)
+{
+    vlib_smp_unsafe_warning();
+
+    /*
+     * this function returns void, so the per-protocol functions are called
+     * as plain statements and not as the operand of a return.
+     */
+    switch (prefix->fp_proto)
+    {
+    case FIB_PROTOCOL_IP4:
+       ip4_fib_table_fwding_dpo_update(ip4_fib_get(fib_index),
+                                       &prefix->fp_addr.ip4,
+                                       prefix->fp_len,
+                                       dpo);
+       break;
+    case FIB_PROTOCOL_IP6:
+       ip6_fib_table_fwding_dpo_update(fib_index,
+                                       &prefix->fp_addr.ip6,
+                                       prefix->fp_len,
+                                       dpo);
+       break;
+    case FIB_PROTOCOL_MPLS:
+       mpls_fib_forwarding_table_update(mpls_fib_get(fib_index),
+                                        prefix->fp_label,
+                                        prefix->fp_eos,
+                                        dpo);
+       break;
+    }
+}
+
+/**
+ * Remove the prefix from the forwarding (data-plane) table of the prefix's
+ * protocol. For MPLS the forwarding table entry is reset and the DPO
+ * argument is not used.
+ *
+ * @param fib_index the index of the table to update
+ * @param prefix    the prefix whose forwarding is removed
+ * @param dpo       the DPO the prefix was forwarding via
+ */
+void
+fib_table_fwding_dpo_remove (u32 fib_index,
+                            const fib_prefix_t *prefix,
+                            const dpo_id_t *dpo)
+{
+    vlib_smp_unsafe_warning();
+
+    /*
+     * this function returns void, so the per-protocol functions are called
+     * as plain statements and not as the operand of a return.
+     */
+    switch (prefix->fp_proto)
+    {
+    case FIB_PROTOCOL_IP4:
+       ip4_fib_table_fwding_dpo_remove(ip4_fib_get(fib_index),
+                                       &prefix->fp_addr.ip4,
+                                       prefix->fp_len,
+                                       dpo);
+       break;
+    case FIB_PROTOCOL_IP6:
+       ip6_fib_table_fwding_dpo_remove(fib_index,
+                                       &prefix->fp_addr.ip6,
+                                       prefix->fp_len,
+                                       dpo);
+       break;
+    case FIB_PROTOCOL_MPLS:
+       mpls_fib_forwarding_table_reset(mpls_fib_get(fib_index),
+                                       prefix->fp_label,
+                                       prefix->fp_eos);
+       break;
+    }
+}
+
+
+/**
+ * Add a 'special' source, described by a DPO, to the entry for the prefix.
+ * If the table has no entry for the prefix one is created and inserted;
+ * otherwise the source is added to the existing entry. The table's
+ * per-source route count is incremented when the entry was not previously
+ * sourced by this source.
+ *
+ * @param fib_index the index of the table to add to
+ * @param prefix    the prefix of the entry
+ * @param source    the source adding the entry
+ * @param flags     flags for the entry
+ * @param dpo       the DPO the source contributes
+ * @return the index of the entry
+ */
+fib_node_index_t
+fib_table_entry_special_dpo_add (u32 fib_index,
+                                 const fib_prefix_t *prefix,
+                                 fib_source_t source,
+                                 fib_entry_flag_t flags,
+                                 const dpo_id_t *dpo)
+{
+    fib_node_index_t fib_entry_index;
+    fib_table_t *fib_table;
+
+    fib_table = fib_table_get(fib_index, prefix->fp_proto);
+    fib_entry_index = fib_table_lookup_exact_match_i(fib_table, prefix);
+
+    if (FIB_NODE_INDEX_INVALID == fib_entry_index)
+    {
+       fib_entry_index = fib_entry_create_special(fib_index, prefix,
+                                                  source, flags,
+                                                  dpo);
+
+       fib_table_entry_insert(fib_table, prefix, fib_entry_index);
+        fib_table->ft_src_route_counts[source]++;
+    }
+    else
+    {
+        int was_sourced;
+
+        was_sourced = fib_entry_is_sourced(fib_entry_index, source);
+       fib_entry_special_add(fib_entry_index, source, flags, dpo);
+
+        /*
+         * only count the route for this source if the add changed whether
+         * the entry is sourced by it
+         */
+        if (was_sourced != fib_entry_is_sourced(fib_entry_index, source))
+        {
+            fib_table->ft_src_route_counts[source]++;
+        }
+    }
+
+
+    return (fib_entry_index);
+}
+
+/**
+ * Add a 'special' source to the entry for the prefix, described by an
+ * adjacency index rather than a DPO. A valid adjacency index is wrapped in
+ * an adjacency DPO; ADJ_INDEX_INVALID results in the drop DPO of the
+ * prefix's protocol being used instead.
+ *
+ * @param fib_index the index of the table to add to
+ * @param prefix    the prefix of the entry
+ * @param source    the source adding the entry
+ * @param flags     flags for the entry
+ * @param adj_index the adjacency to forward via, or ADJ_INDEX_INVALID
+ * @return the index of the entry
+ */
+fib_node_index_t
+fib_table_entry_special_add (u32 fib_index,
+                            const fib_prefix_t *prefix,
+                            fib_source_t source,
+                            fib_entry_flag_t flags,
+                            adj_index_t adj_index)
+{
+    fib_node_index_t fib_entry_index;
+    dpo_id_t tmp_dpo = DPO_NULL;
+
+    if (ADJ_INDEX_INVALID != adj_index)
+    {
+        /*
+         * NOTE(review): FIB_PROTOCOL_MAX is passed as the DPO's protocol,
+         * whereas the drop branch below derives it with
+         * fib_proto_to_dpo(prefix->fp_proto) - confirm this is intended.
+         */
+        dpo_set(&tmp_dpo,
+                DPO_ADJACENCY,
+                FIB_PROTOCOL_MAX,
+                adj_index);
+    }
+    else
+    {
+        dpo_copy(&tmp_dpo, drop_dpo_get(fib_proto_to_dpo(prefix->fp_proto)));
+    }
+    fib_entry_index = fib_table_entry_special_dpo_add(fib_index, prefix, source,
+                                                      flags, &tmp_dpo);
+
+    /*
+     * release the temporary DPO now that it has been handed to the entry
+     */
+    dpo_unlock(&tmp_dpo);
+
+    return (fib_entry_index);
+}
+
+/**
+ * Update the DPO that a 'special' source contributes to an existing entry.
+ * The entry's prefix and table are read back from the entry, the source is
+ * added with the new DPO and the source is then removed once.
+ *
+ * NOTE(review): presumably this relies on the source being reference
+ * counted, so that the add/remove pair leaves the source present with the
+ * new DPO - confirm against fib_entry_special_add/fib_entry_special_remove.
+ *
+ * @param fib_entry_index the entry to update
+ * @param source          the source whose contribution is updated
+ * @param flags           flags for the entry
+ * @param dpo             the new DPO the source contributes
+ */
+void
+fib_table_entry_special_dpo_update (fib_node_index_t fib_entry_index,
+                                   fib_source_t source,
+                                   fib_entry_flag_t flags,
+                                   const dpo_id_t *dpo)
+{
+    fib_prefix_t prefix;
+    u32 fib_index;
+
+    fib_entry_get_prefix(fib_entry_index, &prefix);
+    fib_index = fib_entry_get_fib_index(fib_entry_index);
+
+    fib_table_entry_special_dpo_add(fib_index, &prefix, source, flags, dpo);
+    fib_table_entry_special_remove(fib_index, &prefix, source);
+}
+
+/**
+ * Remove a 'special' source from the entry for the prefix. If that was the
+ * entry's last source the entry is removed from the table and the entries
+ * it covers are notified. Removing from a prefix that has no entry is
+ * silently accepted.
+ *
+ * @param fib_index the index of the table to remove from
+ * @param prefix    the prefix of the entry
+ * @param source    the source to remove
+ */
+void
+fib_table_entry_special_remove (u32 fib_index,
+                               const fib_prefix_t *prefix,
+                               fib_source_t source)
+{
+    /*
+     * 1 is it present
+     *   yes => remove source
+     *    2 - is it still sourced?
+     *      no => cover walk
+     */
+    fib_node_index_t fib_entry_index;
+    fib_table_t *fib_table;
+
+    fib_table = fib_table_get(fib_index, prefix->fp_proto);
+    fib_entry_index = fib_table_lookup_exact_match_i(fib_table, prefix);
+
+    if (FIB_NODE_INDEX_INVALID == fib_entry_index)
+    {
+       /*
+        * removing an entry that does not exist. i'll allow it.
+        */
+    }
+    else
+    {
+       fib_entry_src_flag_t src_flag;
+        int was_sourced;
+
+       /*
+        * don't nobody go nowhere
+        * (the entry is locked here and unlocked at the end of this block,
+        * so it is still referenced for the calls in between)
+        */
+       fib_entry_lock(fib_entry_index);
+        was_sourced = fib_entry_is_sourced(fib_entry_index, source);
+
+       src_flag = fib_entry_special_remove(fib_entry_index, source);
+
+       if (!(FIB_ENTRY_SRC_FLAG_ADDED & src_flag))
+       {
+           /*
+            * last source gone. remove from the table
+            */
+           fib_table_entry_remove(fib_table, prefix, fib_entry_index);
+
+           /*
+            * now the entry is no longer in the table, we can
+            * inform the entries that it covers to re-calculate their cover
+            */
+           fib_entry_cover_change_notify(fib_entry_index,
+                                         FIB_NODE_INDEX_INVALID);
+       }
+       /*
+        * else
+        *   still has sources, leave it be.
+        */
+        if (was_sourced != fib_entry_is_sourced(fib_entry_index, source))
+        {
+            fib_table->ft_src_route_counts[source]--;
+        }
+
+       fib_entry_unlock(fib_entry_index);
+    }
+}
+
+/**
+ * fib_table_route_path_fixup
+ *
+ * Convert attached hosts to attached next-hops.
+ *
+ * This special case is required because an attached path will link to a
+ * glean, and the FIB entry will have the interface or API/CLI source. When
+ * the ARP/ND process completes, the source that provides the complete
+ * adjacency will be lower priority and so the FIB entry will
+ * remain linked to a glean and traffic will never reach the hosts. For
+ * an ATTACHED_HOST path we can link the path directly to the [incomplete]
+ * adjacency.
+ *
+ * The fixup applies when the prefix is a host prefix, the path has no
+ * next-hop address and the path has an interface: the path's next-hop
+ * address is then set to the prefix's address. The path is modified in
+ * place.
+ */
+static void
+fib_table_route_path_fixup (const fib_prefix_t *prefix,
+                           fib_route_path_t *path)
+{
+    if (fib_prefix_is_host(prefix) &&
+       ip46_address_is_zero(&path->frp_addr) &&
+       path->frp_sw_if_index != ~0)
+    {
+       path->frp_addr = prefix->fp_addr;
+    }
+}                
+
+/**
+ * Add one path, given by its individual components, to the entry for the
+ * prefix. The components are packed into a route-path, the attached-host
+ * fixup is applied to it and the work is handed to
+ * fib_table_entry_path_add2.
+ *
+ * @return the index of the entry
+ */
+fib_node_index_t
+fib_table_entry_path_add (u32 fib_index,
+                         const fib_prefix_t *prefix,
+                         fib_source_t source,
+                         fib_entry_flag_t flags,
+                         fib_protocol_t next_hop_proto,
+                         const ip46_address_t *next_hop,
+                         u32 next_hop_sw_if_index,
+                         u32 next_hop_fib_index,
+                         u32 next_hop_weight,
+                         mpls_label_t next_hop_label,
+                         fib_route_path_flags_t path_flags)
+{
+    fib_node_index_t fei;
+    fib_route_path_t *rpaths = NULL;
+    fib_route_path_t rpath = {
+        .frp_proto = next_hop_proto,
+        /* no next-hop supplied means the all zeros address */
+        .frp_addr = (NULL == next_hop? zero_addr : *next_hop),
+        .frp_sw_if_index = next_hop_sw_if_index,
+        .frp_fib_index = next_hop_fib_index,
+        .frp_weight = next_hop_weight,
+        .frp_flags = path_flags,
+        .frp_label = next_hop_label,
+    };
+
+    fib_table_route_path_fixup(prefix, &rpath);
+
+    /*
+     * the path is passed on as a one element vector, which is ours to free
+     */
+    vec_add1(rpaths, rpath);
+    fei = fib_table_entry_path_add2(fib_index, prefix, source, flags, rpaths);
+    vec_free(rpaths);
+
+    return (fei);
+}
+
+/**
+ * Add the given route path(s) to the entry for the prefix on behalf of a
+ * source. An entry is created and inserted if the table has none for the
+ * prefix. The table's per-source route count is incremented when the entry
+ * was not previously sourced by this source.
+ *
+ * @return the index of the entry
+ */
+fib_node_index_t
+fib_table_entry_path_add2 (u32 fib_index,
+                          const fib_prefix_t *prefix,
+                          fib_source_t source,
+                          fib_entry_flag_t flags,
+                          const fib_route_path_t *rpath)
+{
+    fib_node_index_t fib_entry_index;
+    fib_table_t *fib_table;
+
+    fib_table = fib_table_get(fib_index, prefix->fp_proto);
+    fib_entry_index = fib_table_lookup_exact_match_i(fib_table, prefix);
+
+    if (FIB_NODE_INDEX_INVALID != fib_entry_index)
+    {
+        /*
+         * the entry exists; add the path and count the route for this
+         * source only if the add changed whether the source is present
+         */
+        int sourced_before;
+
+        sourced_before = fib_entry_is_sourced(fib_entry_index, source);
+        fib_entry_path_add(fib_entry_index, source, flags, rpath);
+
+        if (sourced_before != fib_entry_is_sourced(fib_entry_index, source))
+        {
+            fib_table->ft_src_route_counts[source]++;
+        }
+
+        return (fib_entry_index);
+    }
+
+    /*
+     * no entry for this prefix yet; make one and put it in the table
+     */
+    fib_entry_index = fib_entry_create(fib_index, prefix,
+                                       source, flags,
+                                       rpath);
+
+    fib_table_entry_insert(fib_table, prefix, fib_entry_index);
+    fib_table->ft_src_route_counts[source]++;
+
+    return (fib_entry_index);
+}
+
+/**
+ * Remove the given route path(s) from the entry for the prefix on behalf of
+ * a source. If that leaves the entry without a source the entry is removed
+ * from the table and the entries it covers are notified. Removing from a
+ * prefix that has no entry is silently accepted.
+ *
+ * @param fib_index the index of the table to remove from
+ * @param prefix    the prefix of the entry
+ * @param source    the source removing the path(s)
+ * @param rpath     the path(s) to remove
+ */
+void
+fib_table_entry_path_remove2 (u32 fib_index,
+                            const fib_prefix_t *prefix,
+                            fib_source_t source,
+                             const fib_route_path_t *rpath)
+{
+    /*
+     * 1 is it present
+     *   yes => remove source
+     *    2 - is it still sourced?
+     *      no => cover walk
+     */
+    fib_node_index_t fib_entry_index;
+    fib_table_t *fib_table;
+
+    fib_table = fib_table_get(fib_index, prefix->fp_proto);
+    fib_entry_index = fib_table_lookup_exact_match_i(fib_table, prefix);
+
+    if (FIB_NODE_INDEX_INVALID == fib_entry_index)
+    {
+       /*
+        * removing an entry that does not exist. i'll allow it.
+        */
+    }
+    else
+    {
+       fib_entry_src_flag_t src_flag;
+        int was_sourced;
+
+       /*
+        * don't nobody go nowhere
+        * (the entry is locked here and unlocked at the end of this block,
+        * so it is still referenced for the calls in between)
+        */
+       fib_entry_lock(fib_entry_index);
+        was_sourced = fib_entry_is_sourced(fib_entry_index, source);
+
+       src_flag = fib_entry_path_remove(fib_entry_index, source, rpath);
+
+       if (!(FIB_ENTRY_SRC_FLAG_ADDED & src_flag))
+       {
+           /*
+            * last source gone. remove from the table
+            */
+           fib_table_entry_remove(fib_table, prefix, fib_entry_index);
+
+           /*
+            * now the entry is no longer in the table, we can
+            * inform the entries that it covers to re-calculate their cover
+            */
+           fib_entry_cover_change_notify(fib_entry_index,
+                                         FIB_NODE_INDEX_INVALID);
+       }
+       /*
+        * else
+        *   still has sources, leave it be.
+        */
+        if (was_sourced != fib_entry_is_sourced(fib_entry_index, source))
+        {
+            fib_table->ft_src_route_counts[source]--;
+        }
+
+       fib_entry_unlock(fib_entry_index);
+    }
+}
+
+/**
+ * Remove one path, given by its individual components, from the entry for
+ * the prefix. The components are packed into a route-path (no label is
+ * set), the attached-host fixup is applied to it and the work is handed to
+ * fib_table_entry_path_remove2.
+ */
+void
+fib_table_entry_path_remove (u32 fib_index,
+                            const fib_prefix_t *prefix,
+                            fib_source_t source,
+                            fib_protocol_t next_hop_proto,
+                            const ip46_address_t *next_hop,
+                            u32 next_hop_sw_if_index,
+                            u32 next_hop_fib_index,
+                            u32 next_hop_weight,
+                            fib_route_path_flags_t path_flags)
+{
+    fib_route_path_t *rpaths = NULL;
+    fib_route_path_t rpath = {
+        .frp_proto = next_hop_proto,
+        /* no next-hop supplied means the all zeros address */
+        .frp_addr = (NULL == next_hop? zero_addr : *next_hop),
+        .frp_sw_if_index = next_hop_sw_if_index,
+        .frp_fib_index = next_hop_fib_index,
+        .frp_weight = next_hop_weight,
+        .frp_flags = path_flags,
+    };
+
+    fib_table_route_path_fixup(prefix, &rpath);
+
+    /*
+     * the path is passed on as a one element vector, which is ours to free
+     */
+    vec_add1(rpaths, rpath);
+    fib_table_entry_path_remove2(fib_index, prefix, source, rpaths);
+    vec_free(rpaths);
+}
+
+/**
+ * Adapter giving fib_route_path_cmp the untyped signature that the vector
+ * sort expects of its comparison function.
+ */
+static int
+fib_route_path_cmp_for_sort (void * v1,
+                            void * v2)
+{
+    int order;
+
+    order = fib_route_path_cmp(v1, v2);
+
+    return (order);
+}
+
+/**
+ * Update the entry for the prefix with the given paths on behalf of a
+ * source. An entry is created and inserted if the table has none for the
+ * prefix. The table's per-source route count is incremented when the entry
+ * was not previously sourced by this source.
+ *
+ * Note that although paths is declared const, the vector is sorted in
+ * place; the caller's vector is therefore re-ordered.
+ *
+ * @param fib_index the index of the table to update
+ * @param prefix    the prefix of the entry
+ * @param source    the source updating the entry
+ * @param flags     flags for the entry
+ * @param paths     vector of paths for the source
+ * @return the index of the entry
+ */
+fib_node_index_t
+fib_table_entry_update (u32 fib_index,
+                       const fib_prefix_t *prefix,
+                       fib_source_t source,
+                       fib_entry_flag_t flags,
+                       const fib_route_path_t *paths)
+{
+    fib_node_index_t fib_entry_index;
+    fib_table_t *fib_table;
+
+    fib_table = fib_table_get(fib_index, prefix->fp_proto);
+    fib_entry_index = fib_table_lookup_exact_match_i(fib_table, prefix);
+
+    /*
+     * sort the paths provided by the control plane. this means
+     * the paths and the extension on the entry will be sorted.
+     */
+    vec_sort_with_function(((fib_route_path_t*)paths), // const cast
+                          fib_route_path_cmp_for_sort);
+
+    if (FIB_NODE_INDEX_INVALID == fib_entry_index)
+    {
+       fib_entry_index = fib_entry_create(fib_index, prefix,
+                                          source, flags,
+                                          paths);
+
+       fib_table_entry_insert(fib_table, prefix, fib_entry_index);
+        fib_table->ft_src_route_counts[source]++;
+    }
+    else
+    {
+        int was_sourced;
+
+        was_sourced = fib_entry_is_sourced(fib_entry_index, source);
+       fib_entry_update(fib_entry_index, source, flags, paths);
+
+        /*
+         * only count the route for this source if the update changed
+         * whether the entry is sourced by it
+         */
+        if (was_sourced != fib_entry_is_sourced(fib_entry_index, source))
+        {
+            fib_table->ft_src_route_counts[source]++;
+        }
+    }
+
+    return (fib_entry_index);
+}
+
+/**
+ * Update the entry for the prefix with a single path, given by its
+ * individual components. The components are packed into a route-path, the
+ * attached-host fixup is applied to it and the work is handed to
+ * fib_table_entry_update.
+ *
+ * @return the index of the entry
+ */
+fib_node_index_t
+fib_table_entry_update_one_path (u32 fib_index,
+                                const fib_prefix_t *prefix,
+                                fib_source_t source,
+                                fib_entry_flag_t flags,
+                                fib_protocol_t next_hop_proto,
+                                const ip46_address_t *next_hop,
+                                u32 next_hop_sw_if_index,
+                                u32 next_hop_fib_index,
+                                u32 next_hop_weight,
+                                mpls_label_t next_hop_label,
+                                fib_route_path_flags_t path_flags)
+{
+    fib_node_index_t fei;
+    fib_route_path_t *rpaths = NULL;
+    fib_route_path_t rpath = {
+        .frp_proto = next_hop_proto,
+        /* no next-hop supplied means the all zeros address */
+        .frp_addr = (NULL == next_hop? zero_addr : *next_hop),
+        .frp_sw_if_index = next_hop_sw_if_index,
+        .frp_fib_index = next_hop_fib_index,
+        .frp_weight = next_hop_weight,
+        .frp_flags = path_flags,
+        .frp_label = next_hop_label,
+    };
+
+    fib_table_route_path_fixup(prefix, &rpath);
+
+    /*
+     * the path is passed on as a one element vector, which is ours to free
+     */
+    vec_add1(rpaths, rpath);
+    fei = fib_table_entry_update(fib_index, prefix, source, flags, rpaths);
+    vec_free(rpaths);
+
+    return (fei);
+}
+
+/**
+ * Remove a source from an entry that is known to exist. If that was the
+ * entry's last source the entry is removed from the table and the entries
+ * it covers are notified. The table's per-source route count is decremented
+ * when the delete changed whether the entry is sourced by this source.
+ *
+ * @param fib_index       the index of the table the entry is in
+ * @param fib_entry_index the entry to remove the source from
+ * @param prefix          the entry's prefix
+ * @param source          the source to remove
+ */
+static void
+fib_table_entry_delete_i (u32 fib_index,
+                         fib_node_index_t fib_entry_index,
+                         const fib_prefix_t *prefix,
+                         fib_source_t source)
+{
+    fib_entry_src_flag_t src_flag;
+    fib_table_t *fib_table;
+    int was_sourced;
+
+    fib_table = fib_table_get(fib_index, prefix->fp_proto);
+    was_sourced = fib_entry_is_sourced(fib_entry_index, source);
+
+    /*
+     * don't nobody go nowhere
+     * (the entry is locked here and unlocked at the end of the function,
+     * so it is still referenced for the calls in between)
+     */
+    fib_entry_lock(fib_entry_index);
+
+    src_flag = fib_entry_delete(fib_entry_index, source);
+
+    if (!(FIB_ENTRY_SRC_FLAG_ADDED & src_flag))
+    {
+       /*
+        * last source gone. remove from the table
+        */
+       fib_table_entry_remove(fib_table, prefix, fib_entry_index);
+
+       /*
+        * now the entry is no longer in the table, we can
+        * inform the entries that it covers to re-calculate their cover
+        */
+       fib_entry_cover_change_notify(fib_entry_index,
+                                     FIB_NODE_INDEX_INVALID);
+    }
+    /*
+     * else
+     *   still has sources, leave it be.
+     */
+    if (was_sourced != fib_entry_is_sourced(fib_entry_index, source))
+    {
+        fib_table->ft_src_route_counts[source]--;
+    }
+
+    fib_entry_unlock(fib_entry_index);
+}
+
+/**
+ * Remove a source from the entry for the prefix. A prefix that has no entry
+ * in the table is tolerated, but a warning is logged.
+ */
+void
+fib_table_entry_delete (u32 fib_index,
+                       const fib_prefix_t *prefix,
+                       fib_source_t source)
+{
+    fib_node_index_t fei;
+
+    fei = fib_table_lookup_exact_match(fib_index, prefix);
+
+    if (FIB_NODE_INDEX_INVALID != fei)
+    {
+        fib_table_entry_delete_i(fib_index, fei, prefix, source);
+        return;
+    }
+
+    /*
+     * a delete of something that is not there; permitted, but not liked.
+     */
+    clib_warning("%U not in FIB", format_fib_prefix, prefix);
+}
+
+/*
+ * Delete the source's contribution to an entry identified by its index.
+ * The prefix and FIB index needed for the delete are read back from the
+ * entry itself.
+ */
+void
+fib_table_entry_delete_index (fib_node_index_t fib_entry_index,
+                             fib_source_t source)
+{
+    fib_prefix_t pfx;
+    u32 fib_index;
+
+    fib_entry_get_prefix(fib_entry_index, &pfx);
+    fib_index = fib_entry_get_fib_index(fib_entry_index);
+
+    fib_table_entry_delete_i(fib_index, fib_entry_index, &pfx, source);
+}
+
+/*
+ * Add an MPLS local label to the prefix: a special entry is added for
+ * FIB_SOURCE_MPLS (with no DPO) and the label is stored as that source's
+ * data. Returns the index of the entry.
+ */
+fib_node_index_t
+fib_table_entry_local_label_add (u32 fib_index,
+                                const fib_prefix_t *prefix,
+                                mpls_label_t label)
+{
+    fib_node_index_t fei =
+        fib_table_entry_special_dpo_add(fib_index,
+                                        prefix,
+                                        FIB_SOURCE_MPLS,
+                                        FIB_ENTRY_FLAG_NONE,
+                                        NULL);
+
+    fib_entry_set_source_data(fei, FIB_SOURCE_MPLS, &label);
+
+    return (fei);
+}
+
+/*
+ * Remove an MPLS local label from the prefix. Nothing is done unless the
+ * prefix has an exact-match entry whose FIB_SOURCE_MPLS data holds
+ * exactly this label.
+ */
+void
+fib_table_entry_local_label_remove (u32 fib_index,
+                                   const fib_prefix_t *prefix,
+                                   mpls_label_t label)
+{
+    mpls_label_t invalid_label = MPLS_LABEL_INVALID;
+    const mpls_label_t *current;
+    fib_node_index_t fei;
+
+    fei = fib_table_lookup_exact_match(fib_index, prefix);
+
+    if (FIB_NODE_INDEX_INVALID == fei)
+    {
+        return;
+    }
+
+    current = fib_entry_get_source_data(fei, FIB_SOURCE_MPLS);
+
+    /* no label stored, or a different label stored: nothing to remove */
+    if (NULL == current || *current != label)
+    {
+        return;
+    }
+
+    /* overwrite the stored label with INVALID before removing the source */
+    fib_entry_set_source_data(fei, FIB_SOURCE_MPLS, &invalid_label);
+    fib_table_entry_special_remove(fib_index, prefix, FIB_SOURCE_MPLS);
+}
+
+/*
+ * Dispatch to the protocol specific lookup of the FIB index bound to the
+ * interface. Returns ~0 if the protocol is not one of those handled.
+ */
+u32
+fib_table_get_index_for_sw_if_index (fib_protocol_t proto,
+                                    u32 sw_if_index)
+{
+    u32 fib_index = ~0;
+
+    switch (proto)
+    {
+    case FIB_PROTOCOL_IP4:
+        fib_index = ip4_fib_table_get_index_for_sw_if_index(sw_if_index);
+        break;
+    case FIB_PROTOCOL_IP6:
+        fib_index = ip6_fib_table_get_index_for_sw_if_index(sw_if_index);
+        break;
+    case FIB_PROTOCOL_MPLS:
+        fib_index = mpls_fib_table_get_index_for_sw_if_index(sw_if_index);
+        break;
+    }
+
+    return (fib_index);
+}
+
+/*
+ * Dispatch to the protocol specific getter for the table's flow hash
+ * configuration. Returns 0 if the protocol is not one of those handled.
+ */
+flow_hash_config_t
+fib_table_get_flow_hash_config (u32 fib_index,
+                               fib_protocol_t proto)
+{
+    flow_hash_config_t config = 0;
+
+    switch (proto)
+    {
+    case FIB_PROTOCOL_IP4:
+        config = ip4_fib_table_get_flow_hash_config(fib_index);
+        break;
+    case FIB_PROTOCOL_IP6:
+        config = ip6_fib_table_get_flow_hash_config(fib_index);
+        break;
+    case FIB_PROTOCOL_MPLS:
+        config = mpls_fib_table_get_flow_hash_config(fib_index);
+        break;
+    }
+
+    return (config);
+}
+
+
+/*
+ * Return the Table-ID of the FIB bound to the interface, or ~0 if
+ * fib_table_get() yields no table for the interface's FIB index.
+ */
+u32
+fib_table_get_table_id_for_sw_if_index (fib_protocol_t proto,
+                                       u32 sw_if_index)
+{
+    u32 fib_index = fib_table_get_index_for_sw_if_index(proto, sw_if_index);
+    fib_table_t *fib_table = fib_table_get(fib_index, proto);
+
+    if (NULL == fib_table)
+    {
+        return (~0);
+    }
+
+    return (fib_table->ft_table_id);
+}
+
+/*
+ * Dispatch to the protocol specific Table-ID to FIB index lookup.
+ * Returns ~0 if the protocol is not one of those handled.
+ */
+u32
+fib_table_find (fib_protocol_t proto,
+               u32 table_id)
+{
+    u32 fib_index = ~0;
+
+    switch (proto)
+    {
+    case FIB_PROTOCOL_IP4:
+        fib_index = ip4_fib_index_from_table_id(table_id);
+        break;
+    case FIB_PROTOCOL_IP6:
+        fib_index = ip6_fib_index_from_table_id(table_id);
+        break;
+    case FIB_PROTOCOL_MPLS:
+        fib_index = mpls_fib_index_from_table_id(table_id);
+        break;
+    }
+
+    return (fib_index);
+}
+
+/*
+ * Dispatch to the protocol specific find_or_create_and_lock function for
+ * the Table-ID and make sure the resulting table carries a
+ * "<protocol>-VRF:<table_id>" description.
+ * Returns the index of the table, or ~0 if the protocol is not one of
+ * those handled.
+ */
+u32
+fib_table_find_or_create_and_lock (fib_protocol_t proto,
+                                  u32 table_id)
+{
+    fib_table_t *fib_table;
+    fib_node_index_t fi;
+
+    switch (proto)
+    {
+    case FIB_PROTOCOL_IP4:
+        fi = ip4_fib_table_find_or_create_and_lock(table_id);
+        break;
+    case FIB_PROTOCOL_IP6:
+        fi = ip6_fib_table_find_or_create_and_lock(table_id);
+        break;
+    case FIB_PROTOCOL_MPLS:
+        fi = mpls_fib_table_find_or_create_and_lock(table_id);
+        break;
+    default:
+        return (~0);
+    }
+
+    fib_table = fib_table_get(fi, proto);
+
+    /*
+     * Only describe the table once. When an existing table is found the
+     * description from the earlier call is still attached; formatting a
+     * fresh vector over it would leak the old one.
+     */
+    if (NULL == fib_table->ft_desc)
+    {
+        fib_table->ft_desc = format(NULL, "%U-VRF:%d",
+                                    format_fib_protocol, proto,
+                                    table_id);
+    }
+
+    return (fi);
+}
+
+/*
+ * Dispatch to the protocol specific create_and_lock function and set the
+ * new table's description from the printf-style format and arguments.
+ * Returns the index of the table, or ~0 if the protocol is not one of
+ * those handled.
+ */
+u32
+fib_table_create_and_lock (fib_protocol_t proto,
+                           const char *const fmt,
+                           ...)
+{
+    fib_table_t *fib_table;
+    fib_node_index_t fi;
+    va_list ap;
+
+    switch (proto)
+    {
+    case FIB_PROTOCOL_IP4:
+        fi = ip4_fib_table_create_and_lock();
+        break;
+    case FIB_PROTOCOL_IP6:
+        fi = ip6_fib_table_create_and_lock();
+        break;
+    case FIB_PROTOCOL_MPLS:
+        fi = mpls_fib_table_create_and_lock();
+        break;
+    default:
+        return (~0);
+    }
+
+    fib_table = fib_table_get(fi, proto);
+
+    /*
+     * The argument list is opened only once a table exists, so the early
+     * return above cannot leave a va_start() without its va_end().
+     */
+    va_start(ap, fmt);
+    fib_table->ft_desc = va_format(fib_table->ft_desc, fmt, &ap);
+    va_end(ap);
+
+    return (fi);
+}
+
+/*
+ * Free the table's description and then hand the table to the protocol
+ * specific destroy function. Note the IPv6 variant takes the table's
+ * index, the others take a pointer to their member of the union.
+ */
+static void
+fib_table_destroy (fib_table_t *fib_table)
+{
+    /* release the description before the protocol specific teardown */
+    vec_free(fib_table->ft_desc);
+
+    if (FIB_PROTOCOL_IP4 == fib_table->ft_proto)
+    {
+        ip4_fib_table_destroy(&fib_table->v4);
+    }
+    else if (FIB_PROTOCOL_IP6 == fib_table->ft_proto)
+    {
+        ip6_fib_table_destroy(fib_table->ft_index);
+    }
+    else if (FIB_PROTOCOL_MPLS == fib_table->ft_proto)
+    {
+        mpls_fib_table_destroy(&fib_table->mpls);
+    }
+}
+
+/*
+ * Drop one lock on the table; the table is destroyed when the lock
+ * count reaches zero.
+ */
+void
+fib_table_unlock (u32 fib_index,
+                 fib_protocol_t proto)
+{
+    fib_table_t *fib_table = fib_table_get(fib_index, proto);
+
+    if (0 != --fib_table->ft_locks)
+    {
+        /* other lock holders remain */
+        return;
+    }
+
+    fib_table_destroy(fib_table);
+}
+/*
+ * Take one more lock on the table.
+ */
+void
+fib_table_lock (u32 fib_index,
+               fib_protocol_t proto)
+{
+    fib_table_get(fib_index, proto)->ft_locks++;
+}
+
+/*
+ * Return the table's route counter for the given source.
+ */
+u32
+fib_table_get_num_entries (u32 fib_index,
+                          fib_protocol_t proto,
+                          fib_source_t source)
+{
+    return (fib_table_get(fib_index, proto)->ft_src_route_counts[source]);
+}
+
+/*
+ * format() helper that appends a table's description to 's'.
+ * Consumes two arguments from the list, in this order: the FIB index
+ * and the FIB protocol.
+ */
+u8*
+format_fib_table_name (u8* s, va_list ap)
+{
+    fib_node_index_t fib_index = va_arg(ap, fib_node_index_t);
+    /* the protocol is fetched as an int because of default promotion */
+    fib_protocol_t proto = va_arg(ap, int);
+    const fib_table_t *table = fib_table_get(fib_index, proto);
+
+    return (format(s, "%v", table->ft_desc));
+}
+
+void
+fib_table_flush (u32 fib_index,
+                fib_protocol_t proto,
+                fib_source_t source)
+{
+    // FIXME: flush is not implemented; every call reaches ASSERT(0) and none of the arguments are used
+    ASSERT(0);
+}
diff --git a/vnet/vnet/fib/fib_table.h b/vnet/vnet/fib/fib_table.h
new file mode 100644 (file)
index 0000000..d7c604f
--- /dev/null
@@ -0,0 +1,732 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FIB_TABLE_H__
+#define __FIB_TABLE_H__
+
+#include <vnet/ip/ip.h>
+#include <vnet/adj/adj.h>
+#include <vnet/fib/fib_entry.h>
+#include <vnet/mpls/mpls.h>
+#include <vnet/mpls/packet.h>
+
+/**
+ * @brief 
+ *   A protocol Independent FIB table.
+ *   One instance wraps exactly one of the protocol specific FIBs and adds
+ *   the bookkeeping (locks, IDs, counters, description) that is common to
+ *   all protocols.
+ */
+typedef struct fib_table_t_
+{
+    /**
+     * A union of the protocol specific FIBs that provide the
+     * underlying LPM mechanism.
+     * This element is first in the struct so that it is in the
+     * first cache line.
+     * Only the member selected by ft_proto is meaningful.
+     */
+    union {
+       ip4_fib_t v4;
+       ip6_fib_t v6;
+       mpls_fib_t mpls;
+    };
+
+    /**
+     * Which protocol this table serves. Used to switch on the union above.
+     */
+    fib_protocol_t ft_proto;
+
+    /**
+     * number of locks on the table.
+     * Incremented by fib_table_lock() and decremented by
+     * fib_table_unlock(), which destroys the table when the count
+     * reaches zero.
+     */
+    u16 ft_locks;
+
+    /**
+     * Table ID (hash key) for this FIB.
+     */
+    u32 ft_table_id;
+
+    /**
+     * Index into FIB vector.
+     */
+    fib_node_index_t ft_index;
+
+    /**
+     * flow hash configuration
+     */
+    u32 ft_flow_hash_config;
+
+    /**
+     * Per-source route counters, indexed by fib_source_t.
+     */
+    u32 ft_src_route_counts[FIB_SOURCE_MAX];
+
+    /**
+     * Total route counters
+     */
+    u32 ft_total_route_counts;
+
+    /**
+     * Table description.
+     * A vector built with format()/va_format() when the table is created
+     * and released with vec_free() when the table is destroyed.
+     */
+    u8* ft_desc;
+} fib_table_t;
+
+/**
+ * @brief
+ *  Format the description/name of the table
+ */
+extern u8* format_fib_table_name(u8* s, va_list ap);
+
+/**
+ * @brief
+ *  Perform a longest prefix match in the non-forwarding table
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix to lookup
+ *
+ * @return
+ *  The index of the fib_entry_t for the best match, which may be the default route
+ */
+extern fib_node_index_t fib_table_lookup(u32 fib_index,
+                                        const fib_prefix_t *prefix);
+
+/**
+ * @brief
+ *  Perform an exact match in the non-forwarding table
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix to lookup
+ *
+ * @return
+ *  The index of the fib_entry_t for the exact match, or INVALID
+ *  if there is no match.
+ */
+extern fib_node_index_t fib_table_lookup_exact_match(u32 fib_index,
+                                                    const fib_prefix_t *prefix);
+
+/**
+ * @brief
+ *  Get the less specific (covering) prefix
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix to lookup
+ *
+ * @return
+ *  The index of the less specific fib_entry_t.
+ */
+extern fib_node_index_t fib_table_get_less_specific(u32 fib_index,
+                                                   const fib_prefix_t *prefix);
+
+/**
+ * @brief
+ *  Add a 'special' entry to the FIB that links to the adj passed
+ *  A special entry is an entry that the FIB is not expected to resolve
+ *  via the usual mechanisms (i.e. recursive or neighbour adj DB lookup).
+ *  Instead the client/source provides the adj to link to.
+ *  This add is reference counted per-source. So n 'removes' are required
+ *  for n 'adds', if the entry is no longer required.
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix to add
+ *
+ * @param source
+ *  The ID of the client/source adding the entry.
+ *
+ * @param flags
+ *  Flags for the entry.
+ *
+ * @param adj_index
+ *  The adjacency to link to.
+ *
+ * @return
+ *  the index of the fib_entry_t that is created (or exists already).
+ */
+extern fib_node_index_t fib_table_entry_special_add(u32 fib_index,
+                                                   const fib_prefix_t *prefix,
+                                                   fib_source_t source,
+                                                   fib_entry_flag_t flags,
+                                                   adj_index_t adj_index);
+
+/**
+ * @brief
+ *  Add a 'special' entry to the FIB that links to the DPO passed
+ *  A special entry is an entry that the FIB is not expected to resolve
+ *  via the usual mechanisms (i.e. recursive or neighbour adj DB lookup).
+ *  Instead the client/source provides the DPO to link to.
+ *  This add is reference counted per-source. So n 'removes' are required
+ *  for n 'adds', if the entry is no longer required.
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix to add
+ *
+ * @param source
+ *  The ID of the client/source adding the entry.
+ *
+ * @param flags
+ *  Flags for the entry.
+ *
+ * @param dpo
+ *  The DPO to link to.
+ *
+ * @return
+ *  the index of the fib_entry_t that is created (or existed already).
+ */
+extern fib_node_index_t fib_table_entry_special_dpo_add(u32 fib_index,
+                                                        const fib_prefix_t *prefix,
+                                                        fib_source_t source,
+                                                        fib_entry_flag_t stype,
+                                                        const dpo_id_t *dpo);
+
+/**
+ * @brief
+ *  Update a 'special' entry to the FIB that links to the DPO passed
+ *  A special entry is an entry that the FIB is not expected to resolve
+ *  via the usual mechanisms (i.e. recursive or neighbour adj DB lookup).
+ *  Instead the client/source provides the DPO to link to.
+ *  Special entries are add/remove reference counted per-source. So n
+ * 'removes' are required for n 'adds', if the entry is no longer required.
+ *  An 'update' can only be used after an 'add' and is therefore assumed to act
+ * on the reference instance of that add (an update is implemented as add/remove
+ * pair).
+ *
+ * @param fib_entry_index
+ *  The index of the FIB entry to update
+ *
+ * @param source
+ *  The ID of the client/source adding the entry.
+ *
+ * @param flags
+ *  Flags for the entry.
+ *
+ * @param dpo
+ *  The DPO to link to.
+ *
+ * @note
+ *  Nothing is returned; the entry updated is the one passed in fib_entry_index.
+ */
+extern void fib_table_entry_special_dpo_update (fib_node_index_t fib_entry_index,
+                                               fib_source_t source,
+                                               fib_entry_flag_t stype,
+                                               const dpo_id_t *dpo);
+
+/**
+ * @brief
+ *  Remove a 'special' entry from the FIB.
+ *  Adds are reference counted per-source. So n 'removes' are required
+ *  for n 'adds', if the entry is no longer required.
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix to remove
+ *
+ * @param source
+ *  The ID of the client/source adding the entry.
+ *
+ */
+extern void fib_table_entry_special_remove(u32 fib_index,
+                                          const fib_prefix_t *prefix,
+                                          fib_source_t source);
+
+/**
+ * @brief
+ *  Add one path to an entry (aka route) in the FIB. If the entry does not
+ *  exist, it will be created.
+ * See the documentation for fib_route_path_t for more descriptions of
+ * the path parameters.
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix for the entry to add
+ *
+ * @param source
+ *  The ID of the client/source adding the entry.
+ *
+ * @param flags
+ *  Flags for the entry.
+ *
+ * @param next_hop_proto
+ *  The protocol of the next hop. This cannot be derived in the event that
+ *  the next hop is all zeros.
+ *
+ * @param next_hop
+ *  The address of the next-hop.
+ *
+ * @param next_hop_sw_if_index
+ *  The index of the interface.
+ *
+ * @param next_hop_fib_index
+ *  The fib index of the next-hop for recursive resolution
+ *
+ * @param next_hop_weight
+ *  [un]equal cost path weight
+ *
+ * @param next_hop_label
+ *  The path's out-going label. INVALID if there is none.
+ *
+ * @param  pf
+ *  Flags for the path
+ *
+ * @return
+ *  the index of the fib_entry_t that is created (or existed already).
+ */
+extern fib_node_index_t fib_table_entry_path_add(u32 fib_index,
+                                                const fib_prefix_t *prefix,
+                                                fib_source_t source,
+                                                fib_entry_flag_t flags,
+                                                fib_protocol_t next_hop_proto,
+                                                const ip46_address_t *next_hop,
+                                                u32 next_hop_sw_if_index,
+                                                u32 next_hop_fib_index,
+                                                u32 next_hop_weight,
+                                                mpls_label_t next_hop_label,
+                                                fib_route_path_flags_t pf);
+/**
+ * @brief
+ *  Add n paths to an entry (aka route) in the FIB. If the entry does not
+ *  exist, it will be created.
+ * See the documentation for fib_route_path_t for more descriptions of
+ * the path parameters.
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix for the entry to add
+ *
+ * @param source
+ *  The ID of the client/source adding the entry.
+ *
+ * @param flags
+ *  Flags for the entry.
+ *
+ * @param rpath
+ *  A vector of paths.
+ *
+ * @return
+ *  the index of the fib_entry_t that is created (or existed already).
+ */
+extern fib_node_index_t fib_table_entry_path_add2(u32 fib_index,
+                                                 const fib_prefix_t *prefix,
+                                                 fib_source_t source,
+                                                 fib_entry_flag_t flags,
+                                                 const fib_route_path_t *rpath);
+
+/**
+ * @brief
+ * remove one path to an entry (aka route) in the FIB. If this is the entry's
+ * last path, then the entry will be removed, unless it has other sources.
+ * See the documentation for fib_route_path_t for more descriptions of
+ * the path parameters.
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix of the entry to remove the path from
+ *
+ * @param source
+ *  The ID of the client/source adding the entry.
+ *
+ * @param next_hop_proto
+ *  The protocol of the next hop. This cannot be derived in the event that
+ *  the next hop is all zeros.
+ *
+ * @param next_hop
+ *  The address of the next-hop.
+ *
+ * @param next_hop_sw_if_index
+ *  The index of the interface.
+ *
+ * @param next_hop_fib_index
+ *  The fib index of the next-hop for recursive resolution
+ *
+ * @param next_hop_weight
+ *  [un]equal cost path weight
+ *
+ * @param  pf
+ *  Flags for the path
+ */
+extern void fib_table_entry_path_remove(u32 fib_index,
+                                       const fib_prefix_t *prefix,
+                                       fib_source_t source,
+                                       fib_protocol_t next_hop_proto,
+                                       const ip46_address_t *next_hop,
+                                       u32 next_hop_sw_if_index,
+                                       u32 next_hop_fib_index,
+                                       u32 next_hop_weight,
+                                       fib_route_path_flags_t pf);
+
+/**
+ * @brief
+ * Remove n paths to an entry (aka route) in the FIB. If this is the entry's
+ * last path, then the entry will be removed, unless it has other sources.
+ * See the documentation for fib_route_path_t for more descriptions of
+ * the path parameters.
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix of the entry to remove the paths from
+ *
+ * @param source
+ *  The ID of the client/source adding the entry.
+ *
+ * @param paths
+ *  A vector of paths.
+ */
+extern void fib_table_entry_path_remove2(u32 fib_index,
+                                        const fib_prefix_t *prefix,
+                                        fib_source_t source,
+                                        const fib_route_path_t *paths);
+
+/**
+ * @brief
+ *  Update an entry to have a new set of paths. If the entry does not
+ *  exist, it will be created.
+ * The difference between an 'path-add' and an update, is that path-add is
+ * an incremental addition of paths, whereas an update is a wholesale swap.
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix for the entry to add
+ *
+ * @param source
+ *  The ID of the client/source adding the entry.
+ *
+ * @param flags
+ *  Flags for the entry.
+ *
+ * @param paths
+ *  A vector of paths.
+ *
+ * @return
+ *  the index of the fib_entry_t that is created (or existed already).
+ */
+extern fib_node_index_t fib_table_entry_update(u32 fib_index,
+                                              const fib_prefix_t *prefix,
+                                              fib_source_t source,
+                                              fib_entry_flag_t flags,
+                                              const fib_route_path_t *paths);
+
+/**
+ * @brief
+ *  Update the entry to have just one path. If the entry does not
+ *  exist, it will be created.
+ * See the documentation for fib_route_path_t for more descriptions of
+ * the path parameters.
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix for the entry to add
+ *
+ * @param source
+ *  The ID of the client/source adding the entry.
+ *
+ * @param flags
+ *  Flags for the entry.
+ *
+ * @param next_hop_proto
+ *  The protocol of the next hop. This cannot be derived in the event that
+ *  the next hop is all zeros.
+ *
+ * @param next_hop
+ *  The address of the next-hop.
+ *
+ * @param next_hop_sw_if_index
+ *  The index of the interface.
+ *
+ * @param next_hop_fib_index
+ *  The fib index of the next-hop for recursive resolution
+ *
+ * @param next_hop_weight
+ *  [un]equal cost path weight
+ *
+ * @param next_hop_label
+ *  The path's out-going label. INVALID if there is none.
+ *
+ * @param  pf
+ *  Flags for the path
+ *
+ * @return
+ *  the index of the fib_entry_t that is created (or existed already).
+ */
+extern fib_node_index_t fib_table_entry_update_one_path(u32 fib_index,
+                                                       const fib_prefix_t *prefix,
+                                                       fib_source_t source,
+                                                       fib_entry_flag_t flags,
+                                                       fib_protocol_t next_hop_proto,
+                                                       const ip46_address_t *next_hop,
+                                                       u32 next_hop_sw_if_index,
+                                                       u32 next_hop_fib_index,
+                                                       u32 next_hop_weight,
+                                                       mpls_label_t next_hop_label,
+                                                       fib_route_path_flags_t pf);
+
+/**
+ * @brief
+ *  Add a MPLS local label for the prefix/route. If the entry does not
+ *  exist, it will be created. In theory more than one local label can be
+ *  added, but this is not yet supported.
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix for the entry to which to add the label
+ *
+ * @param label
+ *  The MPLS label to add
+ *
+ * @return
+ *  the index of the fib_entry_t that is created (or existed already).
+ */
+extern fib_node_index_t fib_table_entry_local_label_add(u32 fib_index,
+                                                       const fib_prefix_t *prefix,
+                                                       mpls_label_t label);
+/**
+ * @brief
+ *  remove a MPLS local label for the prefix/route.
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix for the entry from which to remove the label
+ *
+ * @param label
+ *  The MPLS label to remove
+ */
+extern void fib_table_entry_local_label_remove(u32 fib_index,
+                                              const fib_prefix_t *prefix,
+                                              mpls_label_t label);
+
+/**
+ * @brief
+ *  Delete a FIB entry. If the entry has no more sources, then it is
+ * removed from the table.
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param prefix
+ *  The prefix for the entry to remove
+ *
+ * @param source
+ *  The ID of the client/source adding the entry.
+ */
+extern void fib_table_entry_delete(u32 fib_index,
+                                  const fib_prefix_t *prefix,
+                                  fib_source_t source);
+
+/**
+ * @brief
+ *  Delete a FIB entry. If the entry has no more sources, then it is
+ * removed from the table.
+ *
+ * @param entry_index
+ *  The index of the FIB entry
+ *
+ * @param source
+ *  The ID of the client/source adding the entry.
+ */
+extern void fib_table_entry_delete_index(fib_node_index_t entry_index,
+                                        fib_source_t source);
+
+/**
+ * @brief
+ *  Flush all entries from a table for the source
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param proto
+ *  The protocol of the entries in the table
+ *
+ * @param source
+ *  the source to flush
+ */
+extern void fib_table_flush(u32 fib_index,
+                           fib_protocol_t proto,
+                           fib_source_t source);
+
+/**
+ * @brief
+ *  Get the index of the FIB bound to the interface
+ *
+ * @param proto
+ *  The protocol of the FIB (and thus the entries therein)
+ *
+ * @param sw_if_index
+ *  The interface index
+ *
+ * @return fib_index
+ *  The index of the FIB
+ */
+extern u32 fib_table_get_index_for_sw_if_index(fib_protocol_t proto,
+                                              u32 sw_if_index);
+
+/**
+ * @brief
+ *  Get the Table-ID of the FIB bound to the interface
+ *
+ * @param proto
+ *  The protocol of the FIB (and thus the entries therein)
+ *
+ * @param sw_if_index
+ *  The interface index
+ *
+ * @return table_id
+ *  The Table-ID of the FIB
+ */
+extern u32 fib_table_get_table_id_for_sw_if_index(fib_protocol_t proto,
+                                                 u32 sw_if_index);
+
+/**
+ * @brief
+ *  Get the index of the FIB for a Table-ID. This DOES NOT create the
+ * FIB if it does not exist.
+ *
+ * @param proto
+ *  The protocol of the FIB (and thus the entries therein)
+ *
+ * @param table_id
+ *  The Table-ID
+ *
+ * @return fib_index
+ *  The index of the FIB, which may be INVALID.
+ */
+extern u32 fib_table_find(fib_protocol_t proto, u32 table_id);
+
+
+/**
+ * @brief
+ *  Get the index of the FIB for a Table-ID. This DOES create the
+ * FIB if it does not exist.
+ *
+ * @param proto
+ *  The protocol of the FIB (and thus the entries therein)
+ *
+ * @param table_id
+ *  The Table-ID
+ *
+ * @return fib_index
+ *  The index of the FIB
+ */
+extern u32 fib_table_find_or_create_and_lock(fib_protocol_t proto,
+                                            u32 table_id);
+
+/**
+ * @brief
+ *  Create a new table with no table ID. This means it does not get
+ * added to the hash-table and so can only be found by using the index returned.
+ *
+ * @param proto
+ *  The protocol of the FIB (and thus the entries therein)
+ *
+ * @param fmt
+ *  A string to describe the table
+ *
+ * @return fib_index
+ *  The index of the FIB
+ */
+extern u32 fib_table_create_and_lock(fib_protocol_t proto,
+                                     const char *const fmt,
+                                     ...);
+
+/**
+ * @brief
+ *  Get the flow hash configuration used by the table
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param proto
+ *  The protocol of the FIB (and thus the entries therein)
+ *
+ * @return The flow hash config
+ */
+extern flow_hash_config_t fib_table_get_flow_hash_config(u32 fib_index,
+                                                        fib_protocol_t proto);
+
+/**
+ * @brief
+ * Release a reference counting lock on the table. When the last lock
+ * has gone, the FIB is deleted.
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param proto
+ *  The protocol of the FIB (and thus the entries therein)
+ */
+extern void fib_table_unlock(u32 fib_index,
+                            fib_protocol_t proto);
+
+/**
+ * @brief
+ * Take a reference counting lock on the table
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param proto
+ *  The protocol of the FIB (and thus the entries therein)
+ */
+extern void fib_table_lock(u32 fib_index,
+                          fib_protocol_t proto);
+
+/**
+ * @brief
+ * Return the number of entries in the FIB added by a given source.
+ *
+ * @param fib_index
+ *  The index of the FIB
+ *
+ * @param proto
+ *  The protocol of the FIB (and thus the entries therein)
+ *
+ * @param source
+ *  The source whose entries are counted
+ *
+ * @return number of sourced entries.
+ */ 
+extern u32 fib_table_get_num_entries(u32 fib_index,
+                                    fib_protocol_t proto,
+                                    fib_source_t source);
+
+/**
+ * @brief
+ * Get a pointer to a FIB table
+ */
+extern fib_table_t *fib_table_get(fib_node_index_t index,
+                                 fib_protocol_t proto);
+
+#endif
diff --git a/vnet/vnet/fib/fib_test.c b/vnet/vnet/fib/fib_test.c
new file mode 100644 (file)
index 0000000..898005e
--- /dev/null
@@ -0,0 +1,6330 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/fib/ip6_fib.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/fib/mpls_fib.h>
+#include <vnet/adj/adj.h>
+#include <vnet/dpo/load_balance.h>
+#include <vnet/dpo/load_balance_map.h>
+#include <vnet/dpo/mpls_label_dpo.h>
+#include <vnet/dpo/lookup_dpo.h>
+#include <vnet/dpo/drop_dpo.h>
+#include <vnet/dpo/receive_dpo.h>
+
+#include <vnet/mpls/mpls.h>
+
+#include <vnet/fib/fib_path_list.h>
+#include <vnet/fib/fib_walk.h>
+#include <vnet/fib/fib_node_list.h>
+
+#define FIB_TEST_I(_cond, _comment, _args...)                  \
+({                                                             \
+    int _evald = (_cond);                                      \
+    if (!(_evald)) {                                           \
+       fformat(stderr, "FAIL:%d: " _comment "\n",              \
+               __LINE__, ##_args);                             \
+    } else {                                                   \
+       fformat(stderr, "PASS:%d: " _comment "\n",              \
+               __LINE__, ##_args);                             \
+    }                                                          \
+    _evald;                                                    \
+})
+#define FIB_TEST(_cond, _comment, _args...)                    \
+{                                                              \
+    if (!FIB_TEST_I(_cond, _comment, ##_args)) {               \
+       return;\
+       ASSERT(!("FAIL: " _comment));                           \
+    }                                                          \
+}
+
+/**
+ * A deliberately simple ("I'm not fussed if this is not efficient") store
+ * of test data
+ */
+typedef struct test_main_t_ {
+    /**
+     * HW interface indices
+     */
+    u32 hw_if_indicies[4];
+    /**
+     * HW interfaces
+     */
+    vnet_hw_interface_t * hw[4];
+
+} test_main_t;
+/* The single, file-local instance of the test data */
+static test_main_t test_main;
+
+/* fake ethernet device class, distinct from "fake-ethX" */
+/**
+ * Format function producing a test interface's name: appends
+ * "test-eth<instance>" to s, where the instance is the u32 read from args.
+ */
+static u8 * format_test_interface_name (u8 * s, va_list * args)
+{
+  u32 instance;
+
+  instance = va_arg (*args, u32);
+  s = format (s, "test-eth%d", instance);
+
+  return s;
+}
+
+/**
+ * TX function for the test device class. Nothing is actually transmitted:
+ * a warning is logged and every vector in the frame is reported as sent.
+ */
+static uword dummy_interface_tx (vlib_main_t * vm,
+                                vlib_node_runtime_t * node,
+                                vlib_frame_t * frame)
+{
+  uword n_claimed = frame->n_vectors;
+
+  clib_warning ("you shouldn't be here, leaking buffers...");
+
+  return n_claimed;
+}
+
+/*
+ * Device class for the test interfaces: supplies the "test-eth%d" name
+ * formatter and the dummy TX function.
+ */
+VNET_DEVICE_CLASS (test_interface_device_class,static) = {
+  .name = "Test interface",
+  .format_device_name = format_test_interface_name,
+  .tx_function = dummy_interface_tx,
+};
+
+/**
+ * MAC address used when registering the test interfaces; a vector that is
+ * filled in by fib_test_mk_intf().
+ */
+static u8 *hw_address;
+
+/**
+ * @brief
+ *  Create the ethernet interfaces the FIB tests run over.
+ *
+ * Each interface is registered against the test device class with a MAC
+ * address whose last byte is the interface's instance number, is bound to
+ * FIB index 0 for both IPv4 and IPv6, and is then set admin up. The HW
+ * interface indices and pointers are recorded in test_main.
+ *
+ * Returns early (via FIB_TEST) if registering or bringing up an interface
+ * reports an error.
+ *
+ * @param ninterfaces
+ *  The number of interfaces to create. Must not exceed the size of the
+ *  arrays in test_main.
+ */
+static void
+fib_test_mk_intf (u32 ninterfaces)
+{
+    clib_error_t * error = NULL;
+    test_main_t *tm = &test_main;
+    u8 byte;
+    u32 i;
+
+    ASSERT(ninterfaces <= ARRAY_LEN(tm->hw_if_indicies));
+
+    /* build the 6 byte MAC address d0:d1:d2:d3:d4:d5 */
+    for (i=0; i<6; i++)
+    {
+       byte = 0xd0+i;
+       vec_add1(hw_address, byte);
+    }
+
+    for (i = 0; i < ninterfaces; i++)
+    {
+       /* give each interface a distinct address */
+       hw_address[5] = i;
+
+       /*
+        * The second argument is a device class index, so register against
+        * the test device class defined in this file, not the ethernet
+        * HW interface class.
+        */
+       error = ethernet_register_interface(vnet_get_main(),
+                                           test_interface_device_class.index,
+                                           i /* instance */,
+                                           hw_address,
+                                           &tm->hw_if_indicies[i],
+                                           /* flag change */ 0);
+
+       FIB_TEST((NULL == error), "ADD interface %d", i);
+
+       tm->hw[i] = vnet_get_hw_interface(vnet_get_main(),
+                                         tm->hw_if_indicies[i]);
+       /* make room for, and then set, the interface's v4 and v6 FIB index */
+       vec_validate (ip4_main.fib_index_by_sw_if_index, tm->hw[i]->sw_if_index);
+       vec_validate (ip6_main.fib_index_by_sw_if_index, tm->hw[i]->sw_if_index);
+       ip4_main.fib_index_by_sw_if_index[tm->hw[i]->sw_if_index] = 0;
+       ip6_main.fib_index_by_sw_if_index[tm->hw[i]->sw_if_index] = 0;
+       error = vnet_sw_interface_set_flags(vnet_get_main(),
+                                           tm->hw[i]->sw_if_index,
+                                           VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+       FIB_TEST((NULL == error), "UP interface %d", i);
+    }
+    /*
+     * re-eval after the inevitable realloc; the pointers saved while
+     * interfaces were still being added may be stale
+     */
+    for (i = 0; i < ninterfaces; i++)
+    {
+       tm->hw[i] = vnet_get_hw_interface(vnet_get_main(),
+                                         tm->hw_if_indicies[i]);
+    }
+}
+
+/**
+ * Check that a recursive prefix forwards via another prefix: bucket 0 of
+ * the load-balance contributed by the exact-match entry for _rec_prefix
+ * must compare equal (dpo_cmp() == 0) to the forwarding contributed by the
+ * entry found by a lookup of _via_prefix.
+ *
+ * Both arguments are pointers to fib_prefix_t and are evaluated more than
+ * once. The expansion uses a variable named 'fib_index' from the caller's
+ * scope and, through FIB_TEST, returns from the caller on failure.
+ */
+#define FIB_TEST_REC_FORW(_rec_prefix, _via_prefix)                     \
+{                                                                       \
+    const dpo_id_t *_rec_dpo = fib_entry_contribute_ip_forwarding(      \
+        fib_table_lookup_exact_match(fib_index, (_rec_prefix)));        \
+    const dpo_id_t *_via_dpo = fib_entry_contribute_ip_forwarding(      \
+        fib_table_lookup(fib_index, (_via_prefix)));                    \
+    FIB_TEST(!dpo_cmp(_via_dpo,                                         \
+                      load_balance_get_bucket(_rec_dpo->dpoi_index, 0)), \
+             "%U is recursive via %U",                                  \
+             format_fib_prefix, (_rec_prefix),                          \
+             format_fib_prefix, (_via_prefix));                         \
+}
+
+/**
+ * Check that the given bucket of the load-balance contributed by the
+ * exact-match entry for _prefix is an adjacency DPO whose index is _ai.
+ *
+ * _prefix is a pointer to fib_prefix_t; _prefix and _bucket are evaluated
+ * more than once. The expansion uses a variable named 'fib_index' from the
+ * caller's scope and, through FIB_TEST, returns from the caller on failure.
+ */
+#define FIB_TEST_LB_BUCKET_VIA_ADJ(_prefix, _bucket, _ai)               \
+{                                                                       \
+    const dpo_id_t *_dpo = fib_entry_contribute_ip_forwarding(          \
+        fib_table_lookup_exact_match(fib_index, (_prefix)));            \
+    const dpo_id_t *_dpo1 =                                             \
+        load_balance_get_bucket(_dpo->dpoi_index, (_bucket));           \
+    FIB_TEST(DPO_ADJACENCY == _dpo1->dpoi_type, "type is %U",           \
+             format_dpo_type, _dpo1->dpoi_type);                        \
+    FIB_TEST(((_ai) == _dpo1->dpoi_index),                              \
+            "%U bucket %d resolves via %U",                            \
+             format_fib_prefix, (_prefix),                              \
+             (_bucket),                                                 \
+             format_dpo_id, _dpo1, 0);                                  \
+}
+
+static void
+fib_test_v4 (void)
+{
+    /*
+     * In the default table check for the presence and correct forwarding
+     * of the special entries
+     */
+    fib_node_index_t dfrt, fei, ai, ai2, locked_ai, ai_01, ai_02, ai_03;
+    const dpo_id_t *dpo, *dpo1, *dpo2, *dpo_drop;
+    const ip_adjacency_t *adj;
+    const load_balance_t *lb;
+    test_main_t *tm;
+    u32 fib_index;
+    int ii;
+
+    /* via 10.10.10.1 */
+    ip46_address_t nh_10_10_10_1 = {
+       .ip4.as_u32 = clib_host_to_net_u32(0x0a0a0a01),
+    };
+    /* via 10.10.10.2 */
+    ip46_address_t nh_10_10_10_2 = {
+       .ip4.as_u32 = clib_host_to_net_u32(0x0a0a0a02),
+    };
+
+    tm = &test_main;
+
+    /* Find or create FIB table 11 */
+    fib_index = fib_table_find_or_create_and_lock(FIB_PROTOCOL_IP4, 11);
+
+    for (ii = 0; ii < 4; ii++)
+    {
+       ip4_main.fib_index_by_sw_if_index[tm->hw[ii]->sw_if_index] = fib_index;
+    }
+
+    fib_prefix_t pfx_0_0_0_0_s_0 = {
+       .fp_len = 0,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4 = {
+               {0}
+           },
+       },
+    };
+
+    fib_prefix_t pfx = {
+       .fp_len = 0,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4 = {
+               {0}
+           },
+       },
+    };
+
+    dpo_drop = drop_dpo_get(DPO_PROTO_IP4);
+
+    dfrt = fib_table_lookup(fib_index, &pfx_0_0_0_0_s_0);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != dfrt), "default route present");
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(dfrt)),
+            "Default route is DROP");
+
+    pfx.fp_len = 32;
+    fei = fib_table_lookup(fib_index, &pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "all zeros route present");
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "all 0s route is DROP");
+
+    pfx.fp_addr.ip4.as_u32 = clib_host_to_net_u32(0xffffffff);
+    pfx.fp_len = 32;
+    fei = fib_table_lookup(fib_index, &pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "all ones route present");
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "all 1s route is DROP");
+
+    pfx.fp_addr.ip4.as_u32 = clib_host_to_net_u32(0xe0000000);
+    pfx.fp_len = 8;
+    fei = fib_table_lookup(fib_index, &pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "all-mcast route present");
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "all-mcast route is DROP");
+
+    pfx.fp_addr.ip4.as_u32 = clib_host_to_net_u32(0xf0000000);
+    pfx.fp_len = 8;
+    fei = fib_table_lookup(fib_index, &pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "class-e route present");
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "class-e route is DROP");
+
+    /*
+     * at this stage there are 5 entries in the test FIB (plus 5 in the default),
+     * all of which are special sourced and so none of which share path-lists.
+     * There are also 6 entries, and 6 non-shared path-lists, in the v6 default
+     * table
+     */
+#define NBR (5+5+6)
+    FIB_TEST((0 == fib_path_list_db_size()),   "path list DB is empty");
+    FIB_TEST((NBR == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * add interface routes.
+     *  validate presence of /24 attached and /32 receive.
+     *  test for the presence of the receive address in the glean and local adj
+     */
+    fib_prefix_t local_pfx = {
+       .fp_len = 24,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4 = {
+               .as_u32 = clib_host_to_net_u32(0x0a0a0a0a),
+           },
+       },
+    };
+
+    fib_table_entry_update_one_path(fib_index, &local_pfx,
+                                   FIB_SOURCE_INTERFACE,
+                                   (FIB_ENTRY_FLAG_CONNECTED |
+                                    FIB_ENTRY_FLAG_ATTACHED),
+                                   FIB_PROTOCOL_IP4,
+                                   NULL,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1, // weight
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup(fib_index, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "attached interface route present");
+    FIB_TEST(((FIB_ENTRY_FLAG_ATTACHED | FIB_ENTRY_FLAG_CONNECTED) ==
+             fib_entry_get_flags(fei)),
+            "Flags set on attached interface");
+
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != ai), "attached interface route adj present");
+    adj = adj_get(ai);
+    FIB_TEST((IP_LOOKUP_NEXT_GLEAN == adj->lookup_next_index),
+            "attached interface adj is glean");
+    FIB_TEST((0 == ip46_address_cmp(&local_pfx.fp_addr,
+                                   &adj->sub_type.glean.receive_addr)),
+             "attached interface adj is receive ok");
+
+    local_pfx.fp_len = 32;
+    fib_table_entry_update_one_path(fib_index, &local_pfx,
+                                   FIB_SOURCE_INTERFACE,
+                                   (FIB_ENTRY_FLAG_CONNECTED |
+                                    FIB_ENTRY_FLAG_LOCAL),
+                                   FIB_PROTOCOL_IP4,
+                                   NULL,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1, // weight
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup(fib_index, &local_pfx);
+    FIB_TEST(((FIB_ENTRY_FLAG_LOCAL | FIB_ENTRY_FLAG_CONNECTED) ==
+             fib_entry_get_flags(fei)),
+            "Flags set on local interface");
+
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "local interface route present");
+
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    dpo = load_balance_get_bucket(dpo->dpoi_index, 0);
+    FIB_TEST((DPO_RECEIVE == dpo->dpoi_type),
+            "local interface adj is local");
+    receive_dpo_t *rd = receive_dpo_get(dpo->dpoi_index);
+
+    FIB_TEST((0 == ip46_address_cmp(&local_pfx.fp_addr,
+                                   &rd->rd_addr)),
+             "local interface adj is receive ok");
+
+    FIB_TEST((2 == fib_table_get_num_entries(fib_index,
+                                             FIB_PROTOCOL_IP4,
+                                             FIB_SOURCE_INTERFACE)),
+             "2 Interface Source'd prefixes");
+
+    /*
+     * +2 interface routes +2 non-shared path-lists
+     */
+    FIB_TEST((0 == fib_path_list_db_size()),   "path list DB is empty");
+    FIB_TEST((NBR+2 == fib_path_list_pool_size()), "path list pool size is%d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+2 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * Modify the default route to be via an adj not yet known.
+     * this sources the default route with the API source, which has
+     * a higher preference than the DEFAULT_ROUTE source
+     */
+    pfx.fp_addr.ip4.as_u32 = 0;
+    pfx.fp_len = 0;
+    fib_table_entry_path_add(fib_index, &pfx,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_1,
+                            tm->hw[0]->sw_if_index,
+                            ~0, // invalid fib index
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup(fib_index, &pfx);
+    FIB_TEST((FIB_ENTRY_FLAG_NONE == fib_entry_get_flags(fei)),
+            "Flags set on API route");
+
+    FIB_TEST((fei == dfrt), "default route same index");
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != ai), "default route adj present");
+    adj = adj_get(ai);
+    FIB_TEST((IP_LOOKUP_NEXT_ARP == adj->lookup_next_index),
+            "adj is incomplete");
+    FIB_TEST((0 == ip46_address_cmp(&nh_10_10_10_1, &adj->sub_type.nbr.next_hop)),
+             "adj nbr next-hop ok");
+    FIB_TEST((1 == fib_table_get_num_entries(fib_index,
+                                             FIB_PROTOCOL_IP4,
+                                             FIB_SOURCE_API)),
+             "1 API Source'd prefixes");
+
+    /*
+     * find the adj in the shared db
+     */
+    locked_ai = adj_nbr_add_or_lock(FIB_PROTOCOL_IP4,
+                                   FIB_LINK_IP4,
+                                   &nh_10_10_10_1,
+                                   tm->hw[0]->sw_if_index);
+    FIB_TEST((locked_ai == ai), "ADJ NBR DB find");
+    adj_unlock(locked_ai);
+
+    /*
+     * +1 shared path-list
+     */
+    FIB_TEST((1 == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+3 == fib_path_list_pool_size()), "path list pool size is%d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+2 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * remove the API source from the default route. We expected
+     * the route to remain, sourced by DEFAULT_ROUTE, and hence a DROP
+     */
+    pfx.fp_addr.ip4.as_u32 = 0;
+    pfx.fp_len = 0;
+    fib_table_entry_path_remove(fib_index, &pfx,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_10_10_10_1,
+                               tm->hw[0]->sw_if_index,
+                               ~0, // non-recursive path, so no FIB index
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup(fib_index, &pfx);
+
+    FIB_TEST((fei == dfrt), "default route same index");
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "Default route is DROP");
+
+    /*
+     * -1 shared-path-list
+     */
+    FIB_TEST((0 == fib_path_list_db_size()),   "path list DB is empty");
+    FIB_TEST((NBR+2 == fib_path_list_pool_size()), "path list pool size is%d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+2 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * Add 2 ARP entries => complete ADJs plus adj-fibs.
+     */
+    fib_prefix_t pfx_10_10_10_1_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 10.10.10.1 */
+           .ip4.as_u32 = clib_host_to_net_u32(0x0a0a0a01),
+       },
+    };
+    fib_prefix_t pfx_10_10_10_2_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 10.10.10.2 */
+           .ip4.as_u32 = clib_host_to_net_u32(0x0a0a0a02),
+       },
+    };
+    fib_prefix_t pfx_11_11_11_11_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 11.11.11.11 */
+           .ip4.as_u32 = clib_host_to_net_u32(0x0b0b0b0b),
+       },
+    };
+    u8 eth_addr[] = {
+       0xde, 0xde, 0xde, 0xba, 0xba, 0xba,
+    };
+
+    /*
+     * Add a route via an incomplete ADJ. then complete the ADJ
+     * Expect the route LB is updated to use complete adj type.
+     */
+    fei = fib_table_entry_update_one_path(fib_index,
+                                          &pfx_11_11_11_11_s_32,
+                                          FIB_SOURCE_API,
+                                          FIB_ENTRY_FLAG_ATTACHED,
+                                         FIB_PROTOCOL_IP4,
+                                          &pfx_10_10_10_1_s_32.fp_addr,
+                                          tm->hw[0]->sw_if_index,
+                                          ~0, // invalid fib index
+                                          1,
+                                          MPLS_LABEL_INVALID,
+                                          FIB_ROUTE_PATH_FLAG_NONE);
+
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    dpo1 = load_balance_get_bucket(dpo->dpoi_index, 0);
+    FIB_TEST(DPO_ADJACENCY_INCOMPLETE == dpo1->dpoi_type,
+             "11.11.11.11/32 via incomplete adj");
+
+    ai_01 = adj_nbr_add_or_lock(FIB_PROTOCOL_IP4,
+                               FIB_LINK_IP4,
+                               &pfx_10_10_10_1_s_32.fp_addr,
+                               tm->hw[0]->sw_if_index);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != ai_01), "adj created");
+    adj = adj_get(ai_01);
+    FIB_TEST((IP_LOOKUP_NEXT_ARP == adj->lookup_next_index),
+            "adj is incomplete");
+    FIB_TEST((0 == ip46_address_cmp(&pfx_10_10_10_1_s_32.fp_addr,
+                                   &adj->sub_type.nbr.next_hop)),
+             "adj nbr next-hop ok");
+
+    adj_nbr_update_rewrite(ai_01, eth_addr);
+    FIB_TEST((IP_LOOKUP_NEXT_REWRITE == adj->lookup_next_index),
+            "adj is complete");
+    FIB_TEST((0 == ip46_address_cmp(&pfx_10_10_10_1_s_32.fp_addr,
+                                   &adj->sub_type.nbr.next_hop)),
+             "adj nbr next-hop ok");
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_01 == ai), "ADJ-FIB resolves via adj");
+
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    dpo1 = load_balance_get_bucket(dpo->dpoi_index, 0);
+    FIB_TEST(DPO_ADJACENCY == dpo1->dpoi_type,
+             "11.11.11.11/32 via complete adj");
+
+    /*
+     * add the adj fib
+     */
+    fei = fib_table_entry_update_one_path(fib_index,
+                                          &pfx_10_10_10_1_s_32,
+                                          FIB_SOURCE_ADJ,
+                                          FIB_ENTRY_FLAG_ATTACHED,
+                                         FIB_PROTOCOL_IP4,
+                                          &pfx_10_10_10_1_s_32.fp_addr,
+                                          tm->hw[0]->sw_if_index,
+                                          ~0, // invalid fib index
+                                          1,
+                                          MPLS_LABEL_INVALID,
+                                          FIB_ROUTE_PATH_FLAG_NONE);
+    FIB_TEST((FIB_ENTRY_FLAG_ATTACHED  == fib_entry_get_flags(fei)),
+            "Flags set on adj-fib");
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_01 == ai), "ADJ-FIB resolves via adj");
+
+    fib_table_entry_path_remove(fib_index,
+                                &pfx_11_11_11_11_s_32,
+                                FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                                &pfx_10_10_10_1_s_32.fp_addr,
+                                tm->hw[0]->sw_if_index,
+                                ~0, // invalid fib index
+                                1,
+                                FIB_ROUTE_PATH_FLAG_NONE);
+
+    eth_addr[5] = 0xb2;
+
+    ai_02 = adj_nbr_add_or_lock(FIB_PROTOCOL_IP4,
+                               FIB_LINK_IP4,
+                               &pfx_10_10_10_2_s_32.fp_addr,
+                               tm->hw[0]->sw_if_index);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != ai_02), "adj created");
+    adj = adj_get(ai_02);
+    FIB_TEST((IP_LOOKUP_NEXT_ARP == adj->lookup_next_index),
+            "adj is incomplete");
+    FIB_TEST((0 == ip46_address_cmp(&pfx_10_10_10_2_s_32.fp_addr,
+                                   &adj->sub_type.nbr.next_hop)),
+             "adj nbr next-hop ok");
+
+    adj_nbr_update_rewrite(ai_02, eth_addr);
+    FIB_TEST((IP_LOOKUP_NEXT_REWRITE == adj->lookup_next_index),
+            "adj is complete");
+    FIB_TEST((0 == ip46_address_cmp(&pfx_10_10_10_2_s_32.fp_addr,
+                                   &adj->sub_type.nbr.next_hop)),
+             "adj nbr next-hop ok");
+    FIB_TEST((ai_01 != ai_02), "ADJs are different");
+
+    fib_table_entry_update_one_path(fib_index,
+                                   &pfx_10_10_10_2_s_32,
+                                   FIB_SOURCE_ADJ,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP4,
+                                   &pfx_10_10_10_2_s_32.fp_addr,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup(fib_index, &pfx_10_10_10_2_s_32);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_02 == ai), "ADJ-FIB resolves via adj");
+
+    /*
+     * +2 adj-fibs, and their non-shared path-lists
+     */
+    FIB_TEST((0 == fib_path_list_db_size()),   "path list DB is empty");
+    FIB_TEST((NBR+4 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+4 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * Add 2 routes via the first ADJ. Ensure path-list sharing
+     */
+    fib_prefix_t pfx_1_1_1_1_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 1.1.1.1/32 */
+           .ip4.as_u32 = clib_host_to_net_u32(0x01010101),
+       },
+    };
+
+    fib_table_entry_path_add(fib_index,
+                            &pfx_1_1_1_1_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_1,
+                            tm->hw[0]->sw_if_index,
+                            ~0, // invalid fib index
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup(fib_index, &pfx_1_1_1_1_s_32);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_01 == ai), "1.1.1.1 resolves via 10.10.10.1");
+
+    /*
+     * +1 entry and a shared path-list
+     */
+    FIB_TEST((1 == fib_path_list_db_size()),   "path list DB is empty");
+    FIB_TEST((NBR+5 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+5 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /* 1.1.2.0/24 */
+    fib_prefix_t pfx_1_1_2_0_s_24 = {
+       .fp_len = 24,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4.as_u32 = clib_host_to_net_u32(0x01010200),
+       }
+    };
+
+    fib_table_entry_path_add(fib_index,
+                            &pfx_1_1_2_0_s_24,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_1,
+                            tm->hw[0]->sw_if_index,
+                            ~0, // invalid fib index
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup(fib_index, &pfx_1_1_2_0_s_24);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_01 == ai), "1.1.2.0/24 resolves via 10.10.10.1");
+
+    /*
+     * +1 entry only
+     */
+    FIB_TEST((1 == fib_path_list_db_size()),   "path list DB is empty");
+    FIB_TEST((NBR+5 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+6 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * modify 1.1.2.0/24 to use multipath.
+     */
+    fib_table_entry_path_add(fib_index,
+                            &pfx_1_1_2_0_s_24,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_2,
+                            tm->hw[0]->sw_if_index,
+                            ~0, // invalid fib index
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup(fib_index, &pfx_1_1_2_0_s_24);
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+
+    dpo1 = load_balance_get_bucket(dpo->dpoi_index, 0);
+    FIB_TEST(DPO_ADJACENCY == dpo1->dpoi_type, "type is %d", dpo1->dpoi_type);
+    FIB_TEST((ai_01 == dpo1->dpoi_index),
+            "1.1.2.0/24 bucket 0 resolves via 10.10.10.1 (%d=%d)",
+             ai_01, dpo1->dpoi_index);
+
+    dpo1 = load_balance_get_bucket(dpo->dpoi_index, 1);
+    FIB_TEST(DPO_ADJACENCY == dpo1->dpoi_type, "type is %d", dpo1->dpoi_type);
+    FIB_TEST((ai_02 == dpo1->dpoi_index),
+            "1.1.2.0/24 bucket 1 resolves via 10.10.10.2");
+
+    /*
+     * +1 shared-pathlist
+     */
+    FIB_TEST((2 == fib_path_list_db_size()),   "path list DB is empty");
+    FIB_TEST((NBR+6 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+6 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * revert the modify
+     */
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_1_1_2_0_s_24,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_10_10_10_2,
+                               tm->hw[0]->sw_if_index,
+                               ~0,
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup(fib_index, &pfx_1_1_2_0_s_24);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_01 == ai), "1.1.2.0/24 resolves via 10.10.10.1");
+
+    /*
+     * -1 shared-pathlist
+     */
+    FIB_TEST((1 == fib_path_list_db_size()),   "path list DB is %d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+5 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+6 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * Add 2 recursive routes:
+     *   100.100.100.100/32 via 1.1.1.1/32  => the via entry is installed.
+     *   100.100.100.101/32 via 1.1.1.1/32  => the via entry is installed.
+     */
+    fib_prefix_t bgp_100_pfx = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 100.100.100.100/32 */
+           .ip4.as_u32 = clib_host_to_net_u32(0x64646464),
+       },
+    };
+    /* via 1.1.1.1 */
+    ip46_address_t nh_1_1_1_1 = {
+       .ip4.as_u32 = clib_host_to_net_u32(0x01010101),
+    };
+
+    fib_table_entry_path_add(fib_index,
+                            &bgp_100_pfx,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_1_1_1_1,
+                            ~0, // no index provided.
+                            fib_index, // nexthop in same fib as route
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST_REC_FORW(&bgp_100_pfx, &pfx_1_1_1_1_s_32);
+
+    /*
+     * +1 entry and +1 shared-path-list
+     */
+    FIB_TEST((2  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+6 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+7 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    fib_prefix_t bgp_101_pfx = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 100.100.100.101/32 */
+           .ip4.as_u32 = clib_host_to_net_u32(0x64646465),
+       },
+    };
+
+    fib_table_entry_path_add(fib_index,
+                            &bgp_101_pfx,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_1_1_1_1,
+                            ~0, // no index provided.
+                            fib_index, // nexthop in same fib as route
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST_REC_FORW(&bgp_101_pfx, &pfx_1_1_1_1_s_32);
+
+    /*
+     * +1 entry, but the recursive path-list is shared.
+     */
+    FIB_TEST((2  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+6 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+8 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * An EXCLUSIVE route; one where the user (me) provides the exclusive
+     * adjacency through which the route will resolve
+     */
+    fib_prefix_t ex_pfx = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 4.4.4.4/32 */
+           .ip4.as_u32 = clib_host_to_net_u32(0x04040404),
+       },
+    };
+
+    fib_table_entry_special_add(fib_index,
+                               &ex_pfx,
+                               FIB_SOURCE_SPECIAL,
+                               FIB_ENTRY_FLAG_EXCLUSIVE,
+                               locked_ai);
+    fei = fib_table_lookup_exact_match(fib_index, &ex_pfx);
+    FIB_TEST((ai == fib_entry_get_adj(fei)),
+            "Exclusive route links to user adj");
+
+    fib_table_entry_special_remove(fib_index,
+                                  &ex_pfx,
+                                  FIB_SOURCE_SPECIAL);
+    FIB_TEST(FIB_NODE_INDEX_INVALID ==
+            fib_table_lookup_exact_match(fib_index, &ex_pfx),
+            "Exclusive reoute removed");
+
+    /*
+     * An EXCLUSIVE route; one where the user (me) provides the exclusive
+     * DPO (here a lookup DPO) through which the route will resolve
+     */
+    dpo_id_t ex_dpo = DPO_NULL;
+
+    lookup_dpo_add_or_lock_w_fib_index(fib_index,
+                                       DPO_PROTO_IP4,
+                                       LOOKUP_INPUT_DST_ADDR,
+                                       LOOKUP_TABLE_FROM_CONFIG,
+                                       &ex_dpo);
+
+    fib_table_entry_special_dpo_add(fib_index,
+                                    &ex_pfx,
+                                    FIB_SOURCE_SPECIAL,
+                                    FIB_ENTRY_FLAG_EXCLUSIVE,
+                                    &ex_dpo);
+    fei = fib_table_lookup_exact_match(fib_index, &ex_pfx);
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(!dpo_cmp(&ex_dpo, load_balance_get_bucket(dpo->dpoi_index, 0)),
+            "exclusive remote uses lookup DPO");
+
+    fib_table_entry_special_remove(fib_index,
+                                  &ex_pfx,
+                                  FIB_SOURCE_SPECIAL);
+    FIB_TEST(FIB_NODE_INDEX_INVALID ==
+            fib_table_lookup_exact_match(fib_index, &ex_pfx),
+            "Exclusive reoute removed");
+    dpo_reset(&ex_dpo);
+
+    /*
+     * Add a recursive route:
+     *   200.200.200.200/32 via 1.1.1.2/32  => the via entry is NOT installed.
+     */
+    fib_prefix_t bgp_200_pfx = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 200.200.200.200/32 */
+           .ip4.as_u32 = clib_host_to_net_u32(0xc8c8c8c8),
+       },
+    };
+    /* via 1.1.1.2 */
+    fib_prefix_t pfx_1_1_1_2_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4.as_u32 = clib_host_to_net_u32(0x01010102),
+       },
+    };
+
+    fib_table_entry_path_add(fib_index,
+                            &bgp_200_pfx,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &pfx_1_1_1_2_s_32.fp_addr,
+                            ~0, // no index provided.
+                            fib_index, // nexthop in same fib as route
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST_REC_FORW(&bgp_200_pfx, &pfx_1_1_1_2_s_32);
+
+    /*
+     * the adj should be recursive via drop, since the route resolves via
+     * the default route, which is itself a DROP 
+     */
+    fei = fib_table_lookup(fib_index, &pfx_1_1_1_2_s_32);
+    dpo1 = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(load_balance_is_drop(dpo1), "1.1.1.2/32 is drop");
+
+    /*
+     * +2 entry and +1 shared-path-list
+     */
+    FIB_TEST((3  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+7 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+10 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * Unequal Cost load-balance. 3:1 ratio. fits in a 4 bucket LB
+     */
+    fib_prefix_t pfx_1_2_3_4_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4.as_u32 = clib_host_to_net_u32(0x01020304),
+       },
+    };
+    fib_table_entry_path_add(fib_index,
+                            &pfx_1_2_3_4_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                             &nh_10_10_10_2,
+                             tm->hw[0]->sw_if_index,
+                             ~0,
+                             1,
+                             MPLS_LABEL_INVALID,
+                             FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_entry_path_add(fib_index,
+                                   &pfx_1_2_3_4_s_32,
+                                   FIB_SOURCE_API,
+                                   FIB_ENTRY_FLAG_NONE,
+                                  FIB_PROTOCOL_IP4,
+                                   &nh_10_10_10_1,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0,
+                                   3,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "1.2.3.4/32 presnet");
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    lb = load_balance_get(dpo->dpoi_index);
+    FIB_TEST((lb->lb_n_buckets == 4),
+             "1.2.3.4/32 LB has %d bucket",
+             lb->lb_n_buckets);
+
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_4_s_32, 0, ai_01);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_4_s_32, 1, ai_01);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_4_s_32, 2, ai_01);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_4_s_32, 3, ai_02);
+
+    fib_table_entry_delete(fib_index,
+                           &pfx_1_2_3_4_s_32,
+                           FIB_SOURCE_API);
+
+    /*
+     * Unequal Cost load-balance. 4:1 ratio.
+     *  fits in a 16 bucket LB with ratio 13:3
+     */
+    fib_prefix_t pfx_1_2_3_5_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4.as_u32 = clib_host_to_net_u32(0x01020305),
+       },
+    };
+    fib_table_entry_path_add(fib_index,
+                            &pfx_1_2_3_5_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                             &nh_10_10_10_2,
+                             tm->hw[0]->sw_if_index,
+                             ~0,
+                             1,
+                             MPLS_LABEL_INVALID,
+                             FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_entry_path_add(fib_index,
+                                   &pfx_1_2_3_5_s_32,
+                                   FIB_SOURCE_API,
+                                   FIB_ENTRY_FLAG_NONE,
+                                  FIB_PROTOCOL_IP4,
+                                   &nh_10_10_10_1,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0,
+                                   4,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "1.2.3.5/32 presnet");
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    lb = load_balance_get(dpo->dpoi_index);
+    FIB_TEST((lb->lb_n_buckets == 16),
+             "1.2.3.5/32 LB has %d bucket",
+             lb->lb_n_buckets);
+
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_5_s_32, 0, ai_01);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_5_s_32, 1, ai_01);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_5_s_32, 2, ai_01);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_5_s_32, 3, ai_01);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_5_s_32, 4, ai_01);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_5_s_32, 5, ai_01);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_5_s_32, 6, ai_01);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_5_s_32, 7, ai_01);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_5_s_32, 8, ai_01);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_5_s_32, 9, ai_01);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_5_s_32, 10, ai_01);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_5_s_32, 11, ai_01);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_5_s_32, 12, ai_01);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_5_s_32, 13, ai_02);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_5_s_32, 14, ai_02);
+    FIB_TEST_LB_BUCKET_VIA_ADJ(&pfx_1_2_3_5_s_32, 15, ai_02);
+
+    fib_table_entry_delete(fib_index,
+                           &pfx_1_2_3_5_s_32,
+                           FIB_SOURCE_API);
+
+    /*
+     * Add a recursive route:
+     *   200.200.200.201/32 via 1.1.1.200/32  => the via entry is NOT installed.
+     */
+    fib_prefix_t bgp_201_pfx = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 200.200.200.201/32 */
+           .ip4.as_u32 = clib_host_to_net_u32(0xc8c8c8c9),
+       },
+    };
+    /* via 1.1.1.200 */
+    fib_prefix_t pfx_1_1_1_200_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4.as_u32 = clib_host_to_net_u32(0x010101c8),
+       },
+    };
+
+    fib_table_entry_path_add(fib_index,
+                            &bgp_201_pfx,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &pfx_1_1_1_200_s_32.fp_addr,
+                            ~0, // no index provided.
+                            fib_index, // nexthop in same fib as route
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST_REC_FORW(&bgp_201_pfx, &pfx_1_1_1_200_s_32);
+
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_1_1_1_200_s_32);
+    FIB_TEST((FIB_ENTRY_FLAG_NONE == fib_entry_get_flags(fei)),
+            "Flags set on RR via non-attached");
+
+    /*
+     * +2 entry (BGP & RR) and +1 shared-path-list
+     */
+    FIB_TEST((4  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+8 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+12 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * insert a route that covers the missing 1.1.1.2/32. we expect
+     * 200.200.200.200/32 and 200.200.200.201/32 to resolve through it.
+     */
+    fib_prefix_t pfx_1_1_1_0_s_24 = {
+       .fp_len = 24,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 1.1.1.0/24 */
+           .ip4.as_u32 = clib_host_to_net_u32(0x01010100),
+       },
+    };
+
+    fib_table_entry_path_add(fib_index,
+                            &pfx_1_1_1_0_s_24,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_1,
+                            tm->hw[0]->sw_if_index,
+                            ~0, // invalid fib index
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup(fib_index, &pfx_1_1_1_0_s_24);
+    dpo1 = fib_entry_contribute_ip_forwarding(fei);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_01 == ai), "1.1.1.0/24 resolves via 10.10.10.1");
+    fei = fib_table_lookup(fib_index, &pfx_1_1_1_2_s_32);
+    dpo1 = fib_entry_contribute_ip_forwarding(fei);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_01 == ai), "1.1.1.2/32 resolves via 10.10.10.1");
+    fei = fib_table_lookup(fib_index, &pfx_1_1_1_200_s_32);
+    dpo1 = fib_entry_contribute_ip_forwarding(fei);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_01 == ai), "1.1.1.200/24 resolves via 10.10.10.1");
+
+    /*
+     * +1 entry. 1.1.1.1/32 already uses 10.10.10.1 so no new path-list
+     */
+    FIB_TEST((4  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+8 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+13 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * the recursive adj for 200.200.200.200 should be updated.
+     */
+    FIB_TEST_REC_FORW(&bgp_201_pfx, &pfx_1_1_1_200_s_32);
+    FIB_TEST_REC_FORW(&bgp_200_pfx, &pfx_1_1_1_2_s_32);
+
+    /*
+     * insert a more specific route than 1.1.1.0/24 that also covers the
+     * missing 1.1.1.2/32, but not 1.1.1.200/32. we expect
+     * 200.200.200.200 to resolve through it.
+     */
+    fib_prefix_t pfx_1_1_1_0_s_28 = {
+       .fp_len = 28,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 1.1.1.0/28 */
+           .ip4.as_u32 = clib_host_to_net_u32(0x01010100),
+       },
+    };
+
+    fib_table_entry_path_add(fib_index,
+                            &pfx_1_1_1_0_s_28,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_2,
+                            tm->hw[0]->sw_if_index,
+                            ~0, // invalid fib index
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup(fib_index, &pfx_1_1_1_0_s_28);
+    dpo2 = fib_entry_contribute_ip_forwarding(fei);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_02 == ai), "1.1.1.0/24 resolves via 10.10.10.2");
+
+    /*
+     * +1 entry. +1 shared path-list
+     */
+    FIB_TEST((5  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+9 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+14 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * the recursive adj for 200.200.200.200 should be updated.
+     * 200.200.200.201 remains unchanged.
+     */
+    FIB_TEST_REC_FORW(&bgp_201_pfx, &pfx_1_1_1_200_s_32);
+    FIB_TEST_REC_FORW(&bgp_200_pfx, &pfx_1_1_1_2_s_32);
+
+    /*
+     * remove this /28. 200.200.200.200/32 should revert back to via 1.1.1.0/24
+     */
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_1_1_1_0_s_28,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_10_10_10_2,
+                               tm->hw[0]->sw_if_index,
+                               ~0,
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+    FIB_TEST((fib_table_lookup_exact_match(fib_index, &pfx_1_1_1_0_s_28) == 
+             FIB_NODE_INDEX_INVALID),
+            "1.1.1.0/28 removed");
+    FIB_TEST((fib_table_lookup(fib_index, &pfx_1_1_1_0_s_28) == 
+             fib_table_lookup(fib_index, &pfx_1_1_1_0_s_24)),
+            "1.1.1.0/28 lookup via /24");
+    FIB_TEST_REC_FORW(&bgp_201_pfx, &pfx_1_1_1_200_s_32);
+    FIB_TEST_REC_FORW(&bgp_200_pfx, &pfx_1_1_1_2_s_32);
+
+    /*
+     * -1 entry. -1 shared path-list
+     */
+    FIB_TEST((4  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+8 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+13 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * remove 1.1.1.0/24. 200.200.200.200/32 should revert back to via 0.0.0.0/0
+     */
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_1_1_1_0_s_24,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_10_10_10_1,
+                               tm->hw[0]->sw_if_index,
+                               ~0,
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+    FIB_TEST((fib_table_lookup_exact_match(fib_index, &pfx_1_1_1_0_s_24) == 
+             FIB_NODE_INDEX_INVALID),
+            "1.1.1.0/24 removed");
+
+    fei = fib_table_lookup(fib_index, &pfx_1_1_1_2_s_32);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "1.1.1.2/32 route is DROP");
+    fei = fib_table_lookup(fib_index, &pfx_1_1_1_200_s_32);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "1.1.1.200/32 route is DROP");
+
+    FIB_TEST_REC_FORW(&bgp_201_pfx, &pfx_1_1_1_200_s_32);
+    FIB_TEST_REC_FORW(&bgp_200_pfx, &pfx_1_1_1_2_s_32);
+
+    /*
+     * -1 entry
+     */
+    FIB_TEST((4  == fib_path_list_db_size()),   "path list DB population:%d",
+       fib_path_list_db_size());
+    FIB_TEST((NBR+8 == fib_path_list_pool_size()), "path list pool size is %d",
+       fib_path_list_pool_size());
+    FIB_TEST((NBR+12 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * insert the missing 1.1.1.2/32
+     */
+    fei = fib_table_entry_path_add(fib_index,
+                                  &pfx_1_1_1_2_s_32,
+                                  FIB_SOURCE_API,
+                                  FIB_ENTRY_FLAG_NONE,
+                                  FIB_PROTOCOL_IP4,
+                                  &nh_10_10_10_1,
+                                  tm->hw[0]->sw_if_index,
+                                  ~0, // invalid fib index
+                                  1,
+                                  MPLS_LABEL_INVALID,
+                                  FIB_ROUTE_PATH_FLAG_NONE);
+    dpo1 = fib_entry_contribute_ip_forwarding(fei);
+    /*
+     * Compare, do not assign: the adj the new entry links to must be the
+     * one for 10.10.10.1. With '=' the tested value would be ai_01 itself,
+     * so the check could never detect a wrong adjacency on the entry.
+     */
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_01 == ai), "1.1.1.2/32 resolves via 10.10.10.1");
+
+    FIB_TEST_REC_FORW(&bgp_201_pfx, &pfx_1_1_1_200_s_32);
+    FIB_TEST_REC_FORW(&bgp_200_pfx, &pfx_1_1_1_2_s_32);
+
+    /*
+     * no change. 1.1.1.2/32 was already there RR sourced.
+     */
+    FIB_TEST((4  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+8 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+12 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * remove 200.200.200.201/32 which does not have a valid via FIB
+     */
+    fib_table_entry_path_remove(fib_index,
+                               &bgp_201_pfx,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &pfx_1_1_1_200_s_32.fp_addr,
+                               ~0, // no index provided.
+                               fib_index,
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+
+    /*
+     * -2 entries (BGP and RR). -1 shared path-list;
+     */
+    FIB_TEST((fib_table_lookup_exact_match(fib_index, &bgp_201_pfx) == 
+             FIB_NODE_INDEX_INVALID),
+            "200.200.200.201/32 removed");
+    FIB_TEST((fib_table_lookup_exact_match(fib_index, &pfx_1_1_1_200_s_32) == 
+             FIB_NODE_INDEX_INVALID),
+            "1.1.1.200/32 removed");
+
+    FIB_TEST((3  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+7 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+10 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * remove 200.200.200.200/32 which does have a valid via FIB
+     */
+    fib_table_entry_path_remove(fib_index,
+                               &bgp_200_pfx,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &pfx_1_1_1_2_s_32.fp_addr,
+                               ~0, // no index provided.
+                               fib_index,
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST((fib_table_lookup_exact_match(fib_index, &bgp_200_pfx) == 
+             FIB_NODE_INDEX_INVALID),
+            "200.200.200.200/32 removed");
+    FIB_TEST((fib_table_lookup_exact_match(fib_index, &pfx_1_1_1_2_s_32) != 
+             FIB_NODE_INDEX_INVALID),
+            "1.1.1.2/32 still present");
+
+    /*
+     * -1 entry (BGP, the RR source is also API sourced). -1 shared path-list;
+     */
+    FIB_TEST((2  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+6 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+9 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * A recursive prefix that has a 2 path load-balance.
+     * It also shares a next-hop with other BGP prefixes and hence
+     * tests the ref counting of RR sourced prefixes and 2 level LB.
+     */
+    const fib_prefix_t bgp_102 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 100.100.100.102/32 */
+           .ip4.as_u32 = clib_host_to_net_u32(0x64646466),
+       },
+    };
+    fib_table_entry_path_add(fib_index,
+                            &bgp_102,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &pfx_1_1_1_1_s_32.fp_addr,
+                            ~0, // no index provided.
+                            fib_index, // same as route
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fib_table_entry_path_add(fib_index,
+                            &bgp_102,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &pfx_1_1_1_2_s_32.fp_addr,
+                            ~0, // no index provided.
+                            fib_index, // same as route's FIB
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(fib_index, &bgp_102);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "100.100.100.102/32 presnet");
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+
+    fei  = fib_table_lookup_exact_match(fib_index, &pfx_1_1_1_1_s_32);
+    dpo1 = fib_entry_contribute_ip_forwarding(fei);
+    fei  = fib_table_lookup_exact_match(fib_index, &pfx_1_1_1_2_s_32);
+    dpo2 = fib_entry_contribute_ip_forwarding(fei);
+
+    lb = load_balance_get(dpo->dpoi_index);
+    FIB_TEST((lb->lb_n_buckets == 2), "Recursive LB has %d bucket", lb->lb_n_buckets);
+    FIB_TEST(!dpo_cmp(dpo1, load_balance_get_bucket(dpo->dpoi_index, 0)),
+            "First via 10.10.10.1");
+    FIB_TEST(!dpo_cmp(dpo2, load_balance_get_bucket(dpo->dpoi_index, 1)),
+            "Second via 10.10.10.1");
+
+    fib_table_entry_path_remove(fib_index,
+                               &bgp_102,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &pfx_1_1_1_1_s_32.fp_addr,
+                               ~0, // no index provided.
+                               fib_index, // same as route's FIB
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+    fib_table_entry_path_remove(fib_index,
+                               &bgp_102,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &pfx_1_1_1_2_s_32.fp_addr,
+                               ~0, // no index provided.
+                               fib_index, // same as route's FIB
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(fib_index, &bgp_102);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "100.100.100.102/32 removed");
+
+    /*
+     * remove the remaining recursives
+     */
+    fib_table_entry_path_remove(fib_index,
+                               &bgp_100_pfx,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &pfx_1_1_1_1_s_32.fp_addr,
+                               ~0, // no index provided.
+                               fib_index, // same as route's FIB
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+    fib_table_entry_path_remove(fib_index,
+                               &bgp_101_pfx,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &pfx_1_1_1_1_s_32.fp_addr,
+                               ~0, // no index provided.
+                               fib_index, // same as route's FIB
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+    FIB_TEST((fib_table_lookup_exact_match(fib_index, &bgp_100_pfx) == 
+             FIB_NODE_INDEX_INVALID),
+            "100.100.100.100/32 removed");
+    FIB_TEST((fib_table_lookup_exact_match(fib_index, &bgp_101_pfx) == 
+             FIB_NODE_INDEX_INVALID),
+            "100.100.100.101/32 removed");
+
+    /*
+     * -2 entry (2*BGP, the RR source is also API sourced). -1 shared path-list;
+     */
+    FIB_TEST((1  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+5 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+7 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * Add a recursive route via a connected cover, using an adj-fib that does exist
+     */
+    fib_table_entry_path_add(fib_index,
+                            &bgp_200_pfx,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_1,
+                            ~0, // no index provided.
+                            fib_index, // Same as route's FIB
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+
+    /*
+     * +1 entry. +1 shared path-list (recursive via 10.10.10.1)
+     */
+    FIB_TEST((2  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+6 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+8 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    fei = fib_table_lookup_exact_match(fib_index, &bgp_200_pfx);
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+
+    fei  = fib_table_lookup_exact_match(fib_index, &pfx_10_10_10_1_s_32);
+    dpo1 = fib_entry_contribute_ip_forwarding(fei);
+
+    FIB_TEST(!dpo_cmp(dpo1, load_balance_get_bucket(dpo->dpoi_index, 0)),
+            "200.200.200.200/32 is recursive via adj for 10.10.10.1");
+
+    FIB_TEST((FIB_ENTRY_FLAG_ATTACHED  == fib_entry_get_flags(fei)),
+            "Flags set on RR via existing attached");
+
+    /*
+     * Add a recursive route via a connected cover, using an adj-fib that does
+     * not exist
+     */
+    ip46_address_t nh_10_10_10_3 = {
+       .ip4.as_u32 = clib_host_to_net_u32(0x0a0a0a03),
+    };
+    fib_prefix_t pfx_10_10_10_3 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = nh_10_10_10_3,
+    };
+
+    fib_table_entry_path_add(fib_index,
+                            &bgp_201_pfx,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_3,
+                            ~0, // no index provided.
+                            fib_index,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+
+    /*
+     * +2 entries (BGP and RR). +1 shared path-list (recursive via 10.10.10.3) and
+     * one unshared non-recursive via 10.10.10.3
+     */
+    FIB_TEST((3  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+8 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+10 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    ai_03 = adj_nbr_add_or_lock(FIB_PROTOCOL_IP4,
+                               FIB_LINK_IP4,
+                               &nh_10_10_10_3,
+                               tm->hw[0]->sw_if_index);
+
+    fei  = fib_table_lookup_exact_match(fib_index, &bgp_201_pfx);
+    dpo  = fib_entry_contribute_ip_forwarding(fei);
+    fei  = fib_table_lookup_exact_match(fib_index, &pfx_10_10_10_3);
+    dpo1 = fib_entry_contribute_ip_forwarding(fei);
+
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai == ai_03), "adj for 10.10.10.3/32 is via adj for 10.10.10.3");
+    FIB_TEST(((FIB_ENTRY_FLAG_ATTACHED | FIB_ENTRY_FLAG_CONNECTED) ==
+             fib_entry_get_flags(fei)),
+            "Flags set on RR via non-existing attached");
+
+    FIB_TEST(!dpo_cmp(dpo1, load_balance_get_bucket(dpo->dpoi_index, 0)),
+            "adj for 200.200.200.200/32 is recursive via adj for 10.10.10.3");
+
+    adj_unlock(ai_03);
+
+    /*
+     * remove the recursives
+     */
+    fib_table_entry_path_remove(fib_index,
+                               &bgp_200_pfx,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_10_10_10_1,
+                               ~0, // no index provided.
+                               fib_index, // same as route's FIB
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+    fib_table_entry_path_remove(fib_index,
+                               &bgp_201_pfx,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_10_10_10_3,
+                               ~0, // no index provided.
+                               fib_index, // same as route's FIB
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST((fib_table_lookup_exact_match(fib_index, &bgp_201_pfx) ==
+             FIB_NODE_INDEX_INVALID),
+            "200.200.200.201/32 removed");
+    FIB_TEST((fib_table_lookup_exact_match(fib_index, &bgp_200_pfx) ==
+             FIB_NODE_INDEX_INVALID),
+            "200.200.200.200/32 removed");
+    FIB_TEST((fib_table_lookup_exact_match(fib_index, &pfx_10_10_10_3) ==
+             FIB_NODE_INDEX_INVALID),
+            "10.10.10.3/32 removed");
+
+    /*
+     * -3 entries (2*BGP and RR). -2 shared path-list (recursive via 10.10.10.3 &
+     *  10.10.10.1) and one unshared non-recursive via 10.10.10.3
+     */
+    FIB_TEST((1  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+5 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+7 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+
+    /*
+     * RECURSION LOOPS
+     *  Add 5.5.5.5/32 -> 5.5.5.6/32 -> 5.5.5.7/32 -> 5.5.5.5/32
+     */
+    fib_prefix_t pfx_5_5_5_5_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4.as_u32 = clib_host_to_net_u32(0x05050505),
+       },
+    };
+    fib_prefix_t pfx_5_5_5_6_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4.as_u32 = clib_host_to_net_u32(0x05050506),
+       },
+    };
+    fib_prefix_t pfx_5_5_5_7_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4.as_u32 = clib_host_to_net_u32(0x05050507),
+       },
+    };
+
+    fib_table_entry_path_add(fib_index,
+                            &pfx_5_5_5_5_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &pfx_5_5_5_6_s_32.fp_addr,
+                            ~0, // no index provided.
+                            fib_index,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fib_table_entry_path_add(fib_index,
+                            &pfx_5_5_5_6_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &pfx_5_5_5_7_s_32.fp_addr,
+                            ~0, // no index provided.
+                            fib_index,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fib_table_entry_path_add(fib_index,
+                            &pfx_5_5_5_7_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &pfx_5_5_5_5_s_32.fp_addr,
+                            ~0, // no index provided.
+                            fib_index,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    /*
+     * +3 entries, +3 shared path-list
+     */
+    FIB_TEST((4  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+8 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+10 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * All the entries have only looped paths, so they are all drop
+     */
+    fei = fib_table_lookup(fib_index, &pfx_5_5_5_7_s_32);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "LB for 5.5.5.7/32 is via adj for DROP");
+    fei = fib_table_lookup(fib_index, &pfx_5_5_5_5_s_32);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "LB for 5.5.5.5/32 is via adj for DROP");
+    fei = fib_table_lookup(fib_index, &pfx_5_5_5_6_s_32);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "LB for 5.5.5.6/32 is via adj for DROP");
+
+    /*
+     * provide 5.5.5.6/32 with alternate path.
+     * this will allow only 5.5.5.6/32 to forward with this path, the others
+     * are still drop since the loop is still present.
+     */
+    fib_table_entry_path_add(fib_index,
+                            &pfx_5_5_5_6_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_1,
+                            tm->hw[0]->sw_if_index,
+                            ~0,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+
+
+    fei = fib_table_lookup(fib_index, &pfx_5_5_5_6_s_32);
+    dpo1 = fib_entry_contribute_ip_forwarding(fei);
+
+    lb = load_balance_get(dpo1->dpoi_index);
+    FIB_TEST((lb->lb_n_buckets == 1), "5.5.5.6 LB has %d bucket", lb->lb_n_buckets);
+
+    dpo2 = load_balance_get_bucket(dpo1->dpoi_index, 0);
+    FIB_TEST(DPO_ADJACENCY == dpo2->dpoi_type, "type is %d", dpo2->dpoi_type);
+    FIB_TEST((ai_01 == dpo2->dpoi_index),
+            "5.5.5.6 bucket 0 resolves via 10.10.10.2");
+
+    fei = fib_table_lookup(fib_index, &pfx_5_5_5_7_s_32);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "LB for 5.5.5.7/32 is via adj for DROP");
+    fei = fib_table_lookup(fib_index, &pfx_5_5_5_5_s_32);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "LB for 5.5.5.5/32 is via adj for DROP");
+
+    /*
+     * remove the alternate path for 5.5.5.6/32
+     * back to all drop
+     */
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_5_5_5_6_s_32,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_10_10_10_1,
+                               tm->hw[0]->sw_if_index,
+                               ~0,
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup(fib_index, &pfx_5_5_5_7_s_32);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "LB for 5.5.5.7/32 is via adj for DROP");
+    fei = fib_table_lookup(fib_index, &pfx_5_5_5_5_s_32);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "LB for 5.5.5.5/32 is via adj for DROP");
+    fei = fib_table_lookup(fib_index, &pfx_5_5_5_6_s_32);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "LB for 5.5.5.6/32 is via adj for DROP");
+
+    /*
+     * break the loop by giving 5.5.5.5/32 a new set of paths
+     * expect all to forward via this new path.
+     */
+    fib_table_entry_update_one_path(fib_index,
+                                   &pfx_5_5_5_5_s_32,
+                                   FIB_SOURCE_API,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP4,
+                                   &nh_10_10_10_1,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup(fib_index, &pfx_5_5_5_5_s_32);
+    dpo1 = fib_entry_contribute_ip_forwarding(fei);
+    lb = load_balance_get(dpo1->dpoi_index);
+    FIB_TEST((lb->lb_n_buckets == 1), "5.5.5.5 LB has %d bucket", lb->lb_n_buckets);
+
+    dpo2 = load_balance_get_bucket(dpo1->dpoi_index, 0);
+    FIB_TEST(DPO_ADJACENCY == dpo2->dpoi_type, "type is %d", dpo2->dpoi_type);
+    FIB_TEST((ai_01 == dpo2->dpoi_index),
+            "5.5.5.5 bucket 0 resolves via 10.10.10.2");
+
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_5_5_5_7_s_32);
+    dpo2 = fib_entry_contribute_ip_forwarding(fei);
+
+    lb = load_balance_get(dpo2->dpoi_index);
+    FIB_TEST((lb->lb_n_buckets == 1), "Recursive LB has %d bucket", lb->lb_n_buckets);
+    FIB_TEST(!dpo_cmp(dpo1, load_balance_get_bucket(dpo2->dpoi_index, 0)),
+            "5.5.5.5.7 via 5.5.5.5");
+
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_5_5_5_6_s_32);
+    dpo1 = fib_entry_contribute_ip_forwarding(fei);
+
+    lb = load_balance_get(dpo1->dpoi_index);
+    FIB_TEST((lb->lb_n_buckets == 1), "Recursive LB has %d bucket", lb->lb_n_buckets);
+    FIB_TEST(!dpo_cmp(dpo2, load_balance_get_bucket(dpo1->dpoi_index, 0)),
+            "5.5.5.5.6 via 5.5.5.7");
+
+    /*
+     * revert back to the loop. so we can remove the prefixes with
+     * the loop intact
+     */
+    fib_table_entry_update_one_path(fib_index,
+                                   &pfx_5_5_5_5_s_32,
+                                   FIB_SOURCE_API,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP4,
+                                   &pfx_5_5_5_6_s_32.fp_addr,
+                                   ~0, // no index provided.
+                                   fib_index,
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup(fib_index, &pfx_5_5_5_7_s_32);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "LB for 5.5.5.7/32 is via adj for DROP");
+    fei = fib_table_lookup(fib_index, &pfx_5_5_5_5_s_32);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "LB for 5.5.5.5/32 is via adj for DROP");
+    fei = fib_table_lookup(fib_index, &pfx_5_5_5_6_s_32);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "LB for 5.5.5.6/32 is via adj for DROP");
+
+    /*
+     * remove all the 5.5.5.x/32 prefixes
+     */
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_5_5_5_5_s_32,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &pfx_5_5_5_6_s_32.fp_addr,
+                               ~0, // no index provided.
+                               fib_index, // same as route's FIB
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_5_5_5_6_s_32,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &pfx_5_5_5_7_s_32.fp_addr,
+                               ~0, // no index provided.
+                               fib_index, // same as route's FIB
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_5_5_5_7_s_32,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &pfx_5_5_5_5_s_32.fp_addr,
+                               ~0, // no index provided.
+                               fib_index, // same as route's FIB
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_5_5_5_6_s_32,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_10_10_10_2,
+                               ~0, // no index provided.
+                               fib_index, // same as route's FIB
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+
+    /*
+     * -3 entries, -3 shared path-list
+     */
+    FIB_TEST((1  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+5 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+7 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * Single level loop 5.5.5.6/32 via 5.5.5.6/32
+     */
+    fib_table_entry_path_add(fib_index,
+                            &pfx_5_5_5_6_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &pfx_5_5_5_6_s_32.fp_addr,
+                            ~0, // no index provided.
+                            fib_index,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup(fib_index, &pfx_5_5_5_6_s_32);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+            "1-level 5.5.5.6/32 loop is via adj for DROP");
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_5_5_5_6_s_32,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &pfx_5_5_5_6_s_32.fp_addr,
+                               ~0, // no index provided.
+                               fib_index, // same as route's FIB
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+    FIB_TEST(FIB_NODE_INDEX_INVALID ==
+            fib_table_lookup_exact_match(fib_index, &pfx_5_5_5_6_s_32),
+            "1-level 5.5.5.6/32 loop is removed");
+
+    /*
+     * add-remove test. no change.
+     */
+    FIB_TEST((1  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+5 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+7 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * A recursive route with recursion constraints.
+     *  200.200.200.200/32 via 1.1.1.1 is constrained to recurse via a host route
+     */
+    fib_table_entry_path_add(fib_index,
+                            &bgp_200_pfx,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_1_1_1_1,
+                            ~0,
+                            fib_index,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_RESOLVE_VIA_HOST);
+
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_1_1_1_1_s_32);
+    dpo2 = fib_entry_contribute_ip_forwarding(fei);
+
+    fei = fib_table_lookup_exact_match(fib_index, &bgp_200_pfx);
+    dpo1 = fib_entry_contribute_ip_forwarding(fei);
+
+    FIB_TEST(!dpo_cmp(dpo2, load_balance_get_bucket(dpo1->dpoi_index, 0)),
+            "adj for 200.200.200.200/32 is recursive via adj for 1.1.1.1");
+
+    /*
+     * save the load-balance. we expect it to be inplace modified
+     */
+    lb = load_balance_get(dpo1->dpoi_index);
+
+    /*
+     * add a covering prefix for the via fib that would otherwise serve
+     * as the resolving route when the host is removed
+     */
+    fib_table_entry_path_add(fib_index,
+                            &pfx_1_1_1_0_s_28,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_1,
+                            tm->hw[0]->sw_if_index,
+                            ~0, // invalid fib index
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_1_1_1_0_s_28);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai == ai_01),
+            "adj for 1.1.1.0/28 is via adj for 1.1.1.1");
+
+    /*
+     * remove the host via FIB - expect the BGP prefix to be drop
+     */
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_1_1_1_1_s_32,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_10_10_10_1,
+                               tm->hw[0]->sw_if_index,
+                               ~0, // invalid fib index
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST(!dpo_cmp(dpo_drop, load_balance_get_bucket(dpo1->dpoi_index, 0)),
+            "adj for 200.200.200.200/32 is recursive via adj for DROP");
+
+    /*
+     * add the via-entry host route back. expect to resolve again
+     */
+    fib_table_entry_path_add(fib_index,
+                            &pfx_1_1_1_1_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_1,
+                            tm->hw[0]->sw_if_index,
+                            ~0, // invalid fib index
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    FIB_TEST(!dpo_cmp(dpo2, load_balance_get_bucket(dpo1->dpoi_index, 0)),
+            "adj for 200.200.200.200/32 is recursive via adj for 1.1.1.1");
+
+    /*
+     * add another path for the recursive. it will then have 2.
+     */
+    fib_prefix_t pfx_1_1_1_3_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4.as_u32 = clib_host_to_net_u32(0x01010103),
+       },
+    };
+    fib_table_entry_path_add(fib_index,
+                            &pfx_1_1_1_3_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_2,
+                            tm->hw[0]->sw_if_index,
+                            ~0, // invalid fib index
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+
+    fib_table_entry_path_add(fib_index,
+                            &bgp_200_pfx,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &pfx_1_1_1_3_s_32.fp_addr,
+                            ~0,
+                            fib_index,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_RESOLVE_VIA_HOST);
+
+    fei = fib_table_lookup_exact_match(fib_index, &bgp_200_pfx);
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_1_1_1_1_s_32);
+    dpo2 = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(!dpo_cmp(dpo2, load_balance_get_bucket(dpo->dpoi_index, 0)),
+            "adj for 200.200.200.200/32 is recursive via adj for 1.1.1.1");
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_1_1_1_3_s_32);
+    dpo1 = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(!dpo_cmp(dpo1, load_balance_get_bucket(dpo->dpoi_index, 1)),
+            "adj for 200.200.200.200/32 is recursive via adj for 1.1.1.3");
+
+    /*
+     * expect the lb-map used by the recursive's load-balance to use both buckets
+     */
+    load_balance_map_t *lbm;
+    index_t lbmi;
+
+    lb = load_balance_get(dpo->dpoi_index);
+    lbmi = lb->lb_map;
+    load_balance_map_lock(lbmi);
+    lbm = load_balance_map_get(lbmi);
+
+    FIB_TEST(lbm->lbm_buckets[0] == 0,
+             "LB maps's bucket 0 is %d",
+             lbm->lbm_buckets[0]);
+    FIB_TEST(lbm->lbm_buckets[1] == 1,
+             "LB maps's bucket 1 is %d",
+             lbm->lbm_buckets[1]);
+
+    /*
+     * withdraw one of the /32 via-entries.
+     * that ECMP path will be unresolved and forwarding should continue on the
+     * other available path. this is an iBGP PIC edge failover.
+     * Test the forwarding changes without re-fetching the adj from the
+     * recursive entry. this ensures it's the same one that is updated; i.e. an
+     * inplace-modify.
+     */
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_1_1_1_1_s_32,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_10_10_10_1,
+                               tm->hw[0]->sw_if_index,
+                               ~0, // invalid fib index
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup_exact_match(fib_index, &bgp_200_pfx);
+    FIB_TEST(!dpo_cmp(dpo, fib_entry_contribute_ip_forwarding(fei)),
+            "post PIC 200.200.200.200/32 was inplace modified");
+
+    FIB_TEST(!dpo_cmp(dpo1, load_balance_get_bucket_i(lb, 0)),
+            "post PIC adj for 200.200.200.200/32 is recursive"
+            " via adj for 1.1.1.3");
+
+    /*
+     * the LB map that was locked above should have been modified to remove
+     * the path that was down, and thus its bucket points to a path that is
+     * still up.
+     */
+    FIB_TEST(lbm->lbm_buckets[0] == 1,
+             "LB maps's bucket 0 is %d",
+             lbm->lbm_buckets[0]);
+    FIB_TEST(lbm->lbm_buckets[1] == 1,
+             "LB maps's bucket 1 is %d",
+             lbm->lbm_buckets[1]);
+
+    load_balance_map_unlock(lb->lb_map);
+
+    /*
+     * add it back again.
+     */
+    fib_table_entry_path_add(fib_index,
+                            &pfx_1_1_1_1_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_1,
+                            tm->hw[0]->sw_if_index,
+                            ~0, // invalid fib index
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST(!dpo_cmp(dpo2, load_balance_get_bucket_i(lb, 0)),
+            "post PIC recovery adj for 200.200.200.200/32 is recursive "
+            "via adj for 1.1.1.1");
+    FIB_TEST(!dpo_cmp(dpo1, load_balance_get_bucket_i(lb, 1)),
+            "post PIC recovery adj for 200.200.200.200/32 is recursive "
+            "via adj for 1.1.1.3");
+
+    fei = fib_table_lookup_exact_match(fib_index, &bgp_200_pfx);
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(lb == load_balance_get(dpo->dpoi_index),
+            "post PIC 200.200.200.200/32 was inplace modified");
+
+    /*
+     * add a 3rd path. this makes the LB 16 buckets. 
+     */
+    fib_table_entry_path_add(fib_index,
+                            &bgp_200_pfx,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &pfx_1_1_1_2_s_32.fp_addr,
+                            ~0,
+                            fib_index,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_RESOLVE_VIA_HOST);
+
+    fei = fib_table_lookup_exact_match(fib_index, &bgp_200_pfx);
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(lb == load_balance_get(dpo->dpoi_index),
+            "200.200.200.200/32 was inplace modified for 3rd path");
+    FIB_TEST(16 == lb->lb_n_buckets,
+            "200.200.200.200/32 was inplace modified for 3rd path to 16 buckets");
+
+    lbmi = lb->lb_map;
+    load_balance_map_lock(lbmi);
+    lbm = load_balance_map_get(lbmi);
+
+    for (ii = 0; ii < 16; ii++)
+    {
+        FIB_TEST(lbm->lbm_buckets[ii] == ii,
+                 "LB Map for 200.200.200.200/32 at %d is %d",
+                 ii, lbm->lbm_buckets[ii]);
+    }
+
+    /*
+     * trigger PIC by removing the first via-entry
+     * the first 6 buckets of the map should map to the next 6
+     */
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_1_1_1_1_s_32,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_10_10_10_1,
+                               tm->hw[0]->sw_if_index,
+                               ~0,
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup_exact_match(fib_index, &bgp_200_pfx);
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(lb == load_balance_get(dpo->dpoi_index),
+            "200.200.200.200/32 was inplace modified for 3rd path");
+    FIB_TEST(2 == lb->lb_n_buckets,
+            "200.200.200.200/32 was inplace modified for 3rd path remove to 2 buckets");
+
+    for (ii = 0; ii < 6; ii++)
+    {
+        FIB_TEST(lbm->lbm_buckets[ii] == ii+6,
+                 "LB Map for 200.200.200.200/32 at %d is %d",
+                 ii, lbm->lbm_buckets[ii]);
+    }
+    for (ii = 6; ii < 16; ii++)
+    {
+        FIB_TEST(lbm->lbm_buckets[ii] == ii,
+                 "LB Map for 200.200.200.200/32 at %d is %d",
+                 ii, lbm->lbm_buckets[ii]);
+    }
+
+
+    /*
+     * tidy up
+     */
+    fib_table_entry_path_add(fib_index,
+                             &pfx_1_1_1_1_s_32,
+                             FIB_SOURCE_API,
+                             FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                             &nh_10_10_10_1,
+                             tm->hw[0]->sw_if_index,
+                             ~0,
+                             1,
+                             MPLS_LABEL_INVALID,
+                             FIB_ROUTE_PATH_FLAG_NONE);
+
+    fib_table_entry_path_remove(fib_index,
+                                &bgp_200_pfx,
+                                FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                                &pfx_1_1_1_2_s_32.fp_addr,
+                                ~0,
+                                fib_index,
+                                1,
+                                MPLS_LABEL_INVALID);
+    fib_table_entry_path_remove(fib_index,
+                               &bgp_200_pfx,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_1_1_1_1,
+                               ~0,
+                               fib_index,
+                               1,
+                               FIB_ROUTE_PATH_RESOLVE_VIA_HOST);
+    fib_table_entry_path_remove(fib_index,
+                               &bgp_200_pfx,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &pfx_1_1_1_3_s_32.fp_addr,
+                               ~0,
+                               fib_index,
+                               1,
+                               FIB_ROUTE_PATH_RESOLVE_VIA_HOST);
+    fib_table_entry_delete(fib_index,
+                          &pfx_1_1_1_3_s_32,
+                          FIB_SOURCE_API);
+    fib_table_entry_delete(fib_index,
+                          &pfx_1_1_1_0_s_28,
+                          FIB_SOURCE_API);
+    FIB_TEST((FIB_NODE_INDEX_INVALID ==
+             fib_table_lookup_exact_match(fib_index, &pfx_1_1_1_0_s_28)),
+            "1.1.1.1/28 removed");
+    FIB_TEST((FIB_NODE_INDEX_INVALID ==
+             fib_table_lookup_exact_match(fib_index, &pfx_1_1_1_3_s_32)),
+            "1.1.1.3/32 removed");
+    FIB_TEST((FIB_NODE_INDEX_INVALID ==
+             fib_table_lookup_exact_match(fib_index, &bgp_200_pfx)),
+            "200.200.200.200/32 removed");
+
+    /*
+     * add-remove test. no change.
+     */
+    FIB_TEST((1  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+5 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+7 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * A route whose paths are built up iteratively and then removed
+     * all at once
+     */
+    fib_prefix_t pfx_4_4_4_4_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 4.4.4.4/32 */
+           .ip4.as_u32 = clib_host_to_net_u32(0x04040404),
+       },
+    };
+
+    fib_table_entry_path_add(fib_index,
+                            &pfx_4_4_4_4_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_1,
+                            tm->hw[0]->sw_if_index,
+                            ~0,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fib_table_entry_path_add(fib_index,
+                            &pfx_4_4_4_4_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_2,
+                            tm->hw[0]->sw_if_index,
+                            ~0,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fib_table_entry_path_add(fib_index,
+                            &pfx_4_4_4_4_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_3,
+                            tm->hw[0]->sw_if_index,
+                            ~0,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    FIB_TEST(FIB_NODE_INDEX_INVALID !=
+            fib_table_lookup_exact_match(fib_index, &pfx_4_4_4_4_s_32),
+            "4.4.4.4/32 present");
+
+    fib_table_entry_delete(fib_index,
+                          &pfx_4_4_4_4_s_32,
+                          FIB_SOURCE_API);
+    FIB_TEST(FIB_NODE_INDEX_INVALID ==
+            fib_table_lookup_exact_match(fib_index, &pfx_4_4_4_4_s_32),
+            "4.4.4.4/32 removed");
+
+    /*
+     * add-remove test. no change.
+     */
+    FIB_TEST((1  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+5 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+7 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * A route with multiple paths at once
+     */
+    fib_route_path_t *r_paths = NULL;
+
+    for (ii = 0; ii < 4; ii++)
+    {
+       fib_route_path_t r_path = {
+           .frp_proto = FIB_PROTOCOL_IP4,
+           .frp_addr = {
+               .ip4.as_u32 = clib_host_to_net_u32(0x0a0a0a02 + ii),
+           },
+           .frp_sw_if_index = tm->hw[0]->sw_if_index,
+           .frp_weight = 1,
+           .frp_fib_index = ~0,
+       };
+       vec_add1(r_paths, r_path);
+    }
+
+    fib_table_entry_update(fib_index,
+                          &pfx_4_4_4_4_s_32,
+                          FIB_SOURCE_API,
+                          FIB_ENTRY_FLAG_NONE,
+                          r_paths);
+
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_4_4_4_4_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "4.4.4.4/32 present");
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+
+    lb = load_balance_get(dpo->dpoi_index);
+    FIB_TEST((lb->lb_n_buckets == 4), "4.4.4.4/32 lb over %d paths", lb->lb_n_buckets);
+
+    fib_table_entry_delete(fib_index,
+                          &pfx_4_4_4_4_s_32,
+                          FIB_SOURCE_API);
+    FIB_TEST(FIB_NODE_INDEX_INVALID ==
+            fib_table_lookup_exact_match(fib_index, &pfx_4_4_4_4_s_32),
+            "4.4.4.4/32 removed");
+    vec_free(r_paths);
+
+    /*
+     * add-remove test. no change.
+     */
+    FIB_TEST((1  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+5 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+7 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * A deag route; it resolves via a lookup DPO in the path's FIB table
+     */
+    fib_table_entry_path_add(fib_index,
+                            &pfx_4_4_4_4_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &zero_addr,
+                            ~0,
+                            fib_index,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_4_4_4_4_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "4.4.4.4/32 present");
+
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    dpo = load_balance_get_bucket(dpo->dpoi_index, 0);
+    lookup_dpo_t *lkd = lookup_dpo_get(dpo->dpoi_index);
+
+    FIB_TEST((fib_index == lkd->lkd_fib_index),
+            "4.4.4.4/32 is deag in %d %U",
+             lkd->lkd_fib_index,
+             format_dpo_id, dpo, 0);
+
+    fib_table_entry_delete(fib_index,
+                          &pfx_4_4_4_4_s_32,
+                          FIB_SOURCE_API);
+    FIB_TEST(FIB_NODE_INDEX_INVALID ==
+            fib_table_lookup_exact_match(fib_index, &pfx_4_4_4_4_s_32),
+            "4.4.4.4/32 removed");
+    vec_free(r_paths);
+
+    /*
+     * add-remove test. no change.
+     */
+    FIB_TEST((1  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+5 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+7 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * CLEANUP
+     *   remove: 1.1.1.2/32, 1.1.2.0/24 and 1.1.1.1/32
+     *           all of which are via 10.10.10.1, Itf1
+     */
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_1_1_1_2_s_32,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_10_10_10_1,
+                               tm->hw[0]->sw_if_index,
+                               ~0,
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_1_1_1_1_s_32,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_10_10_10_1,
+                               tm->hw[0]->sw_if_index,
+                               ~0,
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_1_1_2_0_s_24,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_10_10_10_1,
+                               tm->hw[0]->sw_if_index,
+                               ~0,
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST(FIB_NODE_INDEX_INVALID ==
+            fib_table_lookup_exact_match(fib_index, &pfx_1_1_1_1_s_32),
+            "1.1.1.1/32 removed");
+    FIB_TEST(FIB_NODE_INDEX_INVALID ==
+            fib_table_lookup_exact_match(fib_index, &pfx_1_1_1_2_s_32),
+            "1.1.1.2/32 removed");
+    FIB_TEST(FIB_NODE_INDEX_INVALID ==
+            fib_table_lookup_exact_match(fib_index, &pfx_1_1_2_0_s_24),
+            "1.1.2.0/24 removed");
+
+    /*
+     * -3 entries and -1 shared path-list
+     */
+    FIB_TEST((0  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+4 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+4 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * An attached-host route. Expect to link to the incomplete adj
+     */
+    fib_prefix_t pfx_4_1_1_1_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 4.1.1.1/32 */
+           .ip4.as_u32 = clib_host_to_net_u32(0x04010101),
+       },
+    };
+    fib_table_entry_path_add(fib_index,
+                            &pfx_4_1_1_1_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &zero_addr,
+                            tm->hw[0]->sw_if_index,
+                            fib_index,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_4_1_1_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "4.1.1.1/32 present");
+    ai = fib_entry_get_adj(fei);
+
+    ai2 = adj_nbr_add_or_lock(FIB_PROTOCOL_IP4,
+                             FIB_LINK_IP4,
+                             &pfx_4_1_1_1_s_32.fp_addr,
+                             tm->hw[0]->sw_if_index);
+    FIB_TEST((ai == ai2), "Attached-host link to incomplete ADJ");
+    adj_unlock(ai2);
+
+    /*
+     * +1 entry and +1 shared path-list
+     */
+    FIB_TEST((1  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+5 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+5 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    fib_table_entry_delete(fib_index,
+                          &pfx_4_1_1_1_s_32,
+                          FIB_SOURCE_API);
+
+    FIB_TEST((0  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+4 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+4 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * add a v6 prefix via v4 next-hops
+     */
+    fib_prefix_t pfx_2001_s_64 = {
+       .fp_len = 64,
+       .fp_proto = FIB_PROTOCOL_IP6,
+       .fp_addr = {
+           .ip6.as_u64[0] = clib_host_to_net_u64(0x2001000000000000),
+       },
+    };
+    fei = fib_table_entry_path_add(0, //default v6 table
+                                  &pfx_2001_s_64,
+                                  FIB_SOURCE_API,
+                                  FIB_ENTRY_FLAG_NONE,
+                                  FIB_PROTOCOL_IP4,
+                                  &nh_10_10_10_1,
+                                  tm->hw[0]->sw_if_index,
+                                  fib_index,
+                                  1,
+                                  MPLS_LABEL_INVALID,
+                                  FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup_exact_match(0, &pfx_2001_s_64);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "2001::/64 present");
+    ai = fib_entry_get_adj(fei);
+    adj = adj_get(ai);
+    FIB_TEST((adj->lookup_next_index == IP_LOOKUP_NEXT_ARP),
+            "2001::/64 via ARP-adj");
+    FIB_TEST((adj->ia_link == FIB_LINK_IP6),
+            "2001::/64 is link type v6");
+    FIB_TEST((adj->ia_nh_proto == FIB_PROTOCOL_IP4),
+            "2001::/64 ADJ-adj is NH proto v4");
+    fib_table_entry_delete(0, &pfx_2001_s_64, FIB_SOURCE_API);
+
+
+    /*
+     * CLEANUP
+     *    remove adj-fibs: 10.10.10.1/32 and 10.10.10.2/32
+     */
+    fib_table_entry_delete(fib_index,
+                          &pfx_10_10_10_1_s_32,
+                          FIB_SOURCE_ADJ);
+    fib_table_entry_delete(fib_index,
+                          &pfx_10_10_10_2_s_32,
+                          FIB_SOURCE_ADJ);
+    FIB_TEST(FIB_NODE_INDEX_INVALID ==
+            fib_table_lookup_exact_match(fib_index, &pfx_10_10_10_1_s_32),
+            "10.10.10.1/32 adj-fib removed");
+    FIB_TEST(FIB_NODE_INDEX_INVALID ==
+            fib_table_lookup_exact_match(fib_index, &pfx_10_10_10_2_s_32),
+            "10.10.10.2/32 adj-fib removed");
+
+    /*
+     * -2 entries and -2 non-shared path-list
+     */
+    FIB_TEST((0  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR+2 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR+2 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * unlock the 2 adjacencies for which this test provided a rewrite.
+     * These are the last locks on these adjs. they should thus go away.
+     */
+    adj_unlock(ai_02);
+    adj_unlock(ai_01);
+
+    FIB_TEST((0 == adj_nbr_db_size()), "ADJ DB size is %d",
+            adj_nbr_db_size());
+    
+    /*
+     * CLEANUP
+     *   remove the interface prefixes
+     */
+    local_pfx.fp_len = 32;
+    fib_table_entry_special_remove(fib_index, &local_pfx,
+                                  FIB_SOURCE_INTERFACE);
+    fei = fib_table_lookup(fib_index, &local_pfx);
+
+    FIB_TEST(FIB_NODE_INDEX_INVALID ==
+            fib_table_lookup_exact_match(fib_index, &local_pfx),
+            "10.10.10.10/32 adj-fib removed");
+
+    local_pfx.fp_len = 24;
+    fib_table_entry_delete(fib_index, &local_pfx,
+                          FIB_SOURCE_INTERFACE);
+
+    FIB_TEST(FIB_NODE_INDEX_INVALID ==
+            fib_table_lookup_exact_match(fib_index, &local_pfx),
+            "10.10.10.10/24 adj-fib removed");
+
+    /*
+     * -2 entries and -2 non-shared path-list
+     */
+    FIB_TEST((0  == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * Last but not least, remove the VRF
+     */
+    FIB_TEST((0 == fib_table_get_num_entries(fib_index,
+                                             FIB_PROTOCOL_IP4,
+                                             FIB_SOURCE_API)),
+             "NO API Source'd prefixes");
+    FIB_TEST((0 == fib_table_get_num_entries(fib_index,
+                                             FIB_PROTOCOL_IP4,
+                                             FIB_SOURCE_RR)),
+             "NO RR Source'd prefixes");
+    FIB_TEST((0 == fib_table_get_num_entries(fib_index,
+                                             FIB_PROTOCOL_IP4,
+                                             FIB_SOURCE_INTERFACE)),
+             "NO INterface Source'd prefixes");
+
+    fib_table_unlock(fib_index, FIB_PROTOCOL_IP4);
+
+    FIB_TEST((0  == fib_path_list_db_size()), "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NBR-5 == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NBR-5 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    return;
+}
+
+static void
+fib_test_v6 (void)
+{
+    /*
+     * In the default table check for the presence and correct forwarding
+     * of the special entries
+     */
+    fib_node_index_t dfrt, fei, ai, locked_ai, ai_01, ai_02;
+    const dpo_id_t *dpo, *dpo_drop;
+    const ip_adjacency_t *adj;
+    const receive_dpo_t *rd;
+    test_main_t *tm;
+    u32 fib_index;
+    int ii;
+
+    FIB_TEST((0 == adj_nbr_db_size()), "ADJ DB size is %d",
+            adj_nbr_db_size());
+
+    /* via 2001:0:0:1::2 */
+    ip46_address_t nh_2001_2 = {
+       .ip6 = {
+           .as_u64 = {
+               [0] = clib_host_to_net_u64(0x2001000000000001),
+               [1] = clib_host_to_net_u64(0x0000000000000002),
+           },
+       },
+    };
+
+    tm = &test_main;
+
+    dpo_drop = drop_dpo_get(DPO_PROTO_IP6);
+
+    /* Find or create FIB table 11 */
+    fib_index = fib_table_find_or_create_and_lock(FIB_PROTOCOL_IP6, 11);
+
+    for (ii = 0; ii < 4; ii++)
+    {
+       ip6_main.fib_index_by_sw_if_index[tm->hw[ii]->sw_if_index] = fib_index;
+    }
+
+    fib_prefix_t pfx_0_0 = {
+       .fp_len = 0,
+       .fp_proto = FIB_PROTOCOL_IP6,
+       .fp_addr = {
+           .ip6 = {
+               {0, 0},
+           },
+       },
+    };
+
+    dfrt = fib_table_lookup(fib_index, &pfx_0_0);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != dfrt), "default route present");
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(dfrt)),
+            "Default route is DROP");
+
+    dpo = fib_entry_contribute_ip_forwarding(dfrt);
+    FIB_TEST((dpo->dpoi_index == ip6_fib_table_fwding_lookup(
+                                    &ip6_main,
+                                    1,
+                                    &pfx_0_0.fp_addr.ip6)),
+            "default-route; fwd and non-fwd tables match");
+
+    // FIXME - check specials.
+
+    /*
+     * At this stage there is one v4 FIB with 5 routes and two v6 FIBs
+     * each with 6 entries. All entries are special so no path-list sharing.
+     */
+#define NPS (5+6+6)
+    FIB_TEST((0 == fib_path_list_db_size()),   "path list DB is empty");
+    FIB_TEST((NPS == fib_path_list_pool_size()), "path list pool size is %d",
+            fib_path_list_pool_size());
+    FIB_TEST((NPS == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * add interface routes.
+     *  validate presence of /64 attached and /128 receive.
+     *  test for the presence of the receive address in the glean and local adj
+     *
+     * receive on 2001:0:0:1::1/128
+     */
+    fib_prefix_t local_pfx = {
+       .fp_len = 64,
+       .fp_proto = FIB_PROTOCOL_IP6,
+       .fp_addr = {
+           .ip6 = {
+               .as_u64 = {
+                   [0] = clib_host_to_net_u64(0x2001000000000001),
+                   [1] = clib_host_to_net_u64(0x0000000000000001),
+               },
+           },
+       }
+    };
+
+    fib_table_entry_update_one_path(fib_index, &local_pfx,
+                                   FIB_SOURCE_INTERFACE,
+                                   (FIB_ENTRY_FLAG_CONNECTED |
+                                    FIB_ENTRY_FLAG_ATTACHED),
+                                   FIB_PROTOCOL_IP6,
+                                   NULL,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0,
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(fib_index, &local_pfx);
+
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "attached interface route present");
+
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != ai), "attached interface route adj present");
+    adj = adj_get(ai);
+    FIB_TEST((IP_LOOKUP_NEXT_GLEAN == adj->lookup_next_index),
+            "attached interface adj is glean");
+    FIB_TEST((0 == ip46_address_cmp(&local_pfx.fp_addr,
+                                   &adj->sub_type.glean.receive_addr)),
+             "attached interface adj is receive ok");
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST((dpo->dpoi_index == ip6_fib_table_fwding_lookup(
+                                    &ip6_main,
+                                    1,
+                                    &local_pfx.fp_addr.ip6)),
+            "attached-route; fwd and non-fwd tables match");
+
+    local_pfx.fp_len = 128;
+    fib_table_entry_update_one_path(fib_index, &local_pfx,
+                                   FIB_SOURCE_INTERFACE,
+                                   (FIB_ENTRY_FLAG_CONNECTED |
+                                    FIB_ENTRY_FLAG_LOCAL),
+                                   FIB_PROTOCOL_IP6,
+                                   NULL,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup(fib_index, &local_pfx);
+
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "local interface route present");
+
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    dpo = load_balance_get_bucket(dpo->dpoi_index, 0);
+    FIB_TEST((DPO_RECEIVE == dpo->dpoi_type),
+            "local interface adj is local");
+    rd = receive_dpo_get(dpo->dpoi_index);
+
+    FIB_TEST((0 == ip46_address_cmp(&local_pfx.fp_addr,
+                                   &rd->rd_addr)),
+             "local interface adj is receive ok");
+
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST((dpo->dpoi_index == ip6_fib_table_fwding_lookup(
+                                    &ip6_main,
+                                    1,
+                                    &local_pfx.fp_addr.ip6)),
+            "local-route; fwd and non-fwd tables match");
+
+    /*
+     * +2 entries. +2 unshared path-lists
+     */
+    FIB_TEST((0 == fib_path_list_db_size()),   "path list DB is empty");
+    FIB_TEST((NPS+2 == fib_path_list_pool_size()), "path list pool size is%d",
+            fib_path_list_pool_size());
+    FIB_TEST((NPS+2 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * Modify the default route to be via an adj not yet known.
+     * this sources the default route with the API source, which has
+     * a higher preference than the DEFAULT_ROUTE source
+     */
+    fib_table_entry_path_add(fib_index, &pfx_0_0,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP6,
+                            &nh_2001_2,
+                            tm->hw[0]->sw_if_index,
+                            ~0,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup(fib_index, &pfx_0_0);
+
+    FIB_TEST((fei == dfrt), "default route same index");
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != ai), "default route adj present");
+    adj = adj_get(ai);
+    FIB_TEST((IP_LOOKUP_NEXT_ARP == adj->lookup_next_index),
+            "adj is incomplete");
+    FIB_TEST((0 == ip46_address_cmp(&nh_2001_2, &adj->sub_type.nbr.next_hop)),
+             "adj nbr next-hop ok");
+
+    /*
+     * find the adj in the shared db
+     */
+    locked_ai = adj_nbr_add_or_lock(FIB_PROTOCOL_IP6,
+                                   FIB_LINK_IP6,
+                                   &nh_2001_2,
+                                   tm->hw[0]->sw_if_index);
+    FIB_TEST((locked_ai == ai), "ADJ NBR DB find");
+    adj_unlock(locked_ai);
+
+    /*
+     * no more entries. +1 shared path-list
+     */
+    FIB_TEST((1 == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NPS+3 == fib_path_list_pool_size()), "path list pool size is%d",
+            fib_path_list_pool_size());
+    FIB_TEST((NPS+2 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * remove the API source from the default route. We expect
+     * the route to remain, sourced by DEFAULT_ROUTE, and hence a DROP
+     */
+    fib_table_entry_path_remove(fib_index, &pfx_0_0,
+                               FIB_SOURCE_API, 
+                               FIB_PROTOCOL_IP6,
+                               &nh_2001_2,
+                               tm->hw[0]->sw_if_index,
+                               ~0,
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup(fib_index, &pfx_0_0);
+
+    FIB_TEST((fei == dfrt), "default route same index");
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(dfrt)),
+            "Default route is DROP");
+
+    /*
+     * no more entries. -1 shared path-list
+     */
+    FIB_TEST((0 == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NPS+2 == fib_path_list_pool_size()), "path list pool size is%d",
+            fib_path_list_pool_size());
+    FIB_TEST((NPS+2 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * Add 2 ARP entries => 2 complete ADJs plus their adj-fibs.
+     */
+    fib_prefix_t pfx_2001_1_2_s_128 = {
+       .fp_len   = 128,
+       .fp_proto = FIB_PROTOCOL_IP6,
+       .fp_addr  = {
+           .ip6 = {
+               .as_u64 = {
+                   [0] = clib_host_to_net_u64(0x2001000000000001),
+                   [1] = clib_host_to_net_u64(0x0000000000000002),
+               },
+           },
+       }
+    };
+    fib_prefix_t pfx_2001_1_3_s_128 = {
+       .fp_len   = 128,
+       .fp_proto = FIB_PROTOCOL_IP6,
+       .fp_addr  = {
+           .ip6 = {
+               .as_u64 = {
+                   [0] = clib_host_to_net_u64(0x2001000000000001),
+                   [1] = clib_host_to_net_u64(0x0000000000000003),
+               },
+           },
+       }
+    };
+    u8 eth_addr[] = {
+       0xde, 0xde, 0xde, 0xba, 0xba, 0xba,
+    };
+
+    ai_01 = adj_nbr_add_or_lock(FIB_PROTOCOL_IP6,
+                               FIB_LINK_IP6,
+                               &pfx_2001_1_2_s_128.fp_addr,
+                               tm->hw[0]->sw_if_index);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != ai_01), "adj created");
+    adj = adj_get(ai_01);
+    FIB_TEST((IP_LOOKUP_NEXT_ARP == adj->lookup_next_index),
+            "adj is incomplete");
+    FIB_TEST((0 == ip46_address_cmp(&pfx_2001_1_2_s_128.fp_addr,
+                                   &adj->sub_type.nbr.next_hop)),
+             "adj nbr next-hop ok");
+
+    adj_nbr_update_rewrite(ai_01, eth_addr);
+    FIB_TEST((IP_LOOKUP_NEXT_REWRITE == adj->lookup_next_index),
+            "adj is complete");
+    FIB_TEST((0 == ip46_address_cmp(&pfx_2001_1_2_s_128.fp_addr,
+                                   &adj->sub_type.nbr.next_hop)),
+             "adj nbr next-hop ok");
+
+    fib_table_entry_update_one_path(fib_index,
+                                   &pfx_2001_1_2_s_128,
+                                   FIB_SOURCE_ADJ,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP6,
+                                   &pfx_2001_1_2_s_128.fp_addr,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0,
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup(fib_index, &pfx_2001_1_2_s_128);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_01 == ai), "ADJ-FIB resolves via adj");
+
+    eth_addr[5] = 0xb2;
+
+    ai_02 = adj_nbr_add_or_lock(FIB_PROTOCOL_IP6,
+                               FIB_LINK_IP6,
+                               &pfx_2001_1_3_s_128.fp_addr,
+                               tm->hw[0]->sw_if_index);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != ai_02), "adj created");
+    adj = adj_get(ai_02);
+    FIB_TEST((IP_LOOKUP_NEXT_ARP == adj->lookup_next_index),
+            "adj is incomplete");
+    FIB_TEST((0 == ip46_address_cmp(&pfx_2001_1_3_s_128.fp_addr,
+                                   &adj->sub_type.nbr.next_hop)),
+             "adj nbr next-hop ok");
+
+    adj_nbr_update_rewrite(ai_02, eth_addr);
+    FIB_TEST((IP_LOOKUP_NEXT_REWRITE == adj->lookup_next_index),
+            "adj is complete");
+    FIB_TEST((0 == ip46_address_cmp(&pfx_2001_1_3_s_128.fp_addr,
+                                   &adj->sub_type.nbr.next_hop)),
+             "adj nbr next-hop ok");
+    FIB_TEST((ai_01 != ai_02), "ADJs are different");
+
+    fib_table_entry_update_one_path(fib_index,
+                                   &pfx_2001_1_3_s_128,
+                                   FIB_SOURCE_ADJ,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP6,
+                                   &pfx_2001_1_3_s_128.fp_addr,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0,
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup(fib_index, &pfx_2001_1_3_s_128);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_02 == ai), "ADJ-FIB resolves via adj");
+
+    /*
+     * +2 entries, +2 unshared path-lists.
+     */
+    FIB_TEST((0 == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NPS+4 == fib_path_list_pool_size()), "path list pool size is%d",
+            fib_path_list_pool_size());
+    FIB_TEST((NPS+4 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * Add 2 routes via the first ADJ. ensure path-list sharing
+     */
+    fib_prefix_t pfx_2001_a_s_64 = {
+       .fp_len   = 64,
+       .fp_proto = FIB_PROTOCOL_IP6,
+       .fp_addr  = {
+           .ip6 = {
+               .as_u64 = {
+                   [0] = clib_host_to_net_u64(0x200100000000000a),
+                   [1] = clib_host_to_net_u64(0x0000000000000000),
+               },
+           },
+       }
+    };
+    fib_prefix_t pfx_2001_b_s_64 = {
+       .fp_len   = 64,
+       .fp_proto = FIB_PROTOCOL_IP6,
+       .fp_addr  = {
+           .ip6 = {
+               .as_u64 = {
+                   [0] = clib_host_to_net_u64(0x200100000000000b),
+                   [1] = clib_host_to_net_u64(0x0000000000000000),
+               },
+           },
+       }
+    };
+
+    fib_table_entry_path_add(fib_index,
+                            &pfx_2001_a_s_64,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP6,
+                            &nh_2001_2,
+                            tm->hw[0]->sw_if_index,
+                            ~0,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup(fib_index, &pfx_2001_a_s_64);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_01 == ai), "2001::a/64 resolves via 2001:0:0:1::1");
+    fib_table_entry_path_add(fib_index,
+                            &pfx_2001_b_s_64,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP6,
+                            &nh_2001_2,
+                            tm->hw[0]->sw_if_index,
+                            ~0,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup(fib_index, &pfx_2001_b_s_64);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_01 == ai), "2001::b/64 resolves via 2001:0:0:1::1");
+
+    /*
+     * +2 entries, +1 shared path-list.
+     */
+    FIB_TEST((1 == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NPS+5 == fib_path_list_pool_size()), "path list pool size is%d",
+            fib_path_list_pool_size());
+    FIB_TEST((NPS+6 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * add a v4 prefix via a v6 next-hop
+     */
+    fib_prefix_t pfx_1_1_1_1_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4.as_u32 = 0x01010101,
+       },
+    };
+    fei = fib_table_entry_path_add(0, // default table
+                                  &pfx_1_1_1_1_s_32,
+                                  FIB_SOURCE_API,
+                                  FIB_ENTRY_FLAG_NONE,
+                                  FIB_PROTOCOL_IP6,
+                                  &nh_2001_2,
+                                  tm->hw[0]->sw_if_index,
+                                  ~0,
+                                  1,
+                                  MPLS_LABEL_INVALID,
+                                  FIB_ROUTE_PATH_FLAG_NONE);
+    FIB_TEST(fei == fib_table_lookup_exact_match(0, &pfx_1_1_1_1_s_32),
+            "1.1.1.1/32 o v6 route present");
+    ai = fib_entry_get_adj(fei);
+    adj = adj_get(ai);
+    FIB_TEST((adj->lookup_next_index == IP_LOOKUP_NEXT_ARP),
+            "1.1.1.1/32 via ARP-adj");
+    FIB_TEST((adj->ia_link == FIB_LINK_IP4),
+            "1.1.1.1/32 ADJ-adj is link type v4");
+    FIB_TEST((adj->ia_nh_proto == FIB_PROTOCOL_IP6),
+            "1.1.1.1/32 ADJ-adj is NH proto v6");
+    fib_table_entry_delete(0, &pfx_1_1_1_1_s_32, FIB_SOURCE_API);
+
+    /*
+     * An attached route
+     */
+    fib_prefix_t pfx_2001_c_s_64 = {
+       .fp_len   = 64,
+       .fp_proto = FIB_PROTOCOL_IP6,
+       .fp_addr  = {
+           .ip6 = {
+               .as_u64 = {
+                   [0] = clib_host_to_net_u64(0x200100000000000c),
+                   [1] = clib_host_to_net_u64(0x0000000000000000),
+               },
+           },
+       }
+    };
+    fib_table_entry_path_add(fib_index,
+                            &pfx_2001_c_s_64,
+                            FIB_SOURCE_CLI,
+                            FIB_ENTRY_FLAG_ATTACHED,
+                            FIB_PROTOCOL_IP6,
+                            NULL,
+                            tm->hw[0]->sw_if_index,
+                            ~0,
+                            1,
+                            MPLS_LABEL_INVALID,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_c_s_64);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "attached route present");
+    ai = fib_entry_get_adj(fei);
+    adj = adj_get(ai);
+    FIB_TEST((adj->lookup_next_index == IP_LOOKUP_NEXT_GLEAN),
+            "2001:0:0:c/64 attached resolves via glean");
+
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_2001_c_s_64,
+                               FIB_SOURCE_CLI,
+                               FIB_PROTOCOL_IP6,
+                               NULL,
+                               tm->hw[0]->sw_if_index,
+                               ~0,
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_c_s_64);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "attached route removed");
+
+    /*
+     * Shutdown the interface on which we have a connected and through
+     * which the routes are reachable.
+     * This will result in the connected, adj-fibs, and routes linking to drop
+     * The local/for-us prefix continues to receive.
+     */
+    clib_error_t * error;
+
+    error = vnet_sw_interface_set_flags(vnet_get_main(),
+                                       tm->hw[0]->sw_if_index,
+                                       ~VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+    FIB_TEST((NULL == error), "Interface shutdown OK");
+
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_b_s_64);
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(!dpo_cmp(dpo_drop, load_balance_get_bucket(dpo->dpoi_index, 0)),
+            "2001::b/64 resolves via drop");
+
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_a_s_64);
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(!dpo_cmp(dpo_drop, load_balance_get_bucket(dpo->dpoi_index, 0)),
+            "2001::a/64 resolves via drop");
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_1_3_s_128);
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(!dpo_cmp(dpo_drop, load_balance_get_bucket(dpo->dpoi_index, 0)),
+            "2001:0:0:1::3/64 resolves via drop");
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_1_2_s_128);
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(!dpo_cmp(dpo_drop, load_balance_get_bucket(dpo->dpoi_index, 0)),
+            "2001:0:0:1::2/64 resolves via drop");
+    fei = fib_table_lookup_exact_match(fib_index, &local_pfx);
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(dpo_cmp(dpo_drop, load_balance_get_bucket(dpo->dpoi_index, 0)),
+            "2001:0:0:1::1/128 not drop");
+    local_pfx.fp_len = 64;
+    fei = fib_table_lookup_exact_match(fib_index, &local_pfx);
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(!dpo_cmp(dpo_drop, load_balance_get_bucket(dpo->dpoi_index, 0)),
+            "2001:0:0:1/64 resolves via drop");
+
+    /*
+     * no change
+     */
+    FIB_TEST((1 == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NPS+5 == fib_path_list_pool_size()), "path list pool size is%d",
+            fib_path_list_pool_size());
+    FIB_TEST((NPS+6 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * shutdown one of the other interfaces, then add a connected.
+     * and swap one of the routes to it.
+     */
+    error = vnet_sw_interface_set_flags(vnet_get_main(),
+                                       tm->hw[1]->sw_if_index,
+                                       ~VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+    FIB_TEST((NULL == error), "Interface 1 shutdown OK");
+
+    fib_prefix_t connected_pfx = {
+       .fp_len = 64,
+       .fp_proto = FIB_PROTOCOL_IP6,
+       .fp_addr = {
+           .ip6 = {
+               /* 2001:0:0:2::1/64 */
+               .as_u64 = {
+                   [0] = clib_host_to_net_u64(0x2001000000000002),
+                   [1] = clib_host_to_net_u64(0x0000000000000001),
+               },
+           },
+       }
+    };
+    fib_table_entry_update_one_path(fib_index, &connected_pfx,
+                                   FIB_SOURCE_INTERFACE,
+                                   (FIB_ENTRY_FLAG_CONNECTED |
+                                    FIB_ENTRY_FLAG_ATTACHED),
+                                   FIB_PROTOCOL_IP6,
+                                   NULL,
+                                   tm->hw[1]->sw_if_index,
+                                   ~0,
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(fib_index, &connected_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "attached interface route present");
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    dpo = load_balance_get_bucket(dpo->dpoi_index, 0);
+    FIB_TEST(!dpo_cmp(dpo, dpo_drop),
+             "2001:0:0:2/64 not resolves via drop");
+
+    connected_pfx.fp_len = 128;
+    fib_table_entry_update_one_path(fib_index, &connected_pfx,
+                                   FIB_SOURCE_INTERFACE,
+                                   (FIB_ENTRY_FLAG_CONNECTED |
+                                    FIB_ENTRY_FLAG_LOCAL),
+                                   FIB_PROTOCOL_IP6,
+                                   NULL,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup(fib_index, &connected_pfx);
+
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "local interface route present");
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    dpo = load_balance_get_bucket(dpo->dpoi_index, 0);
+    FIB_TEST((DPO_RECEIVE == dpo->dpoi_type),
+            "local interface adj is local");
+    rd = receive_dpo_get(dpo->dpoi_index);
+    FIB_TEST((0 == ip46_address_cmp(&connected_pfx.fp_addr,
+                                   &rd->rd_addr)),
+             "local interface adj is receive ok");
+
+    /*
+     * +2 entries, +2 unshared path-lists
+     */
+    FIB_TEST((1 == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NPS+7 == fib_path_list_pool_size()), "path list pool size is%d",
+            fib_path_list_pool_size());
+    FIB_TEST((NPS+8 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+
+    /*
+     * bring the interface back up. we expect the routes to return
+     * to normal forwarding.
+     */
+    error = vnet_sw_interface_set_flags(vnet_get_main(),
+                                       tm->hw[0]->sw_if_index,
+                                       VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+    FIB_TEST((NULL == error), "Interface bring-up OK");
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_a_s_64);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_01 == ai), "2001::a/64 resolves via 2001:0:0:1::1");
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_b_s_64);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_01 == ai), "2001::b/64 resolves via 2001:0:0:1::1");
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_1_3_s_128);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_02 == ai), "ADJ-FIB resolves via adj");
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_1_2_s_128);
+    ai = fib_entry_get_adj(fei);
+    FIB_TEST((ai_01 == ai), "ADJ-FIB resolves via adj");
+    local_pfx.fp_len = 64;
+    fei = fib_table_lookup_exact_match(fib_index, &local_pfx);
+    ai = fib_entry_get_adj(fei);
+    adj = adj_get(ai);
+    FIB_TEST((IP_LOOKUP_NEXT_GLEAN == adj->lookup_next_index),
+            "attached interface adj is glean");
+
+    /*
+     * Delete the interface that the routes resolve through.
+     * Again no routes are removed. They all point to drop.
+     *
+     * This is considered an error case. The control plane should
+     * not remove interfaces through which routes resolve, but
+     * such things can happen. ALL affected routes will drop.
+     */
+    vnet_delete_hw_interface(vnet_get_main(), tm->hw_if_indicies[0]);
+
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_b_s_64);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+             "2001::b/64 resolves via drop");
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_a_s_64);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+             "2001::b/64 resolves via drop");
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_1_3_s_128);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+             "2001:0:0:1::3/64 resolves via drop");
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_1_2_s_128);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+             "2001:0:0:1::2/64 resolves via drop");
+    fei = fib_table_lookup_exact_match(fib_index, &local_pfx);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+             "2001:0:0:1::1/128 is drop");
+    local_pfx.fp_len = 64;
+    fei = fib_table_lookup_exact_match(fib_index, &local_pfx);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+             "2001:0:0:1/64 resolves via drop");
+
+    /*
+     * no change
+     */
+    FIB_TEST((1 == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NPS+7 == fib_path_list_pool_size()), "path list pool size is%d",
+            fib_path_list_pool_size());
+    FIB_TEST((NPS+8 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * Add the interface back. routes stay unresolved.
+     */
+    error = ethernet_register_interface(vnet_get_main(),
+                                       test_interface_device_class.index,
+                                       0 /* instance */,
+                                       hw_address,
+                                       &tm->hw_if_indicies[0],
+                                       /* flag change */ 0);
+
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_b_s_64);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+             "2001::b/64 resolves via drop");
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_a_s_64);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+             "2001::b/64 resolves via drop");
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_1_3_s_128);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+             "2001:0:0:1::3/64 resolves via drop");
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_2001_1_2_s_128);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+             "2001:0:0:1::2/64 resolves via drop");
+    fei = fib_table_lookup_exact_match(fib_index, &local_pfx);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+             "2001:0:0:1::1/128 is drop");
+    local_pfx.fp_len = 64;
+    fei = fib_table_lookup_exact_match(fib_index, &local_pfx);
+    FIB_TEST(load_balance_is_drop(fib_entry_contribute_ip_forwarding(fei)),
+             "2001:0:0:1/64 resolves via drop");
+
+    /*
+     * CLEANUP ALL the routes
+     */
+    fib_table_entry_delete(fib_index,
+                          &pfx_2001_c_s_64,
+                          FIB_SOURCE_API);
+    fib_table_entry_delete(fib_index,
+                          &pfx_2001_a_s_64,
+                          FIB_SOURCE_API);
+    fib_table_entry_delete(fib_index,
+                          &pfx_2001_b_s_64,
+                          FIB_SOURCE_API);
+    fib_table_entry_delete(fib_index,
+                          &pfx_2001_1_3_s_128,
+                          FIB_SOURCE_ADJ);
+    fib_table_entry_delete(fib_index,
+                          &pfx_2001_1_2_s_128,
+                          FIB_SOURCE_ADJ);
+    local_pfx.fp_len = 64;
+    fib_table_entry_delete(fib_index, &local_pfx,
+                          FIB_SOURCE_INTERFACE);
+    local_pfx.fp_len = 128;
+    fib_table_entry_special_remove(fib_index, &local_pfx,
+                                  FIB_SOURCE_INTERFACE);
+    connected_pfx.fp_len = 64;
+    fib_table_entry_delete(fib_index, &connected_pfx,
+                          FIB_SOURCE_INTERFACE);
+    connected_pfx.fp_len = 128;
+    fib_table_entry_special_remove(fib_index, &connected_pfx,
+                                  FIB_SOURCE_INTERFACE);
+
+    FIB_TEST((FIB_NODE_INDEX_INVALID ==
+             fib_table_lookup_exact_match(fib_index, &pfx_2001_a_s_64)),
+            "2001::a/64 removed");
+    FIB_TEST((FIB_NODE_INDEX_INVALID ==
+             fib_table_lookup_exact_match(fib_index, &pfx_2001_b_s_64)),
+            "2001::b/64 removed");
+    FIB_TEST((FIB_NODE_INDEX_INVALID ==
+             fib_table_lookup_exact_match(fib_index, &pfx_2001_1_3_s_128)),
+            "2001:0:0:1::3/128 removed");
+    FIB_TEST((FIB_NODE_INDEX_INVALID ==
+             fib_table_lookup_exact_match(fib_index, &pfx_2001_1_2_s_128)),
+            "2001:0:0:1::3/128 removed");
+    local_pfx.fp_len = 64;
+    FIB_TEST((FIB_NODE_INDEX_INVALID ==
+             fib_table_lookup_exact_match(fib_index, &local_pfx)),
+            "2001:0:0:1/64 removed");
+    local_pfx.fp_len = 128;
+    FIB_TEST((FIB_NODE_INDEX_INVALID ==
+             fib_table_lookup_exact_match(fib_index, &local_pfx)),
+            "2001:0:0:1::1/128 removed");
+    connected_pfx.fp_len = 64;
+    FIB_TEST((FIB_NODE_INDEX_INVALID ==
+             fib_table_lookup_exact_match(fib_index, &connected_pfx)),
+            "2001:0:0:2/64 removed");
+    connected_pfx.fp_len = 128;
+    FIB_TEST((FIB_NODE_INDEX_INVALID ==
+             fib_table_lookup_exact_match(fib_index, &connected_pfx)),
+            "2001:0:0:2::1/128 removed");
+
+    /*
+     * -8 entries. -7 path-lists (1 was shared).
+     */
+    FIB_TEST((0 == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NPS == fib_path_list_pool_size()), "path list pool size is%d",
+            fib_path_list_pool_size());
+    FIB_TEST((NPS == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    /*
+     * now remove the VRF
+     */
+    fib_table_unlock(fib_index, FIB_PROTOCOL_IP6);
+
+    FIB_TEST((0 == fib_path_list_db_size()),   "path list DB population:%d",
+            fib_path_list_db_size());
+    FIB_TEST((NPS-6 == fib_path_list_pool_size()), "path list pool size is%d",
+            fib_path_list_pool_size());
+    FIB_TEST((NPS-6 == fib_entry_pool_size()), "entry pool size is %d",
+            fib_entry_pool_size());
+
+    adj_unlock(ai_02);
+    adj_unlock(ai_01);
+
+    /*
+     * return the interfaces to up state
+     */
+    error = vnet_sw_interface_set_flags(vnet_get_main(),
+                                       tm->hw[0]->sw_if_index,
+                                       VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+    error = vnet_sw_interface_set_flags(vnet_get_main(),
+                                       tm->hw[1]->sw_if_index,
+                                       VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+
+    FIB_TEST((0 == adj_nbr_db_size()), "ADJ DB size is %d",
+            adj_nbr_db_size());
+}
+
+/*
+ * Test the recursive route handling for GRE tunnels
+ */
+static void
+fib_test_gre (void)
+{
+    /* fib_node_index_t fei; */
+    /* u32 fib_index = 0; */
+    /* test_main_t *tm; */
+    /* u32 ii; */
+
+    /* tm = &test_main; */
+
+    /* for (ii = 0; ii < 4; ii++) */
+    /* { */
+    /*         ip4_main.fib_index_by_sw_if_index[tm->hw[ii]->sw_if_index] = 0; */
+    /* } */
+
+    /* /\* */
+    /*  * add interface routes. We'll assume this works. It's more rigorously */
+    /*  * tested elsewhere. */
+    /*  *\/ */
+    /* fib_prefix_t local_pfx = { */
+    /*         .fp_len = 24, */
+    /*         .fp_proto = FIB_PROTOCOL_IP4, */
+    /*         .fp_addr = { */
+    /*             .ip4 = { */
+    /*                 /\* 10.10.10.10 *\/ */
+    /*                 .as_u32 = clib_host_to_net_u32(0x0a0a0a0a), */
+    /*             }, */
+    /*         }, */
+    /* }; */
+
+    /* fib_table_entry_update_one_path(fib_index, &local_pfx, */
+    /*                                     FIB_SOURCE_INTERFACE, */
+    /*                                     (FIB_ENTRY_FLAG_CONNECTED | */
+    /*                                      FIB_ENTRY_FLAG_ATTACHED), */
+    /*                                     NULL, */
+    /*                                     tm->hw[0]->sw_if_index, */
+    /*                                     ~0, */
+    /*                                     1, */
+    /*                                     FIB_ROUTE_PATH_FLAG_NONE); */
+    /* fei = fib_table_lookup_exact_match(fib_index, &local_pfx); */
+    /* FIB_TEST((FIB_NODE_INDEX_INVALID != fei), */
+    /*              "attached interface route present"); */
+
+    /* local_pfx.fp_len = 32; */
+    /* fib_table_entry_update_one_path(fib_index, &local_pfx, */
+    /*                                     FIB_SOURCE_INTERFACE, */
+    /*                                     (FIB_ENTRY_FLAG_CONNECTED | */
+    /*                                      FIB_ENTRY_FLAG_LOCAL), */
+    /*                                     NULL, */
+    /*                                     tm->hw[0]->sw_if_index, */
+    /*                                     ~0, // invalid fib index */
+    /*                                     1, */
+    /*                                     FIB_ROUTE_PATH_FLAG_NONE); */
+    /* fei = fib_table_lookup_exact_match(fib_index, &local_pfx); */
+
+    /* FIB_TEST((FIB_NODE_INDEX_INVALID != fei), */
+    /*              "local interface route present"); */
+
+    /* fib_prefix_t local2_pfx = { */
+    /*         .fp_len = 24, */
+    /*         .fp_proto = FIB_PROTOCOL_IP4, */
+    /*         .fp_addr = { */
+    /*             .ip4 = { */
+    /*                 /\* 10.10.11.11 *\/ */
+    /*                 .as_u32 = clib_host_to_net_u32(0x0a0a0b0b), */
+    /*             }, */
+    /*         }, */
+    /* }; */
+
+    /* fib_table_entry_update_one_path(fib_index, &local2_pfx, */
+    /*                                     FIB_SOURCE_INTERFACE, */
+    /*                                     (FIB_ENTRY_FLAG_CONNECTED | */
+    /*                                      FIB_ENTRY_FLAG_ATTACHED), */
+    /*                                     NULL, */
+    /*                                     tm->hw[1]->sw_if_index, */
+    /*                                     ~0, */
+    /*                                     1, */
+    /*                                     FIB_ROUTE_PATH_FLAG_NONE); */
+    /* fei = fib_table_lookup_exact_match(fib_index, &local2_pfx); */
+    /* FIB_TEST((FIB_NODE_INDEX_INVALID != fei), */
+    /*              "attached interface route present"); */
+
+    /* local2_pfx.fp_len = 32; */
+    /* fib_table_entry_update_one_path(fib_index, &local2_pfx, */
+    /*                                     FIB_SOURCE_INTERFACE, */
+    /*                                     (FIB_ENTRY_FLAG_CONNECTED | */
+    /*                                      FIB_ENTRY_FLAG_LOCAL), */
+    /*                                     NULL, */
+    /*                                     tm->hw[0]->sw_if_index, */
+    /*                                     ~0, // invalid fib index */
+    /*                                     1, */
+    /*                                     FIB_ROUTE_PATH_FLAG_NONE); */
+    /* fei = fib_table_lookup_exact_match(fib_index, &local2_pfx); */
+
+    /* FIB_TEST((FIB_NODE_INDEX_INVALID != fei), */
+    /*              "local interface route present"); */
+
+    /* /\* */
+    /*  * Add the route that will be used to resolve the tunnel's destination */
+    /*  *\/ */
+    /* fib_prefix_t route_pfx = { */
+    /*         .fp_len = 24, */
+    /*         .fp_proto = FIB_PROTOCOL_IP4, */
+    /*         .fp_addr = { */
+    /*             .ip4 = { */
+    /*                 /\* 1.1.1.0/24 *\/ */
+    /*                 .as_u32 = clib_host_to_net_u32(0x01010100), */
+    /*             }, */
+    /*         }, */
+    /* }; */
+    /* /\* 10.10.10.2 *\/ */
+    /* ip46_address_t nh_10_10_10_2 = { */
+    /*         .ip4.as_u32 = clib_host_to_net_u32(0x0a0a0a02), */
+    /* }; */
+
+    /* fib_table_entry_path_add(fib_index, &route_pfx, */
+    /*                              FIB_SOURCE_API, */
+    /*                              FIB_ENTRY_FLAG_NONE, */
+    /*                              &nh_10_10_10_2, */
+    /*                              tm->hw[0]->sw_if_index, */
+    /*                              ~0, */
+    /*                              1, */
+    /*                              FIB_ROUTE_PATH_FLAG_NONE); */
+    /* FIB_TEST((FIB_NODE_INDEX_INVALID != */
+    /*               fib_table_lookup_exact_match(fib_index, &local_pfx)), */
+    /*              "route present"); */
+
+    /* /\* */
+    /*  * Add a tunnel */
+    /*  *\/ */
+    /* /\* 1.1.1.1 *\/ */
+    /* fib_prefix_t tun_dst_pfx = { */
+    /*         .fp_len = 32, */
+    /*         .fp_proto = FIB_PROTOCOL_IP4, */
+    /*         .fp_addr = { */
+    /*             .ip4.as_u32 = clib_host_to_net_u32(0x01010101), */
+    /*         }, */
+    /* }; */
+    /* /\* 10.10.10.10 *\/ */
+    /* ip4_address_t tun_src = { */
+    /*         .as_u32 = clib_host_to_net_u32(0x0a0a0a0a), */
+    /* }; */
+    /* /\* 172.16.0.1 *\/ */
+    /* ip4_address_t tun_itf = { */
+    /*         .as_u32 = clib_host_to_net_u32(0xac100001), */
+    /* }; */
+    /* fib_prefix_t tun_itf_pfx = { */
+    /*         .fp_len = 30, */
+    /*         .fp_proto = FIB_PROTOCOL_IP4, */
+    /*         .fp_addr = { */
+    /*             .ip4 = tun_itf, */
+    /*         }, */
+    /* }; */
+    /* u32 *encap_labels = NULL; */
+    /* u32 label = 0xbaba; */
+    /* u32 encap_index; */
+    /* u32 tunnel_sw_if_index; */
+    
+    /* int rv; */
+
+    /* /\* */
+    /*  * First we need the MPLS Encap present */
+    /*  * */
+    /*  * Pretty sure this is broken. the wiki says the 1st parameter address */
+    /*  * should be the tunnel's interface address, which makes some sense. But */
+    /*  * the code for tunnel creation checks for the tunnel's destination */
+    /*  * address. curious... */
+    /*  *\/ */
+    /* vec_add1(encap_labels, label);  */
+    /* rv =  vnet_mpls_add_del_encap(&tun_dst_pfx.fp_addr.ip4,  */
+    /*                                   0, // inner VRF */
+    /*                                   encap_labels, */
+    /*                                   ~0, // policy_tunnel_index, */
+    /*                                   0, // no_dst_hash, */
+    /*                                   &encap_index, */
+    /*                                   1); // ADD */
+    /* FIB_TEST((0 == rv), "MPLS encap created"); */
+
+    /* /\* */
+    /*  * now create the tunnel */
+    /*  *\/ */
+    /* rv = vnet_mpls_gre_add_del_tunnel(&tun_src, */
+    /*                                       &tun_dst_pfx.fp_addr.ip4, */
+    /*                                       &tun_itf_pfx.fp_addr.ip4, */
+    /*                                       tun_itf_pfx.fp_len, */
+    /*                                       0, // inner VRF */
+    /*                                       0, // outer VRF */
+    /*                                       &tunnel_sw_if_index, */
+    /*                                   0, // l2 only */
+    /*                                       1);  // ADD */
+    /* FIB_TEST((0 == rv), "Tunnel created"); */
+
+    /* /\* */
+    /*  * add it again. just for giggles. */
+    /*  *\/ */
+    /* rv = vnet_mpls_gre_add_del_tunnel(&tun_src, */
+    /*                                       &tun_dst_pfx.fp_addr.ip4, */
+    /*                                       &tun_itf_pfx.fp_addr.ip4, */
+    /*                                       tun_itf_pfx.fp_len, */
+    /*                                       0, // inner VRF */
+    /*                                       0, // outer VRF */
+    /*                                       &tunnel_sw_if_index, */
+    /*                                   0, // l2 only */
+    /*                                       1);  // ADD */
+    /* FIB_TEST((0 != rv), "Duplicate Tunnel not created"); */
+
+    /* /\* */
+    /*  * Find the route added for the tunnel subnet and check that */
+    /*  * it has a midchain adj that is stacked on the adj used to reach the */
+    /*  * tunnel destination */
+    /*  *\/ */
+    /* ip_adjacency_t *midchain_adj, *route_adj, *adjfib_adj; */
+    /* adj_index_t midchain_ai, route_ai, adjfib_ai1, adjfib_ai2; */
+    /* ip_lookup_main_t *lm; */
+
+    /* lm = &ip4_main.lookup_main; */
+
+    /* fei = fib_table_lookup_exact_match(fib_index, &tun_itf_pfx); */
+    /* FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "tun itf route present"); */
+    /* midchain_ai  = fib_entry_contribute_forwarding(fei); */
+    /* midchain_adj = adj_get(midchain_ai); */
+
+    /* FIB_TEST((IP_LOOKUP_NEXT_MIDCHAIN == midchain_adj->lookup_next_index), */
+    /*              "Tunnel interface links to midchain"); */
+
+    /* fei = fib_table_lookup_exact_match(fib_index, &route_pfx); */
+    /* route_ai = fib_entry_contribute_forwarding(fei); */
+    /* FIB_TEST((midchain_adj->sub_type.midchain.adj_index == route_ai), */
+    /*              "tunnel midchain it stacked on route adj"); */
+
+    /* /\* */
+    /*  * update the route to the tunnel's destination to load-balance via */
+    /*  * interface 1. */
+    /*  *\/ */
+    /* /\* 10.10.11.2 *\/ */
+    /* ip46_address_t nh_10_10_11_2 = {        */
+    /*         .ip4.as_u32 = clib_host_to_net_u32(0x0a0a0b02), */
+    /* }; */
+
+    /* fib_table_entry_path_add(fib_index, &route_pfx, */
+    /*                              FIB_SOURCE_API, */
+    /*                              FIB_ENTRY_FLAG_NONE, */
+    /*                              &nh_10_10_11_2, */
+    /*                              tm->hw[1]->sw_if_index, */
+    /*                              ~0, */
+    /*                              1, */
+    /*                              FIB_ROUTE_PATH_FLAG_NONE); */
+
+    /* /\* */
+    /*  * the tunnels midchain should have re-stacked. This tests that the */
+    /*  * route re-resolution backwalk works to a tunnel interface. */
+    /*  *\/ */
+    /* fei = fib_table_lookup_exact_match(fib_index, &route_pfx); */
+    /* FIB_TEST((route_ai != fib_entry_contribute_forwarding(fei)), "route changed"); */
+    /* route_ai = fib_entry_contribute_forwarding(fei); */
+
+    /* midchain_adj = adj_get(midchain_ai); */
+
+    /* FIB_TEST((midchain_adj->sub_type.midchain.adj_index == route_ai), */
+    /*              "tunnel midchain has re-stacked on route adj"); */
+
+    /* route_adj = adj_get(route_ai); */
+
+    /* FIB_TEST((2 == route_adj->n_adj), "Route adj is multipath"); */
+
+    /* /\* */
+    /*  * At this stage both neighbour adjs are incomplete, so the same should */
+    /*  * be true of the multipath adj */
+    /*  *\/ */
+    /* FIB_TEST((IP_LOOKUP_NEXT_ARP == route_adj->lookup_next_index), */
+    /*              "Adj0 is ARP: %d", route_adj->lookup_next_index); */
+    /* FIB_TEST((IP_LOOKUP_NEXT_ARP == (route_adj+1)->lookup_next_index), */
+    /*              "Adj1 is ARP"); */
+
+    /* /\* */
+    /*  * do the equivalent of creating an ARP entry for 10.10.10.2. */
+    /*  *  This will complete the adj, and this */
+    /*  * change should be reflected in the multipath too. */
+    /*  *\/ */
+    /* u8* rewrite = NULL, byte = 0xd; */
+    /* vec_add(rewrite, &byte, 6); */
+
+    /* adjfib_ai1 = adj_nbr_add_or_lock(FIB_PROTOCOL_IP4, */
+    /*                                     FIB_LINK_IP4, */
+    /*                                     &nh_10_10_10_2, */
+    /*                                     tm->hw[0]->sw_if_index); */
+    /* adj_nbr_update_rewrite(FIB_PROTOCOL_IP4, */
+    /*                            adjfib_ai1, */
+    /*                            rewrite); */
+    /* adjfib_adj = adj_get(adjfib_ai1); */
+    /* FIB_TEST((IP_LOOKUP_NEXT_REWRITE == adjfib_adj->lookup_next_index), */
+    /*              "Adj-fib10 adj is rewrite"); */
+
+    /* adjfib_ai2 = adj_nbr_add_or_lock(FIB_PROTOCOL_IP4, */
+    /*                                     FIB_LINK_IP4, */
+    /*                                     &nh_10_10_11_2, */
+    /*                                     tm->hw[1]->sw_if_index); */
+    /* adj_nbr_update_rewrite(FIB_PROTOCOL_IP4, */
+    /*                            adjfib_ai2, */
+    /*                            rewrite); */
+
+    /* adjfib_adj = adj_get(adjfib_ai2); */
+
+    /* FIB_TEST((IP_LOOKUP_NEXT_REWRITE == adjfib_adj->lookup_next_index), */
+    /*              "Adj-fib11 adj is rewrite"); */
+
+    /* fei = fib_table_lookup_exact_match(fib_index, &route_pfx); */
+    /* FIB_TEST((route_ai != fib_entry_contribute_forwarding(fei)), "route changed"); */
+    /* route_ai = fib_entry_contribute_forwarding(fei); */
+    /* route_adj = adj_get(route_ai); */
+    /* FIB_TEST((IP_LOOKUP_NEXT_REWRITE == route_adj->lookup_next_index), */
+    /*              "Adj0 is rewrite"); */
+    /* FIB_TEST((IP_LOOKUP_NEXT_REWRITE == (route_adj+1)->lookup_next_index), */
+    /*              "Adj1 is rewrite"); */
+
+    /* /\* */
+    /*  * CLEANUP */
+    /*  *\/ */
+    /* adj_index_t drop_ai = adj_get_special(FIB_PROTOCOL_IP4, */
+    /*                                           ADJ_SPECIAL_TYPE_DROP); */
+
+    /* /\* */
+    /*  * remove the route that the tunnel resolves via. expect */
+    /*  * it to now resolve via the default route, which is drop */
+    /*  *\/ */
+    /* fib_table_entry_path_remove(fib_index, &route_pfx, */
+    /*                                 FIB_SOURCE_API, */
+    /*                                 &nh_10_10_10_2, */
+    /*                                 tm->hw[0]->sw_if_index, */
+    /*                                 ~0, */
+    /*                                 1, */
+    /*                                 FIB_ROUTE_PATH_FLAG_NONE); */
+    /* fib_table_entry_path_remove(fib_index, &route_pfx, */
+    /*                                 FIB_SOURCE_API, */
+    /*                                 &nh_10_10_11_2, */
+    /*                                 tm->hw[1]->sw_if_index, */
+    /*                                 ~0, */
+    /*                                 1, */
+    /*                                 FIB_ROUTE_PATH_FLAG_NONE); */
+    /* FIB_TEST((FIB_NODE_INDEX_INVALID != */
+    /*               fib_table_lookup_exact_match(fib_index, &local_pfx)), */
+    /*              "route present"); */
+    /* midchain_adj = adj_get(midchain_ai); */
+    /* FIB_TEST((midchain_adj->sub_type.midchain.adj_index == drop_ai), */
+    /*              "tunnel midchain has re-stacked on drop"); */
+
+    /* /\* */
+    /*  * remove the tunnel and its MPLS encaps */
+    /*  *\/ */
+    /* rv = vnet_mpls_gre_add_del_tunnel(&tun_src, */
+    /*                                       &tun_dst_pfx.fp_addr.ip4, */
+    /*                                       &tun_itf_pfx.fp_addr.ip4, */
+    /*                                       tun_itf_pfx.fp_len, */
+    /*                                       0, // inner VRF */
+    /*                                       0, // outer VRF */
+    /*                                       &tunnel_sw_if_index, */
+    /*                                   0, // l2 only */
+    /*                                       0);  // DEL */
+    /* FIB_TEST((0 == rv), "Tunnel removed"); */
+    /* rv = vnet_mpls_gre_add_del_tunnel(&tun_src, */
+    /*                                       &tun_dst_pfx.fp_addr.ip4, */
+    /*                                       &tun_itf_pfx.fp_addr.ip4, */
+    /*                                       tun_itf_pfx.fp_len, */
+    /*                                       0, // inner VRF */
+    /*                                       0, // outer VRF */
+    /*                                       &tunnel_sw_if_index, */
+    /*                                   0, // l2 only */
+    /*                                       0);  // DEL */
+    /* FIB_TEST((0 != rv), "No existant Tunnel not removed"); */
+
+    /* rv =  vnet_mpls_add_del_encap(&tun_dst_pfx.fp_addr.ip4,  */
+    /*                                   0, // inner VRF */
+    /*                                   encap_labels, */
+    /*                                   ~0, // policy_tunnel_index, */
+    /*                                   0, // no_dst_hash, */
+    /*                                   NULL, */
+    /*                                   0); // ADD */
+    /* FIB_TEST((0 == rv), "MPLS encap deleted"); */
+
+    /* vec_free(encap_labels); */
+
+    /* /\* */
+    /*  * no more FIB entries expected */
+    /*  *\/ */
+    /* fei = fib_table_lookup_exact_match(fib_index, &tun_itf_pfx); */
+    /* FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "tun itf route removed"); */
+    /* fei = fib_table_lookup_exact_match(fib_index, &tun_dst_pfx); */
+    /* FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "tun dst route removed"); */
+
+    /* /\* */
+    /*  * CLEANUP the connecteds */
+    /*  *\/ */
+    /* local2_pfx.fp_len = 24; */
+    /* fib_table_entry_delete(fib_index, &local2_pfx, */
+    /*                            FIB_SOURCE_INTERFACE); */
+    /* fei = fib_table_lookup_exact_match(fib_index, &local2_pfx); */
+    /* FIB_TEST((FIB_NODE_INDEX_INVALID == fei), */
+    /*              "attached interface route remove"); */
+
+    /* local2_pfx.fp_len = 32; */
+    /* fib_table_entry_special_remove(fib_index, &local2_pfx, */
+    /*                                    FIB_SOURCE_INTERFACE); */
+    /* fei = fib_table_lookup_exact_match(fib_index, &local2_pfx); */
+    /* FIB_TEST((FIB_NODE_INDEX_INVALID == fei), */
+    /*              "local interface route removed"); */
+    /* local_pfx.fp_len = 24; */
+    /* fib_table_entry_delete(fib_index, &local_pfx, */
+    /*                                 FIB_SOURCE_INTERFACE); */
+    /* fei = fib_table_lookup_exact_match(fib_index, &local_pfx); */
+    /* FIB_TEST((FIB_NODE_INDEX_INVALID == fei), */
+    /*              "attached interface route remove"); */
+
+    /* local_pfx.fp_len = 32; */
+    /* fib_table_entry_special_remove(fib_index, &local_pfx, */
+    /*                                    FIB_SOURCE_INTERFACE); */
+    /* fei = fib_table_lookup_exact_match(fib_index, &local_pfx); */
+    /* FIB_TEST((FIB_NODE_INDEX_INVALID == fei), */
+    /*              "local interface route removed"); */
+}
+
+/*
+ * Test Attached Exports
+ *
+ * The export table is fib_index 0. Interface (attached + local) routes and
+ * adj-fibs are added there, then the same /24 is added as an attached route
+ * in two further tables (table IDs 11 and 12). The test checks that the
+ * adj-fibs and the local /32 appear in, and disappear from, those import
+ * tables as the attached routes on either side are added, modified and
+ * removed. On entry and exit the adjacency neighbour DB must be empty.
+ */
+static void
+fib_test_ae (void)
+{
+    const dpo_id_t *dpo, *dpo_drop;
+    const u32 fib_index = 0;
+    fib_node_index_t fei;
+    test_main_t *tm;
+    ip4_main_t *im;
+
+    tm = &test_main;
+    im = &ip4_main;
+
+    FIB_TEST((0 == adj_nbr_db_size()), "ADJ DB size is %d",
+            adj_nbr_db_size());
+
+    /*
+     * add interface routes. We'll assume this works. It's more rigorously
+     * tested elsewhere.
+     */
+    fib_prefix_t local_pfx = {
+       .fp_len = 24,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4 = {
+               /* 10.10.10.10 */
+               .as_u32 = clib_host_to_net_u32(0x0a0a0a0a),
+           },
+       },
+    };
+
+    /* bind the test interface to the export table */
+    vec_validate(im->fib_index_by_sw_if_index, tm->hw[0]->sw_if_index);
+    im->fib_index_by_sw_if_index[tm->hw[0]->sw_if_index] = fib_index;
+
+    /* reference drop DPO; used later to check entries do NOT resolve to drop */
+    dpo_drop = drop_dpo_get(DPO_PROTO_IP4);
+
+    fib_table_entry_update_one_path(fib_index, &local_pfx,
+                                   FIB_SOURCE_INTERFACE,
+                                   (FIB_ENTRY_FLAG_CONNECTED |
+                                    FIB_ENTRY_FLAG_ATTACHED),
+                                   FIB_PROTOCOL_IP4,
+                                   NULL,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0,
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(fib_index, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei),
+            "attached interface route present");
+
+    local_pfx.fp_len = 32;
+    fib_table_entry_update_one_path(fib_index, &local_pfx,
+                                   FIB_SOURCE_INTERFACE,
+                                   (FIB_ENTRY_FLAG_CONNECTED |
+                                    FIB_ENTRY_FLAG_LOCAL),
+                                   FIB_PROTOCOL_IP4,
+                                   NULL,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(fib_index, &local_pfx);
+
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei),
+            "local interface route present");
+
+    /*
+     * Add an ARP entry => a complete ADJ plus adj-fib.
+     */
+    fib_prefix_t pfx_10_10_10_1_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 10.10.10.1 */
+           .ip4.as_u32 = clib_host_to_net_u32(0x0a0a0a01),
+       },
+    };
+    fib_node_index_t ai;
+
+    fib_table_entry_update_one_path(fib_index,
+                                   &pfx_10_10_10_1_s_32,
+                                   FIB_SOURCE_ADJ,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP4,
+                                   &pfx_10_10_10_1_s_32.fp_addr,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup(fib_index, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib1 created");
+    ai = fib_entry_get_adj(fei);
+
+    /*
+     * create another FIB table into which routes will be imported
+     */
+    u32 import_fib_index1;
+
+    import_fib_index1 = fib_table_find_or_create_and_lock(FIB_PROTOCOL_IP4, 11);
+
+    /*
+     * Add an attached route in the import FIB
+     */
+    local_pfx.fp_len = 24;
+    fib_table_entry_update_one_path(import_fib_index1,
+                                   &local_pfx,
+                                   FIB_SOURCE_API,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP4,
+                                   NULL,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(import_fib_index1, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "attached export created");
+
+    /*
+     * check for the presence of the adj-fibs in the import table
+     */
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib1 imported");
+    FIB_TEST((ai == fib_entry_get_adj(fei)),
+            "adj-fib1 Import uses same adj as export");
+
+    /*
+     * check for the presence of the local in the import table
+     */
+    local_pfx.fp_len = 32;
+    fei = fib_table_lookup_exact_match(import_fib_index1, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "local imported");
+
+    /*
+     * Add another adj-fib in the export table. Expect this
+     * to get magically exported;
+     */
+    fib_prefix_t pfx_10_10_10_2_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 10.10.10.2 */
+           .ip4.as_u32 = clib_host_to_net_u32(0x0a0a0a02),
+       },
+    };
+
+    fib_table_entry_update_one_path(fib_index,
+                                   &pfx_10_10_10_2_s_32,
+                                   FIB_SOURCE_ADJ,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP4,
+                                   &pfx_10_10_10_2_s_32.fp_addr,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_10_10_10_2_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib2 present");
+    ai = fib_entry_get_adj(fei);
+
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_2_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib2 imported");
+    FIB_TEST((ai == fib_entry_get_adj(fei)),
+            "Import uses same adj as export");
+
+    /*
+     * create a 2nd FIB table into which routes will be imported
+     */
+    u32 import_fib_index2;
+
+    import_fib_index2 = fib_table_find_or_create_and_lock(FIB_PROTOCOL_IP4, 12);
+
+    /*
+     * Add an attached route in the 2nd import FIB
+     */
+    local_pfx.fp_len = 24;
+    fib_table_entry_update_one_path(import_fib_index2,
+                                   &local_pfx,
+                                   FIB_SOURCE_API,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP4,
+                                   NULL,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    /* verify in the table the route was just added to (the 2nd import FIB) */
+    fei = fib_table_lookup_exact_match(import_fib_index2, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "attached export created");
+
+    /*
+     * check for the presence of all the adj-fibs and local in the import table
+     */
+    fei = fib_table_lookup_exact_match(import_fib_index2, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib1 imported");
+    fei = fib_table_lookup_exact_match(import_fib_index2, &pfx_10_10_10_2_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib2 imported");
+    local_pfx.fp_len = 32;
+    fei = fib_table_lookup_exact_match(import_fib_index2, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "local imported");
+
+    /*
+     * add a 3rd adj-fib. expect it to be exported to both tables.
+     */
+    fib_prefix_t pfx_10_10_10_3_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 10.10.10.3 */
+           .ip4.as_u32 = clib_host_to_net_u32(0x0a0a0a03),
+       },
+    };
+
+    fib_table_entry_update_one_path(fib_index,
+                                   &pfx_10_10_10_3_s_32,
+                                   FIB_SOURCE_ADJ,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP4,
+                                   &pfx_10_10_10_3_s_32.fp_addr,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_10_10_10_3_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib3 present");
+    ai = fib_entry_get_adj(fei);
+
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_3_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib3 imported to FIB1");
+    FIB_TEST((ai == fib_entry_get_adj(fei)),
+            "Import uses same adj as export");
+    fei = fib_table_lookup_exact_match(import_fib_index2, &pfx_10_10_10_3_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib3 imported to FIB2");
+    FIB_TEST((ai == fib_entry_get_adj(fei)),
+            "Import uses same adj as export");
+
+    /*
+     * remove the 3rd adj fib. we expect it to be removed from both FIBs
+     */
+    fib_table_entry_delete(fib_index,
+                          &pfx_10_10_10_3_s_32,
+                          FIB_SOURCE_ADJ);
+
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_10_10_10_3_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "ADJ-fib3 remved");
+
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_3_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "ADJ-fib3 removed from FIB1");
+
+    fei = fib_table_lookup_exact_match(import_fib_index2, &pfx_10_10_10_3_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "ADJ-fib3 removed from FIB2");
+
+    /*
+     * remove the attached route from the 2nd FIB. expect the imported
+     * entries to be removed
+     */
+    local_pfx.fp_len = 24;
+    fib_table_entry_delete(import_fib_index2,
+                          &local_pfx,
+                          FIB_SOURCE_API);
+    fei = fib_table_lookup_exact_match(import_fib_index2, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "attached export removed");
+
+    fei = fib_table_lookup_exact_match(import_fib_index2, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "ADJ-fib1 removed from FIB2");
+    fei = fib_table_lookup_exact_match(import_fib_index2, &pfx_10_10_10_2_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "ADJ-fib2 removed from FIB2");
+    local_pfx.fp_len = 32;
+    fei = fib_table_lookup_exact_match(import_fib_index2, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "local removed from FIB2");
+
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib1 still in FIB1");
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_2_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib2 still in FIB1");
+    local_pfx.fp_len = 32;
+    fei = fib_table_lookup_exact_match(import_fib_index1, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "local still in FIB1");
+
+    /*
+     * modify the route in FIB1 so it is no longer attached. expect the imported
+     * entries to be removed
+     */
+    local_pfx.fp_len = 24;
+    fib_table_entry_update_one_path(import_fib_index1,
+                                   &local_pfx,
+                                   FIB_SOURCE_API,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP4,
+                                   &pfx_10_10_10_2_s_32.fp_addr,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "ADJ-fib1 removed from FIB1");
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_2_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "ADJ-fib2 removed from FIB1");
+    local_pfx.fp_len = 32;
+    fei = fib_table_lookup_exact_match(import_fib_index1, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "local removed from FIB1");
+
+    /*
+     * modify it back to attached. expect the adj-fibs back
+     */
+    local_pfx.fp_len = 24;
+    fib_table_entry_update_one_path(import_fib_index1,
+                                   &local_pfx,
+                                   FIB_SOURCE_API,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP4,
+                                   NULL,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib1 imported in FIB1");
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_2_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib2 imported in FIB1");
+    local_pfx.fp_len = 32;
+    fei = fib_table_lookup_exact_match(import_fib_index1, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "local imported in FIB1");
+
+    /*
+     * add a covering attached next-hop for the interface address, so we have
+     * a valid adj to find when we check the forwarding tables
+     */
+    fib_prefix_t pfx_10_0_0_0_s_8 = {
+       .fp_len = 8,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           /* 10.0.0.0 */
+           .ip4.as_u32 = clib_host_to_net_u32(0x0a000000),
+       },
+    };
+
+    fei = fib_table_entry_update_one_path(fib_index,
+                                          &pfx_10_0_0_0_s_8,
+                                          FIB_SOURCE_API,
+                                          FIB_ENTRY_FLAG_NONE,
+                                         FIB_PROTOCOL_IP4,
+                                          &pfx_10_10_10_3_s_32.fp_addr,
+                                          tm->hw[0]->sw_if_index,
+                                          ~0, // invalid fib index
+                                          1,
+                                          MPLS_LABEL_INVALID,
+                                          FIB_ROUTE_PATH_FLAG_NONE);
+    /* forwarding of the /8; the MTRIE lookups below are compared to this */
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+
+    /*
+     * remove the route in the export fib. expect the adj-fibs to be removed
+     */
+    local_pfx.fp_len = 24;
+    fib_table_entry_delete(fib_index,
+                          &local_pfx,
+                          FIB_SOURCE_INTERFACE);
+
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "Delete export: ADJ-fib1 removed from FIB1");
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_2_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "ADJ-fib2 removed from FIB1");
+    local_pfx.fp_len = 32;
+    fei = fib_table_lookup_exact_match(import_fib_index1, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "local removed from FIB1");
+
+    /*
+     * the adj-fibs in the export VRF are present in the FIB table,
+     * but not installed in forwarding, since they have no attached cover.
+     * Consequently a lookup in the MTRIE gives the adj for the covering
+     * route 10.0.0.0/8.
+     */
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib1 in export");
+
+    index_t lbi;
+    lbi = ip4_fib_forwarding_lookup(fib_index, &pfx_10_10_10_1_s_32.fp_addr.ip4);
+    FIB_TEST(lbi == dpo->dpoi_index,
+             "10.10.10.1 forwards on \n%U not \n%U",
+             format_load_balance, lbi, 0,
+             format_dpo_id, dpo, 0);
+    lbi = ip4_fib_forwarding_lookup(fib_index, &pfx_10_10_10_2_s_32.fp_addr.ip4);
+    FIB_TEST(lbi == dpo->dpoi_index,
+             "10.10.10.2 forwards on %U", format_dpo_id, dpo, 0);
+    lbi = ip4_fib_forwarding_lookup(fib_index, &pfx_10_10_10_3_s_32.fp_addr.ip4);
+    FIB_TEST(lbi == dpo->dpoi_index,
+             "10.10.10.3 forwards on %U", format_dpo_id, dpo, 0);
+
+    /*
+     * add the export prefix back, but not as attached.
+     * No adj-fibs in export nor import tables
+     */
+    local_pfx.fp_len = 24;
+    fei = fib_table_entry_update_one_path(fib_index,
+                                          &local_pfx,
+                                          FIB_SOURCE_API,
+                                          FIB_ENTRY_FLAG_NONE,
+                                         FIB_PROTOCOL_IP4,
+                                          &pfx_10_10_10_1_s_32.fp_addr,
+                                          tm->hw[0]->sw_if_index,
+                                          ~0, // invalid fib index
+                                          1,
+                                          MPLS_LABEL_INVALID,
+                                          FIB_ROUTE_PATH_FLAG_NONE);
+    /* forwarding of the non-attached /24; lookups below are compared to it */
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "non-attached in export: ADJ-fib1 in export");
+    lbi = ip4_fib_forwarding_lookup(fib_index, &pfx_10_10_10_1_s_32.fp_addr.ip4);
+    FIB_TEST(lbi == dpo->dpoi_index,
+             "10.10.10.1 forwards on %U", format_dpo_id, dpo, 0);
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib1 in export");
+    lbi = ip4_fib_forwarding_lookup(fib_index, &pfx_10_10_10_2_s_32.fp_addr.ip4);
+    FIB_TEST(lbi == dpo->dpoi_index,
+             "10.10.10.2 forwards on %U", format_dpo_id, dpo, 0);
+
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "ADJ-fib1 removed from FIB1");
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_2_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "ADJ-fib2 removed from FIB1");
+    local_pfx.fp_len = 32;
+    fei = fib_table_lookup_exact_match(import_fib_index1, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fei), "local removed from FIB1");
+
+    /*
+     * modify the export prefix so it is attached. expect all covereds to return
+     */
+    local_pfx.fp_len = 24;
+    fib_table_entry_update_one_path(fib_index,
+                                   &local_pfx,
+                                   FIB_SOURCE_API,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP4,
+                                   NULL,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib1 reinstalled in export");
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(dpo_cmp(dpo_drop, load_balance_get_bucket(dpo->dpoi_index, 0)),
+            "Adj-fib1 is not drop in export");
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_10_10_10_2_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib2 reinstalled in export");
+    local_pfx.fp_len = 32;
+    fei = fib_table_lookup_exact_match(fib_index, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "local reinstalled in export");
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "attached in export: ADJ-fib1 imported");
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(dpo_cmp(dpo_drop, load_balance_get_bucket(dpo->dpoi_index, 0)),
+            "Adj-fib1 is not drop in export");
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib1 imported");
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_2_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib2 imported");
+    local_pfx.fp_len = 32;
+    fei = fib_table_lookup_exact_match(import_fib_index1, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "local imported");
+
+    /*
+     * modify the export prefix so connected. no change.
+     */
+    local_pfx.fp_len = 24;
+    fib_table_entry_update_one_path(fib_index, &local_pfx,
+                                   FIB_SOURCE_INTERFACE,
+                                   (FIB_ENTRY_FLAG_CONNECTED |
+                                    FIB_ENTRY_FLAG_ATTACHED),
+                                   FIB_PROTOCOL_IP4,
+                                   NULL,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0,
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib1 reinstalled in export");
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(dpo_cmp(dpo_drop, load_balance_get_bucket(dpo->dpoi_index, 0)),
+            "Adj-fib1 is not drop in export");
+    fei = fib_table_lookup_exact_match(fib_index, &pfx_10_10_10_2_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib2 reinstalled in export");
+    local_pfx.fp_len = 32;
+    fei = fib_table_lookup_exact_match(fib_index, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "local reinstalled in export");
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "attached in export: ADJ-fib1 imported");
+    dpo = fib_entry_contribute_ip_forwarding(fei);
+    FIB_TEST(dpo_cmp(dpo_drop, load_balance_get_bucket(dpo->dpoi_index, 0)),
+            "Adj-fib1 is not drop in export");
+    fei = fib_table_lookup_exact_match(import_fib_index1, &pfx_10_10_10_2_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "ADJ-fib2 imported");
+    local_pfx.fp_len = 32;
+    fei = fib_table_lookup_exact_match(import_fib_index1, &local_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "local imported");
+
+    /*
+     * CLEANUP
+     */
+    fib_table_entry_delete(fib_index,
+                           &pfx_10_0_0_0_s_8,
+                           FIB_SOURCE_API);
+    fib_table_entry_delete(fib_index,
+                          &pfx_10_10_10_1_s_32,
+                          FIB_SOURCE_ADJ);
+    fib_table_entry_delete(fib_index,
+                          &pfx_10_10_10_2_s_32,
+                          FIB_SOURCE_ADJ);
+    local_pfx.fp_len = 32;
+    fib_table_entry_delete(fib_index,
+                          &local_pfx,
+                          FIB_SOURCE_INTERFACE);
+    /* the /24 in the export table was sourced by both API and INTERFACE */
+    local_pfx.fp_len = 24;
+    fib_table_entry_delete(fib_index,
+                          &local_pfx,
+                          FIB_SOURCE_API);
+    fib_table_entry_delete(fib_index,
+                          &local_pfx,
+                          FIB_SOURCE_INTERFACE);
+    local_pfx.fp_len = 24;
+    fib_table_entry_delete(import_fib_index1,
+                          &local_pfx,
+                          FIB_SOURCE_API);
+
+    fib_table_unlock(import_fib_index1, FIB_PROTOCOL_IP4);
+    fib_table_unlock(import_fib_index2, FIB_PROTOCOL_IP4);
+
+    FIB_TEST((0 == adj_nbr_db_size()), "ADJ DB size is %d",
+            adj_nbr_db_size());
+}
+
+/*
+ * The kind of object a load-balance bucket is expected to hold. Selects
+ * which member of the union in fib_test_lb_bucket_t is meaningful and which
+ * checks fib_test_validate_lb_v applies to the bucket.
+ */
+typedef enum fib_test_lb_bucket_type_t_ {
+    /* an MPLS label DPO whose child is an (incomplete) adjacency */
+    FT_LB_LABEL_O_ADJ,
+    /* an MPLS label DPO whose child is a load-balance */
+    FT_LB_LABEL_O_LB,
+    /* a load-balance directly in the bucket */
+    FT_LB_O_LB,
+    /* a drop DPO */
+    FT_LB_SPECIAL,
+    /* an adjacency, complete or incomplete */
+    FT_LB_ADJ,
+} fib_test_lb_bucket_type_t;
+
+/*
+ * The expected content of one load-balance bucket. Pointers to these are
+ * passed as the variadic arguments consumed by fib_test_validate_lb_v, one
+ * per bucket.
+ */
+typedef struct fib_test_lb_bucket_t_ {
+    /* discriminator: which union member below applies */
+    fib_test_lb_bucket_type_t type;
+
+    union
+    {
+       /* FT_LB_LABEL_O_ADJ: expected label, EOS bit and child adj index */
+       struct
+       {
+           mpls_eos_bit_t eos;
+           mpls_label_t label;
+           /* NOTE(review): ttl is not compared by fib_test_validate_lb_v */
+           u8 ttl;
+           adj_index_t adj;
+       } label_o_adj;
+       /* FT_LB_LABEL_O_LB: expected label, EOS bit and child LB index */
+       struct
+       {
+           mpls_eos_bit_t eos;
+           mpls_label_t label;
+           /* NOTE(review): ttl is not compared by fib_test_validate_lb_v */
+           u8 ttl;
+           index_t lb;
+       } label_o_lb;
+       /* FT_LB_ADJ: expected adjacency index */
+       struct
+       {
+           index_t adj;
+       } adj;
+       /* FT_LB_O_LB: expected load-balance index */
+       struct
+       {
+           index_t lb;
+       } lb;
+       /* FT_LB_SPECIAL: expected index of the drop DPO */
+       struct
+       {
+           index_t adj;
+       } special;
+    };
+} fib_test_lb_bucket_t;
+
+#define FIB_TEST_LB(_cond, _comment, _args...)                 \
+{                                                              \
+    if (!FIB_TEST_I(_cond, _comment, ##_args)) {               \
+       return (0);                                             \
+    }                                                          \
+}
+
+/*
+ * Compare every bucket of a load-balance against a list of expectations.
+ *
+ * lb        - the load-balance to inspect
+ * n_buckets - the expected number of buckets; also the number of
+ *             'fib_test_lb_bucket_t *' arguments read from ap
+ * ap        - one 'const fib_test_lb_bucket_t *' per bucket, in bucket order
+ *
+ * Returns non-zero when every check passes, 0 at the first failed check.
+ * Every check, including the DPO-type check of each bucket, is fatal to the
+ * validation.
+ */
+static int
+fib_test_validate_lb_v (const load_balance_t *lb,
+                       u16 n_buckets,
+                       va_list ap)
+{
+    const dpo_id_t *dpo;
+    int bucket;
+
+    FIB_TEST_LB((n_buckets == lb->lb_n_buckets), "n_buckets = %d", lb->lb_n_buckets);
+
+    for (bucket = 0; bucket < n_buckets; bucket++)
+    {
+       const fib_test_lb_bucket_t *exp;
+
+       exp = va_arg(ap, fib_test_lb_bucket_t*);
+       dpo = load_balance_get_bucket_i(lb, bucket);
+
+       switch (exp->type)
+       {
+       case FT_LB_LABEL_O_ADJ:
+           {
+               const mpls_label_dpo_t *mld;
+               mpls_label_t hdr;
+
+               FIB_TEST_LB((DPO_MPLS_LABEL == dpo->dpoi_type),
+                           "bucket %d stacks on %U",
+                           bucket,
+                           format_dpo_type, dpo->dpoi_type);
+
+               mld = mpls_label_dpo_get(dpo->dpoi_index);
+               /* the label DPO stores its header in network byte order */
+               hdr = clib_net_to_host_u32(mld->mld_hdr.label_exp_s_ttl);
+
+               FIB_TEST_LB((vnet_mpls_uc_get_label(hdr) ==
+                            exp->label_o_adj.label),
+                           "bucket %d stacks on label %d",
+                           bucket,
+                           exp->label_o_adj.label);
+
+               FIB_TEST_LB((vnet_mpls_uc_get_s(hdr) ==
+                            exp->label_o_adj.eos),
+                           "bucket %d stacks on label %d %U",
+                           bucket,
+                           exp->label_o_adj.label,
+                           format_mpls_eos_bit, exp->label_o_adj.eos);
+
+               FIB_TEST_LB((DPO_ADJACENCY_INCOMPLETE == mld->mld_dpo.dpoi_type),
+                           "bucket %d label stacks on %U",
+                           bucket,
+                           format_dpo_type, mld->mld_dpo.dpoi_type);
+
+               FIB_TEST_LB((exp->label_o_adj.adj == mld->mld_dpo.dpoi_index),
+                           "bucket %d label stacks on adj %d",
+                           bucket,
+                           exp->label_o_adj.adj);
+           }
+           break;
+       case FT_LB_LABEL_O_LB:
+           {
+               const mpls_label_dpo_t *mld;
+               mpls_label_t hdr;
+
+               FIB_TEST_LB((DPO_MPLS_LABEL == dpo->dpoi_type),
+                           "bucket %d stacks on %U",
+                           bucket,
+                           format_dpo_type, dpo->dpoi_type);
+
+               mld = mpls_label_dpo_get(dpo->dpoi_index);
+               /* the label DPO stores its header in network byte order */
+               hdr = clib_net_to_host_u32(mld->mld_hdr.label_exp_s_ttl);
+
+               FIB_TEST_LB((vnet_mpls_uc_get_label(hdr) ==
+                            exp->label_o_lb.label),
+                           "bucket %d stacks on label %d",
+                           bucket,
+                           exp->label_o_lb.label);
+
+               FIB_TEST_LB((vnet_mpls_uc_get_s(hdr) ==
+                            exp->label_o_lb.eos),
+                           "bucket %d stacks on label %d %U",
+                           bucket,
+                           exp->label_o_lb.label,
+                           format_mpls_eos_bit, exp->label_o_lb.eos);
+
+               FIB_TEST_LB((DPO_LOAD_BALANCE == mld->mld_dpo.dpoi_type),
+                           "bucket %d label stacks on %U",
+                           bucket,
+                           format_dpo_type, mld->mld_dpo.dpoi_type);
+
+               FIB_TEST_LB((exp->label_o_lb.lb == mld->mld_dpo.dpoi_index),
+                           "bucket %d label stacks on LB %d",
+                           bucket,
+                           exp->label_o_lb.lb);
+           }
+           break;
+       case FT_LB_ADJ:
+           /* a wrong DPO type fails the validation, as in the label cases */
+           FIB_TEST_LB(((DPO_ADJACENCY == dpo->dpoi_type) ||
+                        (DPO_ADJACENCY_INCOMPLETE == dpo->dpoi_type)),
+                       "bucket %d stacks on %U",
+                       bucket,
+                       format_dpo_type, dpo->dpoi_type);
+           FIB_TEST_LB((exp->adj.adj == dpo->dpoi_index),
+                       "bucket %d stacks on adj %d",
+                       bucket,
+                       exp->adj.adj);
+           break;
+       case FT_LB_O_LB:
+           FIB_TEST_LB((DPO_LOAD_BALANCE == dpo->dpoi_type),
+                       "bucket %d stacks on %U",
+                       bucket,
+                       format_dpo_type, dpo->dpoi_type);
+           FIB_TEST_LB((exp->lb.lb == dpo->dpoi_index),
+                       "bucket %d stacks on lb %d",
+                       bucket,
+                       exp->lb.lb);
+           break;
+       case FT_LB_SPECIAL:
+           FIB_TEST_LB((DPO_DROP == dpo->dpoi_type),
+                       "bucket %d stacks on %U",
+                       bucket,
+                       format_dpo_type, dpo->dpoi_type);
+           /* report the union member that was actually compared */
+           FIB_TEST_LB((exp->special.adj == dpo->dpoi_index),
+                       "bucket %d stacks on drop %d",
+                       bucket,
+                       exp->special.adj);
+           break;
+       }
+    }
+    return (!0);
+}
+
+/**
+ * Validate the forwarding a FIB entry contributes for one chain type.
+ *
+ * Requests the entry's forwarding of type @p fct, checks that the result
+ * is a load-balance DPO, and then passes that load-balance together with
+ * the variadic list of expected buckets to fib_test_validate_lb_v().
+ *
+ * @param fei       index of the FIB entry to check
+ * @param fct       forwarding chain type to request from the entry
+ * @param n_buckets number of expected buckets supplied in the variadic
+ *                  list. Declared as int (not u16): the last named
+ *                  parameter handed to va_start() must not have a type
+ *                  that is altered by the default argument promotions.
+ * @param ...       the expected buckets; callers in this file pass
+ *                  pointers to fib_test_lb_bucket_t
+ *
+ * @return the result of fib_test_validate_lb_v(), or 0 when the entry did
+ *         not contribute a load-balance.
+ */
+static int
+fib_test_validate_entry (fib_node_index_t fei,
+                        fib_forward_chain_type_t fct,
+                        int n_buckets,
+                        ...)
+{
+    dpo_id_t dpo = DPO_NULL;
+    va_list ap;
+    int res;
+
+    va_start(ap, n_buckets);
+
+    fib_entry_contribute_forwarding(fei, fct, &dpo);
+
+    /*
+     * Branch on the outcome of the check instead of using FIB_TEST_LB, so
+     * that the dpo_reset() and va_end() below run on the failure path too.
+     * NOTE(review): assumes FIB_TEST_I evaluates to the tested condition
+     * and that FIB_TEST_LB returns from the function on failure; both
+     * macros are defined outside this hunk - confirm.
+     */
+    if (FIB_TEST_I((DPO_LOAD_BALANCE == dpo.dpoi_type),
+                   "Entry links to %U",
+                   format_dpo_type, dpo.dpoi_type))
+    {
+        res = fib_test_validate_lb_v(load_balance_get(dpo.dpoi_index),
+                                     n_buckets,
+                                     ap);
+    }
+    else
+    {
+        res = 0;
+    }
+
+    /* release the reference taken by fib_entry_contribute_forwarding() */
+    dpo_reset(&dpo);
+    va_end(ap);
+
+    return (res);
+}
+
+/*
+ * Test the handling of MPLS labels on routes, including labelled recursive routes
+ */
+static void
+fib_test_label (void)
+{
+    fib_node_index_t fei, ai_mpls_10_10_10_1, ai_v4_10_10_11_1, ai_v4_10_10_11_2, ai_mpls_10_10_11_2, ai_mpls_10_10_11_1;
+    const u32 fib_index = 0;
+    test_main_t *tm;
+    ip4_main_t *im;
+    int lb_count;
+
+    lb_count = pool_elts(load_balance_pool);
+    tm = &test_main;
+    im = &ip4_main;
+
+    /*
+     * add interface routes. We'll assume this works. It's more rigorously
+     * tested elsewhere.
+     */
+    fib_prefix_t local0_pfx = {
+       .fp_len = 24,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4 = {
+               /* 10.10.10.10 */
+               .as_u32 = clib_host_to_net_u32(0x0a0a0a0a),
+           },
+       },
+    };
+
+    FIB_TEST((0 == adj_nbr_db_size()), "ADJ DB size is %d",
+            adj_nbr_db_size());
+
+    vec_validate(im->fib_index_by_sw_if_index, tm->hw[0]->sw_if_index);
+    im->fib_index_by_sw_if_index[tm->hw[0]->sw_if_index] = fib_index;
+
+    fib_table_entry_update_one_path(fib_index, &local0_pfx,
+                                   FIB_SOURCE_INTERFACE,
+                                   (FIB_ENTRY_FLAG_CONNECTED |
+                                    FIB_ENTRY_FLAG_ATTACHED),
+                                   FIB_PROTOCOL_IP4,
+                                   NULL,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0,
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(fib_index, &local0_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei),
+            "attached interface route present");
+
+    local0_pfx.fp_len = 32;
+    fib_table_entry_update_one_path(fib_index, &local0_pfx,
+                                   FIB_SOURCE_INTERFACE,
+                                   (FIB_ENTRY_FLAG_CONNECTED |
+                                    FIB_ENTRY_FLAG_LOCAL),
+                                   FIB_PROTOCOL_IP4,
+                                   NULL,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(fib_index, &local0_pfx);
+
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei),
+            "local interface route present");
+
+    fib_prefix_t local1_pfx = {
+       .fp_len = 24,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4 = {
+               /* 10.10.11.10 */
+               .as_u32 = clib_host_to_net_u32(0x0a0a0b0a),
+           },
+       },
+    };
+
+    vec_validate(im->fib_index_by_sw_if_index, tm->hw[1]->sw_if_index);
+    im->fib_index_by_sw_if_index[tm->hw[1]->sw_if_index] = fib_index;
+
+    fib_table_entry_update_one_path(fib_index, &local1_pfx,
+                                   FIB_SOURCE_INTERFACE,
+                                   (FIB_ENTRY_FLAG_CONNECTED |
+                                    FIB_ENTRY_FLAG_ATTACHED),
+                                   FIB_PROTOCOL_IP4,
+                                   NULL,
+                                   tm->hw[1]->sw_if_index,
+                                   ~0,
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(fib_index, &local1_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei),
+            "attached interface route present");
+
+    local1_pfx.fp_len = 32;
+    fib_table_entry_update_one_path(fib_index, &local1_pfx,
+                                   FIB_SOURCE_INTERFACE,
+                                   (FIB_ENTRY_FLAG_CONNECTED |
+                                    FIB_ENTRY_FLAG_LOCAL),
+                                   FIB_PROTOCOL_IP4,
+                                   NULL,
+                                   tm->hw[1]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+    fei = fib_table_lookup_exact_match(fib_index, &local1_pfx);
+
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei),
+            "local interface route present");
+
+    ip46_address_t nh_10_10_10_1 = {
+       .ip4 = {
+           .as_u32 = clib_host_to_net_u32(0x0a0a0a01),
+       },
+    };
+    ip46_address_t nh_10_10_11_1 = {
+       .ip4 = {
+           .as_u32 = clib_host_to_net_u32(0x0a0a0b01),
+       },
+    };
+    ip46_address_t nh_10_10_11_2 = {
+       .ip4 = {
+           .as_u32 = clib_host_to_net_u32(0x0a0a0b02),
+       },
+    };
+
+    ai_v4_10_10_11_1 = adj_nbr_add_or_lock(FIB_PROTOCOL_IP4,
+                                           FIB_LINK_IP4,
+                                           &nh_10_10_11_1,
+                                           tm->hw[1]->sw_if_index);
+    ai_v4_10_10_11_2 = adj_nbr_add_or_lock(FIB_PROTOCOL_IP4,
+                                           FIB_LINK_IP4,
+                                           &nh_10_10_11_2,
+                                           tm->hw[1]->sw_if_index);
+    ai_mpls_10_10_10_1 = adj_nbr_add_or_lock(FIB_PROTOCOL_IP4,
+                                             FIB_LINK_MPLS,
+                                             &nh_10_10_10_1,
+                                             tm->hw[0]->sw_if_index);
+    ai_mpls_10_10_11_2 = adj_nbr_add_or_lock(FIB_PROTOCOL_IP4,
+                                             FIB_LINK_MPLS,
+                                             &nh_10_10_11_2,
+                                             tm->hw[1]->sw_if_index);
+    ai_mpls_10_10_11_1 = adj_nbr_add_or_lock(FIB_PROTOCOL_IP4,
+                                             FIB_LINK_MPLS,
+                                             &nh_10_10_11_1,
+                                             tm->hw[1]->sw_if_index);
+
+    /*
+     * Add an entry with one path with a real out-going label
+     */
+    fib_prefix_t pfx_1_1_1_1_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4.as_u32 = clib_host_to_net_u32(0x01010101),
+       },
+    };
+    fib_test_lb_bucket_t l99_eos_o_10_10_10_1 = {
+       .type = FT_LB_LABEL_O_ADJ,
+       .label_o_adj = {
+           .adj = ai_mpls_10_10_10_1,
+           .label = 99,
+           .eos = MPLS_EOS,
+       },
+    };
+    fib_test_lb_bucket_t l99_neos_o_10_10_10_1 = {
+       .type = FT_LB_LABEL_O_ADJ,
+       .label_o_adj = {
+           .adj = ai_mpls_10_10_10_1,
+           .label = 99,
+           .eos = MPLS_NON_EOS,
+       },
+    };
+    fib_table_entry_update_one_path(fib_index,
+                                   &pfx_1_1_1_1_s_32,
+                                   FIB_SOURCE_API,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP4,
+                                   &nh_10_10_10_1,
+                                   tm->hw[0]->sw_if_index,
+                                   ~0, // invalid fib index
+                                   1,
+                                   99,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup(fib_index, &pfx_1_1_1_1_s_32);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != fei), "1.1.1.1/32 created");
+
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    1,
+                                    &l99_eos_o_10_10_10_1),
+            "1.1.1.1/32 LB 1 bucket via label 99 over 10.10.10.1");
+
+    /*
+     * add a path with an implicit NULL label
+     */
+    fib_test_lb_bucket_t a_o_10_10_11_1 = {
+       .type = FT_LB_ADJ,
+       .adj = {
+           .adj = ai_v4_10_10_11_1,
+       },
+    };
+    fib_test_lb_bucket_t a_mpls_o_10_10_11_1 = {
+       .type = FT_LB_ADJ,
+       .adj = {
+           .adj = ai_mpls_10_10_11_1,
+       },
+    };
+
+    fei = fib_table_entry_path_add(fib_index,
+                                  &pfx_1_1_1_1_s_32,
+                                  FIB_SOURCE_API,
+                                  FIB_ENTRY_FLAG_NONE,
+                                  FIB_PROTOCOL_IP4,
+                                  &nh_10_10_11_1,
+                                  tm->hw[1]->sw_if_index,
+                                  ~0, // invalid fib index
+                                  1,
+                                  MPLS_IETF_IMPLICIT_NULL_LABEL,
+                                  FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    2,
+                                    &l99_eos_o_10_10_10_1,
+                                    &a_o_10_10_11_1),
+            "1.1.1.1/32 LB 2 buckets via: "
+            "label 99 over 10.10.10.1, "
+            "adj over 10.10.11.1");
+
+    /*
+     * assign the route a local label
+     */
+    fib_table_entry_local_label_add(fib_index,
+                                   &pfx_1_1_1_1_s_32,
+                                   24001);
+
+    fib_prefix_t pfx_24001_eos = {
+       .fp_proto = FIB_PROTOCOL_MPLS,
+       .fp_label = 24001,
+       .fp_eos = MPLS_EOS,
+    };
+    fib_prefix_t pfx_24001_neos = {
+       .fp_proto = FIB_PROTOCOL_MPLS,
+       .fp_label = 24001,
+       .fp_eos = MPLS_NON_EOS,
+    };
+
+    /*
+     * The EOS entry should link to both the paths,
+     *  and use an ip adj for the imp-null
+     * The NON-EOS entry should link to both the paths,
+     *  and use an mpls adj for the imp-null
+     */
+    fei = fib_table_lookup(MPLS_FIB_DEFAULT_TABLE_ID,
+                          &pfx_24001_eos);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_MPLS_EOS,
+                                    2,
+                                    &l99_eos_o_10_10_10_1,
+                                    &a_o_10_10_11_1),
+            "24001/eos LB 2 buckets via: "
+            "label 99 over 10.10.10.1, "
+            "adj over 10.10.11.1");
+
+
+    fei = fib_table_lookup(MPLS_FIB_DEFAULT_TABLE_ID,
+                          &pfx_24001_neos);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS,
+                                    2,
+                                    &l99_neos_o_10_10_10_1,
+                                    &a_mpls_o_10_10_11_1),
+            "24001/neos LB 1 bucket via: "
+            "label 99 over 10.10.10.1 ",
+            "mpls-adj via 10.10.11.1");
+
+    /*
+     * add an unlabelled path, this is excluded from the neos chains,
+     */
+    fib_test_lb_bucket_t adj_o_10_10_11_2 = {
+       .type = FT_LB_ADJ,
+       .adj = {
+           .adj = ai_v4_10_10_11_2,
+       },
+    };
+
+    fei = fib_table_entry_path_add(fib_index,
+                                  &pfx_1_1_1_1_s_32,
+                                  FIB_SOURCE_API,
+                                  FIB_ENTRY_FLAG_NONE,
+                                  FIB_PROTOCOL_IP4,
+                                  &nh_10_10_11_2,
+                                  tm->hw[1]->sw_if_index,
+                                  ~0, // invalid fib index
+                                  1,
+                                  MPLS_LABEL_INVALID,
+                                  FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    16, // 3 choices spread over 16 buckets
+                                    &l99_eos_o_10_10_10_1,
+                                    &l99_eos_o_10_10_10_1,
+                                    &l99_eos_o_10_10_10_1,
+                                    &l99_eos_o_10_10_10_1,
+                                    &l99_eos_o_10_10_10_1,
+                                    &l99_eos_o_10_10_10_1,
+                                    &a_o_10_10_11_1,
+                                    &a_o_10_10_11_1,
+                                    &a_o_10_10_11_1,
+                                    &a_o_10_10_11_1,
+                                    &a_o_10_10_11_1,
+                                    &adj_o_10_10_11_2,
+                                    &adj_o_10_10_11_2,
+                                    &adj_o_10_10_11_2,
+                                    &adj_o_10_10_11_2,
+                                    &adj_o_10_10_11_2),
+            "1.1.1.1/32 LB 16 buckets via: "
+            "label 99 over 10.10.10.1, "
+            "adj over 10.10.11.1",
+            "adj over 10.10.11.2");
+
+    /*
+     * get and lock a reference to the non-eos of the via entry 1.1.1.1/32
+     */
+    dpo_id_t non_eos_1_1_1_1 = DPO_NULL;
+    fib_entry_contribute_forwarding(fei,
+                                   FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS,
+                                   &non_eos_1_1_1_1);
+
+    /*
+     * n-eos has only the 2 labelled paths
+     */
+    fei = fib_table_lookup(MPLS_FIB_DEFAULT_TABLE_ID,
+                          &pfx_24001_neos);
+
+    FIB_TEST(fib_test_validate_entry(fei,
+                                    FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS,
+                                    2,
+                                    &l99_neos_o_10_10_10_1,
+                                    &a_mpls_o_10_10_11_1),
+            "24001/neos LB 2 buckets via: "
+            "label 99 over 10.10.10.1, "
+            "adj-mpls over 10.10.11.2");
+
+    /*
+     * A labelled recursive
+     */
+    fib_prefix_t pfx_2_2_2_2_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4.as_u32 = clib_host_to_net_u32(0x02020202),
+       },
+    };
+    fib_test_lb_bucket_t l1600_eos_o_1_1_1_1 = {
+       .type = FT_LB_LABEL_O_LB,
+       .label_o_lb = {
+           .lb = non_eos_1_1_1_1.dpoi_index,
+           .label = 1600,
+           .eos = MPLS_EOS,
+       },
+    };
+
+    fib_table_entry_update_one_path(fib_index,
+                                   &pfx_2_2_2_2_s_32,
+                                   FIB_SOURCE_API,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP4,
+                                   &pfx_1_1_1_1_s_32.fp_addr,
+                                   ~0,
+                                   fib_index,
+                                   1,
+                                   1600,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup(fib_index, &pfx_2_2_2_2_s_32);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    1,
+                                    &l1600_eos_o_1_1_1_1),
+            "2.2.2.2.2/32 LB 1 buckets via: "
+            "label 1600 over 1.1.1.1");
+
+    /*
+     * we are holding a lock on the non-eos LB of the via-entry.
+     * do a PIC-core failover by shutting the link of the via-entry.
+     *
+     * shut down the link with the valid label
+     */
+    vnet_sw_interface_set_flags(vnet_get_main(),
+                               tm->hw[0]->sw_if_index,
+                               0);
+
+    fei = fib_table_lookup(fib_index, &pfx_1_1_1_1_s_32);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    2,
+                                    &a_o_10_10_11_1,
+                                    &adj_o_10_10_11_2),
+            "1.1.1.1/32 LB 2 buckets via: "
+            "adj over 10.10.11.1, ",
+            "adj-v4 over 10.10.11.2");
+
+    fei = fib_table_lookup(MPLS_FIB_DEFAULT_TABLE_ID,
+                          &pfx_24001_eos);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_MPLS_EOS,
+                                    2,
+                                    &a_o_10_10_11_1,
+                                    &adj_o_10_10_11_2),
+            "24001/eos LB 2 buckets via: "
+            "adj over 10.10.11.1, ",
+            "adj-v4 over 10.10.11.2");
+
+    fei = fib_table_lookup(MPLS_FIB_DEFAULT_TABLE_ID,
+                          &pfx_24001_neos);
+    FIB_TEST(fib_test_validate_entry(fei,
+                                    FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS,
+                                    1,
+                                    &a_mpls_o_10_10_11_1),
+            "24001/neos LB 1 buckets via: "
+            "adj-mpls over 10.10.11.2");
+
+    /*
+     * test that the pre-failover load-balance has been in-place
+     * modified
+     */
+    dpo_id_t current = DPO_NULL;
+    fib_entry_contribute_forwarding(fei,
+                                   FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS,
+                                   &current);
+
+    FIB_TEST(!dpo_cmp(&non_eos_1_1_1_1,
+                      &current),
+            "PIC-core LB inplace modified %U %U",
+             format_dpo_id, &non_eos_1_1_1_1, 0,
+             format_dpo_id, &current, 0);
+
+    dpo_reset(&non_eos_1_1_1_1);
+    dpo_reset(&current);
+
+    /*
+     * no-shut the link with the valid label
+     */
+    vnet_sw_interface_set_flags(vnet_get_main(),
+                               tm->hw[0]->sw_if_index,
+                               VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+
+    fei = fib_table_lookup(fib_index, &pfx_1_1_1_1_s_32);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    16, // 3 choices spread over 16 buckets
+                                    &l99_eos_o_10_10_10_1,
+                                    &l99_eos_o_10_10_10_1,
+                                    &l99_eos_o_10_10_10_1,
+                                    &l99_eos_o_10_10_10_1,
+                                    &l99_eos_o_10_10_10_1,
+                                    &l99_eos_o_10_10_10_1,
+                                    &a_o_10_10_11_1,
+                                    &a_o_10_10_11_1,
+                                    &a_o_10_10_11_1,
+                                    &a_o_10_10_11_1,
+                                    &a_o_10_10_11_1,
+                                    &adj_o_10_10_11_2,
+                                    &adj_o_10_10_11_2,
+                                    &adj_o_10_10_11_2,
+                                    &adj_o_10_10_11_2,
+                                    &adj_o_10_10_11_2),
+            "1.1.1.1/32 LB 16 buckets via: "
+            "label 99 over 10.10.10.1, "
+            "adj over 10.10.11.1",
+            "adj-v4 over 10.10.11.2");
+
+
+    fei = fib_table_lookup(MPLS_FIB_DEFAULT_TABLE_ID,
+                          &pfx_24001_eos);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_MPLS_EOS,
+                                    16, // 3 choices spread over 16 buckets
+                                    &l99_eos_o_10_10_10_1,
+                                    &l99_eos_o_10_10_10_1,
+                                    &l99_eos_o_10_10_10_1,
+                                    &l99_eos_o_10_10_10_1,
+                                    &l99_eos_o_10_10_10_1,
+                                    &l99_eos_o_10_10_10_1,
+                                    &a_o_10_10_11_1,
+                                    &a_o_10_10_11_1,
+                                    &a_o_10_10_11_1,
+                                    &a_o_10_10_11_1,
+                                    &a_o_10_10_11_1,
+                                    &adj_o_10_10_11_2,
+                                    &adj_o_10_10_11_2,
+                                    &adj_o_10_10_11_2,
+                                    &adj_o_10_10_11_2,
+                                    &adj_o_10_10_11_2),
+            "24001/eos LB 16 buckets via: "
+            "label 99 over 10.10.10.1, "
+            "adj over 10.10.11.1",
+            "adj-v4 over 10.10.11.2");
+
+    fei = fib_table_lookup(MPLS_FIB_DEFAULT_TABLE_ID,
+                          &pfx_24001_neos);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS,
+                                    2,
+                                    &l99_neos_o_10_10_10_1,
+                                    &a_mpls_o_10_10_11_1),
+            "24001/neos LB 2 buckets via: "
+            "label 99 over 10.10.10.1, "
+            "adj-mpls over 10.10.11.2");
+
+    /*
+     * remove the first path with the valid label
+     */
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_1_1_1_1_s_32,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_10_10_10_1,
+                               tm->hw[0]->sw_if_index,
+                               ~0, // invalid fib index
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup(fib_index, &pfx_1_1_1_1_s_32);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    2,
+                                    &a_o_10_10_11_1,
+                                    &adj_o_10_10_11_2),
+            "1.1.1.1/32 LB 2 buckets via: "
+            "adj over 10.10.11.1",
+            "adj-v4 over 10.10.11.2");
+
+    fei = fib_table_lookup(MPLS_FIB_DEFAULT_TABLE_ID,
+                          &pfx_24001_eos);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_MPLS_EOS,
+                                    2,
+                                    &a_o_10_10_11_1,
+                                    &adj_o_10_10_11_2),
+            "24001/eos LB 2 buckets via: "
+            "adj over 10.10.11.1",
+            "adj-v4 over 10.10.11.2");
+
+    fei = fib_table_lookup(MPLS_FIB_DEFAULT_TABLE_ID,
+                          &pfx_24001_neos);
+
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS,
+                                    1,
+                                    &a_mpls_o_10_10_11_1),
+            "24001/neos LB 1 buckets via: "
+            "adj-mpls over 10.10.11.2");
+
+    /*
+     * remove the other path with a valid label
+     */
+    fib_test_lb_bucket_t bucket_drop = {
+       .type = FT_LB_SPECIAL,
+       .special = {
+           .adj = 1,
+       },
+    };
+
+    fib_table_entry_path_remove(fib_index,
+                               &pfx_1_1_1_1_s_32,
+                               FIB_SOURCE_API,
+                               FIB_PROTOCOL_IP4,
+                               &nh_10_10_11_1,
+                               tm->hw[1]->sw_if_index,
+                               ~0, // invalid fib index
+                               1,
+                               FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup(fib_index, &pfx_1_1_1_1_s_32);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    1,
+                                    &adj_o_10_10_11_2),
+            "1.1.1.1/32 LB 1 buckets via: "
+            "adj over 10.10.11.2");
+
+    fei = fib_table_lookup(MPLS_FIB_DEFAULT_TABLE_ID,
+                          &pfx_24001_eos);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_MPLS_EOS,
+                                    1,
+                                    &adj_o_10_10_11_2),
+            "24001/eos LB 1 buckets via: "
+            "adj over 10.10.11.2");
+
+    fei = fib_table_lookup(MPLS_FIB_DEFAULT_TABLE_ID,
+                          &pfx_24001_neos);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS,
+                                     1,
+                                     &bucket_drop),
+            "24001/eos LB 1 buckets via: DROP");
+
+    /*
+     * add back the path with the valid label
+     */
+    fib_table_entry_path_add(fib_index,
+                            &pfx_1_1_1_1_s_32,
+                            FIB_SOURCE_API,
+                            FIB_ENTRY_FLAG_NONE,
+                            FIB_PROTOCOL_IP4,
+                            &nh_10_10_10_1,
+                            tm->hw[0]->sw_if_index,
+                            ~0, // invalid fib index
+                            1,
+                            99,
+                            FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup(fib_index, &pfx_1_1_1_1_s_32);
+    FIB_TEST(fib_test_validate_entry(fei,
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    2,
+                                    &l99_eos_o_10_10_10_1,
+                                    &adj_o_10_10_11_2),
+            "1.1.1.1/32 LB 2 buckets via: "
+            "label 99 over 10.10.10.1, "
+            "adj over 10.10.11.2");
+
+    fei = fib_table_lookup(MPLS_FIB_DEFAULT_TABLE_ID,
+                          &pfx_24001_eos);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_MPLS_EOS,
+                                    2,
+                                    &l99_eos_o_10_10_10_1,
+                                    &adj_o_10_10_11_2),
+            "24001/eos LB 2 buckets via: "
+            "label 99 over 10.10.10.1, "
+            "adj over 10.10.11.2");
+
+    fei = fib_table_lookup(MPLS_FIB_DEFAULT_TABLE_ID,
+                          &pfx_24001_neos);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS,
+                                    1,
+                                    &l99_neos_o_10_10_10_1),
+            "24001/neos LB 1 buckets via: "
+            "label 99 over 10.10.10.1");
+
+    /*
+     * remove the local label
+     */
+    fib_table_entry_local_label_remove(fib_index,
+                                      &pfx_1_1_1_1_s_32,
+                                      24001);
+
+    fei = fib_table_lookup(fib_index, &pfx_1_1_1_1_s_32);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    2,
+                                    &l99_eos_o_10_10_10_1,
+                                    &adj_o_10_10_11_2),
+            "24001/eos LB 2 buckets via: "
+            "label 99 over 10.10.10.1, "
+            "adj over 10.10.11.2");
+
+    FIB_TEST((FIB_NODE_INDEX_INVALID ==
+             mpls_fib_index_from_table_id(MPLS_FIB_DEFAULT_TABLE_ID)),
+            "No more MPLS FIB entries => table removed");
+
+    /*
+     * add another via-entry for the recursive
+     */
+    fib_prefix_t pfx_1_1_1_2_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4.as_u32 = clib_host_to_net_u32(0x01010102),
+       },
+    };
+    fib_test_lb_bucket_t l101_eos_o_10_10_10_1 = {
+       .type = FT_LB_LABEL_O_ADJ,
+       .label_o_adj = {
+           .adj = ai_mpls_10_10_10_1,
+           .label = 101,
+           .eos = MPLS_EOS,
+       },
+    };
+
+    fei = fib_table_entry_update_one_path(fib_index,
+                                         &pfx_1_1_1_2_s_32,
+                                         FIB_SOURCE_API,
+                                         FIB_ENTRY_FLAG_NONE,
+                                         FIB_PROTOCOL_IP4,
+                                         &nh_10_10_10_1,
+                                         tm->hw[0]->sw_if_index,
+                                         ~0, // invalid fib index
+                                         1,
+                                         101,
+                                         FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST(fib_test_validate_entry(fei,
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    1,
+                                    &l101_eos_o_10_10_10_1),
+            "1.1.1.2/32 LB 1 buckets via: "
+            "label 101 over 10.10.10.1");
+
+    dpo_id_t non_eos_1_1_1_2 = DPO_NULL;
+    fib_entry_contribute_forwarding(fib_table_lookup(fib_index,
+                                                    &pfx_1_1_1_1_s_32),
+                                   FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS,
+                                   &non_eos_1_1_1_1);
+    fib_entry_contribute_forwarding(fib_table_lookup(fib_index,
+                                                    &pfx_1_1_1_2_s_32),
+                                   FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS,
+                                   &non_eos_1_1_1_2);
+
+    fib_test_lb_bucket_t l1601_eos_o_1_1_1_2 = {
+       .type = FT_LB_LABEL_O_LB,
+       .label_o_lb = {
+           .lb = non_eos_1_1_1_2.dpoi_index,
+           .label = 1601,
+           .eos = MPLS_EOS,
+       },
+    };
+    l1600_eos_o_1_1_1_1.label_o_lb.lb = non_eos_1_1_1_1.dpoi_index;
+
+    fei = fib_table_entry_path_add(fib_index,
+                                  &pfx_2_2_2_2_s_32,
+                                  FIB_SOURCE_API,
+                                  FIB_ENTRY_FLAG_NONE,
+                                  FIB_PROTOCOL_IP4,
+                                  &pfx_1_1_1_2_s_32.fp_addr,
+                                  ~0,
+                                  fib_index,
+                                  1,
+                                  1601,
+                                  FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    2,
+                                    &l1600_eos_o_1_1_1_1,
+                                    &l1601_eos_o_1_1_1_2),
+            "2.2.2.2/32 LB 2 buckets via: "
+            "label 1600 via 1.1,1.1, "
+            "label 16001 via 1.1.1.2");
+
+    /*
+     * update the via-entry so it no longer has an imp-null path.
+     * the LB for the recursive can use an imp-null
+     */
+    fei = fib_table_entry_update_one_path(fib_index,
+                                         &pfx_1_1_1_2_s_32,
+                                         FIB_SOURCE_API,
+                                         FIB_ENTRY_FLAG_NONE,
+                                         FIB_PROTOCOL_IP4,
+                                         &nh_10_10_11_1,
+                                         tm->hw[1]->sw_if_index,
+                                         ~0, // invalid fib index
+                                         1,
+                                         MPLS_IETF_IMPLICIT_NULL_LABEL,
+                                         FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST(fib_test_validate_entry(fei,
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    1,
+                                    &a_o_10_10_11_1),
+            "1.1.1.2/32 LB 1 buckets via: "
+            "adj 10.10.11.1");
+    fei = fib_table_lookup(fib_index, &pfx_2_2_2_2_s_32);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    2,
+                                    &l1600_eos_o_1_1_1_1,
+                                    &l1601_eos_o_1_1_1_2),
+            "2.2.2.2/32 LB 2 buckets via: "
+            "label 1600 via 1.1,1.1, "
+            "label 16001 via 1.1.1.2");
+
+    /*
+     * update the via-entry so it no longer has labelled paths.
+     * the LB for the recursive should exclude this via from its LB
+     */
+    fei = fib_table_entry_update_one_path(fib_index,
+                                         &pfx_1_1_1_2_s_32,
+                                         FIB_SOURCE_API,
+                                         FIB_ENTRY_FLAG_NONE,
+                                         FIB_PROTOCOL_IP4,
+                                         &nh_10_10_11_1,
+                                         tm->hw[1]->sw_if_index,
+                                         ~0, // invalid fib index
+                                         1,
+                                         MPLS_LABEL_INVALID,
+                                         FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST(fib_test_validate_entry(fei,
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    1,
+                                    &a_o_10_10_11_1),
+            "1.1.1.2/32 LB 1 buckets via: "
+            "adj 10.10.11.1");
+    fei = fib_table_lookup(fib_index, &pfx_2_2_2_2_s_32);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    1,
+                                    &l1600_eos_o_1_1_1_1),
+            "2.2.2.2/32 LB 1 buckets via: "
+            "label 1600 via 1.1,1.1");
+
+    dpo_reset(&non_eos_1_1_1_1);
+    dpo_reset(&non_eos_1_1_1_2);
+
+    /*
+     * Add a recursive with no out-labels. We expect to use the IP of the via
+     */
+    fib_prefix_t pfx_2_2_2_3_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4.as_u32 = clib_host_to_net_u32(0x02020203),
+       },
+    };
+    dpo_id_t ip_1_1_1_1 = DPO_NULL;
+
+    fib_table_entry_update_one_path(fib_index,
+                                   &pfx_2_2_2_3_s_32,
+                                   FIB_SOURCE_API,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP4,
+                                   &pfx_1_1_1_1_s_32.fp_addr,
+                                   ~0,
+                                   fib_index,
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+
+    fib_entry_contribute_forwarding(fib_table_lookup(fib_index,
+                                                    &pfx_1_1_1_1_s_32),
+                                   FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                   &ip_1_1_1_1);
+
+    fib_test_lb_bucket_t ip_o_1_1_1_1 = {
+       .type = FT_LB_O_LB,
+       .lb = {
+           .lb = ip_1_1_1_1.dpoi_index,
+       },
+    };
+
+    fei = fib_table_lookup(fib_index, &pfx_2_2_2_3_s_32);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    1,
+                                    &ip_o_1_1_1_1),
+            "2.2.2.2.3/32 LB 1 buckets via: "
+            "ip 1.1.1.1");
+
+    /*
+     * Add a recursive with an imp-null out-label. 
+     * We expect to use the IP of the via
+     */
+    fib_prefix_t pfx_2_2_2_4_s_32 = {
+       .fp_len = 32,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr = {
+           .ip4.as_u32 = clib_host_to_net_u32(0x02020204),
+       },
+    };
+
+    fib_table_entry_update_one_path(fib_index,
+                                   &pfx_2_2_2_4_s_32,
+                                   FIB_SOURCE_API,
+                                   FIB_ENTRY_FLAG_NONE,
+                                   FIB_PROTOCOL_IP4,
+                                   &pfx_1_1_1_1_s_32.fp_addr,
+                                   ~0,
+                                   fib_index,
+                                   1,
+                                   MPLS_LABEL_INVALID,
+                                   FIB_ROUTE_PATH_FLAG_NONE);
+
+    fei = fib_table_lookup(fib_index, &pfx_2_2_2_4_s_32);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    1,
+                                    &ip_o_1_1_1_1),
+            "2.2.2.2.4/32 LB 1 buckets via: "
+            "ip 1.1.1.1");
+
+    dpo_reset(&ip_1_1_1_1);
+
+    /*
+     * cleanup
+     */
+    fib_table_entry_delete(fib_index,
+                          &pfx_1_1_1_2_s_32,
+                          FIB_SOURCE_API);
+
+    fei = fib_table_lookup(fib_index, &pfx_2_2_2_2_s_32);
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    1,
+                                    &l1600_eos_o_1_1_1_1),
+            "2.2.2.2/32 LB 1 buckets via: "
+            "label 1600 via 1.1,1.1");
+
+    fib_table_entry_delete(fib_index,
+                          &pfx_1_1_1_1_s_32,
+                          FIB_SOURCE_API);
+
+    FIB_TEST(fib_test_validate_entry(fei, 
+                                    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+                                    1,
+                                    &bucket_drop),
+            "2.2.2.2/32 LB 1 buckets via: DROP");
+
+    fib_table_entry_delete(fib_index,
+                          &pfx_2_2_2_2_s_32,
+                          FIB_SOURCE_API);
+    fib_table_entry_delete(fib_index,
+                          &pfx_2_2_2_3_s_32,
+                          FIB_SOURCE_API);
+    fib_table_entry_delete(fib_index,
+                          &pfx_2_2_2_4_s_32,
+                          FIB_SOURCE_API);
+
+    adj_unlock(ai_mpls_10_10_10_1);
+    adj_unlock(ai_mpls_10_10_11_2);
+    adj_unlock(ai_v4_10_10_11_1);
+    adj_unlock(ai_v4_10_10_11_2);
+    adj_unlock(ai_mpls_10_10_11_1);
+
+    FIB_TEST((0 == adj_nbr_db_size()), "ADJ DB size is %d",
+            adj_nbr_db_size());
+
+    local0_pfx.fp_len = 32;
+    fib_table_entry_delete(fib_index,
+                          &local0_pfx,
+                          FIB_SOURCE_INTERFACE);
+    local0_pfx.fp_len = 24;
+    fib_table_entry_delete(fib_index,
+                          &local0_pfx,
+                          FIB_SOURCE_INTERFACE);
+    local1_pfx.fp_len = 32;
+    fib_table_entry_delete(fib_index,
+                          &local1_pfx,
+                          FIB_SOURCE_INTERFACE);
+    local1_pfx.fp_len = 24;
+    fib_table_entry_delete(fib_index,
+                          &local1_pfx,
+                          FIB_SOURCE_INTERFACE);
+
+    /*
+     * +1 for the drop LB in the MPLS tables.
+     */
+    FIB_TEST(lb_count+1 == pool_elts(load_balance_pool),
+            "Load-balance resources freed %d of %d",
+             lb_count+1, pool_elts(load_balance_pool));
+}
+
+#define N_TEST_CHILDREN 4 /* number of child nodes attached to the parent */
+#define PARENT_INDEX 0 /* slot in fib_test_nodes used for the parent */
+
+/*
+ * A node in the test graph. The fib_node_t must remain the first member:
+ * the node callbacks cast the fib_node_t pointer they are given straight
+ * back to a fib_node_test_t pointer.
+ */
+typedef struct fib_node_test_t_
+{
+    fib_node_t node;
+    u32 sibling; /* value returned by fib_node_child_add for this node */
+    u32 index; /* this node's slot in fib_test_nodes */
+    fib_node_back_walk_ctx_t *ctxs; /* vector of contexts, one per back-walk visit */
+    u32 destroyed; /* set to 1 by the last-lock-gone callback */
+} fib_node_test_t;
+
+/* slot PARENT_INDEX is the parent, slots 1..N_TEST_CHILDREN the children */
+static fib_node_test_t fib_test_nodes[N_TEST_CHILDREN+1];
+
+#define PARENT() (&fib_test_nodes[PARENT_INDEX].node)
+
+/*
+ * Iterate _tc over each child slot. The loop counter 'ii' is not declared
+ * by the macro; it must already exist in the scope where the macro is used.
+ */
+#define FOR_EACH_TEST_CHILD(_tc)                     \
+    for (ii = 1, (_tc) = &fib_test_nodes[1];         \
+         ii < N_TEST_CHILDREN+1;                     \
+         ii++, (_tc) = &fib_test_nodes[ii])
+
+/**
+ * 'get' callback for the test node type: map a node index to the
+ * fib_node_t embedded in the matching slot of fib_test_nodes.
+ * The index is used as-is; it is not range checked.
+ */
+static fib_node_t *
+fib_test_child_get_node (fib_node_index_t index)
+{
+    fib_node_test_t *slot = &fib_test_nodes[index];
+
+    return (&slot->node);
+}
+
+/*
+ * Selects what a test node does each time a back-walk visits it:
+ *   1 - start a synchronous walk from the visited node
+ *   2 - start a high priority asynchronous walk from the visited node
+ *   any other value - start nothing
+ */
+static int fib_test_walk_spawns_walks;
+
+/**
+ * Back-walk callback for the test node type.
+ * Appends a copy of the walk context to the visited node's ctxs vector,
+ * optionally spawns a further walk (see fib_test_walk_spawns_walks) and
+ * always asks for the walk to continue.
+ */
+static fib_node_back_walk_rc_t
+fib_test_child_back_walk_notify (fib_node_t *node,
+                                 fib_node_back_walk_ctx_t *ctx)
+{
+    fib_node_test_t *visited = (fib_node_test_t*) node;
+
+    /* one vector element per visit, so the test can count them */
+    vec_add1(visited->ctxs, *ctx);
+
+    switch (fib_test_walk_spawns_walks)
+    {
+    case 1:
+        fib_walk_sync(FIB_NODE_TYPE_TEST, visited->index, ctx);
+        break;
+    case 2:
+        fib_walk_async(FIB_NODE_TYPE_TEST, visited->index,
+                       FIB_WALK_PRIORITY_HIGH, ctx);
+        break;
+    default:
+        break;
+    }
+
+    return (FIB_NODE_BACK_WALK_CONTINUE);
+}
+
+/**
+ * 'last lock gone' callback for the test node type: flag the test node
+ * as destroyed so the test can check that every lock was released.
+ */
+static void
+fib_test_child_last_lock_gone (fib_node_t *node)
+{
+    ((fib_node_test_t *) node)->destroyed = 1;
+}
+
+/**
+ * The virtual function table for the test node type. fib_test_walk()
+ * registers it against FIB_NODE_TYPE_TEST.
+ */
+static const fib_node_vft_t fib_test_child_vft = {
+    .fnv_get = fib_test_child_get_node,
+    .fnv_last_lock = fib_test_child_last_lock_gone,
+    .fnv_back_walk = fib_test_child_back_walk_notify,
+};
+
+/*
+ * Prototypes for functions that are not defined in this file.
+ * fib_walk_process_queues processes the walks on the async queue; it
+ * should have been static but is not, so that this test can call it
+ * directly. fib_walk_queue_get_size is compared against expected queue
+ * depths by the test (presumably the number of walks queued at the given
+ * priority - confirm against its definition).
+ */
+f64 fib_walk_process_queues(vlib_main_t * vm,
+                            const f64 quota);
+u32 fib_walk_queue_get_size(fib_walk_priority_t prio);
+
+/**
+ * Unit test for the FIB graph walk infrastructure.
+ *
+ * Builds a parent test node with N_TEST_CHILDREN children and then checks,
+ * by counting the contexts recorded on each child:
+ *  - a single async walk visits each child once
+ *  - high priority walks are performed before low priority ones
+ *  - walks with the same reason merge, walks with different reasons do not
+ *  - a walk given a zero quota advances one child at a time, and a later
+ *    async or sync walk catches up and merges with it
+ *  - sync and async walks of a looped graph terminate
+ *  - the parent is destroyed once all locks on it are released
+ *
+ * The caller's comment in fib_test() notes that the fib-walk process must
+ * be disabled for these expectations to hold.
+ */
+static void
+fib_test_walk (void)
+{
+    fib_node_back_walk_ctx_t high_ctx = {}, low_ctx = {};
+    fib_node_test_t *tc;
+    vlib_main_t *vm;
+    u32 ii;
+
+    vm = vlib_get_main();
+    fib_node_register_type(FIB_NODE_TYPE_TEST, &fib_test_child_vft);
+
+    /*
+     * start from a known state, so that the spawn mode and the destroyed
+     * flags left behind by a previous run cannot affect this one.
+     */
+    fib_test_walk_spawns_walks = 0;
+    fib_test_nodes[PARENT_INDEX].destroyed = 0;
+
+    /*
+     * init a fake node on which we will add children
+     */
+    fib_node_init(&fib_test_nodes[PARENT_INDEX].node,
+                  FIB_NODE_TYPE_TEST);
+
+    FOR_EACH_TEST_CHILD(tc)
+    {
+        fib_node_init(&tc->node, FIB_NODE_TYPE_TEST);
+        fib_node_lock(&tc->node);
+        tc->ctxs = NULL;
+        tc->index = ii;
+        tc->destroyed = 0;
+        tc->sibling = fib_node_child_add(FIB_NODE_TYPE_TEST,
+                                         PARENT_INDEX,
+                                         FIB_NODE_TYPE_TEST, ii);
+    }
+
+    /*
+     * enqueue a walk across the parent's children.
+     *
+     * NOTE(review): this and two later phases assign
+     * FIB_NODE_BW_REASON_RESOLVE, whereas the other phases assign
+     * FIB_NODE_BW_REASON_FLAG_RESOLVE to the same field - confirm which
+     * constant is intended.
+     */
+    high_ctx.fnbw_reason = FIB_NODE_BW_REASON_RESOLVE;
+
+    fib_walk_async(FIB_NODE_TYPE_TEST, PARENT_INDEX,
+                   FIB_WALK_PRIORITY_HIGH, &high_ctx);
+    FIB_TEST(N_TEST_CHILDREN+1 == fib_node_list_get_size(PARENT()->fn_children),
+             "Parent has %d children pre-walk",
+             fib_node_list_get_size(PARENT()->fn_children));
+
+    /*
+     * give the walk a large amount of time so it gets to the end
+     */
+    fib_walk_process_queues(vm, 1);
+
+    FOR_EACH_TEST_CHILD(tc)
+    {
+        FIB_TEST(1 == vec_len(tc->ctxs),
+                 "%d child visitsed %d times",
+                 ii, vec_len(tc->ctxs));
+        vec_free(tc->ctxs);
+    }
+    FIB_TEST(0 == fib_walk_queue_get_size(FIB_WALK_PRIORITY_HIGH),
+             "Queue is empty post walk");
+    FIB_TEST(N_TEST_CHILDREN == fib_node_list_get_size(PARENT()->fn_children),
+             "Parent has %d children post walk",
+             fib_node_list_get_size(PARENT()->fn_children));
+
+    /*
+     * walk again. should be no increase in the number of visits, since
+     * the walk will have terminated.
+     */
+    fib_walk_process_queues(vm, 1);
+
+    FOR_EACH_TEST_CHILD(tc)
+    {
+        FIB_TEST(0 == vec_len(tc->ctxs),
+                 "%d child visitsed %d times",
+                 ii, vec_len(tc->ctxs));
+    }
+
+    /*
+     * schedule a low and high priority walk. expect the high to be performed
+     * before the low.
+     * schedule the high prio walk first so that it is further from the head
+     * of the dependency list. that way it won't merge with the low one.
+     */
+    high_ctx.fnbw_reason = FIB_NODE_BW_REASON_FLAG_RESOLVE;
+    low_ctx.fnbw_reason = FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE;
+
+    fib_walk_async(FIB_NODE_TYPE_TEST, PARENT_INDEX,
+                   FIB_WALK_PRIORITY_HIGH, &high_ctx);
+    fib_walk_async(FIB_NODE_TYPE_TEST, PARENT_INDEX,
+                   FIB_WALK_PRIORITY_LOW, &low_ctx);
+
+    fib_walk_process_queues(vm, 1);
+
+    FOR_EACH_TEST_CHILD(tc)
+    {
+        FIB_TEST(high_ctx.fnbw_reason == tc->ctxs[0].fnbw_reason,
+                 "%d child visitsed by high prio walk", ii);
+        FIB_TEST(low_ctx.fnbw_reason  == tc->ctxs[1].fnbw_reason,
+                 "%d child visitsed by low prio walk", ii);
+        vec_free(tc->ctxs);
+    }
+    FIB_TEST(0 == fib_walk_queue_get_size(FIB_WALK_PRIORITY_HIGH),
+             "Queue is empty post prio walk");
+    FIB_TEST(N_TEST_CHILDREN == fib_node_list_get_size(PARENT()->fn_children),
+             "Parent has %d children post prio walk",
+             fib_node_list_get_size(PARENT()->fn_children));
+
+    /*
+     * schedule 2 walks of the same priority that can be merged.
+     * expect that each child is thus visited only once.
+     */
+    high_ctx.fnbw_reason = FIB_NODE_BW_REASON_FLAG_RESOLVE;
+    low_ctx.fnbw_reason  = FIB_NODE_BW_REASON_FLAG_RESOLVE;
+
+    fib_walk_async(FIB_NODE_TYPE_TEST, PARENT_INDEX,
+                   FIB_WALK_PRIORITY_HIGH, &high_ctx);
+    fib_walk_async(FIB_NODE_TYPE_TEST, PARENT_INDEX,
+                   FIB_WALK_PRIORITY_HIGH, &low_ctx);
+
+    fib_walk_process_queues(vm, 1);
+
+    FOR_EACH_TEST_CHILD(tc)
+    {
+        FIB_TEST(1 == vec_len(tc->ctxs),
+                 "%d child visitsed %d times during merge walk",
+                 ii, vec_len(tc->ctxs));
+        vec_free(tc->ctxs);
+    }
+    FIB_TEST(0 == fib_walk_queue_get_size(FIB_WALK_PRIORITY_HIGH),
+             "Queue is empty post merge walk");
+    FIB_TEST(N_TEST_CHILDREN == fib_node_list_get_size(PARENT()->fn_children),
+             "Parent has %d children post merge walk",
+             fib_node_list_get_size(PARENT()->fn_children));
+
+    /*
+     * schedule 2 walks of the same priority that cannot be merged.
+     * expect that each child is thus visited twice and in the order
+     * in which the walks were scheduled.
+     */
+    high_ctx.fnbw_reason = FIB_NODE_BW_REASON_FLAG_RESOLVE;
+    low_ctx.fnbw_reason  = FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE;
+
+    fib_walk_async(FIB_NODE_TYPE_TEST, PARENT_INDEX,
+                   FIB_WALK_PRIORITY_HIGH, &high_ctx);
+    fib_walk_async(FIB_NODE_TYPE_TEST, PARENT_INDEX,
+                   FIB_WALK_PRIORITY_HIGH, &low_ctx);
+
+    fib_walk_process_queues(vm, 1);
+
+    FOR_EACH_TEST_CHILD(tc)
+    {
+        FIB_TEST(high_ctx.fnbw_reason == tc->ctxs[0].fnbw_reason,
+                 "%d child visitsed by high prio walk", ii);
+        FIB_TEST(low_ctx.fnbw_reason  == tc->ctxs[1].fnbw_reason,
+                 "%d child visitsed by low prio walk", ii);
+        vec_free(tc->ctxs);
+    }
+    FIB_TEST(0 == fib_walk_queue_get_size(FIB_WALK_PRIORITY_HIGH),
+             "Queue is empty post no-merge walk");
+    FIB_TEST(N_TEST_CHILDREN == fib_node_list_get_size(PARENT()->fn_children),
+             "Parent has %d children post no-merge walk",
+             fib_node_list_get_size(PARENT()->fn_children));
+
+    /*
+     * schedule a walk that makes one child's worth of progress.
+     * we do this by giving the queue draining process zero
+     * time quanta. it's a do..while loop, so it does something.
+     */
+    high_ctx.fnbw_reason = FIB_NODE_BW_REASON_RESOLVE;
+
+    fib_walk_async(FIB_NODE_TYPE_TEST, PARENT_INDEX,
+                   FIB_WALK_PRIORITY_HIGH, &high_ctx);
+    fib_walk_process_queues(vm, 0);
+
+    FOR_EACH_TEST_CHILD(tc)
+    {
+        if (ii == N_TEST_CHILDREN)
+        {
+            FIB_TEST(1 == vec_len(tc->ctxs),
+                     "%d child visitsed %d times in zero quanta walk",
+                     ii, vec_len(tc->ctxs));
+        }
+        else
+        {
+            FIB_TEST(0 == vec_len(tc->ctxs),
+                     "%d child visitsed %d times in 0 quanta walk",
+                     ii, vec_len(tc->ctxs));
+        }
+    }
+    FIB_TEST(1 == fib_walk_queue_get_size(FIB_WALK_PRIORITY_HIGH),
+             "Queue is not empty post zero quanta walk");
+    FIB_TEST(N_TEST_CHILDREN+1 == fib_node_list_get_size(PARENT()->fn_children),
+             "Parent has %d children post zero qunta walk",
+             fib_node_list_get_size(PARENT()->fn_children));
+
+    /*
+     * another one step
+     */
+    fib_walk_process_queues(vm, 0);
+
+    FOR_EACH_TEST_CHILD(tc)
+    {
+        if (ii >= N_TEST_CHILDREN-1)
+        {
+            FIB_TEST(1 == vec_len(tc->ctxs),
+                     "%d child visitsed %d times in 2nd zero quanta walk",
+                     ii, vec_len(tc->ctxs));
+        }
+        else
+        {
+            FIB_TEST(0 == vec_len(tc->ctxs),
+                     "%d child visitsed %d times in 2nd 0 quanta walk",
+                     ii, vec_len(tc->ctxs));
+        }
+    }
+    FIB_TEST(1 == fib_walk_queue_get_size(FIB_WALK_PRIORITY_HIGH),
+             "Queue is not empty post zero quanta walk");
+    FIB_TEST(N_TEST_CHILDREN+1 == fib_node_list_get_size(PARENT()->fn_children),
+             "Parent has %d children post zero qunta walk",
+             fib_node_list_get_size(PARENT()->fn_children));
+
+    /*
+     * schedule another walk that will catch-up and merge.
+     */
+    fib_walk_async(FIB_NODE_TYPE_TEST, PARENT_INDEX,
+                   FIB_WALK_PRIORITY_HIGH, &high_ctx);
+    fib_walk_process_queues(vm, 1);
+
+    FOR_EACH_TEST_CHILD(tc)
+    {
+        if (ii >= N_TEST_CHILDREN-1)
+        {
+            FIB_TEST(2 == vec_len(tc->ctxs),
+                     "%d child visitsed %d times in 2nd zero quanta merge walk",
+                     ii, vec_len(tc->ctxs));
+            vec_free(tc->ctxs);
+        }
+        else
+        {
+            FIB_TEST(1 == vec_len(tc->ctxs),
+                     "%d child visitsed %d times in 2nd 0 quanta merge walk",
+                     ii, vec_len(tc->ctxs));
+            vec_free(tc->ctxs);
+        }
+    }
+    /* the message now matches the condition: the queue must be empty */
+    FIB_TEST(0 == fib_walk_queue_get_size(FIB_WALK_PRIORITY_HIGH),
+             "Queue is empty post 2nd zero quanta merge walk");
+    FIB_TEST(N_TEST_CHILDREN == fib_node_list_get_size(PARENT()->fn_children),
+             "Parent has %d children post 2nd zero qunta merge walk",
+             fib_node_list_get_size(PARENT()->fn_children));
+
+    /*
+     * park an async walk in the middle of the list, then have a sync walk
+     * catch it. same expectations as async catches async.
+     */
+    high_ctx.fnbw_reason = FIB_NODE_BW_REASON_RESOLVE;
+
+    fib_walk_async(FIB_NODE_TYPE_TEST, PARENT_INDEX,
+                   FIB_WALK_PRIORITY_HIGH, &high_ctx);
+
+    fib_walk_process_queues(vm, 0);
+    fib_walk_process_queues(vm, 0);
+
+    fib_walk_sync(FIB_NODE_TYPE_TEST, PARENT_INDEX, &high_ctx);
+
+    FOR_EACH_TEST_CHILD(tc)
+    {
+        if (ii >= N_TEST_CHILDREN-1)
+        {
+            FIB_TEST(2 == vec_len(tc->ctxs),
+                     "%d child visitsed %d times in sync catches async walk",
+                     ii, vec_len(tc->ctxs));
+            vec_free(tc->ctxs);
+        }
+        else
+        {
+            FIB_TEST(1 == vec_len(tc->ctxs),
+                     "%d child visitsed %d times in sync catches async walk",
+                     ii, vec_len(tc->ctxs));
+            vec_free(tc->ctxs);
+        }
+    }
+    /* messages name this scenario, not the previous (merge) one */
+    FIB_TEST(0 == fib_walk_queue_get_size(FIB_WALK_PRIORITY_HIGH),
+             "Queue is empty post sync catches async walk");
+    FIB_TEST(N_TEST_CHILDREN == fib_node_list_get_size(PARENT()->fn_children),
+             "Parent has %d children post sync catches async walk",
+             fib_node_list_get_size(PARENT()->fn_children));
+
+    /*
+     * make the parent a child of one of its children, thus inducing a routing loop.
+     */
+    fib_test_nodes[PARENT_INDEX].sibling =
+        fib_node_child_add(FIB_NODE_TYPE_TEST,
+                           1, // the first child
+                           FIB_NODE_TYPE_TEST,
+                           PARENT_INDEX);
+
+    /*
+     * execute a sync walk from the parent. each child visited spawns more sync
+     * walks. we expect the walk to terminate.
+     */
+    fib_test_walk_spawns_walks = 1;
+
+    fib_walk_sync(FIB_NODE_TYPE_TEST, PARENT_INDEX, &high_ctx);
+
+    FOR_EACH_TEST_CHILD(tc)
+    {
+        /*
+         * child 1 - which is last in the list - has the loop.
+         * the other children are thus visited first. then we meet
+         * child 1. we go round the loop again, visiting the other children.
+         * then we meet the walk in the dep list and bail. child 1 is not
+         * visited again.
+         */
+        if (1 == ii)
+        {
+            FIB_TEST(1 == vec_len(tc->ctxs),
+                     "child %d visitsed %d times during looped sync walk",
+                     ii, vec_len(tc->ctxs));
+        }
+        else
+        {
+            FIB_TEST(2 == vec_len(tc->ctxs),
+                     "child %d visitsed %d times during looped sync walk",
+                     ii, vec_len(tc->ctxs));
+        }
+        vec_free(tc->ctxs);
+    }
+    FIB_TEST(N_TEST_CHILDREN == fib_node_list_get_size(PARENT()->fn_children),
+             "Parent has %d children post sync loop walk",
+             fib_node_list_get_size(PARENT()->fn_children));
+
+    /*
+     * the walk doesn't reach the max depth because the infra knows that sync
+     * meets sync implies a loop and bails early.
+     */
+    FIB_TEST(high_ctx.fnbw_depth == 9,
+             "Walk context depth %d post sync loop walk",
+             high_ctx.fnbw_depth);
+
+    /*
+     * execute an async walk of the graph loop, with each child spawning
+     * sync walks
+     */
+    high_ctx.fnbw_depth = 0;
+    fib_walk_async(FIB_NODE_TYPE_TEST, PARENT_INDEX,
+                   FIB_WALK_PRIORITY_HIGH, &high_ctx);
+
+    fib_walk_process_queues(vm, 1);
+
+    FOR_EACH_TEST_CHILD(tc)
+    {
+        /*
+         * we don't really care how many times the children are visited,
+         * as long as it is at least once.
+         */
+        FIB_TEST(1 <= vec_len(tc->ctxs),
+                 "child %d visitsed %d times during looped aync spawns sync walk",
+                 ii, vec_len(tc->ctxs));
+        vec_free(tc->ctxs);
+    }
+
+    /*
+     * execute an async walk of the graph loop, with each child spawning
+     * async walks
+     */
+    fib_test_walk_spawns_walks = 2;
+    high_ctx.fnbw_depth = 0;
+    fib_walk_async(FIB_NODE_TYPE_TEST, PARENT_INDEX,
+                   FIB_WALK_PRIORITY_HIGH, &high_ctx);
+
+    fib_walk_process_queues(vm, 1);
+
+    FOR_EACH_TEST_CHILD(tc)
+    {
+        /*
+         * we don't really care how many times the children are visited,
+         * as long as it is at least once.
+         */
+        FIB_TEST(1 <= vec_len(tc->ctxs),
+                 "child %d visitsed %d times during looped async spawns async walk",
+                 ii, vec_len(tc->ctxs));
+        vec_free(tc->ctxs);
+    }
+
+
+    fib_node_child_remove(FIB_NODE_TYPE_TEST,
+                          1, // the first child
+                          fib_test_nodes[PARENT_INDEX].sibling);
+
+    /*
+     * cleanup
+     */
+    FOR_EACH_TEST_CHILD(tc)
+    {
+        fib_node_child_remove(FIB_NODE_TYPE_TEST, PARENT_INDEX,
+                              tc->sibling);
+        fib_node_deinit(&tc->node);
+        fib_node_unlock(&tc->node);
+    }
+    fib_node_deinit(PARENT());
+
+    /*
+     * The parent will be destroyed when the last lock on it goes.
+     * this test ensures all the walk objects are unlocking it.
+     */
+    FIB_TEST((1 == fib_test_nodes[PARENT_INDEX].destroyed),
+             "Parent was destroyed");
+}
+
+/**
+ * Unit test for de-aggregation (deag) entries in the MPLS label FIB.
+ *
+ * After MPLS-enabling the first test interface it checks:
+ *  - the IPv6 explicit-null EOS entry exists and its first bucket is a
+ *    lookup DPO for IPv6, on destination address, in the input
+ *    interface's table
+ *  - an EOS entry for deag_label added via the zero address in fib_index
+ *    yields an IPv4 destination lookup in that table, and can be deleted
+ *  - a non-EOS entry for deag_label added via the zero address in
+ *    lfib_index yields an MPLS lookup in that table, and can be deleted
+ * Finally it disables MPLS on the interface and checks the load-balance
+ * pool is back to its starting size plus one.
+ */
+static void
+lfib_test_deagg (void)
+{
+    const mpls_label_t deag_label = 50;
+    const u32 lfib_index = 0;
+    const u32 fib_index = 0;
+    dpo_id_t dpo = DPO_NULL;
+    const dpo_id_t *dpo1;
+    fib_node_index_t lfe;
+    lookup_dpo_t *lkd;
+    test_main_t *tm;
+    int lb_count;
+
+    tm = &test_main;
+    lb_count = pool_elts(load_balance_pool);
+
+    FIB_TEST((0 == adj_nbr_db_size()), "ADJ DB size is %d",
+            adj_nbr_db_size());
+
+    /*
+     * MPLS enable an interface so we get the MPLS table created
+     */
+    mpls_sw_interface_enable_disable(&mpls_main,
+                                     tm->hw[0]->sw_if_index,
+                                     1);
+
+    /*
+     * Test the specials stack properly.
+     * The diagnostics print the explicit-null label, since that is the
+     * entry being checked here.
+     */
+    fib_prefix_t exp_null_v6_pfx = {
+       .fp_proto = FIB_PROTOCOL_MPLS,
+       .fp_eos = MPLS_EOS,
+       .fp_label = MPLS_IETF_IPV6_EXPLICIT_NULL_LABEL,
+       .fp_payload_proto = DPO_PROTO_IP6,
+    };
+    lfe = fib_table_lookup(lfib_index, &exp_null_v6_pfx);
+    FIB_TEST((FIB_NODE_INDEX_INVALID != lfe),
+            "%U/%U present",
+            format_mpls_unicast_label, MPLS_IETF_IPV6_EXPLICIT_NULL_LABEL,
+            format_mpls_eos_bit, MPLS_EOS);
+    fib_entry_contribute_forwarding(lfe,
+                                   FIB_FORW_CHAIN_TYPE_MPLS_EOS,
+                                   &dpo);
+    dpo1 = load_balance_get_bucket(dpo.dpoi_index, 0);
+    lkd = lookup_dpo_get(dpo1->dpoi_index);
+
+    FIB_TEST((fib_index == lkd->lkd_fib_index),
+              "%U/%U is deag in %d %U",
+             format_mpls_unicast_label, MPLS_IETF_IPV6_EXPLICIT_NULL_LABEL,
+             format_mpls_eos_bit, MPLS_EOS,
+             lkd->lkd_fib_index,
+             format_dpo_id, &dpo, 0);
+    FIB_TEST((LOOKUP_INPUT_DST_ADDR == lkd->lkd_input),
+             "%U/%U is dst deag",
+             format_mpls_unicast_label, MPLS_IETF_IPV6_EXPLICIT_NULL_LABEL,
+             format_mpls_eos_bit, MPLS_EOS);
+    FIB_TEST((LOOKUP_TABLE_FROM_INPUT_INTERFACE == lkd->lkd_table),
+             "%U/%U is lookup in interface's table",
+             format_mpls_unicast_label, MPLS_IETF_IPV6_EXPLICIT_NULL_LABEL,
+             format_mpls_eos_bit, MPLS_EOS);
+    FIB_TEST((DPO_PROTO_IP6 == lkd->lkd_proto),
+             "%U/%U is %U dst deag",
+             format_mpls_unicast_label, MPLS_IETF_IPV6_EXPLICIT_NULL_LABEL,
+             format_mpls_eos_bit, MPLS_EOS,
+             format_dpo_proto, lkd->lkd_proto);
+
+
+    /*
+     * A deag route for EOS
+     */
+    fib_prefix_t pfx = {
+       .fp_proto = FIB_PROTOCOL_MPLS,
+       .fp_eos = MPLS_EOS,
+       .fp_label = deag_label,
+       .fp_payload_proto = DPO_PROTO_IP4,
+    };
+    lfe = fib_table_entry_path_add(lfib_index,
+                                  &pfx,
+                                  FIB_SOURCE_CLI,
+                                  FIB_ENTRY_FLAG_NONE,
+                                  FIB_PROTOCOL_IP4,
+                                  &zero_addr,
+                                  ~0,
+                                  fib_index,
+                                  1,
+                                  MPLS_LABEL_INVALID,
+                                  FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST((lfe == fib_table_lookup(lfib_index, &pfx)),
+              "%U/%U present",
+              format_mpls_unicast_label, deag_label,
+              format_mpls_eos_bit, MPLS_EOS);
+
+    fib_entry_contribute_forwarding(lfe,
+                                   FIB_FORW_CHAIN_TYPE_MPLS_EOS,
+                                   &dpo);
+    dpo1 = load_balance_get_bucket(dpo.dpoi_index, 0);
+    lkd = lookup_dpo_get(dpo1->dpoi_index);
+
+    FIB_TEST((fib_index == lkd->lkd_fib_index),
+              "%U/%U is deag in %d %U",
+             format_mpls_unicast_label, deag_label,
+             format_mpls_eos_bit, MPLS_EOS,
+             lkd->lkd_fib_index,
+             format_dpo_id, &dpo, 0);
+    FIB_TEST((LOOKUP_INPUT_DST_ADDR == lkd->lkd_input),
+             "%U/%U is dst deag",
+             format_mpls_unicast_label, deag_label,
+             format_mpls_eos_bit, MPLS_EOS);
+    FIB_TEST((DPO_PROTO_IP4 == lkd->lkd_proto),
+             "%U/%U is %U dst deag",
+             format_mpls_unicast_label, deag_label,
+             format_mpls_eos_bit, MPLS_EOS,
+             format_dpo_proto, lkd->lkd_proto);
+
+    fib_table_entry_delete_index(lfe, FIB_SOURCE_CLI);
+
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fib_table_lookup(lfib_index,
+                                                        &pfx)),
+              "%U/%U not present",
+              format_mpls_unicast_label, deag_label,
+              format_mpls_eos_bit, MPLS_EOS);
+
+    /*
+     * A deag route for non-EOS
+     */
+    pfx.fp_eos = MPLS_NON_EOS;
+    lfe = fib_table_entry_path_add(lfib_index,
+                                  &pfx,
+                                  FIB_SOURCE_CLI,
+                                  FIB_ENTRY_FLAG_NONE,
+                                  FIB_PROTOCOL_IP4,
+                                  &zero_addr,
+                                  ~0,
+                                  lfib_index,
+                                  1,
+                                  MPLS_LABEL_INVALID,
+                                  FIB_ROUTE_PATH_FLAG_NONE);
+
+    FIB_TEST((lfe == fib_table_lookup(lfib_index, &pfx)),
+              "%U/%U present",
+              format_mpls_unicast_label, deag_label,
+              format_mpls_eos_bit, MPLS_NON_EOS);
+
+    fib_entry_contribute_forwarding(lfe,
+                                   FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS,
+                                   &dpo);
+    dpo1 = load_balance_get_bucket(dpo.dpoi_index, 0);
+    lkd = lookup_dpo_get(dpo1->dpoi_index);
+
+    /*
+     * this path was added via lfib_index, so that is the table the
+     * lookup DPO is compared against.
+     */
+    FIB_TEST((lfib_index == lkd->lkd_fib_index),
+              "%U/%U is deag in %d %U",
+             format_mpls_unicast_label, deag_label,
+             format_mpls_eos_bit, MPLS_NON_EOS,
+             lkd->lkd_fib_index,
+             format_dpo_id, &dpo, 0);
+    FIB_TEST((LOOKUP_INPUT_DST_ADDR == lkd->lkd_input),
+             "%U/%U is dst deag",
+             format_mpls_unicast_label, deag_label,
+             format_mpls_eos_bit, MPLS_NON_EOS);
+
+    FIB_TEST((DPO_PROTO_MPLS == lkd->lkd_proto),
+             "%U/%U is %U dst deag",
+             format_mpls_unicast_label, deag_label,
+             format_mpls_eos_bit, MPLS_NON_EOS,
+             format_dpo_proto, lkd->lkd_proto);
+
+    fib_table_entry_delete_index(lfe, FIB_SOURCE_CLI);
+
+    /* pfx is still the non-EOS prefix here, so report it as such */
+    FIB_TEST((FIB_NODE_INDEX_INVALID == fib_table_lookup(lfib_index,
+                                                        &pfx)),
+              "%U/%U not present",
+              format_mpls_unicast_label, deag_label,
+              format_mpls_eos_bit, MPLS_NON_EOS);
+
+
+    mpls_sw_interface_enable_disable(&mpls_main,
+                                     tm->hw[0]->sw_if_index,
+                                     0);
+
+    dpo_reset(&dpo);
+    /*
+     * +1 for the drop LB in the MPLS tables.
+     */
+    FIB_TEST(lb_count+1 == pool_elts(load_balance_pool),
+            "Load-balance resources freed %d of %d",
+             lb_count+1, pool_elts(load_balance_pool));
+}
+
+/**
+ * CLI handler for "test lfib": creates the test interfaces and runs the
+ * label FIB deag test. None of the handler arguments are used, and no
+ * error is ever returned.
+ */
+static clib_error_t *
+lfib_test (vlib_main_t * vm,
+           unformat_input_t * input,
+           vlib_cli_command_t * cmd_arg)
+{
+    clib_error_t *error = NULL;
+
+    fib_test_mk_intf(4);
+    lfib_test_deagg();
+
+    return (error);
+}
+
+/**
+ * CLI handler for "test fib".
+ *
+ * Creates the test interfaces, then runs the suite selected by the first
+ * keyword that matches the input: "ip", "gre", "label", "ae" or "walk".
+ * When no keyword matches, every suite except the walk tests is run.
+ * No error is ever returned.
+ */
+static clib_error_t *
+fib_test (vlib_main_t * vm,
+          unformat_input_t * input,
+          vlib_cli_command_t * cmd_arg)
+{
+    fib_test_mk_intf(4);
+
+    if (unformat (input, "ip"))
+    {
+        fib_test_v4();
+        fib_test_v6();
+        return (NULL);
+    }
+    if (unformat (input, "gre"))
+    {
+        fib_test_gre();
+        return (NULL);
+    }
+    if (unformat (input, "label"))
+    {
+        fib_test_label();
+        return (NULL);
+    }
+    if (unformat (input, "ae"))
+    {
+        fib_test_ae();
+        return (NULL);
+    }
+    if (unformat (input, "walk"))
+    {
+        fib_test_walk();
+        return (NULL);
+    }
+
+    /*
+     * No keyword matched: run the full suite.
+     * The walk UT are not part of the full suite, since the fib-walk
+     * process must be disabled in order for those tests to work.
+     */
+    fib_test_v4();
+    fib_test_v6();
+    fib_test_gre();
+    fib_test_ae();
+    fib_test_label();
+
+    return (NULL);
+}
+
+/*
+ * CLI registration: "test fib" is handled by fib_test
+ */
+VLIB_CLI_COMMAND (test_fib_command, static) = {
+    .path = "test fib",
+    .short_help = "fib unit tests - DO NOT RUN ON A LIVE SYSTEM",
+    .function = fib_test,
+};
+
+/*
+ * CLI registration: "test lfib" is handled by lfib_test
+ */
+VLIB_CLI_COMMAND (test_lfib_command, static) = {
+    .path = "test lfib",
+    .short_help = "mpls label fib unit tests - DO NOT RUN ON A LIVE SYSTEM",
+    .function = lfib_test,
+};
+
+/**
+ * Init function for the FIB unit tests. It does no work and reports
+ * no error.
+ */
+clib_error_t *
+fib_test_init (vlib_main_t *vm)
+{
+    return (NULL);
+}
+
+VLIB_INIT_FUNCTION (fib_test_init);
diff --git a/vnet/vnet/fib/fib_types.c b/vnet/vnet/fib/fib_types.c
new file mode 100644 (file)
index 0000000..bf76c55
--- /dev/null
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/ip/ip.h>
+
+#include <vnet/fib/fib_types.h>
+#include <vnet/fib/fib_internal.h>
+#include <vnet/mpls/mpls.h>
+
+/*
+ * Arrays of protocol, link and forwarding-chain names, initialised from the
+ * FIB_PROTOCOLS, FIB_LINKS and FIB_FORW_CHAINS initialiser macros.
+ * The format functions below index them directly by enum value.
+ */
+static const char* fib_protocol_names[] = FIB_PROTOCOLS;
+static const char* fib_link_names[] = FIB_LINKS;
+static const char* fib_forw_chain_names[] = FIB_FORW_CHAINS;
+
+/**
+ * Format a fib_protocol_t as its name string.
+ *
+ * The single variadic argument is read as an int, since the narrower enum
+ * is promoted when passed through a variable argument list. The name table
+ * is indexed without a range check.
+ *
+ * NOTE(review): this takes the va_list by value, whereas the other format
+ * functions in this file (e.g. format_fib_prefix) take a 'va_list *'.
+ * Confirm this matches how callers invoke it.
+ */
+u8 *
+format_fib_protocol (u8 * s, va_list ap)
+{
+    fib_protocol_t proto = va_arg(ap, int); // fib_protocol_t promotion
+
+    return (format (s, "%s", fib_protocol_names[proto]));
+}
+
+/**
+ * Format a fib_link_t as its name string.
+ *
+ * The single variadic argument is read as an int, since the narrower enum
+ * is promoted when passed through a variable argument list. The name table
+ * is indexed without a range check.
+ *
+ * NOTE(review): this takes the va_list by value, whereas the other format
+ * functions in this file (e.g. format_fib_prefix) take a 'va_list *'.
+ * Confirm this matches how callers invoke it.
+ */
+u8 *
+format_fib_link (u8 * s, va_list ap)
+{
+    fib_link_t link = va_arg(ap, int); // fib_link_t promotion
+
+    return (format (s, "%s", fib_link_names[link]));
+}
+
+/**
+ * Format a fib_forward_chain_type_t as its name string.
+ *
+ * The argument is read as an int (the promoted type of the enum) and the
+ * name table is indexed without a range check.
+ */
+u8 *
+format_fib_forw_chain_type (u8 * s, va_list * args)
+{
+    fib_forward_chain_type_t chain_type;
+    const char *name;
+
+    chain_type = va_arg(*args, int);
+    name = fib_forw_chain_names[chain_type];
+
+    return format (s, "%s", name);
+}
+
+/**
+ * Build a host prefix from an IP4/IP6 address.
+ *
+ * The address must not be all zeros (asserted). The protocol is derived
+ * from the address with ip46_address_is_ip4(); the length is set to 32 for
+ * IPv4 and 128 for IPv6, and the address is copied into the prefix.
+ */
+void
+fib_prefix_from_ip46_addr (const ip46_address_t *addr,
+                           fib_prefix_t *pfx)
+{
+    ASSERT(!ip46_address_is_zero(addr));
+
+    if (ip46_address_is_ip4(addr))
+    {
+        pfx->fp_proto = FIB_PROTOCOL_IP4;
+        pfx->fp_len = 32;
+    }
+    else
+    {
+        pfx->fp_proto = FIB_PROTOCOL_IP6;
+        pfx->fp_len = 128;
+    }
+
+    pfx->fp_addr = *addr;
+}
+
+/**
+ * Compare two prefixes.
+ *
+ * The comparison keys are, in order: the protocol; then for IP prefixes
+ * the length followed by the address, and for MPLS prefixes the label
+ * followed by the EOS bit.
+ *
+ * @return 0 if the prefixes are equal, otherwise the non-zero difference
+ *         of the first key that differs.
+ */
+int
+fib_prefix_cmp (const fib_prefix_t *p1,
+                const fib_prefix_t *p2)
+{
+    int diff;
+
+    diff = (p1->fp_proto - p2->fp_proto);
+
+    if (0 != diff)
+    {
+        return (diff);
+    }
+
+    switch (p1->fp_proto)
+    {
+    case FIB_PROTOCOL_IP4:
+    case FIB_PROTOCOL_IP6:
+        diff = (p1->fp_len - p2->fp_len);
+
+        if (0 != diff)
+        {
+            return (diff);
+        }
+        return (ip46_address_cmp(&p1->fp_addr, &p2->fp_addr));
+
+    case FIB_PROTOCOL_MPLS:
+        diff = (p1->fp_label - p2->fp_label);
+
+        if (0 != diff)
+        {
+            return (diff);
+        }
+        return (p1->fp_eos - p2->fp_eos);
+    }
+
+    /* same protocol value, but not one handled above: still 0 here */
+    return (diff);
+}
+
+/**
+ * Test whether p1 is a cover for p2.
+ *
+ * For IP prefixes the result is that of the per-protocol
+ * ipX_destination_matches_route() helper, called with both addresses and
+ * p1's length. Only p1's protocol is inspected. MPLS prefixes always
+ * yield 0.
+ *
+ * @return non-zero if p1 covers p2, 0 otherwise.
+ */
+int
+fib_prefix_is_cover (const fib_prefix_t *p1,
+                     const fib_prefix_t *p2)
+{
+    int covers = 0;
+
+    switch (p1->fp_proto)
+    {
+    case FIB_PROTOCOL_IP4:
+        covers = ip4_destination_matches_route(&ip4_main,
+                                               &p1->fp_addr.ip4,
+                                               &p2->fp_addr.ip4,
+                                               p1->fp_len);
+        break;
+    case FIB_PROTOCOL_IP6:
+        covers = ip6_destination_matches_route(&ip6_main,
+                                               &p1->fp_addr.ip6,
+                                               &p2->fp_addr.ip6,
+                                               p1->fp_len);
+        break;
+    case FIB_PROTOCOL_MPLS:
+        break;
+    }
+
+    return (covers);
+}
+
+/**
+ * Test whether the prefix is a host prefix.
+ *
+ * An IPv4 prefix is a host prefix when its length is 32, an IPv6 prefix
+ * when its length is 128. An MPLS prefix is always reported as a host
+ * prefix.
+ *
+ * @return non-zero for a host prefix, 0 otherwise.
+ */
+int
+fib_prefix_is_host (const fib_prefix_t *prefix)
+{
+    int is_host = 0;
+
+    switch (prefix->fp_proto)
+    {
+    case FIB_PROTOCOL_IP4:
+        is_host = (32 == prefix->fp_len);
+        break;
+    case FIB_PROTOCOL_IP6:
+        is_host = (128 == prefix->fp_len);
+        break;
+    case FIB_PROTOCOL_MPLS:
+        is_host = 1;
+        break;
+    }
+
+    return (is_host);
+}
+
+/**
+ * Format a prefix. The argument is a pointer to a fib_prefix_t.
+ *
+ * IP prefixes are printed as the address ANDed with the FIB mask for
+ * fp_len; MPLS prefixes are printed as "<label>:<eos>". A "/<fp_len>"
+ * suffix is appended for every protocol.
+ */
+u8 *
+format_fib_prefix (u8 * s, va_list * args)
+{
+    fib_prefix_t *pfx = va_arg (*args, fib_prefix_t *);
+
+    /*
+     * each protocol is formatted separately so that ::/0 prints correctly.
+     */
+    switch (pfx->fp_proto)
+    {
+    case FIB_PROTOCOL_IP4:
+    {
+        ip4_address_t masked4;
+
+        masked4 = pfx->fp_addr.ip4;
+        masked4.as_u32 &= ip4_main.fib_masks[pfx->fp_len];
+
+        s = format (s, "%U", format_ip4_address, &masked4);
+        break;
+    }
+    case FIB_PROTOCOL_IP6:
+    {
+        ip6_address_t masked6;
+
+        masked6 = pfx->fp_addr.ip6;
+        ip6_address_mask(&masked6, &(ip6_main.fib_masks[pfx->fp_len]));
+
+        s = format (s, "%U", format_ip6_address, &masked6);
+        break;
+    }
+    case FIB_PROTOCOL_MPLS:
+        s = format (s, "%U:%U",
+                    format_mpls_unicast_label, pfx->fp_label,
+                    format_mpls_eos_bit, pfx->fp_eos);
+        break;
+    }
+
+    return (format (s, "/%d", pfx->fp_len));
+}
+
+/**
+ * Compare two route paths.
+ *
+ * The comparison keys are, in order: the next-hop address, the software
+ * interface (via vnet_sw_interface_compare) and, only when the first
+ * path's address is all zeros, the FIB index.
+ *
+ * @return 0 if the paths compare equal, otherwise the non-zero result of
+ *         the first key that differs.
+ */
+int
+fib_route_path_cmp (const fib_route_path_t *rpath1,
+                    const fib_route_path_t *rpath2)
+{
+    int diff;
+
+    diff = ip46_address_cmp(&rpath1->frp_addr,
+                            &rpath2->frp_addr);
+
+    if (0 == diff)
+    {
+        diff = vnet_sw_interface_compare(vnet_get_main(),
+                                         rpath1->frp_sw_if_index,
+                                         rpath2->frp_sw_if_index);
+
+        if ((0 == diff) && ip46_address_is_zero(&rpath1->frp_addr))
+        {
+            diff = rpath1->frp_fib_index - rpath2->frp_fib_index;
+        }
+    }
+
+    return (diff);
+}
+
+/**
+ * Convert a FIB protocol to the corresponding DPO protocol.
+ *
+ * The switch deliberately has no default case. A value outside the enum
+ * asserts and then returns 0.
+ */
+dpo_proto_t
+fib_proto_to_dpo (fib_protocol_t fib_proto)
+{
+    switch (fib_proto)
+    {
+    case FIB_PROTOCOL_IP4:
+        return DPO_PROTO_IP4;
+    case FIB_PROTOCOL_IP6:
+        return DPO_PROTO_IP6;
+    case FIB_PROTOCOL_MPLS:
+        return DPO_PROTO_MPLS;
+    }
+
+    ASSERT(0);
+    return 0;
+}
+
+/**
+ * Convert a DPO protocol to the corresponding FIB protocol.
+ *
+ * The switch deliberately has no default case. A value outside the enum
+ * asserts and then returns 0.
+ */
+fib_protocol_t
+dpo_proto_to_fib (dpo_proto_t dpo_proto)
+{
+    switch (dpo_proto)
+    {
+    case DPO_PROTO_IP4:
+        return FIB_PROTOCOL_IP4;
+    case DPO_PROTO_IP6:
+        return FIB_PROTOCOL_IP6;
+    case DPO_PROTO_MPLS:
+        return FIB_PROTOCOL_MPLS;
+    }
+
+    ASSERT(0);
+    return 0;
+}
+
+/**
+ * Convert a FIB protocol to the corresponding link type.
+ *
+ * The switch deliberately has no default case. A value outside the enum
+ * asserts and then returns 0.
+ */
+fib_link_t
+fib_proto_to_link (fib_protocol_t proto)
+{
+    switch (proto)
+    {
+    case FIB_PROTOCOL_MPLS:
+        return FIB_LINK_MPLS;
+    case FIB_PROTOCOL_IP6:
+        return FIB_LINK_IP6;
+    case FIB_PROTOCOL_IP4:
+        return FIB_LINK_IP4;
+    }
+
+    ASSERT(0);
+    return 0;
+}
+
+/**
+ * Convert a payload protocol to a forwarding chain type.
+ *
+ * IPv4 and IPv6 map to their unicast chain types; MPLS maps to the
+ * non-EOS MPLS chain type. A value outside the enum asserts and then
+ * returns the unicast IPv4 chain type.
+ */
+fib_forward_chain_type_t
+fib_proto_to_forw_chain_type (fib_protocol_t proto)
+{
+    switch (proto)
+    {
+    case FIB_PROTOCOL_MPLS:
+        return FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS;
+    case FIB_PROTOCOL_IP6:
+        return FIB_FORW_CHAIN_TYPE_UNICAST_IP6;
+    case FIB_PROTOCOL_IP4:
+        return FIB_FORW_CHAIN_TYPE_UNICAST_IP4;
+    }
+
+    ASSERT(0);
+    return FIB_FORW_CHAIN_TYPE_UNICAST_IP4;
+}
+
+/**
+ * Convert a forwarding chain type to the adjacency's link type.
+ *
+ * The EOS MPLS chain type cannot be converted (the chain type alone does
+ * not carry enough information); it asserts and, like any value outside
+ * the enum, yields FIB_LINK_IP4.
+ */
+fib_link_t
+fib_forw_chain_type_to_link_type (fib_forward_chain_type_t fct)
+{
+    switch (fct)
+    {
+    case FIB_FORW_CHAIN_TYPE_UNICAST_IP4:
+        return FIB_LINK_IP4;
+    case FIB_FORW_CHAIN_TYPE_UNICAST_IP6:
+        return FIB_LINK_IP6;
+    case FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS:
+        return FIB_LINK_MPLS;
+    case FIB_FORW_CHAIN_TYPE_MPLS_EOS:
+        /*
+         * insufficient information to convert
+         */
+        ASSERT(0);
+        break;
+    }
+
+    return FIB_LINK_IP4;
+}
+
+/**
+ * Convert a forwarding chain type to the DPO protocol it will install.
+ *
+ * The EOS MPLS chain type cannot be converted (the chain type alone does
+ * not carry enough information); it asserts and, like any value outside
+ * the enum, yields DPO_PROTO_IP4.
+ *
+ * @param fct the forwarding chain type
+ * @return the corresponding DPO protocol
+ */
+dpo_proto_t
+fib_forw_chain_type_to_dpo_proto (fib_forward_chain_type_t fct)
+{
+    switch (fct)
+    {
+    case FIB_FORW_CHAIN_TYPE_UNICAST_IP4:
+       return (DPO_PROTO_IP4);
+    case FIB_FORW_CHAIN_TYPE_UNICAST_IP6:
+       return (DPO_PROTO_IP6);
+    case FIB_FORW_CHAIN_TYPE_MPLS_EOS:
+       /*
+        * insufficient information to convert
+        */
+       ASSERT(0);
+       break;
+    case FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS:
+       return (DPO_PROTO_MPLS);
+    }
+    /*
+     * the fallback must be a dpo_proto_t value, not a fib_link_t one
+     */
+    return (DPO_PROTO_IP4);
+}
diff --git a/vnet/vnet/fib/fib_types.h b/vnet/vnet/fib/fib_types.h
new file mode 100644 (file)
index 0000000..4ebd68d
--- /dev/null
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FIB_TYPES_H__
+#define __FIB_TYPES_H__
+
+#include <vlib/vlib.h>
+#include <vnet/ip/ip6_packet.h>
+#include <vnet/mpls/packet.h>
+#include <vnet/dpo/dpo.h>
+
+/**
+ * A typedef of a node index.
+ * we make this typedef so the code becomes easier for a human to parse.
+ */
+typedef u32 fib_node_index_t;
+#define FIB_NODE_INDEX_INVALID ((fib_node_index_t)(~0))
+
+/**
+ * Protocol Type. packed so it consumes a u8 only
+ *
+ * In debug images (CLIB_DEBUG > 0) the first value is 1 rather than 0, so
+ * every enumerator is one greater than in a non-debug image.
+ * NOTE(review): presumably this is so that a zeroed/uninitialised protocol
+ * is not mistaken for IPv4 in debug images - confirm.
+ */
+typedef enum fib_protocol_t_ {
+#if CLIB_DEBUG > 0
+    FIB_PROTOCOL_IP4 = 1,
+#else
+    FIB_PROTOCOL_IP4 = 0,
+#endif
+    FIB_PROTOCOL_IP6,
+    FIB_PROTOCOL_MPLS,
+}  __attribute__ ((packed)) fib_protocol_t;
+
+#define FIB_PROTOCOLS {                        \
+    [FIB_PROTOCOL_IP4] = "ipv4",       \
+    [FIB_PROTOCOL_IP6] = "ipv6",        \
+    [FIB_PROTOCOL_MPLS] = "MPLS",       \
+}
+
+/**
+ * Definition outside of enum so it does not need to be included in non-defaulted
+ * switch statements
+ */
+#define FIB_PROTOCOL_MAX (FIB_PROTOCOL_MPLS + 1)
+
+/**
+ * Not part of the enum so it does not have to be handled in switch statements
+ */
+#define FIB_PROTOCOL_NONE (FIB_PROTOCOL_MAX+1)
+
+/**
+ * Link Type. This maps directly into the ethertype.
+ *
+ * In debug images (CLIB_DEBUG > 0) the first value is 1 rather than 0, so
+ * every enumerator is one greater than in a non-debug image.
+ * Packed in the same way as fib_protocol_t.
+ */
+typedef enum fib_link_t_ {
+#if CLIB_DEBUG > 0
+    FIB_LINK_IP4 = 1,
+#else
+    FIB_LINK_IP4 = 0,
+#endif
+    FIB_LINK_IP6,
+    FIB_LINK_MPLS,
+}  __attribute__ ((packed)) fib_link_t;
+
+/**
+ * Definition outside of enum so it does not need to be included in non-defaulted
+ * switch statements
+ */
+#define FIB_LINK_NUM (FIB_LINK_MPLS+1)
+
+#define FIB_LINKS {            \
+    [FIB_LINK_IP4] = "ipv4",   \
+    [FIB_LINK_IP6] = "ipv6",   \
+    [FIB_LINK_MPLS] = "mpls",   \
+}
+
+#define FOR_EACH_FIB_LINK(_item)  \
+    for (_item = FIB_LINK_IP4;   \
+        _item <= FIB_LINK_MPLS;  \
+        _item++)
+
+#define FOR_EACH_FIB_IP_LINK(_item)  \
+    for (_item = FIB_LINK_IP4;      \
+        _item <= FIB_LINK_IP6;      \
+        _item++)
+
+/**
+ * @brief Convert from a protocol to a link type
+ */
+fib_link_t fib_proto_to_link (fib_protocol_t proto);
+
+/**
+ * FIB output chain type. When a child object requests a forwarding contribution
+ * from a parent, it does so for a particular scenario. This enumerates those
+ * scenarios.
+ */
+typedef enum fib_forward_chain_type_t_ {
+    /**
+     * Contribute an object that is to be used to forward IP4 packets
+     */
+    FIB_FORW_CHAIN_TYPE_UNICAST_IP4,
+    /**
+     * Contribute an object that is to be used to forward IP6 packets
+     */
+    FIB_FORW_CHAIN_TYPE_UNICAST_IP6,
+    /**
+     * Contribute an object that is to be used to forward non-end-of-stack
+     * MPLS packets
+     */
+    FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS,
+    /**
+     * Contribute an object that is to be used to forward end-of-stack
+     * MPLS packets. This is a convenient ID for clients. A real EOS chain
+     * must be payload-protocol specific. This
+     * option is converted into one of the other three internally.
+     */
+    FIB_FORW_CHAIN_TYPE_MPLS_EOS,
+}  __attribute__ ((packed)) fib_forward_chain_type_t;
+
+#define FIB_FORW_CHAINS {                                      \
+    [FIB_FORW_CHAIN_TYPE_UNICAST_IP4]   = "unicast-ip4",       \
+    [FIB_FORW_CHAIN_TYPE_UNICAST_IP6]   = "unicast-ip6",       \
+    [FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS]  = "mpls-neos",         \
+    [FIB_FORW_CHAIN_TYPE_MPLS_EOS]      = "mpls-eos",          \
+}
+
+#define FIB_FORW_CHAIN_NUM (FIB_FORW_CHAIN_TYPE_MPLS_EOS+1)
+
+#define FOR_EACH_FIB_FORW_CHAIN(_item)                   \
+    for (_item = FIB_FORW_CHAIN_TYPE_UNICAST_IP4;        \
+        _item <= FIB_FORW_CHAIN_TYPE_MPLS_EOS;           \
+        _item++)
+
+/**
+ * @brief Convert from a chain type to the adjacencies link type
+ */
+extern fib_link_t fib_forw_chain_type_to_link_type(fib_forward_chain_type_t fct);
+
+/**
+ * @brief Convert from a payload-protocol to a chain type.
+ */
+extern fib_forward_chain_type_t fib_proto_to_forw_chain_type(fib_protocol_t proto);
+
+/**
+ * @brief Convert from a chain type to the DPO proto it will install
+ */
+extern dpo_proto_t fib_forw_chain_type_to_dpo_proto(fib_forward_chain_type_t fct);
+
+/**
+ * Aggregate type for a prefix
+ */
+typedef struct fib_prefix_t_ {
+    /**
+     * The mask length
+     */
+    u16 fp_len;
+
+    /**
+     * protocol type
+     */
+    fib_protocol_t fp_proto;
+
+    /**
+     * Pad to keep the address 4 byte aligned
+     */
+    u8 ___fp___pad;
+
+    union {
+       /**
+        * The address type is not derivable from the fp_addr member.
+        * If it's v4, then the first 3 u32s of the address will be 0.
+        * v6 addresses (even v4 mapped ones) have at least 2 u32s assigned
+        * to non-zero values. True, but when it's all zero, one cannot decide.
+        */
+       ip46_address_t fp_addr;
+
+       /**
+        * The MPLS view of the prefix; it shares storage with fp_addr.
+        */
+       struct {
+           mpls_label_t fp_label;
+           mpls_eos_bit_t fp_eos;
+           /**
+            * This protocol determines the payload protocol of packets
+            * that will be forwarded by this entry once the label is popped.
+            * For a non-eos entry it will be MPLS.
+            */
+           dpo_proto_t fp_payload_proto;
+       };
+    };
+} fib_prefix_t;
+
+_Static_assert(STRUCT_OFFSET_OF(fib_prefix_t, fp_addr) == 4,
+              "FIB Prefix's address is 4 byte aligned.");
+
+/**
+ * \brief Compare two prefixes for equality
+ */
+extern int fib_prefix_cmp(const fib_prefix_t *p1,
+                         const fib_prefix_t *p2);
+
+/**
+ * \brief Compare two prefixes for covering relationship
+ *
+ * \return non-zero if the first prefix is a cover for the second
+ */
+extern int fib_prefix_is_cover(const fib_prefix_t *p1,
+                              const fib_prefix_t *p2);
+
+/**
+ * \brief Return true if the prefix is a host prefix
+ */
+extern int fib_prefix_is_host(const fib_prefix_t *p);
+
+extern u8 * format_fib_prefix(u8 * s, va_list * args);
+extern u8 * format_fib_forw_chain_type(u8 * s, va_list * args);
+
+extern dpo_proto_t fib_proto_to_dpo(fib_protocol_t fib_proto);
+extern fib_protocol_t dpo_proto_to_fib(dpo_proto_t dpo_proto);
+
+/**
+ * Enumeration of special path/entry types
+ */
+typedef enum fib_special_type_t_ {
+    /**
+     * Marker. Add new types after this one.
+     */
+    FIB_SPECIAL_TYPE_FIRST = 0,
+    /**
+     * Local/for-us paths
+     */
+    FIB_SPECIAL_TYPE_LOCAL = FIB_SPECIAL_TYPE_FIRST,
+    /**
+     * drop paths
+     */
+    FIB_SPECIAL_TYPE_DROP,
+    /**
+     * Marker. Add new types before this one, then update it.
+     */
+    FIB_SPECIAL_TYPE_LAST = FIB_SPECIAL_TYPE_DROP,
+} __attribute__ ((packed)) fib_special_type_t;
+
+/**
+ * The maximum number of types
+ */
+#define FIB_SPEICAL_TYPE_MAX (FIB_SPEICAL_TYPE_LAST + 1)
+
+#define FOR_EACH_FIB_SPEICAL_TYPE(_item)               \
+    for (_item = FIB_TYPE_SPEICAL_FIRST;               \
+        _item <= FIB_SPEICAL_TYPE_LAST; _item++)
+
+extern u8 * format_fib_protocol(u8 * s, va_list ap);
+extern u8 * format_fib_link(u8 *s, va_list ap);
+
+/**
+ * Path flags from the control plane
+ */
+typedef enum fib_route_path_flags_t_
+{
+    FIB_ROUTE_PATH_FLAG_NONE = 0,
+    /**
+     * Recursion constraint of via a host prefix
+     */
+    FIB_ROUTE_PATH_RESOLVE_VIA_HOST = (1 << 0),
+    /**
+     * Recursion constraint of via an attached prefix
+     */
+    FIB_ROUTE_PATH_RESOLVE_VIA_ATTACHED = (1 << 1),
+} fib_route_path_flags_t;
+
+/**
+ * @brief
+ * A representation of a path as described by a route producer.
+ * These parameters will determine the path 'type', of which there are:
+ * 1) Attached-next-hop:
+ *   a single peer on a link.
+ *   It is 'attached' because it is in the same sub-net as the router, on a link
+ *   directly connected to the router.
+ *   It is 'next-hop' since the next-hop address of the peer is known.
+ * 2) Attached:
+ *  the next-hop is not known, but we can ARP for it.
+ * 3) Recursive:
+ *  The next-hop is known but the interface is not. So to find the adj to use
+ *  we must recursively resolve the next-hop.
+ * 4) Deaggregate (deag):
+ *  A further lookup is required.
+ */
+typedef struct fib_route_path_t_ {
+    /**
+     * The protocol of the address below. We need this since the all
+     * zeros address is ambiguous.
+     */
+    fib_protocol_t frp_proto;
+    /**
+     * The next-hop address.
+     * Will be NULL for attached paths.
+     * Will be all zeros for attached-next-hop paths on a p2p interface
+     * Will be all zeros for a deag path.
+     */
+    ip46_address_t frp_addr;
+    /**
+     * The interface.
+     * Will be invalid for recursive paths.
+     */
+    u32 frp_sw_if_index;
+    /**
+     * The FIB index to lookup the nexthop
+     * Only valid for recursive paths.
+     */
+    u32 frp_fib_index;
+    /**
+     * [un]equal cost path weight
+     */
+    u32 frp_weight;
+    /**
+     * flags on the path
+     */
+    fib_route_path_flags_t frp_flags;
+    /**
+     * The outgoing MPLS label. INVALID implies no label.
+     */
+    mpls_label_t frp_label;
+} fib_route_path_t;
+
+#endif
diff --git a/vnet/vnet/fib/fib_walk.c b/vnet/vnet/fib/fib_walk.c
new file mode 100644 (file)
index 0000000..79e3ad0
--- /dev/null
@@ -0,0 +1,775 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/fib/fib_walk.h>
+#include <vnet/fib/fib_node_list.h>
+
+/**
+ * The flags on a walk
+ */
+typedef enum fib_walk_flags_t_
+{
+    /**
+     * A synchronous walk.
+     * This walk will run to completion, i.e. visit ALL the children.
+     * It is a depth first traversal of the graph.
+     */
+    FIB_WALK_FLAG_SYNC = (1 << 0),
+    /**
+     * An asynchronous walk.
+     * This walk will be scheduled to run in the background. It will thus visit
+     * the children at a later point in time.
+     * It is a depth first traversal of the graph.
+     */
+    FIB_WALK_FLAG_ASYNC = (1 << 1),
+    /**
+     * An indication that the walk is currently executing.
+     */
+    FIB_WALK_FLAG_EXECUTING = (1 << 2),
+} fib_walk_flags_t;
+
+/**
+ * A representation of a graph walk from a parent object to its children
+ */
+typedef struct fib_walk_t_
+{
+    /**
+     * FIB node linkage. This object is not in the FIB object graph,
+     * but it is present in other nodes' dependency lists, so it needs to
+     * be addressable via a fib_node_ptr_t.
+     */
+    fib_node_t fw_node;
+
+    /**
+     * the walk's flags
+     */
+    fib_walk_flags_t fw_flags;
+
+    /**
+     * Sibling index in the dependency list
+     */
+    u32 fw_dep_sibling;
+
+    /**
+     * Sibling index in the list of all walks
+     */
+    u32 fw_prio_sibling;
+
+    /**
+     * Pointer to the node whose dependants this walk is walking
+     */
+    fib_node_ptr_t fw_parent;
+
+    /**
+     * Number of nodes visited by this walk. saved for debugging purposes.
+     */
+    u32 fw_n_visits;
+
+    /**
+     * The reasons this walk is occurring.
+     * This is a vector ordered in time. The reasons at the front were started
+     * first, and so should be acted on first when a node is visited.
+     */
+    fib_node_back_walk_ctx_t *fw_ctx;
+} fib_walk_t;
+
+/**
+ * @brief The pool of all walk objects
+ */
+static fib_walk_t *fib_walk_pool;
+
+/**
+ * @brief There's only one event type sent to the walk process
+ */
+#define FIB_WALK_EVENT 0
+
+/**
+ * Statistics maintained per-walk queue
+ */
+typedef enum fib_walk_queue_stats_t_
+{
+    /* counted each time a walk is enqueued on the queue */
+    FIB_WALK_SCHEDULED,
+    /* counted each time the queue servicing destroys a finished walk */
+    FIB_WALK_COMPLETED,
+} fib_walk_queue_stats_t;
+#define FIB_WALK_QUEUE_STATS_NUM (FIB_WALK_COMPLETED+1)
+
+#define FIB_WALK_QUEUE_STATS {           \
+    [FIB_WALK_SCHEDULED] = "scheduled",  \
+    [FIB_WALK_COMPLETED] = "completed",  \
+}
+
+#define FOR_EACH_FIB_WALK_QUEUE_STATS(_wqs)   \
+    for ((_wqs) = FIB_WALK_SCHEDULED;         \
+         (_wqs) < FIB_WALK_QUEUE_STATS_NUM;   \
+         (_wqs)++)
+
+/**
+ * The names of the walk stats
+ */
+static const char * const fib_walk_queue_stats_names[] = FIB_WALK_QUEUE_STATS;
+
+/**
+ * A representation of one queue of walks
+ */
+typedef struct fib_walk_queue_t_
+{
+    /**
+     * Queue stats, indexed by fib_walk_queue_stats_t
+     */
+    u64 fwq_stats[FIB_WALK_QUEUE_STATS_NUM];
+
+    /**
+     * The node list which acts as the queue
+     */
+    fib_node_list_t fwq_queue;
+} fib_walk_queue_t;
+
+/**
+ * A set of priority queues for outstanding walks
+ */
+typedef struct fib_walk_queues_t_
+{
+    fib_walk_queue_t fwqs_queues[FIB_WALK_PRIORITY_NUM];
+} fib_walk_queues_t;
+
+/**
+ * The global queues of outstanding walks
+ */
+static fib_walk_queues_t fib_walk_queues;
+
+/**
+ * The names of the walk priorities
+ */
+static const char * const fib_walk_priority_names[] = FIB_WALK_PRIORITIES;
+
+/**
+ * Format a walk priority as its name string.
+ *
+ * The priority is asserted to be less than FIB_WALK_PRIORITY_NUM before it
+ * is used to index the name table.
+ *
+ * NOTE(review): this takes the va_list by value rather than as a
+ * 'va_list *', and reads the argument as fib_walk_priority_t rather than
+ * a promoted int - confirm both against the type's definition and the
+ * callers.
+ */
+u8*
+format_fib_walk_priority (u8 *s, va_list ap)
+{
+    fib_walk_priority_t prio = va_arg(ap, fib_walk_priority_t);
+
+    ASSERT(prio < FIB_WALK_PRIORITY_NUM);
+
+    return (format(s, "%s", fib_walk_priority_names[prio]));
+}
+/**
+ * Format a walk queue statistic type as its name string.
+ *
+ * The value is asserted to be less than FIB_WALK_QUEUE_STATS_NUM before it
+ * is used to index the name table.
+ *
+ * NOTE(review): this takes the va_list by value rather than as a
+ * 'va_list *' - confirm against the callers.
+ */
+static u8*
+format_fib_walk_queue_stats (u8 *s, va_list ap)
+{
+    fib_walk_queue_stats_t wqs = va_arg(ap, fib_walk_queue_stats_t);
+
+    ASSERT(wqs < FIB_WALK_QUEUE_STATS_NUM);
+
+    return (format(s, "%s", fib_walk_queue_stats_names[wqs]));
+}
+
+/**
+ * @brief Get the pool index of a walk object: its offset from the base of
+ * the walk pool.
+ */
+static index_t
+fib_walk_get_index (fib_walk_t *fwalk)
+{
+    return fwalk - fib_walk_pool;
+}
+
+/**
+ * @brief Get the walk object at the given index in the walk pool.
+ */
+static fib_walk_t *
+fib_walk_get (index_t fwi)
+{
+    fib_walk_t *fwalk;
+
+    fwalk = pool_elt_at_index(fib_walk_pool, fwi);
+
+    return fwalk;
+}
+
+/**
+ * @brief Get the number of walks on the queue for the given priority.
+ *
+ * Not static so that it can be used in the unit tests.
+ */
+u32
+fib_walk_queue_get_size (fib_walk_priority_t prio)
+{
+    fib_walk_queue_t *queue = &fib_walk_queues.fwqs_queues[prio];
+
+    return fib_node_list_get_size(queue->fwq_queue);
+}
+
+/**
+ * @brief Get the index of the walk at the front of the queue for the given
+ * priority. The walk is not removed from the queue.
+ */
+static fib_node_index_t
+fib_walk_queue_get_front (fib_walk_priority_t prio)
+{
+    fib_walk_queue_t *queue = &fib_walk_queues.fwqs_queues[prio];
+    fib_node_ptr_t front;
+
+    fib_node_list_get_front(queue->fwq_queue, &front);
+
+    return front.fnp_index;
+}
+
+/**
+ * @brief Destroy a walk object.
+ *
+ * Removes the walk from its priority queue (if it was enqueued on one) and
+ * from its parent's child list, frees the vector of back-walk contexts,
+ * and returns the object to the walk pool.
+ *
+ * @param fwalk the walk to destroy; must not be used after this call.
+ */
+static void
+fib_walk_destroy (fib_walk_t *fwalk)
+{
+    if (FIB_NODE_INDEX_INVALID != fwalk->fw_prio_sibling)
+    {
+        fib_node_list_elt_remove(fwalk->fw_prio_sibling);
+    }
+    fib_node_child_remove(fwalk->fw_parent.fnp_type,
+                          fwalk->fw_parent.fnp_index,
+                          fwalk->fw_dep_sibling);
+
+    /*
+     * the context vector is allocated when the walk is created;
+     * release it here so it is not leaked when the walk is put back
+     * in the pool.
+     */
+    vec_free(fwalk->fw_ctx);
+
+    fib_node_deinit(&fwalk->fw_node);
+    pool_put(fib_walk_pool, fwalk);
+}
+
+/**
+ * return code when advancing a walk
+ */
+typedef enum fib_walk_advance_rc_t_
+{
+    /**
+     * The walk is complete
+     */
+    FIB_WALK_ADVANCE_DONE,
+    /**
+     * the walk has more work
+     */
+    FIB_WALK_ADVANCE_MORE,
+    /**
+     * The walk merged with the one in front
+     */
+    FIB_WALK_ADVANCE_MERGE,
+} fib_walk_advance_rc_t;
+
+/**
+ * @brief Advance the walk one element in its work list
+ *
+ * The next sibling after the walk in the parent's dependency list is
+ * back-walked once for each context in the walk's context vector, then the
+ * walk is moved forward one place in that list.
+ *
+ * @param fwi index of the walk in the walk pool
+ * @return FIB_WALK_ADVANCE_MERGE if a visit reported a merge,
+ *         FIB_WALK_ADVANCE_MORE if there are more elements to visit,
+ *         FIB_WALK_ADVANCE_DONE otherwise.
+ */
+static fib_walk_advance_rc_t
+fib_walk_advance (fib_node_index_t fwi)
+{
+    fib_node_back_walk_ctx_t *ctx;
+    fib_node_back_walk_rc_t wrc;
+    fib_node_ptr_t sibling;
+    fib_walk_t *fwalk;
+    int more_elts;
+
+    /*
+     * this walk function is re-entrant - walks can spawn walks.
+     * fib_walk_t objects come from a pool, so they can realloc. we need
+     * to re-fetch from said pool at the appropriate times.
+     */
+    fwalk = fib_walk_get(fwi);
+
+    more_elts = fib_node_list_elt_get_next(fwalk->fw_dep_sibling, &sibling);
+
+    if (more_elts)
+    {
+        /*
+         * NOTE(review): ctx points into fwalk->fw_ctx as fetched before the
+         * visit; confirm the vector cannot be reallocated by the visit.
+         */
+        vec_foreach(ctx, fwalk->fw_ctx)
+        {
+            wrc = fib_node_back_walk_one(&sibling, ctx);
+
+            /* the visit may have grown the pool; re-fetch the walk */
+            fwalk = fib_walk_get(fwi);
+            fwalk->fw_n_visits++;
+
+            if (FIB_NODE_BACK_WALK_MERGE == wrc)
+            {
+                /*
+                 * this walk has merged with the one further along the node's
+                 * dependency list.
+                 */
+                return (FIB_WALK_ADVANCE_MERGE);
+            }
+        }
+        /*
+         * move forward to the next node to visit
+         */
+        more_elts = fib_node_list_advance(fwalk->fw_dep_sibling);
+    }
+
+    if (more_elts)
+    {
+        return (FIB_WALK_ADVANCE_MORE);
+    }
+
+    return (FIB_WALK_ADVANCE_DONE);
+}
+
+/**
+ * First guesses as to good values
+ *
+ * SHORT_SLEEP: the sleep time used when the work quota was exhausted with
+ *              walks still outstanding (and the initial sleep time).
+ * LONG_SLEEP:  the sleep time used when all the queues were drained.
+ * QUOTA:       the time the walk process may spend servicing the queues
+ *              in one pass.
+ * NOTE(review): the units are presumably seconds, as the quota is compared
+ * against differences of vlib_time_now() - confirm.
+ */
+#define SHORT_SLEEP 1e-8
+#define LONG_SLEEP  1e-3
+#define QUOTA       1e-4
+
+/**
+ * @brief Service the queues
+ * This is not declared static so that it can be unit tested - i know i know...
+ *
+ * The queues are serviced in priority order. The walk at the front of a
+ * queue is advanced until it has no more work or until the time quota is
+ * used up. A walk with no more work is destroyed and the next one is
+ * started; when the quota is used up servicing stops.
+ *
+ * @param vm    the vlib main, used to read the current time
+ * @param quota the time that may be spent servicing the queues
+ * @return the time to sleep before the next call: SHORT_SLEEP if the quota
+ *         ran out with a walk still in progress, LONG_SLEEP if all the
+ *         queues were emptied.
+ */
+f64
+fib_walk_process_queues (vlib_main_t * vm,
+                         const f64 quota)
+{
+    fib_walk_priority_t prio;
+    fib_walk_advance_rc_t rc;
+    fib_node_index_t fwi;
+    fib_walk_t *fwalk;
+
+    f64 sleep_time, start_time; // , vector_rate;
+
+    start_time = vlib_time_now(vm);
+
+    FOR_EACH_FIB_WALK_PRIORITY(prio)
+    {
+        while (0 != fib_walk_queue_get_size(prio))
+        {
+            fwi = fib_walk_queue_get_front(prio);
+
+            /*
+             * set this walk as executing
+             */
+            fwalk = fib_walk_get(fwi);
+            fwalk->fw_flags |= FIB_WALK_FLAG_EXECUTING;
+
+            /*
+             * the walk is always advanced at least once, even if the
+             * quota has already been used by the preceding walks.
+             */
+            do
+            {
+                rc = fib_walk_advance(fwi);
+            } while (((vlib_time_now(vm) - start_time) < quota) &&
+                     (FIB_WALK_ADVANCE_MORE == rc));
+
+            /*
+             * if this walk has no more work then pop it from the queue
+             * and move on to the next.
+             */
+            if (FIB_WALK_ADVANCE_MORE != rc)
+            {
+                /* re-fetch; advancing may have reallocated the pool */
+                fwalk = fib_walk_get(fwi);
+                fib_walk_destroy(fwalk);
+                fib_walk_queues.fwqs_queues[prio].fwq_stats[FIB_WALK_COMPLETED]++;
+            }
+            else
+            {
+                /*
+                 * passed our work quota. sleep time.
+                 */
+                fwalk = fib_walk_get(fwi);
+                fwalk->fw_flags &= ~FIB_WALK_FLAG_EXECUTING;
+                sleep_time = SHORT_SLEEP;
+                goto that_will_do_for_now;
+            }
+        }
+    }
+    /*
+     * got to the end of all the work
+     */
+    sleep_time = LONG_SLEEP;
+
+that_will_do_for_now:
+    return (sleep_time);
+}
+
+/**
+ * @brief The 'fib-walk' process's main loop.
+ *
+ * Loops forever: waits for an event or for the sleep time to expire,
+ * discards any pending events, then services the walk queues with a quota
+ * of QUOTA. The value returned from servicing the queues is the sleep time
+ * for the next iteration.
+ */
+static uword
+fib_walk_process (vlib_main_t * vm,
+                  vlib_node_runtime_t * node,
+                  vlib_frame_t * f)
+{
+    f64 sleep_time;
+
+    sleep_time = SHORT_SLEEP;
+
+    while (1)
+    {
+        vlib_process_wait_for_event_or_clock(vm, sleep_time);
+
+        /*
+         * there may be lots of events queued between the processes,
+         * but the walks we want to schedule are in the priority queues,
+         * so we ignore the process events.
+         */
+        vlib_process_get_events(vm, NULL);
+
+        sleep_time = fib_walk_process_queues(vm, QUOTA);
+    }
+
+    /*
+     * Unreached
+     */
+    ASSERT(!"WTF");
+    return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (fib_walk_process_node,static) = {
+    .function = fib_walk_process,
+    .type = VLIB_NODE_TYPE_PROCESS,
+    .name = "fib-walk",
+};
+/* *INDENT-ON* */
+
+/**
+ * @brief Allocate a new walk object
+ *
+ * The walk is taken from the walk pool and initialised with no siblings,
+ * a zero visit count and a context vector holding a copy of ctx.
+ *
+ * @param parent_type  the type of the node whose children will be walked
+ * @param parent_index the index of the node whose children will be walked
+ * @param flags        the initial flags of the walk
+ * @param ctx          the back-walk context; copied, not referenced
+ * @return the new walk object
+ */
+static fib_walk_t *
+fib_walk_alloc (fib_node_type_t parent_type,
+                fib_node_index_t parent_index,
+                fib_walk_flags_t flags,
+                fib_node_back_walk_ctx_t *ctx)
+{
+    fib_walk_t *fwalk;
+
+    pool_get(fib_walk_pool, fwalk);
+
+    fib_node_init(&fwalk->fw_node, FIB_NODE_TYPE_WALK);
+
+    fwalk->fw_flags = flags;
+    fwalk->fw_dep_sibling  = FIB_NODE_INDEX_INVALID;
+    fwalk->fw_prio_sibling = FIB_NODE_INDEX_INVALID;
+    fwalk->fw_parent.fnp_index = parent_index;
+    fwalk->fw_parent.fnp_type = parent_type;
+    fwalk->fw_ctx = NULL;
+    /*
+     * the pool element is not explicitly cleared above, so the visit
+     * count must be reset before the walk starts incrementing it.
+     */
+    fwalk->fw_n_visits = 0;
+
+    /*
+     * make a copy of the backwalk context so the depth count remains
+     * the same for each sibling visited. This is important in the case
+     * where a parent has a loop via one child, but all the others do not.
+     * If the looped child were visited first, the depth count would exceed
+     * the max and the walk would terminate before it reached the other
+     * siblings.
+     */
+    vec_add1(fwalk->fw_ctx, *ctx);
+
+    return (fwalk);
+}
+
+/**
+ * @brief Add a walk to the front of the queue for the given priority,
+ * count it as scheduled, then signal the background process that there is
+ * work to do.
+ *
+ * @return the walk's sibling index in the priority queue's list.
+ */
+static index_t
+fib_walk_prio_queue_enquue (fib_walk_priority_t prio,
+                            fib_walk_t *fwalk)
+{
+    fib_walk_queue_t *queue = &fib_walk_queues.fwqs_queues[prio];
+    index_t sibling;
+
+    sibling = fib_node_list_push_front(queue->fwq_queue,
+                                       0,
+                                       FIB_NODE_TYPE_WALK,
+                                       fib_walk_get_index(fwalk));
+    queue->fwq_stats[FIB_WALK_SCHEDULED]++;
+
+    /*
+     * wake the fib-walk process so it performs the async walk.
+     * no specific data is passed to it (hence the last two arguments);
+     * the process drains the queues itself.
+     */
+    vlib_process_signal_event(vlib_get_main(),
+                              fib_walk_process_node.index,
+                              FIB_WALK_EVENT,
+                              FIB_WALK_EVENT);
+
+    return sibling;
+}
+
+/**
+ * @brief Schedule an asynchronous back walk of the children of a FIB node.
+ *
+ * The context's depth count is incremented first. If it then exceeds
+ * FIB_NODE_GRAPH_MAX_DEPTH no walk is created. Otherwise a new walk is
+ * allocated, added as a child of the parent and enqueued on the queue for
+ * the given priority.
+ */
+void
+fib_walk_async (fib_node_type_t parent_type,
+                fib_node_index_t parent_index,
+                fib_walk_priority_t prio,
+                fib_node_back_walk_ctx_t *ctx)
+{
+    fib_walk_t *fwalk;
+    index_t fwi;
+
+    ++ctx->fnbw_depth;
+
+    if (ctx->fnbw_depth > FIB_NODE_GRAPH_MAX_DEPTH)
+    {
+        /*
+         * maximum depth reached, so there is a loop in the graph. give up.
+         */
+        return;
+    }
+
+    fwalk = fib_walk_alloc(parent_type,
+                           parent_index,
+                           FIB_WALK_FLAG_ASYNC,
+                           ctx);
+    fwi = fib_walk_get_index(fwalk);
+
+    fwalk->fw_dep_sibling = fib_node_child_add(parent_type,
+                                               parent_index,
+                                               FIB_NODE_TYPE_WALK,
+                                               fwi);
+
+    fwalk->fw_prio_sibling = fib_walk_prio_queue_enquue(prio, fwalk);
+}
+
+/**
+ * @brief Back walk all the children of a FIB node.
+ *
+ * note this is a synchronous depth first walk. Children visited may propagate
+ * the walk to their children. Other children node types may not propagate
+ * synchronously, but instead queue the walk for later async completion.
+ */
+void
+fib_walk_sync (fib_node_type_t parent_type,
+               fib_node_index_t parent_index,
+               fib_node_back_walk_ctx_t *ctx)
+{
+    fib_walk_advance_rc_t rc;
+    fib_node_index_t fwi;
+    fib_walk_t *fwalk;
+
+    if (FIB_NODE_GRAPH_MAX_DEPTH < ++ctx->fnbw_depth)
+    {
+        /*
+         * The walk has reached the maximum depth. there is a loop in the graph.
+         * bail.
+         */
+        return;
+    }
+
+    fwalk = fib_walk_alloc(parent_type,
+                           parent_index,
+                           FIB_WALK_FLAG_SYNC,
+                           ctx);
+
+    fwalk->fw_dep_sibling = fib_node_child_add(parent_type,
+                                               parent_index,
+                                               FIB_NODE_TYPE_WALK,
+                                               fib_walk_get_index(fwalk));
+    fwi = fib_walk_get_index(fwalk);
+
+    while (1)
+    {
+        /*
+         * set this walk as executing
+         */
+        fwalk->fw_flags |= FIB_WALK_FLAG_EXECUTING;
+
+        do
+        {
+            rc = fib_walk_advance(fwi);
+        } while (FIB_WALK_ADVANCE_MORE == rc);
+
+
+        /*
+         * this walk function is re-entrant - walks can spawn walks.
+         * fib_walk_t objects come from a pool, so they can realloc. we need 
+         * to re-fetch from said pool at the appropriate times.
+         */
+        fwalk = fib_walk_get(fwi);
+
+        if (FIB_WALK_ADVANCE_MERGE == rc)
+        {
+            /*
+             * this sync walk merged with an walk in front.
+             * by reqeusting a sync walk the client wanted all children walked,
+             * so we ditch the walk object in hand and continue with the one
+             * we merged into
+             */
+            fib_node_ptr_t merged_walk;
+
+            fib_node_list_elt_get_next(fwalk->fw_dep_sibling, &merged_walk);
+
+            ASSERT(FIB_NODE_INDEX_INVALID != merged_walk.fnp_index);
+            ASSERT(FIB_NODE_TYPE_WALK == merged_walk.fnp_type);
+
+            fib_walk_destroy(fwalk);
+
+            fwi = merged_walk.fnp_index;
+            fwalk = fib_walk_get(fwi);            
+
+            if (FIB_WALK_FLAG_EXECUTING & fwalk->fw_flags)
+            {
+                /*
+                 * we are executing a sync walk, and we have met with another
+                 * walk that is also executing. since only one walk executs at once
+                 * (there is no multi-threading) this implies we have met ourselves
+                 * and hence the is a loop in the graph.
+                 * This function is re-entrant, so the walk object we met is being
+                 * acted on in a stack frame below this one. We must therefore not
+                 * continue with it now, but let the stack unwind and along the
+                 * appropriate frame to read the depth count and bail.
+                 */
+                fwalk = NULL;
+                break;
+            }
+        }
+        else
+        {
+            /*
+             * the walk reached the end of the depdency list.
+             */
+            break;
+        }
+    }
+
+    if (NULL != fwalk)
+    {
+        fib_walk_destroy(fwalk);
+    }
+}
+
+/**
+ * @brief Return the graph node embedded in the walk object at the given index.
+ */
+static fib_node_t *
+fib_walk_get_node (fib_node_index_t index)
+{
+    return (&(fib_walk_get(index)->fw_node));
+}
+
+/**
+ * Walk objects are not parents, nor are they locked, so a last-lock-gone
+ * notification is never expected. The function asserts if it is invoked.
+ */
+static void
+fib_walk_last_lock_gone (fib_node_t *node)
+{
+    ASSERT(0);
+}
+
+/**
+ * @brief Recover the walk object in which the given graph node is embedded.
+ */
+static fib_walk_t*
+fib_walk_get_from_node (fib_node_t *node)
+{
+    char *base;
+
+    /* step back from the fw_node member to the start of the walk object */
+    base = ((char*)node) - STRUCT_OFFSET_OF(fib_walk_t, fw_node);
+
+    return ((fib_walk_t*)base);
+}
+
+/**
+ * @brief Another back walk has reached this walk.
+ * Merge them so there is only one left. It is the walk being visited that
+ * remains, so the visiting context is merged into, or appended to, its own.
+ *
+ * @param node the graph node embedded in the walk being visited
+ * @param ctx  the context of the walk that has caught up
+ * @return always FIB_NODE_BACK_WALK_MERGE
+ */
+static fib_node_back_walk_rc_t
+fib_walk_back_walk_notify (fib_node_t *node,
+                          fib_node_back_walk_ctx_t *ctx)
+{
+    fib_node_back_walk_ctx_t *queued;
+    fib_walk_t *fwalk;
+
+    fwalk = fib_walk_get_from_node(node);
+
+    /*
+     * look for a context already held by this walk with which the new one
+     * can be merged; contexts merge when the reason for the walk is the same.
+     */
+    vec_foreach(queued, fwalk->fw_ctx)
+    {
+        if (queued->fnbw_reason != ctx->fnbw_reason)
+        {
+            continue;
+        }
+
+        /*
+         * keep the larger of the two depth values. In the presence of a loop
+         * the same walk will merge with itself; taking the smaller depth
+         * would mean the walk never ends.
+         */
+        if (queued->fnbw_depth < ctx->fnbw_depth)
+        {
+            queued->fnbw_depth = ctx->fnbw_depth;
+        }
+        return (FIB_NODE_BACK_WALK_MERGE);
+    }
+
+    /*
+     * no merge was possible: the walk in front must perform a different
+     * action from the one that has caught up. The one in front was scheduled
+     * first, so the new context goes at the back of the list.
+     */
+    vec_add1(fwalk->fw_ctx, *ctx);
+
+    return (FIB_NODE_BACK_WALK_MERGE);
+}
+
+/**
+ * The FIB walk's graph node virtual function table.
+ * Registered against FIB_NODE_TYPE_WALK by fib_walk_module_init().
+ */
+static const fib_node_vft_t fib_walk_vft = {
+    .fnv_get = fib_walk_get_node,
+    .fnv_last_lock = fib_walk_last_lock_gone,
+    .fnv_back_walk = fib_walk_back_walk_notify,
+};
+
+/**
+ * @brief Initialise the walk module.
+ *
+ * Creates the node list that backs each priority queue and registers the
+ * walk node type together with its virtual function table.
+ */
+void
+fib_walk_module_init (void)
+{
+    fib_walk_priority_t prio;
+
+    /* one list per walk priority */
+    FOR_EACH_FIB_WALK_PRIORITY(prio)
+    {
+        fib_walk_queues.fwqs_queues[prio].fwq_queue = fib_node_list_create();
+    }
+
+    fib_node_register_type(FIB_NODE_TYPE_WALK, &fib_walk_vft);
+}
+
+/**
+ * @brief Format function for a walk object.
+ *
+ * Takes one fib_node_index_t argument, the index of the walk, and appends
+ * the walk's parent, visit count and flags to the string.
+ */
+static u8*
+format_fib_walk (u8* s, va_list ap)
+{
+    fib_node_index_t fwi;
+    fib_walk_t *fwalk;
+
+    fwi = va_arg(ap, fib_node_index_t);
+    fwalk = fib_walk_get(fwi);
+
+    s = format(s, "  parent:{%s:%d} visits:%d flags:%d",
+               fib_node_type_get_name(fwalk->fw_parent.fnp_type),
+               fwalk->fw_parent.fnp_index,
+               fwalk->fw_n_visits,
+               fwalk->fw_flags);
+
+    return (s);
+}
+
+/**
+ * @brief CLI handler for "show fib walk".
+ *
+ * For each walk priority, prints the queue's statistics, its occupancy and
+ * then each walk on the queue, from front to back.
+ */
+static clib_error_t *
+fib_walk_show (vlib_main_t * vm,
+               unformat_input_t * input,
+               vlib_cli_command_t * cmd)
+{
+    fib_walk_queue_stats_t wqs;
+    fib_walk_priority_t prio;
+    fib_node_ptr_t sibling;
+    fib_node_index_t fwi;
+    fib_walk_t *fwalk;
+    int more_elts;
+
+    vlib_cli_output(vm, "FIB Walk queues:");
+
+    FOR_EACH_FIB_WALK_PRIORITY(prio)
+    {
+        vlib_cli_output(vm, " %U priority queue:",
+                        format_fib_walk_priority, prio);
+        vlib_cli_output(vm, "  Stats: ");
+
+        FOR_EACH_FIB_WALK_QUEUE_STATS(wqs)
+        {
+            vlib_cli_output(vm, "    %U:%d",
+                            format_fib_walk_queue_stats, wqs,
+                            fib_walk_queues.fwqs_queues[prio].fwq_stats[wqs]);
+        }
+        vlib_cli_output(vm, "  Occupancy:%d",
+                        fib_node_list_get_size(
+                            fib_walk_queues.fwqs_queues[prio].fwq_queue));
+
+        /*
+         * start at the front of the queue and follow each walk's priority
+         * sibling to the next, until the list reports no more elements.
+         */
+        for (more_elts = fib_node_list_get_front(
+                             fib_walk_queues.fwqs_queues[prio].fwq_queue,
+                             &sibling);
+             more_elts;
+             more_elts = fib_node_list_elt_get_next(fwalk->fw_prio_sibling,
+                                                    &sibling))
+        {
+            ASSERT(FIB_NODE_INDEX_INVALID != sibling.fnp_index);
+            ASSERT(FIB_NODE_TYPE_WALK == sibling.fnp_type);
+
+            fwi = sibling.fnp_index;
+            fwalk = fib_walk_get(fwi);
+
+            vlib_cli_output(vm, "  %U", format_fib_walk, fwi);
+        }
+    }
+
+    return (NULL);
+}
+
+/*
+ * CLI registration: "show fib walk" is handled by fib_walk_show().
+ */
+VLIB_CLI_COMMAND (fib_walk_show_command, static) = {
+    .path = "show fib walk",
+    .short_help = "show fib walk",
+    .function = fib_walk_show,
+};
diff --git a/vnet/vnet/fib/fib_walk.h b/vnet/vnet/fib/fib_walk.h
new file mode 100644 (file)
index 0000000..7ae99d0
--- /dev/null
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FIB_WALK_H__
+#define __FIB_WALK_H__
+
+#include <vnet/fib/fib_node.h>
+
+/**
+ * @brief Walk priorities.
+ * Strict priorities. All walks at priority n are completed before n+1 is started.
+ * Increasing numerical value implies decreasing priority.
+ */
+typedef enum fib_walk_priority_t_
+{
+    /* the lowest numerical value, hence the highest priority */
+    FIB_WALK_PRIORITY_HIGH = 0,
+    /* the last priority; FIB_WALK_PRIORITY_NUM is derived from it */
+    FIB_WALK_PRIORITY_LOW  = 1,
+} fib_walk_priority_t;
+
+/** The number of walk priorities */
+#define FIB_WALK_PRIORITY_NUM (FIB_WALK_PRIORITY_LOW+1)
+
+/** Initialiser for an array of priority names, indexed by priority */
+#define FIB_WALK_PRIORITIES {           \
+    [FIB_WALK_PRIORITY_HIGH] = "high",  \
+    [FIB_WALK_PRIORITY_LOW]  = "low",   \
+}
+
+/** Iterate _prio over every walk priority, starting at the highest */
+#define FOR_EACH_FIB_WALK_PRIORITY(_prio)         \
+    for ((_prio) = FIB_WALK_PRIORITY_HIGH;        \
+         (_prio) < FIB_WALK_PRIORITY_NUM;         \
+         (_prio)++)
+
+extern void fib_walk_module_init(void);
+
+extern void fib_walk_async(fib_node_type_t parent_type,
+                           fib_node_index_t parent_index,
+                           fib_walk_priority_t prio,
+                           fib_node_back_walk_ctx_t *ctx);
+
+extern void fib_walk_sync(fib_node_type_t parent_type,
+                          fib_node_index_t parent_index,
+                          fib_node_back_walk_ctx_t *ctx);
+
+extern u8* format_fib_walk_priority(u8 *s, va_list ap);
+
+#endif
+
diff --git a/vnet/vnet/fib/ip4_fib.c b/vnet/vnet/fib/ip4_fib.c
new file mode 100644 (file)
index 0000000..21ebb7a
--- /dev/null
@@ -0,0 +1,542 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/fib_entry.h>
+#include <vnet/fib/ip4_fib.h>
+
+/*
+ * A table of prefixes to be added to tables and the sources for them
+ */
+typedef struct ip4_fib_table_special_prefix_t_ {
+    /* the prefix to add; the address is held in host byte order */
+    fib_prefix_t ift_prefix;
+    /* the source given when the prefix is added and removed */
+    fib_source_t ift_source;
+    /* the entry flag given when the prefix is added */
+    fib_entry_flag_t ift_flag;
+} ip4_fib_table_special_prefix_t;
+
+/*
+ * The special prefixes installed in every IPv4 table when it is created and
+ * removed again when it is destroyed. Addresses are written in host byte
+ * order; the users of this table convert them to network order.
+ *
+ * The class E and multicast ranges are both /4 blocks (240.0.0.0/4 and
+ * 224.0.0.0/4). A /8 would cover only 240.x.x.x and 224.x.x.x and leave the
+ * rest of each range to whatever less specific route is present.
+ */
+static const ip4_fib_table_special_prefix_t ip4_specials[] = {
+    {
+       /* 0.0.0.0/0*/
+       .ift_prefix = {
+           .fp_addr = {
+               .ip4.data_u32 = 0,
+           },
+           .fp_len  = 0,
+           .fp_proto = FIB_PROTOCOL_IP4,
+       },
+       .ift_source = FIB_SOURCE_DEFAULT_ROUTE,
+       .ift_flag   = FIB_ENTRY_FLAG_DROP,
+    },
+    {
+       /* 0.0.0.0/32*/
+       .ift_prefix = {
+           .fp_addr = {
+               .ip4.data_u32 = 0,
+           },
+           .fp_len  = 32,
+           .fp_proto = FIB_PROTOCOL_IP4,
+       },
+       .ift_source = FIB_SOURCE_DEFAULT_ROUTE,
+       .ift_flag   = FIB_ENTRY_FLAG_DROP,
+    },
+    {
+       /*
+        * 240.0.0.0/4
+        * drop class E
+        */
+       .ift_prefix = {
+           .fp_addr = {
+               .ip4.data_u32 = 0xf0000000,
+           },
+           .fp_len   = 4,
+           .fp_proto = FIB_PROTOCOL_IP4,
+       },
+       .ift_source = FIB_SOURCE_SPECIAL,
+       .ift_flag   = FIB_ENTRY_FLAG_DROP,
+
+    },
+    {
+       /*
+        * 224.0.0.0/4
+        * drop all mcast
+        */
+       .ift_prefix = {
+           .fp_addr = {
+               .ip4.data_u32 = 0xe0000000,
+           },
+           .fp_len   = 4,
+           .fp_proto = FIB_PROTOCOL_IP4,
+       },
+       .ift_source = FIB_SOURCE_SPECIAL,
+       .ift_flag    = FIB_ENTRY_FLAG_DROP,
+    },
+    {
+       /*
+        * 255.255.255.255/32
+        * drop, but we'll allow it to be usurped by the likes of DHCP
+        */
+       .ift_prefix = {
+           .fp_addr = {
+               .ip4.data_u32 = 0xffffffff,
+           },
+           .fp_len   = 32,
+           .fp_proto = FIB_PROTOCOL_IP4,
+       },
+       .ift_source = FIB_SOURCE_DEFAULT_ROUTE,
+       .ift_flag   = FIB_ENTRY_FLAG_DROP,
+    }
+};
+
+
+/*
+ * ip4_create_fib_with_table_id
+ *
+ * Allocate and initialise a new IPv4 table, record it against the table ID,
+ * take a lock on it and install the special prefixes.
+ * Returns the index of the new table.
+ */
+static u32
+ip4_create_fib_with_table_id (u32 table_id)
+{
+    const ip4_fib_table_special_prefix_t *special;
+    fib_table_t *fib_table;
+    fib_prefix_t prefix;
+    int ii;
+
+    pool_get_aligned(ip4_main.fibs, fib_table, CLIB_CACHE_LINE_BYTES);
+    memset(fib_table, 0, sizeof(*fib_table));
+
+    fib_table->ft_proto = FIB_PROTOCOL_IP4;
+
+    /* the table's index is its position in the pool */
+    fib_table->v4.index = (fib_table - ip4_main.fibs);
+    fib_table->ft_index = fib_table->v4.index;
+
+    /*
+     * NOTE(review): the ID is entered in the hash even when it is ~0,
+     * whereas ip4_fib_table_destroy only removes IDs that are not ~0.
+     * Confirm the asymmetry is intended.
+     */
+    hash_set (ip4_main.fib_index_by_table_id, table_id, fib_table->ft_index);
+
+    fib_table->v4.table_id = table_id;
+    fib_table->ft_table_id = fib_table->v4.table_id;
+
+    fib_table->v4.flow_hash_config = IP_FLOW_HASH_DEFAULT;
+    fib_table->ft_flow_hash_config = fib_table->v4.flow_hash_config;
+
+    fib_table->v4.fwd_classify_table_index = ~0;
+    fib_table->v4.rev_classify_table_index = ~0;
+
+    fib_table_lock(fib_table->ft_index, FIB_PROTOCOL_IP4);
+
+    ip4_mtrie_init(&fib_table->v4.mtrie);
+
+    /*
+     * add the special entries into the new FIB
+     */
+    for (ii = 0; ii < ARRAY_LEN(ip4_specials); ii++)
+    {
+        special = &ip4_specials[ii];
+
+        /* the table holds addresses in host order */
+        prefix = special->ift_prefix;
+        prefix.fp_addr.ip4.data_u32 =
+            clib_host_to_net_u32(prefix.fp_addr.ip4.data_u32);
+
+        fib_table_entry_special_add(fib_table->ft_index,
+                                    &prefix,
+                                    special->ift_source,
+                                    special->ift_flag,
+                                    ADJ_INDEX_INVALID);
+    }
+
+    return (fib_table->ft_index);
+}
+
+/*
+ * ip4_fib_table_destroy
+ *
+ * Remove the special prefixes from the table, check that no routes remain,
+ * remove the table ID mapping (unless the ID is ~0) and return the table to
+ * the pool.
+ */
+void
+ip4_fib_table_destroy (ip4_fib_t *fib)
+{
+    const ip4_fib_table_special_prefix_t *special;
+    fib_table_t *fib_table;
+    fib_prefix_t prefix;
+    int ii;
+
+    fib_table = (fib_table_t*)fib;
+
+    /*
+     * remove all the specials we added when the table was created.
+     */
+    for (ii = 0; ii < ARRAY_LEN(ip4_specials); ii++)
+    {
+        special = &ip4_specials[ii];
+
+        /* the table holds addresses in host order */
+        prefix = special->ift_prefix;
+        prefix.fp_addr.ip4.data_u32 =
+            clib_host_to_net_u32(prefix.fp_addr.ip4.data_u32);
+
+        fib_table_entry_special_remove(fib_table->ft_index,
+                                       &prefix,
+                                       special->ift_source);
+    }
+
+    /*
+     * validate no more routes.
+     */
+    ASSERT(0 == fib_table->ft_total_route_counts);
+    FOR_EACH_FIB_SOURCE(ii)
+    {
+        ASSERT(0 == fib_table->ft_src_route_counts[ii]);
+    }
+
+    if (~0 != fib_table->ft_table_id)
+    {
+        hash_unset (ip4_main.fib_index_by_table_id, fib_table->ft_table_id);
+    }
+
+    pool_put(ip4_main.fibs, fib_table);
+}
+
+
+/*
+ * ip4_fib_table_find_or_create_and_lock
+ *
+ * Return the index of the table with the given ID, creating the table if it
+ * does not exist. In both cases a lock is taken on the table.
+ */
+u32
+ip4_fib_table_find_or_create_and_lock (u32 table_id)
+{
+    u32 index = ip4_fib_index_from_table_id(table_id);
+
+    if (~0 != index)
+    {
+        /* the table exists already; just take another lock */
+        fib_table_lock(index, FIB_PROTOCOL_IP4);
+
+        return (index);
+    }
+
+    /* creation takes the lock */
+    return ip4_create_fib_with_table_id(table_id);
+}
+
+/*
+ * ip4_fib_table_create_and_lock
+ *
+ * Create a new table with a table ID of ~0 and return its index.
+ * The lock is taken by ip4_create_fib_with_table_id.
+ */
+u32
+ip4_fib_table_create_and_lock (void)
+{
+    return (ip4_create_fib_with_table_id(~0));
+}
+
+/*
+ * ip4_fib_table_get_index_for_sw_if_index
+ *
+ * Return the index of the table the interface is mapped to, or ~0 when the
+ * interface is beyond the end of the mapping vector.
+ */
+u32
+ip4_fib_table_get_index_for_sw_if_index (u32 sw_if_index)
+{
+    if (sw_if_index < vec_len(ip4_main.fib_index_by_sw_if_index))
+    {
+        return (ip4_main.fib_index_by_sw_if_index[sw_if_index]);
+    }
+
+    /*
+     * This is the case for interfaces that are not yet mapped to
+     * an IP table
+     */
+    return (~0);
+}
+
+/*
+ * ip4_fib_table_get_flow_hash_config
+ *
+ * Return the flow hash configuration of the table at the given index.
+ */
+flow_hash_config_t
+ip4_fib_table_get_flow_hash_config (u32 fib_index)
+{
+    ip4_fib_t *fib;
+
+    fib = ip4_fib_get(fib_index);
+
+    return (fib->flow_hash_config);
+}
+
+/*
+ * ip4_fib_table_lookup_exact_match
+ *
+ * Exact match prefix lookup: only the hash table for the given mask length
+ * is consulted. Returns the index of the entry found, or
+ * FIB_NODE_INDEX_INVALID if there is none.
+ */
+fib_node_index_t
+ip4_fib_table_lookup_exact_match (const ip4_fib_t *fib,
+                                 const ip4_address_t *addr,
+                                 u32 len)
+{
+    uword *entry;
+    u32 key;
+
+    /* the key is the address with its host bits masked off */
+    key = (addr->data_u32 & ip4_main.fib_masks[len]);
+
+    entry = hash_get(fib->fib_entry_by_dst_address[len], key);
+
+    if (NULL == entry)
+    {
+        return (FIB_NODE_INDEX_INVALID);
+    }
+    return (entry[0]);
+}
+
+/*
+ * ip4_fib_table_lookup_lb
+ *
+ * Longest prefix match for the address. Returns the index of the DPO the
+ * matching entry contributes for IP forwarding, or INDEX_INVALID if no
+ * entry matches.
+ */
+index_t
+ip4_fib_table_lookup_lb (ip4_fib_t *fib,
+                        const ip4_address_t *addr)
+{
+    fib_node_index_t fei;
+
+    fei = ip4_fib_table_lookup(fib, addr, 32);
+
+    if (FIB_NODE_INDEX_INVALID == fei)
+    {
+        return (INDEX_INVALID);
+    }
+
+    return (fib_entry_contribute_ip_forwarding(fei)->dpoi_index);
+}
+
+/*
+ * ip4_fib_table_lookup
+ *
+ * Longest prefix match: the hash tables are searched from the given mask
+ * length down to zero and the first entry found is returned.
+ * Returns FIB_NODE_INDEX_INVALID if nothing matches.
+ */
+fib_node_index_t
+ip4_fib_table_lookup (const ip4_fib_t *fib,
+                     const ip4_address_t *addr,
+                     u32 len)
+{
+    uword *entry;
+    i32 mask_len;
+    u32 key;
+
+    mask_len = len;
+
+    while (mask_len >= 0)
+    {
+        /* mask the address to the length being tried */
+        key = (addr->data_u32 & ip4_main.fib_masks[mask_len]);
+
+        entry = hash_get (fib->fib_entry_by_dst_address[mask_len], key);
+
+        if (NULL != entry)
+        {
+            return (entry[0]);
+        }
+
+        mask_len--;
+    }
+
+    return (FIB_NODE_INDEX_INVALID);
+}
+
+/*
+ * ip4_fib_table_entry_insert
+ *
+ * Record the entry index against the prefix in the hash table for the
+ * prefix's mask length; the hash table is created on first use.
+ * Inserting a prefix that is already present asserts and changes nothing.
+ */
+void
+ip4_fib_table_entry_insert (ip4_fib_t *fib,
+                           const ip4_address_t *addr,
+                           u32 len,
+                           fib_node_index_t fib_entry_index)
+{
+    uword *hash;
+    u32 key;
+
+    key = (addr->data_u32 & ip4_main.fib_masks[len]);
+    hash = fib->fib_entry_by_dst_address[len];
+
+    if (NULL != hash_get (hash, key))
+    {
+        /* the prefix is already in the table */
+        ASSERT(0);
+        return;
+    }
+
+    /*
+     * adding a new entry
+     */
+    if (NULL == hash)
+    {
+        hash = hash_create (32 /* elts */, sizeof (uword));
+        hash_set_flags (hash, HASH_FLAG_NO_AUTO_SHRINK);
+    }
+
+    /* store the result back, as is done for the freshly created table */
+    hash = hash_set(hash, key, fib_entry_index);
+    fib->fib_entry_by_dst_address[len] = hash;
+}
+
+/*
+ * ip4_fib_table_entry_remove
+ *
+ * Remove the prefix from the hash table for its mask length.
+ * Removing a prefix that is not present is allowed and does nothing.
+ */
+void
+ip4_fib_table_entry_remove (ip4_fib_t *fib,
+                           const ip4_address_t *addr,
+                           u32 len)
+{
+    uword *hash;
+    u32 key;
+
+    key = (addr->data_u32 & ip4_main.fib_masks[len]);
+    hash = fib->fib_entry_by_dst_address[len];
+
+    if (NULL != hash_get (hash, key))
+    {
+        hash_unset(hash, key);
+    }
+
+    fib->fib_entry_by_dst_address[len] = hash;
+}
+
+/*
+ * ip4_fib_table_fwding_dpo_update
+ *
+ * Add the prefix to the table's mtrie, with the index of the given DPO
+ * as the value.
+ */
+void
+ip4_fib_table_fwding_dpo_update (ip4_fib_t *fib,
+                                const ip4_address_t *addr,
+                                u32 len,
+                                const dpo_id_t *dpo)
+{
+    ip4_fib_mtrie_add_del_route(fib, *addr, len, dpo->dpoi_index, 0); // ADD
+}
+
+/*
+ * ip4_fib_table_fwding_dpo_remove
+ *
+ * Delete the prefix from the table's mtrie; the index of the given DPO
+ * is passed along with the request.
+ */
+void
+ip4_fib_table_fwding_dpo_remove (ip4_fib_t *fib,
+                                const ip4_address_t *addr,
+                                u32 len,
+                                const dpo_id_t *dpo)
+{
+    ip4_fib_mtrie_add_del_route(fib, *addr, len, dpo->dpoi_index, 1); // DELETE
+}
+
+/*
+ * ip4_fib_table_show_all
+ *
+ * Collect the entry indices from the hash table of every mask length, sort
+ * them and print each entry in brief format.
+ */
+static void
+ip4_fib_table_show_all (ip4_fib_t *fib,
+                       vlib_main_t * vm)
+{
+    fib_node_index_t *indices;
+    fib_node_index_t *fei;
+    hash_pair_t *pair;
+    uword *hash;
+    int len;
+
+    indices = NULL;
+
+    for (len = 0; len < ARRAY_LEN (fib->fib_entry_by_dst_address); len++)
+    {
+        hash = fib->fib_entry_by_dst_address[len];
+
+        if (NULL == hash)
+        {
+            /* no prefixes of this length */
+            continue;
+        }
+
+        hash_foreach_pair (pair, hash,
+        ({
+            vec_add1(indices, pair->value[0]);
+        }));
+    }
+
+    vec_sort_with_function(indices, fib_entry_cmp_for_sort);
+
+    vec_foreach(fei, indices)
+    {
+        vlib_cli_output(vm, "%U",
+                        format_fib_entry,
+                        *fei,
+                        FIB_ENTRY_FORMAT_BRIEF);
+    }
+
+    vec_free(indices);
+}
+
+/*
+ * ip4_fib_table_show_one
+ *
+ * Print, in detail format, the entry that is the longest prefix match for
+ * the address, searching from the given mask length downwards.
+ */
+static void
+ip4_fib_table_show_one (ip4_fib_t *fib,
+                       vlib_main_t * vm,
+                       ip4_address_t *address,
+                       u32 mask_len)
+{
+    fib_node_index_t fei;
+
+    fei = ip4_fib_table_lookup(fib, address, mask_len);
+
+    vlib_cli_output(vm, "%U",
+                    format_fib_entry,
+                    fei,
+                    FIB_ENTRY_FORMAT_DETAIL);
+}
+
+/*
+ * ip4_show_fib
+ *
+ * CLI handler for "show ip fib". Recognised options:
+ *   brief | summary | sum  - per-table count of prefixes by mask length
+ *   mtrie                  - also print the table's mtrie
+ *   <addr>[/<len>]         - show only the entry matching the address
+ *   table <n>              - restrict the output to the table with this ID
+ *   index <n>              - restrict the output to the table at this index
+ *
+ * Returns an error if the mask length given is greater than 32, otherwise 0.
+ */
+static clib_error_t *
+ip4_show_fib (vlib_main_t * vm,
+             unformat_input_t * input,
+             vlib_cli_command_t * cmd)
+{
+    ip4_main_t * im4 = &ip4_main;
+    fib_table_t * fib_table;
+    int verbose, matching, mtrie;
+    ip4_address_t matching_address;
+    u32 matching_mask = 32;
+    int i, table_id = -1, fib_index = ~0;
+
+    verbose = 1;
+    matching = 0;
+    mtrie = 0;
+    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+        if (unformat (input, "brief") || unformat (input, "summary")
+            || unformat (input, "sum"))
+            verbose = 0;
+
+        else if (unformat (input, "mtrie"))
+            mtrie = 1;
+
+        else if (unformat (input, "%U/%d",
+                           unformat_ip4_address, &matching_address, &matching_mask))
+            matching = 1;
+
+        else if (unformat (input, "%U", unformat_ip4_address, &matching_address))
+            matching = 1;
+
+        else if (unformat (input, "table %d", &table_id))
+            ;
+        else if (unformat (input, "index %d", &fib_index))
+            ;
+        else
+            break;
+    }
+
+    /*
+     * The mask length indexes the per-length hash tables and the mask array
+     * during the lookup, so a value from the user must be checked first.
+     * A negative input also lands here, as the value is held unsigned.
+     */
+    if (matching && matching_mask > 32)
+    {
+        return (clib_error_return (0, "invalid mask length %d, must be 0-32",
+                                   matching_mask));
+    }
+
+    pool_foreach (fib_table, im4->fibs,
+    ({
+        ip4_fib_t *fib = &fib_table->v4;
+
+        /* skip the tables the user did not ask for */
+        if (table_id >= 0 && table_id != (int)fib->table_id)
+            continue;
+        if (fib_index != ~0 && fib_index != (int)fib->index)
+            continue;
+
+        vlib_cli_output (vm, "%U, fib_index %d, flow hash: %U",
+                         format_fib_table_name, fib->index, FIB_PROTOCOL_IP4,
+                         fib->index,
+                         format_ip_flow_hash_config, fib->flow_hash_config);
+
+        /* Show summary? */
+        if (! verbose)
+        {
+            vlib_cli_output (vm, "%=20s%=16s", "Prefix length", "Count");
+            for (i = 0; i < ARRAY_LEN (fib->fib_entry_by_dst_address); i++)
+            {
+                uword * hash = fib->fib_entry_by_dst_address[i];
+                uword n_elts = hash_elts (hash);
+                if (n_elts > 0)
+                    vlib_cli_output (vm, "%20d%16d", i, n_elts);
+            }
+            continue;
+        }
+
+        if (!matching)
+        {
+            ip4_fib_table_show_all(fib, vm);
+        }
+        else
+        {
+            ip4_fib_table_show_one(fib, vm, &matching_address, matching_mask);
+        }
+
+        if (mtrie)
+            vlib_cli_output (vm, "%U", format_ip4_fib_mtrie, &fib->mtrie);
+    }));
+
+    return 0;
+}
+
+/*?
+ * Show FIB/route entries
+ *
+ * The options are those parsed by ip4_show_fib: 'mtrie', 'summary' (also
+ * 'brief' or 'sum'), 'table <n>', 'index <n>' and an address with an
+ * optional mask length.
+ *
+ * @cliexpar
+ * @cliexstart{show ip fib}
+ * Display the IPv4 FIB.
+ * This command will run for a long time when the FIBs comprise millions of entries.
+ *   vpp# sh ip fib
+ *   Table 0
+ *   Destination         Packets          Bytes         Adjacency
+ *   6.0.0.0/8                          0               0 weight 1, index 3
+ *                                                       arp fake-eth0 6.0.0.1/8
+ *   6.0.0.1/32                         0               0 weight 1, index 4
+ *                                                        local 6.0.0.1/8
+ *
+ *  And so forth. Use 'show ip fib summary' for a summary:
+ *
+ *   vpp# sh ip fib summary
+ *   Table 0
+ *   Prefix length         Count
+ *         8               1
+ *        32               4
+ * @cliexend
+ ?*/
+VLIB_CLI_COMMAND (ip4_show_fib_command, static) = {
+    .path = "show ip fib",
+    .short_help = "show ip fib [mtrie] [summary] [table <n>] [index <n>] [<ip4-addr>[/<len>]]",
+    .function = ip4_show_fib,
+};
diff --git a/vnet/vnet/fib/ip4_fib.h b/vnet/vnet/fib/ip4_fib.h
new file mode 100644 (file)
index 0000000..cf312cd
--- /dev/null
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief The IPv4 FIB
+ *
+ * FIBs are composed of two prefix data-bases (a.k.a. tables). The non-forwarding
+ * table contains all the routes that the control plane has programmed, the
+ * forwarding table contains the sub-set of those routes that can be used to
+ * forward packets.
+ * In the IPv4 FIB the non-forwarding table is an array of hash tables indexed
+ * by mask length, the forwarding table is an mtrie
+ *
+ * This IPv4 FIB is used by the protocol independent FIB. So directly using
+ * these APIs in client code is not encouraged. However, this IPv4 FIB can be
+ * used if all the client wants is an IPv4 prefix data-base
+ */
+
+#ifndef __IP4_FIB_H__
+#define __IP4_FIB_H__
+
+#include <vlib/vlib.h>
+#include <vnet/ip/ip.h>
+#include <vnet/fib/fib_entry.h>
+#include <vnet/fib/fib_table.h>
+
+extern fib_node_index_t ip4_fib_table_lookup(const ip4_fib_t *fib,
+                                            const ip4_address_t *addr,
+                                            u32 len);
+extern fib_node_index_t ip4_fib_table_lookup_exact_match(const ip4_fib_t *fib,
+                                                        const ip4_address_t *addr,
+                                                        u32 len);
+
+extern void ip4_fib_table_entry_remove(ip4_fib_t *fib,
+                                      const ip4_address_t *addr,
+                                      u32 len);
+
+extern void ip4_fib_table_entry_insert(ip4_fib_t *fib,
+                                      const ip4_address_t *addr,
+                                      u32 len,
+                                      fib_node_index_t fib_entry_index);
+extern void ip4_fib_table_destroy(ip4_fib_t *fib);
+
+extern void ip4_fib_table_fwding_dpo_update(ip4_fib_t *fib,
+                                           const ip4_address_t *addr,
+                                           u32 len,
+                                           const dpo_id_t *dpo);
+
+extern void ip4_fib_table_fwding_dpo_remove(ip4_fib_t *fib,
+                                           const ip4_address_t *addr,
+                                           u32 len,
+                                           const dpo_id_t *dpo);
+extern u32 ip4_fib_table_lookup_lb (ip4_fib_t *fib,
+                                   const ip4_address_t * dst);
+
+/**
+ * @brief Get the IPv4 FIB held in the table at the given pool index
+ */
+static inline ip4_fib_t *
+ip4_fib_get (u32 index)
+{
+    fib_table_t *fib_table;
+
+    fib_table = pool_elt_at_index(ip4_main.fibs, index);
+
+    return (&(fib_table->v4));
+}
+
+/**
+ * @brief Longest prefix match for an address in the FIB that the
+ * interface is mapped to; the result is that of ip4_fib_table_lookup_lb.
+ */
+always_inline u32
+ip4_fib_lookup (ip4_main_t * im, u32 sw_if_index, ip4_address_t * dst)
+{
+    u32 fib_index;
+
+    fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index);
+
+    return (ip4_fib_table_lookup_lb(ip4_fib_get(fib_index), dst));
+}
+
+/**
+ * @brief Get or create an IPv4 fib.
+ *
+ * Get or create an IPv4 fib with the provided table ID. A lock is taken on
+ * the fib in both cases.
+ *
+ * @param table_id
+ *      The table ID used to retrieve or create the desired fib.
+ *      To create a fib without choosing a table ID, use
+ *      ip4_fib_table_create_and_lock(), which creates the fib with a
+ *      table ID of \c ~0.
+ * @returns The index of the retrieved or created fib.
+ *
+ */
+extern u32 ip4_fib_table_find_or_create_and_lock(u32 table_id);
+extern u32 ip4_fib_table_create_and_lock(void);
+
+
+/**
+ * @brief Map a table ID to the index of its FIB.
+ * Returns ~0 if no FIB is recorded against the table ID.
+ */
+static inline 
+u32 ip4_fib_index_from_table_id (u32 table_id)
+{
+  uword * entry;
+
+  entry = hash_get (ip4_main.fib_index_by_table_id, table_id);
+
+  if (entry)
+    return entry[0];
+
+  return ~0;
+}
+
+extern u32 ip4_fib_table_get_index_for_sw_if_index(u32 sw_if_index);
+
+extern flow_hash_config_t ip4_fib_table_get_flow_hash_config(u32 fib_index);
+
+
+/**
+ * @brief Look up an address in the mtrie of the FIB at the given index.
+ *
+ * The mtrie is stepped four times, with step indices 0 to 3. If the result
+ * is the empty leaf, the mtrie's default leaf is used instead.
+ * Returns the adjacency index held in the resulting leaf.
+ */
+always_inline index_t
+ip4_fib_forwarding_lookup (u32 fib_index,
+                           const ip4_address_t * addr)
+{
+    ip4_fib_mtrie_t * mtrie = &ip4_fib_get(fib_index)->mtrie;
+    ip4_fib_mtrie_leaf_t leaf = IP4_FIB_MTRIE_LEAF_ROOT;
+
+    leaf = ip4_fib_mtrie_lookup_step (mtrie, leaf, addr, 0);
+    leaf = ip4_fib_mtrie_lookup_step (mtrie, leaf, addr, 1);
+    leaf = ip4_fib_mtrie_lookup_step (mtrie, leaf, addr, 2);
+    leaf = ip4_fib_mtrie_lookup_step (mtrie, leaf, addr, 3);
+
+    /* Handle default route. */
+    if (IP4_FIB_MTRIE_LEAF_EMPTY == leaf)
+    {
+        leaf = mtrie->default_leaf;
+    }
+
+    return (ip4_fib_mtrie_leaf_get_adj_index(leaf));
+}
+
+
+#endif
+
diff --git a/vnet/vnet/fib/ip6_fib.c b/vnet/vnet/fib/ip6_fib.c
new file mode 100644 (file)
index 0000000..772ce74
--- /dev/null
@@ -0,0 +1,698 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/fib/ip6_fib.h>
+#include <vnet/fib/fib_table.h>
+
+/**
+ * Add one FIB_SOURCE_SPECIAL entry flagged FIB_ENTRY_FLAG_LOCAL for the
+ * given prefix.
+ */
+static void
+ip6_fib_special_local_add (u32 fib_index, fib_prefix_t *pfx)
+{
+    fib_table_entry_special_add(fib_index,
+                                pfx,
+                                FIB_SOURCE_SPECIAL,
+                                FIB_ENTRY_FLAG_LOCAL,
+                                ADJ_INDEX_INVALID);
+}
+
+/**
+ * Populate a newly created IPv6 FIB with the entries every table carries:
+ * a drop default route, followed by local entries for the solicited-node
+ * multicast range, the link-local all-routers, all-hosts and mldv2-routers
+ * groups, and the fe80::/10 link-local range.
+ *
+ * ip6_fib_table_destroy() removes the same set of prefixes.
+ */
+static void
+vnet_ip6_fib_init (u32 fib_index)
+{
+    /* ::/0 - the members not named here are zero initialised */
+    fib_prefix_t pfx = {
+        .fp_proto = FIB_PROTOCOL_IP6,
+        .fp_len = 0,
+    };
+
+    /* The default route drops. */
+    fib_table_entry_special_add(fib_index,
+                                &pfx,
+                                FIB_SOURCE_DEFAULT_ROUTE,
+                                FIB_ENTRY_FLAG_DROP,
+                                ADJ_INDEX_INVALID);
+
+    /*
+     * ff02::1:ff00:0/104 via local route for all tables.
+     *  This is required for neighbor discovery to work.
+     */
+    ip6_set_solicited_node_multicast_address(&pfx.fp_addr.ip6, 0);
+    pfx.fp_len = 104;
+    ip6_fib_special_local_add(fib_index, &pfx);
+
+    /* The three reserved link-local groups are all host (/128) routes. */
+    pfx.fp_len = 128;
+
+    /* all-routers */
+    ip6_set_reserved_multicast_address (&pfx.fp_addr.ip6,
+                                        IP6_MULTICAST_SCOPE_link_local,
+                                        IP6_MULTICAST_GROUP_ID_all_routers);
+    ip6_fib_special_local_add(fib_index, &pfx);
+
+    /* all-nodes */
+    ip6_set_reserved_multicast_address (&pfx.fp_addr.ip6,
+                                        IP6_MULTICAST_SCOPE_link_local,
+                                        IP6_MULTICAST_GROUP_ID_all_hosts);
+    ip6_fib_special_local_add(fib_index, &pfx);
+
+    /* all-mldv2 */
+    ip6_set_reserved_multicast_address (&pfx.fp_addr.ip6,
+                                        IP6_MULTICAST_SCOPE_link_local,
+                                        IP6_MULTICAST_GROUP_ID_mldv2_routers);
+    ip6_fib_special_local_add(fib_index, &pfx);
+
+    /* fe80::/10 - all link local is for us */
+    pfx.fp_addr.ip6.as_u64[0] = clib_host_to_net_u64 (0xFE80000000000000ULL);
+    pfx.fp_addr.ip6.as_u64[1] = 0;
+    pfx.fp_len = 10;
+    ip6_fib_special_local_add(fib_index, &pfx);
+}
+
+/**
+ * Allocate an IPv6 FIB table from ip6_main.fibs, register it under
+ * table_id, install the standard entries and take a lock on it.
+ *
+ * NOTE(review): table_id ~0 (used by ip6_fib_table_create_and_lock()) is
+ * also written to fib_index_by_table_id here, whereas
+ * ip6_fib_table_destroy() skips the hash_unset for ~0 - confirm that a
+ * left-over ~0 hash entry is intended.
+ *
+ * @param table_id  Table ID recorded in the table and used as the hash key.
+ * @returns The pool index of the new table.
+ */
+static u32
+create_fib_with_table_id (u32 table_id)
+{
+    fib_table_t *tbl;
+    u32 index;
+
+    pool_get_aligned(ip6_main.fibs, tbl, CLIB_CACHE_LINE_BYTES);
+    memset(tbl, 0, sizeof(*tbl));
+
+    /* the table's index is its position in the pool */
+    index = (tbl - ip6_main.fibs);
+
+    tbl->ft_proto = FIB_PROTOCOL_IP6;
+    tbl->v6.index = index;
+    tbl->ft_index = index;
+
+    hash_set(ip6_main.fib_index_by_table_id, table_id, index);
+
+    /* the generic and the v6 specific parts carry the same values */
+    tbl->v6.table_id = table_id;
+    tbl->ft_table_id = table_id;
+    tbl->v6.flow_hash_config = IP_FLOW_HASH_DEFAULT;
+    tbl->ft_flow_hash_config = IP_FLOW_HASH_DEFAULT;
+
+    vnet_ip6_fib_init(index);
+    fib_table_lock(index, FIB_PROTOCOL_IP6);
+
+    return (index);
+}
+
+/**
+ * Return the index of the IPv6 FIB registered under table_id, creating the
+ * FIB when no such table is registered. Either way a lock is taken on it.
+ */
+u32
+ip6_fib_table_find_or_create_and_lock (u32 table_id)
+{
+    uword * found = hash_get (ip6_main.fib_index_by_table_id, table_id);
+
+    if (NULL != found)
+    {
+        u32 fib_index = found[0];
+
+        fib_table_lock(fib_index, FIB_PROTOCOL_IP6);
+
+        return (fib_index);
+    }
+
+    /* creation takes the lock itself */
+    return create_fib_with_table_id(table_id);
+}
+
+/**
+ * Create and lock a new IPv6 FIB whose table ID is ~0.
+ *
+ * @returns The index of the new FIB.
+ */
+u32
+ip6_fib_table_create_and_lock (void)
+{
+    u32 fib_index = create_fib_with_table_id(~0);
+
+    return (fib_index);
+}
+
+/**
+ * Remove the FIB_SOURCE_SPECIAL contribution for the given prefix.
+ */
+static void
+ip6_fib_special_local_remove (u32 fib_index, fib_prefix_t *pfx)
+{
+    fib_table_entry_special_remove(fib_index,
+                                   pfx,
+                                   FIB_SOURCE_SPECIAL);
+}
+
+/**
+ * Tear down an IPv6 FIB: remove the entries vnet_ip6_fib_init() installed,
+ * assert that no routes are left, drop the table ID mapping (unless the
+ * table ID is ~0) and return the table to the pool.
+ */
+void
+ip6_fib_table_destroy (u32 fib_index)
+{
+    /* ::/0 - the members not named here are zero initialised */
+    fib_prefix_t pfx = {
+        .fp_proto = FIB_PROTOCOL_IP6,
+        .fp_len = 0,
+    };
+    fib_table_t *tbl;
+    fib_source_t source;
+
+    /* the default route */
+    fib_table_entry_special_remove(fib_index,
+                                   &pfx,
+                                   FIB_SOURCE_DEFAULT_ROUTE);
+
+    /* ff02::1:ff00:0/104 */
+    ip6_set_solicited_node_multicast_address(&pfx.fp_addr.ip6, 0);
+    pfx.fp_len = 104;
+    ip6_fib_special_local_remove(fib_index, &pfx);
+
+    /* the three reserved link-local groups are host (/128) routes */
+    pfx.fp_len = 128;
+
+    /* all-routers */
+    ip6_set_reserved_multicast_address (&pfx.fp_addr.ip6,
+                                        IP6_MULTICAST_SCOPE_link_local,
+                                        IP6_MULTICAST_GROUP_ID_all_routers);
+    ip6_fib_special_local_remove(fib_index, &pfx);
+
+    /* all-nodes */
+    ip6_set_reserved_multicast_address (&pfx.fp_addr.ip6,
+                                        IP6_MULTICAST_SCOPE_link_local,
+                                        IP6_MULTICAST_GROUP_ID_all_hosts);
+    ip6_fib_special_local_remove(fib_index, &pfx);
+
+    /* all-mldv2 */
+    ip6_set_reserved_multicast_address (&pfx.fp_addr.ip6,
+                                        IP6_MULTICAST_SCOPE_link_local,
+                                        IP6_MULTICAST_GROUP_ID_mldv2_routers);
+    ip6_fib_special_local_remove(fib_index, &pfx);
+
+    /* fe80::/10 - all link local */
+    pfx.fp_addr.ip6.as_u64[0] = clib_host_to_net_u64 (0xFE80000000000000ULL);
+    pfx.fp_addr.ip6.as_u64[1] = 0;
+    pfx.fp_len = 10;
+    ip6_fib_special_local_remove(fib_index, &pfx);
+
+    tbl = fib_table_get(fib_index, FIB_PROTOCOL_IP6);
+
+    /* with the standard entries gone the table must be empty */
+    ASSERT(0 == tbl->ft_total_route_counts);
+    FOR_EACH_FIB_SOURCE(source)
+    {
+        ASSERT(0 == tbl->ft_src_route_counts[source]);
+    }
+
+    if (~0 != tbl->ft_table_id)
+    {
+        hash_unset (ip6_main.fib_index_by_table_id, tbl->ft_table_id);
+    }
+    pool_put(ip6_main.fibs, tbl);
+}
+
+/**
+ * Longest-prefix lookup in the IPv6 non-forwarding table, restricted to
+ * prefixes no longer than the mask length passed.
+ *
+ * @param fib_index  FIB to search; it forms the upper 32 bits of key[2].
+ * @param addr       Address to match.
+ * @param len        Longest prefix length that may match.
+ * @returns The value stored with the first matching key, or
+ *          FIB_NODE_INDEX_INVALID when nothing matches.
+ */
+fib_node_index_t
+ip6_fib_table_lookup (u32 fib_index,
+                     const ip6_address_t *addr,
+                     u32 len)
+{
+    const ip6_fib_table_instance_t *table =
+        &ip6_main.ip6_table[IP6_FIB_TABLE_NON_FWDING];
+    const u64 fib = ((u64)((fib_index))<<32);
+    BVT(clib_bihash_kv) kv, value;
+    int n_lengths = vec_len (table->prefix_lengths_in_search_order);
+    int idx;
+
+    kv.key[0] = addr->as_u64[0];
+    kv.key[1] = addr->as_u64[1];
+
+    /*
+     * Skip the prefix lengths longer than the mask passed; we don't want
+     * matches more specific than that.
+     */
+    for (idx = 0;
+         idx < n_lengths && table->prefix_lengths_in_search_order[idx] > len;
+         idx++)
+        ;
+
+    while (idx < n_lengths)
+    {
+        int dst_address_length = table->prefix_lengths_in_search_order[idx++];
+        ip6_address_t * mask;
+
+        ASSERT(dst_address_length >= 0 && dst_address_length <= 128);
+        mask = &ip6_main.fib_masks[dst_address_length];
+
+        /*
+         * The search order is built longest prefix first, so the key can
+         * be masked cumulatively from one iteration to the next.
+         */
+        kv.key[0] &= mask->as_u64[0];
+        kv.key[1] &= mask->as_u64[1];
+        kv.key[2] = fib | dst_address_length;
+
+        if (0 == BV(clib_bihash_search_inline_2)(&table->ip6_hash, &kv, &value))
+            return value.value;
+    }
+
+    return (FIB_NODE_INDEX_INVALID);
+}
+
+/**
+ * Exact-match lookup of addr/len in the IPv6 non-forwarding table.
+ *
+ * @param fib_index  FIB to search; it forms the upper 32 bits of key[2].
+ * @param addr       Prefix address; it is masked to len bits before the search.
+ * @param len        Prefix length, 0..128.
+ * @returns The value stored for the prefix, or FIB_NODE_INDEX_INVALID when
+ *          the prefix is not present.
+ */
+fib_node_index_t
+ip6_fib_table_lookup_exact_match (u32 fib_index,
+                                 const ip6_address_t *addr,
+                                 u32 len)
+{
+    const ip6_fib_table_instance_t *table;
+    BVT(clib_bihash_kv) kv, value;
+    ip6_address_t *mask;
+    u64 fib;
+    int rv;
+
+    /* len indexes fib_masks[]; same bound the other lookups assert */
+    ASSERT(len <= 128);
+
+    table = &ip6_main.ip6_table[IP6_FIB_TABLE_NON_FWDING];
+    mask = &ip6_main.fib_masks[len];
+    fib = ((u64)((fib_index))<<32);
+
+    kv.key[0] = addr->as_u64[0] & mask->as_u64[0];
+    kv.key[1] = addr->as_u64[1] & mask->as_u64[1];
+    kv.key[2] = fib | len;
+
+    rv = BV(clib_bihash_search_inline_2)(&table->ip6_hash, &kv, &value);
+    if (rv == 0)
+        return value.value;
+
+    return (FIB_NODE_INDEX_INVALID);
+}
+
+/**
+ * Rebuild the table's prefix_lengths_in_search_order vector from its
+ * bitmap of non-empty prefix lengths.
+ */
+static void
+compute_prefix_lengths_in_search_order (ip6_fib_table_instance_t *table)
+{
+    int bit;
+
+    vec_reset_length (table->prefix_lengths_in_search_order);
+
+    /*
+     * Bit position b in the bitmap stands for prefix length 128 - b.
+     * Note: bitmap reversed so this is in fact a longest prefix match
+     */
+    clib_bitmap_foreach (bit, table->non_empty_dst_address_length_bitmap,
+    ({
+        vec_add1(table->prefix_lengths_in_search_order, 128 - bit);
+    }));
+}
+
+/**
+ * Remove the prefix addr/len from the IPv6 non-forwarding table.
+ *
+ * When the last entry of a given prefix length goes away, that length is
+ * cleared from the non-empty bitmap and the search order is recomputed.
+ *
+ * @param fib_index  FIB the prefix belongs to; upper 32 bits of key[2].
+ * @param addr       Prefix address; masked to len bits to form the key.
+ * @param len        Prefix length, 0..128.
+ */
+void
+ip6_fib_table_entry_remove (u32 fib_index,
+                           const ip6_address_t *addr,
+                           u32 len)
+{
+    ip6_fib_table_instance_t *table;
+    BVT(clib_bihash_kv) kv;
+    ip6_address_t *mask;
+    u64 fib;
+
+    /* len indexes fib_masks[] and dst_address_length_refcounts[] */
+    ASSERT (len <= 128);
+
+    table = &ip6_main.ip6_table[IP6_FIB_TABLE_NON_FWDING];
+    mask = &ip6_main.fib_masks[len];
+    fib = ((u64)((fib_index))<<32);
+
+    kv.key[0] = addr->as_u64[0] & mask->as_u64[0];
+    kv.key[1] = addr->as_u64[1] & mask->as_u64[1];
+    kv.key[2] = fib | len;
+
+    /* is_add == 0: delete the key */
+    BV(clib_bihash_add_del)(&table->ip6_hash, &kv, 0);
+
+    /* refcount accounting */
+    ASSERT (table->dst_address_length_refcounts[len] > 0);
+    if (--table->dst_address_length_refcounts[len] == 0)
+    {
+        table->non_empty_dst_address_length_bitmap =
+            clib_bitmap_set (table->non_empty_dst_address_length_bitmap,
+                             128 - len, 0);
+        compute_prefix_lengths_in_search_order (table);
+    }
+}
+
+/**
+ * Insert the prefix addr/len into the IPv6 non-forwarding table, storing
+ * fib_entry_index as its value.
+ *
+ * The per-length refcount is incremented, the length is marked non-empty
+ * in the bitmap and the search order is recomputed.
+ *
+ * @param fib_index        FIB the prefix belongs to; upper 32 bits of key[2].
+ * @param addr             Prefix address; masked to len bits to form the key.
+ * @param len              Prefix length, 0..128.
+ * @param fib_entry_index  Value stored against the key.
+ */
+void
+ip6_fib_table_entry_insert (u32 fib_index,
+                           const ip6_address_t *addr,
+                           u32 len,
+                           fib_node_index_t fib_entry_index)
+{
+    ip6_fib_table_instance_t *table;
+    BVT(clib_bihash_kv) kv;
+    ip6_address_t *mask;
+    u64 fib;
+
+    /*
+     * len indexes fib_masks[] and is written through
+     * dst_address_length_refcounts[], so it must be in range
+     */
+    ASSERT (len <= 128);
+
+    table = &ip6_main.ip6_table[IP6_FIB_TABLE_NON_FWDING];
+    mask = &ip6_main.fib_masks[len];
+    fib = ((u64)((fib_index))<<32);
+
+    kv.key[0] = addr->as_u64[0] & mask->as_u64[0];
+    kv.key[1] = addr->as_u64[1] & mask->as_u64[1];
+    kv.key[2] = fib | len;
+    kv.value = fib_entry_index;
+
+    /* is_add == 1: add the key */
+    BV(clib_bihash_add_del)(&table->ip6_hash, &kv, 1);
+
+    table->dst_address_length_refcounts[len]++;
+
+    table->non_empty_dst_address_length_bitmap =
+        clib_bitmap_set (table->non_empty_dst_address_length_bitmap,
+                         128 - len, 1);
+    compute_prefix_lengths_in_search_order (table);
+}
+
+/**
+ * Longest-prefix lookup of dst in the IPv6 forwarding table.
+ *
+ * @param im         Not referenced by the body; the global ip6_main is used.
+ * @param fib_index  FIB to search; it forms the upper 32 bits of key[2].
+ * @param dst        Address to look up.
+ * @returns The value stored with the first matching key. Falling off the
+ *          end of the search asserts and returns 0.
+ */
+u32 
+ip6_fib_table_fwding_lookup (ip6_main_t * im,
+                             u32 fib_index,
+                             const ip6_address_t * dst)
+{
+    const ip6_fib_table_instance_t *table =
+        &ip6_main.ip6_table[IP6_FIB_TABLE_FWDING];
+    const u64 fib = ((u64)((fib_index))<<32);
+    BVT(clib_bihash_kv) kv, value;
+    int n_lengths = vec_len (table->prefix_lengths_in_search_order);
+    int idx;
+
+    kv.key[0] = dst->as_u64[0];
+    kv.key[1] = dst->as_u64[1];
+
+    for (idx = 0; idx < n_lengths; idx++)
+    {
+        int dst_address_length = table->prefix_lengths_in_search_order[idx];
+        ip6_address_t * mask;
+
+        ASSERT(dst_address_length >= 0 && dst_address_length <= 128);
+        mask = &ip6_main.fib_masks[dst_address_length];
+
+        /*
+         * The search order is built longest prefix first, so the key can
+         * be masked cumulatively from one iteration to the next.
+         */
+        kv.key[0] &= mask->as_u64[0];
+        kv.key[1] &= mask->as_u64[1];
+        kv.key[2] = fib | dst_address_length;
+
+        if (0 == BV(clib_bihash_search_inline_2)(&table->ip6_hash, &kv, &value))
+            return value.value;
+    }
+
+    /* default route is always present */
+    ASSERT(0);
+    return 0;
+}
+
+/**
+ * Forwarding lookup of dst in the FIB that im maps sw_if_index to.
+ */
+u32 ip6_fib_table_fwding_lookup_with_if_index (ip6_main_t * im,
+                                              u32 sw_if_index,
+                                              const ip6_address_t * dst)
+{
+    return ip6_fib_table_fwding_lookup(
+               im,
+               vec_elt (im->fib_index_by_sw_if_index, sw_if_index),
+               dst);
+}
+
+/**
+ * Return the flow hash configuration held in the IPv6 FIB at fib_index.
+ */
+flow_hash_config_t
+ip6_fib_table_get_flow_hash_config (u32 fib_index)
+{
+    ip6_fib_t *fib = ip6_fib_get(fib_index);
+
+    return (fib->flow_hash_config);
+}
+
+/**
+ * Return the IPv6 FIB index bound to an interface, or ~0 when the
+ * interface lies beyond the end of the per-interface vector.
+ */
+u32
+ip6_fib_table_get_index_for_sw_if_index (u32 sw_if_index)
+{
+    if (sw_if_index < vec_len(ip6_main.fib_index_by_sw_if_index))
+    {
+        return (ip6_main.fib_index_by_sw_if_index[sw_if_index]);
+    }
+
+    /*
+     * This is the case for interfaces that are not yet mapped to
+     * a IP table
+     */
+    return (~0);
+}
+
+/**
+ * Add (or overwrite) the prefix addr/len in the IPv6 forwarding table,
+ * storing the DPO's index as its value.
+ *
+ * The per-length refcount is incremented on every call, the length is
+ * marked non-empty in the bitmap and the search order is recomputed.
+ *
+ * @param fib_index  FIB the prefix belongs to; upper 32 bits of key[2].
+ * @param addr       Prefix address; masked to len bits to form the key.
+ * @param len        Prefix length, 0..128.
+ * @param dpo        DPO whose dpoi_index is stored against the key.
+ */
+void
+ip6_fib_table_fwding_dpo_update (u32 fib_index,
+                                const ip6_address_t *addr,
+                                u32 len,
+                                const dpo_id_t *dpo)
+{
+    ip6_fib_table_instance_t *table;
+    BVT(clib_bihash_kv) kv;
+    ip6_address_t *mask;
+    u64 fib;
+
+    /*
+     * len indexes fib_masks[] and is written through
+     * dst_address_length_refcounts[], so it must be in range
+     */
+    ASSERT (len <= 128);
+
+    table = &ip6_main.ip6_table[IP6_FIB_TABLE_FWDING];
+    mask = &ip6_main.fib_masks[len];
+    fib = ((u64)((fib_index))<<32);
+
+    kv.key[0] = addr->as_u64[0] & mask->as_u64[0];
+    kv.key[1] = addr->as_u64[1] & mask->as_u64[1];
+    kv.key[2] = fib | len;
+    kv.value = dpo->dpoi_index;
+
+    /* is_add == 1: add the key */
+    BV(clib_bihash_add_del)(&table->ip6_hash, &kv, 1);
+
+    table->dst_address_length_refcounts[len]++;
+
+    table->non_empty_dst_address_length_bitmap =
+        clib_bitmap_set (table->non_empty_dst_address_length_bitmap,
+                         128 - len, 1);
+    compute_prefix_lengths_in_search_order (table);
+}
+
+/**
+ * Remove the prefix addr/len from the IPv6 forwarding table.
+ *
+ * When the last entry of a given prefix length goes away, that length is
+ * cleared from the non-empty bitmap and the search order is recomputed.
+ *
+ * @param fib_index  FIB the prefix belongs to; upper 32 bits of key[2].
+ * @param addr       Prefix address; masked to len bits to form the key.
+ * @param len        Prefix length, 0..128.
+ * @param dpo        DPO whose dpoi_index is copied into the kv's value
+ *                   before the delete.
+ */
+void
+ip6_fib_table_fwding_dpo_remove (u32 fib_index,
+                                const ip6_address_t *addr,
+                                u32 len,
+                                const dpo_id_t *dpo)
+{
+    ip6_fib_table_instance_t *table;
+    BVT(clib_bihash_kv) kv;
+    ip6_address_t *mask;
+    u64 fib;
+
+    /* len indexes fib_masks[] and dst_address_length_refcounts[] */
+    ASSERT (len <= 128);
+
+    table = &ip6_main.ip6_table[IP6_FIB_TABLE_FWDING];
+    mask = &ip6_main.fib_masks[len];
+    fib = ((u64)((fib_index))<<32);
+
+    kv.key[0] = addr->as_u64[0] & mask->as_u64[0];
+    kv.key[1] = addr->as_u64[1] & mask->as_u64[1];
+    kv.key[2] = fib | len;
+    kv.value = dpo->dpoi_index;
+
+    /* is_add == 0: delete the key */
+    BV(clib_bihash_add_del)(&table->ip6_hash, &kv, 0);
+
+    /* refcount accounting */
+    ASSERT (table->dst_address_length_refcounts[len] > 0);
+    if (--table->dst_address_length_refcounts[len] == 0)
+    {
+        table->non_empty_dst_address_length_bitmap =
+            clib_bitmap_set (table->non_empty_dst_address_length_bitmap,
+                             128 - len, 0);
+        compute_prefix_lengths_in_search_order (table);
+    }
+}
+
+typedef struct ip6_fib_show_ctx_t_ { /* walk context handed to ip6_fib_table_collect_entries() */
+    u32 fib_index; /* only keys carrying this FIB index in key[2]'s upper 32 bits are collected */
+    fib_node_index_t *entries; /* vector the matching values are appended to */
+} ip6_fib_show_ctx_t;
+
+/**
+ * bihash walk callback: append the value of every key/value pair that
+ * belongs to the context's FIB to the context's entries vector.
+ *
+ * @param kvp  The key/value pair being visited.
+ * @param arg  An ip6_fib_show_ctx_t.
+ */
+static void
+ip6_fib_table_collect_entries (clib_bihash_kv_24_8_t * kvp,
+                              void *arg)
+{
+    ip6_fib_show_ctx_t *ctx = arg;
+
+    /* the FIB index lives in the upper 32 bits of key[2] */
+    if ((kvp->key[2] >> 32) != ctx->fib_index)
+        return;
+
+    vec_add1(ctx->entries, kvp->value);
+}
+
+/**
+ * Print, in brief format, every entry of the given FIB found in the
+ * non-forwarding table. The entries are collected into a temporary vector
+ * and sorted with fib_entry_cmp_for_sort before being printed.
+ */
+static void
+ip6_fib_table_show_all (ip6_fib_t *fib,
+                       vlib_main_t * vm)
+{
+    ip6_main_t *im = &ip6_main;
+    ip6_fib_show_ctx_t ctx = {
+        .fib_index = fib->index,
+        .entries = NULL,
+    };
+    u32 i;
+
+    BV(clib_bihash_foreach_key_value_pair)(
+        &im->ip6_table[IP6_FIB_TABLE_NON_FWDING].ip6_hash,
+        ip6_fib_table_collect_entries,
+        &ctx);
+
+    vec_sort_with_function(ctx.entries, fib_entry_cmp_for_sort);
+
+    for (i = 0; i < vec_len(ctx.entries); i++)
+    {
+        vlib_cli_output(vm, "%U",
+                        format_fib_entry,
+                        ctx.entries[i],
+                        FIB_ENTRY_FORMAT_BRIEF);
+    }
+
+    vec_free(ctx.entries);
+}
+
+/**
+ * Print, in detail format, the entry that ip6_fib_table_lookup() returns
+ * for address/mask_len in the given FIB.
+ */
+static void
+ip6_fib_table_show_one (ip6_fib_t *fib,
+                       vlib_main_t * vm,
+                       ip6_address_t *address,
+                       u32 mask_len)
+{
+    fib_node_index_t fei;
+
+    fei = ip6_fib_table_lookup(fib->index, address, mask_len);
+
+    vlib_cli_output(vm, "%U",
+                    format_fib_entry,
+                    fei,
+                    FIB_ENTRY_FORMAT_DETAIL);
+}
+
+typedef struct { /* walk context for count_routes_in_fib_at_prefix_length() */
+  u32 fib_index; /* only keys carrying this FIB index are counted */
+  u64 count_by_prefix_length[129]; /* one counter per prefix length 0..128 */
+} count_routes_in_fib_at_prefix_length_arg_t;
+
+/**
+ * bihash walk callback: for a key/value pair that belongs to the
+ * context's FIB, bump the counter of the prefix length encoded in the low
+ * byte of key[2].
+ */
+static void count_routes_in_fib_at_prefix_length 
+(BVT(clib_bihash_kv) * kvp, void *arg)
+{
+  count_routes_in_fib_at_prefix_length_arg_t * ap = arg;
+
+  /* the FIB index lives in the upper 32 bits of key[2] */
+  if ((kvp->key[2]>>32) == ap->fib_index)
+    {
+      int mask_width = kvp->key[2] & 0xFF;
+
+      ap->count_by_prefix_length[mask_width]++;
+    }
+}
+
+/**
+ * CLI handler for "show ip6 fib".
+ *
+ * Accepted input: "brief" / "summary" / "sum" (per-prefix-length counts
+ * instead of entries), an IPv6 address with an optional "/<len>" (show the
+ * lookup result for that address only), "table <id>" and "index <n>"
+ * (restrict the output to matching FIBs). Parsing stops at the first
+ * token that is not recognised.
+ *
+ * @returns Always 0.
+ */
+static clib_error_t *
+ip6_show_fib (vlib_main_t * vm,
+             unformat_input_t * input,
+             vlib_cli_command_t * cmd)
+{
+    count_routes_in_fib_at_prefix_length_arg_t summary;
+    ip6_address_t matching_address;
+    ip6_main_t * im6 = &ip6_main;
+    fib_table_t *fib_table;
+    ip6_fib_t * fib;
+    u32 mask_len  = 128;
+    int table_id = -1;
+    int fib_index = ~0;
+    int verbose = 1;
+    int matching = 0;
+
+    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+        if (unformat (input, "brief")   ||
+            unformat (input, "summary") ||
+            unformat (input, "sum"))
+        {
+            verbose = 0;
+        }
+        else if (unformat (input, "%U/%d",
+                           unformat_ip6_address, &matching_address, &mask_len) ||
+                 unformat (input, "%U", unformat_ip6_address, &matching_address))
+        {
+            matching = 1;
+        }
+        else if (unformat (input, "table %d", &table_id))
+        {
+            /* the value is stored by unformat */
+        }
+        else if (unformat (input, "index %d", &fib_index))
+        {
+            /* the value is stored by unformat */
+        }
+        else
+        {
+            break;
+        }
+    }
+
+    pool_foreach (fib_table, im6->fibs,
+    ({
+        fib = &(fib_table->v6);
+
+        /* apply the optional table-id and fib-index filters */
+        if ((table_id >= 0 && table_id != (int)fib->table_id) ||
+            (fib_index != ~0 && fib_index != (int)fib->index))
+            continue;
+
+        vlib_cli_output (vm, "%s, fib_index %d, flow hash: %U", 
+                         fib_table->ft_desc, fib->index,
+                         format_ip_flow_hash_config, fib->flow_hash_config);
+
+        if (! verbose)
+        {
+            /* summary: number of routes per prefix length, longest first */
+            BVT(clib_bihash) * h = &im6->ip6_table[IP6_FIB_TABLE_NON_FWDING].ip6_hash;
+            int plen;
+
+            vlib_cli_output (vm, "%=20s%=16s", "Prefix length", "Count");
+
+            memset (&summary, 0, sizeof(summary));
+            summary.fib_index = fib->index;
+
+            BV(clib_bihash_foreach_key_value_pair)
+                (h, count_routes_in_fib_at_prefix_length, &summary);
+
+            for (plen = 128; plen >= 0; plen--)
+            {
+                if (summary.count_by_prefix_length[plen])
+                    vlib_cli_output (vm, "%=20d%=16lld", 
+                                     plen, summary.count_by_prefix_length[plen]);
+            }
+        }
+        else if (matching)
+        {
+            ip6_fib_table_show_one(fib, vm, &matching_address, mask_len);
+        }
+        else
+        {
+            ip6_fib_table_show_all(fib, vm);
+        }
+    }));
+
+    return 0;
+}
+
+/*?
+ * Show FIB6/route entries
+ *
+ * Options accepted by the handler: 'summary' (also 'brief' or 'sum') for
+ * per-prefix-length route counts, 'table <n>' and 'index <n>' to select
+ * FIBs, and an IPv6 address with an optional '/<len>' to show a single
+ * lookup result.
+ *
+ * @cliexpar
+ * @cliexstart{show ip6 fib}
+ * Display the IPv6 FIB.
+ * This command will run for a long time when the FIBs comprise millions of entries.
+ * See 'show ip fib'
+ * @cliexend
+ ?*/
+VLIB_CLI_COMMAND (ip6_show_fib_command, static) = {
+    .path = "show ip6 fib",
+    .short_help = "show ip6 fib [summary] [table <n>] [index <n>] [<ip6-addr>[/<len>]]",
+    .function = ip6_show_fib,
+};
diff --git a/vnet/vnet/fib/ip6_fib.h b/vnet/vnet/fib/ip6_fib.h
new file mode 100644 (file)
index 0000000..f6af993
--- /dev/null
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __IP6_FIB_H__
+#define __IP6_FIB_H__
+
+#include <vlib/vlib.h>
+#include <vnet/ip/format.h>
+#include <vnet/fib/fib_entry.h>
+#include <vnet/fib/fib_table.h>
+#include <vnet/ip/lookup.h>
+#include <vnet/dpo/load_balance.h>
+
+extern fib_node_index_t ip6_fib_table_lookup(u32 fib_index,
+                                            const ip6_address_t *addr,
+                                            u32 len);
+extern fib_node_index_t ip6_fib_table_lookup_exact_match(u32 fib_index,
+                                                        const ip6_address_t *addr,
+                                                        u32 len);
+
+extern void ip6_fib_table_entry_remove(u32 fib_index,
+                                      const ip6_address_t *addr,
+                                      u32 len);
+
+extern void ip6_fib_table_entry_insert(u32 fib_index,
+                                      const ip6_address_t *addr,
+                                      u32 len,
+                                      fib_node_index_t fib_entry_index);
+extern void ip6_fib_table_destroy(u32 fib_index);
+
+extern void ip6_fib_table_fwding_dpo_update(u32 fib_index,
+                                           const ip6_address_t *addr,
+                                           u32 len,
+                                           const dpo_id_t *dpo);
+
+extern void ip6_fib_table_fwding_dpo_remove(u32 fib_index,
+                                           const ip6_address_t *addr,
+                                           u32 len,
+                                           const dpo_id_t *dpo);
+
+u32 ip6_fib_table_fwding_lookup_with_if_index(ip6_main_t * im,
+                                             u32 sw_if_index,
+                                             const ip6_address_t * dst);
+u32 ip6_fib_table_fwding_lookup(ip6_main_t * im,
+                               u32 fib_index, 
+                               const ip6_address_t * dst);
+
+/**
+ * @brief Return the RX adjacency index cached in the buffer, resolving it
+ * first when it is still unset (~0).
+ *
+ * To resolve it, the packet's source address is looked up in the
+ * forwarding table of the RX interface's FIB and bucket 0 of the resulting
+ * load-balance is examined. If that DPO is an adjacency, its index is
+ * cached in the buffer; otherwise the cached value stays ~0.
+ *
+ * @param im  IPv6 main, passed on to the forwarding lookup.
+ * @param b   Buffer holding the packet's metadata.
+ * @param i   The packet's IPv6 header.
+ * @returns The buffer's ip.adj_index[VLIB_RX] after the resolution attempt.
+ */
+always_inline u32
+ip6_src_lookup_for_packet (ip6_main_t * im,
+                           vlib_buffer_t * b,
+                           ip6_header_t * i)
+{
+    const dpo_id_t *bucket;
+    index_t lbi;
+
+    /* already resolved for this buffer */
+    if (vnet_buffer (b)->ip.adj_index[VLIB_RX] != ~0)
+    {
+        return vnet_buffer (b)->ip.adj_index[VLIB_RX];
+    }
+
+    lbi = ip6_fib_table_fwding_lookup_with_if_index(
+              im,
+              vnet_buffer (b)->sw_if_index[VLIB_RX],
+              &i->src_address);
+
+    bucket = load_balance_get_bucket_i(load_balance_get(lbi), 0);
+
+    if (dpo_is_adj(bucket))
+    {
+        vnet_buffer (b)->ip.adj_index[VLIB_RX] = bucket->dpoi_index;
+    }
+
+    return vnet_buffer (b)->ip.adj_index[VLIB_RX];
+}
+
+/**
+ * \brief Get or create an IPv6 fib.
+ *
+ * Get or create an IPv6 fib with the provided table ID, and take a lock
+ * on it.
+ *
+ * \param table_id
+ *      The table ID used to retrieve or create the desired fib.
+ *      ip6_fib_table_create_and_lock() takes no ID and creates the fib
+ *      with a table ID of \c ~0.
+ * \returns The index of the retrieved or created fib.
+ *
+ */
+extern u32 ip6_fib_table_find_or_create_and_lock(u32 table_id);
+extern u32 ip6_fib_table_create_and_lock(void);
+
+/**
+ * Return the IPv6 specific part of the FIB table at the given pool index.
+ * Asserts that the index is not a free pool slot.
+ */
+static inline ip6_fib_t *
+ip6_fib_get (fib_node_index_t index)
+{
+    fib_table_t *tbl;
+
+    ASSERT(!pool_is_free_index(ip6_main.fibs, index));
+
+    tbl = pool_elt_at_index (ip6_main.fibs, index);
+
+    return (&tbl->v6);
+}
+
+/**
+ * Translate an IPv6 table ID into a FIB index.
+ *
+ * \param table_id  Key looked up in ip6_main.fib_index_by_table_id.
+ * \returns The value stored for the table ID, or ~0 when there is no entry.
+ */
+static inline 
+u32 ip6_fib_index_from_table_id (u32 table_id)
+{
+  uword * entry = hash_get (ip6_main.fib_index_by_table_id, table_id);
+
+  if (entry)
+    return entry[0];
+
+  return ~0;
+}
+
+extern u32 ip6_fib_table_get_index_for_sw_if_index(u32 sw_if_index);
+
+extern flow_hash_config_t ip6_fib_table_get_flow_hash_config(u32 fib_index);
+
+#endif
+
diff --git a/vnet/vnet/fib/mpls_fib.c b/vnet/vnet/fib/mpls_fib.c
new file mode 100644 (file)
index 0000000..8f1ccef
--- /dev/null
@@ -0,0 +1,439 @@
+/*
+ * mpls_fib.c: The Label/MPLS FIB
+ *
+ * Copyright (c) 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * An MPLS_FIB table;
+ *
+ * The entries in the table are programmed with one or more MOIs. These MOIs
+ * may result in different forwarding actions for end-of-stack (EOS) and non-EOS
+ * packets. Whether the two actions are the same more often than they are
+ * different, or vice versa, is a function of the deployment in which the router
+ * is used and thus not predictable.
+ * The design choice to make with an MPLS_FIB table is:
+ *  1 - 20 bit key: label only.
+ *      When the EOS and non-EOS actions differ the result is a 'EOS-choice' object.
+ *  2 - 21 bit key: label and EOS-bit.
+ *      The result is then the specific action based on EOS-bit.
+ *
+ * 20 bit key:
+ *   Advantages:
+ *    - lower memory overhead, since there are fewer DB entries.
+ *   Disadvantages:
+ *    - slower DP performance in the case the chains differ, as more objects are
+ *      encountered in the switch path
+ *
+ * 21 bit key:
+ *   Advantages:
+ *    - faster DP performance
+ *   Disadvantages
+ *    - increased memory footprint.
+ *
+ * Switching between schemes based on observed/measured action similarity is not
+ * considered on the grounds of complexity and flip-flopping.
+ *
+ * VPP mantra - favour performance over memory. We choose a 21 bit key.  
+ */
+
+#include <vnet/fib/fib_table.h>
+#include <vnet/dpo/load_balance.h>
+#include <vnet/dpo/drop_dpo.h>
+#include <vnet/dpo/punt_dpo.h>
+#include <vnet/dpo/lookup_dpo.h>
+#include <vnet/mpls/mpls.h>
+
+/**
+ * All lookups in an MPLS_FIB table must result in a DPO of type load-balance.
+ * This is the default result which links to drop
+ */
+static index_t mpls_fib_drop_dpo_index = INDEX_INVALID;
+
+/**
+ * FIXME
+ */
+#define MPLS_FLOW_HASH_DEFAULT 0
+
+/**
+ * @brief Build the MPLS FIB key for a label and EOS bit.
+ *
+ * The label is shifted up by one bit and the EOS bit occupies bit 0.
+ *
+ * @param label the MPLS label.
+ * @param eos   the end-of-stack bit; asserted to be 0 or 1.
+ * @return the combined key.
+ */
+static inline u32
+mpls_fib_entry_mk_key (mpls_label_t label,
+                      mpls_eos_bit_t eos)
+{
+    ASSERT(eos <= 1);
+
+    return ((label << 1) | eos);
+}
+
+/**
+ * @brief Find the index of the MPLS FIB registered under a table ID.
+ *
+ * @param table_id key looked up in mpls_main.fib_index_by_table_id.
+ * @return the stored index, or FIB_NODE_INDEX_INVALID when the table ID
+ *         is not present in the hash.
+ */
+u32
+mpls_fib_index_from_table_id (u32 table_id)
+{
+    uword *slot;
+
+    slot = hash_get (mpls_main.fib_index_by_table_id, table_id);
+
+    if (NULL != slot)
+        return slot[0];
+
+    return FIB_NODE_INDEX_INVALID;
+}
+/**
+ * Create a new MPLS FIB table for the given table ID and take a lock on it.
+ *
+ * The table is allocated from the mpls_main.fibs pool and zeroed, registered
+ * in mpls_main.fib_index_by_table_id, and every slot of its data-path lookup
+ * array is pointed at the shared drop load-balance (created here on first
+ * use). Entries for the router-alert and the IPv4/IPv6 explicit-NULL labels
+ * are then added from FIB_SOURCE_SPECIAL.
+ *
+ * @param table_id the table ID recorded in the table and used as the key
+ *                 into the table-ID hash.
+ * @return the pool index of the new table (its ft_index).
+ */
+static u32
+mpls_fib_create_with_table_id (u32 table_id)
+{
+    dpo_id_t dpo = DPO_NULL;
+    fib_table_t *fib_table;
+    mpls_eos_bit_t eos;
+    mpls_fib_t *mf;
+    int i;
+
+    pool_get_aligned(mpls_main.fibs, fib_table, CLIB_CACHE_LINE_BYTES);
+    memset(fib_table, 0, sizeof(*fib_table));
+
+    fib_table->ft_proto = FIB_PROTOCOL_MPLS;
+    fib_table->ft_index =
+       (fib_table - mpls_main.fibs);
+    /* NOTE(review): the hash is written even when table_id is ~0 (the
+     * mpls_fib_table_create_and_lock path), whereas mpls_fib_table_destroy
+     * skips hash_unset for ~0 -- confirm this asymmetry is intended. */
+    hash_set (mpls_main.fib_index_by_table_id, table_id, fib_table->ft_index);
+
+    fib_table->ft_table_id =
+       table_id;
+    fib_table->ft_flow_hash_config = 
+       MPLS_FLOW_HASH_DEFAULT;
+    fib_table->v4.fwd_classify_table_index = ~0;
+    fib_table->v4.rev_classify_table_index = ~0;
+    /* NOTE(review): the two assignments above go through the v4 member of
+     * a table whose ft_proto is MPLS; whether they are needed cannot be
+     * told from here. */
+    fib_table_lock(fib_table->ft_index, FIB_PROTOCOL_MPLS);
+    /* The drop load-balance is shared by all MPLS tables: one bucket set
+     * to the MPLS drop DPO, created the first time any table is made. */
+    if (INDEX_INVALID == mpls_fib_drop_dpo_index)
+    {
+       mpls_fib_drop_dpo_index = load_balance_create(1, DPO_PROTO_MPLS, 0);
+       load_balance_set_bucket(mpls_fib_drop_dpo_index,
+                               0,
+                                drop_dpo_get(DPO_PROTO_MPLS));
+    }
+
+    mf = &fib_table->mpls;
+    mf->mf_entries = hash_create(0, sizeof(fib_node_index_t));
+    for (i = 0; i < MPLS_FIB_DB_SIZE; i++)
+    {
+       /*
+        * initialise each DPO in the data-path lookup table
+        * to be the special MPLS drop
+        */
+       mf->mf_lbs[i] = mpls_fib_drop_dpo_index;
+    }
+
+    /*
+     * non-default forwarding for the special labels.
+     * The same prefix object is re-used for each of them; only the label,
+     * EOS bit and payload protocol are changed between additions.
+     */
+    fib_prefix_t prefix = {
+       .fp_proto = FIB_PROTOCOL_MPLS,
+       .fp_payload_proto = DPO_PROTO_MPLS,
+    };
+
+    /*
+     * PUNT the router alert, both EOS and non-eos
+     */
+    prefix.fp_label = MPLS_IETF_ROUTER_ALERT_LABEL;
+    FOR_EACH_MPLS_EOS_BIT(eos)
+    {
+       prefix.fp_eos = eos;
+        fib_table_entry_special_dpo_add(fib_table->ft_index,
+                                       &prefix,
+                                       FIB_SOURCE_SPECIAL,
+                                       FIB_ENTRY_FLAG_EXCLUSIVE,
+                                       punt_dpo_get(DPO_PROTO_MPLS));
+    }
+
+    /*
+     * IPv4 explicit NULL EOS lookup in the interface's IPv4 table
+     */
+    prefix.fp_label = MPLS_IETF_IPV4_EXPLICIT_NULL_LABEL;
+    prefix.fp_payload_proto = DPO_PROTO_IP4;
+    prefix.fp_eos = MPLS_EOS;
+
+    lookup_dpo_add_or_lock_w_fib_index(0, // unused
+                                       DPO_PROTO_IP4,
+                                       LOOKUP_INPUT_DST_ADDR,
+                                       LOOKUP_TABLE_FROM_INPUT_INTERFACE,
+                                       &dpo);
+    fib_table_entry_special_dpo_add(fib_table->ft_index,
+                                   &prefix,
+                                   FIB_SOURCE_SPECIAL,
+                                   FIB_ENTRY_FLAG_EXCLUSIVE,
+                                    &dpo);
+    /* IPv4 explicit NULL, non-EOS: the payload is MPLS, so the lookup DPO
+     * is created for DPO_PROTO_MPLS instead of IPv4. */
+    prefix.fp_payload_proto = DPO_PROTO_MPLS;
+    prefix.fp_eos = MPLS_NON_EOS;
+
+    lookup_dpo_add_or_lock_w_fib_index(0, // unused
+                                       DPO_PROTO_MPLS,
+                                       LOOKUP_INPUT_DST_ADDR,
+                                       LOOKUP_TABLE_FROM_INPUT_INTERFACE,
+                                       &dpo);
+    fib_table_entry_special_dpo_add(fib_table->ft_index,
+                                   &prefix,
+                                   FIB_SOURCE_SPECIAL,
+                                   FIB_ENTRY_FLAG_EXCLUSIVE,
+                                    &dpo);
+
+    /*
+     * IPv6 explicit NULL EOS lookup in the interface's IPv6 table
+     */
+    prefix.fp_label = MPLS_IETF_IPV6_EXPLICIT_NULL_LABEL;
+    prefix.fp_payload_proto = DPO_PROTO_IP6;
+    prefix.fp_eos = MPLS_EOS;
+
+    lookup_dpo_add_or_lock_w_fib_index(0, // unused
+                                       DPO_PROTO_IP6,
+                                       LOOKUP_INPUT_DST_ADDR,
+                                       LOOKUP_TABLE_FROM_INPUT_INTERFACE,
+                                       &dpo);
+    fib_table_entry_special_dpo_add(fib_table->ft_index,
+                                   &prefix,
+                                   FIB_SOURCE_SPECIAL,
+                                   FIB_ENTRY_FLAG_EXCLUSIVE,
+                                    &dpo);
+    /* IPv6 explicit NULL, non-EOS: as for IPv4, an MPLS lookup DPO. */
+    prefix.fp_payload_proto = DPO_PROTO_MPLS;
+    prefix.fp_eos = MPLS_NON_EOS;
+    lookup_dpo_add_or_lock_w_fib_index(0, // unused
+                                       DPO_PROTO_MPLS,
+                                       LOOKUP_INPUT_DST_ADDR,
+                                       LOOKUP_TABLE_FROM_INPUT_INTERFACE,
+                                       &dpo);
+    fib_table_entry_special_dpo_add(fib_table->ft_index,
+                                   &prefix,
+                                   FIB_SOURCE_SPECIAL,
+                                   FIB_ENTRY_FLAG_EXCLUSIVE,
+                                    &dpo);
+    /* NOTE(review): dpo still refers to the last lookup DPO here and is
+     * not reset before returning -- confirm no lock is left behind. */
+    return (fib_table->ft_index);
+}
+
+/**
+ * @brief Return the index of the MPLS FIB for a table ID, creating the
+ *        table when none is registered. A lock is taken in both cases.
+ *
+ * @param table_id the table ID to find or create.
+ * @return the index of the found or newly created table.
+ */
+u32
+mpls_fib_table_find_or_create_and_lock (u32 table_id)
+{
+    u32 fib_index = mpls_fib_index_from_table_id(table_id);
+
+    if (~0 != fib_index)
+    {
+        /* already exists: just add this caller's lock */
+        fib_table_lock(fib_index, FIB_PROTOCOL_MPLS);
+        return (fib_index);
+    }
+
+    /* creation takes the lock on the new table itself */
+    return mpls_fib_create_with_table_id(table_id);
+}
+/**
+ * @brief Create and lock an MPLS FIB, passing ~0 as its table ID.
+ *
+ * @return the index of the newly created table.
+ */
+u32
+mpls_fib_table_create_and_lock (void)
+{
+    const u32 no_table_id = ~0;
+
+    return (mpls_fib_create_with_table_id(no_table_id));
+}
+
+/**
+ * @brief Tear down an MPLS FIB table.
+ *
+ * Deletes the FIB_SOURCE_SPECIAL entries for the router-alert and
+ * explicit-NULL labels (both EOS values), removes the table-ID mapping
+ * unless the table ID is ~0, frees the entry hash and returns the table
+ * to the mpls_main.fibs pool.
+ *
+ * @param mf the MPLS FIB to destroy; it is cast to its enclosing
+ *           fib_table_t.
+ */
+void
+mpls_fib_table_destroy (mpls_fib_t *mf)
+{
+    fib_table_t *table = (fib_table_t*)mf;
+    mpls_label_t reserved[] = {
+        MPLS_IETF_ROUTER_ALERT_LABEL,
+        MPLS_IETF_IPV6_EXPLICIT_NULL_LABEL,
+        MPLS_IETF_IPV4_EXPLICIT_NULL_LABEL,
+    };
+    fib_prefix_t pfx = {
+        .fp_proto = FIB_PROTOCOL_MPLS,
+    };
+    mpls_eos_bit_t eos;
+    u32 n;
+
+    for (n = 0; n < ARRAY_LEN(reserved); n++)
+    {
+        FOR_EACH_MPLS_EOS_BIT(eos)
+        {
+            pfx.fp_label = reserved[n];
+            pfx.fp_eos   = eos;
+
+            fib_table_entry_delete(table->ft_index,
+                                   &pfx,
+                                   FIB_SOURCE_SPECIAL);
+        }
+    }
+
+    if (~0 != table->ft_table_id)
+    {
+        hash_unset(mpls_main.fib_index_by_table_id,
+                   table->ft_table_id);
+    }
+
+    hash_delete(mf->mf_entries);
+    pool_put(mpls_main.fibs, table);
+}
+
+/**
+ * @brief Look up the FIB entry stored for a label and EOS bit.
+ *
+ * @param mf    the MPLS FIB whose entry hash is searched.
+ * @param label the MPLS label.
+ * @param eos   the end-of-stack bit.
+ * @return the stored entry index, or FIB_NODE_INDEX_INVALID when the
+ *         key is not in the hash.
+ */
+fib_node_index_t
+mpls_fib_table_lookup (const mpls_fib_t *mf,
+                      mpls_label_t label,
+                      mpls_eos_bit_t eos)
+{
+    uword *slot;
+
+    slot = hash_get(mf->mf_entries, mpls_fib_entry_mk_key(label, eos));
+
+    if (NULL != slot)
+        return slot[0];
+
+    return FIB_NODE_INDEX_INVALID;
+}
+
+/**
+ * @brief Store a FIB entry index under the key for a label and EOS bit.
+ *
+ * @param mf    the MPLS FIB whose entry hash is updated.
+ * @param label the MPLS label.
+ * @param eos   the end-of-stack bit.
+ * @param lfei  the entry index to store.
+ */
+void
+mpls_fib_table_entry_insert (mpls_fib_t *mf,
+                            mpls_label_t label,
+                            mpls_eos_bit_t eos,
+                            fib_node_index_t lfei)
+{
+    u32 key = mpls_fib_entry_mk_key(label, eos);
+
+    hash_set(mf->mf_entries, key, lfei);
+}
+
+/**
+ * @brief Remove the entry stored under the key for a label and EOS bit.
+ *
+ * @param mf    the MPLS FIB whose entry hash is updated.
+ * @param label the MPLS label.
+ * @param eos   the end-of-stack bit.
+ */
+void
+mpls_fib_table_entry_remove (mpls_fib_t *mf,
+                            mpls_label_t label,
+                            mpls_eos_bit_t eos)
+{
+    u32 key = mpls_fib_entry_mk_key(label, eos);
+
+    hash_unset(mf->mf_entries, key);
+}
+
+/**
+ * @brief Point the data-path slot for a label and EOS bit at a
+ *        load-balance.
+ *
+ * @param mf    the MPLS FIB whose lookup array is written.
+ * @param label the MPLS label.
+ * @param eos   the end-of-stack bit.
+ * @param dpo   the DPO to install; asserted to be of type
+ *              DPO_LOAD_BALANCE. Only its index is stored.
+ */
+void
+mpls_fib_forwarding_table_update (mpls_fib_t *mf,
+                                 mpls_label_t label,
+                                 mpls_eos_bit_t eos,
+                                 const dpo_id_t *dpo)
+{
+    mpls_label_t slot;
+
+    ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
+
+    slot = mpls_fib_entry_mk_key(label, eos);
+    mf->mf_lbs[slot] = dpo->dpoi_index;
+}
+
+/**
+ * @brief Point the data-path slot for a label and EOS bit back at the
+ *        shared MPLS drop load-balance.
+ *
+ * @param mf    the MPLS FIB whose lookup array is written.
+ * @param label the MPLS label.
+ * @param eos   the end-of-stack bit.
+ */
+void
+mpls_fib_forwarding_table_reset (mpls_fib_t *mf,
+                                mpls_label_t label,
+                                mpls_eos_bit_t eos)
+{
+    mpls_label_t slot;
+
+    slot = mpls_fib_entry_mk_key(label, eos);
+    mf->mf_lbs[slot] = mpls_fib_drop_dpo_index;
+}
+
+/**
+ * @brief Get the flow-hash configuration of an MPLS FIB.
+ *
+ * @param fib_index currently ignored.
+ * @return always 0.
+ */
+flow_hash_config_t
+mpls_fib_table_get_flow_hash_config (u32 fib_index)
+{
+    /* FIXME: no per-table configuration is consulted yet. */
+    return (0);
+}
+
+/**
+ * @brief Print every entry of an MPLS FIB in detail.
+ *
+ * The entry indices are gathered from the table's hash, sorted with
+ * fib_entry_cmp_for_sort, and then formatted one per output call.
+ *
+ * @param mpls_fib the table to dump.
+ * @param vm       the vlib main used for CLI output.
+ */
+static void
+mpls_fib_table_show_all (const mpls_fib_t *mpls_fib,
+                        vlib_main_t * vm)
+{
+    fib_node_index_t *sorted = NULL, *cursor, entry_index;
+    mpls_label_t key;
+
+    /* collect every entry index held in the table's hash */
+    hash_foreach(key, entry_index, mpls_fib->mf_entries,
+    ({
+        vec_add1(sorted, entry_index);
+    }));
+
+    vec_sort_with_function(sorted, fib_entry_cmp_for_sort);
+
+    vec_foreach(cursor, sorted)
+    {
+        vlib_cli_output (vm, "%U",
+                         format_fib_entry, *cursor,
+                         FIB_ENTRY_FORMAT_DETAIL);
+    }
+
+    vec_free(sorted);
+}
+
+/**
+ * @brief Print the entries for one label, for each EOS value in turn.
+ *
+ * EOS values for which the table has no entry are silently skipped.
+ *
+ * @param mpls_fib the table to search.
+ * @param label    the label to show.
+ * @param vm       the vlib main used for CLI output.
+ */
+static void
+mpls_fib_table_show_one (const mpls_fib_t *mpls_fib,
+                        mpls_label_t label,
+                        vlib_main_t * vm)
+{
+    fib_node_index_t entry_index;
+    mpls_eos_bit_t eos;
+
+    FOR_EACH_MPLS_EOS_BIT(eos)
+    {
+        entry_index = mpls_fib_table_lookup(mpls_fib, label, eos);
+
+        if (entry_index != FIB_NODE_INDEX_INVALID)
+        {
+            vlib_cli_output (vm, "%U",
+                             format_fib_entry, entry_index,
+                             FIB_ENTRY_FORMAT_DETAIL);
+        }
+    }
+}
+
+/**
+ * @brief CLI handler for "show mpls fib".
+ *
+ * Accepts, in any order, a bare number (the label to show) and
+ * "table <n>" (restrict output to the table with that table ID). Parsing
+ * stops at the first token that matches neither. Without a label every
+ * entry of each selected table is shown; with one, only the entries for
+ * that label.
+ *
+ * @param vm    the vlib main used for CLI output.
+ * @param input the unparsed command arguments.
+ * @param cmd   the CLI command descriptor (unused).
+ * @return always 0 (no error).
+ */
+static clib_error_t *
+mpls_fib_show (vlib_main_t * vm,
+              unformat_input_t * input,
+              vlib_cli_command_t * cmd)
+{
+    fib_table_t * fib_table;
+    mpls_label_t label;
+    int table_id;
+
+    /* defaults: all tables, all labels */
+    table_id = -1;
+    label = MPLS_LABEL_INVALID;
+
+    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+       if (unformat (input, "%d", &label))
+           continue;
+       else if (unformat (input, "table %d", &table_id))
+           ;
+       else
+           break;
+    }
+
+    pool_foreach (fib_table, mpls_main.fibs,
+    ({
+       if (table_id >= 0 && table_id != fib_table->ft_table_id)
+           continue;
+
+       /*
+        * The index is the element's offset from the pool base; the
+        * operands were previously reversed, printing the negated index.
+        */
+       vlib_cli_output (vm, "%v, fib_index %d",
+                        fib_table->ft_desc,
+                        (u32) (fib_table - mpls_main.fibs));
+
+       if (MPLS_LABEL_INVALID == label)
+       {
+           mpls_fib_table_show_all(&(fib_table->mpls), vm);
+       }
+       else
+       {
+           mpls_fib_table_show_one(&(fib_table->mpls), label, vm);
+       }
+    }));
+
+    return 0;
+}
+
+/*
+ * Registration of the "show mpls fib" CLI command. The help text lists
+ * the arguments mpls_fib_show actually parses: an optional table ID and
+ * an optional label.
+ */
+VLIB_CLI_COMMAND (mpls_fib_show_command, static) = {
+    .path = "show mpls fib",
+    .short_help = "show mpls fib [table <n>] [<label>]",
+    .function = mpls_fib_show,
+};
diff --git a/vnet/vnet/fib/mpls_fib.h b/vnet/vnet/fib/mpls_fib.h
new file mode 100644 (file)
index 0000000..42c9a86
--- /dev/null
@@ -0,0 +1,106 @@
+/*
+ * mpls_fib.h: The Label/MPLS FIB
+ *
+ * Copyright (c) 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __MPLS_FIB_TABLE_H__
+#define __MPLS_FIB_TABLE_H__
+
+#include <vnet/vnet.h>
+#include <vnet/mpls/mpls.h>
+#include <vnet/fib/fib_types.h>
+#include <vnet/dpo/dpo.h>
+#include <vnet/mpls/mpls.h>
+#include <vnet/fib/fib_table.h>
+
+/**
+ * @brief Get the MPLS FIB at an index of the mpls_main.fibs pool.
+ *
+ * @param index pool index of the table.
+ * @return pointer to the mpls member of that pool element, or NULL when
+ *         the index is free.
+ */
+static inline mpls_fib_t*
+mpls_fib_get (fib_node_index_t index)
+{
+    if (pool_is_free_index(mpls_main.fibs, index))
+       return (NULL);
+
+    return (&(pool_elt_at_index(mpls_main.fibs, index)->mpls));
+}
+
+extern u32 mpls_fib_table_find_or_create_and_lock(u32 table_id);
+extern u32 mpls_fib_table_create_and_lock(void);
+// extern mpls_fib_t * mpls_fib_find(u32 table_id);
+extern u32 mpls_fib_index_from_table_id(u32 table_id);
+
+extern u8 *format_mpls_fib_table_name(u8 * s, va_list * args);
+
+extern fib_node_index_t mpls_fib_table_entry_add_from_ip_fib_entry (
+    u32 table_id,
+    mpls_label_t label,
+    mpls_eos_bit_t eos,
+    fib_node_index_t fib_entry_index);
+
+
+extern fib_node_index_t mpls_fib_table_lookup(const mpls_fib_t *mf,
+                                             mpls_label_t label,
+                                             mpls_eos_bit_t eos);
+
+extern void mpls_fib_table_entry_remove(mpls_fib_t *mf,
+                                       mpls_label_t label,
+                                       mpls_eos_bit_t eos);
+extern void mpls_fib_table_entry_insert(mpls_fib_t *mf,
+                                       mpls_label_t label,
+                                       mpls_eos_bit_t eos,
+                                       fib_node_index_t fei);
+extern void mpls_fib_table_destroy(mpls_fib_t *mf);
+
+
+
+extern void mpls_fib_forwarding_table_update(mpls_fib_t *mf,
+                                            mpls_label_t label,
+                                            mpls_eos_bit_t eos,
+                                            const dpo_id_t *dpo);
+extern void mpls_fib_forwarding_table_reset(mpls_fib_t *mf,
+                                           mpls_label_t label,
+                                           mpls_eos_bit_t eos);
+
+/**
+ * @brief
+ *  Data-path lookup: use the label and S (end-of-stack) bit of an MPLS
+ *  header to fetch the load-balance index from the table's lookup array.
+ *
+ * @param mpls_fib_index index of the MPLS FIB to search; the table
+ *                       returned by mpls_fib_get is dereferenced without
+ *                       a NULL check.
+ * @param hdr            the MPLS header, read in network byte order.
+ * @return the load-balance index stored for that label and S bit.
+ */
+static inline index_t
+mpls_fib_table_forwarding_lookup (u32 mpls_fib_index,
+                                 const mpls_unicast_header_t *hdr)
+{
+    mpls_label_t hdr_word;
+    u32 key;
+
+    /* the whole header word, converted to host order */
+    hdr_word = clib_net_to_host_u32(hdr->label_exp_s_ttl);
+
+    /* label in the upper bits, S bit in bit 0 */
+    key = (vnet_mpls_uc_get_label(hdr_word) << 1) |
+          vnet_mpls_uc_get_s(hdr_word);
+
+    return (mpls_fib_get(mpls_fib_index)->mf_lbs[key]);
+}
+
+/**
+ * @brief Get the MPLS FIB index recorded for an interface.
+ *
+ * @param sw_if_index software interface index; must be a valid index into
+ *                    mpls_main.fib_index_by_sw_if_index.
+ * @return the element of mpls_main.fib_index_by_sw_if_index for that
+ *         interface.
+ */
+static inline u32
+mpls_fib_table_get_index_for_sw_if_index (u32 sw_if_index)
+{
+    mpls_main_t *mm = &mpls_main;
+
+    /*
+     * The index must fall inside the vector that is about to be read.
+     * (The comparison was previously inverted, so it tripped on every
+     * valid index and let out-of-range ones through.)
+     */
+    ASSERT(sw_if_index < vec_len(mm->fib_index_by_sw_if_index));
+
+    return (mm->fib_index_by_sw_if_index[sw_if_index]);
+}
+
+extern flow_hash_config_t mpls_fib_table_get_flow_hash_config(u32 fib_index);
+
+#endif
index f00977c..9f8adc7 100644 (file)
 
 #include <vnet/vnet.h>
 #include <vnet/gre/gre.h>
+#include <vnet/adj/adj.h>
 
 gre_main_t gre_main;
 
-typedef CLIB_PACKED (struct {
-  ip4_header_t ip4;
-  gre_header_t gre;
-}) ip4_and_gre_header_t;
-
 typedef struct {
   union {
     ip4_and_gre_header_t ip4_and_gre;
@@ -233,179 +229,39 @@ gre_interface_tx (vlib_main_t * vm,
       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
 
       /* 
-       * As long as we have enough pkts left to process two pkts
-       * and prefetch two pkts...
+       * FIXME DUAL LOOP
        */
-      while (n_left_from >= 4 && n_left_to_next >= 2)
-       {
-          vlib_buffer_t * b0, * b1;
-          ip4_header_t * ip0, * ip1;
-          ip4_and_gre_union_t * h0, * h1;
-         u32 bi0, next0, bi1, next1;
-         __attribute__((unused)) u8 error0, error1;
-          u16 gre_protocol0, gre_protocol1;
-      
-         /* Prefetch the next iteration */
-         {
-           vlib_buffer_t * p2, * p3;
-
-           p2 = vlib_get_buffer (vm, from[2]);
-           p3 = vlib_get_buffer (vm, from[3]);
-
-           vlib_prefetch_buffer_header (p2, LOAD);
-           vlib_prefetch_buffer_header (p3, LOAD);
-
-            /* 
-             * Prefetch packet data. We expect to overwrite
-             * the inbound L2 header with an ip header and a
-             * gre header. Might want to prefetch the last line
-             * of rewrite space as well; need profile data
-             */
-           CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
-           CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
-         }
-
-          /* Pick up the next two buffer indices */
-         bi0 = from[0];
-         bi1 = from[1];
-
-          /* Speculatively enqueue them where we sent the last buffer */
-         to_next[0] = bi0;
-         to_next[1] = bi1;
-         from += 2;
-         to_next += 2;
-         n_left_to_next -= 2;
-         n_left_from -= 2;
-      
-         b0 = vlib_get_buffer (vm, bi0);
-         b1 = vlib_get_buffer (vm, bi1);
-
-          vnet_buffer (b0)->sw_if_index[VLIB_TX] = t->outer_fib_index;
-          vnet_buffer (b1)->sw_if_index[VLIB_TX] = t->outer_fib_index;
-
-          if (PREDICT_FALSE(t->teb))
-          {
-            gre_protocol0 = clib_net_to_host_u16(GRE_PROTOCOL_teb);
-            gre_protocol1 = clib_net_to_host_u16(GRE_PROTOCOL_teb);
-          }
-          else
-          {
-            ip0 = vlib_buffer_get_current (b0);
-            gre_protocol0 = clib_net_to_host_u16 (0x800);
-            gre_protocol0 =
-                ((ip0->ip_version_and_header_length & 0xF0) == 0x60) ?
-                0x86DD : gre_protocol0;
-
-            ip1 = vlib_buffer_get_current (b1);
-            gre_protocol1 = clib_net_to_host_u16 (0x800);
-            gre_protocol1 =
-                ((ip1->ip_version_and_header_length & 0xF0) == 0x60) ?
-                0x86DD : gre_protocol1;
-          }
-
-          vlib_buffer_advance (b0, -sizeof(*h0));
-          vlib_buffer_advance (b1, -sizeof(*h1));
-
-          h0 = vlib_buffer_get_current (b0);
-          h1 = vlib_buffer_get_current (b1);
-          h0->as_u64[0] = 0;
-          h0->as_u64[1] = 0;
-          h0->as_u64[2] = 0;
-
-          h1->as_u64[0] = 0;
-          h1->as_u64[1] = 0;
-          h1->as_u64[2] = 0;
-
-          ip0 = &h0->ip4_and_gre.ip4;
-          h0->ip4_and_gre.gre.protocol = gre_protocol0;
-          ip0->ip_version_and_header_length = 0x45;
-          ip0->ttl = 254;
-          ip0->protocol = IP_PROTOCOL_GRE;
-
-          ip1 = &h1->ip4_and_gre.ip4;
-          h1->ip4_and_gre.gre.protocol = gre_protocol1;
-          ip1->ip_version_and_header_length = 0x45;
-          ip1->ttl = 254;
-          ip1->protocol = IP_PROTOCOL_GRE;
-
-          ip0->length = 
-            clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
-          ip1->length = 
-            clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1));
-          ip0->src_address.as_u32 = t->tunnel_src.as_u32;
-          ip1->src_address.as_u32 = t->tunnel_src.as_u32;
-          ip0->dst_address.as_u32 = t->tunnel_dst.as_u32;
-          ip1->dst_address.as_u32 = t->tunnel_dst.as_u32;
-          ip0->checksum = ip4_header_checksum (ip0);
-          ip1->checksum = ip4_header_checksum (ip1);
-
-          /* ip4_lookup will route to the tunnel partner */
-          next0 = GRE_OUTPUT_NEXT_LOOKUP;
-          next1 = GRE_OUTPUT_NEXT_LOOKUP;
-          error0 = GRE_ERROR_NONE;
-          error1 = GRE_ERROR_NONE;
-
-          /* 
-           * Enqueue 2 pkts. This macro deals with next0 != next1,
-           * acquiring enqueue rights to the indicated next
-           * node input frame, etc.
-           */
-         vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
-                                          to_next, n_left_to_next,
-                                          bi0, bi1, next0, next1);
-       }
 
       while (n_left_from > 0 && n_left_to_next > 0)
        {
-         vlib_buffer_t * b0;
+          u32 bi0, adj_index0, next0;
+         const ip_adjacency_t * adj0;
+          const dpo_id_t *dpo0;
           ip4_header_t * ip0;
-          ip4_and_gre_union_t * h0;
-         u32 bi0, next0;
-         __attribute__((unused)) u8 error0;
-          u16 gre_protocol0;
-      
-         bi0 = to_next[0] = from[0];
-         from += 1;
-         n_left_from -= 1;
-         to_next += 1;
-         n_left_to_next -= 1;
-      
-         b0 = vlib_get_buffer (vm, bi0);
-
-          vnet_buffer (b0)->sw_if_index[VLIB_TX] = t->outer_fib_index;
+          vlib_buffer_t * b0;
+
+          bi0 = from[0];
+          to_next[0] = bi0;
+          from += 1;
+          to_next += 1;
+          n_left_from -= 1;
+          n_left_to_next -= 1;
+
+          b0 = vlib_get_buffer(vm, bi0);
           ip0 = vlib_buffer_get_current (b0);
-          if (PREDICT_FALSE(t->teb))
-          {
-            gre_protocol0 = clib_net_to_host_u16(GRE_PROTOCOL_teb);
-          }
-          else
-          {
-            gre_protocol0 = clib_net_to_host_u16 (0x800);
-            gre_protocol0 =
-                ((ip0->ip_version_and_header_length & 0xF0) == 0x60) ?
-                0x86DD : gre_protocol0;
-          }
-
-          vlib_buffer_advance (b0, -sizeof(*h0));
-
-          h0 = vlib_buffer_get_current (b0);
-          h0->as_u64[0] = 0;
-          h0->as_u64[1] = 0;
-          h0->as_u64[2] = 0;
-
-          ip0 = &h0->ip4_and_gre.ip4;
-          h0->ip4_and_gre.gre.protocol = gre_protocol0;
-          ip0->ip_version_and_header_length = 0x45;
-          ip0->ttl = 254;
-          ip0->protocol = IP_PROTOCOL_GRE;
+
+          /* Fixup the checksum and len fields in the GRE tunnel encap
+           * that was applied at the midchain node */
           ip0->length = 
             clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
-          ip0->src_address.as_u32 = t->tunnel_src.as_u32;
-          ip0->dst_address.as_u32 = t->tunnel_dst.as_u32;
           ip0->checksum = ip4_header_checksum (ip0);
 
-          next0 = GRE_OUTPUT_NEXT_LOOKUP;
-          error0 = GRE_ERROR_NONE;
+          /* Follow the DPO on which the midchain is stacked */
+          adj_index0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX];
+         adj0 = adj_get(adj_index0);
+          dpo0 = &adj0->sub_type.midchain.next_dpo;
+          next0 = dpo0->dpoi_next_node;
+          vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
 
           if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) 
             {
index ad599d2..beb13d9 100644 (file)
@@ -25,6 +25,7 @@
 #include <vnet/ip/ip4_packet.h>
 #include <vnet/pg/pg.h>
 #include <vnet/ip/format.h>
+#include <vnet/adj/adj_types.h>
 
 extern vnet_hw_interface_class_t gre_hw_interface_class;
 
@@ -50,12 +51,44 @@ typedef struct {
 } gre_protocol_info_t;
 
 typedef struct {
+  /**
+   * Linkage into the FIB object graph
+   */
+  fib_node_t node;
+
+  /**
+   * The tunnel's source/local address
+   */
   ip4_address_t tunnel_src;
+  /**
+   * The tunnel's destination/remote address
+   */
   ip4_address_t tunnel_dst;
+  /**
+   * The FIB in which the src/dst addresses are present
+   */
   u32 outer_fib_index;
   u32 hw_if_index;
   u32 sw_if_index;
   u8 teb; /* when set, the rewrite carries GRE_PROTOCOL_teb */
+
+  /**
+   * The FIB entry sourced by the tunnel for its destination prefix
+   */
+  fib_node_index_t fib_entry_index;
+
+  /**
+   * The tunnel is a child of the FIB entry for its destination. This is
+   * so it receives updates when the forwarding information for that entry
+   * changes.
+   * This is the tunnel's sibling index on the FIB entry's dependency list.
+   */
+  u32 sibling_index;
+
+  /**
+   * The index of the midchain adjacency created for this tunnel,
+   * one slot per FIB link type.
+   */
+  adj_index_t adj_index[FIB_LINK_NUM];
 } gre_tunnel_t;
 
 typedef struct {
@@ -80,6 +113,15 @@ typedef struct {
   vnet_main_t * vnet_main;
 } gre_main_t;
 
+/**
+ * @brief IPv4 and GRE header.
+ *
+ * An IPv4 header followed by a GRE header, declared with CLIB_PACKED.
+ */
+typedef CLIB_PACKED (struct {
+  ip4_header_t ip4;
+  gre_header_t gre;
+}) ip4_and_gre_header_t;
+
 always_inline gre_protocol_info_t *
 gre_get_protocol_info (gre_main_t * em, gre_protocol_t protocol)
 {
index 864c384..10e9ff9 100644 (file)
 #include <vnet/pg/pg.h>
 #include <vnet/gre/gre.h>
 #include <vnet/ip/format.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/adj/adj_midchain.h>
+#include <vnet/mpls/mpls.h>
+
+/**
+ * @brief Build the tunnel DB key from the tunnel end-point addresses.
+ *
+ * The source address fills the upper 32 bits and the destination
+ * address the lower 32 bits.
+ *
+ * @param src           the tunnel source address.
+ * @param dst           the tunnel destination address.
+ * @param out_fib_index currently not part of the key.
+ * @return the 64 bit key.
+ */
+static inline u64
+gre_mk_key (const ip4_address_t *src,
+            const ip4_address_t *dst,
+            u32 out_fib_index)
+{
+  u64 key;
+
+  // FIXME. the fib index should be part of the key
+  key = (u64) src->as_u32;
+  key <<= 32;
+  key |= (u64) dst->as_u32;
+
+  return (key);
+}
 
-u8 * format_gre_tunnel (u8 * s, va_list * args)
+static u8 *
+format_gre_tunnel (u8 * s, va_list * args)
 {
   gre_tunnel_t * t = va_arg (*args, gre_tunnel_t *);
+  int detail = va_arg (*args, int);
   gre_main_t * gm = &gre_main;
 
   s = format (s,
@@ -32,11 +46,193 @@ u8 * format_gre_tunnel (u8 * s, va_list * args)
               format_ip4_address, &t->tunnel_dst,
               (t->teb ? "teb" : "ip"),
               t->outer_fib_index);
+  if (detail)
+  {
+      s = format (s, "\n  fib-entry:%d adj-ip4:%d adj-ip6:%d adj-mpls:%d",
+                  t->fib_entry_index,
+                  t->adj_index[FIB_LINK_IP4],
+                  t->adj_index[FIB_LINK_IP6],
+                  t->adj_index[FIB_LINK_MPLS]);
+  }
+
   return s;
 }
 
-int vnet_gre_add_del_tunnel
-  (vnet_gre_add_del_tunnel_args_t *a, u32 * sw_if_indexp)
+/**
+ * @brief Find the tunnel stored in the tunnel DB for a src/dst pair.
+ *
+ * @param src           the tunnel source address.
+ * @param dst           the tunnel destination address.
+ * @param out_fib_index passed through to gre_mk_key.
+ * @return the matching tunnel from the gre_main.tunnels pool, or NULL
+ *         when the key is not in the hash.
+ */
+static gre_tunnel_t *
+gre_tunnel_db_find (const ip4_address_t *src,
+                    const ip4_address_t *dst,
+                    u32 out_fib_index)
+{
+  u64 db_key = gre_mk_key(src, dst, out_fib_index);
+  uword * slot;
+
+  slot = hash_get (gre_main.tunnel_by_key, db_key);
+
+  if (NULL != slot)
+    return (pool_elt_at_index (gre_main.tunnels, slot[0]));
+
+  return (NULL);
+}
+
+/**
+ * @brief Record a tunnel in the tunnel DB.
+ *
+ * The value stored under the tunnel's key is its index in the
+ * gre_main.tunnels pool.
+ *
+ * @param t the tunnel to add.
+ */
+static void
+gre_tunnel_db_add (const gre_tunnel_t *t)
+{
+  u64 db_key = gre_mk_key(&t->tunnel_src, &t->tunnel_dst, t->outer_fib_index);
+
+  hash_set (gre_main.tunnel_by_key, db_key, t - gre_main.tunnels);
+}
+
+/**
+ * @brief Remove a tunnel's key from the tunnel DB.
+ *
+ * @param t the tunnel to remove.
+ */
+static void
+gre_tunnel_db_remove (const gre_tunnel_t *t)
+{
+  u64 db_key = gre_mk_key(&t->tunnel_src, &t->tunnel_dst, t->outer_fib_index);
+
+  hash_unset (gre_main.tunnel_by_key, db_key);
+}
+
+/**
+ * @brief Recover the tunnel that embeds a given FIB node.
+ *
+ * @param node the fib_node_t member of a gre_tunnel_t; on debug builds
+ *             its type is asserted to be FIB_NODE_TYPE_GRE_TUNNEL.
+ * @return the enclosing gre_tunnel_t.
+ */
+static gre_tunnel_t *
+gre_tunnel_from_fib_node (fib_node_t *node)
+{
+    char *base;
+
+#if (CLIB_DEBUG > 0)
+    ASSERT(FIB_NODE_TYPE_GRE_TUNNEL == node->fn_type);
+#endif
+    /* step back from the member to the start of the structure */
+    base = ((char*)node) - STRUCT_OFFSET_OF(gre_tunnel_t, node);
+
+    return ((gre_tunnel_t*) base);
+}
+
+/*
+ * gre_tunnel_stack
+ *
+ * 'stack' (resolve the recursion for) the tunnel's midchain adjacencies.
+ *
+ * For every link type that has a valid midchain adjacency, the adjacency
+ * is stacked on the IP forwarding contributed by the FIB entry the
+ * tunnel resolves via.
+ */
+static void
+gre_tunnel_stack (gre_tunnel_t *gt)
+{
+    fib_link_t link;
+
+    FOR_EACH_FIB_LINK(link)
+    {
+        adj_index_t midchain = gt->adj_index[link];
+
+        /* link types without a midchain have nothing to stack */
+        if (midchain != ADJ_INDEX_INVALID)
+        {
+            adj_nbr_midchain_stack(
+                midchain,
+                fib_entry_contribute_ip_forwarding(gt->fib_entry_index));
+        }
+    }
+}
+
+/**
+ * Back-walk callback for a GRE tunnel's FIB node: re-stack the tunnel
+ * and let the walk continue.
+ */
+static fib_node_back_walk_rc_t
+gre_tunnel_back_walk (fib_node_t *node,
+                          fib_node_back_walk_ctx_t *ctx)
+{
+    gre_tunnel_t *tunnel = gre_tunnel_from_fib_node(node);
+
+    gre_tunnel_stack(tunnel);
+
+    return (FIB_NODE_BACK_WALK_CONTINUE);
+}
+
+/**
+ * Get the FIB node of the tunnel at the given index of the
+ * gre_main.tunnels pool.
+ */
+static fib_node_t*
+gre_tunnel_fib_node_get (fib_node_index_t index)
+{
+    gre_tunnel_t * tunnel = pool_elt_at_index(gre_main.tunnels, index);
+
+    return (&tunnel->node);
+}
+
+/**
+ * Last-lock-gone callback for a GRE tunnel's FIB node.
+ *
+ * Not expected to be called: the tunnel is a root of the graph, so it
+ * never has children and is never locked. Reaching here asserts.
+ */
+static void
+gre_tunnel_last_lock_gone (fib_node_t *node)
+{
+    ASSERT(0);
+}
+
+/*
+ * Virtual function table registered by GRE tunnels
+ * for participation in the FIB object graph.
+ */
+static const fib_node_vft_t gre_vft = {
+    .fnv_get = gre_tunnel_fib_node_get,
+    .fnv_back_walk = gre_tunnel_back_walk,
+    .fnv_last_lock = gre_tunnel_last_lock_gone,
+};
+
+/**
+ * @brief Map a FIB link type to the GRE protocol value that carries it.
+ *
+ * @param link the FIB link type.
+ * @return GRE_PROTOCOL_ip4, GRE_PROTOCOL_ip6 or GRE_PROTOCOL_mpls_unicast.
+ *         Any other link type asserts and falls back to GRE_PROTOCOL_ip4.
+ */
+static int
+gre_proto_from_fib_link (fib_link_t link)
+{
+    switch (link)
+    {
+    case FIB_LINK_MPLS:
+        return (GRE_PROTOCOL_mpls_unicast);
+    case FIB_LINK_IP6:
+        return (GRE_PROTOCOL_ip6);
+    case FIB_LINK_IP4:
+        return (GRE_PROTOCOL_ip4);
+    }
+
+    /* not one of the link types handled above */
+    ASSERT(0);
+    return (GRE_PROTOCOL_ip4);
+}
+
+/**
+ * @brief Build the IPv4 + GRE encapsulation for a tunnel and link type.
+ *
+ * The header is built in a freshly allocated, zero-filled vector. The
+ * IPv4 length is left at zero and the checksum is computed over the
+ * header as it stands; both are expected to be fixed up per packet.
+ *
+ * @param t    the tunnel providing the addresses and the teb flag.
+ * @param link the link type of the payload; ignored when t->teb is set.
+ * @return a vector of sizeof(ip4_and_gre_header_t) bytes holding the
+ *         rewrite.
+ */
+static u8 *
+gre_rewrite (gre_tunnel_t * t,
+             fib_link_t link)
+{
+  ip4_and_gre_header_t * h0;
+  u8 * rewrite_data = 0;
+  u16 gre_proto;
+
+  vec_validate_init_empty (rewrite_data, sizeof (*h0) - 1, 0);
+
+  h0 = (ip4_and_gre_header_t *) rewrite_data;
+
+  /*
+   * The protocol field goes on the wire, so both cases are converted
+   * host-to-net. (The TEB case previously used the net-to-host helper.)
+   */
+  gre_proto = (t->teb ? GRE_PROTOCOL_teb : gre_proto_from_fib_link(link));
+  h0->gre.protocol = clib_host_to_net_u16(gre_proto);
+
+  h0->ip4.ip_version_and_header_length = 0x45;
+  h0->ip4.ttl = 254;
+  h0->ip4.protocol = IP_PROTOCOL_GRE;
+  /* $$$ fixup ip4 header length and checksum after-the-fact */
+  h0->ip4.src_address.as_u32 = t->tunnel_src.as_u32;
+  h0->ip4.dst_address.as_u32 = t->tunnel_dst.as_u32;
+  h0->ip4.checksum = ip4_header_checksum (&h0->ip4);
+
+  return (rewrite_data);
+}
+
+static int 
+vnet_gre_tunnel_add (vnet_gre_add_del_tunnel_args_t *a,
+                     u32 * sw_if_indexp)
 {
   gre_main_t * gm = &gre_main;
   vnet_main_t * vnm = gm->vnet_main;
@@ -44,49 +240,45 @@ int vnet_gre_add_del_tunnel
   gre_tunnel_t * t;
   vnet_hw_interface_t * hi;
   u32 hw_if_index, sw_if_index;
-  u32 slot;
   u32 outer_fib_index;
-  uword * p;
-  u64 key;
   u8 address[6];
   clib_error_t *error;
+  fib_link_t linkt;
+  u8 *rewrite;
 
-  key = (u64)a->src.as_u32 << 32 | (u64)a->dst.as_u32;
-  p = hash_get (gm->tunnel_by_key, key);
+  outer_fib_index = ip4_fib_index_from_table_id(a->outer_fib_id);
 
-  if (a->is_add) {
-    /* check if same src/dst pair exists */
-    if (p)
-      return VNET_API_ERROR_INVALID_VALUE;
+  if (~0 == outer_fib_index)
+    return VNET_API_ERROR_NO_SUCH_FIB;
 
-    p = hash_get (im->fib_index_by_table_id, a->outer_fib_id);
-    if (! p)
-      return VNET_API_ERROR_NO_SUCH_FIB;
+  t = gre_tunnel_db_find(&a->src, &a->dst, a->outer_fib_id);
 
-    outer_fib_index = p[0];
+  if (NULL != t)
+    return VNET_API_ERROR_INVALID_VALUE;
 
-    pool_get_aligned (gm->tunnels, t, CLIB_CACHE_LINE_BYTES);
-    memset (t, 0, sizeof (*t));
+  pool_get_aligned (gm->tunnels, t, CLIB_CACHE_LINE_BYTES);
+  memset (t, 0, sizeof (*t));
+  fib_node_init(&t->node, FIB_NODE_TYPE_GRE_TUNNEL);
 
-    if (vec_len (gm->free_gre_tunnel_hw_if_indices) > 0) {
-        vnet_interface_main_t * im = &vnm->interface_main;
+  if (vec_len (gm->free_gre_tunnel_hw_if_indices) > 0) {
+      vnet_interface_main_t * im = &vnm->interface_main;
 
-        hw_if_index = gm->free_gre_tunnel_hw_if_indices
+      hw_if_index = gm->free_gre_tunnel_hw_if_indices
           [vec_len (gm->free_gre_tunnel_hw_if_indices)-1];
-          _vec_len (gm->free_gre_tunnel_hw_if_indices) -= 1;
+      _vec_len (gm->free_gre_tunnel_hw_if_indices) -= 1;
 
-        hi = vnet_get_hw_interface (vnm, hw_if_index);
-        hi->dev_instance = t - gm->tunnels;
-        hi->hw_instance = hi->dev_instance;
+      hi = vnet_get_hw_interface (vnm, hw_if_index);
+      hi->dev_instance = t - gm->tunnels;
+      hi->hw_instance = hi->dev_instance;
 
-        /* clear old stats of freed tunnel before reuse */
-        sw_if_index = hi->sw_if_index;
-        vnet_interface_counter_lock(im);
-        vlib_zero_combined_counter
+      /* clear old stats of freed tunnel before reuse */
+      sw_if_index = hi->sw_if_index;
+      vnet_interface_counter_lock(im);
+      vlib_zero_combined_counter
           (&im->combined_sw_if_counters[VNET_INTERFACE_COUNTER_TX], sw_if_index);
-        vlib_zero_combined_counter
+      vlib_zero_combined_counter
           (&im->combined_sw_if_counters[VNET_INTERFACE_COUNTER_RX], sw_if_index);
-        vlib_zero_simple_counter
+      vlib_zero_simple_counter
           (&im->sw_if_counters[VNET_INTERFACE_COUNTER_DROP], sw_if_index);
         vnet_interface_counter_unlock(im);
     } else {
@@ -111,67 +303,186 @@ int vnet_gre_add_del_tunnel
           return VNET_API_ERROR_INVALID_REGISTRATION;
         }
       } else {
-        hw_if_index = vnet_register_interface
-          (vnm, gre_device_class.index, t - gm->tunnels,
-           gre_hw_interface_class.index,
-           t - gm->tunnels);
+       hw_if_index = vnet_register_interface
+           (vnm, gre_device_class.index, t - gm->tunnels,
+            gre_hw_interface_class.index,
+            t - gm->tunnels);
       }
       hi = vnet_get_hw_interface (vnm, hw_if_index);
       sw_if_index = hi->sw_if_index;
     }
 
-    t->hw_if_index = hw_if_index;
-    t->outer_fib_index = outer_fib_index;
-    t->sw_if_index = sw_if_index;
+  t->hw_if_index = hw_if_index;
+  t->outer_fib_index = outer_fib_index;
+  t->sw_if_index = sw_if_index;
 
-    vec_validate_init_empty (gm->tunnel_index_by_sw_if_index, sw_if_index, ~0);
-    gm->tunnel_index_by_sw_if_index[sw_if_index] = t - gm->tunnels;
+  vec_validate_init_empty (gm->tunnel_index_by_sw_if_index, sw_if_index, ~0);
+  gm->tunnel_index_by_sw_if_index[sw_if_index] = t - gm->tunnels;
 
-    vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
-    im->fib_index_by_sw_if_index[sw_if_index] = t->outer_fib_index;
+  vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
+  im->fib_index_by_sw_if_index[sw_if_index] = t->outer_fib_index;
+  ip4_sw_interface_enable_disable(sw_if_index, 1);
 
-    hi->min_packet_bytes = 64 + sizeof (gre_header_t) + sizeof (ip4_header_t);
-    hi->per_packet_overhead_bytes =
+  hi->min_packet_bytes = 64 + sizeof (gre_header_t) + sizeof (ip4_header_t);
+  hi->per_packet_overhead_bytes =
       /* preamble */ 8 + /* inter frame gap */ 12;
 
-    /* Standard default gre MTU. */
-    hi->max_l3_packet_bytes[VLIB_RX] = hi->max_l3_packet_bytes[VLIB_TX] = 9000;
+  /* Standard default gre MTU. */
+  hi->max_l3_packet_bytes[VLIB_RX] = hi->max_l3_packet_bytes[VLIB_TX] = 9000;
+
+  clib_memcpy (&t->tunnel_src, &a->src, sizeof (t->tunnel_src));
+  clib_memcpy (&t->tunnel_dst, &a->dst, sizeof (t->tunnel_dst));
+
+  gre_tunnel_db_add(t);
+
+  /*
+   * source the FIB entry for the tunnel's destination
+   * and become a child thereof. The tunnel will then get poked
+   * when the forwarding for the entry updates, and the tunnel can
+   * re-stack accordingly
+   */
+  const fib_prefix_t tun_dst_pfx = {
+      .fp_len = 32,
+      .fp_proto = FIB_PROTOCOL_IP4,
+      .fp_addr = {
+          .ip4 = t->tunnel_dst,
+      }
+  };
+
+  t->fib_entry_index =
+      fib_table_entry_special_add(outer_fib_index,
+                                  &tun_dst_pfx,
+                                  FIB_SOURCE_RR,
+                                  FIB_ENTRY_FLAG_NONE,
+                                  ADJ_INDEX_INVALID);
+  t->sibling_index =
+      fib_entry_child_add(t->fib_entry_index,
+                          FIB_NODE_TYPE_GRE_TUNNEL,
+                          t - gm->tunnels);
+
+  /*
+   * create and update the midchain adj this tunnel sources.
+   * We could be smarter here and trigger this on an interface proto enable,
+   * like we do for MPLS.
+   */
+  for (linkt = FIB_LINK_IP4; linkt <= FIB_LINK_IP6; linkt++)
+  {
+      t->adj_index[linkt] = adj_nbr_add_or_lock(FIB_PROTOCOL_IP4,
+                                                linkt,
+                                                &zero_addr,
+                                                sw_if_index);
+
+      rewrite = gre_rewrite(t, linkt);
+      adj_nbr_midchain_update_rewrite(t->adj_index[linkt],
+                                      hi->tx_node_index,
+                                      rewrite);
+      vec_free(rewrite);
+  }
+  t->adj_index[FIB_LINK_MPLS] = ADJ_INDEX_INVALID;
 
-    t->teb = a->teb;
-    clib_memcpy (&t->tunnel_src, &a->src, sizeof (t->tunnel_src));
-    clib_memcpy (&t->tunnel_dst, &a->dst, sizeof (t->tunnel_dst));
+  t->teb = a->teb;
+  clib_memcpy (&t->tunnel_src, &a->src, sizeof (t->tunnel_src));
+  clib_memcpy (&t->tunnel_dst, &a->dst, sizeof (t->tunnel_dst));
+  gre_tunnel_stack(t);
 
-    hash_set (gm->tunnel_by_key, key, t - gm->tunnels);
+  if (sw_if_indexp)
+    *sw_if_indexp = sw_if_index;
 
-    slot = vlib_node_add_named_next_with_slot
-      (vnm->vlib_main, hi->tx_node_index, "ip4-lookup", GRE_OUTPUT_NEXT_LOOKUP);
+  return 0;
+}
 
-    ASSERT (slot == GRE_OUTPUT_NEXT_LOOKUP);
+static int 
+vnet_gre_tunnel_delete (vnet_gre_add_del_tunnel_args_t *a,
+                        u32 * sw_if_indexp)
+{
+  gre_main_t * gm = &gre_main;
+  vnet_main_t * vnm = gm->vnet_main;
+  gre_tunnel_t * t;
+  fib_link_t linkt;
+  u32 sw_if_index;
+  /* Find the tunnel by source, destination and outer FIB id; it must already exist. */
+  t = gre_tunnel_db_find(&a->src, &a->dst, a->outer_fib_id);
 
-  } else { /* !is_add => delete */
-    /* tunnel needs to exist */
-    if (! p)
-      return VNET_API_ERROR_NO_SUCH_ENTRY;
+  if (NULL == t)
+    return VNET_API_ERROR_NO_SUCH_ENTRY;
 
-    t = pool_elt_at_index (gm->tunnels, p[0]);
+  sw_if_index = t->sw_if_index;
+  vnet_sw_interface_set_flags (vnm, sw_if_index, 0 /* down */);
+  /* make sure tunnel is removed from l2 bd or xconnect */
+  set_int_l2_mode(gm->vlib_main, vnm, MODE_L3, sw_if_index, 0, 0, 0, 0);
+  vec_add1 (gm->free_gre_tunnel_hw_if_indices, t->hw_if_index);
+  gm->tunnel_index_by_sw_if_index[sw_if_index] = ~0;
+  ip4_sw_interface_enable_disable(sw_if_index, 0);
 
-    sw_if_index = t->sw_if_index;
-    vnet_sw_interface_set_flags (vnm, sw_if_index, 0 /* down */);
-    /* make sure tunnel is removed from l2 bd or xconnect */
-    set_int_l2_mode(gm->vlib_main, vnm, MODE_L3, sw_if_index, 0, 0, 0, 0);
-    vec_add1 (gm->free_gre_tunnel_hw_if_indices, t->hw_if_index);
-    gm->tunnel_index_by_sw_if_index[sw_if_index] = ~0;
+  fib_entry_child_remove(t->fib_entry_index,
+                         t->sibling_index);
+  fib_table_entry_delete_index(t->fib_entry_index,
+                               FIB_SOURCE_RR);
 
-    hash_unset (gm->tunnel_by_key, key);
-    pool_put (gm->tunnels, t);
+  FOR_EACH_FIB_LINK(linkt)
+  {
+      adj_unlock(t->adj_index[linkt]);
   }
 
+  gre_tunnel_db_remove(t);
+  fib_node_deinit(&t->node);
+  pool_put (gm->tunnels, t);
+  /* sw_if_index was saved before the tunnel was freed; report it if asked. */
   if (sw_if_indexp)
     *sw_if_indexp = sw_if_index;
 
   return 0;
 }
 
+int
+vnet_gre_add_del_tunnel (vnet_gre_add_del_tunnel_args_t *a,
+                         u32 * sw_if_indexp)
+{
+  /* a->is_add selects the operation; the helper's status code is returned as-is. */
+  return (a->is_add
+          ? vnet_gre_tunnel_add(a, sw_if_indexp)
+          : vnet_gre_tunnel_delete(a, sw_if_indexp));
+}
+
+static void
+gre_sw_interface_mpls_state_change (u32 sw_if_index,
+                                    u32 is_enable)
+{
+  gre_main_t *gm = &gre_main;
+  vnet_hw_interface_t * hi;
+  gre_tunnel_t *t;
+  u8 *rewrite;
+  /* MPLS state-change callback. Skip interfaces with no GRE tunnel (index beyond the vector, or ~0). */
+  if ((vec_len(gm->tunnel_index_by_sw_if_index) <= sw_if_index) ||
+      (~0 == gm->tunnel_index_by_sw_if_index[sw_if_index]))
+      return;
+
+  t = pool_elt_at_index(gm->tunnels,
+                        gm->tunnel_index_by_sw_if_index[sw_if_index]);
+  /* On enable, take an MPLS adjacency and give it the GRE rewrite; on disable, release it. */
+  if (is_enable)
+    {
+      hi = vnet_get_hw_interface (vnet_get_main(), t->hw_if_index);
+      t->adj_index[FIB_LINK_MPLS] =
+          adj_nbr_add_or_lock(FIB_PROTOCOL_IP4,
+                              FIB_LINK_MPLS,
+                              &zero_addr,
+                              sw_if_index);
+
+      rewrite = gre_rewrite(t, FIB_LINK_MPLS);
+      adj_nbr_midchain_update_rewrite(t->adj_index[FIB_LINK_MPLS],
+                                      hi->tx_node_index,
+                                      rewrite);
+      vec_free(rewrite);
+    }
+  else
+    {
+      adj_unlock(t->adj_index[FIB_LINK_MPLS]);
+      t->adj_index[FIB_LINK_MPLS] = ADJ_INDEX_INVALID;
+    }
+  /* Either way, re-run the tunnel stacking. */
+  gre_tunnel_stack(t);
+}
 
 static clib_error_t *
 create_gre_tunnel_command_fn (vlib_main_t * vm,
@@ -216,13 +527,15 @@ create_gre_tunnel_command_fn (vlib_main_t * vm,
       return clib_error_return (0, "src and dst are identical");
 
   memset (a, 0, sizeof (*a));
-  a->is_add = is_add;
   a->outer_fib_id = outer_fib_id;
   a->teb = teb;
   clib_memcpy(&a->src, &src, sizeof(src));
   clib_memcpy(&a->dst, &dst, sizeof(dst));
 
-  rv = vnet_gre_add_del_tunnel (a, &sw_if_index);
+  if (is_add)
+    rv = vnet_gre_tunnel_add(a, &sw_if_index);
+  else
+    rv = vnet_gre_tunnel_delete(a, &sw_if_index);
 
   switch(rv)
     {
@@ -255,14 +568,32 @@ show_gre_tunnel_command_fn (vlib_main_t * vm,
 {
   gre_main_t * gm = &gre_main;
   gre_tunnel_t * t;
+  u32 ti = ~0;
 
   if (pool_elts (gm->tunnels) == 0)
     vlib_cli_output (vm, "No GRE tunnels configured...");
 
-  pool_foreach (t, gm->tunnels,
-  ({
-    vlib_cli_output (vm, "%U", format_gre_tunnel, t);
-  }));
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "%d", &ti))
+        ;
+      else
+        break;
+    }
+
+  if (~0 == ti)
+    {
+      pool_foreach (t, gm->tunnels,
+      ({
+          vlib_cli_output (vm, "%U", format_gre_tunnel, t, 0);
+      }));
+    }
+  else
+  {
+      t = pool_elt_at_index(gm->tunnels, ti);
+
+      vlib_cli_output (vm, "%U", format_gre_tunnel, t, 1);
+  }
 
   return 0;
 }
@@ -275,6 +606,11 @@ VLIB_CLI_COMMAND (show_gre_tunnel_command, static) = {
 /* force inclusion from application's main.c */
 clib_error_t *gre_interface_init (vlib_main_t *vm)
 {
+  vec_add1(mpls_main.mpls_interface_state_change_callbacks,
+           gre_sw_interface_mpls_state_change); /* appended to the MPLS interface state-change callback vector */
+  /* Register the GRE tunnel node type and its function table with the FIB. */
+  fib_node_register_type(FIB_NODE_TYPE_GRE_TUNNEL, &gre_vft);
+
   return 0;
 }
 VLIB_INIT_FUNCTION(gre_interface_init);
index d5ea4b6..b55f551 100644 (file)
@@ -18,6 +18,7 @@
 #include <vlib/vlib.h>
 #include <vnet/pg/pg.h>
 #include <vnet/gre/gre.h>
+#include <vnet/mpls/mpls.h>
 #include <vppinfra/sparse_vec.h>
 
 #define foreach_gre_input_next                 \
@@ -25,7 +26,8 @@ _(PUNT, "error-punt")                           \
 _(DROP, "error-drop")                           \
 _(ETHERNET_INPUT, "ethernet-input")             \
 _(IP4_INPUT, "ip4-input")                       \
-_(IP6_INPUT, "ip6-input")                      
+_(IP6_INPUT, "ip6-input")                      \
+_(MPLS_INPUT, "mpls-input")
 
 typedef enum {
 #define _(s,n) GRE_INPUT_NEXT_##s,
@@ -66,13 +68,17 @@ gre_input (vlib_main_t * vm,
           vlib_frame_t * from_frame)
 {
   gre_main_t * gm = &gre_main;
+  mpls_main_t * mm = &mpls_main;
+  ip4_main_t * ip4m = &ip4_main;
   gre_input_runtime_t * rt = (void *) node->runtime_data;
   __attribute__((unused)) u32 n_left_from, next_index, * from, * to_next;
   u64 cached_tunnel_key = (u64) ~0;
-  u32 cached_tunnel_sw_if_index = 0, tunnel_sw_if_index;
+  u32 cached_tunnel_sw_if_index = 0, tunnel_sw_if_index = 0;
   u32 cached_tunnel_fib_index = 0, tunnel_fib_index;
 
   u32 cpu_index = os_get_cpu_number();
+  u32 len;
+  vnet_interface_main_t *im = &gm->vnet_main->interface_main;
 
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
@@ -141,7 +147,7 @@ gre_input (vlib_main_t * vm,
          /* Index sparse array with network byte order. */
          protocol0 = h0->protocol;
          protocol1 = h1->protocol;
-         sparse_vec_index2 (rt->next_by_protocol, protocol0, protocol1, 
+         sparse_vec_index2 (rt->next_by_protocol, protocol0, protocol1,
                              &i0, &i1);
           next0 = vec_elt(rt->next_by_protocol, i0);
           next1 = vec_elt(rt->next_by_protocol, i1);
@@ -154,10 +160,10 @@ gre_input (vlib_main_t * vm,
           version1 = clib_net_to_host_u16 (h1->flags_and_version);
           verr1 =  version1 & GRE_VERSION_MASK;
 
-          b0->error = verr0 ? node->errors[GRE_ERROR_UNSUPPORTED_VERSION] 
+          b0->error = verr0 ? node->errors[GRE_ERROR_UNSUPPORTED_VERSION]
               : b0->error;
           next0 = verr0 ? GRE_INPUT_NEXT_DROP : next0;
-          b1->error = verr1 ? node->errors[GRE_ERROR_UNSUPPORTED_VERSION] 
+          b1->error = verr1 ? node->errors[GRE_ERROR_UNSUPPORTED_VERSION]
               : b1->error;
           next1 = verr1 ? GRE_INPUT_NEXT_DROP : next1;
 
@@ -176,7 +182,6 @@ gre_input (vlib_main_t * vm,
                   gre_tunnel_t * t;
                   uword * p;
 
-                  ip4_main_t * ip4m = &ip4_main;
                   p = hash_get (gm->tunnel_by_key, key);
                   if (!p)
                     {
@@ -199,19 +204,56 @@ gre_input (vlib_main_t * vm,
                   tunnel_sw_if_index = cached_tunnel_sw_if_index;
                   tunnel_fib_index = cached_tunnel_fib_index;
                 }
+            }
+          else if (PREDICT_TRUE(next0 == GRE_INPUT_NEXT_MPLS_INPUT))
+            {
+              u64 key = ((u64)(vnet_buffer(b0)->gre.dst) << 32) |
+                         (u64)(vnet_buffer(b0)->gre.src);
+
+              if (cached_tunnel_key != key)
+                {
+                  vnet_hw_interface_t * hi;
+                  mpls_gre_tunnel_t * t;
+                  uword * p;
 
-              u32 len = vlib_buffer_length_in_chain (vm, b0);
-              vnet_interface_main_t *im = &gm->vnet_main->interface_main;
-              vlib_increment_combined_counter (im->combined_sw_if_counters
-                                               + VNET_INTERFACE_COUNTER_RX,
-                                               cpu_index,
-                                               tunnel_sw_if_index,
-                                               1 /* packets */,
-                                               len /* bytes */);
-
-              vnet_buffer(b0)->sw_if_index[VLIB_TX] = tunnel_fib_index;
-              vnet_buffer(b0)->sw_if_index[VLIB_RX] = tunnel_sw_if_index;
+                  p = hash_get (gm->tunnel_by_key, key);
+                  if (!p)
+                    {
+                      next0 = GRE_INPUT_NEXT_DROP;
+                      b0->error = node->errors[GRE_ERROR_NO_SUCH_TUNNEL];
+                      goto drop0;
+                    }
+                  t = pool_elt_at_index (mm->gre_tunnels, p[0]);
+                  hi = vnet_get_hw_interface (gm->vnet_main,
+                                              t->hw_if_index);
+                  tunnel_sw_if_index = hi->sw_if_index;
+                  tunnel_fib_index = vec_elt (ip4m->fib_index_by_sw_if_index,
+                                              tunnel_sw_if_index);
+
+                  cached_tunnel_sw_if_index = tunnel_sw_if_index;
+                  cached_tunnel_fib_index = tunnel_fib_index;
+                }
+              else
+                {
+                  tunnel_sw_if_index = cached_tunnel_sw_if_index;
+                  tunnel_fib_index = cached_tunnel_fib_index;
+                }
             }
+          else
+            {
+               next0 = GRE_INPUT_NEXT_DROP;
+                goto drop0;
+            }
+          len = vlib_buffer_length_in_chain (vm, b0);
+          vlib_increment_combined_counter (im->combined_sw_if_counters
+                                           + VNET_INTERFACE_COUNTER_RX,
+                                           cpu_index,
+                                           tunnel_sw_if_index,
+                                           1 /* packets */,
+                                           len /* bytes */);
+
+          vnet_buffer(b0)->sw_if_index[VLIB_TX] = tunnel_fib_index;
+          vnet_buffer(b0)->sw_if_index[VLIB_RX] = tunnel_sw_if_index;
 
 drop0:
           if (PREDICT_FALSE(next1 == GRE_INPUT_NEXT_IP4_INPUT
@@ -227,7 +269,6 @@ drop0:
                   gre_tunnel_t * t;
                   uword * p;
 
-                  ip4_main_t * ip4m = &ip4_main;
                   p = hash_get (gm->tunnel_by_key, key);
                   if (!p)
                     {
@@ -250,23 +291,62 @@ drop0:
                   tunnel_sw_if_index = cached_tunnel_sw_if_index;
                   tunnel_fib_index = cached_tunnel_fib_index;
                 }
+            }
+          else if (PREDICT_TRUE(next1 == GRE_INPUT_NEXT_MPLS_INPUT))
+            {
+              u64 key = ((u64)(vnet_buffer(b1)->gre.dst) << 32) |
+                         (u64)(vnet_buffer(b1)->gre.src);
 
-              u32 len = vlib_buffer_length_in_chain (vm, b1);
-              vnet_interface_main_t *im = &gm->vnet_main->interface_main;
-              vlib_increment_combined_counter (im->combined_sw_if_counters
-                                               + VNET_INTERFACE_COUNTER_RX,
-                                               cpu_index,
-                                               tunnel_sw_if_index,
-                                               1 /* packets */,
-                                               len /* bytes */);
-
-              vnet_buffer(b1)->sw_if_index[VLIB_TX] = tunnel_fib_index;
-              vnet_buffer(b1)->sw_if_index[VLIB_RX] = tunnel_sw_if_index;
+              if (cached_tunnel_key != key)
+                {
+                  vnet_hw_interface_t * hi;
+                  mpls_gre_tunnel_t * t;
+                  uword * p;
+
+                  ip4_main_t * ip4m = &ip4_main;
+                  p = hash_get (gm->tunnel_by_key, key);
+                  if (!p)
+                    {
+                      next1 = GRE_INPUT_NEXT_DROP;
+                      b1->error = node->errors[GRE_ERROR_NO_SUCH_TUNNEL];
+                      goto drop1;
+                    }
+                  t = pool_elt_at_index (mm->gre_tunnels, p[0]);
+                  hi = vnet_get_hw_interface (gm->vnet_main,
+                                              t->hw_if_index);
+                  tunnel_sw_if_index = hi->sw_if_index;
+                  tunnel_fib_index = vec_elt (ip4m->fib_index_by_sw_if_index,
+                                              tunnel_sw_if_index);
+
+                  cached_tunnel_sw_if_index = tunnel_sw_if_index;
+                  cached_tunnel_fib_index = tunnel_fib_index;
+                }
+              else
+                {
+                  tunnel_sw_if_index = cached_tunnel_sw_if_index;
+                  tunnel_fib_index = cached_tunnel_fib_index;
+                }
             }
+          else
+            {
+               next1 = GRE_INPUT_NEXT_DROP;
+                goto drop1;
+            }
+          len = vlib_buffer_length_in_chain (vm, b1);
+          vlib_increment_combined_counter (im->combined_sw_if_counters
+                                           + VNET_INTERFACE_COUNTER_RX,
+                                           cpu_index,
+                                           tunnel_sw_if_index,
+                                           1 /* packets */,
+                                           len /* bytes */);
+
+          vnet_buffer(b1)->sw_if_index[VLIB_TX] = tunnel_fib_index;
+          vnet_buffer(b1)->sw_if_index[VLIB_RX] = tunnel_sw_if_index;
+
 drop1:
-          if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) 
+          if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
             {
-              gre_rx_trace_t *tr = vlib_add_trace (vm, node, 
+              gre_rx_trace_t *tr = vlib_add_trace (vm, node,
                                                    b0, sizeof (*tr));
               tr->tunnel_id = ~0;
               tr->length = ip0->length;
@@ -274,9 +354,9 @@ drop1:
               tr->dst.as_u32 = ip0->dst_address.as_u32;
             }
 
-          if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED)) 
+          if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
             {
-              gre_rx_trace_t *tr = vlib_add_trace (vm, node, 
+              gre_rx_trace_t *tr = vlib_add_trace (vm, node,
                                                    b1, sizeof (*tr));
               tr->tunnel_id = ~0;
               tr->length = ip1->length;
@@ -336,6 +416,7 @@ drop1:
           /* For IP payload we need to find source interface
              so we can increase counters and help forward node to
              pick right FIB */
+          /* RPF check for ip4/ip6 input */
           if (PREDICT_FALSE(next0 == GRE_INPUT_NEXT_IP4_INPUT
                             || next0 == GRE_INPUT_NEXT_IP6_INPUT
                             || next0 == GRE_INPUT_NEXT_ETHERNET_INPUT))
@@ -349,7 +430,6 @@ drop1:
                   gre_tunnel_t * t;
                   uword * p;
 
-                  ip4_main_t * ip4m = &ip4_main;
                   p = hash_get (gm->tunnel_by_key, key);
                   if (!p)
                     {
@@ -372,26 +452,63 @@ drop1:
                   tunnel_sw_if_index = cached_tunnel_sw_if_index;
                   tunnel_fib_index = cached_tunnel_fib_index;
                 }
+            }
+          else if (PREDICT_TRUE(next0 == GRE_INPUT_NEXT_MPLS_INPUT))
+            {
+              u64 key = ((u64)(vnet_buffer(b0)->gre.dst) << 32) |
+                         (u64)(vnet_buffer(b0)->gre.src);
 
-              u32 len = vlib_buffer_length_in_chain (vm, b0);
-              vnet_interface_main_t *im = &gm->vnet_main->interface_main;
-              vlib_increment_combined_counter (im->combined_sw_if_counters
-                                               + VNET_INTERFACE_COUNTER_RX,
-                                               cpu_index,
-                                               tunnel_sw_if_index,
-                                               1 /* packets */,
-                                               len /* bytes */);
-
-              vnet_buffer(b0)->sw_if_index[VLIB_TX] = tunnel_fib_index;
-              vnet_buffer(b0)->sw_if_index[VLIB_RX] = tunnel_sw_if_index;
+              if (cached_tunnel_key != key)
+                {
+                  vnet_hw_interface_t * hi;
+                  mpls_gre_tunnel_t * t;
+                  uword * p;
+
+                  p = hash_get (gm->tunnel_by_key, key);
+                  if (!p)
+                    {
+                      next0 = GRE_INPUT_NEXT_DROP;
+                      b0->error = node->errors[GRE_ERROR_NO_SUCH_TUNNEL];
+                      goto drop;
+                    }
+                  t = pool_elt_at_index (mm->gre_tunnels, p[0]);
+                  hi = vnet_get_hw_interface (gm->vnet_main,
+                                              t->hw_if_index);
+                  tunnel_sw_if_index = hi->sw_if_index;
+                  tunnel_fib_index = vec_elt (ip4m->fib_index_by_sw_if_index,
+                                              tunnel_sw_if_index);
+
+                  cached_tunnel_sw_if_index = tunnel_sw_if_index;
+                  cached_tunnel_fib_index = tunnel_fib_index;
+                }
+              else
+                {
+                  tunnel_sw_if_index = cached_tunnel_sw_if_index;
+                  tunnel_fib_index = cached_tunnel_fib_index;
+                }
+            }
+          else
+            {
+               next0 = GRE_INPUT_NEXT_DROP;
+                goto drop;
             }
+          len = vlib_buffer_length_in_chain (vm, b0);
+          vlib_increment_combined_counter (im->combined_sw_if_counters
+                                           + VNET_INTERFACE_COUNTER_RX,
+                                           cpu_index,
+                                           tunnel_sw_if_index,
+                                           1 /* packets */,
+                                           len /* bytes */);
+
+          vnet_buffer(b0)->sw_if_index[VLIB_TX] = tunnel_fib_index;
+          vnet_buffer(b0)->sw_if_index[VLIB_RX] = tunnel_sw_if_index;
 
 drop:
           if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) 
             {
               gre_rx_trace_t *tr = vlib_add_trace (vm, node, 
                                                    b0, sizeof (*tr));
-              tr->tunnel_id = ~0;
+              tr->tunnel_id = tunnel_sw_if_index;
               tr->length = ip0->length;
               tr->src.as_u32 = ip0->src_address.as_u32;
               tr->dst.as_u32 = ip0->dst_address.as_u32;
@@ -509,7 +626,7 @@ static clib_error_t * gre_input_init (vlib_main_t * vm)
   ASSERT(ip4_input);
   ip6_input = vlib_get_node_by_name (vm, (u8 *)"ip6-input");
   ASSERT(ip6_input);
-  mpls_unicast_input = vlib_get_node_by_name (vm, (u8 *)"mpls-gre-input");
+  mpls_unicast_input = vlib_get_node_by_name (vm, (u8 *)"mpls-input");
   ASSERT(mpls_unicast_input);
 
   gre_register_input_protocol (vm, GRE_PROTOCOL_teb,
index 67fc641..05eea03 100644 (file)
@@ -515,11 +515,11 @@ VLIB_REGISTER_NODE (handoff_dispatch_node) = {
   .n_next_nodes = HANDOFF_DISPATCH_N_NEXT,
 
   .next_nodes = {
-    [HANDOFF_DISPATCH_NEXT_DROP] = "error-drop",
-    [HANDOFF_DISPATCH_NEXT_ETHERNET_INPUT] = "ethernet-input",
-    [HANDOFF_DISPATCH_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
-    [HANDOFF_DISPATCH_NEXT_IP6_INPUT] = "ip6-input",
-    [HANDOFF_DISPATCH_NEXT_MPLS_INPUT] = "mpls-gre-input",
+        [HANDOFF_DISPATCH_NEXT_DROP] = "error-drop",
+        [HANDOFF_DISPATCH_NEXT_ETHERNET_INPUT] = "ethernet-input",
+        [HANDOFF_DISPATCH_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
+        [HANDOFF_DISPATCH_NEXT_IP6_INPUT] = "ip6-input",
+        [HANDOFF_DISPATCH_NEXT_MPLS_INPUT] = "mpls-input",
   },
 };
 /* *INDENT-ON* */
index 0083263..9320f56 100644 (file)
@@ -20,7 +20,7 @@
 #include <vnet/ethernet/ethernet.h>
 #include <vnet/ip/ip4_packet.h>
 #include <vnet/ip/ip6_packet.h>
-#include <vnet/mpls-gre/packet.h>
+#include <vnet/mpls/packet.h>
 
 typedef enum
 {
index 772c3bc..595ed14 100644 (file)
@@ -449,8 +449,16 @@ vnet_sw_interface_set_flags_helper (vnet_main_t * vnm, u32 sw_if_index,
          mc_serialize (vm->mc_main, &vnet_sw_interface_set_flags_msg, &s);
        }
 
-      error = call_elf_section_interface_callbacks
-       (vnm, sw_if_index, flags, vnm->sw_interface_admin_up_down_functions);
+      /* set the flags now before invoking the registered clients
+       * so that the state they query is consistent with the state here notified */
+      old_flags = si->flags;
+      si->flags &= ~mask;
+      si->flags |= flags;
+      if ((flags | old_flags) & VNET_SW_INTERFACE_FLAG_ADMIN_UP)
+       error = call_elf_section_interface_callbacks
+         (vnm, sw_if_index, flags,
+          vnm->sw_interface_admin_up_down_functions);
+      si->flags = old_flags;
 
       if (error)
        goto done;
index 7738bb6..9f032e9 100644 (file)
@@ -459,7 +459,8 @@ typedef enum
   VNET_INTERFACE_COUNTER_RX_MISS = 5,
   VNET_INTERFACE_COUNTER_RX_ERROR = 6,
   VNET_INTERFACE_COUNTER_TX_ERROR = 7,
-  VNET_N_SIMPLE_INTERFACE_COUNTER = 8,
+  VNET_INTERFACE_COUNTER_MPLS = 8,
+  VNET_N_SIMPLE_INTERFACE_COUNTER = 9,
   /* Combined counters. */
   VNET_INTERFACE_COUNTER_RX = 0,
   VNET_INTERFACE_COUNTER_TX = 1,
index 7d828f5..477716d 100644 (file)
@@ -45,6 +45,8 @@
 #include <vnet/vnet.h>
 #include <vnet/ip/ip.h>
 #include <vppinfra/bitmap.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/fib/ip6_fib.h>
 
 static int
 compare_interface_names (void *a1, void *a2)
@@ -290,8 +292,8 @@ show_sw_interfaces (vlib_main_t * vm,
          fib_index6 = vec_elt (im6->fib_index_by_sw_if_index,
                                si->sw_if_index);
 
-       fib4 = vec_elt_at_index (im4->fibs, fib_index4);
-       fib6 = vec_elt_at_index (im6->fibs, fib_index6);
+       fib4 = ip4_fib_get (fib_index4);
+       fib6 = ip6_fib_get (fib_index6);
 
        if (si->flags & VNET_SW_INTERFACE_FLAG_UNNUMBERED)
          vlib_cli_output
index 81a819a..735d47e 100644 (file)
@@ -105,6 +105,7 @@ clib_error_t *vnet_create_sw_interface (vnet_main_t * vnm,
 
 void vnet_delete_hw_interface (vnet_main_t * vnm, u32 hw_if_index);
 void vnet_delete_sw_interface (vnet_main_t * vnm, u32 sw_if_index);
+int vnet_sw_interface_is_p2p (vnet_main_t * vnm, u32 sw_if_index);
 
 always_inline uword
 vnet_sw_interface_get_flags (vnet_main_t * vnm, u32 sw_if_index)
index 4d73d6b..0d0eb6c 100644 (file)
@@ -48,6 +48,12 @@ unformat_function_t unformat_ip_protocol;
 format_function_t format_tcp_udp_port;
 unformat_function_t unformat_tcp_udp_port;
 
+typedef enum format_ip_adjacency_flags_t_ /* flag bits; presumably consumed by format_ip_adjacency - TODO confirm */
+{
+    FORMAT_IP_ADJACENCY_NONE,               /* 0: no flag set */
+    FORMAT_IP_ADJACENCY_DETAIL = (1 << 0),  /* bit 0; presumably requests detailed output - TODO confirm */
+} format_ip_adjacency_flags_t;
+
 format_function_t format_ip_adjacency;
 format_function_t format_ip_adjacency_packet_data;
 
index fc74e9d..f9fe486 100644 (file)
 
 typedef struct ip4_fib_t {
   /* Hash table for each prefix length mapping. */
-  uword * adj_index_by_dst_address[33];
-
-  /* Temporary vectors for holding new/old values for hash_set. */
-  uword * new_hash_values, * old_hash_values;
+  uword * fib_entry_by_dst_address[33];
 
   /* Mtrie for fast lookups.  Hash is used to maintain overlapping prefixes. */
   ip4_fib_mtrie_t mtrie;
@@ -62,7 +59,7 @@ typedef struct ip4_fib_t {
   u32 index;
 
   /* flow hash configuration */
-  u32 flow_hash_config;
+  flow_hash_config_t flow_hash_config;
 
   /* N-tuple classifier indices */
   u32 fwd_classify_table_index;
@@ -72,22 +69,6 @@ typedef struct ip4_fib_t {
 
 struct ip4_main_t;
 
-typedef void (ip4_add_del_route_function_t)
-  (struct ip4_main_t * im,
-   uword opaque,
-   ip4_fib_t * fib,
-   u32 flags,
-   ip4_address_t * address,
-   u32 address_length,
-   void * old_result,
-   void * new_result);
-
-typedef struct {
-  ip4_add_del_route_function_t * function;
-  uword required_flags;
-  uword function_opaque;
-} ip4_add_del_route_callback_t;
-
 typedef void (ip4_add_del_interface_address_function_t)
   (struct ip4_main_t * im,
    uword opaque,
@@ -115,23 +96,20 @@ typedef struct ip4_main_t {
   ip_lookup_main_t lookup_main;
 
   /** Vector of FIBs. */
-  ip4_fib_t * fibs;
+  struct fib_table_t_ * fibs;
 
   u32 fib_masks[33];
 
   /** Table index indexed by software interface. */
   u32 * fib_index_by_sw_if_index;
 
+  /* IP4 enabled count by software interface */
+  u8 * ip_enabled_by_sw_if_index;
+
   /** Hash table mapping table id to fib index.
      ID space is not necessarily dense; index space is dense. */
   uword * fib_index_by_table_id;
 
-  /** Vector of functions to call when routes are added/deleted. */
-  ip4_add_del_route_callback_t * add_del_route_callbacks;
-
-  /** Hash table mapping interface route rewrite adjacency index by sw if index. */
-  uword * interface_route_adj_index_by_sw_if_index;
-
   /** Functions to call when interface address changes. */
   ip4_add_del_interface_address_callback_t * add_del_interface_address_callbacks;
 
@@ -159,11 +137,15 @@ typedef struct ip4_main_t {
   u32 ip4_unicast_rx_feature_lookup;
   /** Built-in unicast feature path index, see @ref ip_feature_init_cast()  */
   u32 ip4_unicast_rx_feature_source_and_port_range_check;
+  /** Built-in unicast feature path index, see @ref ip_feature_init_cast()  */
+  u32 ip4_unicast_rx_feature_drop;
 
   /** Built-in multicast feature path index */
   u32 ip4_multicast_rx_feature_vpath;
   /** Built-in multicast feature path index */
   u32 ip4_multicast_rx_feature_lookup;
+  /** Built-in multicast feature path index */
+  u32 ip4_multicast_rx_feature_drop;
 
   /** Built-in unicast feature path index, see @ref ip_feature_init_cast()  */
   u32 ip4_unicast_tx_feature_source_and_port_range_check;
@@ -235,30 +217,13 @@ extern vlib_node_registration_t ip4_lookup_node;
 extern vlib_node_registration_t ip4_rewrite_node;
 extern vlib_node_registration_t ip4_rewrite_local_node;
 extern vlib_node_registration_t ip4_arp_node;
-
-u32 ip4_fib_lookup_with_table (ip4_main_t * im, u32 fib_index, ip4_address_t * dst,
-                              u32 disable_default_route);
-
-always_inline u32
-ip4_fib_lookup_buffer (ip4_main_t * im, u32 fib_index, ip4_address_t * dst,
-                      vlib_buffer_t * b)
-{
-  return ip4_fib_lookup_with_table (im, fib_index, dst,
-                                   /* disable_default_route */ 0);
-}
-
-always_inline u32
-ip4_fib_lookup (ip4_main_t * im, u32 sw_if_index, ip4_address_t * dst)
-{
-  u32 fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index);
-  return ip4_fib_lookup_with_table (im, fib_index, dst,
-                                   /* disable_default_route */ 0);
-}
+extern vlib_node_registration_t ip4_glean_node;
+extern vlib_node_registration_t ip4_midchain_node;
 
 always_inline uword
-ip4_destination_matches_route (ip4_main_t * im,
-                              ip4_address_t * key,
-                              ip4_address_t * dest,
+ip4_destination_matches_route (const ip4_main_t * im,
+                              const ip4_address_t * key,
+                              const ip4_address_t * dest,
                               uword dest_length)
 { return 0 == ((key->data_u32 ^ dest->data_u32) & im->fib_masks[dest_length]); }
 
@@ -280,15 +245,26 @@ ip4_unaligned_destination_matches_route (ip4_main_t * im,
 { return 0 == ((clib_mem_unaligned (&key->data_u32, u32) ^ dest->data_u32) & im->fib_masks[dest_length]); }
 
 always_inline int
-ip4_src_address_for_packet (ip4_main_t * im, vlib_buffer_t * p, ip4_address_t * src, u32 sw_if_index)
+ip4_src_address_for_packet (ip_lookup_main_t * lm,
+                           u32 sw_if_index,
+                           ip4_address_t * src)
 {
-  ip_lookup_main_t * lm = &im->lookup_main;
-  ip_interface_address_t * ia = ip_interface_address_for_packet (lm, p, sw_if_index);
-  if (ia == NULL)
-    return -1;
-  ip4_address_t * a = ip_interface_address_get_address (lm, ia);
-  *src = a[0];
-  return 0;
+    u32 if_add_index = 
+       lm->if_address_pool_index_by_sw_if_index[sw_if_index];
+    if (PREDICT_TRUE(if_add_index != ~0)) {
+       ip_interface_address_t *if_add = 
+           pool_elt_at_index(lm->if_address_pool, if_add_index);
+       ip4_address_t *if_ip = 
+           ip_interface_address_get_address(lm, if_add);
+       *src = *if_ip;
+       return 0;
+    }
+    else
+    {
+       ASSERT(0);
+       src->as_u32 = 0;
+    }
+    return (!0);
 }
 
 /* Find interface address which matches destination. */
@@ -315,126 +291,20 @@ ip4_interface_address_matching_destination (ip4_main_t * im, ip4_address_t * dst
   return result;
 }
 
+ip4_address_t *
+ip4_interface_first_address (ip4_main_t * im, u32 sw_if_index,
+                             ip_interface_address_t ** result_ia);
+
 clib_error_t *
 ip4_add_del_interface_address (vlib_main_t * vm, u32 sw_if_index,
                               ip4_address_t * address, u32 address_length,
                               u32 is_del);
 
-int ip4_address_compare (ip4_address_t * a1, ip4_address_t * a2);
-
-/* Add/del a route to the FIB. */
-
-#define IP4_ROUTE_FLAG_ADD (0 << 0)
-#define IP4_ROUTE_FLAG_DEL (1 << 0)
-#define IP4_ROUTE_FLAG_TABLE_ID  (0 << 1)
-#define IP4_ROUTE_FLAG_FIB_INDEX (1 << 1)
-#define IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY (1 << 2)
-#define IP4_ROUTE_FLAG_NO_REDISTRIBUTE (1 << 3)
-/* Not last add/del in group.  Facilities batching requests into packets. */
-#define IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP (1 << 4)
-/* Dynamic route created via ARP reply. */
-#define IP4_ROUTE_FLAG_NEIGHBOR (1 << 5)
-
-typedef struct {
-  /* IP4_ROUTE_FLAG_* */
-  u32 flags;
-
-  /* Either index of fib or table_id to hash and get fib.
-     IP4_ROUTE_FLAG_FIB_INDEX specifies index; otherwise table_id is assumed. */
-  u32 table_index_or_table_id;
-
-  /* Destination address (prefix) and length. */
-  ip4_address_t dst_address;
-  u32 dst_address_length;
-
-  /* Adjacency to use for this destination. */
-  u32 adj_index;
-
-  /* If specified adjacencies to add and then
-     use for this destination.  add_adj/n_add_adj
-     are override adj_index if specified. */
-  ip_adjacency_t * add_adj;
-  u32 n_add_adj;
-} ip4_add_del_route_args_t;
-
-/**
- * \brief Get or create an IPv4 fib.
- *
- * Get or create an IPv4 fib with the provided fib ID or index.
- * The fib ID is a possibly-sparse user-defined value while
- * the fib index defines the position of the fib in the fib vector.
- *
- * \param im
- *      ip4_main pointer.
- * \param table_index_or_id
- *      The table index if \c IP4_ROUTE_FLAG_FIB_INDEX bit is set in \p flags.
- *      Otherwise, when set to \c ~0, an arbitrary and unused fib ID is picked
- *      and can be retrieved with \c ret->table_id.
- *      Otherwise, the fib ID to be used to retrieve or create the desired fib.
- * \param flags
- *      Indicates whether \p table_index_or_id is the fib index or ID.
- *      When the bit \c IP4_ROUTE_FLAG_FIB_INDEX is set, \p table_index_or_id
- *      is considered as the fib index, and the fib ID otherwise.
- * \returns A pointer to the retrieved or created fib.
- *
- * \remark When getting a fib with the fib index, the fib MUST already exist.
- */
-ip4_fib_t *
-find_ip4_fib_by_table_index_or_id (ip4_main_t * im, 
-                                   u32 table_index_or_id, u32 flags);
-
-void ip4_add_del_route (ip4_main_t * im, ip4_add_del_route_args_t * args);
-
-void ip4_add_del_route_next_hop (ip4_main_t * im,
-                                 u32 flags,
-                                 ip4_address_t * dst_address,
-                                 u32 dst_address_length,
-                                 ip4_address_t * next_hop,
-                                 u32 next_hop_sw_if_index,
-                                 u32 next_hop_weight, u32 adj_index, 
-                                 u32 explicit_fib_index);
-
-u32
-ip4_route_get_next_hop_adj (ip4_main_t * im,
-                           u32 fib_index,
-                           ip4_address_t *next_hop,
-                           u32 next_hop_sw_if_index,
-                           u32 explicit_fib_index);
-
-void *
-ip4_get_route (ip4_main_t * im,
-              u32 fib_index_or_table_id,
-              u32 flags,
-              u8 * address,
-              u32 address_length);
-
 void
-ip4_foreach_matching_route (ip4_main_t * im,
-                           u32 table_index_or_table_id,
-                           u32 flags,
-                           ip4_address_t * address,
-                           u32 address_length,
-                           ip4_address_t ** results,
-                           u8 ** result_lengths);
-
-void ip4_delete_matching_routes (ip4_main_t * im,
-                                u32 table_index_or_table_id,
-                                u32 flags,
-                                ip4_address_t * address,
-                                u32 address_length);
-
-void ip4_maybe_remap_adjacencies (ip4_main_t * im,
-                                 u32 table_index_or_table_id,
-                                 u32 flags);
-
-void ip4_adjacency_set_interface_route (vnet_main_t * vnm,
-                                       ip_adjacency_t * adj,
-                                       u32 sw_if_index,
-                                       u32 if_address_index);
+ip4_sw_interface_enable_disable (u32 sw_if_index,
+                                u32 is_enable);
 
-ip4_address_t *
-ip4_interface_first_address (ip4_main_t * im, u32 sw_if_index,
-                             ip_interface_address_t ** result_ia);
+int ip4_address_compare (ip4_address_t * a1, ip4_address_t * a2);
 
 /* Send an ARP request to see if given destination is reachable on given interface. */
 clib_error_t *
@@ -458,7 +328,7 @@ void ip4_register_protocol (u32 protocol, u32 node_index);
 
 serialize_function_t serialize_vnet_ip4_main, unserialize_vnet_ip4_main;
 
-int vnet_set_ip4_flow_hash (u32 table_id, u32 flow_hash_config);
+int vnet_set_ip4_flow_hash (u32 table_id, flow_hash_config_t flow_hash_config);
 
 void ip4_mtrie_init (ip4_fib_mtrie_t * m);
 
@@ -468,7 +338,8 @@ int vnet_set_ip4_classify_intfc (vlib_main_t * vm, u32 sw_if_index,
 /* Compute flow hash.  We'll use it to select which adjacency to use for this
    flow.  And other things. */
 always_inline u32
-ip4_compute_flow_hash (ip4_header_t * ip, u32 flow_hash_config)
+ip4_compute_flow_hash (const ip4_header_t * ip,
+                      flow_hash_config_t flow_hash_config)
 {
     tcp_header_t * tcp = (void *) (ip + 1);
     u32 a, b, c, t1, t2;
index 751260a..4c49d0e 100644 (file)
 
 #include <vnet/vnet.h>
 #include <vnet/ip/ip.h>
-/** for ethernet_header_t */
-#include <vnet/ethernet/ethernet.h>
-/** for ethernet_arp_header_t */
-#include <vnet/ethernet/arp_packet.h>  
+#include <vnet/ethernet/ethernet.h>    /* for ethernet_header_t */
+#include <vnet/ethernet/arp_packet.h>  /* for ethernet_arp_header_t */
 #include <vnet/ppp/ppp.h>
-/** for srp_hw_interface_class */
-#include <vnet/srp/srp.h>
-/** for API error numbers */
-#include <vnet/api_errno.h>     
-
-/** @file
-    vnet ip4 forwarding
-*/
-
-/* This is really, really simple but stupid fib. */
-u32
-ip4_fib_lookup_with_table (ip4_main_t * im, u32 fib_index,
-                          ip4_address_t * dst,
-                          u32 disable_default_route)
-{
-  ip_lookup_main_t * lm = &im->lookup_main;
-  ip4_fib_t * fib = vec_elt_at_index (im->fibs, fib_index);
-  uword * p, * hash, key;
-  i32 i, i_min, dst_address, ai;
-
-  i_min = disable_default_route ? 1 : 0;
-  dst_address = clib_mem_unaligned (&dst->data_u32, u32);
-  for (i = ARRAY_LEN (fib->adj_index_by_dst_address) - 1; i >= i_min; i--)
-    {
-      hash = fib->adj_index_by_dst_address[i];
-      if (! hash)
-       continue;
-
-      key = dst_address & im->fib_masks[i];
-      if ((p = hash_get (hash, key)) != 0)
-       {
-         ai = p[0];
-         goto done;
-       }
-    }
-
-  /* Nothing matches in table. */
-  ai = lm->miss_adj_index;
-
- done:
-  return ai;
-}
-
-/** @brief Create FIB from table ID and init all hashing.
-    @param im - @ref ip4_main_t
-    @param table_id - table ID
-    @return fib - @ref ip4_fib_t
-*/
-static ip4_fib_t *
-create_fib_with_table_id (ip4_main_t * im, u32 table_id)
-{
-  ip4_fib_t * fib;
-  hash_set (im->fib_index_by_table_id, table_id, vec_len (im->fibs));
-  vec_add2 (im->fibs, fib, 1);
-  fib->table_id = table_id;
-  fib->index = fib - im->fibs;
-  /* IP_FLOW_HASH_DEFAULT is net value of 5 tuple flags without "reverse" bit */
-  fib->flow_hash_config = IP_FLOW_HASH_DEFAULT;
-  fib->fwd_classify_table_index = ~0;
-  fib->rev_classify_table_index = ~0;
-  ip4_mtrie_init (&fib->mtrie);
-  return fib;
-}
-
-/** @brief Find existing or Create new FIB based on index
-    @param im @ref ip4_main_t
-    @param table_index_or_id - overloaded parameter referring
-           to the table or a table's index in the FIB vector
-    @param flags - used to check if table_index_or_id was a table or
-           an index (detected by @ref IP4_ROUTE_FLAG_FIB_INDEX)
-    @return either the existing or a new ip4_fib_t entry
-*/
-ip4_fib_t *
-find_ip4_fib_by_table_index_or_id (ip4_main_t * im,
-                                   u32 table_index_or_id, u32 flags)
-{
-  uword * p, fib_index;
-
-  fib_index = table_index_or_id;
-  /* If this isn't a FIB_INDEX ... */
-  if (! (flags & IP4_ROUTE_FLAG_FIB_INDEX))
-    {
-      /* If passed ~0 then request the next table available */
-      if (table_index_or_id == ~0) {
-        table_index_or_id = 0;
-        while ((p = hash_get (im->fib_index_by_table_id, table_index_or_id))) {
-          table_index_or_id++;
-        }
-       /* Create the next table and return the ip4_fib_t associated with it */
-       return create_fib_with_table_id (im, table_index_or_id);
-      }
-      /* A specific table_id was requested.. */
-      p = hash_get (im->fib_index_by_table_id, table_index_or_id);
-      /* ... and if it doesn't exist create it else grab its index */
-      if (! p)
-       return create_fib_with_table_id (im, table_index_or_id);
-      fib_index = p[0];
-    }
-  /* Return the ip4_fib_t associated with this index */
-  return vec_elt_at_index (im->fibs, fib_index);
-}
-
-static void
-ip4_fib_init_adj_index_by_dst_address (ip_lookup_main_t * lm,
-                                      ip4_fib_t * fib,
-                                      u32 address_length)
-{
-  hash_t * h;
-  uword max_index;
-
-  ASSERT (lm->fib_result_n_bytes >= sizeof (uword));
-  lm->fib_result_n_words = round_pow2 (lm->fib_result_n_bytes, sizeof (uword)) / sizeof (uword);
-
-  fib->adj_index_by_dst_address[address_length] =
-    hash_create (32 /* elts */, lm->fib_result_n_words * sizeof (uword));
-
-  hash_set_flags (fib->adj_index_by_dst_address[address_length],
-                  HASH_FLAG_NO_AUTO_SHRINK);
-
-  h = hash_header (fib->adj_index_by_dst_address[address_length]);
-  max_index = (hash_value_bytes (h) / sizeof (fib->new_hash_values[0])) - 1;
-
-  /* Initialize new/old hash value vectors. */
-  vec_validate_init_empty (fib->new_hash_values, max_index, ~0);
-  vec_validate_init_empty (fib->old_hash_values, max_index, ~0);
-}
-
-static void
-ip4_fib_set_adj_index (ip4_main_t * im,
-                      ip4_fib_t * fib,
-                      u32 flags,
-                      u32 dst_address_u32,
-                      u32 dst_address_length,
-                      u32 adj_index)
-{
-  ip_lookup_main_t * lm = &im->lookup_main;
-  uword * hash;
-
-  if (vec_bytes(fib->old_hash_values))
-    memset (fib->old_hash_values, ~0, vec_bytes (fib->old_hash_values));
-  if (vec_bytes(fib->new_hash_values))
-    memset (fib->new_hash_values, ~0, vec_bytes (fib->new_hash_values));
-  fib->new_hash_values[0] = adj_index;
-
-  /* Make sure adj index is valid. */
-  if (CLIB_DEBUG > 0)
-    (void) ip_get_adjacency (lm, adj_index);
-
-  hash = fib->adj_index_by_dst_address[dst_address_length];
-
-  hash = _hash_set3 (hash, dst_address_u32,
-                    fib->new_hash_values,
-                    fib->old_hash_values);
-
-  fib->adj_index_by_dst_address[dst_address_length] = hash;
-
-  if (vec_len (im->add_del_route_callbacks) > 0)
-    {
-      ip4_add_del_route_callback_t * cb;
-      ip4_address_t d;
-      uword * p;
-
-      d.data_u32 = dst_address_u32;
-      vec_foreach (cb, im->add_del_route_callbacks)
-       if ((flags & cb->required_flags) == cb->required_flags)
-         cb->function (im, cb->function_opaque,
-                       fib, flags,
-                       &d, dst_address_length,
-                       fib->old_hash_values,
-                       fib->new_hash_values);
-
-      p = hash_get (hash, dst_address_u32);
-      /* hash_get should never return NULL here */
-      if (p)
-          clib_memcpy (p, fib->new_hash_values, 
-                       vec_bytes (fib->new_hash_values));
-      else
-          ASSERT(0);
-    }
-}
-
-void ip4_add_del_route (ip4_main_t * im, ip4_add_del_route_args_t * a)
-{
-  ip_lookup_main_t * lm = &im->lookup_main;
-  ip4_fib_t * fib;
-  u32 dst_address, dst_address_length, adj_index, old_adj_index;
-  uword * hash, is_del;
-  ip4_add_del_route_callback_t * cb;
-
-  /* Either create new adjacency or use given one depending on arguments. */
-  if (a->n_add_adj > 0)
-    {
-      ip_add_adjacency (lm, a->add_adj, a->n_add_adj, &adj_index);
-      ip_call_add_del_adjacency_callbacks (lm, adj_index, /* is_del */ 0);
-    }
-  else
-    adj_index = a->adj_index;
-
-  dst_address = a->dst_address.data_u32;
-  dst_address_length = a->dst_address_length;
-  fib = find_ip4_fib_by_table_index_or_id (im, a->table_index_or_table_id, a->flags);
-
-  ASSERT (dst_address_length < ARRAY_LEN (im->fib_masks));
-  dst_address &= im->fib_masks[dst_address_length];
-
-  if (! fib->adj_index_by_dst_address[dst_address_length])
-    ip4_fib_init_adj_index_by_dst_address (lm, fib, dst_address_length);
-
-  hash = fib->adj_index_by_dst_address[dst_address_length];
-
-  is_del = (a->flags & IP4_ROUTE_FLAG_DEL) != 0;
-
-  if (is_del)
-    {
-      fib->old_hash_values[0] = ~0;
-      hash = _hash_unset (hash, dst_address, fib->old_hash_values);
-      fib->adj_index_by_dst_address[dst_address_length] = hash;
-
-      if (vec_len (im->add_del_route_callbacks) > 0
-         && fib->old_hash_values[0] != ~0) /* make sure destination was found in hash */
-       {
-         fib->new_hash_values[0] = ~0;
-         vec_foreach (cb, im->add_del_route_callbacks)
-           if ((a->flags & cb->required_flags) == cb->required_flags)
-             cb->function (im, cb->function_opaque,
-                           fib, a->flags,
-                           &a->dst_address, dst_address_length,
-                           fib->old_hash_values,
-                           fib->new_hash_values);
-       }
-    }
-  else
-    ip4_fib_set_adj_index (im, fib, a->flags, dst_address, dst_address_length,
-                          adj_index);
-
-  old_adj_index = fib->old_hash_values[0];
-
-  /* Avoid spurious reference count increments */
-  if (old_adj_index == adj_index
-      && adj_index != ~0
-      && !(a->flags & IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY))
-    {
-      ip_adjacency_t * adj = ip_get_adjacency (lm, adj_index);
-      if (adj->share_count > 0)
-        adj->share_count --;
-    }
-
-  ip4_fib_mtrie_add_del_route (fib, a->dst_address, dst_address_length,
-                              is_del ? old_adj_index : adj_index,
-                              is_del);
-
-  /* Delete old adjacency index if present and changed. */
-  if (! (a->flags & IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY)
-      && old_adj_index != ~0
-      && old_adj_index != adj_index)
-    ip_del_adjacency (lm, old_adj_index);
-}
-
-
-u32
-ip4_route_get_next_hop_adj (ip4_main_t * im,
-                           u32 fib_index,
-                           ip4_address_t *next_hop,
-                           u32 next_hop_sw_if_index,
-                           u32 explicit_fib_index)
-{
-  ip_lookup_main_t * lm = &im->lookup_main;
-  vnet_main_t * vnm = vnet_get_main();
-  uword * nh_hash, * nh_result;
-  int is_interface_next_hop;
-  u32 nh_adj_index;
-  ip4_fib_t * fib;
-
-  fib = vec_elt_at_index (im->fibs, fib_index);
-
-  is_interface_next_hop = next_hop->data_u32 == 0;
-  if (is_interface_next_hop)
-    {
-      nh_result = hash_get (im->interface_route_adj_index_by_sw_if_index, next_hop_sw_if_index);
-      if (nh_result)
-         nh_adj_index = *nh_result;
-      else
-        {
-          ip_adjacency_t * adj;
-          adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
-                                  &nh_adj_index);
-          ip4_adjacency_set_interface_route (vnm, adj, next_hop_sw_if_index, /* if_address_index */ ~0);
-          ip_call_add_del_adjacency_callbacks (lm, nh_adj_index, /* is_del */ 0);
-          hash_set (im->interface_route_adj_index_by_sw_if_index, next_hop_sw_if_index, nh_adj_index);
-       }
-    }
-  else if (next_hop_sw_if_index == ~0)
-    {
-      /* next-hop is recursive. we always need a indirect adj
-       * for recursive paths. Any LPM we perform now will give
-       * us a valid adj, but without tracking the next-hop we
-       * have no way to keep it valid.
-       */
-      ip_adjacency_t add_adj;
-      memset (&add_adj, 0, sizeof(add_adj));
-      add_adj.n_adj = 1;
-      add_adj.lookup_next_index = IP_LOOKUP_NEXT_INDIRECT;
-      add_adj.indirect.next_hop.ip4.as_u32 = next_hop->as_u32;
-      add_adj.explicit_fib_index = explicit_fib_index;
-      ip_add_adjacency (lm, &add_adj, 1, &nh_adj_index);
-    }
-  else
-    {
-      nh_hash = fib->adj_index_by_dst_address[32];
-      nh_result = hash_get (nh_hash, next_hop->data_u32);
-
-      /* Next hop must be known. */
-      if (! nh_result)
-        {
-         ip_adjacency_t * adj;
-
-         /* no /32 exists, get the longest prefix match */
-         nh_adj_index = ip4_fib_lookup_with_table (im, fib_index,
-                                                   next_hop, 0);
-         adj = ip_get_adjacency (lm, nh_adj_index);
-         /* if ARP interface adjacency is present, we need to
-            install ARP adjaceny for specific next hop */
-         if (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP &&
-             adj->arp.next_hop.ip4.as_u32 == 0)
-           {
-             nh_adj_index = vnet_arp_glean_add(fib_index, next_hop);
-           }
-       }
-      else
-        {
-         nh_adj_index = *nh_result;
-       }
-    }
-
-  return (nh_adj_index);
-}
-
-void
-ip4_add_del_route_next_hop (ip4_main_t * im,
-                           u32 flags,
-                           ip4_address_t * dst_address,
-                           u32 dst_address_length,
-                           ip4_address_t * next_hop,
-                           u32 next_hop_sw_if_index,
-                           u32 next_hop_weight, u32 adj_index, 
-                            u32 explicit_fib_index)
-{
-  vnet_main_t * vnm = vnet_get_main();
-  ip_lookup_main_t * lm = &im->lookup_main;
-  u32 fib_index;
-  ip4_fib_t * fib;
-  u32 dst_address_u32, old_mp_adj_index, new_mp_adj_index;
-  u32 dst_adj_index, nh_adj_index;
-  uword * dst_hash, * dst_result;
-  ip_adjacency_t * dst_adj;
-  ip_multipath_adjacency_t * old_mp, * new_mp;
-  int is_del = (flags & IP4_ROUTE_FLAG_DEL) != 0;
-  clib_error_t * error = 0;
-
-  if (explicit_fib_index == (u32)~0)
-      fib_index = vec_elt (im->fib_index_by_sw_if_index, next_hop_sw_if_index);
-  else
-      fib_index = explicit_fib_index;
-
-  fib = vec_elt_at_index (im->fibs, fib_index);
-
-  /* Lookup next hop to be added or deleted. */
-  if (adj_index == (u32)~0)
-    {
-       nh_adj_index = ip4_route_get_next_hop_adj(im, fib_index,
-                                                 next_hop,
-                                                 next_hop_sw_if_index,
-                                                 explicit_fib_index);
-    }
-  else
-    {
-      nh_adj_index = adj_index;
-    }
-  ASSERT (dst_address_length < ARRAY_LEN (im->fib_masks));
-  dst_address_u32 = dst_address->data_u32 & im->fib_masks[dst_address_length];
-
-  dst_hash = fib->adj_index_by_dst_address[dst_address_length];
-  dst_result = hash_get (dst_hash, dst_address_u32);
-  if (dst_result)
-    {
-      dst_adj_index = dst_result[0];
-      dst_adj = ip_get_adjacency (lm, dst_adj_index);
-    }
-  else
-    {
-      /* For deletes destination must be known. */
-      if (is_del)
-       {
-          vnm->api_errno = VNET_API_ERROR_UNKNOWN_DESTINATION;
-         error = clib_error_return (0, "unknown destination %U/%d",
-                                    format_ip4_address, dst_address,
-                                    dst_address_length);
-         goto done;
-       }
-
-      dst_adj_index = ~0;
-      dst_adj = 0;
-    }
-
-  /* Ignore adds of X/32 with next hop of X. */
-  if (! is_del
-      && dst_address_length == 32
-      && dst_address->data_u32 == next_hop->data_u32 
-      && adj_index != (u32)~0)
-    {
-      vnm->api_errno = VNET_API_ERROR_PREFIX_MATCHES_NEXT_HOP;
-      error = clib_error_return (0, "prefix matches next hop %U/%d",
-                                 format_ip4_address, dst_address,
-                                 dst_address_length);
-      goto done;
-    }
-
-  /* Destination is not known and default weight is set so add route
-     to existing non-multipath adjacency */
-  if (dst_adj_index == ~0 && next_hop_weight == 1 && next_hop_sw_if_index == ~0)
-    {
-      /* create / delete additional mapping of existing adjacency */
-      ip4_add_del_route_args_t a;
-
-      a.table_index_or_table_id = fib_index;
-      a.flags = ((is_del ? IP4_ROUTE_FLAG_DEL : IP4_ROUTE_FLAG_ADD)
-                | IP4_ROUTE_FLAG_FIB_INDEX
-                | IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY
-                | (flags & (IP4_ROUTE_FLAG_NO_REDISTRIBUTE
-                            | IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP)));
-      a.dst_address = dst_address[0];
-      a.dst_address_length = dst_address_length;
-      a.adj_index = nh_adj_index;
-      a.add_adj = 0;
-      a.n_add_adj = 0;
-
-      ip4_add_del_route (im, &a);
-      goto done;
-    }
-
-  old_mp_adj_index = dst_adj ? dst_adj->heap_handle : ~0;
-
-  if (! ip_multipath_adjacency_add_del_next_hop
-      (lm, is_del,
-       old_mp_adj_index,
-       nh_adj_index,
-       next_hop_weight,
-       &new_mp_adj_index))
-    {
-      vnm->api_errno = VNET_API_ERROR_NEXT_HOP_NOT_FOUND_MP;
-      error = clib_error_return (0, "requested deleting next-hop %U not found in multi-path",
-                                format_ip4_address, next_hop);
-      goto done;
-    }
-  
-  old_mp = new_mp = 0;
-  if (old_mp_adj_index != ~0)
-    old_mp = vec_elt_at_index (lm->multipath_adjacencies, old_mp_adj_index);
-  if (new_mp_adj_index != ~0)
-    new_mp = vec_elt_at_index (lm->multipath_adjacencies, new_mp_adj_index);
-
-  if (old_mp != new_mp)
-    {
-      ip4_add_del_route_args_t a;
-      ip_adjacency_t * adj;
-
-      a.table_index_or_table_id = fib_index;
-      a.flags = ((is_del && ! new_mp ? IP4_ROUTE_FLAG_DEL : IP4_ROUTE_FLAG_ADD)
-                | IP4_ROUTE_FLAG_FIB_INDEX
-                | IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY
-                | (flags & (IP4_ROUTE_FLAG_NO_REDISTRIBUTE | IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP)));
-      a.dst_address = dst_address[0];
-      a.dst_address_length = dst_address_length;
-      a.adj_index = new_mp ? new_mp->adj_index : dst_adj_index;
-      a.add_adj = 0;
-      a.n_add_adj = 0;
-
-      ip4_add_del_route (im, &a);
-
-      adj = ip_get_adjacency (lm, new_mp ? new_mp->adj_index : dst_adj_index);
-      if (adj->n_adj == 1)
-        adj->share_count += is_del ? -1 : 1;
-    }
-
- done:
-  if (error)
-    clib_error_report (error);
-}
-
-void *
-ip4_get_route (ip4_main_t * im,
-              u32 table_index_or_table_id,
-              u32 flags,
-              u8 * address,
-              u32 address_length)
-{
-  ip4_fib_t * fib = find_ip4_fib_by_table_index_or_id (im, table_index_or_table_id, flags);
-  u32 dst_address = * (u32 *) address;
-  uword * hash, * p;
-
-  ASSERT (address_length < ARRAY_LEN (im->fib_masks));
-  dst_address &= im->fib_masks[address_length];
-
-  hash = fib->adj_index_by_dst_address[address_length];
-  p = hash_get (hash, dst_address);
-  return (void *) p;
-}
-
-void
-ip4_foreach_matching_route (ip4_main_t * im,
-                           u32 table_index_or_table_id,
-                           u32 flags,
-                           ip4_address_t * address,
-                           u32 address_length,
-                           ip4_address_t ** results,
-                           u8 ** result_lengths)
-{
-  ip4_fib_t * fib = find_ip4_fib_by_table_index_or_id (im, table_index_or_table_id, flags);
-  u32 dst_address = address->data_u32;
-  u32 this_length = address_length;
-  
-  if (*results)
-    _vec_len (*results) = 0;
-  if (*result_lengths)
-    _vec_len (*result_lengths) = 0;
-
-  while (this_length <= 32 && vec_len (results) == 0)
-    {
-      uword k, v;
-      hash_foreach (k, v, fib->adj_index_by_dst_address[this_length], ({
-       if (0 == ((k ^ dst_address) & im->fib_masks[address_length]))
-         {
-           ip4_address_t a;
-           a.data_u32 = k;
-           vec_add1 (*results, a);
-           vec_add1 (*result_lengths, this_length);
-         }
-      }));
-
-      this_length++;
-    }
-}
-
-void ip4_maybe_remap_adjacencies (ip4_main_t * im,
-                                 u32 table_index_or_table_id,
-                                 u32 flags)
-{
-  ip4_fib_t * fib = find_ip4_fib_by_table_index_or_id (im, table_index_or_table_id, flags);
-  ip_lookup_main_t * lm = &im->lookup_main;
-  u32 i, l;
-  ip4_address_t a;
-  ip4_add_del_route_callback_t * cb;
-  static ip4_address_t * to_delete;
-
-  if (lm->n_adjacency_remaps == 0)
-    return;
-
-  for (l = 0; l <= 32; l++)
-    {
-      hash_pair_t * p;
-      uword * hash = fib->adj_index_by_dst_address[l];
-
-      if (hash_elts (hash) == 0)
-       continue;
-
-      if (to_delete)
-       _vec_len (to_delete) = 0;
-
-      hash_foreach_pair (p, hash, ({
-       u32 adj_index = p->value[0];
-       u32 m = vec_elt (lm->adjacency_remap_table, adj_index);
-
-       if (m)
-         {
-           /* Record destination address from hash key. */
-           a.data_u32 = p->key;
-
-           /* New adjacency points to nothing: so delete prefix. */
-           if (m == ~0)
-             vec_add1 (to_delete, a);
-           else
-             {
-               /* Remap to new adjacency. */
-               clib_memcpy (fib->old_hash_values, p->value, vec_bytes (fib->old_hash_values));
-
-               /* Set new adjacency value. */
-               fib->new_hash_values[0] = p->value[0] = m - 1;
-
-               vec_foreach (cb, im->add_del_route_callbacks)
-                 if ((flags & cb->required_flags) == cb->required_flags)
-                   cb->function (im, cb->function_opaque,
-                                 fib, flags | IP4_ROUTE_FLAG_ADD,
-                                 &a, l,
-                                 fib->old_hash_values,
-                                 fib->new_hash_values);
-             }
-         }
-      }));
-
-      fib->new_hash_values[0] = ~0;
-      for (i = 0; i < vec_len (to_delete); i++)
-       {
-         hash = _hash_unset (hash, to_delete[i].data_u32, fib->old_hash_values);
-         vec_foreach (cb, im->add_del_route_callbacks)
-           if ((flags & cb->required_flags) == cb->required_flags)
-             cb->function (im, cb->function_opaque,
-                           fib, flags | IP4_ROUTE_FLAG_DEL,
-                           &a, l,
-                           fib->old_hash_values,
-                           fib->new_hash_values);
-       }
-    }
-
-  /* Also remap adjacencies in mtrie. */
-  ip4_mtrie_maybe_remap_adjacencies (lm, &fib->mtrie);
-
-  /* Reset mapping table. */
-  vec_zero (lm->adjacency_remap_table);
-
-  /* All remaps have been performed. */
-  lm->n_adjacency_remaps = 0;
-}
-
-void ip4_delete_matching_routes (ip4_main_t * im,
-                                u32 table_index_or_table_id,
-                                u32 flags,
-                                ip4_address_t * address,
-                                u32 address_length)
-{
-  static ip4_address_t * matching_addresses;
-  static u8 * matching_address_lengths;
-  u32 l, i;
-  ip4_add_del_route_args_t a;
-
-  a.flags = IP4_ROUTE_FLAG_DEL | IP4_ROUTE_FLAG_NO_REDISTRIBUTE | flags;
-  a.table_index_or_table_id = table_index_or_table_id;
-  a.adj_index = ~0;
-  a.add_adj = 0;
-  a.n_add_adj = 0;
-
-  for (l = address_length + 1; l <= 32; l++)
-    {
-      ip4_foreach_matching_route (im, table_index_or_table_id, flags,
-                                 address,
-                                 l,
-                                 &matching_addresses,
-                                 &matching_address_lengths);
-      for (i = 0; i < vec_len (matching_addresses); i++)
-       {
-         a.dst_address = matching_addresses[i];
-         a.dst_address_length = matching_address_lengths[i];
-         ip4_add_del_route (im, &a);
-       }
-    }
-
-  ip4_maybe_remap_adjacencies (im, table_index_or_table_id, flags);
-}
+#include <vnet/srp/srp.h>      /* for srp_hw_interface_class */
+#include <vnet/api_errno.h>     /* for API error numbers */
+#include <vnet/fib/fib_table.h> /* for FIB table and entry creation */
+#include <vnet/fib/fib_entry.h> /* for FIB table and entry creation */
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/dpo/load_balance.h>
+#include <vnet/dpo/classify_dpo.h>
 
 void
 ip4_forward_next_trace (vlib_main_t * vm,
@@ -712,12 +60,10 @@ always_inline uword
 ip4_lookup_inline (vlib_main_t * vm,
                   vlib_node_runtime_t * node,
                   vlib_frame_t * frame,
-                  int lookup_for_responses_to_locally_received_packets,
-                  int is_indirect)
+                  int lookup_for_responses_to_locally_received_packets)
 {
   ip4_main_t * im = &ip4_main;
-  ip_lookup_main_t * lm = &im->lookup_main;
-  vlib_combined_counter_main_t * cm = &im->lookup_main.adjacency_counters;
+  vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters;
   u32 n_left_from, n_left_to_next, * from, * to_next;
   ip_lookup_next_t next;
   u32 cpu_index = os_get_cpu_number();
@@ -732,217 +78,194 @@ ip4_lookup_inline (vlib_main_t * vm,
                           to_next, n_left_to_next);
 
       while (n_left_from >= 4 && n_left_to_next >= 2)
-       {
-         vlib_buffer_t * p0, * p1;
-         ip4_header_t * ip0, * ip1;
-         __attribute__((unused)) tcp_header_t * tcp0, * tcp1;
-         ip_lookup_next_t next0, next1;
-         ip_adjacency_t * adj0, * adj1;
-         ip4_fib_mtrie_t * mtrie0, * mtrie1;
-         ip4_fib_mtrie_leaf_t leaf0, leaf1;
-         ip4_address_t * dst_addr0, *dst_addr1;
-         __attribute__((unused)) u32 pi0, fib_index0, adj_index0, is_tcp_udp0;
-         __attribute__((unused)) u32 pi1, fib_index1, adj_index1, is_tcp_udp1;
-          u32 flow_hash_config0, flow_hash_config1;
+       {
+         vlib_buffer_t * p0, * p1;
+         ip4_header_t * ip0, * ip1;
+         __attribute__((unused)) tcp_header_t * tcp0, * tcp1;
+         ip_lookup_next_t next0, next1;
+         const load_balance_t * lb0, * lb1;
+         ip4_fib_mtrie_t * mtrie0, * mtrie1;
+         ip4_fib_mtrie_leaf_t leaf0, leaf1;
+         ip4_address_t * dst_addr0, *dst_addr1;
+         __attribute__((unused)) u32 pi0, fib_index0, lb_index0, is_tcp_udp0;
+         __attribute__((unused)) u32 pi1, fib_index1, lb_index1, is_tcp_udp1;
+          flow_hash_config_t flow_hash_config0, flow_hash_config1;
           u32 hash_c0, hash_c1;
-         u32 wrong_next;
+         u32 wrong_next;
+         const dpo_id_t *dpo0, *dpo1;
 
-         /* Prefetch next iteration. */
-         {
-           vlib_buffer_t * p2, * p3;
+         /* Prefetch next iteration. */
+         {
+           vlib_buffer_t * p2, * p3;
 
-           p2 = vlib_get_buffer (vm, from[2]);
-           p3 = vlib_get_buffer (vm, from[3]);
+           p2 = vlib_get_buffer (vm, from[2]);
+           p3 = vlib_get_buffer (vm, from[3]);
 
-           vlib_prefetch_buffer_header (p2, LOAD);
-           vlib_prefetch_buffer_header (p3, LOAD);
+           vlib_prefetch_buffer_header (p2, LOAD);
+           vlib_prefetch_buffer_header (p3, LOAD);
 
-           CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
-           CLIB_PREFETCH (p3->data, sizeof (ip0[0]), LOAD);
-         }
+           CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
+           CLIB_PREFETCH (p3->data, sizeof (ip0[0]), LOAD);
+         }
 
-         pi0 = to_next[0] = from[0];
-         pi1 = to_next[1] = from[1];
+         pi0 = to_next[0] = from[0];
+         pi1 = to_next[1] = from[1];
 
-         p0 = vlib_get_buffer (vm, pi0);
-         p1 = vlib_get_buffer (vm, pi1);
+         p0 = vlib_get_buffer (vm, pi0);
+         p1 = vlib_get_buffer (vm, pi1);
 
-         ip0 = vlib_buffer_get_current (p0);
-         ip1 = vlib_buffer_get_current (p1);
+         ip0 = vlib_buffer_get_current (p0);
+         ip1 = vlib_buffer_get_current (p1);
 
-         if (is_indirect)
-           {
-             ip_adjacency_t * iadj0, * iadj1;
-             iadj0 = ip_get_adjacency (lm, vnet_buffer(p0)->ip.adj_index[VLIB_TX]);
-             iadj1 = ip_get_adjacency (lm, vnet_buffer(p1)->ip.adj_index[VLIB_TX]);
-             dst_addr0 = &iadj0->indirect.next_hop.ip4;
-             dst_addr1 = &iadj1->indirect.next_hop.ip4;
-           }
-         else
-           {
-             dst_addr0 = &ip0->dst_address;
-             dst_addr1 = &ip1->dst_address;
-           }
+         dst_addr0 = &ip0->dst_address;
+         dst_addr1 = &ip1->dst_address;
 
-         fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
-         fib_index1 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p1)->sw_if_index[VLIB_RX]);
+         fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
+         fib_index1 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p1)->sw_if_index[VLIB_RX]);
           fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
             fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
           fib_index1 = (vnet_buffer(p1)->sw_if_index[VLIB_TX] == (u32)~0) ?
             fib_index1 : vnet_buffer(p1)->sw_if_index[VLIB_TX];
 
 
-         if (! lookup_for_responses_to_locally_received_packets)
-           {
-             mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
-             mtrie1 = &vec_elt_at_index (im->fibs, fib_index1)->mtrie;
-
-             leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
-
-             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 0);
-             leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 0);
-           }
-
-         tcp0 = (void *) (ip0 + 1);
-         tcp1 = (void *) (ip1 + 1);
-
-         is_tcp_udp0 = (ip0->protocol == IP_PROTOCOL_TCP
-                        || ip0->protocol == IP_PROTOCOL_UDP);
-         is_tcp_udp1 = (ip1->protocol == IP_PROTOCOL_TCP
-                        || ip1->protocol == IP_PROTOCOL_UDP);
-
-         if (! lookup_for_responses_to_locally_received_packets)
-           {
-             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 1);
-             leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 1);
-           }
-
-         if (! lookup_for_responses_to_locally_received_packets)
-           {
-             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
-             leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 2);
-           }
-
-         if (! lookup_for_responses_to_locally_received_packets)
-           {
-             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
-             leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 3);
-           }
-
-         if (lookup_for_responses_to_locally_received_packets)
-           {
-             adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
-             adj_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_RX];
-           }
-         else
-           {
-             /* Handle default route. */
-             leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
-             leaf1 = (leaf1 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie1->default_leaf : leaf1);
-
-             adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
-             adj_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
-           }
-
-         ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0,
-                                                          dst_addr0,
-                                                          /* no_default_route */ 0));
-         ASSERT (adj_index1 == ip4_fib_lookup_with_table (im, fib_index1,
-                                                          dst_addr1,
-                                                          /* no_default_route */ 0));
-         adj0 = ip_get_adjacency (lm, adj_index0);
-         adj1 = ip_get_adjacency (lm, adj_index1);
-
-         next0 = adj0->lookup_next_index;
-         next1 = adj1->lookup_next_index;
-
-         /* Use flow hash to compute multipath adjacency. */
+         if (! lookup_for_responses_to_locally_received_packets)
+           {
+             mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
+             mtrie1 = &ip4_fib_get (fib_index1)->mtrie;
+
+             leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
+
+             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 0);
+             leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 0);
+           }
+
+         tcp0 = (void *) (ip0 + 1);
+         tcp1 = (void *) (ip1 + 1);
+
+         is_tcp_udp0 = (ip0->protocol == IP_PROTOCOL_TCP
+                        || ip0->protocol == IP_PROTOCOL_UDP);
+         is_tcp_udp1 = (ip1->protocol == IP_PROTOCOL_TCP
+                        || ip1->protocol == IP_PROTOCOL_UDP);
+
+         if (! lookup_for_responses_to_locally_received_packets)
+           {
+             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 1);
+             leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 1);
+           }
+
+         if (! lookup_for_responses_to_locally_received_packets)
+           {
+             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
+             leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 2);
+           }
+
+         if (! lookup_for_responses_to_locally_received_packets)
+           {
+             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
+             leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 3);
+           }
+
+         if (lookup_for_responses_to_locally_received_packets)
+           {
+             lb_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
+             lb_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_RX];
+           }
+         else
+           {
+             /* Handle default route. */
+             leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
+             leaf1 = (leaf1 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie1->default_leaf : leaf1);
+
+             lb_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+             lb_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
+           }
+
+         lb0 = load_balance_get (lb_index0);
+         lb1 = load_balance_get (lb_index1);
+
+         /* Use flow hash to compute multipath adjacency. */
           hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
           hash_c1 = vnet_buffer (p1)->ip.flow_hash = 0;
-          if (PREDICT_FALSE (adj0->n_adj > 1))
+          if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
             {
-              flow_hash_config0 = 
-                vec_elt_at_index (im->fibs, fib_index0)->flow_hash_config;
-              hash_c0 = vnet_buffer (p0)->ip.flow_hash = 
+              flow_hash_config0 = lb0->lb_hash_config;
+              hash_c0 = vnet_buffer (p0)->ip.flow_hash =
                 ip4_compute_flow_hash (ip0, flow_hash_config0);
             }
-          if (PREDICT_FALSE(adj1->n_adj > 1))
+          if (PREDICT_FALSE(lb1->lb_n_buckets > 1)) /* p1: hash only when its own load-balance (lb1) is multipath */
             {
-              flow_hash_config1 = 
-                vec_elt_at_index (im->fibs, fib_index1)->flow_hash_config;
-              hash_c1 = vnet_buffer (p1)->ip.flow_hash = 
+             flow_hash_config1 = lb1->lb_hash_config;
+              hash_c1 = vnet_buffer (p1)->ip.flow_hash =
                 ip4_compute_flow_hash (ip1, flow_hash_config1);
             }
 
-         ASSERT (adj0->n_adj > 0);
-         ASSERT (adj1->n_adj > 0);
-         ASSERT (is_pow2 (adj0->n_adj));
-         ASSERT (is_pow2 (adj1->n_adj));
-         adj_index0 += (hash_c0 & (adj0->n_adj - 1));
-         adj_index1 += (hash_c1 & (adj1->n_adj - 1));
-
-         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
-         vnet_buffer (p1)->ip.adj_index[VLIB_TX] = adj_index1;
-
-         if (is_indirect)
-           {
-             /* ARP for next-hop not packet's destination address */
-             if (adj0->lookup_next_index == IP_LOOKUP_NEXT_ARP)
-               ip0->dst_address.as_u32 = dst_addr0->as_u32;
-              if (adj1->lookup_next_index == IP_LOOKUP_NEXT_ARP)
-                ip1->dst_address.as_u32 = dst_addr1->as_u32;
-           }
-
-          vlib_increment_combined_counter 
-              (cm, cpu_index, adj_index0, 1,
-               vlib_buffer_length_in_chain (vm, p0) 
+         ASSERT (lb0->lb_n_buckets > 0);
+         ASSERT (is_pow2 (lb0->lb_n_buckets));
+         ASSERT (lb1->lb_n_buckets > 0);
+         ASSERT (is_pow2 (lb1->lb_n_buckets));
+
+         dpo0 = load_balance_get_bucket_i(lb0,
+                                           (hash_c0 &
+                                            (lb0->lb_n_buckets_minus_1)));
+         dpo1 = load_balance_get_bucket_i(lb1,
+                                           (hash_c1 &
+                                            (lb1->lb_n_buckets_minus_1))); /* mask with lb1's own bucket count, not lb0's */
+
+         next0 = dpo0->dpoi_next_node;
+         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+         next1 = dpo1->dpoi_next_node;
+         vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
+
+          vlib_increment_combined_counter
+              (cm, cpu_index, lb_index0, 1,
+               vlib_buffer_length_in_chain (vm, p0)
                + sizeof(ethernet_header_t));
-          vlib_increment_combined_counter 
-              (cm, cpu_index, adj_index1, 1,
+          vlib_increment_combined_counter
+              (cm, cpu_index, lb_index1, 1,
                vlib_buffer_length_in_chain (vm, p1)
                + sizeof(ethernet_header_t));
 
-         from += 2;
-         to_next += 2;
-         n_left_to_next -= 2;
-         n_left_from -= 2;
-
-         wrong_next = (next0 != next) + 2*(next1 != next);
-         if (PREDICT_FALSE (wrong_next != 0))
-           {
-             switch (wrong_next)
-               {
-               case 1:
-                 /* A B A */
-                 to_next[-2] = pi1;
-                 to_next -= 1;
-                 n_left_to_next += 1;
-                 vlib_set_next_frame_buffer (vm, node, next0, pi0);
-                 break;
-
-               case 2:
-                 /* A A B */
-                 to_next -= 1;
-                 n_left_to_next += 1;
-                 vlib_set_next_frame_buffer (vm, node, next1, pi1);
-                 break;
-
-               case 3:
-                 /* A B C */
-                 to_next -= 2;
-                 n_left_to_next += 2;
-                 vlib_set_next_frame_buffer (vm, node, next0, pi0);
-                 vlib_set_next_frame_buffer (vm, node, next1, pi1);
-                 if (next0 == next1)
-                   {
-                     /* A B B */
-                     vlib_put_next_frame (vm, node, next, n_left_to_next);
-                     next = next1;
-                     vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
-                   }
-               }
-           }
-       }
+         from += 2;
+         to_next += 2;
+         n_left_to_next -= 2;
+         n_left_from -= 2;
+
+         wrong_next = (next0 != next) + 2*(next1 != next);
+         if (PREDICT_FALSE (wrong_next != 0))
+           {
+             switch (wrong_next)
+               {
+               case 1:
+                 /* A B A */
+                 to_next[-2] = pi1;
+                 to_next -= 1;
+                 n_left_to_next += 1;
+                 vlib_set_next_frame_buffer (vm, node, next0, pi0);
+                 break;
+
+               case 2:
+                 /* A A B */
+                 to_next -= 1;
+                 n_left_to_next += 1;
+                 vlib_set_next_frame_buffer (vm, node, next1, pi1);
+                 break;
+
+               case 3:
+                 /* A B C */
+                 to_next -= 2;
+                 n_left_to_next += 2;
+                 vlib_set_next_frame_buffer (vm, node, next0, pi0);
+                 vlib_set_next_frame_buffer (vm, node, next1, pi1);
+                 if (next0 == next1)
+                   {
+                     /* A B B */
+                     vlib_put_next_frame (vm, node, next, n_left_to_next);
+                     next = next1;
+                     vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+                   }
+               }
+           }
+       }
     
       while (n_left_from > 0 && n_left_to_next > 0)
        {
@@ -950,12 +273,14 @@ ip4_lookup_inline (vlib_main_t * vm,
          ip4_header_t * ip0;
          __attribute__((unused)) tcp_header_t * tcp0;
          ip_lookup_next_t next0;
-         ip_adjacency_t * adj0;
+         const load_balance_t *lb0;
          ip4_fib_mtrie_t * mtrie0;
          ip4_fib_mtrie_leaf_t leaf0;
          ip4_address_t * dst_addr0;
-         __attribute__((unused)) u32 pi0, fib_index0, adj_index0, is_tcp_udp0;
-          u32 flow_hash_config0, hash_c0;
+         __attribute__((unused)) u32 pi0, fib_index0, is_tcp_udp0, lbi0;
+          flow_hash_config_t flow_hash_config0;
+         const dpo_id_t *dpo0;
+         u32 hash_c0;
 
          pi0 = from[0];
          to_next[0] = pi0;
@@ -964,16 +289,7 @@ ip4_lookup_inline (vlib_main_t * vm,
 
          ip0 = vlib_buffer_get_current (p0);
 
-         if (is_indirect)
-           {
-             ip_adjacency_t * iadj0;
-             iadj0 = ip_get_adjacency (lm, vnet_buffer(p0)->ip.adj_index[VLIB_TX]);
-             dst_addr0 = &iadj0->indirect.next_hop.ip4;
-           }
-         else
-           {
-             dst_addr0 = &ip0->dst_address;
-           }
+         dst_addr0 = &ip0->dst_address;
 
          fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
           fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
@@ -981,7 +297,7 @@ ip4_lookup_inline (vlib_main_t * vm,
 
          if (! lookup_for_responses_to_locally_received_packets)
            {
-             mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
+             mtrie0 = &ip4_fib_get( fib_index0)->mtrie;
 
              leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
 
@@ -1003,50 +319,39 @@ ip4_lookup_inline (vlib_main_t * vm,
            leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
 
          if (lookup_for_responses_to_locally_received_packets)
-           adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
+           lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
          else
            {
              /* Handle default route. */
              leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
-             adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+             lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
            }
 
-         ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0,
-                                                          dst_addr0,
-                                                          /* no_default_route */ 0));
-
-         adj0 = ip_get_adjacency (lm, adj_index0);
-
-         next0 = adj0->lookup_next_index;
+         lb0 = load_balance_get (lbi0);
 
          /* Use flow hash to compute multipath adjacency. */
           hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
-          if (PREDICT_FALSE(adj0->n_adj > 1))
+          if (PREDICT_FALSE(lb0->lb_n_buckets > 1))
             {
-              flow_hash_config0 = 
-                vec_elt_at_index (im->fibs, fib_index0)->flow_hash_config;
+             flow_hash_config0 = lb0->lb_hash_config;
 
               hash_c0 = vnet_buffer (p0)->ip.flow_hash = 
                 ip4_compute_flow_hash (ip0, flow_hash_config0);
             }
 
-         ASSERT (adj0->n_adj > 0);
-         ASSERT (is_pow2 (adj0->n_adj));
-         adj_index0 += (hash_c0 & (adj0->n_adj - 1));
+         ASSERT (lb0->lb_n_buckets > 0);
+         ASSERT (is_pow2 (lb0->lb_n_buckets));
 
-         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
+         dpo0 = load_balance_get_bucket_i(lb0,
+                                           (hash_c0 &
+                                            (lb0->lb_n_buckets_minus_1)));
 
-          if (is_indirect)
-            {
-              /* ARP for next-hop not packet's destination address */
-              if (adj0->lookup_next_index == IP_LOOKUP_NEXT_ARP)
-                ip0->dst_address.as_u32 = dst_addr0->as_u32;
-            }
+         next0 = dpo0->dpoi_next_node;
+         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
 
-          vlib_increment_combined_counter 
-              (cm, cpu_index, adj_index0, 1,
-               vlib_buffer_length_in_chain (vm, p0)
-               + sizeof(ethernet_header_t));
+         vlib_increment_combined_counter 
+              (cm, cpu_index, lbi0, 1,
+               vlib_buffer_length_in_chain (vm, p0));
 
          from += 1;
          to_next += 1;
@@ -1113,55 +418,135 @@ ip4_lookup (vlib_main_t * vm,
            vlib_frame_t * frame)
 {
   return ip4_lookup_inline (vm, node, frame,
-                           /* lookup_for_responses_to_locally_received_packets */ 0,
-                           /* is_indirect */ 0);
+                           /* lookup_for_responses_to_locally_received_packets */ 0);
 
 }
 
-void ip4_adjacency_set_interface_route (vnet_main_t * vnm,
-                                       ip_adjacency_t * adj,
-                                       u32 sw_if_index,
-                                       u32 if_address_index)
+static u8 * format_ip4_lookup_trace (u8 * s, va_list * args);
+
+VLIB_REGISTER_NODE (ip4_lookup_node) = {
+  .function = ip4_lookup,
+  .name = "ip4-lookup",
+  .vector_size = sizeof (u32),
+
+  .format_trace = format_ip4_lookup_trace,
+  .n_next_nodes = IP_LOOKUP_N_NEXT,
+  .next_nodes = IP4_LOOKUP_NEXT_NODES,
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (ip4_lookup_node, ip4_lookup)
+
+always_inline uword
+ip4_load_balance (vlib_main_t * vm,
+                 vlib_node_runtime_t * node,
+                 vlib_frame_t * frame)
 {
-  vnet_hw_interface_t * hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
-  ip_lookup_next_t n;
-  vnet_l3_packet_type_t packet_type;
-  u32 node_index;
+  vlib_combined_counter_main_t * cm = &load_balance_main.lbm_via_counters;
+  u32 n_left_from, n_left_to_next, * from, * to_next;
+  ip_lookup_next_t next;
+  u32 cpu_index = os_get_cpu_number();
 
-  if (hw->hw_class_index == ethernet_hw_interface_class.index
-      || hw->hw_class_index == srp_hw_interface_class.index)
-    {
-      /* 
-       * We have a bit of a problem in this case. ip4-arp uses
-       * the rewrite_header.next_index to hand pkts to the
-       * indicated inteface output node. We can end up in
-       * ip4_rewrite_local, too, which also pays attention to 
-       * rewrite_header.next index. Net result: a hack in
-       * ip4_rewrite_local...
-       */
-      n = IP_LOOKUP_NEXT_ARP;
-      node_index = ip4_arp_node.index;
-      adj->if_address_index = if_address_index;
-      adj->arp.next_hop.ip4.as_u32 = 0;
-      ip46_address_reset(&adj->arp.next_hop);
-      packet_type = VNET_L3_PACKET_TYPE_ARP;
-    }
-  else
+  from = vlib_frame_vector_args (frame);
+  n_left_from = frame->n_vectors;
+  next = node->cached_next_index;
+
+  if (node->flags & VLIB_NODE_FLAG_TRACE)
+      ip4_forward_next_trace(vm, node, frame, VLIB_TX);
+
+  while (n_left_from > 0)
     {
-      n = IP_LOOKUP_NEXT_REWRITE;
-      node_index = ip4_rewrite_node.index;
-      packet_type = VNET_L3_PACKET_TYPE_IP4;
+      vlib_get_next_frame (vm, node, next,
+                          to_next, n_left_to_next);
+
+    
+      while (n_left_from > 0 && n_left_to_next > 0)
+       {
+         ip_lookup_next_t next0;
+         const load_balance_t *lb0;
+         vlib_buffer_t * p0;
+         u32 pi0, lbi0, hc0;
+         const ip4_header_t *ip0;
+         const dpo_id_t *dpo0;
+
+         pi0 = from[0];
+         to_next[0] = pi0;
+
+         p0 = vlib_get_buffer (vm, pi0);
+
+         ip0 = vlib_buffer_get_current (p0);
+         lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
+
+         lb0 = load_balance_get(lbi0);
+         hc0 = lb0->lb_hash_config;
+         vnet_buffer(p0)->ip.flow_hash = ip4_compute_flow_hash(ip0, hc0);
+
+         dpo0 = load_balance_get_bucket_i(lb0, 
+                                          vnet_buffer(p0)->ip.flow_hash &
+                                          (lb0->lb_n_buckets_minus_1));
+
+         next0 = dpo0->dpoi_next_node;
+         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+
+         vlib_increment_combined_counter 
+              (cm, cpu_index, lbi0, 1,
+               vlib_buffer_length_in_chain (vm, p0));
+
+         from += 1;
+         to_next += 1;
+         n_left_to_next -= 1;
+         n_left_from -= 1;
+
+         if (PREDICT_FALSE (next0 != next))
+           {
+             n_left_to_next += 1;
+             vlib_put_next_frame (vm, node, next, n_left_to_next);
+             next = next0;
+             vlib_get_next_frame (vm, node, next,
+                                  to_next, n_left_to_next);
+             to_next[0] = pi0;
+             to_next += 1;
+             n_left_to_next -= 1;
+           }
+       }
+
+      vlib_put_next_frame (vm, node, next, n_left_to_next);
     }
 
-  adj->lookup_next_index = n;
-  vnet_rewrite_for_sw_interface
-    (vnm,
-     packet_type,
-     sw_if_index,
-     node_index,
-     VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST,
-     &adj->rewrite_header,
-     sizeof (adj->rewrite_data));
+  return frame->n_vectors;
+}
+
+static u8 * format_ip4_forward_next_trace (u8 * s, va_list * args);
+
+VLIB_REGISTER_NODE (ip4_load_balance_node) = {
+  .function = ip4_load_balance,
+  .name = "ip4-load-balance",
+  .vector_size = sizeof (u32),
+  .sibling_of = "ip4-lookup",
+
+  .format_trace = format_ip4_forward_next_trace,
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (ip4_load_balance_node, ip4_load_balance)
+
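+/* Return the first IPv4 address found on sw_if_index (honoring unnumbered), or 0 if none; if result_ia is non-NULL it receives the matching ip_interface_address_t, or 0. */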
+ip4_address_t *
+ip4_interface_first_address (ip4_main_t * im, u32 sw_if_index,
+                             ip_interface_address_t ** result_ia)
+{
+  ip_lookup_main_t * lm = &im->lookup_main;
+  ip_interface_address_t * ia = 0;
+  ip4_address_t * result = 0;
+
+  foreach_ip_interface_address (lm, ia, sw_if_index, 
+                                1 /* honor unnumbered */,
+  ({
+    ip4_address_t * a = ip_interface_address_get_address (lm, ia);
+    result = a;
+    break;
+  }));
+  if (result_ia)
+    *result_ia = result ? ia : 0;
+  return result;
 }
 
 static void
@@ -1169,115 +554,160 @@ ip4_add_interface_routes (u32 sw_if_index,
                          ip4_main_t * im, u32 fib_index,
                          ip_interface_address_t * a)
 {
-  vnet_main_t * vnm = vnet_get_main();
   ip_lookup_main_t * lm = &im->lookup_main;
-  ip_adjacency_t * adj;
   ip4_address_t * address = ip_interface_address_get_address (lm, a);
-  ip4_add_del_route_args_t x;
-  vnet_hw_interface_t * hw_if = vnet_get_sup_hw_interface (vnm, sw_if_index);
-  u32 classify_table_index;
-
-  /* Add e.g. 1.0.0.0/8 as interface route (arp for Ethernet). */
-  x.table_index_or_table_id = fib_index;
-  x.flags = (IP4_ROUTE_FLAG_ADD
-            | IP4_ROUTE_FLAG_FIB_INDEX
-            | IP4_ROUTE_FLAG_NO_REDISTRIBUTE);
-  x.dst_address = address[0];
-  x.dst_address_length = a->address_length;
-  x.n_add_adj = 0;
-  x.add_adj = 0;
+  fib_prefix_t pfx = {
+      .fp_len = a->address_length,
+      .fp_proto = FIB_PROTOCOL_IP4,
+      .fp_addr.ip4 = *address,
+  };
 
   a->neighbor_probe_adj_index = ~0;
-  if (a->address_length < 32)
+
+  if (pfx.fp_len < 32)
+  {
+      fib_node_index_t fei;
+
+      fei = fib_table_entry_update_one_path(fib_index,
+                                           &pfx,
+                                           FIB_SOURCE_INTERFACE,
+                                           (FIB_ENTRY_FLAG_CONNECTED |
+                                            FIB_ENTRY_FLAG_ATTACHED),
+                                           FIB_PROTOCOL_IP4,
+                                           NULL, /* No next-hop address */
+                                           sw_if_index,
+                                           ~0, // invalid FIB index
+                                           1,
+                                           MPLS_LABEL_INVALID,
+                                           FIB_ROUTE_PATH_FLAG_NONE);
+      a->neighbor_probe_adj_index = fib_entry_get_adj(fei);
+  }
+
+  pfx.fp_len = 32;
+
+  if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index))
+  {
+      u32 classify_table_index =
+         lm->classify_table_index_by_sw_if_index [sw_if_index];
+      if (classify_table_index != (u32) ~0)
+      {
+          dpo_id_t dpo = DPO_NULL;
+
+          dpo_set(&dpo,
+                  DPO_CLASSIFY,
+                  DPO_PROTO_IP4,
+                  classify_dpo_create(FIB_PROTOCOL_IP4,
+                                      classify_table_index));
+
+         fib_table_entry_special_dpo_add(fib_index,
+                                          &pfx,
+                                          FIB_SOURCE_CLASSIFY,
+                                          FIB_ENTRY_FLAG_NONE,
+                                          &dpo);
+          dpo_reset(&dpo);
+      }
+  }
+
+  fib_table_entry_update_one_path(fib_index,
+                                 &pfx,
+                                 FIB_SOURCE_INTERFACE,
+                                 (FIB_ENTRY_FLAG_CONNECTED |
+                                  FIB_ENTRY_FLAG_LOCAL),
+                                 FIB_PROTOCOL_IP4,
+                                 &pfx.fp_addr,
+                                 sw_if_index,
+                                 ~0, // invalid FIB index
+                                 1,
+                                 MPLS_LABEL_INVALID,
+                                 FIB_ROUTE_PATH_FLAG_NONE);
+}
+
+static void
+ip4_del_interface_routes (ip4_main_t * im,
+                         u32 fib_index,
+                         ip4_address_t * address,
+                         u32 address_length)
+{
+    fib_prefix_t pfx = {
+       .fp_len = address_length,
+       .fp_proto = FIB_PROTOCOL_IP4,
+       .fp_addr.ip4 = *address,
+    };
+
+    if (pfx.fp_len < 32)
     {
-      adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
-                             &x.adj_index);
-      ip4_adjacency_set_interface_route (vnm, adj, sw_if_index, a - lm->if_address_pool);
-      ip_call_add_del_adjacency_callbacks (lm, x.adj_index, /* is_del */ 0);
-      ip4_add_del_route (im, &x);
-      a->neighbor_probe_adj_index = x.adj_index;
+       fib_table_entry_delete(fib_index,
+                              &pfx,
+                              FIB_SOURCE_INTERFACE);
     }
-  
-  /* Add e.g. 1.1.1.1/32 as local to this host. */
-  adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
-                         &x.adj_index);
-  
-  classify_table_index = ~0;
-  if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index))
-    classify_table_index = lm->classify_table_index_by_sw_if_index [sw_if_index];
-  if (classify_table_index != (u32) ~0)
+
+    pfx.fp_len = 32;
+    fib_table_entry_delete(fib_index,
+                          &pfx,
+                          FIB_SOURCE_INTERFACE);
+}
+
+void
+ip4_sw_interface_enable_disable (u32 sw_if_index,
+                                u32 is_enable)
+{
+  vlib_main_t * vm = vlib_get_main();
+  ip4_main_t * im = &ip4_main;
+  ip_lookup_main_t * lm = &im->lookup_main;
+  u32 ci, cast;
+  u32 lookup_feature_index;
+
+  vec_validate_init_empty (im->ip_enabled_by_sw_if_index, sw_if_index, 0);
+
+  /*
+   * enable/disable only on the 1<->0 transition
+   */
+  if (is_enable)
     {
-      adj->lookup_next_index = IP_LOOKUP_NEXT_CLASSIFY;
-      adj->classify.table_index = classify_table_index;
+      if (1 != ++im->ip_enabled_by_sw_if_index[sw_if_index])
+        return;
     }
   else
-    adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
-  
-  adj->if_address_index = a - lm->if_address_pool;
-  adj->rewrite_header.sw_if_index = sw_if_index;
-  adj->rewrite_header.max_l3_packet_bytes = hw_if->max_l3_packet_bytes[VLIB_RX];
-  /* 
-   * Local adjs are never to be rewritten. Spoofed pkts w/ src = dst = local
-   * fail an RPF-ish check, but still go thru the rewrite code...
-   */
-  adj->rewrite_header.data_bytes = 0;
+    {
+      ASSERT(im->ip_enabled_by_sw_if_index[sw_if_index] > 0);
+      if (0 != --im->ip_enabled_by_sw_if_index[sw_if_index])
+        return;
+    }
+
+  for (cast = 0; cast <= VNET_IP_RX_MULTICAST_FEAT; cast++)
+    {
+      ip_config_main_t * cm = &lm->feature_config_mains[cast];
+      vnet_config_main_t * vcm = &cm->config_main;
+
+      vec_validate_init_empty (cm->config_index_by_sw_if_index, sw_if_index, ~0);
+      ci = cm->config_index_by_sw_if_index[sw_if_index];
 
-  ip_call_add_del_adjacency_callbacks (lm, x.adj_index, /* is_del */ 0);
-  x.dst_address_length = 32;
-  ip4_add_del_route (im, &x);
-}
+      if (cast == VNET_IP_RX_UNICAST_FEAT)
+       lookup_feature_index = im->ip4_unicast_rx_feature_lookup;
+      else
+       lookup_feature_index = im->ip4_multicast_rx_feature_lookup;
 
-static void
-ip4_del_interface_routes (ip4_main_t * im, u32 fib_index, ip4_address_t * address, u32 address_length)
-{
-  ip4_add_del_route_args_t x;
-
-  /* Add e.g. 1.0.0.0/8 as interface route (arp for Ethernet). */
-  x.table_index_or_table_id = fib_index;
-  x.flags = (IP4_ROUTE_FLAG_DEL
-            | IP4_ROUTE_FLAG_FIB_INDEX
-            | IP4_ROUTE_FLAG_NO_REDISTRIBUTE);
-  x.dst_address = address[0];
-  x.dst_address_length = address_length;
-  x.adj_index = ~0;
-  x.n_add_adj = 0;
-  x.add_adj = 0;
-
-  if (address_length < 32)
-    ip4_add_del_route (im, &x);
-
-  x.dst_address_length = 32;
-  ip4_add_del_route (im, &x);
-
-  ip4_delete_matching_routes (im,
-                             fib_index,
-                             IP4_ROUTE_FLAG_FIB_INDEX,
-                             address,
-                             address_length);
+      if (is_enable)
+       ci = vnet_config_add_feature (vm, vcm,
+                                     ci,
+                                     lookup_feature_index,
+                                     /* config data */ 0,
+                                     /* # bytes of config data */ 0);
+      else
+       ci = vnet_config_del_feature (vm, vcm,
+                                     ci,
+                                     lookup_feature_index,
+                                     /* config data */ 0,
+                                     /* # bytes of config data */ 0);
+      cm->config_index_by_sw_if_index[sw_if_index] = ci;
+    }
 }
 
-typedef struct {
-    u32 sw_if_index;
-    ip4_address_t address;
-    u32 length;
-} ip4_interface_address_t;
-
-static clib_error_t *
-ip4_add_del_interface_address_internal (vlib_main_t * vm,
-                                       u32 sw_if_index,
-                                       ip4_address_t * new_address,
-                                       u32 new_length,
-                                       u32 redistribute,
-                                       u32 insert_routes,
-                                       u32 is_del);
-
 static clib_error_t *
 ip4_add_del_interface_address_internal (vlib_main_t * vm,
                                        u32 sw_if_index,
                                        ip4_address_t * address,
                                        u32 address_length,
-                                       u32 redistribute,
-                                       u32 insert_routes,
                                        u32 is_del)
 {
   vnet_main_t * vnm = vnet_get_main();
@@ -1292,9 +722,15 @@ ip4_add_del_interface_address_internal (vlib_main_t * vm,
                     vec_elt (im->fib_index_by_sw_if_index, sw_if_index));
   vec_add1 (addr_fib, ip4_af);
 
-  /* When adding an address check that it does not conflict with an existing address. */
+  /* FIXME-LATER
+   * there is no support for adj-fib handling in the presence of overlapping
+   * subnets on interfaces. Easy fix - disallow overlapping subnets, like
+   * most routers do.
+   */
   if (! is_del)
     {
+      /* When adding an address check that it does not conflict
+        with an existing address. */
       ip_interface_address_t * ia;
       foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index, 
                                     0 /* honor unnumbered */,
@@ -1307,7 +743,7 @@ ip4_add_del_interface_address_internal (vlib_main_t * vm,
                                    format_ip4_address_and_length, address, address_length,
                                    format_ip4_address_and_length, x, ia->address_length,
                                    format_vnet_sw_if_index_name, vnm, sw_if_index);
-      }));
+       }));
     }
 
   elts_before = pool_elts (lm->if_address_pool);
@@ -1322,18 +758,16 @@ ip4_add_del_interface_address_internal (vlib_main_t * vm,
   if (error)
     goto done;
   
-  if (vnet_sw_interface_is_admin_up (vnm, sw_if_index) && insert_routes)
-    {
-      if (is_del)
-       ip4_del_interface_routes (im, ip4_af.fib_index, address,
-                                 address_length);
-      
-      else
-          ip4_add_interface_routes (sw_if_index,
-                                    im, ip4_af.fib_index,
-                                    pool_elt_at_index 
-                                    (lm->if_address_pool, if_address_index));
-    }
+  ip4_sw_interface_enable_disable(sw_if_index, !is_del);
+
+  if (is_del)
+      ip4_del_interface_routes (im, ip4_af.fib_index, address,
+                               address_length);
+  else
+      ip4_add_interface_routes (sw_if_index,
+                               im, ip4_af.fib_index,
+                               pool_elt_at_index 
+                               (lm->if_address_pool, if_address_index));
 
   /* If pool did not grow/shrink: add duplicate address. */
   if (elts_before != pool_elts (lm->if_address_pool))
@@ -1358,48 +792,9 @@ ip4_add_del_interface_address (vlib_main_t * vm, u32 sw_if_index,
 {
   return ip4_add_del_interface_address_internal
     (vm, sw_if_index, address, address_length,
-     /* redistribute */ 1,
-     /* insert_routes */ 1,
      is_del);
 }
 
-static clib_error_t *
-ip4_sw_interface_admin_up_down (vnet_main_t * vnm,
-                               u32 sw_if_index,
-                               u32 flags)
-{
-  ip4_main_t * im = &ip4_main;
-  ip_interface_address_t * ia;
-  ip4_address_t * a;
-  u32 is_admin_up, fib_index;
-  
-  /* Fill in lookup tables with default table (0). */
-  vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
-  
-  vec_validate_init_empty (im->lookup_main.if_address_pool_index_by_sw_if_index, sw_if_index, ~0);
-  
-  is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
-  
-  fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index);
-
-  foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index, 
-                                0 /* honor unnumbered */,
-  ({
-    a = ip_interface_address_get_address (&im->lookup_main, ia);
-    if (is_admin_up)
-      ip4_add_interface_routes (sw_if_index,
-                               im, fib_index,
-                               ia);
-    else
-      ip4_del_interface_routes (im, fib_index,
-                               a, ia->address_length);
-  }));
-
-  return 0;
-}
-VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ip4_sw_interface_admin_up_down);
-
 /* Built-in ip4 unicast rx feature path definition */
 VNET_IP4_UNICAST_FEATURE_INIT (ip4_inacl, static) = {
   .node_name = "ip4-inacl", 
@@ -1449,10 +844,17 @@ VNET_IP4_UNICAST_FEATURE_INIT (ip4_vpath, static) = {
 
 VNET_IP4_UNICAST_FEATURE_INIT (ip4_lookup, static) = {
   .node_name = "ip4-lookup",
-  .runs_before = 0, /* not before any other features */
+  .runs_before = ORDER_CONSTRAINTS {"ip4-drop", 0},
   .feature_index = &ip4_main.ip4_unicast_rx_feature_lookup,
 };
 
+VNET_IP4_UNICAST_FEATURE_INIT (ip4_drop, static) = {
+  .node_name = "ip4-drop",
+  .runs_before = 0, /* not before any other features */
+  .feature_index = &ip4_main.ip4_unicast_rx_feature_drop,
+};
+
+
 /* Built-in ip4 multicast rx feature path definition */
 VNET_IP4_MULTICAST_FEATURE_INIT (ip4_vpath_mc, static) = {
   .node_name = "vpath-input-ip4",
@@ -1462,10 +864,16 @@ VNET_IP4_MULTICAST_FEATURE_INIT (ip4_vpath_mc, static) = {
 
 VNET_IP4_MULTICAST_FEATURE_INIT (ip4_lookup_mc, static) = {
   .node_name = "ip4-lookup-multicast",
-  .runs_before = 0, /* not before any other features */
+  .runs_before = ORDER_CONSTRAINTS {"ip4-drop", 0},
   .feature_index = &ip4_main.ip4_multicast_rx_feature_lookup,
 };
 
+VNET_IP4_MULTICAST_FEATURE_INIT (ip4_mc_drop, static) = {
+  .node_name = "ip4-drop",
+  .runs_before = 0, /* last feature */
+  .feature_index = &ip4_main.ip4_multicast_rx_feature_drop,
+};
+
 static char * rx_feature_start_nodes[] = 
   { "ip4-input", "ip4-input-no-checksum"};
 
@@ -1488,7 +896,6 @@ VNET_IP4_TX_FEATURE_INIT (interface_output, static) = {
   .feature_index = &ip4_main.ip4_tx_feature_interface_output,
 };
 
-
 static clib_error_t *
 ip4_feature_init (vlib_main_t * vm, ip4_main_t * im)
 {
@@ -1520,7 +927,7 @@ ip4_feature_init (vlib_main_t * vm, ip4_main_t * im)
                                          feature_start_nodes,
                                          feature_start_len,
                                          cast,
-                                         1 /* is_ip4 */)))
+                                         VNET_L3_PACKET_TYPE_IP4)))
         return error;
     }
 
@@ -1538,6 +945,9 @@ ip4_sw_interface_add_del (vnet_main_t * vnm,
   u32 ci, cast;
   u32 feature_index;
 
+  /* Fill in lookup tables with default table (0). */
+  vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
+
   for (cast = 0; cast < VNET_N_IP_FEAT; cast++)
     {
       ip_config_main_t * cm = &lm->feature_config_mains[cast];
@@ -1547,9 +957,9 @@ ip4_sw_interface_add_del (vnet_main_t * vnm,
       ci = cm->config_index_by_sw_if_index[sw_if_index];
 
       if (cast == VNET_IP_RX_UNICAST_FEAT)
-        feature_index = im->ip4_unicast_rx_feature_lookup;
+        feature_index = im->ip4_unicast_rx_feature_drop;
       else if (cast == VNET_IP_RX_MULTICAST_FEAT)
-        feature_index = im->ip4_multicast_rx_feature_lookup;
+        feature_index = im->ip4_multicast_rx_feature_drop;
       else
         feature_index = im->ip4_tx_feature_interface_output;
 
@@ -1560,14 +970,16 @@ ip4_sw_interface_add_del (vnet_main_t * vnm,
                                      /* config data */ 0,
                                      /* # bytes of config data */ 0);
       else
-       ci = vnet_config_del_feature (vm, vcm,
-                                     ci,
-                                      feature_index,
-                                     /* config data */ 0,
-                                     /* # bytes of config data */ 0);
-
+        {
+          ci = vnet_config_del_feature (vm, vcm, ci,
+                                        feature_index,
+                                        /* config data */ 0,
+                                        /* # bytes of config data */ 0);
+          if (vec_len(im->ip_enabled_by_sw_if_index) > sw_if_index)
+              im->ip_enabled_by_sw_if_index[sw_if_index] = 0;
+        }
       cm->config_index_by_sw_if_index[sw_if_index] = ci;
-      /* 
+      /*
        * note: do not update the tx feature count here.
        */
     }
@@ -1577,44 +989,6 @@ ip4_sw_interface_add_del (vnet_main_t * vnm,
 
 VNET_SW_INTERFACE_ADD_DEL_FUNCTION (ip4_sw_interface_add_del);
 
-static u8 * format_ip4_lookup_trace (u8 * s, va_list * args);
-
-VLIB_REGISTER_NODE (ip4_lookup_node) = {
-  .function = ip4_lookup,
-  .name = "ip4-lookup",
-  .vector_size = sizeof (u32),
-
-  .format_trace = format_ip4_lookup_trace,
-
-  .n_next_nodes = IP4_LOOKUP_N_NEXT,
-  .next_nodes = IP4_LOOKUP_NEXT_NODES,
-};
-
-VLIB_NODE_FUNCTION_MULTIARCH (ip4_lookup_node, ip4_lookup);
-
-static uword
-ip4_indirect (vlib_main_t * vm,
-               vlib_node_runtime_t * node,
-               vlib_frame_t * frame)
-{
-  return ip4_lookup_inline (vm, node, frame,
-                           /* lookup_for_responses_to_locally_received_packets */ 0,
-                           /* is_indirect */ 1);
-}
-
-VLIB_REGISTER_NODE (ip4_indirect_node) = {
-  .function = ip4_indirect,
-  .name = "ip4-indirect",
-  .vector_size = sizeof (u32),
-  .sibling_of = "ip4-lookup",
-  .format_trace = format_ip4_lookup_trace,
-
-  .n_next_nodes = 0,
-};
-
-VLIB_NODE_FUNCTION_MULTIARCH (ip4_indirect_node, ip4_indirect);
-
-
 /* Global IP4 main. */
 ip4_main_t ip4_main;
 
@@ -1636,11 +1010,11 @@ ip4_lookup_init (vlib_main_t * vm)
       im->fib_masks[i] = clib_host_to_net_u32 (m);
     }
 
-  /* Create FIB with index 0 and table id of 0. */
-  find_ip4_fib_by_table_index_or_id (im, /* table id */ 0, IP4_ROUTE_FLAG_TABLE_ID);
-
   ip_lookup_init (&im->lookup_main, /* is_ip6 */ 0);
 
+  /* Create FIB with index 0 and table id of 0. */
+  fib_table_find_or_create_and_lock(FIB_PROTOCOL_IP4, 0);
+
   {
     pg_node_t * pn;
     pn = pg_get_node (ip4_lookup_node.index);
@@ -1708,12 +1082,12 @@ static u8 * format_ip4_lookup_trace (u8 * s, va_list * args)
   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
   ip4_forward_next_trace_t * t = va_arg (*args, ip4_forward_next_trace_t *);
   vnet_main_t * vnm = vnet_get_main();
-  ip4_main_t * im = &ip4_main;
   uword indent = format_get_indent (s);
 
   s = format (s, "fib %d adj-idx %d : %U flow hash: 0x%08x",
               t->fib_index, t->adj_index, format_ip_adjacency,
-              vnm, &im->lookup_main, t->adj_index, t->flow_hash);
+              vnm, t->adj_index, FORMAT_IP_ADJACENCY_NONE, 
+             t->flow_hash);
   s = format (s, "\n%U%U",
               format_white_space, indent,
               format_ip4_header, t->packet_data);
@@ -1726,16 +1100,16 @@ static u8 * format_ip4_rewrite_trace (u8 * s, va_list * args)
   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
   ip4_forward_next_trace_t * t = va_arg (*args, ip4_forward_next_trace_t *);
   vnet_main_t * vnm = vnet_get_main();
-  ip4_main_t * im = &ip4_main;
   uword indent = format_get_indent (s);
 
   s = format (s, "tx_sw_if_index %d adj-idx %d : %U flow hash: 0x%08x",
               t->fib_index, t->adj_index, format_ip_adjacency,
-              vnm, &im->lookup_main, t->adj_index, t->flow_hash);
+              vnm, t->adj_index, FORMAT_IP_ADJACENCY_NONE,
+             t->flow_hash);
   s = format (s, "\n%U%U",
               format_white_space, indent,
               format_ip_adjacency_packet_data,
-              vnm, &im->lookup_main, t->adj_index,
+              vnm, t->adj_index,
               t->packet_data, sizeof (t->packet_data));
   return s;
 }
@@ -1863,12 +1237,6 @@ ip4_punt (vlib_main_t * vm,
          vlib_frame_t * frame)
 { return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_ADJACENCY_PUNT); }
 
-static uword
-ip4_miss (vlib_main_t * vm,
-         vlib_node_runtime_t * node,
-         vlib_frame_t * frame)
-{ return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_DST_LOOKUP_MISS); }
-
 VLIB_REGISTER_NODE (ip4_drop_node,static) = {
   .function = ip4_drop,
   .name = "ip4-drop",
@@ -1882,7 +1250,7 @@ VLIB_REGISTER_NODE (ip4_drop_node,static) = {
   },
 };
 
-VLIB_NODE_FUNCTION_MULTIARCH (ip4_drop_node, ip4_drop);
+VLIB_NODE_FUNCTION_MULTIARCH (ip4_drop_node, ip4_drop)
 
 VLIB_REGISTER_NODE (ip4_punt_node,static) = {
   .function = ip4_punt,
@@ -1897,22 +1265,7 @@ VLIB_REGISTER_NODE (ip4_punt_node,static) = {
   },
 };
 
-VLIB_NODE_FUNCTION_MULTIARCH (ip4_punt_node, ip4_punt);
-
-VLIB_REGISTER_NODE (ip4_miss_node,static) = {
-  .function = ip4_miss,
-  .name = "ip4-miss",
-  .vector_size = sizeof (u32),
-
-  .format_trace = format_ip4_forward_next_trace,
-
-  .n_next_nodes = 1,
-  .next_nodes = {
-    [0] = "error-drop",
-  },
-};
-
-VLIB_NODE_FUNCTION_MULTIARCH (ip4_miss_node, ip4_miss);
+VLIB_NODE_FUNCTION_MULTIARCH (ip4_punt_node, ip4_punt)
 
 /* Compute TCP/UDP/ICMP4 checksum in software. */
 u16
@@ -2009,26 +1362,27 @@ ip4_local (vlib_main_t * vm,
       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
 
       while (n_left_from >= 4 && n_left_to_next >= 2)
-       {
-         vlib_buffer_t * p0, * p1;
-         ip4_header_t * ip0, * ip1;
-         udp_header_t * udp0, * udp1;
-         ip4_fib_mtrie_t * mtrie0, * mtrie1;
-         ip4_fib_mtrie_leaf_t leaf0, leaf1;
-         ip_adjacency_t * adj0, * adj1;
-         u32 pi0, ip_len0, udp_len0, flags0, next0, fib_index0, adj_index0;
-         u32 pi1, ip_len1, udp_len1, flags1, next1, fib_index1, adj_index1;
-         i32 len_diff0, len_diff1;
-         u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0;
-         u8 error1, is_udp1, is_tcp_udp1, good_tcp_udp1, proto1;
-         u8 enqueue_code;
+       {
+         vlib_buffer_t * p0, * p1;
+         ip4_header_t * ip0, * ip1;
+         udp_header_t * udp0, * udp1;
+         ip4_fib_mtrie_t * mtrie0, * mtrie1;
+         ip4_fib_mtrie_leaf_t leaf0, leaf1;
+         const dpo_id_t *dpo0, *dpo1;
+         const load_balance_t *lb0, *lb1;
+         u32 pi0, ip_len0, udp_len0, flags0, next0, fib_index0, lbi0;
+         u32 pi1, ip_len1, udp_len1, flags1, next1, fib_index1, lbi1;
+         i32 len_diff0, len_diff1;
+         u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0;
+         u8 error1, is_udp1, is_tcp_udp1, good_tcp_udp1, proto1;
+         u8 enqueue_code;
       
-         pi0 = to_next[0] = from[0];
-         pi1 = to_next[1] = from[1];
-         from += 2;
-         n_left_from -= 2;
-         to_next += 2;
-         n_left_to_next -= 2;
+         pi0 = to_next[0] = from[0];
+         pi1 = to_next[1] = from[1];
+         from += 2;
+         n_left_from -= 2;
+         to_next += 2;
+         n_left_to_next -= 2;
       
          p0 = vlib_get_buffer (vm, pi0);
          p1 = vlib_get_buffer (vm, pi1);
@@ -2041,8 +1395,8 @@ ip4_local (vlib_main_t * vm,
          fib_index1 = vec_elt (im->fib_index_by_sw_if_index, 
                                 vnet_buffer(p1)->sw_if_index[VLIB_RX]);
 
-         mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
-         mtrie1 = &vec_elt_at_index (im->fibs, fib_index1)->mtrie;
+         mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
+         mtrie1 = &ip4_fib_get (fib_index1)->mtrie;
 
          leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
 
@@ -2130,41 +1484,42 @@ ip4_local (vlib_main_t * vm,
 
          leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
          leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 3);
+         leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
+         leaf1 = (leaf1 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie1->default_leaf : leaf1);
 
-         vnet_buffer (p0)->ip.adj_index[VLIB_RX] = adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
-          vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
+         vnet_buffer (p0)->ip.adj_index[VLIB_RX] = lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+          vnet_buffer (p0)->ip.adj_index[VLIB_TX] = lbi0;
 
-         vnet_buffer (p1)->ip.adj_index[VLIB_RX] = adj_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
-          vnet_buffer (p1)->ip.adj_index[VLIB_TX] = adj_index1;
+         vnet_buffer (p1)->ip.adj_index[VLIB_RX] = lbi1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
+          vnet_buffer (p1)->ip.adj_index[VLIB_TX] = lbi1;
 
-         ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0,
-                                                          &ip0->src_address,
-                                                          /* no_default_route */ 1));
-         ASSERT (adj_index1 == ip4_fib_lookup_with_table (im, fib_index1,
-                                                          &ip1->src_address,
-                                                          /* no_default_route */ 1));
-
-         adj0 = ip_get_adjacency (lm, adj_index0);
-         adj1 = ip_get_adjacency (lm, adj_index1);
+         lb0 = load_balance_get(lbi0);
+         lb1 = load_balance_get(lbi1);
+         dpo0 = load_balance_get_bucket_i(lb0, 0);
+         dpo1 = load_balance_get_bucket_i(lb1, 0);
 
          /* 
            * Must have a route to source otherwise we drop the packet.
            * ip4 broadcasts are accepted, e.g. to make dhcp client work
            */
          error0 = (error0 == IP4_ERROR_UNKNOWN_PROTOCOL
-                   && adj0->lookup_next_index != IP_LOOKUP_NEXT_REWRITE
-                   && adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP
-                   && adj0->lookup_next_index != IP_LOOKUP_NEXT_LOCAL
+                   && dpo0->dpoi_type != DPO_ADJACENCY
+                   && dpo0->dpoi_type != DPO_ADJACENCY_INCOMPLETE
                    && ip0->dst_address.as_u32 != 0xFFFFFFFF
                    ? IP4_ERROR_SRC_LOOKUP_MISS
                    : error0);
+          error0 = (dpo0->dpoi_type == DPO_RECEIVE ?
+                    IP4_ERROR_SPOOFED_LOCAL_PACKETS : 
+                    error0);
          error1 = (error1 == IP4_ERROR_UNKNOWN_PROTOCOL
-                   && adj1->lookup_next_index != IP_LOOKUP_NEXT_REWRITE
-                   && adj1->lookup_next_index != IP_LOOKUP_NEXT_ARP
-                   && adj1->lookup_next_index != IP_LOOKUP_NEXT_LOCAL
-                   && ip0->dst_address.as_u32 != 0xFFFFFFFF
+                   && dpo1->dpoi_type != DPO_ADJACENCY
+                   && dpo1->dpoi_type != DPO_ADJACENCY_INCOMPLETE
+                   && ip1->dst_address.as_u32 != 0xFFFFFFFF
                    ? IP4_ERROR_SRC_LOOKUP_MISS
                    : error1);
+          /* Packet 1 originated from a local address => spoofing */
+          error1 = (dpo1->dpoi_type == DPO_RECEIVE ?
+                    IP4_ERROR_SPOOFED_LOCAL_PACKETS : error1);
 
          next0 = lm->local_next_by_ip_protocol[proto0];
          next1 = lm->local_next_by_ip_protocol[proto1];
@@ -2220,11 +1575,12 @@ ip4_local (vlib_main_t * vm,
          udp_header_t * udp0;
          ip4_fib_mtrie_t * mtrie0;
          ip4_fib_mtrie_leaf_t leaf0;
-         ip_adjacency_t * adj0;
-         u32 pi0, next0, ip_len0, udp_len0, flags0, fib_index0, adj_index0;
+         u32 pi0, next0, ip_len0, udp_len0, flags0, fib_index0, lbi0;
          i32 len_diff0;
          u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0;
-      
+         load_balance_t *lb0;
+         const dpo_id_t *dpo0;
+
          pi0 = to_next[0] = from[0];
          from += 1;
          n_left_from -= 1;
@@ -2238,7 +1594,7 @@ ip4_local (vlib_main_t * vm,
          fib_index0 = vec_elt (im->fib_index_by_sw_if_index, 
                                 vnet_buffer(p0)->sw_if_index[VLIB_RX]);
 
-         mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
+         mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
 
          leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
 
@@ -2296,24 +1652,30 @@ ip4_local (vlib_main_t * vm,
                    : error0);
 
          leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
+         leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
 
-         vnet_buffer (p0)->ip.adj_index[VLIB_RX] = adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
-          vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
+         lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+          vnet_buffer (p0)->ip.adj_index[VLIB_TX] = lbi0;
 
-         ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0,
-                                                          &ip0->src_address,
-                                                          /* no_default_route */ 1));
+         lb0 = load_balance_get(lbi0);
+         dpo0 = load_balance_get_bucket_i(lb0, 0);
 
-         adj0 = ip_get_adjacency (lm, adj_index0);
+         vnet_buffer (p0)->ip.adj_index[VLIB_TX] =
+             vnet_buffer (p0)->ip.adj_index[VLIB_RX] =
+                 dpo0->dpoi_index;
 
          /* Must have a route to source otherwise we drop the packet. */
          error0 = (error0 == IP4_ERROR_UNKNOWN_PROTOCOL
-                   && adj0->lookup_next_index != IP_LOOKUP_NEXT_REWRITE
-                   && adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP
-                   && adj0->lookup_next_index != IP_LOOKUP_NEXT_LOCAL
+                   && dpo0->dpoi_type != DPO_ADJACENCY
+                   && dpo0->dpoi_type != DPO_ADJACENCY_INCOMPLETE
+                   && dpo0->dpoi_type != DPO_RECEIVE
                    && ip0->dst_address.as_u32 != 0xFFFFFFFF
                    ? IP4_ERROR_SRC_LOOKUP_MISS
                    : error0);
+          /* Packet originated from a local address => spoofing */
+          error0 = (dpo0->dpoi_type == DPO_RECEIVE ?
+                    IP4_ERROR_SPOOFED_LOCAL_PACKETS : 
+                    error0);
 
          next0 = lm->local_next_by_ip_protocol[proto0];
 
@@ -2356,7 +1718,7 @@ VLIB_REGISTER_NODE (ip4_local_node,static) = {
   },
 };
 
-VLIB_NODE_FUNCTION_MULTIARCH (ip4_local_node, ip4_local);
+VLIB_NODE_FUNCTION_MULTIARCH (ip4_local_node, ip4_local)
 
 void ip4_register_protocol (u32 protocol, u32 node_index)
 {
@@ -2394,10 +1756,11 @@ VLIB_CLI_COMMAND (show_ip_local, static) = {
   .short_help = "Show ip local protocol table",
 };
 
-static uword
-ip4_arp (vlib_main_t * vm,
-        vlib_node_runtime_t * node,
-        vlib_frame_t * frame)
+always_inline uword
+ip4_arp_inline (vlib_main_t * vm,
+               vlib_node_runtime_t * node,
+               vlib_frame_t * frame,
+               int is_glean)
 {
   vnet_main_t * vnm = vnet_get_main();
   ip4_main_t * im = &ip4_main;
@@ -2441,12 +1804,11 @@ ip4_arp (vlib_main_t * vm,
 
       while (n_left_from > 0 && n_left_to_next_drop > 0)
        {
+         u32 pi0, adj_index0, a0, b0, c0, m0, sw_if_index0, drop0;
+         ip_adjacency_t * adj0;
          vlib_buffer_t * p0;
          ip4_header_t * ip0;
-         ethernet_header_t * eh0;
-         u32 pi0, adj_index0, a0, b0, c0, m0, sw_if_index0, drop0;
          uword bm0;
-         ip_adjacency_t * adj0;
 
          pi0 = from[0];
 
@@ -2456,35 +1818,10 @@ ip4_arp (vlib_main_t * vm,
          adj0 = ip_get_adjacency (lm, adj_index0);
          ip0 = vlib_buffer_get_current (p0);
 
-         /* If packet destination is not local, send ARP to next hop */
-         if (adj0->arp.next_hop.ip4.as_u32)
-           ip0->dst_address.data_u32 = adj0->arp.next_hop.ip4.as_u32;
-
-         /* 
-          * if ip4_rewrite_local applied the IP_LOOKUP_NEXT_ARP
-          * rewrite to this packet, we need to skip it here.
-          * Note, to distinguish from src IP addr *.8.6.*, we
-          * check for a bcast eth dest instead of IPv4 version.
-           */
-         eh0 = (ethernet_header_t*)ip0;
-         if ((ip0->ip_version_and_header_length & 0xF0) != 0x40)
-            {
-              u32 vlan_num = 0;
-              u16 * etype = &eh0->type;
-              while ((*etype == clib_host_to_net_u16 (0x8100)) //dot1q 
-                  || (*etype == clib_host_to_net_u16 (0x88a8)))//dot1ad 
-                {
-                  vlan_num += 1;
-                  etype += 2; //vlan tag also 16 bits, same as etype
-                }
-              if (*etype == clib_host_to_net_u16 (0x0806))     //arp
-                {
-                  vlib_buffer_advance (
-                      p0, sizeof(ethernet_header_t) + (4*vlan_num));
-                  ip0 = vlib_buffer_get_current (p0);
-                }
-            }
-
+         /*
+          * Seed the hash. The address mixed in below is the packet's
+          * destination for glean, else the adjacency's next-hop.
+          */
          a0 = hash_seeds[0];
          b0 = hash_seeds[1];
          c0 = hash_seeds[2];
@@ -2492,7 +1829,14 @@ ip4_arp (vlib_main_t * vm,
          sw_if_index0 = adj0->rewrite_header.sw_if_index;
          vnet_buffer (p0)->sw_if_index[VLIB_TX] = sw_if_index0;
 
-         a0 ^= ip0->dst_address.data_u32;
+          if (is_glean)
+          {
+              a0 ^= ip0->dst_address.data_u32;
+          }
+          else
+          {
+              a0 ^= adj0->sub_type.nbr.next_hop.ip4.data_u32;
+          }
          b0 ^= sw_if_index0;
 
          hash_v3_finalize32 (a0, b0, c0);
@@ -2522,10 +1866,11 @@ ip4_arp (vlib_main_t * vm,
            * Can happen if the control-plane is programming tables
            * with traffic flowing; at least that's today's lame excuse.
            */
-          if (adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP) 
-            {
-              p0->error = node->errors[IP4_ARP_ERROR_NON_ARP_ADJ];
-            }
+         if ((is_glean && adj0->lookup_next_index != IP_LOOKUP_NEXT_GLEAN) ||
+             (!is_glean && adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP))
+         {
+           p0->error = node->errors[IP4_ARP_ERROR_NON_ARP_ADJ];
+         }
           else
          /* Send ARP request. */
          {
@@ -2545,15 +1890,32 @@ ip4_arp (vlib_main_t * vm,
            clib_memcpy (h0->ip4_over_ethernet[0].ethernet, hw_if0->hw_address,
                    sizeof (h0->ip4_over_ethernet[0].ethernet));
 
-           if (ip4_src_address_for_packet (im, p0, &h0->ip4_over_ethernet[0].ip4, sw_if_index0)) {
-               //No source address available
-               p0->error = node->errors[IP4_ARP_ERROR_NO_SOURCE_ADDRESS];
-               vlib_buffer_free(vm, &bi0, 1);
-               continue;
+           if (is_glean)
+           {
+               /* The interface's source address is stashed in the Glean Adj */
+               h0->ip4_over_ethernet[0].ip4 = adj0->sub_type.glean.receive_addr.ip4;
+
+               /* Copy in destination address we are requesting. This is the
+               * glean case, so it's the packet's destination.*/
+               h0->ip4_over_ethernet[1].ip4.data_u32 = ip0->dst_address.data_u32;
            }
+           else
+           {
+               /* Src IP address in ARP header. */
+               if (ip4_src_address_for_packet(lm, sw_if_index0,
+                                              &h0->ip4_over_ethernet[0].ip4))
+               {
+                   /* No source address available */
+                   p0->error = node->errors[IP4_ARP_ERROR_NO_SOURCE_ADDRESS];
+                   vlib_buffer_free(vm, &bi0, 1);
+                   continue;
+               }
 
-           /* Copy in destination address we are requesting. */
-           h0->ip4_over_ethernet[1].ip4.data_u32 = ip0->dst_address.data_u32;
+               /* Copy in destination address we are requesting from the
+                  incomplete adj */
+               h0->ip4_over_ethernet[1].ip4.data_u32 =
+                   adj0->sub_type.nbr.next_hop.ip4.as_u32;
+           }
 
            vlib_buffer_copy_trace_flag (vm, p0, bi0);
            b0 = vlib_get_buffer (vm, bi0);
@@ -2571,6 +1933,22 @@ ip4_arp (vlib_main_t * vm,
   return frame->n_vectors;
 }
 
+/* Node function for the non-glean case: ARP for the adjacency's next-hop. */
+static uword
+ip4_arp (vlib_main_t * vm, vlib_node_runtime_t * node,
+         vlib_frame_t * frame)
+{
+  return ip4_arp_inline (vm, node, frame, /* is_glean */ 0);
+}
+
+/* ip4-glean node function: ARP for the packet's own destination address. */
+static uword
+ip4_glean (vlib_main_t * vm, vlib_node_runtime_t * node,
+           vlib_frame_t * frame)
+{
+  return ip4_arp_inline (vm, node, frame, /* is_glean */ 1);
+}
+
 static char * ip4_arp_error_strings[] = {
   [IP4_ARP_ERROR_DROP] = "address overflow drops",
   [IP4_ARP_ERROR_REQUEST_SENT] = "ARP requests sent",
@@ -2596,6 +1974,22 @@ VLIB_REGISTER_NODE (ip4_arp_node) = {
   },
 };
 
+VLIB_REGISTER_NODE (ip4_glean_node) = {
+  .function = ip4_glean,
+  .name = "ip4-glean",
+  .vector_size = sizeof (u32),
+
+  .format_trace = format_ip4_forward_next_trace,
+
+  .n_errors = ARRAY_LEN (ip4_arp_error_strings),
+  .error_strings = ip4_arp_error_strings,
+
+  .n_next_nodes = IP4_ARP_N_NEXT,
+  .next_nodes = {
+    [IP4_ARP_NEXT_DROP] = "error-drop",
+  },
+};
+
 #define foreach_notrace_ip4_arp_error           \
 _(DROP)                                         \
 _(REQUEST_SENT)                                 \
@@ -2720,7 +2114,7 @@ ip4_rewrite_inline (vlib_main_t * vm,
          u32 pi1, rw_len1, next1, error1, checksum1, adj_index1;
           u32 next0_override, next1_override;
           u32 tx_sw_if_index0, tx_sw_if_index1;
-      
+
           if (rewrite_for_locally_received_packets)
               next0_override = next1_override = 0;
 
@@ -2818,21 +2212,9 @@ ip4_rewrite_inline (vlib_main_t * vm,
       
           if (rewrite_for_locally_received_packets)
             {
-              /*
-               * If someone sends e.g. an icmp4 w/ src = dst = interface addr,
-               * we end up here with a local adjacency in hand
-               * The local adj rewrite data is 0xfefe on purpose.
-               * Bad engineer, no donut for you.
-               */
-              if (PREDICT_FALSE(adj0->lookup_next_index 
-                                == IP_LOOKUP_NEXT_LOCAL))
-                error0 = IP4_ERROR_SPOOFED_LOCAL_PACKETS;
               if (PREDICT_FALSE(adj0->lookup_next_index
                                 == IP_LOOKUP_NEXT_ARP))
                 next0_override = IP4_REWRITE_NEXT_ARP;
-              if (PREDICT_FALSE(adj1->lookup_next_index 
-                                == IP_LOOKUP_NEXT_LOCAL))
-                error1 = IP4_ERROR_SPOOFED_LOCAL_PACKETS;
               if (PREDICT_FALSE(adj1->lookup_next_index
                                 == IP_LOOKUP_NEXT_ARP))
                 next1_override = IP4_REWRITE_NEXT_ARP;
@@ -2869,14 +2251,14 @@ ip4_rewrite_inline (vlib_main_t * vm,
            */
           if (PREDICT_FALSE (rw_len0 > sizeof(ethernet_header_t)))
               vlib_increment_combined_counter 
-                  (&lm->adjacency_counters,
+                  (&adjacency_counters,
                    cpu_index, adj_index0, 
                    /* packet increment */ 0,
                    /* byte increment */ rw_len0-sizeof(ethernet_header_t));
 
           if (PREDICT_FALSE (rw_len1 > sizeof(ethernet_header_t)))
               vlib_increment_combined_counter 
-                  (&lm->adjacency_counters,
+                  (&adjacency_counters,
                    cpu_index, adj_index1, 
                    /* packet increment */ 0,
                    /* byte increment */ rw_len1-sizeof(ethernet_header_t));
@@ -2945,7 +2327,7 @@ ip4_rewrite_inline (vlib_main_t * vm,
          u32 pi0, rw_len0, adj_index0, next0, error0, checksum0;
           u32 next0_override;
           u32 tx_sw_if_index0;
-      
+
           if (rewrite_for_locally_received_packets)
               next0_override = 0;
 
@@ -3000,15 +2382,6 @@ ip4_rewrite_inline (vlib_main_t * vm,
 
           if (rewrite_for_locally_received_packets)
             {
-              /*
-               * If someone sends e.g. an icmp4 w/ src = dst = interface addr,
-               * we end up here with a local adjacency in hand
-               * The local adj rewrite data is 0xfefe on purpose.
-               * Bad engineer, no donut for you.
-               */
-              if (PREDICT_FALSE(adj0->lookup_next_index 
-                                == IP_LOOKUP_NEXT_LOCAL))
-                error0 = IP4_ERROR_SPOOFED_LOCAL_PACKETS;
               /* 
                * We have to override the next_index in ARP adjacencies,
                * because they're set up for ip4-arp, not this node...
@@ -3028,7 +2401,7 @@ ip4_rewrite_inline (vlib_main_t * vm,
           
           if (PREDICT_FALSE (rw_len0 > sizeof(ethernet_header_t)))
               vlib_increment_combined_counter 
-                  (&lm->adjacency_counters,
+                  (&adjacency_counters,
                    cpu_index, adj_index0, 
                    /* packet increment */ 0,
                    /* byte increment */ rw_len0-sizeof(ethernet_header_t));
@@ -3172,6 +2545,15 @@ ip4_rewrite_local (vlib_main_t * vm,
                             /* rewrite_for_locally_received_packets */ 1);
 }
 
+static uword
+ip4_midchain (vlib_main_t * vm,
+             vlib_node_runtime_t * node,
+             vlib_frame_t * frame)
+{
+  return ip4_rewrite_inline (vm, node, frame,
+                            /* rewrite_for_locally_received_packets */ 0);
+}
+
 VLIB_REGISTER_NODE (ip4_rewrite_node) = {
   .function = ip4_rewrite_transit,
   .name = "ip4-rewrite-transit",
@@ -3187,7 +2569,23 @@ VLIB_REGISTER_NODE (ip4_rewrite_node) = {
   },
 };
 
-VLIB_NODE_FUNCTION_MULTIARCH (ip4_rewrite_node, ip4_rewrite_transit);
+VLIB_NODE_FUNCTION_MULTIARCH (ip4_rewrite_node, ip4_rewrite_transit)
+
+VLIB_REGISTER_NODE (ip4_midchain_node) = {
+  .function = ip4_midchain,
+  .name = "ip4-midchain",
+  .vector_size = sizeof (u32),
+
+  .format_trace = format_ip4_forward_next_trace,
+
+  .n_next_nodes = 2,
+  .next_nodes = {
+    [IP4_REWRITE_NEXT_DROP] = "error-drop",
+    [IP4_REWRITE_NEXT_ARP] = "ip4-arp",
+  },
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (ip4_midchain_node, ip4_midchain)
 
 VLIB_REGISTER_NODE (ip4_rewrite_local_node) = {
   .function = ip4_rewrite_local,
@@ -3201,7 +2599,7 @@ VLIB_REGISTER_NODE (ip4_rewrite_local_node) = {
   .n_next_nodes = 0,
 };
 
-VLIB_NODE_FUNCTION_MULTIARCH (ip4_rewrite_local_node, ip4_rewrite_local);
+VLIB_NODE_FUNCTION_MULTIARCH (ip4_rewrite_local_node, ip4_rewrite_local)
 
 static clib_error_t *
 add_del_interface_table (vlib_main_t * vm,
@@ -3232,13 +2630,18 @@ add_del_interface_table (vlib_main_t * vm,
 
   {
     ip4_main_t * im = &ip4_main;
-    ip4_fib_t * fib = find_ip4_fib_by_table_index_or_id (im, table_id, IP4_ROUTE_FLAG_TABLE_ID);
-
-    if (fib) 
-      {
-        vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
-        im->fib_index_by_sw_if_index[sw_if_index] = fib->index;
-    }
+    u32 fib_index;
+
+    fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4,
+                                                   table_id);
+
+    //
+    // FIXME-LATER
+    //  changing an interface's table has consequences for any connecteds
+    //  and adj-fibs already installed.
+    //
+    vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
+    im->fib_index_by_sw_if_index[sw_if_index] = fib_index;
   }
 
  done:
@@ -3272,8 +2675,7 @@ ip4_lookup_multicast (vlib_main_t * vm,
                      vlib_frame_t * frame)
 {
   ip4_main_t * im = &ip4_main;
-  ip_lookup_main_t * lm = &im->lookup_main;
-  vlib_combined_counter_main_t * cm = &im->lookup_main.adjacency_counters;
+  vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters;
   u32 n_left_from, n_left_to_next, * from, * to_next;
   ip_lookup_next_t next;
   u32 cpu_index = os_get_cpu_number();
@@ -3290,12 +2692,12 @@ ip4_lookup_multicast (vlib_main_t * vm,
       while (n_left_from >= 4 && n_left_to_next >= 2)
        {
          vlib_buffer_t * p0, * p1;
-         u32 pi0, pi1, adj_index0, adj_index1, wrong_next;
+         u32 pi0, pi1, lb_index0, lb_index1, wrong_next;
          ip_lookup_next_t next0, next1;
          ip4_header_t * ip0, * ip1;
-         ip_adjacency_t * adj0, * adj1;
           u32 fib_index0, fib_index1;
-          u32 flow_hash_config0, flow_hash_config1;
+         const dpo_id_t *dpo0, *dpo1;
+         const load_balance_t * lb0, * lb1;
 
          /* Prefetch next iteration. */
          {
@@ -3327,46 +2729,44 @@ ip4_lookup_multicast (vlib_main_t * vm,
           fib_index1 = (vnet_buffer(p1)->sw_if_index[VLIB_TX] == (u32)~0) ?
             fib_index1 : vnet_buffer(p1)->sw_if_index[VLIB_TX];
 
-         adj_index0 = ip4_fib_lookup_buffer (im, fib_index0, 
-                                              &ip0->dst_address, p0);
-         adj_index1 = ip4_fib_lookup_buffer (im, fib_index1, 
-                                              &ip1->dst_address, p1);
-
-         adj0 = ip_get_adjacency (lm, adj_index0);
-         adj1 = ip_get_adjacency (lm, adj_index1);
-
-         next0 = adj0->lookup_next_index;
-         next1 = adj1->lookup_next_index;
+         lb_index0 = ip4_fib_table_lookup_lb (ip4_fib_get(fib_index0),
+                                               &ip0->dst_address);
+         lb_index1 = ip4_fib_table_lookup_lb (ip4_fib_get(fib_index1),
+                                               &ip1->dst_address);
 
-          flow_hash_config0 = 
-              vec_elt_at_index (im->fibs, fib_index0)->flow_hash_config;
+         lb0 = load_balance_get (lb_index0);
+         lb1 = load_balance_get (lb_index1);
 
-          flow_hash_config1 = 
-              vec_elt_at_index (im->fibs, fib_index1)->flow_hash_config;
+         ASSERT (lb0->lb_n_buckets > 0);
+         ASSERT (is_pow2 (lb0->lb_n_buckets));
+         ASSERT (lb1->lb_n_buckets > 0);
+         ASSERT (is_pow2 (lb1->lb_n_buckets));
 
          vnet_buffer (p0)->ip.flow_hash = ip4_compute_flow_hash 
-              (ip0, flow_hash_config0);
+              (ip0, lb0->lb_hash_config);
                                                                   
          vnet_buffer (p1)->ip.flow_hash = ip4_compute_flow_hash 
-              (ip1, flow_hash_config1);
+              (ip1, lb1->lb_hash_config);
 
-         ASSERT (adj0->n_adj > 0);
-         ASSERT (adj1->n_adj > 0);
-         ASSERT (is_pow2 (adj0->n_adj));
-         ASSERT (is_pow2 (adj1->n_adj));
-         adj_index0 += (vnet_buffer (p0)->ip.flow_hash & (adj0->n_adj - 1));
-         adj_index1 += (vnet_buffer (p1)->ip.flow_hash & (adj1->n_adj - 1));
+         dpo0 = load_balance_get_bucket_i(lb0,
+                                           (vnet_buffer (p0)->ip.flow_hash &
+                                            (lb0->lb_n_buckets_minus_1)));
+         dpo1 = load_balance_get_bucket_i(lb1,
+                                           (vnet_buffer (p1)->ip.flow_hash &
+                                            (lb1->lb_n_buckets_minus_1))); /* p1 uses its own LB's mask */
 
-         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
-         vnet_buffer (p1)->ip.adj_index[VLIB_TX] = adj_index1;
+         next0 = dpo0->dpoi_next_node;
+         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+         next1 = dpo1->dpoi_next_node;
+         vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
 
           if (1) /* $$$$$$ HACK FIXME */
          vlib_increment_combined_counter 
-              (cm, cpu_index, adj_index0, 1,
+              (cm, cpu_index, lb_index0, 1,
                vlib_buffer_length_in_chain (vm, p0));
           if (1) /* $$$$$$ HACK FIXME */
          vlib_increment_combined_counter 
-              (cm, cpu_index, adj_index1, 1,
+              (cm, cpu_index, lb_index1, 1,
                vlib_buffer_length_in_chain (vm, p1));
 
          from += 2;
@@ -3415,11 +2815,11 @@ ip4_lookup_multicast (vlib_main_t * vm,
        {
          vlib_buffer_t * p0;
          ip4_header_t * ip0;
-         u32 pi0, adj_index0;
+         u32 pi0, lb_index0;
          ip_lookup_next_t next0;
-         ip_adjacency_t * adj0;
           u32 fib_index0;
-          u32 flow_hash_config0;
+         const dpo_id_t *dpo0;
+         const load_balance_t * lb0;
 
          pi0 = from[0];
          to_next[0] = pi0;
@@ -3433,28 +2833,27 @@ ip4_lookup_multicast (vlib_main_t * vm,
           fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
               fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
           
-         adj_index0 = ip4_fib_lookup_buffer (im, fib_index0, 
-                                              &ip0->dst_address, p0);
-
-         adj0 = ip_get_adjacency (lm, adj_index0);
+         lb_index0 = ip4_fib_table_lookup_lb (ip4_fib_get(fib_index0),
+                                               &ip0->dst_address);
 
-         next0 = adj0->lookup_next_index;
+         lb0 = load_balance_get (lb_index0);
 
-          flow_hash_config0 = 
-              vec_elt_at_index (im->fibs, fib_index0)->flow_hash_config;
+         ASSERT (lb0->lb_n_buckets > 0);
+         ASSERT (is_pow2 (lb0->lb_n_buckets));
 
-         vnet_buffer (p0)->ip.flow_hash = 
-            ip4_compute_flow_hash (ip0, flow_hash_config0);
+         vnet_buffer (p0)->ip.flow_hash = ip4_compute_flow_hash 
+              (ip0, lb0->lb_hash_config);
 
-         ASSERT (adj0->n_adj > 0);
-         ASSERT (is_pow2 (adj0->n_adj));
-         adj_index0 += (vnet_buffer (p0)->ip.flow_hash & (adj0->n_adj - 1));
+         dpo0 = load_balance_get_bucket_i(lb0,
+                                           (vnet_buffer (p0)->ip.flow_hash &
+                                            (lb0->lb_n_buckets_minus_1)));
 
-         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
+         next0 = dpo0->dpoi_next_node;
+         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
 
           if (1) /* $$$$$$ HACK FIXME */
               vlib_increment_combined_counter 
-                  (cm, cpu_index, adj_index0, 1,
+                  (cm, cpu_index, lb_index0, 1,
                    vlib_buffer_length_in_chain (vm, p0));
 
          from += 1;
@@ -3494,7 +2893,7 @@ VLIB_REGISTER_NODE (ip4_lookup_multicast_node,static) = {
   .n_next_nodes = 0,
 };
 
-VLIB_NODE_FUNCTION_MULTIARCH (ip4_lookup_multicast_node, ip4_lookup_multicast);
+VLIB_NODE_FUNCTION_MULTIARCH (ip4_lookup_multicast_node, ip4_lookup_multicast)
 
 VLIB_REGISTER_NODE (ip4_multicast_node,static) = {
   .function = ip4_drop,
@@ -3511,12 +2910,11 @@ VLIB_REGISTER_NODE (ip4_multicast_node,static) = {
 
 int ip4_lookup_validate (ip4_address_t *a, u32 fib_index0)
 {
-  ip4_main_t * im = &ip4_main;
   ip4_fib_mtrie_t * mtrie0;
   ip4_fib_mtrie_leaf_t leaf0;
-  u32 adj_index0;
+  u32 lbi0;
     
-  mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
+  mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
 
   leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 0);
@@ -3527,11 +2925,9 @@ int ip4_lookup_validate (ip4_address_t *a, u32 fib_index0)
   /* Handle default route. */
   leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
   
-  adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+  lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
   
-  return adj_index0 == ip4_fib_lookup_with_table (im, fib_index0,
-                                                  a, 
-                                                  /* no_default_route */ 0);
+  return lbi0 == ip4_fib_table_lookup_lb (ip4_fib_get(fib_index0), a);
 }
  
 static clib_error_t *
@@ -3595,7 +2991,7 @@ int vnet_set_ip4_flow_hash (u32 table_id, u32 flow_hash_config)
   if (p == 0)
     return VNET_API_ERROR_NO_SUCH_FIB;
 
-  fib = vec_elt_at_index (im4->fibs, p[0]);
+  fib = ip4_fib_get (p[0]);
 
   fib->flow_hash_config = flow_hash_config;
   return 0;
@@ -3719,44 +3115,3 @@ VLIB_CLI_COMMAND (set_ip_classify_command, static) = {
     .function = set_ip_classify_command_fn,
 };
 
-
-#define TEST_CODE 1
-#if TEST_CODE > 0
-
-static clib_error_t *
-set_interface_output_feature_command_fn (vlib_main_t * vm,
-                                         unformat_input_t * input,
-                                         vlib_cli_command_t * cmd)
-{
-  vnet_main_t * vnm = vnet_get_main();
-  u32 sw_if_index = ~0;
-  int is_add = 1;
-  ip4_main_t * im = &ip4_main;
-  ip_lookup_main_t * lm = &im->lookup_main;
-
-  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) 
-    {
-      if (unformat (input, "%U", unformat_vnet_sw_interface, vnm, &sw_if_index))
-        ;
-      else if (unformat (input, "del"))
-        is_add = 0;
-      else
-        break;
-    }
-
-  if (sw_if_index == ~0)
-    return clib_error_return (0, "unknown interface `%U'",
-                              format_unformat_error, input);
-
-  lm->tx_sw_if_has_ip_output_features =
-    clib_bitmap_set (lm->tx_sw_if_has_ip_output_features, sw_if_index, is_add);
-
-  return 0;
-}
-
-VLIB_CLI_COMMAND (set_interface_output_feature, static) = {
-  .path = "set interface output feature",
-  .function = set_interface_output_feature_command_fn,
-  .short_help = "set interface output feature <intfc>",
-};
-#endif /* TEST_CODE */
index 006610a..3641824 100644 (file)
@@ -38,6 +38,7 @@
  */
 
 #include <vnet/ip/ip.h>
+#include <vnet/fib/fib_entry.h>
 
 static void
 ply_init (ip4_fib_mtrie_ply_t * p, ip4_fib_mtrie_leaf_t init, uword prefix_len)
@@ -401,21 +402,27 @@ ip4_fib_mtrie_add_del_route (ip4_fib_t * fib,
          unset_leaf (m, &a, root_ply, 0);
 
          /* Find next less specific route and insert into mtrie. */
-         for (i = ARRAY_LEN (fib->adj_index_by_dst_address) - 1; i >= 1; i--)
+         for (i = dst_address_length - 1; i >= 1; i--)
            {
              uword * p;
+              index_t lbi;
              ip4_address_t key;
 
-             if (! fib->adj_index_by_dst_address[i])
+             if (! fib->fib_entry_by_dst_address[i])
                continue;
              
              key.as_u32 = dst_address.as_u32 & im->fib_masks[i];
-             p = hash_get (fib->adj_index_by_dst_address[i], key.as_u32);
+             p = hash_get (fib->fib_entry_by_dst_address[i], key.as_u32);
              if (p)
                {
+                 lbi = fib_entry_contribute_ip_forwarding(p[0])->dpoi_index;
+                 if (INDEX_INVALID == lbi)
+                   continue;
+
                  a.dst_address = key;
+                 a.adj_index = lbi;
                  a.dst_address_length = i;
-                 a.adj_index = p[0];
+
                  set_leaf (m, &a, /* ply_index */ 0, /* dst_address_byte_index */ 0);
                  break;
                }
@@ -424,65 +431,6 @@ ip4_fib_mtrie_add_del_route (ip4_fib_t * fib,
     }
 }
 
-always_inline uword
-maybe_remap_leaf (ip_lookup_main_t * lm, ip4_fib_mtrie_leaf_t * p)
-{
-  ip4_fib_mtrie_leaf_t l = p[0];
-  uword was_remapped_to_empty_leaf = 0;
-  if (ip4_fib_mtrie_leaf_is_terminal (l))
-    {
-      u32 adj_index = ip4_fib_mtrie_leaf_get_adj_index (l);
-      u32 m = vec_elt (lm->adjacency_remap_table, adj_index);
-      if (m)
-       {
-         was_remapped_to_empty_leaf = m == ~0;
-
-          /*
-           * The intent of the original form - which dates to 2013 or
-           * earlier - is not obvious. Here's the original:
-           * 
-           * if (was_remapped_to_empty_leaf)
-           *   p[0] = (was_remapped_to_empty_leaf
-           *           ? IP4_FIB_MTRIE_LEAF_EMPTY
-           *           : ip4_fib_mtrie_leaf_set_adj_index (m - 1));
-           *
-           * Notice the outer "if (was_remapped_to_empty_leaf)"
-           * means that p[0] is always set to IP4_FIB_MTRIE_LEAF_EMPTY,
-           * and is otherwise left intact.
-           * 
-           * It seems unlikely that the adjacency mapping scheme
-           * works in detail. Coverity correctly complains that the 
-           * else-case of the original ternary expression is dead code.
-           */
-         if (was_remapped_to_empty_leaf)
-            p[0] = IP4_FIB_MTRIE_LEAF_EMPTY;
-       }
-    }
-  return was_remapped_to_empty_leaf;
-}
-
-static void maybe_remap_ply (ip_lookup_main_t * lm, ip4_fib_mtrie_ply_t * ply)
-{
-  u32 n_remapped_to_empty = 0;
-  u32 i;
-  for (i = 0; i < ARRAY_LEN (ply->leaves); i++)
-    n_remapped_to_empty += maybe_remap_leaf (lm, &ply->leaves[i]);
-  if (n_remapped_to_empty > 0)
-    {
-      ASSERT (n_remapped_to_empty <= ply->n_non_empty_leafs);
-      ply->n_non_empty_leafs -= n_remapped_to_empty;
-      if (ply->n_non_empty_leafs == 0)
-       os_panic ();
-    }
-}
-
-void ip4_mtrie_maybe_remap_adjacencies (ip_lookup_main_t * lm, ip4_fib_mtrie_t * m)
-{
-  ip4_fib_mtrie_ply_t * ply;
-  pool_foreach (ply, m->ply_pool, maybe_remap_ply (lm, ply));
-  maybe_remap_leaf (lm, &m->default_leaf);
-}
-
 /* Returns number of bytes of memory used by mtrie. */
 static uword mtrie_memory_usage (ip4_fib_mtrie_t * m, ip4_fib_mtrie_ply_t * p)
 {
index 31de41e..c49937d 100644 (file)
@@ -51,7 +51,7 @@
    1 => empty (adjacency index of zero is special miss adjacency). */
 typedef u32 ip4_fib_mtrie_leaf_t;
 
-#define IP4_FIB_MTRIE_LEAF_EMPTY (1 + 2*IP_LOOKUP_MISS_ADJ_INDEX)
+#define IP4_FIB_MTRIE_LEAF_EMPTY (1 + 2*0)
 #define IP4_FIB_MTRIE_LEAF_ROOT  (0 + 2*0)
 
 always_inline u32 ip4_fib_mtrie_leaf_is_empty (ip4_fib_mtrie_leaf_t n)
@@ -115,6 +115,9 @@ typedef struct {
         - 1 * sizeof (i32)];
 } ip4_fib_mtrie_ply_t;
 
+_Static_assert(0  == sizeof(ip4_fib_mtrie_ply_t) % CLIB_CACHE_LINE_BYTES,
+              "IP4 Mtrie ply cache line");
+
 typedef struct {
   /* Pool of plies.  Index zero is root ply. */
   ip4_fib_mtrie_ply_t * ply_pool;
@@ -136,15 +139,13 @@ void ip4_fib_mtrie_add_del_route (struct ip4_fib_t * f,
 /* Returns adjacency index. */
 u32 ip4_mtrie_lookup_address (ip4_fib_mtrie_t * m, ip4_address_t dst);
 
-void ip4_mtrie_maybe_remap_adjacencies (ip_lookup_main_t * lm, ip4_fib_mtrie_t * m);
-
 format_function_t format_ip4_fib_mtrie;
 
 /* Lookup step.  Processes 1 byte of 4 byte ip4 address. */
 always_inline ip4_fib_mtrie_leaf_t
 ip4_fib_mtrie_lookup_step (ip4_fib_mtrie_t * m,
                           ip4_fib_mtrie_leaf_t current_leaf,
-                          ip4_address_t * dst_address,
+                          const ip4_address_t * dst_address,
                           u32 dst_address_byte_index)
 {
   ip4_fib_mtrie_leaf_t next_leaf;
index ebfa767..8a469ba 100644 (file)
  */
 #include <vnet/ip/ip.h>
 #include <vnet/ip/ip_source_and_port_range_check.h>
+#include <vnet/dpo/load_balance.h>
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/ip4_fib.h>
 
+/**
+ * @brief The pool of range check DPOs
+ */
+static protocol_port_range_dpo_t *ppr_dpo_pool;
+
+/**
+ * @brief Dynamically registered DPO type
+ */
+static dpo_type_t ppr_dpo_type;
 
 vlib_node_registration_t ip4_source_port_and_range_check_rx;
 vlib_node_registration_t ip4_source_port_and_range_check_tx;
@@ -73,23 +85,20 @@ typedef enum
 
 
 static inline u32
-check_adj_port_range_x1 (ip_adjacency_t * adj, u16 dst_port, u32 next)
+check_adj_port_range_x1 (const protocol_port_range_dpo_t * ppr_dpo,
+                        u16 dst_port, u32 next)
 {
-  protocol_port_range_t *range;
+  const protocol_port_range_t *range;
   u16x8vec_t key;
   u16x8vec_t diff1;
   u16x8vec_t diff2;
   u16x8vec_t sum, sum_equal_diff2;
   u16 sum_nonzero, sum_equal, winner_mask;
   int i;
-  u8 *rwh;
 
-  if (adj->lookup_next_index != IP_LOOKUP_NEXT_ICMP_ERROR || dst_port == 0)
+  if (NULL == ppr_dpo || dst_port == 0)
     return IP4_SOURCE_AND_PORT_RANGE_CHECK_NEXT_DROP;
 
-  rwh = (u8 *) (&adj->rewrite_header);
-  range = (protocol_port_range_t *) rwh;
-
   /* Make the obvious screw-case work. A variant also works w/ no MMX */
   if (PREDICT_FALSE (dst_port == 65535))
     {
@@ -100,20 +109,20 @@ check_adj_port_range_x1 (ip_adjacency_t * adj, u16 dst_port, u32 next)
           i++)
        {
          for (j = 0; j < 8; j++)
-           if (range->low.as_u16[j] == 65535)
+           if (ppr_dpo->blocks[i].low.as_u16[j] == 65535)
              return next;
-         range++;
        }
       return IP4_SOURCE_AND_PORT_RANGE_CHECK_NEXT_DROP;
     }
 
   key.as_u16x8 = u16x8_splat (dst_port);
 
-  for (i = 0; i < VLIB_BUFFER_PRE_DATA_SIZE / sizeof (protocol_port_range_t);
-       i++)
+  for (i = 0; i < ppr_dpo->n_used_blocks; i++)
     {
-      diff1.as_u16x8 = u16x8_sub_saturate (range->low.as_u16x8, key.as_u16x8);
-      diff2.as_u16x8 = u16x8_sub_saturate (range->hi.as_u16x8, key.as_u16x8);
+      diff1.as_u16x8 =
+       u16x8_sub_saturate (ppr_dpo->blocks[i].low.as_u16x8, key.as_u16x8);
+      diff2.as_u16x8 =
+       u16x8_sub_saturate (ppr_dpo->blocks[i].hi.as_u16x8, key.as_u16x8);
       sum.as_u16x8 = u16x8_add (diff1.as_u16x8, diff2.as_u16x8);
       sum_equal_diff2.as_u16x8 =
        u16x8_is_equal (sum.as_u16x8, diff2.as_u16x8);
@@ -127,6 +136,12 @@ check_adj_port_range_x1 (ip_adjacency_t * adj, u16 dst_port, u32 next)
   return IP4_SOURCE_AND_PORT_RANGE_CHECK_NEXT_DROP;
 }
 
+always_inline protocol_port_range_dpo_t *
+protocol_port_range_dpo_get (index_t index)
+{
+  return (pool_elt_at_index (ppr_dpo_pool, index));
+}
+
 always_inline uword
 ip4_source_and_port_range_check_inline (vlib_main_t * vm,
                                        vlib_node_runtime_t * node,
@@ -154,264 +169,263 @@ ip4_source_and_port_range_check_inline (vlib_main_t * vm,
       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
 
 
-      while (n_left_from >= 4 && n_left_to_next >= 2)
-       {
-         vlib_buffer_t *b0, *b1;
-         ip4_header_t *ip0, *ip1;
-         ip4_fib_mtrie_t *mtrie0, *mtrie1;
-         ip4_fib_mtrie_leaf_t leaf0, leaf1;
-         ip_source_and_port_range_check_config_t *c0, *c1;
-         ip_adjacency_t *adj0 = 0, *adj1 = 0;
-         u32 bi0, next0, adj_index0, pass0, save_next0, fib_index0;
-         u32 bi1, next1, adj_index1, pass1, save_next1, fib_index1;
-         udp_header_t *udp0, *udp1;
-
-         /* Prefetch next iteration. */
-         {
-           vlib_buffer_t *p2, *p3;
-
-           p2 = vlib_get_buffer (vm, from[2]);
-           p3 = vlib_get_buffer (vm, from[3]);
-
-           vlib_prefetch_buffer_header (p2, LOAD);
-           vlib_prefetch_buffer_header (p3, LOAD);
-
-           CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
-           CLIB_PREFETCH (p3->data, sizeof (ip1[0]), LOAD);
-         }
-
-         bi0 = to_next[0] = from[0];
-         bi1 = to_next[1] = from[1];
-         from += 2;
-         to_next += 2;
-         n_left_from -= 2;
-         n_left_to_next -= 2;
-
-         b0 = vlib_get_buffer (vm, bi0);
-         b1 = vlib_get_buffer (vm, bi1);
-
-         fib_index0 =
-           vec_elt (im->fib_index_by_sw_if_index,
-                    vnet_buffer (b0)->sw_if_index[VLIB_RX]);
-         fib_index1 =
-           vec_elt (im->fib_index_by_sw_if_index,
-                    vnet_buffer (b1)->sw_if_index[VLIB_RX]);
-
-         ip0 = vlib_buffer_get_current (b0);
-         ip1 = vlib_buffer_get_current (b1);
-
-         if (is_tx)
-           {
-             c0 = vnet_get_config_data (&tx_cm->config_main,
-                                        &b0->current_config_index,
-                                        &next0, sizeof (c0[0]));
-             c1 = vnet_get_config_data (&tx_cm->config_main,
-                                        &b1->current_config_index,
-                                        &next1, sizeof (c1[0]));
-           }
-         else
-           {
-             c0 = vnet_get_config_data (&rx_cm->config_main,
-                                        &b0->current_config_index,
-                                        &next0, sizeof (c0[0]));
-             c1 = vnet_get_config_data (&rx_cm->config_main,
-                                        &b1->current_config_index,
-                                        &next1, sizeof (c1[0]));
-           }
-
-         /* we can't use the default VRF here... */
-         for (i = 0; i < IP_SOURCE_AND_PORT_RANGE_CHECK_N_PROTOCOLS; i++)
-           {
-             ASSERT (c0->fib_index[i] && c1->fib_index[i]);
-           }
-
-
-         if (is_tx)
-           {
-             if (ip0->protocol == IP_PROTOCOL_UDP)
-               fib_index0 =
-                 c0->fib_index
-                 [IP_SOURCE_AND_PORT_RANGE_CHECK_PROTOCOL_UDP_IN];
-             if (ip0->protocol == IP_PROTOCOL_TCP)
-               fib_index0 =
-                 c0->fib_index
-                 [IP_SOURCE_AND_PORT_RANGE_CHECK_PROTOCOL_TCP_IN];
-           }
-         else
-           {
-             if (ip0->protocol == IP_PROTOCOL_UDP)
-               fib_index0 =
-                 c0->fib_index
-                 [IP_SOURCE_AND_PORT_RANGE_CHECK_PROTOCOL_UDP_OUT];
-             if (ip0->protocol == IP_PROTOCOL_TCP)
-               fib_index0 =
-                 c0->fib_index
-                 [IP_SOURCE_AND_PORT_RANGE_CHECK_PROTOCOL_TCP_OUT];
-           }
-
-         if (PREDICT_TRUE (fib_index0 != ~0))
-           {
-
-             mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
-
-             leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
-
-             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0,
-                                                &ip0->src_address, 0);
-
-             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0,
-                                                &ip0->src_address, 1);
-
-             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0,
-                                                &ip0->src_address, 2);
-
-             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0,
-                                                &ip0->src_address, 3);
-
-             adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
-
-             ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0,
-                                                              &ip0->src_address,
-                                                              0
-                                                              /* use dflt rt */
-                     ));
-             adj0 = ip_get_adjacency (lm, adj_index0);
-           }
-
-         if (is_tx)
-           {
-             if (ip1->protocol == IP_PROTOCOL_UDP)
-               fib_index1 =
-                 c1->fib_index
-                 [IP_SOURCE_AND_PORT_RANGE_CHECK_PROTOCOL_UDP_IN];
-             if (ip1->protocol == IP_PROTOCOL_TCP)
-               fib_index1 =
-                 c1->fib_index
-                 [IP_SOURCE_AND_PORT_RANGE_CHECK_PROTOCOL_TCP_IN];
-           }
-         else
-           {
-             if (ip1->protocol == IP_PROTOCOL_UDP)
-               fib_index1 =
-                 c1->fib_index
-                 [IP_SOURCE_AND_PORT_RANGE_CHECK_PROTOCOL_UDP_OUT];
-             if (ip1->protocol == IP_PROTOCOL_TCP)
-               fib_index1 =
-                 c1->fib_index
-                 [IP_SOURCE_AND_PORT_RANGE_CHECK_PROTOCOL_TCP_OUT];
-           }
-
-         if (PREDICT_TRUE (fib_index1 != ~0))
-           {
-
-             mtrie1 = &vec_elt_at_index (im->fibs, fib_index1)->mtrie;
-
-             leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
-
-             leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1,
-                                                &ip1->src_address, 0);
-
-             leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1,
-                                                &ip1->src_address, 1);
-
-             leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1,
-                                                &ip1->src_address, 2);
-
-             leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1,
-                                                &ip1->src_address, 3);
-
-             adj_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
-
-             ASSERT (adj_index1 == ip4_fib_lookup_with_table (im, fib_index1,
-                                                              &ip1->src_address,
-                                                              0));
-             adj1 = ip_get_adjacency (lm, adj_index1);
-           }
-
-         pass0 = 0;
-         pass0 |= adj0 == 0;
-         pass0 |= ip4_address_is_multicast (&ip0->src_address);
-         pass0 |=
-           ip0->src_address.as_u32 == clib_host_to_net_u32 (0xFFFFFFFF);
-         pass0 |= (ip0->protocol != IP_PROTOCOL_UDP)
-           && (ip0->protocol != IP_PROTOCOL_TCP);
-
-         pass1 = 0;
-         pass1 |= adj1 == 0;
-         pass1 |= ip4_address_is_multicast (&ip1->src_address);
-         pass1 |=
-           ip1->src_address.as_u32 == clib_host_to_net_u32 (0xFFFFFFFF);
-         pass1 |= (ip1->protocol != IP_PROTOCOL_UDP)
-           && (ip1->protocol != IP_PROTOCOL_TCP);
-
-         save_next0 = next0;
-         udp0 = ip4_next_header (ip0);
-         save_next1 = next1;
-         udp1 = ip4_next_header (ip1);
-
-         if (PREDICT_TRUE (pass0 == 0))
-           {
-             good_packets++;
-             next0 = check_adj_port_range_x1
-               (adj0, clib_net_to_host_u16 (udp0->dst_port), next0);
-             good_packets -= (save_next0 != next0);
-             b0->error = error_node->errors
-               [IP4_SOURCE_AND_PORT_RANGE_CHECK_ERROR_CHECK_FAIL];
-           }
-
-         if (PREDICT_TRUE (pass1 == 0))
-           {
-             good_packets++;
-             next1 = check_adj_port_range_x1
-               (adj1, clib_net_to_host_u16 (udp1->dst_port), next1);
-             good_packets -= (save_next1 != next1);
-             b1->error = error_node->errors
-               [IP4_SOURCE_AND_PORT_RANGE_CHECK_ERROR_CHECK_FAIL];
-           }
-
-         if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
-                            && (b0->flags & VLIB_BUFFER_IS_TRACED)))
-           {
-             ip4_source_and_port_range_check_trace_t *t =
-               vlib_add_trace (vm, node, b0, sizeof (*t));
-             t->pass = next0 == save_next0;
-             t->bypass = pass0;
-             t->fib_index = fib_index0;
-             t->src_addr.as_u32 = ip0->src_address.as_u32;
-             t->port = (pass0 == 0) ?
-               clib_net_to_host_u16 (udp0->dst_port) : 0;
-             t->is_tcp = ip0->protocol == IP_PROTOCOL_TCP;
-           }
-
-         if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
-                            && (b1->flags & VLIB_BUFFER_IS_TRACED)))
-           {
-             ip4_source_and_port_range_check_trace_t *t =
-               vlib_add_trace (vm, node, b1, sizeof (*t));
-             t->pass = next1 == save_next1;
-             t->bypass = pass1;
-             t->fib_index = fib_index1;
-             t->src_addr.as_u32 = ip1->src_address.as_u32;
-             t->port = (pass1 == 0) ?
-               clib_net_to_host_u16 (udp1->dst_port) : 0;
-             t->is_tcp = ip1->protocol == IP_PROTOCOL_TCP;
-           }
-
-         vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
-                                          to_next, n_left_to_next,
-                                          bi0, bi1, next0, next1);
-       }
+      /*     while (n_left_from >= 4 && n_left_to_next >= 2) */
+      /*       { */
+      /*         vlib_buffer_t *b0, *b1; */
+      /*         ip4_header_t *ip0, *ip1; */
+      /*         ip4_fib_mtrie_t *mtrie0, *mtrie1; */
+      /*         ip4_fib_mtrie_leaf_t leaf0, leaf1; */
+      /*         ip_source_and_port_range_check_config_t *c0, *c1; */
+      /*         ip_adjacency_t *adj0 = 0, *adj1 = 0; */
+      /*         u32 bi0, next0, adj_index0, pass0, save_next0, fib_index0; */
+      /*         u32 bi1, next1, adj_index1, pass1, save_next1, fib_index1; */
+      /*         udp_header_t *udp0, *udp1; */
+
+      /*         /\* Prefetch next iteration. *\/ */
+      /*         { */
+      /*           vlib_buffer_t *p2, *p3; */
+
+      /*           p2 = vlib_get_buffer (vm, from[2]); */
+      /*           p3 = vlib_get_buffer (vm, from[3]); */
+
+      /*           vlib_prefetch_buffer_header (p2, LOAD); */
+      /*           vlib_prefetch_buffer_header (p3, LOAD); */
+
+      /*           CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD); */
+      /*           CLIB_PREFETCH (p3->data, sizeof (ip1[0]), LOAD); */
+      /*         } */
+
+      /*         bi0 = to_next[0] = from[0]; */
+      /*         bi1 = to_next[1] = from[1]; */
+      /*         from += 2; */
+      /*         to_next += 2; */
+      /*         n_left_from -= 2; */
+      /*         n_left_to_next -= 2; */
+
+      /*         b0 = vlib_get_buffer (vm, bi0); */
+      /*         b1 = vlib_get_buffer (vm, bi1); */
+
+      /*         fib_index0 = */
+      /*           vec_elt (im->fib_index_by_sw_if_index, */
+      /*                 vnet_buffer (b0)->sw_if_index[VLIB_RX]); */
+      /*         fib_index1 = */
+      /*           vec_elt (im->fib_index_by_sw_if_index, */
+      /*                 vnet_buffer (b1)->sw_if_index[VLIB_RX]); */
+
+      /*         ip0 = vlib_buffer_get_current (b0); */
+      /*         ip1 = vlib_buffer_get_current (b1); */
+
+      /*         if (is_tx) */
+      /*           { */
+      /*             c0 = vnet_get_config_data (&tx_cm->config_main, */
+      /*                                     &b0->current_config_index, */
+      /*                                     &next0, sizeof (c0[0])); */
+      /*             c1 = vnet_get_config_data (&tx_cm->config_main, */
+      /*                                     &b1->current_config_index, */
+      /*                                     &next1, sizeof (c1[0])); */
+      /*           } */
+      /*         else */
+      /*           { */
+      /*             c0 = vnet_get_config_data (&rx_cm->config_main, */
+      /*                                     &b0->current_config_index, */
+      /*                                     &next0, sizeof (c0[0])); */
+      /*             c1 = vnet_get_config_data (&rx_cm->config_main, */
+      /*                                     &b1->current_config_index, */
+      /*                                     &next1, sizeof (c1[0])); */
+      /*           } */
+
+      /*         /\* we can't use the default VRF here... *\/ */
+      /*         for (i = 0; i < IP_SOURCE_AND_PORT_RANGE_CHECK_N_PROTOCOLS; i++) */
+      /*           { */
+      /*             ASSERT (c0->fib_index[i] && c1->fib_index[i]); */
+      /*           } */
+
+
+      /*         if (is_tx) */
+      /*           { */
+      /*             if (ip0->protocol == IP_PROTOCOL_UDP) */
+      /*            fib_index0 = */
+      /*              c0->fib_index */
+      /*              [IP_SOURCE_AND_PORT_RANGE_CHECK_PROTOCOL_UDP_IN]; */
+      /*             if (ip0->protocol == IP_PROTOCOL_TCP) */
+      /*            fib_index0 = */
+      /*              c0->fib_index */
+      /*              [IP_SOURCE_AND_PORT_RANGE_CHECK_PROTOCOL_TCP_IN]; */
+      /*           } */
+      /*         else */
+      /*           { */
+      /*             if (ip0->protocol == IP_PROTOCOL_UDP) */
+      /*            fib_index0 = */
+      /*              c0->fib_index */
+      /*              [IP_SOURCE_AND_PORT_RANGE_CHECK_PROTOCOL_UDP_OUT]; */
+      /*             if (ip0->protocol == IP_PROTOCOL_TCP) */
+      /*            fib_index0 = */
+      /*              c0->fib_index */
+      /*              [IP_SOURCE_AND_PORT_RANGE_CHECK_PROTOCOL_TCP_OUT]; */
+      /*           } */
+
+      /*         if (PREDICT_TRUE (fib_index0 != ~0)) */
+      /*           { */
+
+      /*             mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie; */
+
+      /*             leaf0 = IP4_FIB_MTRIE_LEAF_ROOT; */
+
+      /*             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, */
+      /*                                             &ip0->src_address, 0); */
+
+      /*             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, */
+      /*                                             &ip0->src_address, 1); */
+
+      /*             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, */
+      /*                                             &ip0->src_address, 2); */
+
+      /*             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, */
+      /*                                             &ip0->src_address, 3); */
+
+      /*             adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0); */
+
+      /*             ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0, */
+      /*                                                           &ip0->src_address, */
+      /*                                                           0 */
+      /*                                                           /\* use dflt rt *\/ */
+      /*                  )); */
+      /*             adj0 = ip_get_adjacency (lm, adj_index0); */
+      /*           } */
+
+      /*         if (is_tx) */
+      /*           { */
+      /*             if (ip1->protocol == IP_PROTOCOL_UDP) */
+      /*            fib_index1 = */
+      /*              c1->fib_index */
+      /*              [IP_SOURCE_AND_PORT_RANGE_CHECK_PROTOCOL_UDP_IN]; */
+      /*             if (ip1->protocol == IP_PROTOCOL_TCP) */
+      /*            fib_index1 = */
+      /*              c1->fib_index */
+      /*              [IP_SOURCE_AND_PORT_RANGE_CHECK_PROTOCOL_TCP_IN]; */
+      /*           } */
+      /*         else */
+      /*           { */
+      /*             if (ip1->protocol == IP_PROTOCOL_UDP) */
+      /*            fib_index1 = */
+      /*              c1->fib_index */
+      /*              [IP_SOURCE_AND_PORT_RANGE_CHECK_PROTOCOL_UDP_OUT]; */
+      /*             if (ip1->protocol == IP_PROTOCOL_TCP) */
+      /*            fib_index1 = */
+      /*              c1->fib_index */
+      /*              [IP_SOURCE_AND_PORT_RANGE_CHECK_PROTOCOL_TCP_OUT]; */
+      /*           } */
+
+      /*         if (PREDICT_TRUE (fib_index1 != ~0)) */
+      /*           { */
+
+      /*             mtrie1 = &vec_elt_at_index (im->fibs, fib_index1)->mtrie; */
+
+      /*             leaf1 = IP4_FIB_MTRIE_LEAF_ROOT; */
+
+      /*             leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, */
+      /*                                             &ip1->src_address, 0); */
+
+      /*             leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, */
+      /*                                             &ip1->src_address, 1); */
+
+      /*             leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, */
+      /*                                             &ip1->src_address, 2); */
+
+      /*             leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, */
+      /*                                             &ip1->src_address, 3); */
+
+      /*             adj_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1); */
+
+      /*             ASSERT (adj_index1 == ip4_fib_lookup_with_table (im, fib_index1, */
+      /*                                                           &ip1->src_address, */
+      /*                                                           0)); */
+      /*             adj1 = ip_get_adjacency (lm, adj_index1); */
+      /*           } */
+
+      /*         pass0 = 0; */
+      /*         pass0 |= adj0 == 0; */
+      /*         pass0 |= ip4_address_is_multicast (&ip0->src_address); */
+      /*         pass0 |= */
+      /*           ip0->src_address.as_u32 == clib_host_to_net_u32 (0xFFFFFFFF); */
+      /*         pass0 |= (ip0->protocol != IP_PROTOCOL_UDP) */
+      /*           && (ip0->protocol != IP_PROTOCOL_TCP); */
+
+      /*         pass1 = 0; */
+      /*         pass1 |= adj1 == 0; */
+      /*         pass1 |= ip4_address_is_multicast (&ip1->src_address); */
+      /*         pass1 |= */
+      /*           ip1->src_address.as_u32 == clib_host_to_net_u32 (0xFFFFFFFF); */
+      /*         pass1 |= (ip1->protocol != IP_PROTOCOL_UDP) */
+      /*           && (ip1->protocol != IP_PROTOCOL_TCP); */
+
+      /*         save_next0 = next0; */
+      /*         udp0 = ip4_next_header (ip0); */
+      /*         save_next1 = next1; */
+      /*         udp1 = ip4_next_header (ip1); */
+
+      /*         if (PREDICT_TRUE (pass0 == 0)) */
+      /*           { */
+      /*             good_packets++; */
+      /*             next0 = check_adj_port_range_x1 */
+      /*            (adj0, clib_net_to_host_u16 (udp0->dst_port), next0); */
+      /*             good_packets -= (save_next0 != next0); */
+      /*             b0->error = error_node->errors */
+      /*            [IP4_SOURCE_AND_PORT_RANGE_CHECK_ERROR_CHECK_FAIL]; */
+      /*           } */
+
+      /*         if (PREDICT_TRUE (pass1 == 0)) */
+      /*           { */
+      /*             good_packets++; */
+      /*             next1 = check_adj_port_range_x1 */
+      /*            (adj1, clib_net_to_host_u16 (udp1->dst_port), next1); */
+      /*             good_packets -= (save_next1 != next1); */
+      /*             b1->error = error_node->errors */
+      /*            [IP4_SOURCE_AND_PORT_RANGE_CHECK_ERROR_CHECK_FAIL]; */
+      /*           } */
+
+      /*         if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) */
+      /*                         && (b0->flags & VLIB_BUFFER_IS_TRACED))) */
+      /*           { */
+      /*             ip4_source_and_port_range_check_trace_t *t = */
+      /*            vlib_add_trace (vm, node, b0, sizeof (*t)); */
+      /*             t->pass = next0 == save_next0; */
+      /*             t->bypass = pass0; */
+      /*             t->fib_index = fib_index0; */
+      /*             t->src_addr.as_u32 = ip0->src_address.as_u32; */
+      /*             t->port = (pass0 == 0) ? */
+      /*            clib_net_to_host_u16 (udp0->dst_port) : 0; */
+      /*             t->is_tcp = ip0->protocol == IP_PROTOCOL_TCP; */
+      /*           } */
+
+      /*         if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) */
+      /*                         && (b1->flags & VLIB_BUFFER_IS_TRACED))) */
+      /*           { */
+      /*             ip4_source_and_port_range_check_trace_t *t = */
+      /*            vlib_add_trace (vm, node, b1, sizeof (*t)); */
+      /*             t->pass = next1 == save_next1; */
+      /*             t->bypass = pass1; */
+      /*             t->fib_index = fib_index1; */
+      /*             t->src_addr.as_u32 = ip1->src_address.as_u32; */
+      /*             t->port = (pass1 == 0) ? */
+      /*            clib_net_to_host_u16 (udp1->dst_port) : 0; */
+      /*             t->is_tcp = ip1->protocol == IP_PROTOCOL_TCP; */
+      /*           } */
+
+      /*         vlib_validate_buffer_enqueue_x2 (vm, node, next_index, */
+      /*                                       to_next, n_left_to_next, */
+      /*                                       bi0, bi1, next0, next1); */
+      /*       } */
 
       while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          ip4_header_t *ip0;
-         ip4_fib_mtrie_t *mtrie0;
-         ip4_fib_mtrie_leaf_t leaf0;
          ip_source_and_port_range_check_config_t *c0;
-         ip_adjacency_t *adj0 = 0;
-         u32 bi0, next0, adj_index0, pass0, save_next0, fib_index0;
+         u32 bi0, next0, lb_index0, pass0, save_next0, fib_index0;
          udp_header_t *udp0;
+         const protocol_port_range_dpo_t *ppr_dpo0 = NULL;
+         const dpo_id_t *dpo;
 
          bi0 = from[0];
          to_next[0] = bi0;
@@ -476,35 +490,25 @@ ip4_source_and_port_range_check_inline (vlib_main_t * vm,
 
          if (fib_index0 != ~0)
            {
+             lb_index0 = ip4_fib_forwarding_lookup (fib_index0,
+                                                    &ip0->src_address);
 
-             mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
-
-             leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
-
-             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0,
-                                                &ip0->src_address, 0);
-
-             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0,
-                                                &ip0->src_address, 1);
-
-             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0,
-                                                &ip0->src_address, 2);
-
-             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0,
-                                                &ip0->src_address, 3);
+             dpo =
+               load_balance_get_bucket_i (load_balance_get (lb_index0), 0);
 
-             adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
-
-             ASSERT (adj_index0 == ip4_fib_lookup_with_table
-                     (im, fib_index0,
-                      &ip0->src_address, 0 /* use default route */ ));
-             adj0 = ip_get_adjacency (lm, adj_index0);
+             if (ppr_dpo_type == dpo->dpoi_type)
+               {
+                 ppr_dpo0 = protocol_port_range_dpo_get (dpo->dpoi_index);
+               }
+             /*
+              * else the lookup hit an entry that was not inserted
+              * by this range checker, which is the default route
+              */
            }
          /*
           * $$$ which (src,dst) categories should we always pass?
           */
          pass0 = 0;
-         pass0 |= adj0 == 0;
          pass0 |= ip4_address_is_multicast (&ip0->src_address);
          pass0 |=
            ip0->src_address.as_u32 == clib_host_to_net_u32 (0xFFFFFFFF);
@@ -518,7 +522,7 @@ ip4_source_and_port_range_check_inline (vlib_main_t * vm,
            {
              good_packets++;
              next0 = check_adj_port_range_x1
-               (adj0, clib_net_to_host_u16 (udp0->dst_port), next0);
+               (ppr_dpo0, clib_net_to_host_u16 (udp0->dst_port), next0);
              good_packets -= (save_next0 != next0);
              b0->error = error_node->errors
                [IP4_SOURCE_AND_PORT_RANGE_CHECK_ERROR_CHECK_FAIL];
@@ -558,6 +562,7 @@ ip4_source_and_port_range_check_inline (vlib_main_t * vm,
                                 IP4_SOURCE_AND_PORT_RANGE_CHECK_ERROR_CHECK_OK,
                                 good_packets);
   return frame->n_vectors;
+  return 0;
 }
 
 static uword
@@ -786,209 +791,299 @@ VLIB_CLI_COMMAND (set_interface_ip_source_and_port_range_check_command,
 /* *INDENT-ON* */
 
+/**
+ * Format a protocol-port-range DPO.
+ *
+ * Variadic arguments (read through the va_list pointer):
+ *   - index_t : index of the DPO, resolved via protocol_port_range_dpo_get()
+ *   - u32     : indent (unused)
+ *
+ * Appends "allow " followed by a comma separated list of the populated
+ * slots in the first n_used_blocks blocks. A slot is populated when its
+ * low port is non-zero. When hi > low + 1 the slot is printed as
+ * "low-(hi - 1)", otherwise only the low port is printed.
+ *
+ * @return the (possibly reallocated) format vector.
+ */
 static u8 *
-format_source_and_port_rc_adjacency (u8 * s, va_list * args)
+format_ppr_dpo (u8 * s, va_list * args)
 {
-  CLIB_UNUSED (vnet_main_t * vnm) = va_arg (*args, vnet_main_t *);
-  ip_lookup_main_t *lm = va_arg (*args, ip_lookup_main_t *);
-  u32 adj_index = va_arg (*args, u32);
-  ip_adjacency_t *adj = ip_get_adjacency (lm, adj_index);
-  source_range_check_main_t *srm = &source_range_check_main;
-  u8 *rwh = (u8 *) (&adj->rewrite_header);
-  protocol_port_range_t *range;
+  /* args is a pointer to the va_list: it must be dereferenced for va_arg */
+  index_t index = va_arg (*args, index_t);
+  CLIB_UNUSED (u32 indent) = va_arg (*args, u32);
+
+  protocol_port_range_dpo_t *ppr_dpo;
   int i, j;
   int printed = 0;
 
-  range = (protocol_port_range_t *) rwh;
+  ppr_dpo = protocol_port_range_dpo_get (index);
 
   s = format (s, "allow ");
 
-  for (i = 0; i < srm->ranges_per_adjacency; i++)
+  for (i = 0; i < ppr_dpo->n_used_blocks; i++)
     {
       for (j = 0; j < 8; j++)
        {
-         if (range->low.as_u16[j])
+         if (ppr_dpo->blocks[i].low.as_u16[j])
            {
              if (printed)
                s = format (s, ", ");
-             if (range->hi.as_u16[j] > (range->low.as_u16[j] + 1))
-               s = format (s, "%d-%d", (u32) range->low.as_u16[j],
-                           (u32) range->hi.as_u16[j] - 1);
+             if (ppr_dpo->blocks[i].hi.as_u16[j] >
+                 (ppr_dpo->blocks[i].low.as_u16[j] + 1))
+               s =
+                 format (s, "%d-%d", (u32) ppr_dpo->blocks[i].low.as_u16[j],
+                         (u32) ppr_dpo->blocks[i].hi.as_u16[j] - 1);
              else
-               s = format (s, "%d", range->low.as_u16[j]);
+               s = format (s, "%d", ppr_dpo->blocks[i].low.as_u16[j]);
              printed = 1;
            }
        }
-      range++;
     }
   return s;
 }
 
+static void
+ppr_dpo_lock (dpo_id_t * dpo)
+{
+}
+
+static void
+ppr_dpo_unlock (dpo_id_t * dpo)
+{
+}
+
+const static dpo_vft_t ppr_vft = {
+  .dv_lock = ppr_dpo_lock,
+  .dv_unlock = ppr_dpo_unlock,
+  .dv_format = format_ppr_dpo,
+};
+
+const static char *const ppr_ip4_nodes[] = {
+  "ip4-source-and-port-range-check-rx",
+  NULL,
+};
+
+const static char *const *const ppr_nodes[DPO_PROTO_NUM] = {
+  [DPO_PROTO_IP4] = ppr_ip4_nodes,
+};
+
 clib_error_t *
 ip4_source_and_port_range_check_init (vlib_main_t * vm)
 {
   source_range_check_main_t *srm = &source_range_check_main;
-  ip4_main_t *im = &ip4_main;
-  ip_lookup_main_t *lm = &im->lookup_main;
 
   srm->vlib_main = vm;
   srm->vnet_main = vnet_get_main ();
 
-  srm->ranges_per_adjacency =
-    VLIB_BUFFER_PRE_DATA_SIZE / (2 * sizeof (u16x8));
-  srm->special_adjacency_format_function_index =
-    vnet_register_special_adjacency_format_function (lm,
-                                                    format_source_and_port_rc_adjacency);
-  ASSERT (srm->special_adjacency_format_function_index);
+  ppr_dpo_type = dpo_register_new_type (&ppr_vft, ppr_nodes);
 
   return 0;
 }
 
 VLIB_INIT_FUNCTION (ip4_source_and_port_range_check_init);
 
-int
-add_port_range_adjacency (ip4_address_t * address,
-                         u32 length,
-                         u32 adj_index,
-                         u16 * low_ports, u16 * high_ports, u32 fib_index)
+protocol_port_range_dpo_t *
+protocol_port_range_dpo_alloc (void)
 {
-  ip_adjacency_t *adj;
-  int i, j, k;
-  source_range_check_main_t *srm = &source_range_check_main;
-  ip4_main_t *im = &ip4_main;
-  ip_lookup_main_t *lm = &im->lookup_main;
-  protocol_port_range_t *range;
-  u8 *rwh;
+  protocol_port_range_dpo_t *ppr_dpo;
 
-  adj = ip_get_adjacency (lm, adj_index);
-  /* $$$$ fixme: add ports if address + mask match */
-  if (adj->lookup_next_index == IP_LOOKUP_NEXT_ICMP_ERROR)
-    return VNET_API_ERROR_INCORRECT_ADJACENCY_TYPE;
+  pool_get_aligned (ppr_dpo_pool, ppr_dpo, CLIB_CACHE_LINE_BYTES);
+  memset (ppr_dpo, 0, sizeof (*ppr_dpo));
 
-  ip_adjacency_t template_adj;
-  ip4_add_del_route_args_t a;
+  ppr_dpo->n_free_ranges = N_PORT_RANGES_PER_DPO;
 
-  memset (&template_adj, 0, sizeof (template_adj));
+  return (ppr_dpo);
+}
 
-  template_adj.lookup_next_index = IP_LOOKUP_NEXT_ICMP_ERROR;
-  template_adj.if_address_index = ~0;
-  template_adj.special_adjacency_format_function_index =
-    srm->special_adjacency_format_function_index;
 
-  rwh = (u8 *) (&template_adj.rewrite_header);
+/**
+ * Add port ranges to the protocol-port-range DPO attached to a prefix.
+ *
+ * Looks up the exact prefix address/length in the FIB table fib_index.
+ * If the entry exists and FIB_SOURCE_SPECIAL has a DPO on it, the ranges
+ * are added to that DPO; otherwise a new DPO is allocated. Each
+ * (low_ports[i], high_ports[i]) pair is written to the first empty slot
+ * (low port == 0) found while scanning the blocks in order. The entry is
+ * then added to, or updated in, the FIB with FIB_SOURCE_SPECIAL.
+ *
+ * @param fib_index   FIB table index
+ * @param address     IPv4 prefix address
+ * @param length      prefix length
+ * @param low_ports   vector of low ports
+ * @param high_ports  vector of high ports, indexed in parallel with low_ports
+ *
+ * @return 0 on success, VNET_API_ERROR_EXCEEDED_NUMBER_OF_RANGES_CAPACITY
+ *         when more ranges are requested than the DPO has free slots.
+ */
+static int
+add_port_range_adjacency (u32 fib_index,
+                         ip4_address_t * address,
+                         u32 length, u16 * low_ports, u16 * high_ports)
+{
+  protocol_port_range_dpo_t *ppr_dpo;
+  dpo_id_t dpop = DPO_NULL;
+  int i, j, k;
+  /* set when ppr_dpo was allocated by this call and is not yet in the FIB */
+  int is_new_dpo = 0;
 
-  range = (protocol_port_range_t *) rwh;
+  fib_node_index_t fei;
+  fib_prefix_t pfx = {
+    .fp_proto = FIB_PROTOCOL_IP4,
+    .fp_len = length,
+    .fp_addr = {
+               .ip4 = *address,
+               },
+  };
+
+  /*
+   * check to see if we have already sourced this prefix
+   */
+  fei = fib_table_lookup_exact_match (fib_index, &pfx);
+
+  if (FIB_NODE_INDEX_INVALID == fei)
+    {
+      /*
+       * this is a first time add for this prefix.
+       */
+      ppr_dpo = protocol_port_range_dpo_alloc ();
+      is_new_dpo = 1;
+    }
+  else
+    {
+      /*
+       * the prefix is already there.
+       * check it was sourced by us, and if so get the range DPO from it.
+       */
+      dpo_id_t dpo = DPO_NULL;
+      const dpo_id_t *bucket;
+
+      if (fib_entry_get_dpo_for_source (fei, FIB_SOURCE_SPECIAL, &dpo))
+       {
+         /*
+          * there is existing state. we'll want to add the new ranges to it
+          */
+         bucket =
+           load_balance_get_bucket_i (load_balance_get (dpo.dpoi_index), 0);
+         ppr_dpo = protocol_port_range_dpo_get (bucket->dpoi_index);
+         dpo_reset (&dpo);
+       }
+      else
+       {
+         /*
+          * there is no PPR state associated with this prefix,
+          * so we'll need a new DPO
+          */
+         ppr_dpo = protocol_port_range_dpo_alloc ();
+         is_new_dpo = 1;
+       }
+    }
 
-  if (vec_len (low_ports) > 8 * srm->ranges_per_adjacency)
-    return VNET_API_ERROR_EXCEEDED_NUMBER_OF_RANGES_CAPACITY;
+  if (vec_len (low_ports) > ppr_dpo->n_free_ranges)
+    {
+      /* do not leak a DPO that was allocated above but never installed */
+      if (is_new_dpo)
+       pool_put (ppr_dpo_pool, ppr_dpo);
+      return VNET_API_ERROR_EXCEEDED_NUMBER_OF_RANGES_CAPACITY;
+    }
 
   j = k = 0;
 
   for (i = 0; i < vec_len (low_ports); i++)
     {
-      for (; j < srm->ranges_per_adjacency; j++)
+      for (; j < N_BLOCKS_PER_DPO; j++)
        {
          for (; k < 8; k++)
            {
-             if (range->low.as_u16[k] == 0)
+             if (ppr_dpo->blocks[j].low.as_u16[k] == 0)
                {
-                 range->low.as_u16[k] = low_ports[i];
-                 range->hi.as_u16[k] = high_ports[i];
-                 k++;
-                 if (k == 7)
-                   {
-                     k = 0;
-                     j++;
-                   }
-                 goto doublebreak2;
+                 ppr_dpo->blocks[j].low.as_u16[k] = low_ports[i];
+                 ppr_dpo->blocks[j].hi.as_u16[k] = high_ports[i];
+                 /* keep the free-slot count used by the capacity check current */
+                 ppr_dpo->n_free_ranges--;
+                 /* only ever grow: earlier adds may already use later blocks */
+                 if (j + 1 > ppr_dpo->n_used_blocks)
+                   ppr_dpo->n_used_blocks = j + 1;
+                 goto doublebreak;
                }
            }
+         /* restart the slot scan at lane 0 of the next block */
          k = 0;
-         range++;
        }
-      j = 0;
-      /* Too many ports specified... */
-      return VNET_API_ERROR_EXCEEDED_NUMBER_OF_PORTS_CAPACITY;
-
-    doublebreak2:;
+    doublebreak:;
     }
 
-  memset (&a, 0, sizeof (a));
-  a.flags = IP4_ROUTE_FLAG_FIB_INDEX;
-  a.table_index_or_table_id = fib_index;
-  a.dst_address = address[0];
-  a.dst_address_length = length;
-  a.add_adj = &template_adj;
-  a.n_add_adj = 1;
+  /*
+   * add or update the entry in the FIB
+   */
+  dpo_set (&dpop, ppr_dpo_type, DPO_PROTO_IP4, (ppr_dpo - ppr_dpo_pool));
+
+  if (FIB_NODE_INDEX_INVALID == fei)
+    {
+      fib_table_entry_special_dpo_add (fib_index,
+                                      &pfx,
+                                      FIB_SOURCE_SPECIAL,
+                                      FIB_ENTRY_FLAG_NONE, &dpop);
+    }
+  else
+    {
+      fib_table_entry_special_dpo_update (fei,
+                                         FIB_SOURCE_SPECIAL,
+                                         FIB_ENTRY_FLAG_NONE, &dpop);
+    }
 
-  ip4_add_del_route (im, &a);
   return 0;
 }
 
-int
-remove_port_range_adjacency (ip4_address_t * address,
-                            u32 length,
-                            u32 adj_index,
-                            u16 * low_ports, u16 * high_ports, u32 fib_index)
+/**
+ * Remove port ranges from the protocol-port-range DPO attached to a prefix.
+ *
+ * Looks up the exact prefix address/length in the FIB table fib_index and
+ * fetches the DPO that FIB_SOURCE_SPECIAL contributed. Every slot whose
+ * (low, hi) pair equals (low_ports[i], high_ports[i]) is zeroed, the
+ * number of free slots is recounted, and if every slot is free the FIB
+ * entry is removed.
+ *
+ * @param fib_index   FIB table index
+ * @param address     IPv4 prefix address
+ * @param length      prefix length
+ * @param low_ports   vector of low ports to remove
+ * @param high_ports  vector of high ports, indexed in parallel with low_ports
+ *
+ * @return 0 on success, VNET_API_ERROR_INCORRECT_ADJACENCY_TYPE when the
+ *         prefix is absent or has no FIB_SOURCE_SPECIAL DPO.
+ */
+static int
+remove_port_range_adjacency (u32 fib_index,
+                            ip4_address_t * address,
+                            u32 length, u16 * low_ports, u16 * high_ports)
 {
-  ip_adjacency_t *adj;
+  protocol_port_range_dpo_t *ppr_dpo;
+  fib_node_index_t fei;
   int i, j, k;
-  source_range_check_main_t *srm = &source_range_check_main;
-  ip4_main_t *im = &ip4_main;
-  ip_lookup_main_t *lm = &im->lookup_main;
-  protocol_port_range_t *range;
-  u8 *rwh;
 
-  adj = ip_get_adjacency (lm, adj_index);
-  if (adj->lookup_next_index != IP_LOOKUP_NEXT_ICMP_ERROR)     /* _ICMP_ERROR is a dummy placeholder */
-    return VNET_API_ERROR_INCORRECT_ADJACENCY_TYPE;
+  fib_prefix_t pfx = {
+    .fp_proto = FIB_PROTOCOL_IP4,
+    .fp_len = length,
+    .fp_addr = {
+               .ip4 = *address,
+               },
+  };
+
+  /*
+   * check to see if we have sourced this prefix
+   */
+  fei = fib_table_lookup_exact_match (fib_index, &pfx);
 
-  rwh = (u8 *) (&adj->rewrite_header);
+  if (FIB_NODE_INDEX_INVALID == fei)
+    {
+      /*
+       * not one of ours
+       */
+      return VNET_API_ERROR_INCORRECT_ADJACENCY_TYPE;
+    }
+  else
+    {
+      /*
+       * the prefix is already there.
+       * check it was sourced by us
+       */
+      dpo_id_t dpo = DPO_NULL;
+      const dpo_id_t *bucket;
+
+      if (fib_entry_get_dpo_for_source (fei, FIB_SOURCE_SPECIAL, &dpo))
+       {
+         /*
+          * there is existing state. we'll want to remove the ranges from it
+          */
+         bucket =
+           load_balance_get_bucket_i (load_balance_get (dpo.dpoi_index), 0);
+         ppr_dpo = protocol_port_range_dpo_get (bucket->dpoi_index);
+         dpo_reset (&dpo);
+       }
+      else
+       {
+         /*
+          * not one of ours
+          */
+         return VNET_API_ERROR_INCORRECT_ADJACENCY_TYPE;
+       }
+    }
 
   for (i = 0; i < vec_len (low_ports); i++)
     {
-      range = (protocol_port_range_t *) rwh;
-      for (j = 0; j < srm->ranges_per_adjacency; j++)
+      for (j = 0; j < N_BLOCKS_PER_DPO; j++)
        {
          for (k = 0; k < 8; k++)
            {
-             if (low_ports[i] == range->low.as_u16[k] &&
-                 high_ports[i] == range->hi.as_u16[k])
+             if (low_ports[i] == ppr_dpo->blocks[j].low.as_u16[k] &&
+                 high_ports[i] == ppr_dpo->blocks[j].hi.as_u16[k])
                {
-                 range->low.as_u16[k] = range->hi.as_u16[k] = 0;
+                 ppr_dpo->blocks[j].low.as_u16[k] =
+                   ppr_dpo->blocks[j].hi.as_u16[k] = 0;
                  goto doublebreak;
                }
            }
-         range++;
        }
     doublebreak:;
     }
 
-  range = (protocol_port_range_t *) rwh;
+  ppr_dpo->n_free_ranges = 0;
+
   /* Have we deleted all ranges yet? */
-  for (i = 0; i < srm->ranges_per_adjacency; i++)
+  /* i walks the blocks, j walks the 8 lanes of each block */
+  for (i = 0; i < N_BLOCKS_PER_DPO; i++)
     {
       for (j = 0; j < 8; j++)
        {
-         if (range->low.as_u16[i] != 0)
-           goto still_occupied;
+         if (ppr_dpo->blocks[i].low.as_u16[j] == 0)
+           ppr_dpo->n_free_ranges++;
        }
-      range++;
     }
-  /* Yes, lose the adjacency... */
-  {
-    ip4_add_del_route_args_t a;
-
-    memset (&a, 0, sizeof (a));
-    a.flags = IP4_ROUTE_FLAG_FIB_INDEX | IP4_ROUTE_FLAG_DEL;
-    a.table_index_or_table_id = fib_index;
-    a.dst_address = address[0];
-    a.dst_address_length = length;
-    a.adj_index = adj_index;
-    ip4_add_del_route (im, &a);
-  }
-
-still_occupied:
-  ;
+
+  if (N_PORT_RANGES_PER_DPO == ppr_dpo->n_free_ranges)
+    {
+      /* Yes, lose the adjacency... */
+      fib_table_entry_special_remove (fib_index, &pfx, FIB_SOURCE_SPECIAL);
+    }
+  else
+    {
+      /*
+       * compact the ranges down to a contiguous block
+       */
+      // FIXME. TODO.
+    }
+
   return 0;
 }
 
@@ -1010,35 +1105,19 @@ ip4_source_and_port_range_check_add_del (ip4_address_t * address,
                                         u16 * low_ports,
                                         u16 * high_ports, int is_add)
 {
-
-  ip4_main_t *im = &ip4_main;
-  //  ip_lookup_main_t * lm = &im->lookup_main;
-  uword *p;
   u32 fib_index;
-  u32 adj_index;
-
-  p = hash_get (im->fib_index_by_table_id, vrf_id);
-  if (!p)
-    {
-      ip4_fib_t *f;
-      f = find_ip4_fib_by_table_index_or_id (im, vrf_id, 0 /* flags */ );
-      fib_index = f->index;
-    }
-  else
-    fib_index = p[0];
 
-  adj_index = ip4_fib_lookup_with_table
-    (im, fib_index, address, 0 /* disable_default_route */ );
+  fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, vrf_id);
 
   if (is_add == 0)
     {
-      remove_port_range_adjacency (address, length, adj_index, low_ports,
-                                  high_ports, fib_index);
+      remove_port_range_adjacency (fib_index, address, length,
+                                  low_ports, high_ports);
     }
   else
     {
-      add_port_range_adjacency (address, length, adj_index, low_ports,
-                               high_ports, fib_index);
+      add_port_range_adjacency (fib_index, address, length,
+                               low_ports, high_ports);
     }
 
   return 0;
@@ -1159,24 +1238,20 @@ show_source_and_port_range_check_fn (vlib_main_t * vm,
                                     unformat_input_t * input,
                                     vlib_cli_command_t * cmd)
 {
-  source_range_check_main_t *srm = &source_range_check_main;
-  ip4_main_t *im = &ip4_main;
-  ip_lookup_main_t *lm = &im->lookup_main;
-  protocol_port_range_t *range;
+  protocol_port_range_dpo_t *ppr_dpo;
   u32 fib_index;
-  ip4_address_t addr;
   u8 addr_set = 0;
   u32 vrf_id = ~0;
   int rv, i, j;
-  u32 adj_index;
-  ip_adjacency_t *adj;
   u32 port = 0;
-  u8 *rwh;
-  uword *p;
+  fib_prefix_t pfx = {
+    .fp_proto = FIB_PROTOCOL_IP4,
+    .fp_len = 32,
+  };
 
   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
     {
-      if (unformat (input, "%U", unformat_ip4_address, &addr))
+      if (unformat (input, "%U", unformat_ip4_address, &pfx.fp_addr.ip4))
        addr_set = 1;
       else if (unformat (input, "vrf %d", &vrf_id))
        ;
@@ -1192,51 +1267,58 @@ show_source_and_port_range_check_fn (vlib_main_t * vm,
   if (vrf_id == ~0)
     return clib_error_return (0, "VRF ID required, not specified");
 
-  p = hash_get (im->fib_index_by_table_id, vrf_id);
-  if (p == 0)
+  fib_index = fib_table_find (FIB_PROTOCOL_IP4, vrf_id);
+  if (~0 == fib_index)
     return clib_error_return (0, "VRF %d not found", vrf_id);
-  fib_index = p[0];
 
-  adj_index = ip4_fib_lookup_with_table
-    (im, fib_index, &addr, 0 /* disable_default_route */ );
+  /*
+   * find the longest prefix match on the address requested,
+   * check it was sourced by us
+   */
+  dpo_id_t dpo = DPO_NULL;
+  const dpo_id_t *bucket;
 
-  adj = ip_get_adjacency (lm, adj_index);
-
-  if (adj->lookup_next_index != IP_LOOKUP_NEXT_ICMP_ERROR)
+  if (!fib_entry_get_dpo_for_source (fib_table_lookup (fib_index, &pfx),
+                                    FIB_SOURCE_SPECIAL, &dpo))
     {
-      vlib_cli_output (vm, "%U: src address drop", format_ip4_address, &addr);
+      /*
+       * not one of ours
+       */
+      vlib_cli_output (vm, "%U: src address drop", format_ip4_address,
+                      &pfx.fp_addr.ip4);
       return 0;
     }
 
+  bucket = load_balance_get_bucket_i (load_balance_get (dpo.dpoi_index), 0);
+  ppr_dpo = protocol_port_range_dpo_get (bucket->dpoi_index);
+  dpo_reset (&dpo);
+
   if (port)
     {
-      rv = check_adj_port_range_x1 (adj, (u16) port, 1234);
+      rv = check_adj_port_range_x1 (ppr_dpo, (u16) port, 1234);
       if (rv == 1234)
        vlib_cli_output (vm, "%U port %d PASS", format_ip4_address,
-                        &addr, port);
+                        &pfx.fp_addr.ip4, port);
       else
        vlib_cli_output (vm, "%U port %d FAIL", format_ip4_address,
-                        &addr, port);
+                        &pfx.fp_addr.ip4, port);
       return 0;
     }
   else
     {
       u8 *s;
-      rwh = (u8 *) (&adj->rewrite_header);
-
-      s = format (0, "%U: ", format_ip4_address, &addr);
 
-      range = (protocol_port_range_t *) rwh;
+      s = format (0, "%U: ", format_ip4_address, &pfx.fp_addr.ip4);
 
-      for (i = 0; i < srm->ranges_per_adjacency; i++)
+      for (i = 0; i < N_BLOCKS_PER_DPO; i++)
        {
          for (j = 0; j < 8; j++)
            {
-             if (range->low.as_u16[j])
-               s = format (s, "%d - %d ", (u32) range->low.as_u16[j],
-                           (u32) range->hi.as_u16[j]);
+             if (ppr_dpo->blocks[i].low.as_u16[j])
+               s = format (s, "%d - %d ",
+                           (u32) ppr_dpo->blocks[i].low.as_u16[j],
+                           (u32) ppr_dpo->blocks[i].hi.as_u16[j]);
            }
-         range++;
        }
       vlib_cli_output (vm, "%s", s);
       vec_free (s);
index 1f8e721..2323ac2 100644 (file)
@@ -38,6 +38,8 @@
  */
 
 #include <vnet/ip/ip.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/dpo/load_balance.h>
 
 typedef struct {
   u8 packet_data[64];
@@ -110,9 +112,12 @@ ip4_source_check_inline (vlib_main_t * vm,
          ip4_fib_mtrie_t * mtrie0, * mtrie1;
          ip4_fib_mtrie_leaf_t leaf0, leaf1;
          ip4_source_check_config_t * c0, * c1;
-         ip_adjacency_t * adj0, * adj1;
-         u32 pi0, next0, pass0, adj_index0;
-         u32 pi1, next1, pass1, adj_index1;
+         const load_balance_t * lb0, * lb1;
+         u32 pi0, next0, pass0, lb_index0;
+         u32 pi1, next1, pass1, lb_index1;
+          const ip_adjacency_t *adj0, *adj1;
+          const dpo_id_t *dpo0, *dpo1;
+          u32 ii0, ii1;
 
          /* Prefetch next iteration. */
          {
@@ -150,8 +155,8 @@ ip4_source_check_inline (vlib_main_t * vm,
                                     &next1,
                                     sizeof (c1[0]));
 
-         mtrie0 = &vec_elt_at_index (im->fibs, c0->fib_index)->mtrie;
-         mtrie1 = &vec_elt_at_index (im->fibs, c1->fib_index)->mtrie;
+         mtrie0 = &ip4_fib_get (c0->fib_index)->mtrie;
+         mtrie1 = &ip4_fib_get (c1->fib_index)->mtrie;
 
          leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
 
@@ -167,29 +172,70 @@ ip4_source_check_inline (vlib_main_t * vm,
          leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
          leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 3);
 
-         adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
-         adj_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
+         lb_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+         lb_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
 
-         ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, c0->fib_index,
-                                                          &ip0->src_address,
-                                                          c0->no_default_route));
-         ASSERT (adj_index1 == ip4_fib_lookup_with_table (im, c1->fib_index,
-                                                          &ip1->src_address,
-                                                          c1->no_default_route));
-
-         adj0 = ip_get_adjacency (lm, adj_index0);
-         adj1 = ip_get_adjacency (lm, adj_index1);
+         lb0 = load_balance_get(lb_index0);
+         lb1 = load_balance_get(lb_index1);
 
          /* Pass multicast. */
          pass0 = ip4_address_is_multicast (&ip0->src_address) || ip0->src_address.as_u32 == clib_host_to_net_u32(0xFFFFFFFF);
          pass1 = ip4_address_is_multicast (&ip1->src_address) || ip1->src_address.as_u32 == clib_host_to_net_u32(0xFFFFFFFF);
 
-         pass0 |= (adj0->lookup_next_index == IP_LOOKUP_NEXT_REWRITE
-                   && (source_check_type == IP4_SOURCE_CHECK_REACHABLE_VIA_ANY
-                       || vnet_buffer (p0)->sw_if_index[VLIB_RX] == adj0->rewrite_header.sw_if_index));
-         pass1 |= (adj1->lookup_next_index == IP_LOOKUP_NEXT_REWRITE
-                   && (source_check_type == IP4_SOURCE_CHECK_REACHABLE_VIA_ANY
-                       || vnet_buffer (p1)->sw_if_index[VLIB_RX] == adj1->rewrite_header.sw_if_index));
+          if (PREDICT_TRUE(1 == lb0->lb_n_buckets))
+          {
+              dpo0 = load_balance_get_bucket_i(lb0, 0);
+              if (PREDICT_TRUE(dpo0->dpoi_type == DPO_ADJACENCY))
+              {
+                  pass0 |= (source_check_type ==
+                            IP4_SOURCE_CHECK_REACHABLE_VIA_ANY);
+                  adj0 = adj_get(dpo0->dpoi_index);
+                  pass0 |= (vnet_buffer (p0)->sw_if_index[VLIB_RX] ==
+                            adj0->rewrite_header.sw_if_index);
+              }
+          }
+          else
+          {
+              for (ii0 = 0; ii0 < lb0->lb_n_buckets && !pass0; ii0++)
+              {
+                  dpo0 = load_balance_get_bucket_i(lb0, ii0);
+                  if (PREDICT_TRUE(dpo0->dpoi_type == DPO_ADJACENCY))
+                  {
+                      pass0 |= (source_check_type ==
+                                IP4_SOURCE_CHECK_REACHABLE_VIA_ANY);
+                      adj0 = adj_get(dpo0->dpoi_index);
+                      pass0 |= (vnet_buffer (p0)->sw_if_index[VLIB_RX] ==
+                                adj0->rewrite_header.sw_if_index);
+                  }
+              }
+          }
+          if (PREDICT_TRUE(1 == lb1->lb_n_buckets))
+          {
+              dpo1 = load_balance_get_bucket_i(lb1, 0);
+              if (PREDICT_TRUE(dpo1->dpoi_type == DPO_ADJACENCY))
+              {
+                  pass1 |= (source_check_type ==
+                            IP4_SOURCE_CHECK_REACHABLE_VIA_ANY);
+                  adj1 = adj_get(dpo1->dpoi_index);
+                  pass1 |= (vnet_buffer (p1)->sw_if_index[VLIB_RX] ==
+                            adj1->rewrite_header.sw_if_index);
+              }
+          }
+          else
+          {
+              for (ii1 = 0; ii1 < lb1->lb_n_buckets && !pass1; ii1++)
+              {
+                  dpo1 = load_balance_get_bucket_i(lb1, ii1);
+                 if (PREDICT_TRUE(dpo1->dpoi_type == DPO_ADJACENCY))
+                  {
+                      pass1 |= (source_check_type ==
+                                IP4_SOURCE_CHECK_REACHABLE_VIA_ANY);
+                      adj1 = adj_get(dpo1->dpoi_index);
+                      pass1 |= (vnet_buffer (p1)->sw_if_index[VLIB_RX] ==
+                                adj1->rewrite_header.sw_if_index);
+                  }
+              }
+          }
 
          next0 = (pass0 ? next0 : IP4_SOURCE_CHECK_NEXT_DROP);
          next1 = (pass1 ? next1 : IP4_SOURCE_CHECK_NEXT_DROP);
@@ -210,7 +256,10 @@ ip4_source_check_inline (vlib_main_t * vm,
          ip4_fib_mtrie_leaf_t leaf0;
          ip4_source_check_config_t * c0;
          ip_adjacency_t * adj0;
-         u32 pi0, next0, pass0, adj_index0;
+         u32 pi0, next0, pass0, lb_index0;
+         const load_balance_t * lb0;
+          const dpo_id_t *dpo0;
+          u32 ii0;
 
          pi0 = from[0];
          to_next[0] = pi0;
@@ -227,7 +276,7 @@ ip4_source_check_inline (vlib_main_t * vm,
                                     &next0,
                                     sizeof (c0[0]));
 
-         mtrie0 = &vec_elt_at_index (im->fibs, c0->fib_index)->mtrie;
+         mtrie0 = &ip4_fib_get (c0->fib_index)->mtrie;
 
          leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
 
@@ -239,19 +288,40 @@ ip4_source_check_inline (vlib_main_t * vm,
 
          leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
 
-         adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+         lb_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
 
-         ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, c0->fib_index,
-                                                          &ip0->src_address,
-                                                          c0->no_default_route));
-         adj0 = ip_get_adjacency (lm, adj_index0);
+         lb0 = load_balance_get(lb_index0);
 
          /* Pass multicast. */
          pass0 = ip4_address_is_multicast (&ip0->src_address) || ip0->src_address.as_u32 == clib_host_to_net_u32(0xFFFFFFFF);
 
-         pass0 |= (adj0->lookup_next_index == IP_LOOKUP_NEXT_REWRITE
-                   && (source_check_type == IP4_SOURCE_CHECK_REACHABLE_VIA_ANY
-                       || vnet_buffer (p0)->sw_if_index[VLIB_RX] == adj0->rewrite_header.sw_if_index));
+          if (PREDICT_TRUE(1 == lb0->lb_n_buckets))
+          {
+              dpo0 = load_balance_get_bucket_i(lb0, 0);
+              if (PREDICT_TRUE(dpo0->dpoi_type == DPO_ADJACENCY))
+              {
+                  pass0 |= (source_check_type ==
+                            IP4_SOURCE_CHECK_REACHABLE_VIA_ANY);
+                  adj0 = adj_get(dpo0->dpoi_index);
+                  pass0 |= (vnet_buffer (p0)->sw_if_index[VLIB_RX] ==
+                            adj0->rewrite_header.sw_if_index);
+              }
+          }
+          else
+          {
+              for (ii0 = 0; ii0 < lb0->lb_n_buckets && !pass0; ii0++)
+              {
+                  dpo0 = load_balance_get_bucket_i(lb0, ii0);
+                  if (PREDICT_TRUE(dpo0->dpoi_type == DPO_ADJACENCY))
+                  {
+                      pass0 |= (source_check_type ==
+                                IP4_SOURCE_CHECK_REACHABLE_VIA_ANY);
+                      adj0 = adj_get(dpo0->dpoi_index);
+                      pass0 |= (vnet_buffer (p0)->sw_if_index[VLIB_RX] ==
+                                adj0->rewrite_header.sw_if_index);
+                  }
+              }
+          }
 
          next0 = (pass0 ? next0 : IP4_SOURCE_CHECK_NEXT_DROP);
          p0->error = error_node->errors[IP4_ERROR_UNICAST_SOURCE_CHECK_FAILS];
index ff088e7..b76a719 100644 (file)
@@ -142,7 +142,7 @@ thrash (vlib_main_t * vm,
     }
 
   /* Find or create FIB table 11 */
-  fib = find_ip4_fib_by_table_index_or_id (im, table_id, IP4_ROUTE_FLAG_TABLE_ID);
+  fib = ip4_fib_find_or_create_fib_by_table_id (table_id);
 
   for (i = tm->test_interfaces_created; i < ninterfaces; i++)
     {
@@ -164,6 +164,7 @@ thrash (vlib_main_t * vm,
       hw = vnet_get_hw_interface (vnm, hw_if_index);
       vec_validate (im->fib_index_by_sw_if_index, hw->sw_if_index);
       im->fib_index_by_sw_if_index[hw->sw_if_index] = fib->index;
+      ip4_sw_interface_enable_disable(sw_if_index, 1);
     }
 
   tm->test_interfaces_created = ninterfaces;
index f5f3de8..36be649 100644 (file)
@@ -71,27 +71,11 @@ typedef struct {
   u32 index;
 
   /* flow hash configuration */
-  u32 flow_hash_config;
+  flow_hash_config_t flow_hash_config;
 } ip6_fib_t;
 
 struct ip6_main_t;
 
-typedef void (ip6_add_del_route_function_t)
-  (struct ip6_main_t * im,
-   uword opaque,
-   ip6_fib_t * fib,
-   u32 flags,
-   ip6_address_t * address,
-   u32 address_length,
-   void * old_result,
-   void * new_result);
-
-typedef struct {
-  ip6_add_del_route_function_t * function;
-  uword required_flags;
-  uword function_opaque;
-} ip6_add_del_route_callback_t;
-
 typedef void (ip6_add_del_interface_address_function_t)
   (struct ip6_main_t * im,
    uword opaque,
@@ -106,31 +90,63 @@ typedef struct {
   uword function_opaque;
 } ip6_add_del_interface_address_callback_t;
 
-typedef struct ip6_main_t {
-  BVT(clib_bihash) ip6_lookup_table;
+/**
+ * Enumeration of the FIB table instance types
+ */
+typedef enum ip6_fib_table_instance_type_t_ {
+    /**
+     * This table stores the routes that are used to forward traffic.
+     * The key is the prefix, the result the adjacency to forward on.
+     */
+    IP6_FIB_TABLE_FWDING,
+    /**
+     * The table that stores ALL routes learned by the DP.
+     * Some of these routes may not be ready to install in forwarding 
+     * at a given time. 
+     * The key in this table is the prefix, the result is the fib_entry_t
+     */
+    IP6_FIB_TABLE_NON_FWDING,
+} ip6_fib_table_instance_type_t;
+
+#define IP6_FIB_NUM_TABLES (IP6_FIB_TABLE_NON_FWDING+1)
 
-  ip_lookup_main_t lookup_main;
+/**
+ * A representation of a single IP6 table
+ */
+typedef struct ip6_fib_table_instance_t_ {
+  /* The hash table */
+  BVT(clib_bihash) ip6_hash;
 
   /* bitmap / refcounts / vector of mask widths to search */
   uword * non_empty_dst_address_length_bitmap;
   u8 * prefix_lengths_in_search_order;
   i32 dst_address_length_refcounts[129];
+} ip6_fib_table_instance_t;
+
+typedef struct ip6_main_t {
+  /**
+   * The two FIB tables; fwding and non-fwding
+   */
+  ip6_fib_table_instance_t ip6_table[IP6_FIB_NUM_TABLES];
+
+  ip_lookup_main_t lookup_main;
   
-  /* Vector of FIBs. */
-  ip6_fib_t * fibs;
+  /* Pool of FIBs. */
+  struct fib_table_t_ * fibs;
 
+  /* Network byte order subnet mask for each prefix length */
   ip6_address_t fib_masks[129];
 
   /* Table index indexed by software interface. */
   u32 * fib_index_by_sw_if_index;
 
+  /* IP6 enabled count by software interface */
+  u8 * ip_enabled_by_sw_if_index;
+
   /* Hash table mapping table id to fib index.
      ID space is not necessarily dense; index space is dense. */
   uword * fib_index_by_table_id;
 
-  /* Vector of functions to call when routes are added/deleted. */
-  ip6_add_del_route_callback_t * add_del_route_callbacks;
-
   /* Hash table mapping interface rewrite adjacency index by sw if index. */
   uword * interface_route_adj_index_by_sw_if_index;
 
@@ -156,8 +172,10 @@ typedef struct ip6_main_t {
   u32 ip6_unicast_rx_feature_l2tp_decap;
   u32 ip6_unicast_rx_feature_vpath;
   u32 ip6_unicast_rx_feature_lookup;
+  u32 ip6_unicast_rx_feature_drop;
 
   /* Built-in multicast feature path indices */
+  u32 ip6_multicast_rx_feature_drop;
   u32 ip6_multicast_rx_feature_vpath;
   u32 ip6_multicast_rx_feature_lookup;
   
@@ -226,6 +244,8 @@ extern vlib_node_registration_t ip6_input_node;
 extern vlib_node_registration_t ip6_rewrite_node;
 extern vlib_node_registration_t ip6_rewrite_local_node;
 extern vlib_node_registration_t ip6_discover_neighbor_node;
+extern vlib_node_registration_t ip6_glean_node;
+extern vlib_node_registration_t ip6_midchain_node;
 
 extern vlib_node_registration_t ip6_icmp_neighbor_discovery_event_node;
 
@@ -242,40 +262,10 @@ typedef union {
   } up_down_event;
 } ip6_icmp_neighbor_discovery_event_data_t;
 
-u32 ip6_fib_lookup (ip6_main_t * im, u32 sw_if_index, ip6_address_t * dst);
-u32 ip6_fib_lookup_with_table (ip6_main_t * im, u32 fib_index, 
-                               ip6_address_t * dst);
-
-/**
- * \brief Get or create an IPv6 fib.
- *
- * Get or create an IPv6 fib with the provided fib ID or index.
- * The fib ID is a possibly-sparse user-defined value while
- * the fib index defines the position of the fib in the fib vector.
- *
- * \param im
- *      ip6_main pointer.
- * \param table_index_or_id
- *      The table index if \c IP6_ROUTE_FLAG_FIB_INDEX bit is set in \p flags.
- *      Otherwise, when set to \c ~0, an arbitrary and unused fib ID is picked
- *      and can be retrieved with \c ret->table_id.
- *      Otherwise, it is the fib ID to be used to retrieve or create the desired fib.
- * \param flags
- *      Indicates whether \p table_index_or_id is the fib index or ID.
- *      When the bit \c IP6_ROUTE_FLAG_FIB_INDEX is set, \p table_index_or_id
- *      is considered as the fib index, and the fib ID otherwise.
- * \return A pointer to the retrieved or created fib.
- *
- * \remark When getting a fib with the fib index, the fib MUST already exist.
- */
-ip6_fib_t * find_ip6_fib_by_table_index_or_id (ip6_main_t * im, 
-                                               u32 table_index_or_id, 
-                                               u32 flags);
-
 always_inline uword
-ip6_destination_matches_route (ip6_main_t * im,
-                              ip6_address_t * key,
-                              ip6_address_t * dest,
+ip6_destination_matches_route (const ip6_main_t * im,
+                              const ip6_address_t * key,
+                              const ip6_address_t * dest,
                               uword dest_length)
 {
   int i;
@@ -313,25 +303,26 @@ ip6_unaligned_destination_matches_route (ip6_main_t * im,
 }
 
 always_inline int
-ip6_src_address_for_packet (ip6_main_t * im, vlib_buffer_t * p, ip6_address_t * src, u32 sw_if_index)
-{
-  ip_lookup_main_t * lm = &im->lookup_main;
-  ip_interface_address_t * ia = ip_interface_address_for_packet (lm, p, sw_if_index);
-  if (ia == NULL)
-    return -1;
-  ip6_address_t * a = ip_interface_address_get_address (lm, ia);
-  *src = a[0];
-  return 0;
-}
-
-always_inline u32
-ip6_src_lookup_for_packet (ip6_main_t * im, vlib_buffer_t * b, ip6_header_t * i)
+ip6_src_address_for_packet (ip_lookup_main_t * lm,
+                           u32 sw_if_index,
+                           ip6_address_t * src)
 {
-  if (vnet_buffer (b)->ip.adj_index[VLIB_RX] == ~0)
-    vnet_buffer (b)->ip.adj_index[VLIB_RX]
-      = ip6_fib_lookup (im, vnet_buffer (b)->sw_if_index[VLIB_RX],
-                       &i->src_address);
-  return vnet_buffer (b)->ip.adj_index[VLIB_RX];
+    u32 if_add_index = 
+       lm->if_address_pool_index_by_sw_if_index[sw_if_index];
+    if (PREDICT_TRUE(if_add_index != ~0)) {
+       ip_interface_address_t *if_add = 
+           pool_elt_at_index(lm->if_address_pool, if_add_index);
+       ip6_address_t *if_ip = 
+           ip_interface_address_get_address(lm, if_add);
+       *src = *if_ip;
+       return (0);
+    }
+    else
+    {
+       src->as_u64[0] = 0;
+       src->as_u64[1] = 0;
+    }
+    return (!0);
 }
 
 /* Find interface address which matches destination. */
@@ -362,95 +353,12 @@ clib_error_t *
 ip6_add_del_interface_address (vlib_main_t * vm, u32 sw_if_index,
                               ip6_address_t * address, u32 address_length,
                               u32 is_del);
+void
+ip6_sw_interface_enable_disable (u32 sw_if_index,
+                                u32 is_enable);
 
 int ip6_address_compare (ip6_address_t * a1, ip6_address_t * a2);
 
-/* Add/del a route to the FIB. */
-
-#define IP6_ROUTE_FLAG_ADD (0 << 0)
-#define IP6_ROUTE_FLAG_DEL (1 << 0)
-#define IP6_ROUTE_FLAG_TABLE_ID  (0 << 1)
-#define IP6_ROUTE_FLAG_FIB_INDEX (1 << 1)
-#define IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY (1 << 2)
-#define IP6_ROUTE_FLAG_NO_REDISTRIBUTE (1 << 3)
-#define IP6_ROUTE_FLAG_NOT_LAST_IN_GROUP (1 << 4)
-/* Dynamic route created via neighbor discovery. */
-#define IP6_ROUTE_FLAG_NEIGHBOR (1 << 5)
-
-typedef struct {
-  /* IP6_ROUTE_FLAG_* */
-  u32 flags;
-
-  /* Either index of fib or table_id to hash and get fib.
-     IP6_ROUTE_FLAG_FIB_INDEX specifies index; otherwise table_id is assumed. */
-  u32 table_index_or_table_id;
-
-  /* Destination address (prefix) and length. */
-  ip6_address_t dst_address;
-  u32 dst_address_length;
-
-  /* Adjacency to use for this destination. */
-  u32 adj_index;
-
-  /* If specified adjacencies to add and then
-     use for this destination.  add_adj/n_add_adj
-     are override adj_index if specified. */
-  ip_adjacency_t * add_adj;
-  u32 n_add_adj;
-} ip6_add_del_route_args_t;
-
-void ip6_add_del_route (ip6_main_t * im, ip6_add_del_route_args_t * args);
-
-void ip6_add_del_route_next_hop (ip6_main_t * im,
-                                u32 flags,
-                                ip6_address_t * dst_address,
-                                u32 dst_address_length,
-                                ip6_address_t * next_hop,
-                                u32 next_hop_sw_if_index,
-                                u32 next_hop_weight, u32 adj_index,
-                                 u32 explicit_fib_index);
-
-u32
-ip6_route_get_next_hop_adj (ip6_main_t * im,
-                           u32 fib_index,
-                           ip6_address_t *next_hop,
-                           u32 next_hop_sw_if_index,
-                           u32 explicit_fib_index);
-
-u32
-ip6_get_route (ip6_main_t * im,
-              u32 fib_index_or_table_id,
-              u32 flags,
-              ip6_address_t * address,
-              u32 address_length);
-
-void
-ip6_foreach_matching_route (ip6_main_t * im,
-                           u32 table_index_or_table_id,
-                           u32 flags,
-                           ip6_address_t * address,
-                           u32 address_length,
-                           ip6_address_t ** results,
-                           u8 ** result_length);
-
-void ip6_delete_matching_routes (ip6_main_t * im,
-                                u32 table_index_or_table_id,
-                                u32 flags,
-                                ip6_address_t * address,
-                                u32 address_length);
-
-void ip6_maybe_remap_adjacencies (ip6_main_t * im,
-                                 u32 table_index_or_table_id,
-                                 u32 flags);
-
-void ip6_adjacency_set_interface_route (vnet_main_t * vnm,
-                                       ip_adjacency_t * adj,
-                                       u32 sw_if_index,
-                                       u32 if_address_index);
-
-u32
-vnet_ip6_neighbor_glean_add(u32 fib_index, void * next_hop_arg);
-
 clib_error_t *
 ip6_probe_neighbor (vlib_main_t * vm, ip6_address_t * dst, u32 sw_if_index);
 
@@ -481,8 +389,6 @@ vnet_unset_ip6_ethernet_neighbor (vlib_main_t * vm,
                                   ip6_address_t * a,
                                   u8 * link_layer_address,
                                   uword n_bytes_link_layer_address);
-void
-vnet_ip6_fib_init (ip6_main_t * im, u32 fib_index);
 
 void 
 ip6_link_local_address_from_ethernet_mac_address (ip6_address_t *ip,
@@ -492,7 +398,8 @@ void
 ip6_ethernet_mac_address_from_link_local_address (u8 *mac, 
                                                   ip6_address_t *ip);
 
-int vnet_set_ip6_flow_hash (u32 table_id, u32 flow_hash_config);
+int vnet_set_ip6_flow_hash (u32 table_id,
+                           flow_hash_config_t flow_hash_config);
 
 int
 ip6_neighbor_ra_config(vlib_main_t * vm, u32 sw_if_index, 
@@ -560,7 +467,8 @@ extern vlib_node_registration_t ip6_lookup_node;
 /* Compute flow hash.  We'll use it to select which Sponge to use for this
    flow.  And other things. */
 always_inline u32
-ip6_compute_flow_hash (ip6_header_t * ip, u32 flow_hash_config)
+ip6_compute_flow_hash (const ip6_header_t * ip,
+                      flow_hash_config_t flow_hash_config)
 {
     tcp_header_t * tcp = (void *) (ip + 1);
     u64 a, b, c;
index c977960..f7514dc 100644 (file)
 #include <vnet/ethernet/ethernet.h> /* for ethernet_header_t */
 #include <vnet/srp/srp.h>      /* for srp_hw_interface_class */
 #include <vppinfra/cache.h>
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/ip6_fib.h>
+#include <vnet/dpo/load_balance.h>
+#include <vnet/dpo/classify_dpo.h>
 
 #include <vppinfra/bihash_template.c>
 
-static void compute_prefix_lengths_in_search_order (ip6_main_t * im)
-{
-  int i;
-  vec_reset_length (im->prefix_lengths_in_search_order);
-  /* Note: bitmap reversed so this is in fact a longest prefix match */
-  clib_bitmap_foreach (i, im->non_empty_dst_address_length_bitmap,
-  ({
-    int dst_address_length = 128 - i;
-    vec_add1 (im->prefix_lengths_in_search_order, dst_address_length);
-  }));
-}
-
-u32 
-ip6_fib_lookup_with_table (ip6_main_t * im, u32 fib_index, ip6_address_t * dst)
-{
-  ip_lookup_main_t * lm = &im->lookup_main;
-  int i, len;
-  int rv;
-  BVT(clib_bihash_kv) kv, value;
-  u64 fib;
-
-  len = vec_len (im->prefix_lengths_in_search_order);
-
-  kv.key[0] = dst->as_u64[0];
-  kv.key[1] = dst->as_u64[1];
-  fib = ((u64)((fib_index))<<32);
-
-  for (i = 0; i < len; i++)
-    {
-      int dst_address_length = im->prefix_lengths_in_search_order[i];
-      ip6_address_t * mask = &im->fib_masks[dst_address_length];
-      
-      ASSERT(dst_address_length >= 0 && dst_address_length <= 128);
-      //As lengths are decreasing, masks are increasingly specific.
-      kv.key[0] &= mask->as_u64[0];
-      kv.key[1] &= mask->as_u64[1];
-      kv.key[2] = fib | dst_address_length;
-      
-      rv = BV(clib_bihash_search_inline_2)(&im->ip6_lookup_table, &kv, &value);
-      if (rv == 0)
-        return value.value;
-    }
-
-  return lm->miss_adj_index;
-}
-
-u32 ip6_fib_lookup (ip6_main_t * im, u32 sw_if_index, ip6_address_t * dst)
-{
-    u32 fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index);
-    return ip6_fib_lookup_with_table (im, fib_index, dst);
-}
-
-void
-vnet_ip6_fib_init (ip6_main_t * im, u32 fib_index)
-{
-  ip_lookup_main_t * lm = &im->lookup_main;
-  ip6_add_del_route_args_t a;
-  ip_adjacency_t * adj;
-
-  memset(&a, 0x0, sizeof(ip6_add_del_route_args_t));
-
-  a.table_index_or_table_id = fib_index;
-  a.flags = (IP6_ROUTE_FLAG_ADD
-            | IP6_ROUTE_FLAG_FIB_INDEX
-            | IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY
-            | IP6_ROUTE_FLAG_NO_REDISTRIBUTE);
-
-  /* Add ff02::1:ff00:0/104 via local route for all tables.
-     This is required for neighbor discovery to work. */
-  adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
-                         &a.adj_index);
-  adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
-  adj->if_address_index = ~0;
-  adj->rewrite_header.data_bytes = 0;
-
-  ip6_set_solicited_node_multicast_address (&a.dst_address, 0);
-
-  a.dst_address_length = 104;
-  ip6_add_del_route (im, &a);
-
-  /* Add all-routers multicast address via local route for all tables */
-  adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
-                         &a.adj_index);
-  adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
-  adj->if_address_index = ~0;
-  adj->rewrite_header.data_bytes = 0;
-
-  ip6_set_reserved_multicast_address (&a.dst_address,
-                                     IP6_MULTICAST_SCOPE_link_local,
-                                     IP6_MULTICAST_GROUP_ID_all_routers);
-  
-  a.dst_address_length = 128;  
-  ip6_add_del_route (im, &a);
-
-  /* Add all-nodes multicast address via local route for all tables */
-  adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
-                         &a.adj_index);
-  adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
-  adj->if_address_index = ~0;
-  adj->rewrite_header.data_bytes = 0;
-
-  ip6_set_reserved_multicast_address (&a.dst_address,
-                                     IP6_MULTICAST_SCOPE_link_local,
-                                     IP6_MULTICAST_GROUP_ID_all_hosts);
-
-  a.dst_address_length = 128;
-  ip6_add_del_route (im, &a);
-
-  /* Add all-mldv2  multicast address via local route for all tables */
-  adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
-                         &a.adj_index);
-  adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
-  adj->if_address_index = ~0;
-  adj->rewrite_header.data_bytes = 0;
-  
-  ip6_set_reserved_multicast_address (&a.dst_address,
-                                     IP6_MULTICAST_SCOPE_link_local,
-                                     IP6_MULTICAST_GROUP_ID_mldv2_routers);
-
-  a.dst_address_length = 128;
-  ip6_add_del_route (im, &a);
-}
-
-static ip6_fib_t *
-create_fib_with_table_id (ip6_main_t * im, u32 table_id)
-{
-  ip6_fib_t * fib;
-  hash_set (im->fib_index_by_table_id, table_id, vec_len (im->fibs));
-  vec_add2 (im->fibs, fib, 1);
-  fib->table_id = table_id;
-  fib->index = fib - im->fibs;
-  fib->flow_hash_config = IP_FLOW_HASH_DEFAULT;
-  vnet_ip6_fib_init (im, fib->index);
-  return fib;
-}
-
-ip6_fib_t *
-find_ip6_fib_by_table_index_or_id (ip6_main_t * im, u32 table_index_or_id, u32 flags)
-{
-  uword * p, fib_index;
-
-  fib_index = table_index_or_id;
-  if (! (flags & IP6_ROUTE_FLAG_FIB_INDEX))
-    {
-      if (table_index_or_id == ~0) {
-        table_index_or_id = 0;
-        while (hash_get (im->fib_index_by_table_id, table_index_or_id)) {
-          table_index_or_id++;
-        }
-        return create_fib_with_table_id (im, table_index_or_id);
-      }
-
-      p = hash_get (im->fib_index_by_table_id, table_index_or_id);
-      if (! p)
-       return create_fib_with_table_id (im, table_index_or_id);
-      fib_index = p[0];
-    }
-  return vec_elt_at_index (im->fibs, fib_index);
-}
-
-void ip6_add_del_route (ip6_main_t * im, ip6_add_del_route_args_t * a)
-{
-  ip_lookup_main_t * lm = &im->lookup_main;
-  ip6_fib_t * fib;
-  ip6_address_t dst_address;
-  u32 dst_address_length, adj_index;
-  uword is_del;
-  u32 old_adj_index = ~0;
-  BVT(clib_bihash_kv) kv, value;
-
-  vlib_smp_unsafe_warning();
-
-  is_del = (a->flags & IP6_ROUTE_FLAG_DEL) != 0;
-
-  /* Either create new adjacency or use given one depending on arguments. */
-  if (a->n_add_adj > 0)
-    {
-      ip_add_adjacency (lm, a->add_adj, a->n_add_adj, &adj_index);
-      ip_call_add_del_adjacency_callbacks (lm, adj_index, /* is_del */ 0);
-    }
-  else
-    adj_index = a->adj_index;
-
-  dst_address = a->dst_address;
-  dst_address_length = a->dst_address_length;
-  fib = find_ip6_fib_by_table_index_or_id (im, a->table_index_or_table_id, 
-                                           a->flags);
-
-  ASSERT (dst_address_length < ARRAY_LEN (im->fib_masks));
-  ip6_address_mask (&dst_address, &im->fib_masks[dst_address_length]);
-
-  /* refcount accounting */
-  if (is_del)
-    {
-      ASSERT (im->dst_address_length_refcounts[dst_address_length] > 0);
-      if (--im->dst_address_length_refcounts[dst_address_length] == 0)
-        {
-          im->non_empty_dst_address_length_bitmap =
-            clib_bitmap_set (im->non_empty_dst_address_length_bitmap, 
-                             128 - dst_address_length, 0);
-          compute_prefix_lengths_in_search_order (im);
-        }
-    }
-  else
-    {
-      im->dst_address_length_refcounts[dst_address_length]++;
-
-      im->non_empty_dst_address_length_bitmap =
-        clib_bitmap_set (im->non_empty_dst_address_length_bitmap, 
-                             128 - dst_address_length, 1);
-      compute_prefix_lengths_in_search_order (im);
-    }
-    
-  kv.key[0] = dst_address.as_u64[0];
-  kv.key[1] = dst_address.as_u64[1];
-  kv.key[2] = ((u64)((fib - im->fibs))<<32) | dst_address_length;
-
-  if (BV(clib_bihash_search)(&im->ip6_lookup_table, &kv, &value) == 0)
-    old_adj_index = value.value;
-
-  if (is_del)
-    BV(clib_bihash_add_del) (&im->ip6_lookup_table, &kv, 0 /* is_add */);
-  else
-    {
-      /* Make sure adj index is valid. */
-      if (CLIB_DEBUG > 0)
-        (void) ip_get_adjacency (lm, adj_index);
-
-      kv.value = adj_index;
-
-      BV(clib_bihash_add_del) (&im->ip6_lookup_table, &kv, 1 /* is_add */);
-    }
-
-  /* Avoid spurious reference count increments */
-  if (old_adj_index == adj_index 
-      && adj_index != ~0
-      && !(a->flags & IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY))
-    {
-      ip_adjacency_t * adj = ip_get_adjacency (lm, adj_index);
-      if (adj->share_count > 0)
-        adj->share_count --;
-    }
-
-  /* Delete old adjacency index if present and changed. */
-  {
-    if (! (a->flags & IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY)
-       && old_adj_index != ~0
-       && old_adj_index != adj_index)
-      ip_del_adjacency (lm, old_adj_index);
-  }
-}
-
-u32
-ip6_route_get_next_hop_adj (ip6_main_t * im,
-                           u32 fib_index,
-                           ip6_address_t *next_hop,
-                           u32 next_hop_sw_if_index,
-                           u32 explicit_fib_index)
-{
-  ip_lookup_main_t * lm = &im->lookup_main;
-  vnet_main_t * vnm = vnet_get_main();
-  int is_interface_next_hop;
-  uword * nh_result;
-  u32 nh_adj_index;
-  ip6_fib_t * fib;
-
-  fib = vec_elt_at_index (im->fibs, fib_index);
-
-  is_interface_next_hop = ip6_address_is_zero (next_hop);
-
-  if (is_interface_next_hop)
-    {
-      nh_result = hash_get (im->interface_route_adj_index_by_sw_if_index,
-                           next_hop_sw_if_index);
-      if (nh_result)
-         nh_adj_index = *nh_result;
-      else
-        {
-         ip_adjacency_t * adj;
-         adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
-                                 &nh_adj_index);
-         ip6_adjacency_set_interface_route (vnm, adj,
-                                            next_hop_sw_if_index, ~0);
-         ip_call_add_del_adjacency_callbacks
-             (lm, next_hop_sw_if_index, /* is_del */ 0);
-         hash_set (im->interface_route_adj_index_by_sw_if_index,
-                   next_hop_sw_if_index, nh_adj_index);
-       }
-    }
-  else if (next_hop_sw_if_index == ~0)
-    {
-      /* next-hop is recursive. we always need a indirect adj
-       * for recursive paths. Any LPM we perform now will give
-       * us a valid adj, but without tracking the next-hop we
-       * have no way to keep it valid.
-       */
-      ip_adjacency_t add_adj;
-      memset (&add_adj, 0, sizeof(add_adj));
-      add_adj.n_adj = 1;
-      add_adj.lookup_next_index = IP_LOOKUP_NEXT_INDIRECT;
-      add_adj.indirect.next_hop.ip6.as_u64[0] = next_hop->as_u64[0];
-      add_adj.indirect.next_hop.ip6.as_u64[1] = next_hop->as_u64[1];
-      add_adj.explicit_fib_index = explicit_fib_index;
-      ip_add_adjacency (lm, &add_adj, 1, &nh_adj_index);
-    }
-  else
-    {
-      BVT(clib_bihash_kv) kv, value;
-
-      /* Look for the interface /128 route */
-      kv.key[0] = next_hop->as_u64[0];
-      kv.key[1] = next_hop->as_u64[1];
-      kv.key[2] = ((u64)((fib - im->fibs))<<32) | 128;
-after_nd:
-      if (BV(clib_bihash_search)(&im->ip6_lookup_table, &kv, &value) < 0)
-        {
-          ip_adjacency_t * adj;
-          nh_adj_index = ip6_fib_lookup_with_table (im, fib_index, next_hop);
-          adj = ip_get_adjacency (lm, nh_adj_index);
-          /* if ND interface adjacencty is present, we need to
-           install ND adjaceny for specific next hop */
-          if (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP &&
-              adj->arp.next_hop.ip6.as_u64[0] == 0 &&
-              adj->arp.next_hop.ip6.as_u64[1] == 0)
-            {
-              nh_adj_index = vnet_ip6_neighbor_glean_add(fib_index, next_hop);
-            }
-          else if (next_hop->as_u8[0] == 0xfe)
-            {
-              //Next hop is link-local. No indirect in this case.
-              //Let's add it as a possible neighbor on this interface
-              ip6_address_t null_addr= {};
-              ip6_add_del_route_next_hop (im, IP6_ROUTE_FLAG_ADD,
-                                          next_hop, 128,
-                                          &null_addr, next_hop_sw_if_index,
-                                          1, ~0, fib_index);
-              goto after_nd;
-            }
-        }
-      else
-        {
-          nh_adj_index = value.value;
-        }
-    }
-
-  return (nh_adj_index);
-}
-
-void
-ip6_add_del_route_next_hop (ip6_main_t * im,
-                           u32 flags,
-                           ip6_address_t * dst_address,
-                           u32 dst_address_length,
-                           ip6_address_t * next_hop,
-                           u32 next_hop_sw_if_index,
-                           u32 next_hop_weight, u32 adj_index,
-                            u32 explicit_fib_index)
-{
-  vnet_main_t * vnm = vnet_get_main();
-  ip_lookup_main_t * lm = &im->lookup_main;
-  u32 fib_index;
-  ip6_fib_t * fib;
-  ip6_address_t masked_dst_address;
-  u32 old_mp_adj_index, new_mp_adj_index;
-  u32 dst_adj_index, nh_adj_index;
-  int rv;
-  ip_adjacency_t * dst_adj;
-  ip_multipath_adjacency_t * old_mp, * new_mp;
-  int is_del = (flags & IP6_ROUTE_FLAG_DEL) != 0;
-  clib_error_t * error = 0;
-  BVT(clib_bihash_kv) kv, value;
-
-  vlib_smp_unsafe_warning();
-
-  if (explicit_fib_index == (u32)~0)
-    fib_index = vec_elt (im->fib_index_by_sw_if_index, next_hop_sw_if_index);
-  else
-    fib_index = explicit_fib_index;
-
-  fib = vec_elt_at_index (im->fibs, fib_index);
-
-  /* Lookup next hop to be added or deleted. */
-  if (adj_index == (u32)~0)
-    {
-      nh_adj_index = ip6_route_get_next_hop_adj(im, fib_index,
-                                               next_hop,
-                                               next_hop_sw_if_index,
-                                               explicit_fib_index);
-    }
-  else
-    {
-      /* Look for the interface /128 route */
-      kv.key[0] = next_hop->as_u64[0];
-      kv.key[1] = next_hop->as_u64[1];
-      kv.key[2] = ((u64)((fib - im->fibs))<<32) | 128;
-      
-      if (BV(clib_bihash_search)(&im->ip6_lookup_table, &kv, &value) < 0)
-        {
-          vnm->api_errno = VNET_API_ERROR_UNKNOWN_DESTINATION;
-          error = clib_error_return (0, "next-hop %U/128 not in FIB",
-                                     format_ip6_address, next_hop);
-          goto done;
-        }
-      
-      nh_adj_index = value.value;
-    }
-
-  ASSERT (dst_address_length < ARRAY_LEN (im->fib_masks));
-  masked_dst_address = dst_address[0];
-  ip6_address_mask (&masked_dst_address, &im->fib_masks[dst_address_length]);
-
-  kv.key[0] = masked_dst_address.as_u64[0];
-  kv.key[1] = masked_dst_address.as_u64[1];
-  kv.key[2] = ((u64)((fib - im->fibs))<<32) | dst_address_length;
-
-  rv = BV(clib_bihash_search)(&im->ip6_lookup_table, &kv, &value);
-
-  if (rv == 0)
-    {
-      dst_adj_index = value.value;
-      dst_adj = ip_get_adjacency (lm, dst_adj_index);
-    }
-  else
-    {
-      /* For deletes destination must be known. */
-      if (is_del)
-       {
-          vnm->api_errno = VNET_API_ERROR_UNKNOWN_DESTINATION;
-         error = clib_error_return (0, "unknown destination %U/%d",
-                                    format_ip6_address, dst_address,
-                                    dst_address_length);
-         goto done;
-       }
-
-      dst_adj_index = ~0;
-      dst_adj = 0;
-    }
-
-  /* Ignore adds of X/128 with next hop of X. */
-  if (! is_del
-      && dst_address_length == 128
-      && ip6_address_is_equal (dst_address, next_hop))
-    {
-      vnm->api_errno = VNET_API_ERROR_PREFIX_MATCHES_NEXT_HOP;
-      error = clib_error_return (0, "prefix matches next hop %U/%d",
-                                 format_ip6_address, dst_address,
-                                 dst_address_length);
-      goto done;
-    }
-
-  /* Destination is not known and default weight is set so add route
-     to existing non-multipath adjacency */
-  if (dst_adj_index == ~0 && next_hop_weight == 1 && next_hop_sw_if_index == ~0)
-  {
-    /* create / delete additional mapping of existing adjacency */
-    ip6_add_del_route_args_t a;
-
-    a.table_index_or_table_id = fib_index;
-    a.flags = ((is_del ? IP6_ROUTE_FLAG_DEL : IP6_ROUTE_FLAG_ADD)
-        | IP6_ROUTE_FLAG_FIB_INDEX
-        | IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY
-        | (flags & (IP6_ROUTE_FLAG_NO_REDISTRIBUTE
-            | IP6_ROUTE_FLAG_NOT_LAST_IN_GROUP)));
-    a.dst_address = dst_address[0];
-    a.dst_address_length = dst_address_length;
-    a.adj_index = nh_adj_index;
-    a.add_adj = 0;
-    a.n_add_adj = 0;
-
-    ip6_add_del_route (im, &a);
-    goto done;
-  }
-
-  old_mp_adj_index = dst_adj ? dst_adj->heap_handle : ~0;
-
-  if (! ip_multipath_adjacency_add_del_next_hop
-      (lm, is_del,
-       dst_adj ? dst_adj->heap_handle : ~0,
-       nh_adj_index,
-       next_hop_weight,
-       &new_mp_adj_index))
-    {
-      vnm->api_errno = VNET_API_ERROR_NEXT_HOP_NOT_FOUND_MP;
-      error = clib_error_return 
-        (0, "requested deleting next-hop %U not found in multi-path",
-         format_ip6_address, next_hop);
-      goto done;
-    }
-  
-  old_mp = new_mp = 0;
-  if (old_mp_adj_index != ~0)
-    old_mp = vec_elt_at_index (lm->multipath_adjacencies, old_mp_adj_index);
-  if (new_mp_adj_index != ~0)
-    new_mp = vec_elt_at_index (lm->multipath_adjacencies, new_mp_adj_index);
-
-  if (old_mp != new_mp)
-    {
-      ip6_add_del_route_args_t a;
-      ip_adjacency_t * adj;
-
-      a.table_index_or_table_id = fib_index;
-      a.flags = ((is_del ? IP6_ROUTE_FLAG_DEL : IP6_ROUTE_FLAG_ADD)
-                | IP6_ROUTE_FLAG_FIB_INDEX
-                | IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY
-                | (flags & IP6_ROUTE_FLAG_NO_REDISTRIBUTE));
-      a.dst_address = dst_address[0];
-      a.dst_address_length = dst_address_length;
-      a.adj_index = new_mp ? new_mp->adj_index : dst_adj_index;
-      a.add_adj = 0;
-      a.n_add_adj = 0;
-
-      ip6_add_del_route (im, &a);
-
-      adj = ip_get_adjacency (lm, new_mp ? new_mp->adj_index : dst_adj_index);
-      if (adj->n_adj == 1)
-        adj->share_count += is_del ? -1 : 1;
-    }
-
- done:
-  if (error)
-    clib_error_report (error);
-}
-
-u32
-ip6_get_route (ip6_main_t * im,
-              u32 table_index_or_table_id,
-              u32 flags,
-              ip6_address_t * address,
-              u32 address_length)
-{
-  ip6_fib_t * fib = find_ip6_fib_by_table_index_or_id (im, table_index_or_table_id, flags);
-  ip6_address_t masked_address;
-  BVT(clib_bihash_kv) kv, value;
-
-  ASSERT (address_length < ARRAY_LEN (im->fib_masks));
-  clib_memcpy (&masked_address, address, sizeof (masked_address));
-  ip6_address_mask (&masked_address, &im->fib_masks[address_length]);
-
-  kv.key[0] = masked_address.as_u64[0];
-  kv.key[1] = masked_address.as_u64[1];
-  kv.key[2] = ((u64)((fib - im->fibs))<<32) | address_length;
-
-  if (BV(clib_bihash_search)(&im->ip6_lookup_table, &kv, &value) == 0)
-    return (value.value);
-  return 0;
-}
-
-void
-ip6_foreach_matching_route (ip6_main_t * im,
-                           u32 table_index_or_table_id,
-                           u32 flags,
-                           ip6_address_t * dst_address,
-                           u32 address_length,
-                           ip6_address_t ** results,
-                           u8 ** result_lengths)
-{
-  ip6_fib_t * fib = 
-    find_ip6_fib_by_table_index_or_id (im, table_index_or_table_id, flags);
-  BVT(clib_bihash) * h = &im->ip6_lookup_table;
-  BVT(clib_bihash_value) * v;
-  clib_bihash_bucket_t * b;
-  int i, j, k;
-  
-  if (*results)
-    _vec_len (*results) = 0;
-  if (*result_lengths)
-    _vec_len (*result_lengths) = 0;
-
-  /* Walk the table looking for routes which match the supplied address */
-  for (i = 0; i < h->nbuckets; i++)
-    {
-      b = &h->buckets [i];
-      if (b->offset == 0)
-          continue;
-
-      v = BV(clib_bihash_get_value) (h, b->offset);
-      for (j = 0; j < (1<<b->log2_pages); j++)
-        {
-          for (k = 0; k < BIHASH_KVP_PER_PAGE; k++)
-            {
-              if (BV(clib_bihash_is_free)(&v->kvp[k]))
-                continue;
-              
-              if ((v->kvp[k].key[2] 
-                   == (((u64)((fib - im->fibs))<<32) | address_length))
-                  && ip6_destination_matches_route 
-                  (im, dst_address, (ip6_address_t *) &v->kvp[k], 
-                   address_length))
-                {
-                  ip6_address_t * a;
-
-                  a = (ip6_address_t *)(&v->kvp[k]);
-
-                  vec_add1 (*results, a[0]);
-                  vec_add1 (*result_lengths, address_length);
-                }
-            }
-          v++;
-        }
-    }
-}
-
-void ip6_maybe_remap_adjacencies (ip6_main_t * im,
-                                 u32 table_index_or_table_id,
-                                 u32 flags)
-{
-#if SOONE
-  ip6_fib_t * fib 
-    = find_ip6_fib_by_table_index_or_id (im, table_index_or_table_id, flags);
-#endif
-  ip_lookup_main_t * lm = &im->lookup_main;
-
-  if (lm->n_adjacency_remaps == 0)
-    return;
-
-  clib_warning ("unimplemented, please report to vpp-dev@cisco.com");
-
-  /* All remaps have been performed. */
-  lm->n_adjacency_remaps = 0;
-}
-
-void ip6_delete_matching_routes (ip6_main_t * im,
-                                u32 table_index_or_table_id,
-                                u32 flags,
-                                ip6_address_t * address,
-                                u32 address_length)
-{
-  /* $$$$ static may be OK - this should happen only on thread 0 */
-  static ip6_address_t * matching_addresses;
-  static u8 * matching_address_lengths;
-  u32 l, i;
-  ip6_add_del_route_args_t a;
-
-  vlib_smp_unsafe_warning();
-
-  a.flags = IP6_ROUTE_FLAG_DEL | IP6_ROUTE_FLAG_NO_REDISTRIBUTE | flags;
-  a.table_index_or_table_id = table_index_or_table_id;
-  a.adj_index = ~0;
-  a.add_adj = 0;
-  a.n_add_adj = 0;
-
-  for (l = address_length + 1; l <= 128; l++)
-    {
-      ip6_foreach_matching_route (im, table_index_or_table_id, flags,
-                                 address,
-                                 l,
-                                 &matching_addresses,
-                                 &matching_address_lengths);
-      for (i = 0; i < vec_len (matching_addresses); i++)
-       {
-         a.dst_address = matching_addresses[i];
-         a.dst_address_length = matching_address_lengths[i];
-         ip6_add_del_route (im, &a);
-       }
-    }
-
-  ip6_maybe_remap_adjacencies (im, table_index_or_table_id, flags);
-}
-
 void
 ip6_forward_next_trace (vlib_main_t * vm,
                         vlib_node_runtime_t * node,
@@ -713,12 +58,10 @@ ip6_forward_next_trace (vlib_main_t * vm,
 always_inline uword
 ip6_lookup_inline (vlib_main_t * vm,
                   vlib_node_runtime_t * node,
-                  vlib_frame_t * frame,
-                  int is_indirect)
+                  vlib_frame_t * frame)
 {
   ip6_main_t * im = &ip6_main;
-  ip_lookup_main_t * lm = &im->lookup_main;
-  vlib_combined_counter_main_t * cm = &im->lookup_main.adjacency_counters;
+  vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters;
   u32 n_left_from, n_left_to_next, * from, * to_next;
   ip_lookup_next_t next;
   u32 cpu_index = os_get_cpu_number();
@@ -735,13 +78,14 @@ ip6_lookup_inline (vlib_main_t * vm,
       while (n_left_from >= 4 && n_left_to_next >= 2)
        {
          vlib_buffer_t * p0, * p1;
-         u32 pi0, pi1, adj_index0, adj_index1, wrong_next;
+         u32 pi0, pi1, lbi0, lbi1, wrong_next;
          ip_lookup_next_t next0, next1;
          ip6_header_t * ip0, * ip1;
-         ip_adjacency_t * adj0, * adj1;
          ip6_address_t * dst_addr0, * dst_addr1;
           u32 fib_index0, fib_index1;
           u32 flow_hash_config0, flow_hash_config1;
+         const dpo_id_t *dpo0, *dpo1;
+         const load_balance_t *lb0, *lb1;
 
          /* Prefetch next iteration. */
          {
@@ -765,19 +109,8 @@ ip6_lookup_inline (vlib_main_t * vm,
          ip0 = vlib_buffer_get_current (p0);
          ip1 = vlib_buffer_get_current (p1);
 
-         if (PREDICT_FALSE(is_indirect))
-           {
-             ip_adjacency_t * iadj0, * iadj1;
-             iadj0 = ip_get_adjacency (lm, vnet_buffer(p0)->ip.adj_index[VLIB_TX]);
-             iadj1 = ip_get_adjacency (lm, vnet_buffer(p1)->ip.adj_index[VLIB_TX]);
-             dst_addr0 = &iadj0->indirect.next_hop.ip6;
-             dst_addr1 = &iadj1->indirect.next_hop.ip6;
-           }
-         else
-           {
-             dst_addr0 = &ip0->dst_address;
-             dst_addr1 = &ip1->dst_address;
-           }
+         dst_addr0 = &ip0->dst_address;
+         dst_addr1 = &ip1->dst_address;
 
          fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
          fib_index1 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p1)->sw_if_index[VLIB_RX]);
@@ -787,69 +120,60 @@ ip6_lookup_inline (vlib_main_t * vm,
           fib_index1 = (vnet_buffer(p1)->sw_if_index[VLIB_TX] == (u32)~0) ?
             fib_index1 : vnet_buffer(p1)->sw_if_index[VLIB_TX];
 
-         adj_index0 = ip6_fib_lookup_with_table (im, fib_index0, dst_addr0);
-         adj_index1 = ip6_fib_lookup_with_table (im, fib_index1, dst_addr1);
+         lbi0 = ip6_fib_table_fwding_lookup (im, fib_index0, dst_addr0);
+         lbi1 = ip6_fib_table_fwding_lookup (im, fib_index1, dst_addr1);
 
-         adj0 = ip_get_adjacency (lm, adj_index0);
-         adj1 = ip_get_adjacency (lm, adj_index1);
-
-          if (PREDICT_FALSE (adj0->explicit_fib_index != ~0))
-            {
-              adj_index0 = ip6_fib_lookup_with_table 
-                (im, adj0->explicit_fib_index, dst_addr0);
-              adj0 = ip_get_adjacency (lm, adj_index0);
-            }
-          if (PREDICT_FALSE (adj1->explicit_fib_index != ~0))
-            {
-              adj_index1 = ip6_fib_lookup_with_table 
-                (im, adj1->explicit_fib_index, dst_addr1);
-              adj1 = ip_get_adjacency (lm, adj_index1);
-            }
-
-         next0 = adj0->lookup_next_index;
-         next1 = adj1->lookup_next_index;
-
-         /* Only process the HBH Option Header if explicitly configured to do so */
-          next0 = (ip0->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS) && im->hbh_enabled &&
-           adj_index0 ? (ip_lookup_next_t) IP6_LOOKUP_NEXT_HOP_BY_HOP : adj0->lookup_next_index;
-          next1 = (ip1->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS) && im->hbh_enabled &&
-           adj_index1 ? (ip_lookup_next_t) IP6_LOOKUP_NEXT_HOP_BY_HOP : adj1->lookup_next_index;
+         lb0 = load_balance_get (lbi0);
+         lb1 = load_balance_get (lbi1);
 
           vnet_buffer (p0)->ip.flow_hash = 
             vnet_buffer(p1)->ip.flow_hash = 0;
 
-          if (PREDICT_FALSE(adj0->n_adj > 1))
+          if (PREDICT_FALSE(lb0->lb_n_buckets > 1))
             {
-              flow_hash_config0 = 
-                vec_elt_at_index (im->fibs,fib_index0)->flow_hash_config;
+              flow_hash_config0 = lb0->lb_hash_config;
               vnet_buffer (p0)->ip.flow_hash = 
                 ip6_compute_flow_hash (ip0, flow_hash_config0);
             }
-
-          if (PREDICT_FALSE(adj1->n_adj > 1))
+          if (PREDICT_FALSE(lb1->lb_n_buckets > 1))
             {
-              flow_hash_config1 = 
-                vec_elt_at_index (im->fibs,fib_index0)->flow_hash_config;
-
+              flow_hash_config1 = lb1->lb_hash_config;
               vnet_buffer (p1)->ip.flow_hash = 
                 ip6_compute_flow_hash (ip1, flow_hash_config1);
             }
 
-         ASSERT (adj0->n_adj > 0);
-         ASSERT (adj1->n_adj > 0);
-         ASSERT (is_pow2 (adj0->n_adj));
-         ASSERT (is_pow2 (adj1->n_adj));
-         adj_index0 += (vnet_buffer (p0)->ip.flow_hash & (adj0->n_adj - 1));
-         adj_index1 += (vnet_buffer (p1)->ip.flow_hash & (adj1->n_adj - 1));
+         ASSERT (lb0->lb_n_buckets > 0);
+         ASSERT (lb1->lb_n_buckets > 0);
+         ASSERT (is_pow2 (lb0->lb_n_buckets));
+         ASSERT (is_pow2 (lb1->lb_n_buckets));
+         dpo0 = load_balance_get_bucket_i(lb0,
+                                           (vnet_buffer (p0)->ip.flow_hash &
+                                            lb0->lb_n_buckets_minus_1));
+         dpo1 = load_balance_get_bucket_i(lb1,
+                                           (vnet_buffer (p1)->ip.flow_hash &
+                                            lb1->lb_n_buckets_minus_1));
 
-         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
-         vnet_buffer (p1)->ip.adj_index[VLIB_TX] = adj_index1;
+         next0 = dpo0->dpoi_next_node;
+         next1 = dpo1->dpoi_next_node;
+
+         /* Only process the HBH Option Header if explicitly configured to do so */
+          next0 = ((ip0->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS) &&
+                  im->hbh_enabled) ?
+           (ip_lookup_next_t) IP6_LOOKUP_NEXT_HOP_BY_HOP :
+           next0;
+          next1 = ((ip1->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS) &&
+                  im->hbh_enabled) ?
+           (ip_lookup_next_t) IP6_LOOKUP_NEXT_HOP_BY_HOP :
+           next1;
+
+         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+         vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
 
          vlib_increment_combined_counter 
-              (cm, cpu_index, adj_index0, 1,
+              (cm, cpu_index, lbi0, 1,
                vlib_buffer_length_in_chain (vm, p0));
          vlib_increment_combined_counter 
-              (cm, cpu_index, adj_index1, 1,
+              (cm, cpu_index, lbi1, 1,
                vlib_buffer_length_in_chain (vm, p1));
 
          from += 2;
@@ -898,11 +222,12 @@ ip6_lookup_inline (vlib_main_t * vm,
        {
          vlib_buffer_t * p0;
          ip6_header_t * ip0;
-         u32 pi0, adj_index0;
+         u32 pi0, lbi0;
          ip_lookup_next_t next0;
-         ip_adjacency_t * adj0;
+         const load_balance_t * lb0;
          ip6_address_t * dst_addr0;
           u32 fib_index0, flow_hash_config0;
+         const dpo_id_t *dpo0;
 
          pi0 = from[0];
          to_next[0] = pi0;
@@ -911,57 +236,44 @@ ip6_lookup_inline (vlib_main_t * vm,
 
          ip0 = vlib_buffer_get_current (p0);
 
-         if (PREDICT_FALSE(is_indirect))
-           {
-             ip_adjacency_t * iadj0;
-             iadj0 = ip_get_adjacency (lm, vnet_buffer(p0)->ip.adj_index[VLIB_TX]);
-             dst_addr0 = &iadj0->indirect.next_hop.ip6;
-           }
-         else
-           {
-             dst_addr0 = &ip0->dst_address;
-           }
+         dst_addr0 = &ip0->dst_address;
 
          fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
           fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
             fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
 
           flow_hash_config0 = 
-              vec_elt_at_index (im->fibs,fib_index0)->flow_hash_config;
+              ip6_fib_get (fib_index0)->flow_hash_config;
 
-         adj_index0 = ip6_fib_lookup_with_table (im, fib_index0, dst_addr0);
+         lbi0 = ip6_fib_table_fwding_lookup (im, fib_index0, dst_addr0);
 
-         adj0 = ip_get_adjacency (lm, adj_index0);
-
-          if (PREDICT_FALSE (adj0->explicit_fib_index != ~0))
-            {
-              adj_index0 = ip6_fib_lookup_with_table
-                (im, adj0->explicit_fib_index, dst_addr0);
-              adj0 = ip_get_adjacency (lm, adj_index0);
-            }
-
-         /* Only process the HBH Option Header if explicitly configured to do so */
-          next0 = (ip0->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS) && im->hbh_enabled &&
-           adj_index0 ? (ip_lookup_next_t) IP6_LOOKUP_NEXT_HOP_BY_HOP : adj0->lookup_next_index;
+         lb0 = load_balance_get (lbi0);
 
           vnet_buffer (p0)->ip.flow_hash = 0;
 
-          if (PREDICT_FALSE(adj0->n_adj > 1))
+          if (PREDICT_FALSE(lb0->lb_n_buckets > 1))
             {
-              flow_hash_config0 = 
-                vec_elt_at_index (im->fibs,fib_index0)->flow_hash_config;
+              flow_hash_config0 = lb0->lb_hash_config;
               vnet_buffer (p0)->ip.flow_hash = 
                 ip6_compute_flow_hash (ip0, flow_hash_config0);
             }
 
-         ASSERT (adj0->n_adj > 0);
-         ASSERT (is_pow2 (adj0->n_adj));
-         adj_index0 += (vnet_buffer (p0)->ip.flow_hash & (adj0->n_adj - 1));
+         ASSERT (lb0->lb_n_buckets > 0);
+         ASSERT (is_pow2 (lb0->lb_n_buckets));
+         dpo0 = load_balance_get_bucket_i(lb0,
+                                           (vnet_buffer (p0)->ip.flow_hash &
+                                            lb0->lb_n_buckets_minus_1));
+         next0 = dpo0->dpoi_next_node;
+         /* Only process the HBH Option Header if explicitly configured to do so */
+          next0 = ((ip0->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS) &&
+                  im->hbh_enabled) ?
+           (ip_lookup_next_t) IP6_LOOKUP_NEXT_HOP_BY_HOP :
+           next0;
 
-         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
+         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
 
          vlib_increment_combined_counter 
-              (cm, cpu_index, adj_index0, 1,
+              (cm, cpu_index, lbi0, 1,
                vlib_buffer_length_in_chain (vm, p0));
 
          from += 1;
@@ -986,163 +298,171 @@ ip6_lookup_inline (vlib_main_t * vm,
     }
 
   if (node->flags & VLIB_NODE_FLAG_TRACE)
-      ip6_forward_next_trace(vm, node, frame, VLIB_TX);
+    ip6_forward_next_trace(vm, node, frame, VLIB_TX);
 
   return frame->n_vectors;
 }
 
-void ip6_adjacency_set_interface_route (vnet_main_t * vnm,
-                                       ip_adjacency_t * adj,
-                                       u32 sw_if_index,
-                                       u32 if_address_index)
-{
-  vnet_hw_interface_t * hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
-  ip_lookup_next_t n;
-  u32 node_index;
-
-  if (hw->hw_class_index == ethernet_hw_interface_class.index
-      || hw->hw_class_index == srp_hw_interface_class.index)
-    {
-      n = IP_LOOKUP_NEXT_ARP;
-      node_index = ip6_discover_neighbor_node.index;
-      adj->if_address_index = if_address_index;
-      adj->arp.next_hop.ip6.as_u64[0] = 0;
-      adj->arp.next_hop.ip6.as_u64[1] = 0;
-  }
-  else
-    {
-      n = IP_LOOKUP_NEXT_REWRITE;
-      node_index = ip6_rewrite_node.index;
-    }
-
- adj->lookup_next_index = n;
- adj->explicit_fib_index = ~0;
-
- vnet_rewrite_for_sw_interface
-   (vnm,
-    VNET_L3_PACKET_TYPE_IP6,
-    sw_if_index,
-    node_index,
-    VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST,
-    &adj->rewrite_header,
-    sizeof (adj->rewrite_data));
-}
-
 static void
 ip6_add_interface_routes (vnet_main_t * vnm, u32 sw_if_index,
                          ip6_main_t * im, u32 fib_index,
                          ip_interface_address_t * a)
 {
   ip_lookup_main_t * lm = &im->lookup_main;
-  ip_adjacency_t * adj;
   ip6_address_t * address = ip_interface_address_get_address (lm, a);
-  ip6_add_del_route_args_t x;
-  vnet_hw_interface_t * hw_if = vnet_get_sup_hw_interface (vnm, sw_if_index);
-  u32 classify_table_index;
-
-  /* Add e.g. 1.0.0.0/8 as interface route (arp for Ethernet). */
-  x.table_index_or_table_id = fib_index;
-  x.flags = (IP6_ROUTE_FLAG_ADD
-            | IP6_ROUTE_FLAG_FIB_INDEX
-            | IP6_ROUTE_FLAG_NO_REDISTRIBUTE);
-  x.dst_address = address[0];
-  x.dst_address_length = a->address_length;
-  x.n_add_adj = 0;
-  x.add_adj = 0;
+  fib_prefix_t pfx = {
+      .fp_len = a->address_length,
+      .fp_proto = FIB_PROTOCOL_IP6,
+      .fp_addr.ip6 = *address,
+  };
 
   a->neighbor_probe_adj_index = ~0;
   if (a->address_length < 128)
-    {
-      adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
-                             &x.adj_index);
-      ip6_adjacency_set_interface_route (vnm, adj, sw_if_index, a - lm->if_address_pool);
-      ip_call_add_del_adjacency_callbacks (lm, x.adj_index, /* is_del */ 0);
-      ip6_add_del_route (im, &x);
-      a->neighbor_probe_adj_index = x.adj_index;
-    }
-
-  /* Add e.g. ::1/128 as local to this host. */
-  adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
-                         &x.adj_index);
+  {
+      fib_node_index_t fei;
+
+      fei = fib_table_entry_update_one_path(fib_index,
+                                           &pfx,
+                                           FIB_SOURCE_INTERFACE,
+                                           (FIB_ENTRY_FLAG_CONNECTED |
+                                            FIB_ENTRY_FLAG_ATTACHED),
+                                           FIB_PROTOCOL_IP6,
+                                           NULL, /* No next-hop address */
+                                           sw_if_index,
+                                           ~0, // invalid FIB index
+                                           1,
+                                           MPLS_LABEL_INVALID,
+                                           FIB_ROUTE_PATH_FLAG_NONE);
+      a->neighbor_probe_adj_index = fib_entry_get_adj(fei);
+  }
 
-  classify_table_index = ~0;
+  pfx.fp_len = 128;
   if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index))
-    classify_table_index = lm->classify_table_index_by_sw_if_index [sw_if_index];
-  if (classify_table_index != (u32) ~0)
+  {
+      u32 classify_table_index =
+         lm->classify_table_index_by_sw_if_index [sw_if_index];
+      if (classify_table_index != (u32) ~0)
+      {
+          dpo_id_t dpo = DPO_NULL;
+
+          dpo_set(&dpo,
+                  DPO_CLASSIFY,
+                  DPO_PROTO_IP4,
+                  classify_dpo_create(FIB_PROTOCOL_IP6,
+                                      classify_table_index));
+
+         fib_table_entry_special_dpo_add(fib_index,
+                                          &pfx,
+                                          FIB_SOURCE_CLASSIFY,
+                                          FIB_ENTRY_FLAG_NONE,
+                                          &dpo);
+          dpo_reset(&dpo);
+      }
+  }
+
+  fib_table_entry_update_one_path(fib_index,
+                                 &pfx,
+                                 FIB_SOURCE_INTERFACE,
+                                 (FIB_ENTRY_FLAG_CONNECTED |
+                                  FIB_ENTRY_FLAG_LOCAL),
+                                 FIB_PROTOCOL_IP6,
+                                 &pfx.fp_addr,
+                                 sw_if_index,
+                                 ~0, // invalid FIB index
+                                 1,
+                                 MPLS_LABEL_INVALID,
+                                 FIB_ROUTE_PATH_FLAG_NONE);
+}
+
+static void
+ip6_del_interface_routes (ip6_main_t * im,
+                         u32 fib_index,
+                         ip6_address_t * address,
+                         u32 address_length)
+{
+    fib_prefix_t pfx = {
+       .fp_len = address_length,
+       .fp_proto = FIB_PROTOCOL_IP6,
+       .fp_addr.ip6 = *address,
+    };
+
+    if (pfx.fp_len < 128)
     {
-      adj->lookup_next_index = IP_LOOKUP_NEXT_CLASSIFY;
-      adj->classify.table_index = classify_table_index;
+       fib_table_entry_delete(fib_index,
+                              &pfx,
+                              FIB_SOURCE_INTERFACE);
+
     }
-  else
-    adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
-  
-  adj->if_address_index = a - lm->if_address_pool;
-  adj->rewrite_header.sw_if_index = sw_if_index;
-  adj->rewrite_header.max_l3_packet_bytes = hw_if->max_l3_packet_bytes[VLIB_RX];
-  adj->rewrite_header.data_bytes = 0;
-  ip_call_add_del_adjacency_callbacks (lm, x.adj_index, /* is_del */ 0);
-  x.dst_address_length = 128;
-  ip6_add_del_route (im, &x);
+
+    pfx.fp_len = 128;
+    fib_table_entry_delete(fib_index,
+                          &pfx,
+                          FIB_SOURCE_INTERFACE);
 }
 
-static void
-ip6_del_interface_routes (ip6_main_t * im, u32 fib_index,
-                         ip6_address_t * address, u32 address_length)
+void
+ip6_sw_interface_enable_disable (u32 sw_if_index,
+                                u32 is_enable)
 {
-  ip6_add_del_route_args_t x;
-
-  /* Add e.g. 1.0.0.0/8 as interface route (arp for Ethernet). */
-  x.table_index_or_table_id = fib_index;
-  x.flags = (IP6_ROUTE_FLAG_DEL
-            | IP6_ROUTE_FLAG_FIB_INDEX
-            | IP6_ROUTE_FLAG_NO_REDISTRIBUTE);
-  x.dst_address = address[0];
-  x.dst_address_length = address_length;
-  x.adj_index = ~0;
-  x.n_add_adj = 0;
-  x.add_adj = 0;
-
-  if (address_length < 128)
+  vlib_main_t * vm = vlib_get_main();
+  ip6_main_t * im = &ip6_main;
+  ip_lookup_main_t * lm = &im->lookup_main;
+  u32 ci, cast;
+  u32 lookup_feature_index;
+
+  vec_validate_init_empty (im->ip_enabled_by_sw_if_index, sw_if_index, 0);
+
+  /*
+   * enable/disable only on the 1<->0 transition
+   */
+  if (is_enable)
     {
-      /* Don't wipe out fe80::0/64 */
-      if (address_length != 64 || 
-          address[0].as_u64[0] != clib_net_to_host_u64(0xfe80000000000000ULL))
-        ip6_add_del_route (im, &x);
+      if (1 != ++im->ip_enabled_by_sw_if_index[sw_if_index])
+        return;
+    }
+  else
+    {
+      ASSERT(im->ip_enabled_by_sw_if_index[sw_if_index] > 0);
+      if (0 != --im->ip_enabled_by_sw_if_index[sw_if_index])
+        return;
     }
 
-  x.dst_address_length = 128;
-  ip6_add_del_route (im, &x);
+  for (cast = 0; cast <= VNET_IP_RX_MULTICAST_FEAT; cast++)
+    {
+      ip_config_main_t * cm = &lm->feature_config_mains[cast];
+      vnet_config_main_t * vcm = &cm->config_main;
 
-  ip6_delete_matching_routes (im,
-                             fib_index,
-                             IP6_ROUTE_FLAG_FIB_INDEX,
-                             address,
-                             address_length);
-}
+      vec_validate_init_empty (cm->config_index_by_sw_if_index, sw_if_index, ~0);
+      ci = cm->config_index_by_sw_if_index[sw_if_index];
 
-typedef struct {
-    u32 sw_if_index;
-    ip6_address_t address;
-    u32 length;
-} ip6_interface_address_t;
+      if (cast == VNET_IP_RX_UNICAST_FEAT)
+       lookup_feature_index = im->ip6_unicast_rx_feature_lookup;
+      else
+       lookup_feature_index = im->ip6_multicast_rx_feature_lookup;
 
-static clib_error_t *
-ip6_add_del_interface_address_internal (vlib_main_t * vm,
-                                       u32 sw_if_index,
-                                       ip6_address_t * new_address,
-                                       u32 new_length,
-                                       u32 redistribute,
-                                       u32 insert_routes,
-                                       u32 is_del);
+      if (is_enable)
+       ci = vnet_config_add_feature (vm, vcm,
+                                     ci,
+                                     lookup_feature_index,
+                                     /* config data */ 0,
+                                     /* # bytes of config data */ 0);
+      else
+       ci = vnet_config_del_feature (vm, vcm,
+                                     ci,
+                                     lookup_feature_index,
+                                     /* config data */ 0,
+                                     /* # bytes of config data */ 0);
 
-static clib_error_t *
-ip6_add_del_interface_address_internal (vlib_main_t * vm,
-                                       u32 sw_if_index,
-                                       ip6_address_t * address,
-                                       u32 address_length,
-                                       u32 redistribute,
-                                       u32 insert_routes,
-                                       u32 is_del)
+      cm->config_index_by_sw_if_index[sw_if_index] = ci;
+    }
+}
+
+clib_error_t *
+ip6_add_del_interface_address (vlib_main_t * vm,
+                              u32 sw_if_index,
+                              ip6_address_t * address,
+                              u32 address_length,
+                              u32 is_del)
 {
   vnet_main_t * vnm = vnet_get_main();
   ip6_main_t * im = &ip6_main;
@@ -1174,17 +494,13 @@ ip6_add_del_interface_address_internal (vlib_main_t * vm,
       goto done;
   }
 
-  if (vnet_sw_interface_is_admin_up (vnm, sw_if_index) && insert_routes)
-    {
-      if (is_del)
-       ip6_del_interface_routes (im, ip6_af.fib_index, address,
-                                 address_length);
-
-      else
-       ip6_add_interface_routes (vnm, sw_if_index,
-                                 im, ip6_af.fib_index,
-                                 pool_elt_at_index (lm->if_address_pool, if_address_index));
-    }
+  if (is_del)
+      ip6_del_interface_routes (im, ip6_af.fib_index, address,
+                               address_length);
+  else
+      ip6_add_interface_routes (vnm, sw_if_index,
+                               im, ip6_af.fib_index,
+                               pool_elt_at_index (lm->if_address_pool, if_address_index));
 
   {
     ip6_add_del_interface_address_callback_t * cb;
@@ -1200,18 +516,6 @@ ip6_add_del_interface_address_internal (vlib_main_t * vm,
   return error;
 }
 
-clib_error_t *
-ip6_add_del_interface_address (vlib_main_t * vm, u32 sw_if_index,
-                              ip6_address_t * address, u32 address_length,
-                              u32 is_del)
-{
-  return ip6_add_del_interface_address_internal
-    (vm, sw_if_index, address, address_length,
-     /* redistribute */ 1,
-     /* insert_routes */ 1,
-     is_del);
-}
-
 clib_error_t *
 ip6_sw_interface_admin_up_down (vnet_main_t * vnm,
                                u32 sw_if_index,
@@ -1282,10 +586,16 @@ VNET_IP6_UNICAST_FEATURE_INIT (ip6_vpath, static) = {
 
 VNET_IP6_UNICAST_FEATURE_INIT (ip6_lookup, static) = {
   .node_name = "ip6-lookup",
-  .runs_before = 0, /* not before any other features */
+  .runs_before = ORDER_CONSTRAINTS {"ip6-drop", 0},
   .feature_index = &ip6_main.ip6_unicast_rx_feature_lookup,
 };
 
+VNET_IP6_UNICAST_FEATURE_INIT (ip6_drop, static) = {
+  .node_name = "ip6-drop",
+  .runs_before = 0,  /*last feature*/
+  .feature_index = &ip6_main.ip6_unicast_rx_feature_drop,
+};
+
 /* Built-in ip6 multicast rx feature path definition (none now) */
 VNET_IP6_MULTICAST_FEATURE_INIT (ip6_vpath_mc, static) = {
   .node_name = "vpath-input-ip6",
@@ -1295,10 +605,16 @@ VNET_IP6_MULTICAST_FEATURE_INIT (ip6_vpath_mc, static) = {
 
 VNET_IP6_MULTICAST_FEATURE_INIT (ip6_lookup, static) = {
   .node_name = "ip6-lookup",
-  .runs_before = 0, /* not before any other features */
+  .runs_before = ORDER_CONSTRAINTS {"ip6-drop", 0},
   .feature_index = &ip6_main.ip6_multicast_rx_feature_lookup,
 };
 
+VNET_IP6_MULTICAST_FEATURE_INIT (ip6_drop_mc, static) = {
+  .node_name = "ip6-drop",
+  .runs_before = 0, /* last feature */
+  .feature_index = &ip6_main.ip6_multicast_rx_feature_drop,
+};
+
 static char * rx_feature_start_nodes[] = 
   {"ip6-input"};
 
@@ -1343,7 +659,7 @@ ip6_feature_init (vlib_main_t * vm, ip6_main_t * im)
                                          feature_start_nodes,
                                          feature_start_len,
                                          cast,
-                                         0 /* is_ip4 */)))
+                                         VNET_L3_PACKET_TYPE_IP6)))
         return error;
     }
   return 0;
@@ -1369,9 +685,9 @@ ip6_sw_interface_add_del (vnet_main_t * vnm,
       ci = cm->config_index_by_sw_if_index[sw_if_index];
 
       if (cast == VNET_IP_RX_UNICAST_FEAT)
-        feature_index = im->ip6_unicast_rx_feature_lookup;
+        feature_index = im->ip6_unicast_rx_feature_drop;
       else if (cast == VNET_IP_RX_MULTICAST_FEAT)
-        feature_index = im->ip6_multicast_rx_feature_lookup;
+        feature_index = im->ip6_multicast_rx_feature_drop;
       else 
         feature_index = im->ip6_tx_feature_interface_output;
 
@@ -1382,12 +698,14 @@ ip6_sw_interface_add_del (vnet_main_t * vnm,
                                      /* config data */ 0,
                                      /* # bytes of config data */ 0);
       else
-       ci = vnet_config_del_feature (vm, vcm,
-                                     ci,
-                                      feature_index,
-                                     /* config data */ 0,
-                                     /* # bytes of config data */ 0);
-
+        {
+          ci = vnet_config_del_feature (vm, vcm, ci,
+                                        feature_index,
+                                        /* config data */ 0,
+                                        /* # bytes of config data */ 0);
+          if (vec_len(im->ip_enabled_by_sw_if_index) > sw_if_index)
+              im->ip_enabled_by_sw_if_index[sw_if_index] = 0;
+        }
       cm->config_index_by_sw_if_index[sw_if_index] = ci;
       /* 
        * note: do not update the tx feature count here.
@@ -1403,7 +721,7 @@ ip6_lookup (vlib_main_t * vm,
            vlib_node_runtime_t * node,
            vlib_frame_t * frame)
 {
-  return ip6_lookup_inline (vm, node, frame, /* is_indirect */ 0);
+  return ip6_lookup_inline (vm, node, frame);
 }
 
 static u8 * format_ip6_lookup_trace (u8 * s, va_list * args);
@@ -1419,27 +737,97 @@ VLIB_REGISTER_NODE (ip6_lookup_node) = {
   .next_nodes = IP6_LOOKUP_NEXT_NODES,
 };
 
-VLIB_NODE_FUNCTION_MULTIARCH (ip6_lookup_node, ip6_lookup);
+VLIB_NODE_FUNCTION_MULTIARCH (ip6_lookup_node, ip6_lookup)
 
-static uword
-ip6_indirect (vlib_main_t * vm,
-             vlib_node_runtime_t * node,
-             vlib_frame_t * frame)
+always_inline uword
+ip6_load_balance (vlib_main_t * vm,
+                 vlib_node_runtime_t * node,
+                 vlib_frame_t * frame)
 {
-  return ip6_lookup_inline (vm, node, frame, /* is_indirect */ 1);
-}
+  vlib_combined_counter_main_t * cm = &load_balance_main.lbm_via_counters;
+  u32 n_left_from, n_left_to_next, * from, * to_next;
+  ip_lookup_next_t next;
+  u32 cpu_index = os_get_cpu_number();
+
+  from = vlib_frame_vector_args (frame);
+  n_left_from = frame->n_vectors;
+  next = node->cached_next_index;
+
+  if (node->flags & VLIB_NODE_FLAG_TRACE)
+      ip6_forward_next_trace(vm, node, frame, VLIB_TX);
+
+  while (n_left_from > 0)
+    {
+      vlib_get_next_frame (vm, node, next,
+                          to_next, n_left_to_next);
+
+    
+      while (n_left_from > 0 && n_left_to_next > 0)
+       {
+         ip_lookup_next_t next0;
+         const load_balance_t *lb0;
+         vlib_buffer_t * p0;
+         u32 pi0, lbi0, hc0;
+         const ip6_header_t *ip0;
+         const dpo_id_t *dpo0;
 
+         pi0 = from[0];
+         to_next[0] = pi0;
+
+         p0 = vlib_get_buffer (vm, pi0);
+
+         ip0 = vlib_buffer_get_current (p0);
+         lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
+
+         lb0 = load_balance_get(lbi0);
+         hc0 = lb0->lb_hash_config;
+         vnet_buffer(p0)->ip.flow_hash = ip6_compute_flow_hash(ip0, hc0);
+
+         dpo0 = load_balance_get_bucket_i(lb0, 
+                                          vnet_buffer(p0)->ip.flow_hash &
+                                          (lb0->lb_n_buckets - 1));
 
-VLIB_REGISTER_NODE (ip6_indirect_node) = {
-  .function = ip6_indirect,
-  .name = "ip6-indirect",
+         next0 = dpo0->dpoi_next_node;
+         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+
+         vlib_increment_combined_counter 
+              (cm, cpu_index, lbi0, 1,
+               vlib_buffer_length_in_chain (vm, p0));
+
+         from += 1;
+         to_next += 1;
+         n_left_to_next -= 1;
+         n_left_from -= 1;
+
+         if (PREDICT_FALSE (next0 != next))
+           {
+             n_left_to_next += 1;
+             vlib_put_next_frame (vm, node, next, n_left_to_next);
+             next = next0;
+             vlib_get_next_frame (vm, node, next,
+                                  to_next, n_left_to_next);
+             to_next[0] = pi0;
+             to_next += 1;
+             n_left_to_next -= 1;
+           }
+       }
+
+      vlib_put_next_frame (vm, node, next, n_left_to_next);
+    }
+
+  return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ip6_load_balance_node) = {
+  .function = ip6_load_balance,
+  .name = "ip6-load-balance",
   .vector_size = sizeof (u32),
   .sibling_of = "ip6-lookup",
   .format_trace = format_ip6_lookup_trace,
   .n_next_nodes = 0,
 };
 
-VLIB_NODE_FUNCTION_MULTIARCH (ip6_indirect_node, ip6_indirect);
+VLIB_NODE_FUNCTION_MULTIARCH (ip6_load_balance_node, ip6_load_balance)
 
 typedef struct {
   /* Adjacency taken. */
@@ -1469,13 +857,10 @@ static u8 * format_ip6_lookup_trace (u8 * s, va_list * args)
   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
   ip6_forward_next_trace_t * t = va_arg (*args, ip6_forward_next_trace_t *);
-  vnet_main_t * vnm = vnet_get_main();
-  ip6_main_t * im = &ip6_main;
   uword indent = format_get_indent (s);
 
-  s = format (s, "fib %d adj-idx %d : %U flow hash: 0x%08x",
-              t->fib_index, t->adj_index, format_ip_adjacency,
-              vnm, &im->lookup_main, t->adj_index, t->flow_hash);
+  s = format (s, "fib %d dpo-idx %d : flow hash: 0x%08x",
+              t->fib_index, t->adj_index, t->flow_hash);
   s = format(s, "\n%U%U",
              format_white_space, indent,
              format_ip6_header, t->packet_data);
@@ -1489,16 +874,16 @@ static u8 * format_ip6_rewrite_trace (u8 * s, va_list * args)
   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
   ip6_forward_next_trace_t * t = va_arg (*args, ip6_forward_next_trace_t *);
   vnet_main_t * vnm = vnet_get_main();
-  ip6_main_t * im = &ip6_main;
   uword indent = format_get_indent (s);
 
   s = format (s, "tx_sw_if_index %d adj-idx %d : %U flow hash: 0x%08x",
               t->fib_index, t->adj_index, format_ip_adjacency,
-              vnm, &im->lookup_main, t->adj_index, t->flow_hash);
+              vnm, t->adj_index, FORMAT_IP_ADJACENCY_NONE,
+             t->flow_hash);
   s = format (s, "\n%U%U",
               format_white_space, indent,
               format_ip_adjacency_packet_data,
-              vnm, &im->lookup_main, t->adj_index,
+              vnm, t->adj_index,
               t->packet_data, sizeof (t->packet_data));
   return s;
 }
@@ -1628,12 +1013,6 @@ ip6_punt (vlib_main_t * vm,
          vlib_frame_t * frame)
 { return ip6_drop_or_punt (vm, node, frame, IP6_ERROR_ADJACENCY_PUNT); }
 
-static uword
-ip6_miss (vlib_main_t * vm,
-         vlib_node_runtime_t * node,
-         vlib_frame_t * frame)
-{ return ip6_drop_or_punt (vm, node, frame, IP6_ERROR_DST_LOOKUP_MISS); }
-
 VLIB_REGISTER_NODE (ip6_drop_node,static) = {
   .function = ip6_drop,
   .name = "ip6-drop",
@@ -1647,7 +1026,7 @@ VLIB_REGISTER_NODE (ip6_drop_node,static) = {
   },
 };
 
-VLIB_NODE_FUNCTION_MULTIARCH (ip6_drop_node, ip6_drop);
+VLIB_NODE_FUNCTION_MULTIARCH (ip6_drop_node, ip6_drop)
 
 VLIB_REGISTER_NODE (ip6_punt_node,static) = {
   .function = ip6_punt,
@@ -1662,22 +1041,7 @@ VLIB_REGISTER_NODE (ip6_punt_node,static) = {
   },
 };
 
-VLIB_NODE_FUNCTION_MULTIARCH (ip6_punt_node, ip6_punt);
-
-VLIB_REGISTER_NODE (ip6_miss_node,static) = {
-  .function = ip6_miss,
-  .name = "ip6-miss",
-  .vector_size = sizeof (u32),
-
-  .format_trace = format_ip6_forward_next_trace,
-
-  .n_next_nodes = 1,
-  .next_nodes = {
-    [0] = "error-drop",
-  },
-};
-
-VLIB_NODE_FUNCTION_MULTIARCH (ip6_miss_node, ip6_miss);
+VLIB_NODE_FUNCTION_MULTIARCH (ip6_punt_node, ip6_punt)
 
 VLIB_REGISTER_NODE (ip6_multicast_node,static) = {
   .function = ip6_drop,
@@ -1931,17 +1295,21 @@ ip6_local (vlib_main_t * vm,
 
          /* Drop packets from unroutable hosts. */
           /* If this is a neighbor solicitation (ICMP), skip source RPF check */
-         if (error0 == IP6_ERROR_UNKNOWN_PROTOCOL && type0 != IP_BUILTIN_PROTOCOL_ICMP)
+         if (error0 == IP6_ERROR_UNKNOWN_PROTOCOL &&
+             type0 != IP_BUILTIN_PROTOCOL_ICMP &&
+             !ip6_address_is_link_local_unicast(&ip0->src_address))
            {
              u32 src_adj_index0 = ip6_src_lookup_for_packet (im, p0, ip0);
-             error0 = (lm->miss_adj_index == src_adj_index0
+             error0 = (ADJ_INDEX_INVALID == src_adj_index0
                        ? IP6_ERROR_SRC_LOOKUP_MISS
                        : error0);
            }
-         if (error1 == IP6_ERROR_UNKNOWN_PROTOCOL && type1 != IP_BUILTIN_PROTOCOL_ICMP)
+         if (error1 == IP6_ERROR_UNKNOWN_PROTOCOL &&
+             type1 != IP_BUILTIN_PROTOCOL_ICMP &&
+             !ip6_address_is_link_local_unicast(&ip1->src_address))
            {
              u32 src_adj_index1 = ip6_src_lookup_for_packet (im, p1, ip1);
-             error1 = (lm->miss_adj_index == src_adj_index1
+             error1 = (ADJ_INDEX_INVALID == src_adj_index1
                        ? IP6_ERROR_SRC_LOOKUP_MISS
                        : error1);
            }
@@ -2018,10 +1386,12 @@ ip6_local (vlib_main_t * vm,
                    : error0);
 
           /* If this is a neighbor solicitation (ICMP), skip source RPF check */
-         if (error0 == IP6_ERROR_UNKNOWN_PROTOCOL && type0 != IP_BUILTIN_PROTOCOL_ICMP)
+         if (error0 == IP6_ERROR_UNKNOWN_PROTOCOL &&
+             type0 != IP_BUILTIN_PROTOCOL_ICMP &&
+             !ip6_address_is_link_local_unicast(&ip0->src_address))
            {
              u32 src_adj_index0 = ip6_src_lookup_for_packet (im, p0, ip0);
-             error0 = (lm->miss_adj_index == src_adj_index0
+             error0 = (ADJ_INDEX_INVALID == src_adj_index0
                        ? IP6_ERROR_SRC_LOOKUP_MISS
                        : error0);
            }
@@ -2057,7 +1427,7 @@ VLIB_REGISTER_NODE (ip6_local_node,static) = {
   },
 };
 
-VLIB_NODE_FUNCTION_MULTIARCH (ip6_local_node, ip6_local);
+VLIB_NODE_FUNCTION_MULTIARCH (ip6_local_node, ip6_local)
 
 void ip6_register_protocol (u32 protocol, u32 node_index)
 {
@@ -2082,9 +1452,10 @@ typedef enum {
 } ip6_discover_neighbor_error_t;
 
 static uword
-ip6_discover_neighbor (vlib_main_t * vm,
-                      vlib_node_runtime_t * node,
-                      vlib_frame_t * frame)
+ip6_discover_neighbor_inline (vlib_main_t * vm,
+                             vlib_node_runtime_t * node,
+                             vlib_frame_t * frame,
+                             int is_glean)
 {
   vnet_main_t * vnm = vnet_get_main();
   ip6_main_t * im = &ip6_main;
@@ -2144,11 +1515,11 @@ ip6_discover_neighbor (vlib_main_t * vm,
 
          adj0 = ip_get_adjacency (lm, adj_index0);
 
-         if (adj0->arp.next_hop.ip6.as_u64[0] ||
-             adj0->arp.next_hop.ip6.as_u64[1]) {
-           ip0->dst_address.as_u64[0] = adj0->arp.next_hop.ip6.as_u64[0];
-           ip0->dst_address.as_u64[1] = adj0->arp.next_hop.ip6.as_u64[1];
-         }
+         if (!is_glean)
+           {
+             ip0->dst_address.as_u64[0] = adj0->sub_type.nbr.next_hop.ip6.as_u64[0];
+             ip0->dst_address.as_u64[1] = adj0->sub_type.nbr.next_hop.ip6.as_u64[1];
+           }
 
          a0 = hash_seeds[0];
          b0 = hash_seeds[1];
@@ -2209,13 +1580,15 @@ ip6_discover_neighbor (vlib_main_t * vm,
              * Choose source address based on destination lookup 
              * adjacency. 
              */
-           if (ip6_src_address_for_packet (im, p0, &h0->ip.src_address,
-                                               sw_if_index0)) {
-               //There is no address on the interface
+           if (ip6_src_address_for_packet (lm,
+                                           sw_if_index0,
+                                           &h0->ip.src_address))
+             {
+               /* There is no address on the interface */
                p0->error = node->errors[IP6_DISCOVER_NEIGHBOR_ERROR_NO_SOURCE_ADDRESS];
                vlib_buffer_free(vm, &bi0, 1);
                continue;
-           }
+             }
 
            /* 
              * Destination address is a solicited node multicast address.  
@@ -2262,6 +1635,22 @@ ip6_discover_neighbor (vlib_main_t * vm,
   return frame->n_vectors;
 }
 
+static uword
+ip6_discover_neighbor (vlib_main_t * vm,
+                      vlib_node_runtime_t * node,
+                      vlib_frame_t * frame)
+{
+    return (ip6_discover_neighbor_inline(vm, node, frame, 0));
+}
+
+static uword
+ip6_glean (vlib_main_t * vm,
+          vlib_node_runtime_t * node,
+          vlib_frame_t * frame)
+{
+    return (ip6_discover_neighbor_inline(vm, node, frame, 1));
+}
+
 static char * ip6_discover_neighbor_error_strings[] = {
   [IP6_DISCOVER_NEIGHBOR_ERROR_DROP] = "address overflow drops",
   [IP6_DISCOVER_NEIGHBOR_ERROR_REQUEST_SENT] 
@@ -2287,6 +1676,23 @@ VLIB_REGISTER_NODE (ip6_discover_neighbor_node) = {
   },
 };
 
+VLIB_REGISTER_NODE (ip6_glean_node) = {
+  .function = ip6_glean,
+  .name = "ip6-glean",
+  .vector_size = sizeof (u32),
+
+  .format_trace = format_ip6_forward_next_trace,
+
+  .n_errors = ARRAY_LEN (ip6_discover_neighbor_error_strings),
+  .error_strings = ip6_discover_neighbor_error_strings,
+
+  .n_next_nodes = IP6_DISCOVER_NEIGHBOR_N_NEXT,
+  .next_nodes = {
+    [IP6_DISCOVER_NEIGHBOR_NEXT_DROP] = "error-drop",
+    [IP6_DISCOVER_NEIGHBOR_NEXT_REPLY_TX] = "interface-output",
+  },
+};
+
 clib_error_t *
 ip6_probe_neighbor (vlib_main_t * vm, ip6_address_t * dst, u32 sw_if_index)
 {
@@ -2474,31 +1880,17 @@ ip6_rewrite_inline (vlib_main_t * vm,
          adj0 = ip_get_adjacency (lm, adj_index0);
          adj1 = ip_get_adjacency (lm, adj_index1);
 
-          if (rewrite_for_locally_received_packets)
-            {
-              /*
-               * If someone sends e.g. an icmp6 w/ src = dst = interface addr,
-               * we end up here with a local adjacency in hand
-               */
-              if (PREDICT_FALSE(adj0->lookup_next_index 
-                                == IP_LOOKUP_NEXT_LOCAL))
-                error0 = IP6_ERROR_SPOOFED_LOCAL_PACKETS;
-              if (PREDICT_FALSE(adj1->lookup_next_index 
-                                == IP_LOOKUP_NEXT_LOCAL))
-                error1 = IP6_ERROR_SPOOFED_LOCAL_PACKETS;
-            }
-
          rw_len0 = adj0[0].rewrite_header.data_bytes;
          rw_len1 = adj1[0].rewrite_header.data_bytes;
           vnet_buffer(p0)->ip.save_rewrite_length = rw_len0;
           vnet_buffer(p1)->ip.save_rewrite_length = rw_len1;
 
-         vlib_increment_combined_counter (&lm->adjacency_counters,
+         vlib_increment_combined_counter (&adjacency_counters,
                                            cpu_index, 
                                           adj_index0,
                                           /* packet increment */ 0,
                                           /* byte increment */ rw_len0);
-         vlib_increment_combined_counter (&lm->adjacency_counters,
+         vlib_increment_combined_counter (&adjacency_counters,
                                            cpu_index, 
                                           adj_index1,
                                           /* packet increment */ 0,
@@ -2621,13 +2013,6 @@ ip6_rewrite_inline (vlib_main_t * vm,
                 }
            }
 
-          if (rewrite_for_locally_received_packets)
-            {
-              if (PREDICT_FALSE(adj0->lookup_next_index 
-                                == IP_LOOKUP_NEXT_LOCAL))
-                error0 = IP6_ERROR_SPOOFED_LOCAL_PACKETS;
-            }
-
          /* Guess we are only writing on simple Ethernet header. */
          vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
       
@@ -2635,7 +2020,7 @@ ip6_rewrite_inline (vlib_main_t * vm,
          rw_len0 = adj0[0].rewrite_header.data_bytes;
           vnet_buffer(p0)->ip.save_rewrite_length = rw_len0;
 
-         vlib_increment_combined_counter (&lm->adjacency_counters,
+         vlib_increment_combined_counter (&adjacency_counters,
                                            cpu_index, 
                                           adj_index0,
                                           /* packet increment */ 0,
@@ -2712,6 +2097,29 @@ ip6_rewrite_local (vlib_main_t * vm,
                             /* rewrite_for_locally_received_packets */ 1);
 }
 
+static uword
+ip6_midchain (vlib_main_t * vm,
+             vlib_node_runtime_t * node,
+             vlib_frame_t * frame)
+{
+  return ip6_rewrite_inline (vm, node, frame,
+                            /* rewrite_for_locally_received_packets */ 0);
+}
+
+VLIB_REGISTER_NODE (ip6_midchain_node) = {
+  .function = ip6_midchain,
+  .name = "ip6-midchain",
+  .vector_size = sizeof (u32),
+
+  .format_trace = format_ip6_forward_next_trace,
+
+  .next_nodes = {
+    [IP6_REWRITE_NEXT_DROP] = "error-drop",
+  },
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (ip6_midchain_node, ip6_midchain)
+
 VLIB_REGISTER_NODE (ip6_rewrite_node) = {
   .function = ip6_rewrite_transit,
   .name = "ip6-rewrite",
@@ -3207,12 +2615,17 @@ ip6_lookup_init (vlib_main_t * vm)
   if (im->lookup_table_size == 0)
     im->lookup_table_size = IP6_FIB_DEFAULT_HASH_MEMORY_SIZE;
   
-  BV(clib_bihash_init) (&im->ip6_lookup_table, "ip6 lookup table",
+  BV(clib_bihash_init) (&(im->ip6_table[IP6_FIB_TABLE_FWDING].ip6_hash),
+                       "ip6 FIB fwding table",
                         im->lookup_table_nbuckets,
                         im->lookup_table_size);
-  
+  BV(clib_bihash_init) (&im->ip6_table[IP6_FIB_TABLE_NON_FWDING].ip6_hash,
+                       "ip6 FIB non-fwding table",
+                        im->lookup_table_nbuckets,
+                        im->lookup_table_size);
+
   /* Create FIB with index 0 and table id of 0. */
-  find_ip6_fib_by_table_index_or_id (im, /* table id */ 0, IP6_ROUTE_FLAG_TABLE_ID);
+  fib_table_find_or_create_and_lock(FIB_PROTOCOL_IP6, 0);
 
   {
     pg_node_t * pn;
@@ -3282,17 +2695,14 @@ add_del_ip6_interface_table (vlib_main_t * vm,
     }
 
   {
-    ip6_main_t * im = &ip6_main;
-    ip6_fib_t * fib = 
-      find_ip6_fib_by_table_index_or_id (im, table_id, IP6_ROUTE_FLAG_TABLE_ID);
+    u32 fib_index = fib_table_find_or_create_and_lock(FIB_PROTOCOL_IP6,
+                                                      table_id);
 
-    if (fib) 
-      {
-        vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
-        im->fib_index_by_sw_if_index[sw_if_index] = fib->index;
-    }
+    vec_validate (ip6_main.fib_index_by_sw_if_index, sw_if_index);
+    ip6_main.fib_index_by_sw_if_index[sw_if_index] = fib_index;
   }
 
+
  done:
   return error;
 }
@@ -3368,7 +2778,7 @@ int vnet_set_ip6_flow_hash (u32 table_id, u32 flow_hash_config)
   if (p == 0)
     return -1;
 
-  fib = vec_elt_at_index (im6->fibs, p[0]);
+  fib = ip6_fib_get (p[0]);
 
   fib->flow_hash_config = flow_hash_config;
   return 1;
index 2a03703..d927d27 100644 (file)
@@ -24,6 +24,7 @@
 #include <vppinfra/elog.h>
 
 #include <vnet/ip/ip6_hop_by_hop.h>
+#include <vnet/fib/ip6_fib.h>
 
 char *ppc_state[] = { "None", "Encap", "Decap" };
 
@@ -935,48 +936,22 @@ ip6_ioam_set_destination (ip6_address_t * addr, u32 mask_width, u32 vrf_id,
   ip_lookup_main_t *lm = &im->lookup_main;
   ip_adjacency_t *adj;
   u32 fib_index;
-  u32 len, adj_index;
-  int i, rv;
-  uword *p;
-  BVT (clib_bihash_kv) kv, value;
+  u32 adj_index;
 
   if ((is_add + is_pop + is_none) != 1)
     return VNET_API_ERROR_INVALID_VALUE_2;
 
   /* Go find the adjacency we're supposed to tickle */
-  p = hash_get (im->fib_index_by_table_id, vrf_id);
+  fib_index = ip6_fib_index_from_table_id (vrf_id);
 
-  if (p == 0)
+  if (~0 == fib_index)
     return VNET_API_ERROR_NO_SUCH_FIB;
 
-  fib_index = p[0];
+  adj_index = ip6_fib_table_fwding_lookup (im, fib_index, addr);
 
-  len = vec_len (im->prefix_lengths_in_search_order);
-
-  for (i = 0; i < len; i++)
-    {
-      int dst_address_length = im->prefix_lengths_in_search_order[i];
-      ip6_address_t *mask = &im->fib_masks[dst_address_length];
-
-      if (dst_address_length != mask_width)
-       continue;
-
-      kv.key[0] = addr->as_u64[0] & mask->as_u64[0];
-      kv.key[1] = addr->as_u64[1] & mask->as_u64[1];
-      kv.key[2] = ((u64) ((fib_index)) << 32) | dst_address_length;
-
-      rv =
-       BV (clib_bihash_search_inline_2) (&im->ip6_lookup_table, &kv, &value);
-      if (rv == 0)
-       goto found;
-
-    }
-  return VNET_API_ERROR_NO_SUCH_ENTRY;
-
-found:
+  ASSERT (!"Not an ADJ");
 
   /* Got it, modify as directed... */
-  adj_index = value.value;
   adj = ip_get_adjacency (lm, adj_index);
 
   /* Restore original lookup-next action */
@@ -1015,7 +990,7 @@ ip6_set_ioam_destination_command_fn (vlib_main_t * vm,
   int is_pop = 0;
   int is_none = 0;
   u32 vrf_id = 0;
-  int rv;
+  // int rv;
 
   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
     {
@@ -1038,19 +1013,23 @@ ip6_set_ioam_destination_command_fn (vlib_main_t * vm,
   if (mask_width == ~0)
     return clib_error_return (0, "<address>/<mask-width> required");
 
-  rv = ip6_ioam_set_destination (&addr, mask_width, vrf_id,
-                                is_add, is_pop, is_none);
+  /* rv = ip6_ioam_set_destination (&addr, mask_width, vrf_id, */
+  /*                             is_add, is_pop, is_none); */
 
-  switch (rv)
-    {
-    case 0:
-      break;
-    default:
-      return clib_error_return (0, "ip6_ioam_set_destination returned %d",
-                               rv);
-    }
+  /* switch (rv) */
+  /*   { */
+  /*   case 0: */
+  /*     break; */
+  /*   default: */
+  /*     return clib_error_return (0, "ip6_ioam_set_destination returned %d", */
+  /*                            rv); */
+  /*   } */
 
-  return 0;
+  /* return 0; */
+
+  return clib_error_return (0,
+                           "ip6_ioam_set_destination Currnetly Disabled due to FIB2.0",
+                           1);
 }
 
 /* *INDENT-OFF* */
index a35f58a..11df776 100644 (file)
@@ -19,6 +19,9 @@
 #include <vnet/ethernet/ethernet.h>
 #include <vppinfra/mhash.h>
 #include <vppinfra/md5.h>
+#include <vnet/adj/adj.h>
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/ip6_fib.h>
 
 #if DPDK==1
 #include <vnet/devices/dpdk/dpdk.h>
@@ -38,9 +41,9 @@ typedef struct {
   u8 link_layer_address[8];
   u16 flags;
 #define IP6_NEIGHBOR_FLAG_STATIC (1 << 0)
-#define IP6_NEIGHBOR_FLAG_GLEAN  (2 << 0)
+#define IP6_NEIGHBOR_FLAG_DYNAMIC  (2 << 0)
   u64 cpu_time_last_updated;
-  u32 *adjacencies;
+  adj_index_t adj_index;
 } ip6_neighbor_t;
 
 /* advertised prefix option */ 
@@ -121,9 +124,9 @@ typedef struct {
   u32 seed;
   u64 randomizer;
   int ref_count;
-  u32 all_nodes_adj_index;
-  u32 all_routers_adj_index;
-  u32 all_mldv2_routers_adj_index;
+  adj_index_t all_nodes_adj_index;
+  adj_index_t all_routers_adj_index;
+  adj_index_t all_mldv2_routers_adj_index;
   
   /* timing information */
 #define DEF_MAX_RADV_INTERVAL 200
@@ -217,8 +220,8 @@ static u8 * format_ip6_neighbor_ip6_entry (u8 * s, va_list * va)
   if (! n)
     return format (s, "%=12s%=20s%=6s%=20s%=40s", "Time", "Address", "Flags", "Link layer", "Interface");
 
-  if (n->flags & IP6_NEIGHBOR_FLAG_GLEAN)
-    flags = format(flags, "G");
+  if (n->flags & IP6_NEIGHBOR_FLAG_DYNAMIC)
+    flags = format(flags, "D");
 
   if (n->flags & IP6_NEIGHBOR_FLAG_STATIC)
     flags = format(flags, "S");
@@ -330,6 +333,52 @@ static void set_unset_ip6_neighbor_rpc
 }
 #endif
 
+static void
+ip6_nd_mk_complete (ip6_neighbor_t * nbr)
+{
+  fib_prefix_t pfx = {
+      .fp_len = 128,
+      .fp_proto = FIB_PROTOCOL_IP6,
+      .fp_addr = {
+         .ip6 = nbr->key.ip6_address,
+      },
+  };
+  ip6_main_t *im;
+  u32 fib_index;
+
+  im = &ip6_main;
+  fib_index = im->fib_index_by_sw_if_index[nbr->key.sw_if_index];
+
+  /* create the adj and /128 FIB entry once; later calls only refresh the rewrite */
+  if (ADJ_INDEX_INVALID == nbr->adj_index)
+    {
+      nbr->adj_index =
+         adj_nbr_add_or_lock_w_rewrite(FIB_PROTOCOL_IP6,
+                                       FIB_LINK_IP6,
+                                       &pfx.fp_addr,
+                                       nbr->key.sw_if_index,
+                                       nbr->link_layer_address);
+      ASSERT(ADJ_INDEX_INVALID != nbr->adj_index);
+
+      fib_table_entry_update_one_path(fib_index,
+                                     &pfx,
+                                     FIB_SOURCE_ADJ,
+                                     FIB_ENTRY_FLAG_NONE,
+                                     FIB_PROTOCOL_IP6,
+                                     &pfx.fp_addr,
+                                     nbr->key.sw_if_index,
+                                     ~0,
+                                     1,
+                                     MPLS_LABEL_INVALID,
+                                     FIB_ROUTE_PATH_FLAG_NONE);
+    }
+  else
+    {
+      adj_nbr_update_rewrite(nbr->adj_index,
+                            nbr->link_layer_address);
+    }
+}
+
 int
 vnet_set_ip6_ethernet_neighbor (vlib_main_t * vm,
                                 u32 sw_if_index,
@@ -338,17 +387,12 @@ vnet_set_ip6_ethernet_neighbor (vlib_main_t * vm,
                                 uword n_bytes_link_layer_address,
                                 int is_static)
 {
-  vnet_main_t * vnm = vnet_get_main();
   ip6_neighbor_main_t * nm = &ip6_neighbor_main;
   ip6_neighbor_key_t k;
   ip6_neighbor_t * n = 0;
-  ip6_main_t * im = &ip6_main;
-  ip_lookup_main_t * lm = &im->lookup_main;
   int make_new_nd_cache_entry=1;
   uword * p;
   u32 next_index;
-  u32 adj_index;
-  ip_adjacency_t *existing_adj;
   pending_resolution_t * pr, * mc;
 
 #if DPDK > 0
@@ -376,77 +420,26 @@ vnet_set_ip6_ethernet_neighbor (vlib_main_t * vm,
     make_new_nd_cache_entry = 0;
   }
 
-  /* Note: always install the route. It might have been deleted */
-  ip6_add_del_route_args_t args;
-  ip_adjacency_t adj;
-
-  memset (&adj, 0, sizeof(adj));
-  adj.lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
-  adj.explicit_fib_index = ~0;
-
-  vnet_rewrite_for_sw_interface
-  (vnm,
-   VNET_L3_PACKET_TYPE_IP6,
-   sw_if_index,
-   ip6_rewrite_node.index,
-   link_layer_address,
-   &adj.rewrite_header,
-   sizeof (adj.rewrite_data));
-
-  /* result of this lookup should be next-hop adjacency */
-  adj_index = ip6_fib_lookup_with_table (im, im->fib_index_by_sw_if_index[sw_if_index], a);
-  existing_adj = ip_get_adjacency(lm, adj_index);
-
-  if (existing_adj->lookup_next_index == IP_LOOKUP_NEXT_ARP &&
-      existing_adj->arp.next_hop.ip6.as_u64[0] == a->as_u64[0] &&
-      existing_adj->arp.next_hop.ip6.as_u64[1] == a->as_u64[1])
-  {
-    u32 * ai;
-    u32 * adjs = 0;
-    
-    if (n)
-      adjs = vec_dup(n->adjacencies);
-    else
-      clib_warning ("ip6 neighbor n not set");
-    
-    /* Update all adj assigned to this arp entry */
-    vec_foreach(ai, adjs)
-    {
-      int i;
-      ip_adjacency_t * uadj = ip_get_adjacency(lm, *ai);
-      for (i = 0; i < uadj->n_adj; i++)
-        if (uadj[i].lookup_next_index == IP_LOOKUP_NEXT_ARP &&
-            uadj[i].arp.next_hop.ip6.as_u64[0] == a->as_u64[0] &&
-            uadj[i].arp.next_hop.ip6.as_u64[1] == a->as_u64[1])
-          ip_update_adjacency (lm, *ai + i, &adj);
-    }
-    vec_free(adjs);
-  }
-  else
-  {
-    /* create new adj */
-    args.table_index_or_table_id = im->fib_index_by_sw_if_index[sw_if_index];
-    args.flags = IP6_ROUTE_FLAG_FIB_INDEX | IP6_ROUTE_FLAG_ADD | IP6_ROUTE_FLAG_NEIGHBOR;
-    args.dst_address = a[0];
-    args.dst_address_length = 128;
-    args.adj_index = ~0;
-    args.add_adj = &adj;
-    args.n_add_adj = 1;
-    ip6_add_del_route (im, &args);
-  }
-
   if (make_new_nd_cache_entry) {
     pool_get (nm->neighbor_pool, n);
     mhash_set (&nm->neighbor_index_by_key, &k, n - nm->neighbor_pool,
                /* old value */ 0);
     n->key = k;
+    n->adj_index = ADJ_INDEX_INVALID;
   }
 
   /* Update time stamp and ethernet address. */
-  clib_memcpy (n->link_layer_address, link_layer_address, n_bytes_link_layer_address);
+  clib_memcpy (n->link_layer_address,
+              link_layer_address,
+              n_bytes_link_layer_address);
+
   n->cpu_time_last_updated = clib_cpu_time_now ();
   if (is_static)
     n->flags |= IP6_NEIGHBOR_FLAG_STATIC;
+  else
+    n->flags |= IP6_NEIGHBOR_FLAG_DYNAMIC;
+
+  ip6_nd_mk_complete(n);
 
   /* Customer(s) waiting for this address to be resolved? */
   p = mhash_get (&nm->pending_resolutions_by_address, a);
@@ -499,6 +492,40 @@ vnet_set_ip6_ethernet_neighbor (vlib_main_t * vm,
   return 0;
 }
 
+static void
+ip6_nd_mk_incomplete (ip6_neighbor_t *nbr)
+{
+  fib_prefix_t pfx = {
+      .fp_len = 128,
+      .fp_proto = FIB_PROTOCOL_IP6,
+      .fp_addr = {
+         .ip6 = nbr->key.ip6_address,
+      },
+  };
+  u32 fib_index;
+  ip6_main_t *im;
+
+  im = &ip6_main;
+  fib_index = im->fib_index_by_sw_if_index[nbr->key.sw_if_index];
+
+  /*
+   * Revert the adj this ND entry sourced to incomplete (NULL rewrite)
+   */
+  adj_nbr_update_rewrite(nbr->adj_index,
+                        NULL);
+
+  /*
+   * Remove the /128 FIB entry this ND entry sourced
+   */
+  fib_table_entry_delete(fib_index, &pfx, FIB_SOURCE_ADJ);
+
+  /*
+   * Unlock the adj now that the ND entry is no longer a source
+   */
+  adj_unlock(nbr->adj_index);
+  nbr->adj_index = ADJ_INDEX_INVALID;
+}
+
 int
 vnet_unset_ip6_ethernet_neighbor (vlib_main_t * vm,
                                   u32 sw_if_index,
@@ -509,8 +536,6 @@ vnet_unset_ip6_ethernet_neighbor (vlib_main_t * vm,
   ip6_neighbor_main_t * nm = &ip6_neighbor_main;
   ip6_neighbor_key_t k;
   ip6_neighbor_t * n;
-  ip6_main_t * im = &ip6_main;
-  ip6_add_del_route_args_t args;
   uword * p;
   int rv = 0;
 
@@ -537,73 +562,16 @@ vnet_unset_ip6_ethernet_neighbor (vlib_main_t * vm,
     }
   
   n = pool_elt_at_index (nm->neighbor_pool, p[0]);
+
+  ip6_nd_mk_incomplete(n);
   mhash_unset (&nm->neighbor_index_by_key, &n->key, 0);
   pool_put (nm->neighbor_pool, n);
   
-  args.table_index_or_table_id = im->fib_index_by_sw_if_index[sw_if_index];
-  args.flags = IP6_ROUTE_FLAG_FIB_INDEX | IP6_ROUTE_FLAG_DEL 
-    | IP6_ROUTE_FLAG_NEIGHBOR;
-  args.dst_address = a[0];
-  args.dst_address_length = 128;
-  args.adj_index = ~0;
-  args.add_adj = NULL;
-  args.n_add_adj = 0;
-  ip6_add_del_route (im, &args);
  out:
   vlib_worker_thread_barrier_release(vm);
   return rv;
 }
 
-
-u32
-vnet_ip6_neighbor_glean_add(u32 fib_index, void * next_hop_arg)
-{
-  ip6_neighbor_main_t * nm = &ip6_neighbor_main;
-  ip6_main_t * im = &ip6_main;
-  ip_lookup_main_t * lm = &im->lookup_main;
-  ip6_address_t * next_hop = next_hop_arg;
-  ip_adjacency_t add_adj, *adj;
-  ip6_add_del_route_args_t args;
-  ip6_neighbor_t * n;
-  ip6_neighbor_key_t k;
-  u32 adj_index;
-
-  adj_index = ip6_fib_lookup_with_table(im, fib_index, next_hop);
-  adj = ip_get_adjacency(lm, adj_index);
-
-  if (!adj || adj->lookup_next_index != IP_LOOKUP_NEXT_ARP)
-    return ~0;
-
-  if (adj->arp.next_hop.ip6.as_u64[0] ||
-      adj->arp.next_hop.ip6.as_u64[1])
-    return adj_index;
-
-  k.sw_if_index = adj->rewrite_header.sw_if_index;
-  k.ip6_address = *next_hop;
-  k.pad = 0;
-  if (mhash_get (&nm->neighbor_index_by_key, &k))
-    return adj_index;
-
-  pool_get (nm->neighbor_pool, n);
-  mhash_set (&nm->neighbor_index_by_key, &k, n - nm->neighbor_pool, /* old value */ 0);
-  n->key = k;
-  n->cpu_time_last_updated = clib_cpu_time_now ();
-  n->flags = IP6_NEIGHBOR_FLAG_GLEAN;
-
-  memset(&args, 0, sizeof(args));
-  memcpy(&add_adj, adj, sizeof(add_adj));
-  add_adj.arp.next_hop.ip6 = *next_hop; /* install neighbor /128 route */
-  args.table_index_or_table_id = fib_index;
-  args.flags = IP6_ROUTE_FLAG_FIB_INDEX | IP6_ROUTE_FLAG_ADD | IP6_ROUTE_FLAG_NEIGHBOR;
-  args.dst_address = *next_hop;
-  args.dst_address_length = 128;
-  args.adj_index = ~0;
-  args.add_adj = &add_adj;
-  args.n_add_adj = 1;
-  ip6_add_del_route (im, &args);
-  return ip6_fib_lookup_with_table (im, fib_index, next_hop);
-}
-
 #if DPDK > 0
 static void ip6_neighbor_set_unset_rpc_callback 
 ( ip6_neighbor_set_unset_rpc_args_t * a)
@@ -728,7 +696,6 @@ icmp6_neighbor_solicitation_or_advertisement (vlib_main_t * vm,
 {
   vnet_main_t * vnm = vnet_get_main();
   ip6_main_t * im = &ip6_main;
-  ip_lookup_main_t * lm = &im->lookup_main;
   uword n_packets = frame->n_vectors;
   u32 * from, * to_next;
   u32 n_left_from, n_left_to_next, next_index, n_advertisements_sent;
@@ -787,17 +754,25 @@ icmp6_neighbor_solicitation_or_advertisement (vlib_main_t * vm,
          if (!ip6_sadd_unspecified && !ip6_sadd_link_local)
            {
              u32 src_adj_index0 = ip6_src_lookup_for_packet (im, p0, ip0);
-             ip_adjacency_t * adj0 = ip_get_adjacency (&im->lookup_main, src_adj_index0);
 
-              /* Allow all realistic-looking rewrite adjacencies to pass */
-              ni0 = adj0->lookup_next_index;
-              is_rewrite0 = (ni0 >= IP_LOOKUP_NEXT_ARP) &&
-                (ni0 < IP6_LOOKUP_N_NEXT);
+              if (ADJ_INDEX_INVALID != src_adj_index0)
+                {
+                  ip_adjacency_t * adj0 = ip_get_adjacency (&im->lookup_main, src_adj_index0);
 
-             error0 = ((adj0->rewrite_header.sw_if_index != sw_if_index0
-                         || ! is_rewrite0)
-                       ? ICMP6_ERROR_NEIGHBOR_SOLICITATION_SOURCE_NOT_ON_LINK
-                       : error0);
+                  /* Allow all realistic-looking rewrite adjacencies to pass */
+                  ni0 = adj0->lookup_next_index;
+                  is_rewrite0 = (ni0 >= IP_LOOKUP_NEXT_ARP) &&
+                      (ni0 < IP6_LOOKUP_N_NEXT);
+
+                  error0 = ((adj0->rewrite_header.sw_if_index != sw_if_index0
+                             || ! is_rewrite0)
+                            ? ICMP6_ERROR_NEIGHBOR_SOLICITATION_SOURCE_NOT_ON_LINK
+                            : error0);
+                }
+              else
+                {
+                  error0 = ICMP6_ERROR_NEIGHBOR_SOLICITATION_SOURCE_NOT_ON_LINK;
+                }
             }
              
          o0 = (void *) (h0 + 1);
@@ -820,21 +795,28 @@ icmp6_neighbor_solicitation_or_advertisement (vlib_main_t * vm,
 
          if (is_solicitation && error0 == ICMP6_ERROR_NONE)
            {
-             /* Check that target address is one that we know about. */
-             ip_interface_address_t * ia0;
-             ip6_address_fib_t ip6_af0;
-              void * oldheap;
-
-             ip6_addr_fib_init (&ip6_af0, &h0->target_address,
-                                vec_elt (im->fib_index_by_sw_if_index,
-                                         sw_if_index0));
-
-              /* Gross kludge, "thank you" MJ, don't even ask */
-              oldheap = clib_mem_set_heap (clib_per_cpu_mheaps[0]);
-             ia0 = ip_get_interface_address (lm, &ip6_af0);
-              clib_mem_set_heap (oldheap);
-             error0 = ia0 == 0 ? 
-                  ICMP6_ERROR_NEIGHBOR_SOLICITATION_SOURCE_UNKNOWN : error0;
+             /* Check that target address is local to this router. */
+              fib_node_index_t fei;
+             u32 fib_index;
+
+             fib_index = ip6_fib_table_get_index_for_sw_if_index(sw_if_index0);
+
+             if (~0 == fib_index)
+               {
+                 error0 = ICMP6_ERROR_NEIGHBOR_SOLICITATION_SOURCE_UNKNOWN;
+               }
+             else
+               {
+                 fei = ip6_fib_table_lookup_exact_match(fib_index,
+                                                        &h0->target_address,
+                                                        128);
+
+                 if (FIB_NODE_INDEX_INVALID == fei || 
+                     !(FIB_ENTRY_FLAG_LOCAL & fib_entry_get_flags(fei)))
+                   {
+                     error0 = ICMP6_ERROR_NEIGHBOR_SOLICITATION_SOURCE_UNKNOWN;
+                   }
+               }
            }
 
          if (is_solicitation)
@@ -1052,13 +1034,20 @@ icmp6_router_solicitation(vlib_main_t * vm,
          if (!is_unspecified && !is_link_local)
            {
              u32 src_adj_index0 = ip6_src_lookup_for_packet (im, p0, ip0);
-             ip_adjacency_t * adj0 = ip_get_adjacency (&im->lookup_main, src_adj_index0);
 
-             error0 = ((adj0->rewrite_header.sw_if_index != sw_if_index0
-                         || (adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP
-                             && adj0->lookup_next_index != IP_LOOKUP_NEXT_REWRITE))
-                       ? ICMP6_ERROR_ROUTER_SOLICITATION_SOURCE_NOT_ON_LINK
-                       : error0);
+              if (ADJ_INDEX_INVALID != src_adj_index0)
+                {
+                  ip_adjacency_t * adj0 = ip_get_adjacency (&im->lookup_main,
+                                                            src_adj_index0);
+
+                  error0 = (adj0->rewrite_header.sw_if_index != sw_if_index0
+                            ? ICMP6_ERROR_ROUTER_SOLICITATION_SOURCE_NOT_ON_LINK
+                            : error0);
+                }
+              else
+                {
+                  error0 = ICMP6_ERROR_ROUTER_SOLICITATION_SOURCE_NOT_ON_LINK;
+                }
          }
          
          /* check for source LL option and process */
@@ -1472,8 +1461,7 @@ icmp6_router_advertisement(vlib_main_t * vm,
 
                      /* check for MTU or prefix options or .. */
                      u8 * opt_hdr = (u8 *)(h0 + 1);
-                     while( options_len0 > 0 && 
-                             opt_hdr < p0->data + p0->current_data)
+                     while( options_len0 > 0)
                        {
                          icmp6_neighbor_discovery_option_header_t *o0 = ( icmp6_neighbor_discovery_option_header_t *)opt_hdr;
                          int opt_len = o0->n_data_u64s << 3;
@@ -1606,11 +1594,9 @@ ip6_neighbor_sw_interface_add_del (vnet_main_t * vnm,
                                   u32 sw_if_index,
                                   u32 is_add)
 {
-  ip6_main_t * im = &ip6_main;
   ip6_neighbor_main_t * nm = &ip6_neighbor_main;  
-  ip_lookup_main_t * lm = &im->lookup_main;
   ip6_radv_t * a= 0;  
-  u32 ri = ~0;;
+  u32 ri = ~0;
   vnet_sw_interface_t * sw_if0;
   ethernet_interface_t * eth_if0 = 0; 
 
@@ -1636,9 +1622,9 @@ ip6_neighbor_sw_interface_add_del (vnet_main_t * vnm,
          ip6_mldp_group_t *m;
          
          /* remove adjacencies */
-         ip_del_adjacency (lm,  a->all_nodes_adj_index); 
-         ip_del_adjacency (lm,  a->all_routers_adj_index);           
-         ip_del_adjacency (lm,  a->all_mldv2_routers_adj_index);             
+         adj_unlock(a->all_nodes_adj_index); 
+         adj_unlock(a->all_routers_adj_index);       
+         adj_unlock(a->all_mldv2_routers_adj_index);
          
          /* clean up prefix_pool */
          pool_foreach (p, a->adv_prefixes_pool, ({
@@ -1672,6 +1658,7 @@ ip6_neighbor_sw_interface_add_del (vnet_main_t * vnm,
          pool_put (nm->if_radv_pool,  a);
          nm->if_radv_pool_index_by_sw_if_index[sw_if_index] = ~0;
          ri = ~0;
+         ip6_sw_interface_enable_disable(sw_if_index, 0);
        }
     }
  else
@@ -1680,6 +1667,7 @@ ip6_neighbor_sw_interface_add_del (vnet_main_t * vnm,
        {
         vnet_hw_interface_t * hw_if0;
      
+        ip6_sw_interface_enable_disable(sw_if_index, 1);
         hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index);
         
         pool_get (nm->if_radv_pool, a);
@@ -1702,10 +1690,11 @@ ip6_neighbor_sw_interface_add_del (vnet_main_t * vnm,
         a->min_delay_between_radv = MIN_DELAY_BETWEEN_RAS;
         a->max_delay_between_radv = MAX_DELAY_BETWEEN_RAS;
         a->max_rtr_default_lifetime = MAX_DEF_RTR_LIFETIME;
-        a->seed = (u32) (clib_cpu_time_now() & 0xFFFFFFFF);
+        a->seed = random_default_seed();
         
         /* for generating random interface ids */
-        a->randomizer = random_u64 (&a->seed);
+        a->randomizer = 0x1119194911191949;
+        a->randomizer = random_u64 ((u32 *)&a->randomizer);
         
         a->initial_adverts_count = MAX_INITIAL_RTR_ADVERTISEMENTS ; 
         a->initial_adverts_sent = a->initial_adverts_count-1;
@@ -1727,66 +1716,34 @@ ip6_neighbor_sw_interface_add_del (vnet_main_t * vnm,
         mhash_init (&a->address_to_mldp_index, sizeof (uword), sizeof (ip6_address_t)); 
         
         {
-          ip_adjacency_t *adj;
           u8 link_layer_address[6] = 
             {0x33, 0x33, 0x00, 0x00, 0x00, IP6_MULTICAST_GROUP_ID_all_hosts};
           
-          adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
-                                  &a->all_nodes_adj_index);
-          
-          adj->lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
-          adj->if_address_index = ~0;
-          
-          vnet_rewrite_for_sw_interface
-            (vnm,
-             VNET_L3_PACKET_TYPE_IP6,
-             sw_if_index,
-             ip6_rewrite_node.index,
-             link_layer_address,
-             &adj->rewrite_header,
-             sizeof (adj->rewrite_data));
+          a->all_nodes_adj_index = adj_rewrite_add_and_lock(FIB_PROTOCOL_IP6,
+                                                            FIB_LINK_IP6,
+                                                            sw_if_index,
+                                                            link_layer_address);
         } 
         
         {
-          ip_adjacency_t *adj;
           u8 link_layer_address[6] = 
             {0x33, 0x33, 0x00, 0x00, 0x00, IP6_MULTICAST_GROUP_ID_all_routers};
        
-          adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
-                                  &a->all_routers_adj_index);
-          
-          adj->lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
-          adj->if_address_index = ~0;
-          
-          vnet_rewrite_for_sw_interface
-            (vnm,
-             VNET_L3_PACKET_TYPE_IP6,
-             sw_if_index,
-             ip6_rewrite_node.index,
-             link_layer_address,
-             &adj->rewrite_header,
-             sizeof (adj->rewrite_data));
+          a->all_routers_adj_index = adj_rewrite_add_and_lock(FIB_PROTOCOL_IP6,
+                                                              FIB_LINK_IP6,
+                                                              sw_if_index,
+                                                              link_layer_address);
         } 
         
         {
-          ip_adjacency_t *adj;
           u8 link_layer_address[6] = 
             {0x33, 0x33, 0x00, 0x00, 0x00, IP6_MULTICAST_GROUP_ID_mldv2_routers};
           
-          adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
-                                  &a->all_mldv2_routers_adj_index);
-          
-          adj->lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
-          adj->if_address_index = ~0;
-          
-          vnet_rewrite_for_sw_interface
-            (vnm,
-             VNET_L3_PACKET_TYPE_IP6,
-             sw_if_index,
-             ip6_rewrite_node.index,
-             link_layer_address,
-             &adj->rewrite_header,
-             sizeof (adj->rewrite_data));
+          a->all_mldv2_routers_adj_index = 
+              adj_rewrite_add_and_lock(FIB_PROTOCOL_IP6,
+                                       FIB_LINK_IP6,
+                                       sw_if_index,
+                                       link_layer_address);
         } 
         
         /* add multicast groups we will always be reporting  */
@@ -2969,7 +2926,8 @@ enable_ip6_interface(vlib_main_t * vm,
                  
                  /* essentially "enables" ipv6 on this interface */
                  error = ip6_add_del_interface_address (vm, sw_if_index,
-                                                        &link_local_address, 64 /* address width */,
+                                                        &link_local_address,
+                                                        128 /* address width */,
                                                         0 /* is_del */);
                  
                  if(error)
@@ -3255,87 +3213,10 @@ clib_error_t *ip6_set_neighbor_limit (u32 neighbor_limit)
   return 0;
 }
 
-
-static void
-ip6_neighbor_entry_del_adj(ip6_neighbor_t *n, u32 adj_index)
-{
-  int done = 0;
-  int i;
-  while (!done)
-    {
-      vec_foreach_index(i, n->adjacencies)
-        if (vec_elt(n->adjacencies, i) == adj_index)
-          {
-            vec_del1(n->adjacencies, i);
-            continue;
-          }
-      done = 1;
-    }
-}
-
-static void
-ip6_neighbor_entry_add_adj(ip6_neighbor_t *n, u32 adj_index)
-{
-  int i;
-  vec_foreach_index(i, n->adjacencies)
-    if (vec_elt(n->adjacencies, i) == adj_index)
-      return;
-  vec_add1(n->adjacencies, adj_index);
-}
-
-static void
-ip6_neighbor_add_del_adj_cb (struct ip_lookup_main_t * lm,
-                    u32 adj_index,
-                    ip_adjacency_t * adj,
-                    u32 is_del)
-{
-  ip6_neighbor_main_t * nm = &ip6_neighbor_main;
-  ip6_neighbor_key_t k;
-  ip6_neighbor_t *n = 0;
-  uword * p;
-  u32 ai;
-
-  for(ai = adj->heap_handle; ai < adj->heap_handle + adj->n_adj ; ai++)
-    {
-      adj = ip_get_adjacency (lm, ai);
-      if (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP &&
-          (adj->arp.next_hop.ip6.as_u64[0] || adj->arp.next_hop.ip6.as_u64[1]))
-        {
-          k.sw_if_index = adj->rewrite_header.sw_if_index;
-          k.ip6_address.as_u64[0] = adj->arp.next_hop.ip6.as_u64[0];
-          k.ip6_address.as_u64[1] = adj->arp.next_hop.ip6.as_u64[1];
-          k.pad = 0;
-          p = mhash_get (&nm->neighbor_index_by_key, &k);
-          if (p)
-            n = pool_elt_at_index (nm->neighbor_pool, p[0]);
-        }
-      else
-        continue;
-
-      if (is_del)
-        {
-          if (!n)
-            clib_warning("Adjacency contains unknown ND next hop %U (del)",
-                         format_ip46_address, &adj->arp.next_hop, IP46_TYPE_IP6);
-          else
-            ip6_neighbor_entry_del_adj(n, adj->heap_handle);
-        }
-      else /* add */
-        {
-          if (!n)
-            clib_warning("Adjacency contains unknown ND next hop %U (add)",
-                         format_ip46_address, &adj->arp.next_hop, IP46_TYPE_IP6);
-          else
-            ip6_neighbor_entry_add_adj(n, adj->heap_handle);
-        }
-    }
-}
-
 static clib_error_t * ip6_neighbor_init (vlib_main_t * vm)
 {
   ip6_neighbor_main_t * nm = &ip6_neighbor_main;
   ip6_main_t * im = &ip6_main;
-  ip_lookup_main_t * lm = &im->lookup_main;
  
   mhash_init (&nm->neighbor_index_by_key,
              /* value size */ sizeof (uword),
@@ -3375,8 +3256,6 @@ static clib_error_t * ip6_neighbor_init (vlib_main_t * vm)
       (im->discover_neighbor_next_index_by_hw_if_index, 32, 0 /* drop */);
 #endif
 
-  ip_register_add_del_adjacency_callback(lm, ip6_neighbor_add_del_adj_cb);
-
   return 0;
 }
 
@@ -3593,5 +3472,3 @@ int vnet_ip6_nd_term (vlib_main_t * vm,
   return 0;
 
 }
-
-                     
index c83e576..29fa4a4 100644 (file)
@@ -70,6 +70,8 @@ typedef CLIB_PACKED (union {
 #define ip46_address_mask_ip4(ip46)    ((ip46)->pad[0] = (ip46)->pad[1] = (ip46)->pad[2] = 0)
 #define ip46_address_set_ip4(ip46, ip) (ip46_address_mask_ip4(ip46), (ip46)->ip4 = (ip)[0])
 #define ip46_address_reset(ip46)       ((ip46)->as_u64[0] = (ip46)->as_u64[1] = 0)
+#define ip46_address_cmp(ip46_1, ip46_2) (memcmp((ip46_1), (ip46_2), sizeof(*(ip46_1)))) /* memcmp() result over one address */
+#define ip46_address_is_zero(ip46)     (((ip46)->as_u64[0] == 0) && ((ip46)->as_u64[1] == 0))
 
 always_inline void
 ip6_addr_fib_init (ip6_address_fib_t * addr_fib, ip6_address_t * address,
@@ -302,6 +304,22 @@ always_inline void *
 ip6_next_header (ip6_header_t * i)
 { return (void *) (i + 1); }
 
+always_inline void
+ip6_copy_header (ip6_header_t * dst, const ip6_header_t *src)
+{
+    /* Field-by-field copy of the fixed IPv6 header from src to dst. */
+    dst->ip_version_traffic_class_and_flow_label =
+        src->ip_version_traffic_class_and_flow_label;
+    dst->payload_length = src->payload_length;
+    dst->protocol = src->protocol;
+    dst->hop_limit = src->hop_limit;
+    /* two u64 words cover each address; uword-sized copies assume a 64-bit uword */
+    dst->src_address.as_u64[0] = src->src_address.as_u64[0];
+    dst->src_address.as_u64[1] = src->src_address.as_u64[1];
+    dst->dst_address.as_u64[0] = src->dst_address.as_u64[0];
+    dst->dst_address.as_u64[1] = src->dst_address.as_u64[1];
+}
+
 always_inline void
 ip6_tcp_reply_x1 (ip6_header_t * ip0, tcp_header_t * tcp0)
 {
index 9505a09..b96f81b 100644 (file)
@@ -15,6 +15,7 @@
 
 #include <vnet/vnet.h>
 #include <vnet/ip/ip.h>
+#include <vnet/mpls/mpls.h>
 
 /** \file
 
@@ -131,7 +132,7 @@ ip_feature_init_cast (vlib_main_t * vm,
                      vnet_config_main_t * vcm,
                      char **feature_start_nodes,
                      int num_feature_start_nodes,
-                     vnet_cast_t cast, int is_ip4)
+                     vnet_cast_t cast, vnet_l3_packet_type_t proto)
 {
   uword *index_by_name;
   uword *reg_by_index;
@@ -155,33 +156,43 @@ ip_feature_init_cast (vlib_main_t * vm,
   u8 **keys_to_delete = 0;
   ip4_main_t *im4 = &ip4_main;
   ip6_main_t *im6 = &ip6_main;
+  mpls_main_t *mm = &mpls_main;
 
   index_by_name = hash_create_string (0, sizeof (uword));
   reg_by_index = hash_create (0, sizeof (uword));
 
   if (cast == VNET_IP_RX_UNICAST_FEAT)
     {
-      if (is_ip4)
+      if (proto == VNET_L3_PACKET_TYPE_IP4)
        first_reg = im4->next_uc_feature;
-      else
+      else if (proto == VNET_L3_PACKET_TYPE_IP6)
        first_reg = im6->next_uc_feature;
+      else if (proto == VNET_L3_PACKET_TYPE_MPLS_UNICAST)
+       first_reg = mm->next_feature;
+      else
+       return clib_error_return (0,
+                                 "protocol %d cast %d unsupport for features",
+                                 proto, cast);
     }
   else if (cast == VNET_IP_RX_MULTICAST_FEAT)
     {
-      if (is_ip4)
+      if (proto == VNET_L3_PACKET_TYPE_IP4)
        first_reg = im4->next_mc_feature;
-      else
+      else if (proto == VNET_L3_PACKET_TYPE_IP6)
        first_reg = im6->next_mc_feature;
+      else
+       return clib_error_return (0,
+                                 "protocol %d cast %d unsupport for features",
+                                 proto, cast);
     }
   else if (cast == VNET_IP_TX_FEAT)
     {
-      if (is_ip4)
+      if (proto == VNET_L3_PACKET_TYPE_IP4)
        first_reg = im4->next_tx_feature;
       else
        first_reg = im6->next_tx_feature;
     }
 
-
   this_reg = first_reg;
 
   /* pass 1, collect feature node names, construct a before b pairs */
@@ -281,8 +292,7 @@ again:
   /* see if we got a partial order... */
   if (vec_len (result) != n_features)
     return clib_error_return
-      (0, "ip%s_feature_init_cast (cast=%d), no partial order!",
-       is_ip4 ? "4" : "6", cast);
+      (0, "%d feature_init_cast (cast=%d), no partial order!", proto, cast);
 
   /*
    * We win.
@@ -308,10 +318,12 @@ again:
                    feature_nodes, vec_len (feature_nodes));
 
   /* Save a copy for show command */
-  if (is_ip4)
+  if (proto == VNET_L3_PACKET_TYPE_IP4)
     im4->feature_nodes[cast] = feature_nodes;
-  else
+  else if (proto == VNET_L3_PACKET_TYPE_IP6)
     im6->feature_nodes[cast] = feature_nodes;
+  else if (proto == VNET_L3_PACKET_TYPE_MPLS_UNICAST)
+    mm->feature_nodes = feature_nodes;
 
   /* Finally, clean up all the shit we allocated */
   /* *INDENT-OFF* */
index 2d9a15b..95ee78a 100644 (file)
@@ -39,7 +39,8 @@ clib_error_t *ip_feature_init_cast (vlib_main_t * vm,
                                    vnet_config_main_t * vcm,
                                    char **feature_start_nodes,
                                    int num_feature_start_nodes,
-                                   vnet_cast_t cast, int is_ip4);
+                                   vnet_cast_t cast,
+                                   vnet_l3_packet_type_t proto);
 
 #endif /* included_ip_feature_registration_h */
 
index 5b49aab..fefe5ff 100644 (file)
@@ -19,9 +19,6 @@
 
 typedef struct
 {
-  u32 ranges_per_adjacency;
-  u32 special_adjacency_format_function_index;
-
   /* convenience */
   vlib_main_t *vlib_main;
   vnet_main_t *vnet_main;
@@ -60,6 +57,69 @@ typedef struct
   u16x8vec_t hi;
 } protocol_port_range_t;
 
+/**
+ * @brief The number of supported ranges per-data path object.
+ * If more ranges are required, bump this number.
+ */
+#define N_PORT_RANGES_PER_DPO  64
+#define N_RANGES_PER_BLOCK (sizeof(u16x8vec_t)/2)
+#define N_BLOCKS_PER_DPO (N_PORT_RANGES_PER_DPO/N_RANGES_PER_BLOCK)
+
+/**
+ * @brief
+ *  The object that is in the data-path to perform the check.
+ *
+ * Some trade-offs here; memory vs performance.
+ *
+ * performance:
+ *  the principal factor is d-cache line misses/hits.
+ *  so we want the data layout to minimise the d-cache misses. This
+ *  means not following dependent reads. i.e. not doing
+ *
+ *   struct B {
+ *     u16 n_ranges;
+ *     range_t *ranges; // vector of ranges.
+ *   }
+ *
+ *   so to read ranges[0] we would first d-cache miss on the address
+ *   of the object of type B, for which we would need to wait before we
+ *   can get the address of B->ranges.
+ *   So this layout is better:
+ *
+ *  struct B {
+ *    u16 n_ranges;
+ *    range_t ranges[N];
+ *  }
+ *
+ * memory:
+ *  the latter layout above is more memory hungry. And N needs to be:
+ *   1 - sized for the maximum required
+ *   2 - fixed, so that objects of type B can be pool allocated and so
+ *       'get'-able using an index.
+ *       An option over fixed might be to allocate a contiguous chunk from
+ *       the pool (like we used to do for multi-path adjs).
+ */
+typedef struct protocol_port_range_dpo_t_
+{
+  /**
+   * The number of blocks from the 'blocks' array below
+   * that have ranges configured. We keep this count so that in the data-path
+   * we can limit the loop to be only over the blocks we need
+   */
+  u16 n_used_blocks;
+
+  /**
+   * The total number of free ranges from all blocks.
+   * Used to prevent overrun of the ranges available.
+   */
+  u16 n_free_ranges;
+
+  /**
+   * The fixed size array of range blocks (N_BLOCKS_PER_DPO entries)
+   */
+  protocol_port_range_t blocks[N_BLOCKS_PER_DPO];
+} protocol_port_range_dpo_t;
+
 int ip4_source_and_port_range_check_add_del (ip4_address_t * address,
                                             u32 length,
                                             u32 vrf_id,
index 4713807..a695ef7 100644 (file)
  *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include <vppinfra/math.h>             /* for fabs */
 #include <vnet/ip/ip.h>
-#include <vnet/ip/adj_alloc.h>
-
-static void
-ip_multipath_del_adjacency (ip_lookup_main_t * lm, u32 del_adj_index);
-
-always_inline void
-ip_poison_adjacencies (ip_adjacency_t * adj, uword n_adj)
-{
-  if (CLIB_DEBUG > 0)
-    {
-      u32 save_handle = adj->heap_handle;;
-      u32 save_n_adj = adj->n_adj;
-
-      memset (adj, 0xfe, n_adj * sizeof (adj[0]));
-
-      adj->heap_handle = save_handle;
-      adj->n_adj = save_n_adj;
-    }
-}
-
-static void
-ip_share_adjacency(ip_lookup_main_t * lm, u32 adj_index)
-{
-  ip_adjacency_t * adj = ip_get_adjacency(lm, adj_index);
-  uword * p;
-  u32 old_ai;
-  uword signature = vnet_ip_adjacency_signature (adj);
-
-  p = hash_get (lm->adj_index_by_signature, signature);
-  /* Hash collision? */
-  if (p)
-    {
-      /* Save the adj index, p[0] will be toast after the unset! */
-      old_ai = p[0];
-      hash_unset (lm->adj_index_by_signature, signature);
-      hash_set (lm->adj_index_by_signature, signature, adj_index);
-      adj->next_adj_with_signature = old_ai;
-    }
-  else
-    {
-      adj->next_adj_with_signature = 0;
-      hash_set (lm->adj_index_by_signature, signature, adj_index);
-    }
-}
-
-static void
-ip_unshare_adjacency(ip_lookup_main_t * lm, u32 adj_index)
-{
-  ip_adjacency_t * adj = ip_get_adjacency(lm, adj_index);
-  uword signature;
-  uword * p;
-  u32 this_ai;
-  ip_adjacency_t * this_adj, * prev_adj = 0;
-
-  signature = vnet_ip_adjacency_signature (adj);
-  p = hash_get (lm->adj_index_by_signature, signature);
-  if (p == 0)
-      return;
-
-  this_ai = p[0];
-  /* At the top of the signature chain (likely)? */
-  if (this_ai == adj_index)
-    {
-      if (adj->next_adj_with_signature == 0)
-       {
-         hash_unset (lm->adj_index_by_signature, signature);
-         return;
-       }
-      else
-       {
-         this_adj = ip_get_adjacency (lm, adj->next_adj_with_signature);
-         hash_unset (lm->adj_index_by_signature, signature);
-         hash_set (lm->adj_index_by_signature, signature,
-                   this_adj->heap_handle);
-       }
-    }
-  else                      /* walk signature chain */
-    {
-      this_adj = ip_get_adjacency (lm, this_ai);
-      while (this_adj != adj)
-       {
-         prev_adj = this_adj;
-         this_adj = ip_get_adjacency
-           (lm, this_adj->next_adj_with_signature);
-         /*
-          * This can happen when creating the first multipath adj of a set
-          * We end up looking at the miss adjacency (handle==0).
-          */
-         if (this_adj->heap_handle == 0)
-            return;
-        }
-      prev_adj->next_adj_with_signature = this_adj->next_adj_with_signature;
-    }
-}
-
-int ip_register_adjacency(vlib_main_t *vm,
-                          u8 is_ip4,
-                          ip_adj_register_t *reg)
-{
-  ip_lookup_main_t *lm = (is_ip4)?&ip4_main.lookup_main:&ip6_main.lookup_main;
-  vlib_node_t *node = vlib_get_node_by_name (vm, (u8 *) ((is_ip4)?"ip4-lookup":"ip6-lookup"));
-  vlib_node_t *next_node = vlib_get_node_by_name(vm, (u8 *) reg->node_name);
-  *reg->next_index = vlib_node_add_next (vm, node->index, next_node->index);
-  vec_validate(lm->registered_adjacencies, *reg->next_index);
-  lm->registered_adjacencies[*reg->next_index] = *reg;
-  return 0;
-}
-
-int ip_init_registered_adjacencies(u8 is_ip4)
-{
-  vlib_main_t *vm = vlib_get_main();
-  ip_lookup_main_t *lm = (is_ip4)?&ip4_main.lookup_main:&ip6_main.lookup_main;
-  ip_adj_register_t *reg = lm->registered_adjacencies;
-  lm->registered_adjacencies = 0; //Init vector
-  int rv;
-  while (reg) {
-    if((rv = ip_register_adjacency(vm, is_ip4, reg)))
-      return rv;
-    reg = reg->next;
-  }
-  return 0;
-}
-
-/* Create new block of given number of contiguous adjacencies. */
-ip_adjacency_t *
-ip_add_adjacency (ip_lookup_main_t * lm,
-                 ip_adjacency_t * copy_adj,
-                 u32 n_adj,
-                 u32 * adj_index_return)
-{
-  ip_adjacency_t * adj;
-  u32 ai, i, handle;
-
-  /* See if we know enough to attempt to share an existing adjacency */
-  if (copy_adj && n_adj == 1)
-    {
-      uword signature;
-      uword * p;
-
-      switch (copy_adj->lookup_next_index)
-        {
-        case IP_LOOKUP_NEXT_DROP:
-          if (lm->drop_adj_index)
-            {
-              adj = ip_get_adjacency (lm, lm->drop_adj_index);
-              *adj_index_return = lm->drop_adj_index;
-              return (adj);
-            }
-          break;
-
-        case IP_LOOKUP_NEXT_LOCAL:
-          if (lm->local_adj_index)
-            {
-              adj = ip_get_adjacency (lm, lm->local_adj_index);
-              *adj_index_return = lm->local_adj_index;
-              return (adj);
-            }
-        default:
-          break;
-        }
-
-      signature = vnet_ip_adjacency_signature (copy_adj);
-      p = hash_get (lm->adj_index_by_signature, signature);
-      if (p)
-        {
-          adj = vec_elt_at_index (lm->adjacency_heap, p[0]);
-          while (1)
-            {
-              if (vnet_ip_adjacency_share_compare (adj, copy_adj))
-                {
-                  adj->share_count++;
-                  *adj_index_return = p[0];
-                  return adj;
-                }
-              if (adj->next_adj_with_signature == 0)
-                break;
-              adj = vec_elt_at_index (lm->adjacency_heap,
-                                      adj->next_adj_with_signature);
-            }
-        }
-    }
-
-  lm->adjacency_heap = aa_alloc (lm->adjacency_heap, &adj, n_adj);
-  handle = ai = adj->heap_handle;
-
-  ip_poison_adjacencies (adj, n_adj);
-
-  /* Validate adjacency counters. */
-  vlib_validate_combined_counter (&lm->adjacency_counters, ai + n_adj - 1);
-
-  for (i = 0; i < n_adj; i++)
-    {
-      /* Make sure certain fields are always initialized. */
-      adj[i].rewrite_header.sw_if_index = ~0;
-      adj[i].explicit_fib_index = ~0;
-      adj[i].mcast_group_index = ~0;
-      adj[i].classify.table_index = ~0;
-      adj[i].saved_lookup_next_index = 0;
-      adj[i].special_adjacency_format_function_index = 0;
-
-      if (copy_adj)
-        adj[i] = copy_adj[i];
-
-      adj[i].heap_handle = handle;
-      adj[i].n_adj = n_adj;
-      adj[i].share_count = 0;
-      adj[i].next_adj_with_signature = 0;
-
-      /* Zero possibly stale counters for re-used adjacencies. */
-      vlib_zero_combined_counter (&lm->adjacency_counters, ai + i);
-    }
-
-  /* Set up to share the adj later */
-  if (copy_adj && n_adj == 1)
-    ip_share_adjacency(lm, ai);
-
-  *adj_index_return = ai;
-  return adj;
-}
-
-void
-ip_update_adjacency (ip_lookup_main_t * lm,
-                    u32 adj_index,
-                    ip_adjacency_t * copy_adj)
-{
-  ip_adjacency_t * adj = ip_get_adjacency(lm, adj_index);
-
-  ip_call_add_del_adjacency_callbacks (lm, adj_index, /* is_del */ 1);
-  ip_unshare_adjacency(lm, adj_index);
-
-  /* temporary redirect to drop while updating rewrite data */
-  adj->lookup_next_index = IP_LOOKUP_NEXT_ARP;
-  CLIB_MEMORY_BARRIER();
-
-  clib_memcpy (&adj->rewrite_header, &copy_adj->rewrite_header,
-              VLIB_BUFFER_PRE_DATA_SIZE);
-  adj->lookup_next_index = copy_adj->lookup_next_index;
-  ip_share_adjacency(lm, adj_index);
-  ip_call_add_del_adjacency_callbacks (lm, adj_index, /* is_del */ 0);
-}
-
-static void ip_del_adjacency2 (ip_lookup_main_t * lm, u32 adj_index, u32 delete_multipath_adjacency)
-{
-  ip_adjacency_t * adj;
-
-  ip_call_add_del_adjacency_callbacks (lm, adj_index, /* is_del */ 1);
-
-  adj = ip_get_adjacency (lm, adj_index);
-
-  /* Special-case miss, local, drop adjs */
-  if (adj_index < 3)
-      return;
-
-  if (adj->n_adj == 1)
-    {
-      if (adj->share_count > 0)
-        {
-          adj->share_count --;
-          return;
-        }
-
-      ip_unshare_adjacency(lm, adj_index);
-    }
-
-  if (delete_multipath_adjacency)
-    ip_multipath_del_adjacency (lm, adj_index);
-
-  ip_poison_adjacencies (adj, adj->n_adj);
-
-  aa_free (lm->adjacency_heap, adj);
-}
-
-void ip_del_adjacency (ip_lookup_main_t * lm, u32 adj_index)
-{ ip_del_adjacency2 (lm, adj_index, /* delete_multipath_adjacency */ 1); }
-
-static int
-next_hop_sort_by_weight (ip_multipath_next_hop_t * n1,
-                        ip_multipath_next_hop_t * n2)
-{
-  int cmp = (int) n1->weight - (int) n2->weight;
-  return (cmp == 0
-         ? (int) n1->next_hop_adj_index - (int) n2->next_hop_adj_index
-         : (cmp > 0 ? +1 : -1));
-}
-
-/* Given next hop vector is over-written with normalized one with sorted weights and
-   with weights corresponding to the number of adjacencies for each next hop.
-   Returns number of adjacencies in block. */
-static u32 ip_multipath_normalize_next_hops (ip_lookup_main_t * lm,
-                                            ip_multipath_next_hop_t * raw_next_hops,
-                                            ip_multipath_next_hop_t ** normalized_next_hops)
-{
-  ip_multipath_next_hop_t * nhs;
-  uword n_nhs, n_adj, n_adj_left, i;
-  f64 sum_weight, norm, error;
-
-  n_nhs = vec_len (raw_next_hops);
-  ASSERT (n_nhs > 0);
-  if (n_nhs == 0)
-    return 0;
-
-  /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */
-  nhs = *normalized_next_hops;
-  vec_validate (nhs, 2*n_nhs - 1);
-
-  /* Fast path: 1 next hop in block. */
-  n_adj = n_nhs;
-  if (n_nhs == 1)
-    {
-      nhs[0] = raw_next_hops[0];
-      nhs[0].weight = 1;
-      _vec_len (nhs) = 1;
-      goto done;
-    }
-
-  else if (n_nhs == 2)
-    {
-      int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;
-
-      /* Fast sort. */
-      nhs[0] = raw_next_hops[cmp];
-      nhs[1] = raw_next_hops[cmp ^ 1];
-
-      /* Fast path: equal cost multipath with 2 next hops. */
-      if (nhs[0].weight == nhs[1].weight)
-       {
-         nhs[0].weight = nhs[1].weight = 1;
-         _vec_len (nhs) = 2;
-         goto done;
-       }
-    }
-  else
-    {
-      clib_memcpy (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
-      qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
-    }
-
-  /* Find total weight to normalize weights. */
-  sum_weight = 0;
-  for (i = 0; i < n_nhs; i++)
-    sum_weight += nhs[i].weight;
-
-  /* In the unlikely case that all weights are given as 0, set them all to 1. */
-  if (sum_weight == 0)
-    {
-      for (i = 0; i < n_nhs; i++)
-       nhs[i].weight = 1;
-      sum_weight = n_nhs;
-    }
-
-  /* Save copies of all next hop weights to avoid being overwritten in loop below. */
-  for (i = 0; i < n_nhs; i++)
-    nhs[n_nhs + i].weight = nhs[i].weight;
-
-  /* Try larger and larger power of 2 sized adjacency blocks until we
-     find one where traffic flows to within 1% of specified weights. */
-  for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
-    {
-      error = 0;
-
-      norm = n_adj / sum_weight;
-      n_adj_left = n_adj;
-      for (i = 0; i < n_nhs; i++)
-       {
-         f64 nf = nhs[n_nhs + i].weight * norm; /* use saved weights */
-         word n = flt_round_nearest (nf);
-
-         n = n > n_adj_left ? n_adj_left : n;
-         n_adj_left -= n;
-         error += fabs (nf - n);
-         nhs[i].weight = n;
-       }
-       
-      nhs[0].weight += n_adj_left;
-
-      /* Less than 5% average error per adjacency with this size adjacency block? */
-      if (error <= lm->multipath_next_hop_error_tolerance*n_adj)
-       {
-         /* Truncate any next hops with zero weight. */
-         _vec_len (nhs) = i;
-         break;
-       }
-    }
-
- done:
-  /* Save vector for next call. */
-  *normalized_next_hops = nhs;
-  return n_adj;
-}
-
-always_inline uword
-ip_next_hop_hash_key_from_handle (uword handle)
-{ return 1 + 2*handle; }
-
-always_inline uword
-ip_next_hop_hash_key_is_heap_handle (uword k)
-{ return k & 1; }
-
-always_inline uword
-ip_next_hop_hash_key_get_heap_handle (uword k)
-{
-  ASSERT (ip_next_hop_hash_key_is_heap_handle (k));
-  return k / 2;
-}
-
-static u32
-ip_multipath_adjacency_get (ip_lookup_main_t * lm,
-                           ip_multipath_next_hop_t * raw_next_hops,
-                           uword create_if_non_existent)
-{
-  uword * p;
-  u32 i, j, n_adj, adj_index, adj_heap_handle;
-  ip_adjacency_t * adj, * copy_adj;
-  ip_multipath_next_hop_t * nh, * nhs;
-  ip_multipath_adjacency_t * madj;
-
-  n_adj = ip_multipath_normalize_next_hops (lm, raw_next_hops, &lm->next_hop_hash_lookup_key_normalized);
-  nhs = lm->next_hop_hash_lookup_key_normalized;
-
-  /* Basic sanity. */
-  ASSERT (n_adj >= vec_len (raw_next_hops));
-
-  /* Use normalized next hops to see if we've seen a block equivalent to this one before. */
-  p = hash_get_mem (lm->multipath_adjacency_by_next_hops, nhs);
-  if (p)
-    return p[0];
-
-  if (! create_if_non_existent)
-    return 0;
-
-  adj = ip_add_adjacency (lm, /* copy_adj */ 0, n_adj, &adj_index);
-  adj_heap_handle = adj[0].heap_handle;
-
-  /* Fill in adjacencies in block based on corresponding next hop adjacencies. */
-  i = 0;
-  vec_foreach (nh, nhs)
-    {
-      copy_adj = ip_get_adjacency (lm, nh->next_hop_adj_index);
-      for (j = 0; j < nh->weight; j++)
-       {
-         adj[i] = copy_adj[0];
-         adj[i].heap_handle = adj_heap_handle;
-         adj[i].n_adj = n_adj;
-         i++;
-       }
-    }
-
-  /* All adjacencies should have been initialized. */
-  ASSERT (i == n_adj);
-
-  vec_validate (lm->multipath_adjacencies, adj_heap_handle);
-  madj = vec_elt_at_index (lm->multipath_adjacencies, adj_heap_handle);
-
-  madj->adj_index = adj_index;
-  madj->n_adj_in_block = n_adj;
-  madj->reference_count = 0;   /* caller will set to one. */
-
-  madj->normalized_next_hops.count = vec_len (nhs);
-  madj->normalized_next_hops.heap_offset
-    = heap_alloc (lm->next_hop_heap, vec_len (nhs),
-                 madj->normalized_next_hops.heap_handle);
-  clib_memcpy (lm->next_hop_heap + madj->normalized_next_hops.heap_offset,
-         nhs, vec_bytes (nhs));
-
-  hash_set (lm->multipath_adjacency_by_next_hops,
-           ip_next_hop_hash_key_from_handle (madj->normalized_next_hops.heap_handle),
-           madj - lm->multipath_adjacencies);
-
-  madj->unnormalized_next_hops.count = vec_len (raw_next_hops);
-  madj->unnormalized_next_hops.heap_offset
-    = heap_alloc (lm->next_hop_heap, vec_len (raw_next_hops),
-                 madj->unnormalized_next_hops.heap_handle);
-  clib_memcpy (lm->next_hop_heap + madj->unnormalized_next_hops.heap_offset,
-         raw_next_hops, vec_bytes (raw_next_hops));
-
-  ip_call_add_del_adjacency_callbacks (lm, adj_index, /* is_del */ 0);
-
-  return adj_heap_handle;
-}
-
-/* Returns 0 for next hop not found. */
-u32
-ip_multipath_adjacency_add_del_next_hop (ip_lookup_main_t * lm,
-                                        u32 is_del,
-                                        u32 old_mp_adj_index,
-                                        u32 next_hop_adj_index,
-                                        u32 next_hop_weight,
-                                        u32 * new_mp_adj_index)
-{
-  ip_multipath_adjacency_t * mp_old, * mp_new;
-  ip_multipath_next_hop_t * nh, * nhs, * hash_nhs;
-  u32 n_nhs, i_nh;
-
-  mp_new = mp_old = 0;
-  n_nhs = 0;
-  i_nh = 0;
-  nhs = 0;
-
-  /* If old adj is not multipath, we need to "convert" it by calling this
-   * function recursively */
-  if (old_mp_adj_index != ~0 && !ip_adjacency_is_multipath(lm, old_mp_adj_index))
-    {
-      ip_multipath_adjacency_add_del_next_hop(lm, /* is_del */ 0,
-                                             /* old_mp_adj_index */ ~0,
-                                             /* nh_adj_index */ old_mp_adj_index,
-                                             /* weight * */ 1,
-                                             &old_mp_adj_index);
-    }
-
-  /* If old multipath adjacency is valid, find requested next hop. */
-  if (old_mp_adj_index < vec_len (lm->multipath_adjacencies)
-      && lm->multipath_adjacencies[old_mp_adj_index].normalized_next_hops.count > 0)
-    {
-      mp_old = vec_elt_at_index (lm->multipath_adjacencies, old_mp_adj_index);
-       
-      nhs = vec_elt_at_index (lm->next_hop_heap, mp_old->unnormalized_next_hops.heap_offset);
-      n_nhs = mp_old->unnormalized_next_hops.count;
-
-      /* Linear search: ok since n_next_hops is small. */
-      for (i_nh = 0; i_nh < n_nhs; i_nh++)
-       if (nhs[i_nh].next_hop_adj_index == next_hop_adj_index)
-         break;
-
-      /* Given next hop not found. */
-      if (i_nh >= n_nhs && is_del)
-       return 0;
-    }
-
-  hash_nhs = lm->next_hop_hash_lookup_key;
-  if (hash_nhs)
-    _vec_len (hash_nhs) = 0;
-
-  if (is_del)
-    {
-      if (n_nhs > 1)
-       {
-         /* Prepare lookup key for multipath with target next hop deleted. */
-         if (i_nh > 0)
-           vec_add (hash_nhs, nhs + 0, i_nh);
-         if (i_nh + 1 < n_nhs)
-           vec_add (hash_nhs, nhs + i_nh + 1, n_nhs - (i_nh + 1));
-       }
-    }
-  else /* it's an add. */
-    {
-      /* If next hop is already there with the same weight, we have nothing to do. */
-      if (i_nh < n_nhs && nhs[i_nh].weight == next_hop_weight)
-       {
-         new_mp_adj_index[0] = ~0;
-         goto done;
-       }
-
-      /* Copy old next hops to lookup key vector. */
-      if (n_nhs > 0)
-       vec_add (hash_nhs, nhs, n_nhs);
-
-      if (i_nh < n_nhs)
-       {
-         /* Change weight of existing next hop. */
-         nh = vec_elt_at_index (hash_nhs, i_nh);
-       }
-      else
-       {
-         /* Add a new next hop. */
-         vec_add2 (hash_nhs, nh, 1);
-         nh->next_hop_adj_index = next_hop_adj_index;
-       }
-
-      /* Set weight for added or old next hop. */
-      nh->weight = next_hop_weight;
-    }
-
-  if (vec_len (hash_nhs) > 0)
-    {
-      u32 tmp = ip_multipath_adjacency_get (lm, hash_nhs,
-                                           /* create_if_non_existent */ 1);
-      if (tmp != ~0)
-       mp_new = vec_elt_at_index (lm->multipath_adjacencies, tmp);
-
-      /* Fetch again since pool may have moved. */
-      if (mp_old)
-       mp_old = vec_elt_at_index (lm->multipath_adjacencies, old_mp_adj_index);
-    }
-
-  new_mp_adj_index[0] = mp_new ? mp_new - lm->multipath_adjacencies : ~0;
-
-  if (mp_new != mp_old)
-    {
-      if (mp_old)
-       {
-         ASSERT (mp_old->reference_count > 0);
-         mp_old->reference_count -= 1;
-       }
-      if (mp_new)
-       mp_new->reference_count += 1;
-    }
-
-  if (mp_old && mp_old->reference_count == 0)
-    ip_multipath_adjacency_free (lm, mp_old);
-
- done:
-  /* Save key vector next call. */
-  lm->next_hop_hash_lookup_key = hash_nhs;
-
-  return 1;
-}
-
-static void
-ip_multipath_del_adjacency (ip_lookup_main_t * lm, u32 del_adj_index)
-{
-  ip_adjacency_t * adj = ip_get_adjacency (lm, del_adj_index);
-  ip_multipath_adjacency_t * madj, * new_madj;
-  ip_multipath_next_hop_t * nhs, * hash_nhs;
-  u32 i, n_nhs, madj_index, new_madj_index;
-
-  if (adj->heap_handle >= vec_len (lm->multipath_adjacencies))
-    return;
-
-  vec_validate (lm->adjacency_remap_table, vec_len (lm->adjacency_heap) - 1);
-
-  for (madj_index = 0; madj_index < vec_len (lm->multipath_adjacencies); madj_index++)
-    {
-      madj = vec_elt_at_index (lm->multipath_adjacencies, madj_index);
-      if (madj->n_adj_in_block == 0)
-       continue;
-
-      nhs = heap_elt_at_index (lm->next_hop_heap, madj->unnormalized_next_hops.heap_offset);
-      n_nhs = madj->unnormalized_next_hops.count;
-      for (i = 0; i < n_nhs; i++)
-       if (nhs[i].next_hop_adj_index == del_adj_index)
-         break;
-
-      /* del_adj_index not found in unnormalized_next_hops?  We're done. */
-      if (i >= n_nhs)
-       continue;
-
-      new_madj = 0;
-      if (n_nhs > 1)
-       {
-         hash_nhs = lm->next_hop_hash_lookup_key;
-         if (hash_nhs)
-           _vec_len (hash_nhs) = 0;
-         if (i > 0)
-           vec_add (hash_nhs, nhs + 0, i);
-         if (i + 1 < n_nhs)
-           vec_add (hash_nhs, nhs + i + 1, n_nhs - (i + 1));
-
-         new_madj_index = ip_multipath_adjacency_get (lm, hash_nhs, /* create_if_non_existent */ 1);
-
-         lm->next_hop_hash_lookup_key = hash_nhs;
-
-         if (new_madj_index == madj_index)
-           continue;
-
-         new_madj = vec_elt_at_index (lm->multipath_adjacencies, new_madj_index);
-       }
-
-      lm->adjacency_remap_table[madj->adj_index] = new_madj ? 1 + new_madj->adj_index : ~0;
-      lm->n_adjacency_remaps += 1;
-      ip_multipath_adjacency_free (lm, madj);
-    }
-}
-
-void
-ip_multipath_adjacency_free (ip_lookup_main_t * lm,
-                            ip_multipath_adjacency_t * a)
-{
-  hash_unset (lm->multipath_adjacency_by_next_hops,
-             ip_next_hop_hash_key_from_handle (a->normalized_next_hops.heap_handle));
-  heap_dealloc (lm->next_hop_heap, a->normalized_next_hops.heap_handle);
-  heap_dealloc (lm->next_hop_heap, a->unnormalized_next_hops.heap_handle);
-
-  ip_del_adjacency2 (lm, a->adj_index, a->reference_count == 0);
-  memset (a, 0, sizeof (a[0]));
-}
-
-always_inline ip_multipath_next_hop_t *
-ip_next_hop_hash_key_get_next_hops (ip_lookup_main_t * lm, uword k,
-                                   uword * n_next_hops)
-{
-  ip_multipath_next_hop_t * nhs;
-  uword n_nhs;
-  if (ip_next_hop_hash_key_is_heap_handle (k))
-    {
-      uword handle = ip_next_hop_hash_key_get_heap_handle (k);
-      nhs = heap_elt_with_handle (lm->next_hop_heap, handle);
-      n_nhs = heap_len (lm->next_hop_heap, handle);
-    }
-  else
-    {
-      nhs = uword_to_pointer (k, ip_multipath_next_hop_t *);
-      n_nhs = vec_len (nhs);
-    }
-  *n_next_hops = n_nhs;
-  return nhs;
-}
-
-static uword
-ip_next_hop_hash_key_sum (hash_t * h, uword key0)
-{
-  ip_lookup_main_t * lm = uword_to_pointer (h->user, ip_lookup_main_t *);  
-  ip_multipath_next_hop_t * k0;
-  uword n0;
-
-  k0 = ip_next_hop_hash_key_get_next_hops (lm, key0, &n0);
-  return hash_memory (k0, n0 * sizeof (k0[0]), /* seed */ n0);
-}
-
-static uword
-ip_next_hop_hash_key_equal (hash_t * h, uword key0, uword key1)
-{
-  ip_lookup_main_t * lm = uword_to_pointer (h->user, ip_lookup_main_t *);  
-  ip_multipath_next_hop_t * k0, * k1;
-  uword n0, n1;
-
-  k0 = ip_next_hop_hash_key_get_next_hops (lm, key0, &n0);
-  k1 = ip_next_hop_hash_key_get_next_hops (lm, key1, &n1);
-
-  return n0 == n1 && ! memcmp (k0, k1, n0 * sizeof (k0[0]));
-}
+#include <vnet/adj/adj_alloc.h>
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/fib/ip6_fib.h>
+#include <vnet/mpls/mpls.h>
+#include <vnet/dpo/drop_dpo.h>
+#include <vnet/dpo/classify_dpo.h>
+#include <vnet/dpo/punt_dpo.h>
+#include <vnet/dpo/receive_dpo.h>
 
 clib_error_t *
 ip_interface_address_add_del (ip_lookup_main_t * lm,
@@ -869,52 +157,16 @@ ip_interface_address_add_del (ip_lookup_main_t * lm,
 
 void ip_lookup_init (ip_lookup_main_t * lm, u32 is_ip6)
 {
-  ip_adjacency_t * adj;
-  ip_adjacency_t template_adj;
-
   /* ensure that adjacency is cacheline aligned and sized */
   ASSERT(STRUCT_OFFSET_OF(ip_adjacency_t, cacheline0) == 0);
   ASSERT(STRUCT_OFFSET_OF(ip_adjacency_t, cacheline1) == CLIB_CACHE_LINE_BYTES);
 
-  lm->adj_index_by_signature = hash_create (0, sizeof (uword));
-  memset (&template_adj, 0, sizeof (template_adj));
-
   /* Preallocate three "special" adjacencies */
-  lm->adjacency_heap = aa_bootstrap (0, 3 /* n=1 free items */);
-
-  /* Hand-craft special miss adjacency to use when nothing matches in the
-     routing table.  Same for drop adjacency. */
-  adj = ip_add_adjacency (lm, /* template */ 0, /* n-adj */ 1, 
-                          &lm->miss_adj_index);
-  adj->lookup_next_index = IP_LOOKUP_NEXT_MISS;
-  ASSERT (lm->miss_adj_index == IP_LOOKUP_MISS_ADJ_INDEX);
-
-  /* Make the "drop" adj sharable */
-  template_adj.lookup_next_index = IP_LOOKUP_NEXT_DROP;
-  adj = ip_add_adjacency (lm, &template_adj, /* n-adj */ 1, 
-                          &lm->drop_adj_index);
-
-  /* Make the "local" adj sharable */
-  template_adj.lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
-  template_adj.if_address_index = ~0;
-  adj = ip_add_adjacency (lm, &template_adj, /* n-adj */ 1, 
-                          &lm->local_adj_index);
+  lm->adjacency_heap = adj_heap;
 
   if (! lm->fib_result_n_bytes)
     lm->fib_result_n_bytes = sizeof (uword);
 
-  lm->multipath_adjacency_by_next_hops
-    = hash_create2 (/* elts */ 0,
-                   /* user */ pointer_to_uword (lm),
-                   /* value_bytes */ sizeof (uword),
-                   ip_next_hop_hash_key_sum,
-                   ip_next_hop_hash_key_equal,
-                   /* format pair/arg */
-                   0, 0);
-
-  /* 1% max error tolerance for multipath. */
-  lm->multipath_next_hop_error_tolerance = .01;
-
   lm->is_ip6 = is_ip6;
   if (is_ip6)
     {
@@ -944,14 +196,12 @@ void ip_lookup_init (ip_lookup_main_t * lm, u32 is_ip6)
     lm->builtin_protocol_by_ip_protocol[IP_PROTOCOL_UDP] = IP_BUILTIN_PROTOCOL_UDP;
     lm->builtin_protocol_by_ip_protocol[is_ip6 ? IP_PROTOCOL_ICMP6 : IP_PROTOCOL_ICMP] = IP_BUILTIN_PROTOCOL_ICMP;
   }
-
-  ip_init_registered_adjacencies(!is_ip6);
 }
 
 u8 * format_ip_flow_hash_config (u8 * s, va_list * args)
 {
-  u32 flow_hash_config = va_arg (*args, u32);
-
+  flow_hash_config_t flow_hash_config = va_arg (*args, u32);
+    
 #define _(n,v) if (flow_hash_config & v) s = format (s, "%s ", #n);
   foreach_flow_hash_bit;
 #undef _
@@ -961,31 +211,20 @@ u8 * format_ip_flow_hash_config (u8 * s, va_list * args)
 
 u8 * format_ip_lookup_next (u8 * s, va_list * args)
 {
-  ip_lookup_main_t * lm = va_arg (*args, ip_lookup_main_t *);
-  ip_lookup_next_t n = va_arg (*args, u32);
-  ip_adj_register_t *reg;
-
+  ip_lookup_next_t n = va_arg (*args, ip_lookup_next_t);
   char * t = 0;
 
   switch (n)
     {
     default:
-      vec_validate(lm->registered_adjacencies, n);
-      reg = vec_elt_at_index(lm->registered_adjacencies, n);
-      if (reg->node_name) {
-        s = format (s, "%s:", reg->node_name);
-      }
+      s = format (s, "unknown %d", n);
       return s;
 
-    case IP_LOOKUP_NEXT_MISS: t = "miss"; break;
     case IP_LOOKUP_NEXT_DROP: t = "drop"; break;
     case IP_LOOKUP_NEXT_PUNT: t = "punt"; break;
-    case IP_LOOKUP_NEXT_LOCAL: t = "local"; break;
     case IP_LOOKUP_NEXT_ARP: t = "arp"; break;
-    case IP_LOOKUP_NEXT_CLASSIFY: t = "classify"; break;
-    case IP_LOOKUP_NEXT_MAP: t = "map"; break;
-    case IP_LOOKUP_NEXT_MAP_T: t = "map-t"; break;
-    case IP_LOOKUP_NEXT_INDIRECT: t="indirect"; break;
+    case IP_LOOKUP_NEXT_MIDCHAIN: t="midchain"; break;
+    case IP_LOOKUP_NEXT_GLEAN: t="glean"; break;
     case IP_LOOKUP_NEXT_REWRITE:
       break;
     }
@@ -996,120 +235,13 @@ u8 * format_ip_lookup_next (u8 * s, va_list * args)
   return s;
 }
 
-static u8 * format_ip_interface_address (u8 * s, va_list * args)
-{
-  ip_lookup_main_t * lm = va_arg (*args, ip_lookup_main_t *);
-  u32 if_address_index = va_arg (*args, u32);
-  ip_interface_address_t * ia = pool_elt_at_index (lm->if_address_pool, if_address_index);
-  void * a = ip_interface_address_get_address (lm, ia);
-
-  if (lm->is_ip6)
-    return format (s, "%U", format_ip6_address_and_length, a, ia->address_length);
-  else
-    return format (s, "%U", format_ip4_address_and_length, a, ia->address_length);
-}
-
-u32 vnet_register_special_adjacency_format_function
-(ip_lookup_main_t * lm, format_function_t * fp)
-{
-    u32 rv;
-    /*
-     * Initialize the format function registration vector
-     * Index 0 must be invalid, to avoid finding and fixing trivial bugs
-     * all over the place
-     */
-    if (vec_len (lm->special_adjacency_format_functions) == 0)
-      {
-        vec_add1 (lm->special_adjacency_format_functions,
-                  (format_function_t *) 0);
-      }
-
-    rv = vec_len (lm->special_adjacency_format_functions);
-    vec_add1 (lm->special_adjacency_format_functions, fp);
-    return rv;
-}
-
-/** @brief Pretty print helper function for formatting specific adjacencies.
-    @param s - input string to format
-    @param args - other args passed to format function such as:
-                  - vnet_main_t
-                  - ip_lookup_main_t
-                  - adj_index
-*/
-u8 * format_ip_adjacency (u8 * s, va_list * args)
-{
-  vnet_main_t * vnm = va_arg (*args, vnet_main_t *);
-  ip_lookup_main_t * lm = va_arg (*args, ip_lookup_main_t *);
-  u32 adj_index = va_arg (*args, u32);
-  ip_adjacency_t * adj = ip_get_adjacency (lm, adj_index);
-  ip_adj_register_t *reg;
-
-  if (adj->lookup_next_index < vec_len (lm->registered_adjacencies))
-    {
-      reg = vec_elt_at_index(lm->registered_adjacencies, 
-                            adj->lookup_next_index);
-      if (reg->fn) 
-       {
-         s = format(s, " %U", reg->fn, lm, adj);
-         goto format_done;
-       }
-    }
-  
-  switch (adj->lookup_next_index)
-    {
-    case IP_LOOKUP_NEXT_REWRITE:
-      s = format (s, "%U",
-                 format_vnet_rewrite,
-                 vnm->vlib_main, &adj->rewrite_header, 
-                 sizeof (adj->rewrite_data));
-      break;
-      
-    case IP_LOOKUP_NEXT_ARP:
-      if (adj->if_address_index != ~0)
-       s = format (s, " %U", format_ip_interface_address, lm, 
-                   adj->if_address_index);
-      if (adj->arp.next_hop.ip6.as_u64[0] || adj->arp.next_hop.ip6.as_u64[1])
-       s = format (s, " via %U", format_ip46_address,
-                   &adj->arp.next_hop, IP46_TYPE_ANY);
-      break;
-    case IP_LOOKUP_NEXT_LOCAL:
-      if (adj->if_address_index != ~0)
-       s = format (s, " %U", format_ip_interface_address, lm, 
-                   adj->if_address_index);
-      break;
-      
-    case IP_LOOKUP_NEXT_CLASSIFY:
-      s = format (s, " table %d", adj->classify.table_index);
-      break;
-    case IP_LOOKUP_NEXT_INDIRECT:
-      s = format (s, " via %U", format_ip46_address,
-                 &adj->indirect.next_hop, IP46_TYPE_ANY);
-      break;
-      
-    default:
-      s = format (s, " unknown %d", adj->lookup_next_index);
-      break;
-    }
-
- format_done:
-  if (adj->explicit_fib_index != ~0 && adj->explicit_fib_index != 0)
-    s = format (s, " lookup fib index %d", adj->explicit_fib_index);
-  if (adj->share_count > 0)
-    s = format (s, " shared %d", adj->share_count + 1);
-  if (adj->next_adj_with_signature)
-    s = format (s, " next_adj_with_signature %d", adj->next_adj_with_signature);
-
-  return s;
-}
-
 u8 * format_ip_adjacency_packet_data (u8 * s, va_list * args)
 {
   vnet_main_t * vnm = va_arg (*args, vnet_main_t *);
-  ip_lookup_main_t * lm = va_arg (*args, ip_lookup_main_t *);
   u32 adj_index = va_arg (*args, u32);
   u8 * packet_data = va_arg (*args, u8 *);
   u32 n_packet_data_bytes = va_arg (*args, u32);
-  ip_adjacency_t * adj = ip_get_adjacency (lm, adj_index);
+  ip_adjacency_t * adj = adj_get(adj_index);
 
   switch (adj->lookup_next_index)
     {
@@ -1126,119 +258,90 @@ u8 * format_ip_adjacency_packet_data (u8 * s, va_list * args)
   return s;
 }
 
-static uword unformat_ip_lookup_next (unformat_input_t * input, va_list * args)
+static uword unformat_dpo (unformat_input_t * input, va_list * args)
 {
-  ip_lookup_next_t * result = va_arg (*args, ip_lookup_next_t *);
-  ip_lookup_next_t n;
+  dpo_id_t *dpo = va_arg (*args, dpo_id_t *);
+  fib_protocol_t fp = va_arg (*args, int);
+  dpo_proto_t proto;
 
-  if (unformat (input, "drop"))
-    n = IP_LOOKUP_NEXT_DROP;
+  proto = fib_proto_to_dpo(fp);
 
+  if (unformat (input, "drop"))
+    dpo_copy(dpo, drop_dpo_get(proto));
   else if (unformat (input, "punt"))
-    n = IP_LOOKUP_NEXT_PUNT;
-
+    dpo_copy(dpo, punt_dpo_get(proto));
   else if (unformat (input, "local"))
-    n = IP_LOOKUP_NEXT_LOCAL;
-
-  else if (unformat (input, "arp"))
-    n = IP_LOOKUP_NEXT_ARP;
-
+    receive_dpo_add_or_lock(proto, ~0, NULL, dpo);
   else if (unformat (input, "classify"))
-    n = IP_LOOKUP_NEXT_CLASSIFY;
-
-  else
-    return 0;
-    
-  *result = n;
-  return 1;
-}
-
-static uword unformat_ip_adjacency (unformat_input_t * input, va_list * args)
-{
-  vlib_main_t * vm = va_arg (*args, vlib_main_t *);
-  ip_adjacency_t * adj = va_arg (*args, ip_adjacency_t *);
-  u32 node_index = va_arg (*args, u32);
-  vnet_main_t * vnm = vnet_get_main();
-  u32 sw_if_index, is_ip6;
-  ip46_address_t a46;
-  ip_lookup_next_t next;
-
-  is_ip6 = node_index == ip6_rewrite_node.index;
-  adj->rewrite_header.node_index = node_index;
-  adj->explicit_fib_index = ~0;
-
-  if (unformat (input, "arp %U %U",
-               unformat_vnet_sw_interface, vnm, &sw_if_index,
-               unformat_ip46_address, &a46, is_ip6?IP46_TYPE_IP6:IP46_TYPE_IP4))
     {
-      ip_lookup_main_t * lm = is_ip6 ? &ip6_main.lookup_main : &ip4_main.lookup_main;
-      ip_adjacency_t * a_adj;
-      u32 adj_index;
-
-      if (is_ip6)
-       adj_index = ip6_fib_lookup (&ip6_main, sw_if_index, &a46.ip6);
-      else
-       adj_index = ip4_fib_lookup (&ip4_main, sw_if_index, &a46.ip4);
+      u32 classify_table_index;
 
-      a_adj = ip_get_adjacency (lm, adj_index);
-
-      if (a_adj->rewrite_header.sw_if_index != sw_if_index)
-       return 0;
+      if (!unformat (input, "%d", &classify_table_index))
+        {
+         clib_warning ("classify adj must specify table index");
+          return 0;
+       }
 
-      if (is_ip6)
-       ip6_adjacency_set_interface_route (vnm, adj, sw_if_index, a_adj->if_address_index);
-      else
-       ip4_adjacency_set_interface_route (vnm, adj, sw_if_index, a_adj->if_address_index);
+      dpo_set(dpo, DPO_CLASSIFY, proto,
+              classify_dpo_create(fp, classify_table_index));
     }
+  else
+    return 0;
 
-  else if (unformat_user (input, unformat_ip_lookup_next, &next))
-    {
-      adj->lookup_next_index = next;
-      adj->if_address_index = ~0;
-      if (next == IP_LOOKUP_NEXT_LOCAL)
-        (void) unformat (input, "%d", &adj->if_address_index);
-      else if (next == IP_LOOKUP_NEXT_CLASSIFY)
-        {
-          if (!unformat (input, "%d", &adj->classify.table_index))
-            {
-              clib_warning ("classify adj must specify table index");
-              return 0;
-            }
-        }
-      else if (next == IP_LOOKUP_NEXT_DROP)
-        {
-          adj->rewrite_header.node_index = 0;
-        }
-    }
+  return 1;
+}
 
-  else if (unformat_user (input,
-                         unformat_vnet_rewrite,
-                         vm, &adj->rewrite_header, sizeof (adj->rewrite_data)))
-    adj->lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
+const ip46_address_t zero_addr = {
+    .as_u64 = {
+       0, 0
+    },
+};
 
-  else
-    return 0;
+u32
+fib_table_id_find_fib_index (fib_protocol_t proto,
+                            u32 table_id)
+{
+    ip4_main_t *im4 = &ip4_main;
+    ip6_main_t *im6 = &ip6_main;
+    uword * p;
 
-  return 1;
+    switch (proto)
+    {
+    case FIB_PROTOCOL_IP4:
+       p = hash_get(im4->fib_index_by_table_id, table_id);
+       break;
+    case FIB_PROTOCOL_IP6:
+       p = hash_get(im6->fib_index_by_table_id, table_id);
+       break;
+    default:
+       p = NULL;
+       break;
+    }
+    if (NULL != p)
+    {
+       return (p[0]);
+    }
+    return (~0);
 }
 
 clib_error_t *
-vnet_ip_route_cmd (vlib_main_t * vm, unformat_input_t * main_input, vlib_cli_command_t * cmd)
+vnet_ip_route_cmd (vlib_main_t * vm,
+                  unformat_input_t * main_input,
+                  vlib_cli_command_t * cmd)
 {
-  vnet_main_t * vnm = vnet_get_main();
-  clib_error_t * error = 0;
-  u32 table_id, is_del;
-  u32 weight, * weights = 0;
-  u32 * table_ids = 0;
-  u32 sw_if_index, * sw_if_indices = 0;
-  ip4_address_t ip4_addr, * ip4_dst_addresses = 0, * ip4_via_next_hops = 0;
-  ip6_address_t ip6_addr, * ip6_dst_addresses = 0, * ip6_via_next_hops = 0;
-  u32 dst_address_length, * dst_address_lengths = 0;
-  ip_adjacency_t parse_adj, * add_adj = 0;
   unformat_input_t _line_input, * line_input = &_line_input;
+  fib_route_path_t *rpaths = NULL, rpath;
+  dpo_id_t dpo = DPO_NULL, *dpos = NULL;
+  fib_prefix_t *prefixs = NULL, pfx;
+  clib_error_t * error = NULL;
+  mpls_label_t out_label;
+  u32 table_id, is_del;
+  vnet_main_t * vnm;
+  u32 fib_index;
   f64 count;
-  u32 outer_table_id;
+  int i;
 
+  vnm = vnet_get_main();
   is_del = 0;
   table_id = 0;
   count = 1;
@@ -1247,410 +350,311 @@ vnet_ip_route_cmd (vlib_main_t * vm, unformat_input_t * main_input, vlib_cli_com
   if (! unformat_user (main_input, unformat_line_input, line_input))
     return 0;
 
-  memset(&parse_adj, 0, sizeof (parse_adj));
-
   while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
     {
+      memset(&rpath, 0, sizeof(rpath));
+      memset(&pfx, 0, sizeof(pfx));
+
       if (unformat (line_input, "table %d", &table_id))
        ;
       else if (unformat (line_input, "del"))
        is_del = 1;
       else if (unformat (line_input, "add"))
        is_del = 0;
+      else if (unformat (line_input, "resolve-via-host"))
+      {
+         if (vec_len(rpaths) == 0)
+         {
+             error = clib_error_return(0 , "Paths then flags");
+             goto done;
+         }
+         rpaths[vec_len(rpaths)-1].frp_flags |= FIB_ROUTE_PATH_RESOLVE_VIA_HOST;
+      }
+      else if (unformat (line_input, "resolve-via-attached"))
+      {
+         if (vec_len(rpaths) == 0)
+         {
+             error = clib_error_return(0 , "Paths then flags");
+             goto done;
+         }
+         rpaths[vec_len(rpaths)-1].frp_flags |=
+             FIB_ROUTE_PATH_RESOLVE_VIA_ATTACHED;
+      }
+      else if (unformat (line_input, "out-label %U",
+                         unformat_mpls_unicast_label, &out_label))
+      {
+         if (vec_len(rpaths) == 0)
+         {
+             error = clib_error_return(0 , "Paths then labels");
+             goto done;
+         }
+         rpaths[vec_len(rpaths)-1].frp_label = out_label;
+      }
       else if (unformat (line_input, "count %f", &count))
        ;
 
       else if (unformat (line_input, "%U/%d",
-                        unformat_ip4_address, &ip4_addr,
-                        &dst_address_length))
-       {
-         vec_add1 (ip4_dst_addresses, ip4_addr);
-         vec_add1 (dst_address_lengths, dst_address_length);
-       }
-
+                        unformat_ip4_address,
+                        &pfx.fp_addr.ip4,
+                        &pfx.fp_len))
+      {
+         pfx.fp_proto = FIB_PROTOCOL_IP4;
+         vec_add1(prefixs, pfx);
+      }
       else if (unformat (line_input, "%U/%d",
-                        unformat_ip6_address, &ip6_addr,
-                        &dst_address_length))
-       {
-         vec_add1 (ip6_dst_addresses, ip6_addr);
-         vec_add1 (dst_address_lengths, dst_address_length);
-       }
-
+                        unformat_ip6_address,
+                        &pfx.fp_addr.ip6,
+                        &pfx.fp_len))
+      {
+         pfx.fp_proto = FIB_PROTOCOL_IP6;
+         vec_add1(prefixs, pfx);
+      }
       else if (unformat (line_input, "via %U %U weight %u",
-                        unformat_ip4_address, &ip4_addr,
-                        unformat_vnet_sw_interface, vnm, &sw_if_index,
-                        &weight))
-       {
-         vec_add1 (ip4_via_next_hops, ip4_addr);
-         vec_add1 (sw_if_indices, sw_if_index);
-         vec_add1 (weights, weight);
-          vec_add1 (table_ids, (u32)~0);
-       }
+                        unformat_ip4_address,
+                        &rpath.frp_addr.ip4,
+                        unformat_vnet_sw_interface, vnm,
+                        &rpath.frp_sw_if_index,
+                        &rpath.frp_weight))
+      {
+         rpath.frp_label = MPLS_LABEL_INVALID;
+         rpath.frp_proto = FIB_PROTOCOL_IP4;
+         vec_add1(rpaths, rpath);
+      }
 
       else if (unformat (line_input, "via %U %U weight %u",
-                        unformat_ip6_address, &ip6_addr,
-                        unformat_vnet_sw_interface, vnm, &sw_if_index,
-                        &weight))
-       {
-         vec_add1 (ip6_via_next_hops, ip6_addr);
-         vec_add1 (sw_if_indices, sw_if_index);
-         vec_add1 (weights, weight);
-          vec_add1 (table_ids, (u32)~0);
-       }
+                        unformat_ip6_address,
+                        &rpath.frp_addr.ip6,
+                        unformat_vnet_sw_interface, vnm,
+                        &rpath.frp_sw_if_index,
+                        &rpath.frp_weight))
+      {
+         rpath.frp_label = MPLS_LABEL_INVALID;
+         rpath.frp_proto = FIB_PROTOCOL_IP6;
+         vec_add1(rpaths, rpath);
+      }
 
       else if (unformat (line_input, "via %U %U",
-                        unformat_ip4_address, &ip4_addr,
-                        unformat_vnet_sw_interface, vnm, &sw_if_index))
-       {
-         vec_add1 (ip4_via_next_hops, ip4_addr);
-         vec_add1 (sw_if_indices, sw_if_index);
-         vec_add1 (weights, 1);
-          vec_add1 (table_ids, (u32)~0);
-       }
+                        unformat_ip4_address,
+                        &rpath.frp_addr.ip4,
+                        unformat_vnet_sw_interface, vnm,
+                        &rpath.frp_sw_if_index))
+      {
+         rpath.frp_label = MPLS_LABEL_INVALID;
+         rpath.frp_weight = 1;
+         rpath.frp_proto = FIB_PROTOCOL_IP4;
+         vec_add1(rpaths, rpath);
+      }
                         
       else if (unformat (line_input, "via %U %U",
-                        unformat_ip6_address, &ip6_addr,
-                        unformat_vnet_sw_interface, vnm, &sw_if_index))
-       {
-         vec_add1 (ip6_via_next_hops, ip6_addr);
-         vec_add1 (sw_if_indices, sw_if_index);
-         vec_add1 (weights, 1);
-          vec_add1 (table_ids, (u32)~0);
-       }
+                        unformat_ip6_address,
+                        &rpath.frp_addr.ip6,
+                        unformat_vnet_sw_interface, vnm,
+                        &rpath.frp_sw_if_index))
+      {
+         rpath.frp_label = MPLS_LABEL_INVALID;
+         rpath.frp_weight = 1;
+         rpath.frp_proto = FIB_PROTOCOL_IP6;
+         vec_add1(rpaths, rpath);
+      }
+      else if (unformat (line_input, "via %U next-hop-table %d",
+                        unformat_ip4_address,
+                        &rpath.frp_addr.ip4,
+                        &rpath.frp_fib_index))
+      {
+         rpath.frp_weight = 1;
+         rpath.frp_sw_if_index = ~0;
+         rpath.frp_label = MPLS_LABEL_INVALID;
+         rpath.frp_proto = FIB_PROTOCOL_IP4;
+         vec_add1(rpaths, rpath);
+      }
+      else if (unformat (line_input, "via %U next-hop-table %d",
+                        unformat_ip6_address,
+                        &rpath.frp_addr.ip6,
+                        &rpath.frp_fib_index))
+      {
+         rpath.frp_weight = 1;
+         rpath.frp_sw_if_index = ~0;
+         rpath.frp_label = MPLS_LABEL_INVALID;
+         rpath.frp_proto = FIB_PROTOCOL_IP6;
+         vec_add1(rpaths, rpath);
+      }
       else if (unformat (line_input, "via %U",
-                        unformat_ip4_address, &ip4_addr))
-       {
-         vec_add1 (ip4_via_next_hops, ip4_addr);
-         vec_add1 (sw_if_indices, (u32)~0);
-         vec_add1 (weights, 1);
-          vec_add1 (table_ids, table_id);
-       }
+                        unformat_ip4_address,
+                        &rpath.frp_addr.ip4))
+      {
+         /*
+          * the recursive next-hops are by default in the same table
+          * as the prefix
+          */
+         rpath.frp_fib_index = table_id;
+         rpath.frp_weight = 1;
+         rpath.frp_sw_if_index = ~0;
+         rpath.frp_label = MPLS_LABEL_INVALID;
+         rpath.frp_proto = FIB_PROTOCOL_IP4;
+         vec_add1(rpaths, rpath);
+      }
       else if (unformat (line_input, "via %U",
-                        unformat_ip6_address, &ip6_addr))
-       {
-         vec_add1 (ip6_via_next_hops, ip6_addr);
-         vec_add1 (sw_if_indices, (u32)~0);
-         vec_add1 (weights, 1);
-          vec_add1 (table_ids, (u32)table_id);
-       }
-                        
-      else if (vec_len (ip4_dst_addresses) > 0
-              && unformat (line_input, "via %U",
-                           unformat_ip_adjacency, vm, &parse_adj, ip4_rewrite_node.index))
-          vec_add1 (add_adj, parse_adj);
-
-      else if (vec_len (ip6_dst_addresses) > 0
-              && unformat (line_input, "via %U",
-                           unformat_ip_adjacency, vm, &parse_adj, ip6_rewrite_node.index))
-       vec_add1 (add_adj, parse_adj);
-      else if (unformat (line_input, "lookup in table %d", &outer_table_id))
-        {
-          uword * p;
-
-          if (vec_len (ip4_dst_addresses) > 0)
-            p = hash_get (ip4_main.fib_index_by_table_id, outer_table_id);
-          else
-            p = hash_get (ip6_main.fib_index_by_table_id, outer_table_id);
-
-          if (p == 0)
-            {
-              error = clib_error_return (0, "Nonexistent outer table id %d", 
-                                         outer_table_id);
-              goto done;
-            }
-
-          parse_adj.lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
-          parse_adj.explicit_fib_index = p[0];
-          vec_add1 (add_adj, parse_adj);
-        }
+                        unformat_ip6_address,
+                        &rpath.frp_addr.ip6))
+      {
+         rpath.frp_fib_index = table_id;
+         rpath.frp_weight = 1;
+         rpath.frp_sw_if_index = ~0;
+         rpath.frp_label = MPLS_LABEL_INVALID;
+         rpath.frp_proto = FIB_PROTOCOL_IP6;
+         vec_add1(rpaths, rpath);
+      }
+      else if (unformat (line_input, "lookup in table %d",
+                        &rpath.frp_fib_index))
+      {
+         /* pfx is re-zeroed every iteration: use the last parsed prefix's protocol */
+         rpath.frp_label = MPLS_LABEL_INVALID;
+         rpath.frp_proto = (vec_len (prefixs) > 0 ? prefixs[vec_len (prefixs) - 1].fp_proto : pfx.fp_proto);
+         vec_add1(rpaths, rpath);
+      }
+      else if (vec_len (prefixs) > 0 &&
+              unformat (line_input, "via %U",
+                        unformat_dpo, &dpo, prefixs[0].fp_proto))
+      {
+         rpath.frp_label = MPLS_LABEL_INVALID;
+         vec_add1 (dpos, dpo);
+      }
       else
-       {
+      {
          error = unformat_parse_error (line_input);
          goto done;
-       }
+      }
     }
     
   unformat_free (line_input);
 
-  if (vec_len (ip4_dst_addresses) + vec_len (ip6_dst_addresses) == 0)
-    {
+  if (vec_len (prefixs) == 0)
+  {
       error = clib_error_return (0, "expected ip4/ip6 destination address/length.");
       goto done;
     }
 
-  if (vec_len (ip4_dst_addresses) > 0 && vec_len (ip6_dst_addresses) > 0)
-    {
-      error = clib_error_return (0, "mixed ip4/ip6 address/length.");
-      goto done;
-    }
-
-  if (vec_len (ip4_dst_addresses) > 0 && vec_len (ip6_via_next_hops) > 0)
-    {
-      error = clib_error_return (0, "ip4 destinations with ip6 next hops.");
-      goto done;
-    }
-
-  if (vec_len (ip6_dst_addresses) > 0 && vec_len (ip4_via_next_hops) > 0)
-    {
-      error = clib_error_return (0, "ip6 destinations with ip4 next hops.");
-      goto done;
-    }
-
-  if (! is_del && vec_len (add_adj) + vec_len (weights) == 0)
+  if (!is_del && vec_len (rpaths) + vec_len (dpos) == 0)
     {
-      error = clib_error_return (0, "no next hops or adjacencies to add.");
+      error = clib_error_return (0, "expected paths.");
       goto done;
     }
 
+  if (~0 == table_id)
   {
-    int i;
-    ip4_main_t * im4 = &ip4_main;
-    ip6_main_t * im6 = &ip6_main;
+      /*
+       * if no table_id is passed we will manipulate the default
+       */
+      fib_index = 0;
+  }
+  else
+  {
+      fib_index = fib_table_id_find_fib_index(prefixs[0].fp_proto,
+                                             table_id);
 
-    for (i = 0; i < vec_len (ip4_dst_addresses); i++)
+      if (~0 == fib_index)
       {
-       ip4_add_del_route_args_t a;
-
-       memset (&a, 0, sizeof (a));
-       a.flags = IP4_ROUTE_FLAG_TABLE_ID;
-       a.table_index_or_table_id = table_id;
-       a.dst_address = ip4_dst_addresses[i];
-       a.dst_address_length = dst_address_lengths[i];
-       a.adj_index = ~0;
-
-       if (is_del)
-         {
-           if (vec_len (ip4_via_next_hops) == 0)
-             {
-                uword * dst_hash, * dst_result;
-                u32 dst_address_u32;
-                ip4_fib_t * fib;
-
-                fib = find_ip4_fib_by_table_index_or_id (im4, table_id, 
-                                                         0 /* by table id */);
-
-               a.flags |= IP4_ROUTE_FLAG_DEL;
-                dst_address_u32 = a.dst_address.as_u32 
-                  & im4->fib_masks[a.dst_address_length];
-
-                dst_hash = 
-                  fib->adj_index_by_dst_address[a.dst_address_length];
-                dst_result = hash_get (dst_hash, dst_address_u32);
-                if (dst_result)
-                  a.adj_index = dst_result[0];
-                else
-                  {
-                    clib_warning ("%U/%d not in FIB",
-                                  format_ip4_address, &a.dst_address,
-                                  a.dst_address_length);
-                    continue;
-                  }
-
-               ip4_add_del_route (im4, &a);
-               ip4_maybe_remap_adjacencies (im4, table_id, 
-                                             IP4_ROUTE_FLAG_TABLE_ID);
-             }
-           else
-             {
-                u32 i, j, n, f, incr;
-               ip4_address_t dst = a.dst_address;
-               f64 t[2];
-               n = count;
-               t[0] = vlib_time_now (vm);
-                incr = 1<<(32 - a.dst_address_length);
-               for (i = 0; i < n; i++)
-                 {
-                   f = i + 1 < n ? IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP : 0;
-                   a.dst_address = dst;
-                   for (j = 0; j < vec_len (ip4_via_next_hops); j++)
-                      {
-                        if (table_ids[j] != (u32)~0)
-                          {
-                            uword * p = hash_get (im4->fib_index_by_table_id, 
-                                                  table_ids[j]);
-                            if (p == 0) 
-                              {
-                                clib_warning ("no such FIB table %d",
-                                              table_ids[j]);
-                                continue;
-                              }
-                            table_ids[j] = p[0];
-                          }
-                        
-                        ip4_add_del_route_next_hop (im4,
-                                                    IP4_ROUTE_FLAG_DEL | f,
-                                                    &a.dst_address,
-                                                    a.dst_address_length,
-                                                    &ip4_via_next_hops[j],
-                                                    sw_if_indices[j],
-                                                    weights[j], (u32)~0, 
-                                                    table_ids[j] /* fib index */);
-                      }
-                    dst.as_u32 = clib_host_to_net_u32 (incr + clib_net_to_host_u32 (dst.as_u32));
-                 }
-               t[1] = vlib_time_now (vm);
-               if (count > 1)
-                 vlib_cli_output (vm, "%.6e routes/sec", count / (t[1] - t[0]));
-             }
-         }
-       else
-         {
-           if (vec_len (add_adj) > 0)
-             {
-               a.flags |= IP4_ROUTE_FLAG_ADD;
-               a.add_adj = add_adj;
-               a.n_add_adj = vec_len (add_adj);
-             
-               ip4_add_del_route (im4, &a);
-             }
-           else if (vec_len (ip4_via_next_hops) > 0)
-             {
-                u32 i, j, n, f, incr;
-               ip4_address_t dst = a.dst_address;
-               f64 t[2];
-               n = count;
-               t[0] = vlib_time_now (vm);
-                incr = 1<<(32 - a.dst_address_length);
-               for (i = 0; i < n; i++)
-                 {
-                   f = i + 1 < n ? IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP : 0;
-                   a.dst_address = dst;
-                   for (j = 0; j < vec_len (ip4_via_next_hops); j++)
-                      {
-                        if (table_ids[j] != (u32)~0)
-                          {
-                            uword * p = hash_get (im4->fib_index_by_table_id, 
-                                                  table_ids[j]);
-                            if (p == 0) 
-                              {
-                                clib_warning ("no such FIB table %d",
-                                              table_ids[j]);
-                                continue;
-                              }
-                            table_ids[j] = p[0];
-                          }
-                     ip4_add_del_route_next_hop (im4,
-                                                 IP4_ROUTE_FLAG_ADD | f,
-                                                 &a.dst_address,
-                                                 a.dst_address_length,
-                                                 &ip4_via_next_hops[j],
-                                                 sw_if_indices[j],
-                                                 weights[j], (u32)~0, 
-                                                  table_ids[j] /* fib index */);
-                      }
-                   dst.as_u32 = clib_host_to_net_u32 (incr + clib_net_to_host_u32 (dst.as_u32));
-                 }
-               t[1] = vlib_time_now (vm);
-               if (count > 1)
-                 vlib_cli_output (vm, "%.6e routes/sec", count / (t[1] - t[0]));
-             }
-         }
+         error = clib_error_return (0,
+                                    "Nonexistent table id %d", 
+                                    table_id);
+         goto done;
       }
+  }
 
-    for (i = 0; i < vec_len (ip6_dst_addresses); i++)
+  for (i = 0; i < vec_len (prefixs); i++)
+  {
+      if (is_del && 0 == vec_len (rpaths))
       {
-       ip6_add_del_route_args_t a;
-        
-
-       memset (&a, 0, sizeof (a));
-       a.flags = IP6_ROUTE_FLAG_TABLE_ID;
-       a.table_index_or_table_id = table_id;
-       a.dst_address = ip6_dst_addresses[i];
-       a.dst_address_length = dst_address_lengths[i];
-       a.adj_index = ~0;
-
-       if (is_del)
+         fib_table_entry_delete(fib_index,
+                                &prefixs[i],
+                                FIB_SOURCE_CLI);
+      }
+      else if (!is_del && 1 == vec_len (dpos))
+      {
+         fib_table_entry_special_dpo_add(fib_index,
+                                          &prefixs[i],
+                                          FIB_SOURCE_CLI,
+                                          FIB_ENTRY_FLAG_EXCLUSIVE,
+                                          &dpos[0]);
+         dpo_reset(&dpos[0]);
+      }
+      else if (vec_len (dpos) > 0)
+      {
+         error = clib_error_return(0 , "Load-balancing over multiple special adjacencies is unsupported");
+         goto done;
+      }
+      else if (0 < vec_len (rpaths))
+      {
+         u32 k, j, n, incr;
+         ip46_address_t dst = prefixs[i].fp_addr;
+         f64 t[2];
+         n = count;
+         t[0] = vlib_time_now (vm);
+         incr = (FIB_PROTOCOL_IP4 == prefixs[0].fp_proto ? 32 : 128) - prefixs[i].fp_len;
+         incr = (incr < 32 ? ((u32) 1 << incr) : 0); /* shift counts >= 32 are undefined in C; such prefixes get no step */
+
+         for (k = 0; k < n; k++)
          {
-           if (vec_len (ip6_via_next_hops) == 0)
-             {
-                BVT(clib_bihash_kv) kv, value;
-                ip6_address_t dst_address;
-                ip6_fib_t * fib;
-
-                fib = find_ip6_fib_by_table_index_or_id (im6, table_id, 
-                                                         0 /* by table id */);
-
-               a.flags |= IP4_ROUTE_FLAG_DEL;
-
-                dst_address = ip6_dst_addresses[i];
-
-                ip6_address_mask (&dst_address, 
-                                  &im6->fib_masks[dst_address_length]);
-                
-                kv.key[0] = dst_address.as_u64[0];
-                kv.key[1] = dst_address.as_u64[1];
-                kv.key[2] = ((u64)(fib - im6->fibs)<<32)
-                  | a.dst_address_length;
-                
-                if (BV(clib_bihash_search)(&im6->ip6_lookup_table,
-                                           &kv, &value) == 0)
-                  a.adj_index = value.value;
-                else
-                  {
-                    clib_warning ("%U/%d not in FIB",
-                                  format_ip6_address, &a.dst_address,
-                                  a.dst_address_length);
-                    continue;
-                  }
-                
-               a.flags |= IP6_ROUTE_FLAG_DEL;
-               ip6_add_del_route (im6, &a);
-               ip6_maybe_remap_adjacencies (im6, table_id, 
-                                             IP6_ROUTE_FLAG_TABLE_ID);
-             }
-           else
+             for (j = 0; j < vec_len (rpaths); j++)
              {
-               u32 i;
-               for (i = 0; i < vec_len (ip6_via_next_hops); i++)
-                 {
-                   ip6_add_del_route_next_hop (im6,
-                                               IP6_ROUTE_FLAG_DEL,
-                                               &a.dst_address,
-                                               a.dst_address_length,
-                                               &ip6_via_next_hops[i],
-                                               sw_if_indices[i],
-                                               weights[i], (u32)~0,
-                                                table_ids[i] /* fib index */);
-                 }
+                 /* The CLI stored table IDs; swap path j's ID for a FIB index
+                  * once only, so a resolved index is never resolved again */
+                 if (0 == i && 0 == k)
+                     rpaths[j].frp_fib_index =
+                         fib_table_id_find_fib_index(prefixs[i].fp_proto,
+                                                     rpaths[j].frp_fib_index);
+
+                 fib_prefix_t rpfx = {
+                     .fp_len = prefixs[i].fp_len,
+                     .fp_proto = prefixs[i].fp_proto,
+                     .fp_addr = dst,
+                 };
+
+                  if (is_del)
+                      fib_table_entry_path_remove2(fib_index,
+                                                   &rpfx,
+                                                   FIB_SOURCE_CLI,
+                                                   &rpaths[j]);
+                  else
+                      fib_table_entry_path_add2(fib_index,
+                                                &rpfx,
+                                                FIB_SOURCE_CLI,
+                                                FIB_ENTRY_FLAG_NONE,
+                                                &rpaths[j]);
              }
-         }
-       else
-         {
-           if (vec_len (add_adj) > 0)
+
+             if (FIB_PROTOCOL_IP4 == prefixs[0].fp_proto)
              {
-               a.flags |= IP6_ROUTE_FLAG_ADD;
-               a.add_adj = add_adj;
-               a.n_add_adj = vec_len (add_adj);
-             
-               ip6_add_del_route (im6, &a);
+                 dst.ip4.as_u32 =
+                     clib_host_to_net_u32(incr +
+                                          clib_net_to_host_u32 (dst.ip4.as_u32));
              }
-           else if (vec_len (ip6_via_next_hops) > 0)
+             else
              {
-               u32 i;
-               for (i = 0; i < vec_len (ip6_via_next_hops); i++)
-                 {
-                   ip6_add_del_route_next_hop (im6,
-                                               IP6_ROUTE_FLAG_ADD,
-                                               &a.dst_address,
-                                               a.dst_address_length,
-                                               &ip6_via_next_hops[i],
-                                               sw_if_indices[i],
-                                               weights[i], (u32)~0,
-                                                table_ids[i]);
-                 }
+                 /* step the 64-bit word holding the prefix's last bit (no carry between words); assumes as_u64[0] is the leading word - TODO confirm */
+                 u32 host_bits = 128 - prefixs[i].fp_len;
+                 int bucket = (host_bits < 64 ? 1 : 0);
+                 u64 step = (host_bits < 128 ? ((u64) 1 << (host_bits & 63)) : 0);
+                 dst.ip6.as_u64[bucket] = clib_host_to_net_u64(step + clib_net_to_host_u64 (dst.ip6.as_u64[bucket]));
+
              }
          }
+         t[1] = vlib_time_now (vm);
+         if (count > 1)
+             vlib_cli_output (vm, "%.6e routes/sec", count / (t[1] - t[0]));
+      }
+      else
+      {
+         error = clib_error_return(0 , "Don't understand what you want...");
+         goto done;
       }
   }
 
+
  done:
-  vec_free (add_adj);
-  vec_free (weights);
-  vec_free (dst_address_lengths);
-  vec_free (ip4_dst_addresses);
-  vec_free (ip6_dst_addresses);
-  vec_free (ip4_via_next_hops);
-  vec_free (ip6_via_next_hops);
+  vec_free (dpos);
+  vec_free (prefixs);
+  vec_free (rpaths);
   return error;
 }
 
@@ -1708,14 +712,14 @@ VLIB_CLI_COMMAND (ip_route_command, static) = {
   .is_mp_safe = 1,
 };
 
-/* 
+/*
  * The next two routines address a longstanding script hemorrhoid.
  * Probing a v4 or v6 neighbor needs to appear to be synchronous,
  * or dependent route-adds will simply fail.
  */
 static clib_error_t *
 ip6_probe_neighbor_wait (vlib_main_t *vm, ip6_address_t * a, u32 sw_if_index,
-                         int retry_count)
+                        int retry_count)
 {
   vnet_main_t * vnm = vnet_get_main();
   clib_error_t * e;
@@ -1727,7 +731,7 @@ ip6_probe_neighbor_wait (vlib_main_t *vm, ip6_address_t * a, u32 sw_if_index,
   ASSERT (vlib_in_process_context(vm));
 
   if (retry_count > 0)
-    vnet_register_ip6_neighbor_resolution_event 
+    vnet_register_ip6_neighbor_resolution_event
       (vnm, a, vlib_get_current_process (vm)->node_runtime.node_index,
        1 /* event */, 0 /* data */);
 
@@ -1735,17 +739,17 @@ ip6_probe_neighbor_wait (vlib_main_t *vm, ip6_address_t * a, u32 sw_if_index,
     {
       /* The interface may be down, etc. */
       e = ip6_probe_neighbor (vm, a, sw_if_index);
-      
+
       if (e)
-        return e;
-      
+       return e;
+
       vlib_process_wait_for_event_or_clock (vm, 1.0);
       event_type = vlib_process_get_events (vm, &event_data);
-      switch (event_type) 
-        {
-        case 1: /* resolved... */
-          vlib_cli_output (vm, "Resolved %U", 
-                           format_ip6_address, a);
+      switch (event_type)
+       {
+       case 1: /* resolved... */
+         vlib_cli_output (vm, "Resolved %U",
+                          format_ip6_address, a);
           resolved = 1;
           goto done;
           
@@ -1883,526 +887,3 @@ VLIB_CLI_COMMAND (ip_probe_neighbor_command, static) = {
   .short_help = "ip probe-neighbor <intfc> <ip4-addr> | <ip6-addr> [retry nn]",
   .is_mp_safe = 1,
 };
-
-typedef CLIB_PACKED (struct {
-  ip4_address_t address;
-
-  u32 address_length : 6;
-
-  u32 index : 26;
-}) ip4_route_t;
-
-static int
-ip4_route_cmp (void * a1, void * a2)
-{
-  ip4_route_t * r1 = a1;
-  ip4_route_t * r2 = a2;
-
-  int cmp = ip4_address_compare (&r1->address, &r2->address);
-  return cmp ? cmp : ((int) r1->address_length - (int) r2->address_length);
-}
-
-static clib_error_t *
-ip4_show_fib (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd)
-{
-  vnet_main_t * vnm = vnet_get_main();
-  ip4_main_t * im4 = &ip4_main;
-  ip4_route_t * routes, * r;
-  ip4_fib_t * fib;
-  ip_lookup_main_t * lm = &im4->lookup_main;
-  uword * results, i;
-  int verbose, matching, mtrie, include_empty_fibs;
-  ip4_address_t matching_address;
-  u8 clear = 0;
-  int table_id = -1;
-
-  routes = 0;
-  results = 0;
-  verbose = 1;
-  include_empty_fibs = 0;
-  matching = 0;
-  mtrie = 0;
-  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
-    {
-      if (unformat (input, "brief") || unformat (input, "summary")
-         || unformat (input, "sum"))
-       verbose = 0;
-
-      else if (unformat (input, "mtrie"))
-       mtrie = 1;
-
-      else if (unformat (input, "include-empty"))
-        include_empty_fibs = 1;
-
-      else if (unformat (input, "%U", unformat_ip4_address, &matching_address))
-       matching = 1;
-
-      else if (unformat (input, "clear"))
-        clear = 1;
-
-      else if (unformat (input, "table %d", &table_id))
-               ;
-      else
-       break;
-    }
-
-  vec_foreach (fib, im4->fibs)
-    {
-      int fib_not_empty;
-
-      fib_not_empty = 0;
-      for (i = 0; i < ARRAY_LEN (fib->adj_index_by_dst_address); i++)
-        {
-          uword * hash = fib->adj_index_by_dst_address[i];
-          uword n_elts = hash_elts (hash);
-          if (n_elts)
-            {
-              fib_not_empty = 1;
-              break;
-            }
-        }
-      
-      if (fib_not_empty == 0 && include_empty_fibs == 0)
-        continue;
-
-      if (table_id >= 0 && table_id != (int)fib->table_id)
-        continue;
-
-      if (include_empty_fibs)
-          vlib_cli_output (vm, "Table %d, fib_index %d, flow hash: %U", 
-                           fib->table_id, fib - im4->fibs,
-                           format_ip_flow_hash_config, fib->flow_hash_config);
-
-      /* Show summary? */
-      if (! verbose)
-       {
-        if (include_empty_fibs == 0)
-            vlib_cli_output (vm, "Table %d, fib_index %d, flow hash: %U", 
-                             fib->table_id, fib - im4->fibs,
-                             format_ip_flow_hash_config, fib->flow_hash_config);
-         vlib_cli_output (vm, "%=20s%=16s", "Prefix length", "Count");
-         for (i = 0; i < ARRAY_LEN (fib->adj_index_by_dst_address); i++)
-           {
-             uword * hash = fib->adj_index_by_dst_address[i];
-             uword n_elts = hash_elts (hash);
-             if (n_elts > 0)
-               vlib_cli_output (vm, "%20d%16d", i, n_elts);
-           }
-         continue;
-       }
-
-      if (routes)
-       _vec_len (routes) = 0;
-      if (results)
-       _vec_len (results) = 0;
-
-      for (i = 0; i < ARRAY_LEN (fib->adj_index_by_dst_address); i++)
-       {
-         uword * hash = fib->adj_index_by_dst_address[i];
-         hash_pair_t * p;
-         ip4_route_t x;
-
-         x.address_length = i;
-
-         if (matching)
-           {
-             x.address.as_u32 = matching_address.as_u32 & im4->fib_masks[i];
-             p = hash_get_pair (hash, x.address.as_u32);
-             if (p)
-               {
-                 if (lm->fib_result_n_words > 1)
-                   {
-                     x.index = vec_len (results);
-                     vec_add (results, p->value, lm->fib_result_n_words);
-                   }
-                 else
-                   x.index = p->value[0];
-                 vec_add1 (routes, x);
-               }
-           }
-         else
-           {
-             hash_foreach_pair (p, hash, ({
-               x.address.data_u32 = p->key;
-               if (lm->fib_result_n_words > 1)
-                 {
-                   x.index = vec_len (results);
-                   vec_add (results, p->value, lm->fib_result_n_words);
-                 }
-               else
-                 x.index = p->value[0];
-
-               vec_add1 (routes, x);
-             }));
-           }
-       }
-
-      vec_sort_with_function (routes, ip4_route_cmp);
-      if (vec_len(routes)) {
-          if (include_empty_fibs == 0)
-              vlib_cli_output (vm, "Table %d, fib_index %d, flow hash: %U", 
-                               fib->table_id, fib - im4->fibs,
-                               format_ip_flow_hash_config, fib->flow_hash_config);
-          if (mtrie)
-              vlib_cli_output (vm, "%U", format_ip4_fib_mtrie, &fib->mtrie);
-          vlib_cli_output (vm, "%=20s%=16s%=16s%=16s",
-                           "Destination", "Packets", "Bytes", "Adjacency");
-      }
-      vec_foreach (r, routes)
-       {
-         vlib_counter_t c, sum;
-         uword i, j, n_left, n_nhs, adj_index, * result = 0;
-         ip_adjacency_t * adj;
-         ip_multipath_next_hop_t * nhs, tmp_nhs[1];
-
-         adj_index = r->index;
-         if (lm->fib_result_n_words > 1)
-           {
-             result = vec_elt_at_index (results, adj_index);
-             adj_index = result[0];
-           }
-
-         adj = ip_get_adjacency (lm, adj_index);
-         if (adj->n_adj == 1)
-           {
-             nhs = &tmp_nhs[0];
-             nhs[0].next_hop_adj_index = ~0; /* not used */
-             nhs[0].weight = 1;
-             n_nhs = 1;
-           }
-         else
-           {
-             ip_multipath_adjacency_t * madj;
-             madj = vec_elt_at_index (lm->multipath_adjacencies, adj->heap_handle);
-             nhs = heap_elt_at_index (lm->next_hop_heap, madj->normalized_next_hops.heap_offset);
-             n_nhs = madj->normalized_next_hops.count;
-           }
-
-         n_left = nhs[0].weight;
-         vlib_counter_zero (&sum);
-         for (i = j = 0; i < adj->n_adj; i++)
-           {
-             n_left -= 1;
-             vlib_get_combined_counter (&lm->adjacency_counters, 
-                                         adj_index + i, &c);
-              if (clear)
-                vlib_zero_combined_counter (&lm->adjacency_counters,
-                                            adj_index + i);
-             vlib_counter_add (&sum, &c);
-             if (n_left == 0)
-               {
-                 u8 * msg = 0;
-                 uword indent;
-
-                 if (j == 0)
-                   msg = format (msg, "%-20U",
-                                 format_ip4_address_and_length,
-                                 r->address.data, r->address_length);
-                 else
-                   msg = format (msg, "%U", format_white_space, 20);
-
-                 msg = format (msg, "%16Ld%16Ld ", sum.packets, sum.bytes);
-
-                 indent = vec_len (msg);
-                 msg = format (msg, "weight %d, index %d",
-                               nhs[j].weight, adj_index + i);
-
-                 if (ip_adjacency_is_multipath(lm, adj_index))
-                     msg = format (msg, ", multipath");
-
-                 msg = format (msg, "\n%U%U",
-                               format_white_space, indent,
-                               format_ip_adjacency,
-                               vnm, lm, adj_index + i);
-
-                 vlib_cli_output (vm, "%v", msg);
-                 vec_free (msg);
-
-                 if (result && lm->format_fib_result)
-                   vlib_cli_output (vm, "%20s%U", "",
-                                    lm->format_fib_result, vm, lm, result,
-                                    i + 1 - nhs[j].weight,
-                                    nhs[j].weight);
-
-                 j++;
-                 if (j < n_nhs)
-                   {
-                     n_left = nhs[j].weight;
-                     vlib_counter_zero (&sum);
-                   }
-               }
-           }
-       }
-    }
-
-  vec_free (routes);
-  vec_free (results);
-
-  return 0;
-}
-
-/*?
- * Show FIB/route entries
- *
- * @cliexpar
- * @cliexstart{show ip fib}
- * Display the IPv4 FIB.
- * This command will run for a long time when the FIBs comprise millions of entries.
- *   vpp# sh ip fib
- *   Table 0
- *   Destination         Packets          Bytes         Adjacency
- *   6.0.0.0/8                          0               0 weight 1, index 3
- *                                                       arp fake-eth0 6.0.0.1/8
- *   6.0.0.1/32                         0               0 weight 1, index 4
- *                                                        local 6.0.0.1/8
- *
- *  And so forth. Use 'show ip fib summary' for a summary:
- *
- *   vpp# sh ip fib summary
- *   Table 0
- *   Prefix length         Count
- *         8               1
- *        32               4
- * @cliexend
- ?*/
-VLIB_CLI_COMMAND (ip4_show_fib_command, static) = {
-  .path = "show ip fib",
-  .short_help = "show ip fib [mtrie] [summary] [table <n>] [<ip4-addr>] [clear] [include-empty]",
-  .function = ip4_show_fib,
-};
-
-typedef struct {
-  ip6_address_t address;
-
-  u32 address_length;
-
-  u32 index;
-} ip6_route_t;
-
-typedef struct {
-  u32 fib_index;
-  ip6_route_t ** routep;
-} add_routes_in_fib_arg_t;
-
-static void add_routes_in_fib (BVT(clib_bihash_kv) * kvp, void *arg)
-{
-  add_routes_in_fib_arg_t * ap = arg;
-
-  if (kvp->key[2]>>32 == ap->fib_index)
-    {
-      ip6_address_t *addr;
-      ip6_route_t * r;
-      addr = (ip6_address_t *) kvp;
-      vec_add2 (*ap->routep, r, 1);
-      r->address = addr[0];
-      r->address_length = kvp->key[2] & 0xFF;
-      r->index = kvp->value;
-    }
-}
-
-typedef struct {
-  u32 fib_index;
-  u64 count_by_prefix_length[129];
-} count_routes_in_fib_at_prefix_length_arg_t;
-
-static void count_routes_in_fib_at_prefix_length 
-(BVT(clib_bihash_kv) * kvp, void *arg)
-{
-  count_routes_in_fib_at_prefix_length_arg_t * ap = arg;
-  int mask_width;
-
-  if ((kvp->key[2]>>32) != ap->fib_index)
-    return;
-
-  mask_width = kvp->key[2] & 0xFF;
-
-  ap->count_by_prefix_length[mask_width]++;
-}
-
-static int
-ip6_route_cmp (void * a1, void * a2)
-{
-  ip6_route_t * r1 = a1;
-  ip6_route_t * r2 = a2;
-
-  int cmp = ip6_address_compare (&r1->address, &r2->address);
-  return cmp ? cmp : ((int) r1->address_length - (int) r2->address_length);
-}
-
-static clib_error_t *
-ip6_show_fib (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd)
-{
-  vnet_main_t * vnm = vnet_get_main();
-  ip6_main_t * im6 = &ip6_main;
-  ip6_route_t * routes, * r;
-  ip6_fib_t * fib;
-  ip_lookup_main_t * lm = &im6->lookup_main;
-  uword * results;
-  int verbose;
-  BVT(clib_bihash) * h = &im6->ip6_lookup_table;
-  __attribute__((unused)) u8 clear = 0;
-  add_routes_in_fib_arg_t _a, *a=&_a;
-  count_routes_in_fib_at_prefix_length_arg_t _ca, *ca = &_ca;
-
-  routes = 0;
-  results = 0;
-  verbose = 1;
-  if (unformat (input, "brief") || unformat (input, "summary")
-      || unformat (input, "sum"))
-    verbose = 0;
-
-  if (unformat (input, "clear"))
-    clear = 1;
-
-  vlib_cli_output (vm, "FIB lookup table: %d buckets, %lld MB heap",
-                   im6->lookup_table_nbuckets, im6->lookup_table_size>>20);
-  vlib_cli_output (vm, "%U", format_mheap, h->mheap, 0 /*verbose*/); 
-  vlib_cli_output (vm, " ");
-  
-  vec_foreach (fib, im6->fibs)
-    {
-      vlib_cli_output (vm, "VRF %d, fib_index %d, flow hash: %U", 
-                       fib->table_id, fib - im6->fibs,
-                       format_ip_flow_hash_config, fib->flow_hash_config);
-      
-      /* Show summary? */
-      if (! verbose)
-       {
-          int len;
-         vlib_cli_output (vm, "%=20s%=16s", "Prefix length", "Count");
-
-          memset (ca, 0, sizeof(*ca));
-          ca->fib_index = fib - im6->fibs;
-
-          BV(clib_bihash_foreach_key_value_pair)
-            (h, count_routes_in_fib_at_prefix_length, ca);
-
-          for (len = 128; len >= 0; len--)
-            {
-              if (ca->count_by_prefix_length[len])
-                vlib_cli_output (vm, "%=20d%=16lld", 
-                                 len, ca->count_by_prefix_length[len]);
-            }
-         continue;
-       }
-
-      if (routes)
-       _vec_len (routes) = 0;
-      if (results)
-       _vec_len (results) = 0;
-
-      a->fib_index = fib - im6->fibs;
-      a->routep = &routes;
-
-      BV(clib_bihash_foreach_key_value_pair)(h, add_routes_in_fib, a);
-      
-      vec_sort_with_function (routes, ip6_route_cmp);
-
-      vlib_cli_output (vm, "%=45s%=16s%=16s%=16s",
-                      "Destination", "Packets", "Bytes", "Adjacency");
-      vec_foreach (r, routes)
-       {
-         vlib_counter_t c, sum;
-         uword i, j, n_left, n_nhs, adj_index, * result = 0;
-         ip_adjacency_t * adj;
-         ip_multipath_next_hop_t * nhs, tmp_nhs[1];
-
-         adj_index = r->index;
-         if (lm->fib_result_n_words > 1)
-           {
-             result = vec_elt_at_index (results, adj_index);
-             adj_index = result[0];
-           }
-
-         adj = ip_get_adjacency (lm, adj_index);
-         if (adj->n_adj == 1)
-           {
-             nhs = &tmp_nhs[0];
-             nhs[0].next_hop_adj_index = ~0; /* not used */
-             nhs[0].weight = 1;
-             n_nhs = 1;
-           }
-         else
-           {
-             ip_multipath_adjacency_t * madj;
-             madj = vec_elt_at_index (lm->multipath_adjacencies, adj->heap_handle);
-             nhs = heap_elt_at_index (lm->next_hop_heap, madj->normalized_next_hops.heap_offset);
-             n_nhs = madj->normalized_next_hops.count;
-           }
-
-         n_left = nhs[0].weight;
-         vlib_counter_zero (&sum);
-         for (i = j = 0; i < adj->n_adj; i++)
-           {
-             n_left -= 1;
-             vlib_get_combined_counter (&lm->adjacency_counters, 
-                                         adj_index + i, &c);
-              if (clear)
-                vlib_zero_combined_counter (&lm->adjacency_counters, 
-                                            adj_index + i);
-             vlib_counter_add (&sum, &c);
-             if (n_left == 0)
-               {
-                 u8 * msg = 0;
-                 uword indent;
-
-                 if (j == 0)
-                   msg = format (msg, "%-45U",
-                                 format_ip6_address_and_length,
-                                 r->address.as_u8, r->address_length);
-                 else
-                   msg = format (msg, "%U", format_white_space, 20);
-
-                 msg = format (msg, "%16Ld%16Ld ", sum.packets, sum.bytes);
-
-                 indent = vec_len (msg);
-                 msg = format (msg, "weight %d, index %d",
-                               nhs[j].weight, adj_index + i);
-
-                 if (ip_adjacency_is_multipath(lm, adj_index + i))
-                     msg = format (msg, ", multipath");
-
-                 msg = format (msg, "\n%U%U",
-                               format_white_space, indent,
-                               format_ip_adjacency,
-                               vnm, lm, adj_index + i);
-
-                 vlib_cli_output (vm, "%v", msg);
-                 vec_free (msg);
-
-                 j++;
-                 if (j < n_nhs)
-                   {
-                     n_left = nhs[j].weight;
-                     vlib_counter_zero (&sum);
-                   }
-               }
-           }
-
-         if (result && lm->format_fib_result)
-           vlib_cli_output (vm, "%20s%U", "", lm->format_fib_result, vm, lm, result, 0);
-       }
-      vlib_cli_output (vm, " ");
-    }
-
-  vec_free (routes);
-  vec_free (results);
-
-  return 0;
-}
-
-/*?
- * Show FIB6/route entries
- *
- * @cliexpar
- * @cliexstart{show ip fib}
- * Display the IPv6 FIB.
- * This command will run for a long time when the FIBs comprise millions of entries.
- * See 'show ip fib'
- * @cliexend
- ?*/
-VLIB_CLI_COMMAND (ip6_show_fib_command, static) = {
-  .path = "show ip6 fib",
-  .short_help = "show ip6 fib [summary] [clear]",
-  .function = ip6_show_fib,
-};
index dcc9d25..c8dcc14 100644 (file)
@@ -45,7 +45,6 @@
  * - Callbacks on route add.
  * - Callbacks on interface address change.
  */
-
 #ifndef included_ip_lookup_h
 #define included_ip_lookup_h
 
 #include <vlib/buffer.h>
 #include <vnet/ip/ip4_packet.h>
 #include <vnet/ip/ip6_packet.h>
+#include <vnet/fib/fib_node.h>
+#include <vnet/dpo/dpo.h>
 
 /** @brief Common (IP4/IP6) next index stored in adjacency. */
 typedef enum {
-  /** Packet does not match any route in table. */
-  IP_LOOKUP_NEXT_MISS,
-
   /** Adjacency to drop this packet. */
   IP_LOOKUP_NEXT_DROP,
   /** Adjacency to punt this packet. */
@@ -67,27 +65,26 @@ typedef enum {
   /** This packet is for one of our own IP addresses. */
   IP_LOOKUP_NEXT_LOCAL,
 
-  /** This packet matches an "interface route" and packets
+  /** This packet matches an "incomplete adjacency" and packets
      need to be passed to ARP to find rewrite string for
      this destination. */
   IP_LOOKUP_NEXT_ARP,
 
+  /** This packet matches an "interface route" and packets
+     need to be passed to ARP to find rewrite string for
+     this destination. */
+  IP_LOOKUP_NEXT_GLEAN,
+
   /** This packet is to be rewritten and forwarded to the next
      processing node.  This is typically the output interface but
      might be another node for further output processing. */
   IP_LOOKUP_NEXT_REWRITE,
 
-  /** This packet needs to be classified */
-  IP_LOOKUP_NEXT_CLASSIFY,
-
-  /** This packet needs to go to MAP - RFC7596, RFC7597 */
-  IP_LOOKUP_NEXT_MAP,
+  /** This packet follows a load-balance */
+  IP_LOOKUP_NEXT_LOAD_BALANCE,
 
-  /** This packet needs to go to MAP with Translation - RFC7599 */
-  IP_LOOKUP_NEXT_MAP_T,
-
-  /** This packets needs to go to indirect next hop */
-  IP_LOOKUP_NEXT_INDIRECT,
+  /** This packet follows a mid-chain adjacency */
+  IP_LOOKUP_NEXT_MIDCHAIN,
 
   /** This packets needs to go to ICMP error */
   IP_LOOKUP_NEXT_ICMP_ERROR,
@@ -100,7 +97,7 @@ typedef enum {
 } ip4_lookup_next_t;
 
 typedef enum {
-  /** Hop-by-hop header handling */
+  /* Hop-by-hop header handling */
   IP6_LOOKUP_NEXT_HOP_BY_HOP = IP_LOOKUP_N_NEXT,
   IP6_LOOKUP_NEXT_ADD_HOP_BY_HOP,
   IP6_LOOKUP_NEXT_POP_HOP_BY_HOP,
@@ -108,30 +105,26 @@ typedef enum {
 } ip6_lookup_next_t;
 
 #define IP4_LOOKUP_NEXT_NODES {                                        \
-    [IP_LOOKUP_NEXT_MISS] = "ip4-miss",                                \
     [IP_LOOKUP_NEXT_DROP] = "ip4-drop",                                \
     [IP_LOOKUP_NEXT_PUNT] = "ip4-punt",                                \
     [IP_LOOKUP_NEXT_LOCAL] = "ip4-local",                      \
     [IP_LOOKUP_NEXT_ARP] = "ip4-arp",                          \
+    [IP_LOOKUP_NEXT_GLEAN] = "ip4-glean",                      \
     [IP_LOOKUP_NEXT_REWRITE] = "ip4-rewrite-transit",          \
-    [IP_LOOKUP_NEXT_CLASSIFY] = "ip4-classify",                        \
-    [IP_LOOKUP_NEXT_MAP] = "ip4-map",                          \
-    [IP_LOOKUP_NEXT_MAP_T] = "ip4-map-t",                      \
-    [IP_LOOKUP_NEXT_INDIRECT] = "ip4-indirect",                        \
+    [IP_LOOKUP_NEXT_MIDCHAIN] = "ip4-midchain",                        \
+    [IP_LOOKUP_NEXT_LOAD_BALANCE] = "ip4-load-balance",                \
     [IP_LOOKUP_NEXT_ICMP_ERROR] = "ip4-icmp-error",            \
 }
 
 #define IP6_LOOKUP_NEXT_NODES {                                        \
-    [IP_LOOKUP_NEXT_MISS] = "ip6-miss",                                \
     [IP_LOOKUP_NEXT_DROP] = "ip6-drop",                                \
     [IP_LOOKUP_NEXT_PUNT] = "ip6-punt",                                \
     [IP_LOOKUP_NEXT_LOCAL] = "ip6-local",                      \
     [IP_LOOKUP_NEXT_ARP] = "ip6-discover-neighbor",            \
+    [IP_LOOKUP_NEXT_GLEAN] = "ip6-glean",                      \
     [IP_LOOKUP_NEXT_REWRITE] = "ip6-rewrite",                  \
-    [IP_LOOKUP_NEXT_CLASSIFY] = "ip6-classify",                        \
-    [IP_LOOKUP_NEXT_MAP] = "ip6-map",                          \
-    [IP_LOOKUP_NEXT_MAP_T] = "ip6-map-t",                      \
-    [IP_LOOKUP_NEXT_INDIRECT] = "ip6-indirect",                        \
+    [IP_LOOKUP_NEXT_MIDCHAIN] = "ip6-midchain",                        \
+    [IP_LOOKUP_NEXT_LOAD_BALANCE] = "ip6-load-balance",                \
     [IP_LOOKUP_NEXT_ICMP_ERROR] = "ip6-icmp-error",            \
     [IP6_LOOKUP_NEXT_HOP_BY_HOP] = "ip6-hop-by-hop",           \
     [IP6_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip6-add-hop-by-hop",   \
@@ -157,20 +150,20 @@ _(dport, IP_FLOW_HASH_DST_PORT)                 \
 _(proto, IP_FLOW_HASH_PROTO)                   \
 _(reverse, IP_FLOW_HASH_REVERSE_SRC_DST)
 
+/**
+ * A flow hash configuration is a mask of the flow hash options
+ */
+typedef u32 flow_hash_config_t;
+
 #define IP_ADJACENCY_OPAQUE_SZ 16
 /** @brief IP unicast adjacency.
     @note cache aligned.
 */
 typedef struct {
   CLIB_CACHE_LINE_ALIGN_MARK(cacheline0);
-  /** Handle for this adjacency in adjacency heap. */
+  /* Handle for this adjacency in adjacency heap. */
   u32 heap_handle;
 
-  STRUCT_MARK(signature_start);
-
-  /** Interface address index for this local/arp adjacency. */
-  u32 if_address_index;
-
   /** Number of adjecencies in block.  Greater than 1 means multipath;
      otherwise equal to 1. */
   u16 n_adj;
@@ -181,27 +174,63 @@ typedef struct {
     u16 lookup_next_index_as_int;
   };
 
+  /** Interface address index for this local/arp adjacency. */
+  u32 if_address_index;
+
   /** Force re-lookup in a different FIB. ~0 => normal behavior */
-  i16 explicit_fib_index;
   u16 mcast_group_index;  
 
   /** Highest possible perf subgraph arc interposition, e.g. for ip6 ioam */
   u16 saved_lookup_next_index;
 
+  /*
+   * link/ether-type
+   */
+  u8 ia_link;
+  u8 ia_nh_proto;
+
   union {
-    /** IP_LOOKUP_NEXT_ARP only */
-    struct {
-      ip46_address_t next_hop;
-    } arp;
-    /** IP_LOOKUP_NEXT_CLASSIFY only */
-    struct {
-      u16 table_index;
-    } classify;
-    /** IP_LOOKUP_NEXT_INDIRECT only */
-    struct {
-        ip46_address_t next_hop;
-    } indirect;
-    u8 opaque[IP_ADJACENCY_OPAQUE_SZ];
+    union {
+       /**
+        * IP_LOOKUP_NEXT_ARP/IP_LOOKUP_NEXT_REWRITE
+        *
+        * neighbour adjacency sub-type;
+        */
+       struct {
+           ip46_address_t next_hop;
+       } nbr;
+       /**
+        * IP_LOOKUP_NEXT_MIDCHAIN
+        *
+        * A nbr adj that is also recursive. Think tunnels.
+        * A nbr adj can transition to be of type MIDCHAIN
+        * so be sure to leave the two structs with the next_hop
+        * fields aligned.
+        */
+       struct {
+           /**
+            * The recursive next-hop
+            */
+           ip46_address_t next_hop;
+            /**
+             * The node index of the tunnel's post rewrite/TX function.
+             */
+            u32 tx_function_node;
+           /**
+            * The next DPO to use
+            */
+           dpo_id_t next_dpo;
+       } midchain;
+       /**
+        * IP_LOOKUP_NEXT_GLEAN
+        *
+        * Glean the address to ARP for from the packet's destination
+        */
+       struct {
+           ip46_address_t receive_addr;
+       } glean;
+    } sub_type;
+    u16 opaque[IP_ADJACENCY_OPAQUE_SZ];
   };
 
   /** @brief Special format function for this adjacency.
@@ -210,62 +239,31 @@ typedef struct {
    * the first cache line reads "full" on the free space gas gauge.
    */
   u32 special_adjacency_format_function_index;  /* 0 is invalid */
-  STRUCT_MARK(signature_end);
-
-  /** Number of FIB entries sharing this adjacency */
-  u32 share_count;
-  /** Use this adjacency instead */
-  u32 next_adj_with_signature;
 
   CLIB_CACHE_LINE_ALIGN_MARK(cacheline1);
 
-  /** Rewrite in second/third cache lines */
+  /* Rewrite in second/third cache lines */
   vnet_declare_rewrite (VLIB_BUFFER_PRE_DATA_SIZE);
+
+    /*
+     * members not accessed in the data plane are relegated to the
+     * remaining cachelines
+     */
+    fib_node_t ia_node;
 } ip_adjacency_t;
 
-static inline uword
-vnet_ip_adjacency_signature (ip_adjacency_t * adj)
-{
-  uword signature = 0xfeedfaceULL;
-
-  /* Skip heap handle, sum everything up to but not including share_count */
-  signature = hash_memory
-      (STRUCT_MARK_PTR(adj, signature_start),
-       STRUCT_OFFSET_OF(ip_adjacency_t, signature_end)
-       - STRUCT_OFFSET_OF(ip_adjacency_t, signature_start),
-       signature);
-
-  /* and the rewrite */
-  signature = hash_memory (&adj->rewrite_header, VLIB_BUFFER_PRE_DATA_SIZE,
-                             signature);
-  return signature;
-}
+_Static_assert((STRUCT_OFFSET_OF(ip_adjacency_t, cacheline0) == 0),
+              "IP adjacency cachline 0 is not offset");
+_Static_assert((STRUCT_OFFSET_OF(ip_adjacency_t, cacheline1) ==
+               CLIB_CACHE_LINE_BYTES),
+              "IP adjacency cachline 1 is more than one cachline size offset");
 
-static inline int
-vnet_ip_adjacency_share_compare (ip_adjacency_t * a1, ip_adjacency_t *a2)
-{
-  if (memcmp (STRUCT_MARK_PTR(a1, signature_start),
-              STRUCT_MARK_PTR(a2, signature_start),
-              STRUCT_OFFSET_OF(ip_adjacency_t, signature_end)
-              - STRUCT_OFFSET_OF(ip_adjacency_t, signature_start)))
-    return 0;
-  if (memcmp (&a1->rewrite_header, &a2->rewrite_header,
-              VLIB_BUFFER_PRE_DATA_SIZE))
-    return 0;
-  return 1;
-}
+/* An all zeros address */
+extern const ip46_address_t zero_addr;
 
 /* Index into adjacency table. */
 typedef u32 ip_adjacency_index_t;
 
-typedef struct {
-  /* Directly connected next-hop adjacency index. */
-  u32 next_hop_adj_index;
-
-  /* Path weight for this adjacency. */
-  u32 weight;
-} ip_multipath_next_hop_t;
-
 typedef struct {
   /* Adjacency index of first index in block. */
   u32 adj_index;
@@ -276,11 +274,7 @@ typedef struct {
   /* Number of prefixes that point to this adjacency. */
   u32 reference_count;
 
-  /* Normalized next hops are used as hash keys: they are sorted by weight
-     and weights are chosen so they add up to 1 << log2_n_adj_in_block (with
-     zero-weighted next hops being deleted).
-     Unnormalized next hops are saved so that control plane has a record of exactly
-     what the RIB told it. */
+  /* Normalized next hops are saved for stats/display purposes */
   struct {
     /* Number of hops in the multipath. */
     u32 count;
@@ -290,7 +284,7 @@ typedef struct {
 
     /* Heap handle used to for example free block when we're done with it. */
     u32 heap_handle;
-  } normalized_next_hops, unnormalized_next_hops;
+  } normalized_next_hops;
 } ip_multipath_adjacency_t;
 
 /* IP multicast adjacency. */
@@ -397,20 +391,11 @@ typedef struct ip_adj_register_struct {
 } ip_adj_register_t;
 
 typedef struct ip_lookup_main_t {
-  /** Adjacency heap. */
+  /* Adjacency heap. */
   ip_adjacency_t * adjacency_heap;
 
-  /** Adjacency packet/byte counters indexed by adjacency index. */
-  vlib_combined_counter_main_t adjacency_counters;
-
-  /** Heap of (next hop, weight) blocks.  Sorted by next hop. */
-  ip_multipath_next_hop_t * next_hop_heap;
-
-  /** Indexed by heap_handle from ip_adjacency_t. */
-  ip_multipath_adjacency_t * multipath_adjacencies;
-
-  /** Adjacency by signature hash */
-  uword * adj_index_by_signature;
+  /** Load-balance packet/byte counters indexed by LB index. */
+  vlib_combined_counter_main_t load_balance_counters;
 
   /** any-tx-feature-enabled interface bitmap */
   uword * tx_sw_if_has_ip_output_features;
@@ -418,29 +403,6 @@ typedef struct ip_lookup_main_t {
   /** count of enabled features, per sw_if_index, to maintain bitmap */
   i16 * tx_feature_count_by_sw_if_index;
 
-  /** Temporary vectors for looking up next hops in hash. */
-  ip_multipath_next_hop_t * next_hop_hash_lookup_key;
-  ip_multipath_next_hop_t * next_hop_hash_lookup_key_normalized;
-
-  /** Hash table mapping normalized next hops and weights
-     to multipath adjacency index. */
-  uword * multipath_adjacency_by_next_hops;
-
-  u32 * adjacency_remap_table;
-  u32 n_adjacency_remaps;
-
-  /** If average error per adjacency is less than this threshold adjacency block
-     size is accepted. */
-  f64 multipath_next_hop_error_tolerance;
-
-  /** Adjacency index for routing table misses, local punts, and drops. */
-  u32 miss_adj_index, drop_adj_index, local_adj_index;
-
-  /** Miss adjacency is always first in adjacency table. */
-#define IP_LOOKUP_MISS_ADJ_INDEX 0
-
-  ip_add_del_adjacency_callback_t * add_del_adjacency_callbacks;
-
   /** Pool of addresses that are assigned to interfaces. */
   ip_interface_address_t * if_address_pool;
 
@@ -501,54 +463,6 @@ do {                                                               \
   CLIB_PREFETCH (_adj, sizeof (_adj[0]), type);                        \
 } while (0)
 
-/* Adds a next node to ip4 or ip6 lookup node which can be then used in adjacencies.
- * @param vlib_main pointer
- * @param lm ip4_main.lookup_main or ip6_main.lookup_main
- * @param reg registration structure
- * @param next_node_index Returned index to be used in adjacencies.
- * @return 0 on success. -1 on failure.
- */
-int ip_register_adjacency(vlib_main_t *vm, u8 is_ip4,
-                          ip_adj_register_t *reg);
-
-/*
- * Construction helpers to add IP adjacency at init.
- */
-#define VNET_IP_REGISTER_ADJACENCY(ip,x,...)                     \
-  __VA_ARGS__ ip_adj_register_t ip##adj_##x;                     \
-static void __vnet_##ip##_register_adjacency_##x (void)          \
-  __attribute__((__constructor__)) ;                             \
-static void __vnet_##ip##_register_adjacency_##x (void)          \
-{                                                                \
-  ip_lookup_main_t *lm = &ip##_main.lookup_main;                 \
-  ip##adj_##x.next = lm->registered_adjacencies;                 \
-  lm->registered_adjacencies = &ip##adj_##x;                     \
-}                                                                \
-__VA_ARGS__ ip_adj_register_t ip##adj_##x
-
-#define VNET_IP4_REGISTER_ADJACENCY(x,...)                       \
-    VNET_IP_REGISTER_ADJACENCY(ip4, x, __VA_ARGS__)
-
-#define VNET_IP6_REGISTER_ADJACENCY(x,...)                       \
-    VNET_IP_REGISTER_ADJACENCY(ip6, x, __VA_ARGS__)
-
-static inline void
-ip_register_add_del_adjacency_callback(ip_lookup_main_t * lm,
-                                      ip_add_del_adjacency_callback_t cb)
-{
-  vec_add1(lm->add_del_adjacency_callbacks, cb);
-}
-
-always_inline void
-ip_call_add_del_adjacency_callbacks (ip_lookup_main_t * lm, u32 adj_index, u32 is_del)
-{
-  ip_adjacency_t * adj;
-  uword i;
-  adj = ip_get_adjacency (lm, adj_index);
-  for (i = 0; i < vec_len (lm->add_del_adjacency_callbacks); i++)
-    lm->add_del_adjacency_callbacks[i] (lm, adj_index, adj, is_del);
-}
-
 /* Create new block of given number of contiguous adjacencies. */
 ip_adjacency_t *
 ip_add_adjacency (ip_lookup_main_t * lm,
@@ -556,38 +470,6 @@ ip_add_adjacency (ip_lookup_main_t * lm,
                  u32 n_adj,
                  u32 * adj_index_result);
 
-void ip_del_adjacency (ip_lookup_main_t * lm, u32 adj_index);
-void
-ip_update_adjacency (ip_lookup_main_t * lm,
-                    u32 adj_index,
-                    ip_adjacency_t * copy_adj);
-
-static inline int
-ip_adjacency_is_multipath(ip_lookup_main_t * lm, u32 adj_index)
-{
-  if (!vec_len(lm->multipath_adjacencies))
-    return 0;
-
-  if (vec_len(lm->multipath_adjacencies) < adj_index - 1)
-    return 0;
-
-
-  return (lm->multipath_adjacencies[adj_index].adj_index == adj_index &&
-         lm->multipath_adjacencies[adj_index].n_adj_in_block > 0);
-}
-
-void
-ip_multipath_adjacency_free (ip_lookup_main_t * lm,
-                            ip_multipath_adjacency_t * a);
-
-u32
-ip_multipath_adjacency_add_del_next_hop (ip_lookup_main_t * lm,
-                                        u32 is_del,
-                                        u32 old_mp_adj_index,
-                                        u32 next_hop_adj_index,
-                                        u32 next_hop_weight,
-                                        u32 * new_mp_adj_index);
-
 clib_error_t *
 ip_interface_address_add_del (ip_lookup_main_t * lm,
                              u32 sw_if_index,
@@ -596,6 +478,9 @@ ip_interface_address_add_del (ip_lookup_main_t * lm,
                              u32 is_del,
                              u32 * result_index);
 
+u8 *
+format_ip_flow_hash_config (u8 * s, va_list * args);
+
 always_inline ip_interface_address_t *
 ip_get_interface_address (ip_lookup_main_t * lm, void * addr_fib)
 {
@@ -603,28 +488,14 @@ ip_get_interface_address (ip_lookup_main_t * lm, void * addr_fib)
   return p ? pool_elt_at_index (lm->if_address_pool, p[0]) : 0;
 }
 
+u32
+fib_table_id_find_fib_index (fib_protocol_t proto,
+                            u32 table_id);
+
 always_inline void *
 ip_interface_address_get_address (ip_lookup_main_t * lm, ip_interface_address_t * a)
 { return mhash_key_to_mem (&lm->address_to_if_address_index, a->address_key); }
 
-always_inline ip_interface_address_t *
-ip_interface_address_for_packet (ip_lookup_main_t * lm, vlib_buffer_t * b, u32 sw_if_index)
-{
-  ip_adjacency_t * adj;
-  u32 if_address_index;
-
-  adj = ip_get_adjacency (lm, vnet_buffer (b)->ip.adj_index[VLIB_TX]);
-
-  ASSERT (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP
-         || adj->lookup_next_index == IP_LOOKUP_NEXT_LOCAL);
-  if_address_index = adj->if_address_index;
-  if_address_index = (if_address_index == ~0 ?
-                     vec_elt (lm->if_address_pool_index_by_sw_if_index, sw_if_index)
-                     : if_address_index);
-
-  return (if_address_index != ~0)?pool_elt_at_index (lm->if_address_pool, if_address_index):NULL;
-}
-
 #define foreach_ip_interface_address(lm,a,sw_if_index,loop,body)        \
 do {                                                                    \
     vnet_main_t *_vnm = vnet_get_main();                                     \
@@ -653,7 +524,5 @@ do {                                                                    \
 } while (0)
 
 void ip_lookup_init (ip_lookup_main_t * lm, u32 ip_lookup_node_index);
-u32 vnet_register_special_adjacency_format_function 
-(ip_lookup_main_t * lm, format_function_t * fp);
 
 #endif /* included_ip_lookup_h */
index b5842a6..3bc4da8 100644 (file)
@@ -14,6 +14,9 @@
  */
 
 #include <vnet/ip/ping.h>
+#include <vnet/fib/ip6_fib.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/fib/fib_entry.h>
 
 u8 *
 format_icmp4_input_trace (u8 * s, va_list * va)
@@ -278,7 +281,14 @@ send_ip6_ping (vlib_main_t * vm, ip6_main_t * im, ip6_address_t * pa6,
   vnet_buffer (p0)->sw_if_index[VLIB_RX] = 0;
   vnet_buffer (p0)->sw_if_index[VLIB_TX] = ~0;  /* use interface VRF */
   fib_index0 = 0;
-  adj_index0 = ip6_fib_lookup_with_table (im, fib_index0, pa6);
+  adj_index0 = fib_entry_get_adj(ip6_fib_table_lookup(fib_index0, pa6, 128));
+
+  if (ADJ_INDEX_INVALID == adj_index0)
+    {
+      vlib_buffer_free (vm, &bi0, 1);
+      return SEND_PING_NO_INTERFACE;
+    }
+
   sw_if_index0 =
     adj_index_to_sw_if_index (vm, lm, ip6_lookup_next_nodes, adj_index0,
                               sw_if_index, verbose);
@@ -362,7 +372,15 @@ send_ip4_ping (vlib_main_t * vm,
   vnet_buffer (p0)->sw_if_index[VLIB_RX] = 0;
   vnet_buffer (p0)->sw_if_index[VLIB_TX] = ~0;  /* use interface VRF */
   fib_index0 = 0;
-  adj_index0 = ip4_fib_lookup_with_table (im, fib_index0, pa4, 0);
+  adj_index0 = fib_entry_get_adj(ip4_fib_table_lookup(
+                                    ip4_fib_get(fib_index0), pa4, 32));
+
+  if (ADJ_INDEX_INVALID == adj_index0)
+    {
+      vlib_buffer_free (vm, &bi0, 1);
+      return SEND_PING_NO_INTERFACE;
+    }
+
   sw_if_index0 =
     adj_index_to_sw_if_index (vm, lm, ip4_lookup_next_nodes, adj_index0,
                               sw_if_index, verbose);
index 1cf525c..1845fa7 100644 (file)
@@ -115,14 +115,13 @@ void udp_register_dst_port (vlib_main_t * vm,
                             u32 node_index, u8 is_ip4);
 
 always_inline void
-ip_udp_encap_one (vlib_main_t * vm, vlib_buffer_t * b0, u8 * ec0, word ec_len,
+ip_udp_fixup_one (vlib_main_t * vm,
+                  vlib_buffer_t * b0,
                   u8 is_ip4)
 {
   u16 new_l0;
   udp_header_t * udp0;
 
-  vlib_buffer_advance (b0, - ec_len);
-
   if (is_ip4)
     {
       ip4_header_t * ip0;
@@ -131,9 +130,6 @@ ip_udp_encap_one (vlib_main_t * vm, vlib_buffer_t * b0, u8 * ec0, word ec_len,
 
       ip0 = vlib_buffer_get_current(b0);
 
-      /* Apply the encap string. */
-      clib_memcpy(ip0, ec0, ec_len);
-
       /* fix the <bleep>ing outer-IP checksum */
       sum0 = ip0->checksum;
       /* old_l0 always 0, see the rewrite setup */
@@ -157,9 +153,6 @@ ip_udp_encap_one (vlib_main_t * vm, vlib_buffer_t * b0, u8 * ec0, word ec_len,
 
       ip0 = vlib_buffer_get_current(b0);
 
-      /* Apply the encap string. */
-      clib_memcpy(ip0, ec0, ec_len);
-
       new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)
                                      - sizeof (*ip0));
       ip0->payload_length = new_l0;
@@ -175,6 +168,33 @@ ip_udp_encap_one (vlib_main_t * vm, vlib_buffer_t * b0, u8 * ec0, word ec_len,
           udp0->checksum = 0xffff;
     }
 }
+/**
+ * Move the current-data pointer of b0 by -ec_len, copy the ec_len byte
+ * encap string ec0 to the new start of the buffer and then have
+ * ip_udp_fixup_one() fix up the headers that were just written.
+ */
+always_inline void
+ip_udp_encap_one (vlib_main_t * vm, vlib_buffer_t * b0, u8 * ec0, word ec_len,
+                  u8 is_ip4)
+{
+  void * hdr0;
+
+  vlib_buffer_advance (b0, - ec_len);
+  hdr0 = vlib_buffer_get_current(b0);
+
+  /* Apply the encap string. */
+  clib_memcpy(hdr0, ec0, ec_len);
+
+  /* Each arm hands the address family to the fixup as a literal. */
+  if (is_ip4)
+    ip_udp_fixup_one(vm, b0, 1);
+  else
+    ip_udp_fixup_one(vm, b0, 0);
+}
 
 always_inline void
 ip_udp_encap_two (vlib_main_t * vm, vlib_buffer_t * b0, vlib_buffer_t * b1,
index 3d1b54f..cf0f391 100644 (file)
 
 ipsec_gre_main_t ipsec_gre_main;
 
-/**
- * @brief IPv4 and GRE header.
- *
-*/
-/* *INDENT-OFF* */
-typedef CLIB_PACKED (struct
-{
-  ip4_header_t ip4;
-  gre_header_t gre;
-}) ip4_and_gre_header_t;
-/* *INDENT-OFF* */
-
 /**
  * @brief IPv4 and GRE header union.
  *
index 16d7bfa..5de30d5 100644 (file)
@@ -18,6 +18,8 @@
 #include <vnet/lisp-cp/packets.h>
 #include <vnet/lisp-cp/lisp_msg_serdes.h>
 #include <vnet/lisp-gpe/lisp_gpe.h>
+#include <vnet/fib/fib_entry.h>
+#include <vnet/fib/fib_table.h>
 
 typedef struct
 {
@@ -74,37 +76,36 @@ ip_interface_get_first_ip_address (lisp_cp_main_t * lcm, u32 sw_if_index,
   return 1;
 }
 
-static u32
-ip_fib_lookup_with_table (lisp_cp_main_t * lcm, u32 fib_index,
-                         ip_address_t * dst)
+/**
+ * convert from a LISP address to a FIB prefix
+ */
+void
+ip_address_to_fib_prefix (const ip_address_t * addr, fib_prefix_t * prefix)
 {
-  if (ip_addr_version (dst) == IP4)
-    return ip4_fib_lookup_with_table (lcm->im4, fib_index, &ip_addr_v4 (dst),
-                                     0);
+  if (addr->version == IP4)
+    {
+      prefix->fp_len = 32;
+      prefix->fp_proto = FIB_PROTOCOL_IP4;
+      memset (&prefix->fp_addr.pad, 0, sizeof (prefix->fp_addr.pad));
+      memcpy (&prefix->fp_addr.ip4, &addr->ip, sizeof (prefix->fp_addr.ip4));
+    }
   else
-    return ip6_fib_lookup_with_table (lcm->im6, fib_index, &ip_addr_v6 (dst));
+    {
+      prefix->fp_len = 128;
+      prefix->fp_proto = FIB_PROTOCOL_IP6;
+      memcpy (&prefix->fp_addr.ip6, &addr->ip, sizeof (prefix->fp_addr.ip6));
+    }
 }
 
-u32
-ip_fib_get_egress_iface_for_dst_with_lm (lisp_cp_main_t * lcm,
-                                        ip_address_t * dst,
-                                        ip_lookup_main_t * lm)
+/**
+ * convert from a LISP prefix to a FIB prefix
+ */
+void
+ip_prefix_to_fib_prefix (const ip_prefix_t * ip_prefix,
+                        fib_prefix_t * fib_prefix)
 {
-  u32 adj_index;
-  ip_adjacency_t *adj;
-
-  adj_index = ip_fib_lookup_with_table (lcm, 0, dst);
-  adj = ip_get_adjacency (lm, adj_index);
-
-  if (adj == 0)
-    return ~0;
-
-  /* we only want outgoing routes */
-  if (adj->lookup_next_index != IP_LOOKUP_NEXT_ARP
-      && adj->lookup_next_index != IP_LOOKUP_NEXT_REWRITE)
-    return ~0;
-
-  return adj->rewrite_header.sw_if_index;
+  ip_address_to_fib_prefix (&ip_prefix->addr, fib_prefix);
+  fib_prefix->fp_len = ip_prefix->len;
 }
 
 /**
@@ -114,12 +115,14 @@ ip_fib_get_egress_iface_for_dst_with_lm (lisp_cp_main_t * lcm,
 u32
 ip_fib_get_egress_iface_for_dst (lisp_cp_main_t * lcm, ip_address_t * dst)
 {
-  ip_lookup_main_t *lm;
+  fib_node_index_t fei;
+  fib_prefix_t prefix;
+
+  ip_address_to_fib_prefix (dst, &prefix);
 
-  lm = ip_addr_version (dst) == IP4 ?
-    &lcm->im4->lookup_main : &lcm->im6->lookup_main;
+  fei = fib_table_lookup (0, &prefix);
 
-  return ip_fib_get_egress_iface_for_dst_with_lm (lcm, dst, lm);
+  return (fib_entry_get_resolving_interface (fei));
 }
 
 /**
@@ -140,7 +143,7 @@ ip_fib_get_first_egress_ip_for_dst (lisp_cp_main_t * lcm, ip_address_t * dst,
   ipver = ip_addr_version (dst);
 
   lm = (ipver == IP4) ? &lcm->im4->lookup_main : &lcm->im6->lookup_main;
-  si = ip_fib_get_egress_iface_for_dst_with_lm (lcm, dst, lm);
+  si = ip_fib_get_egress_iface_for_dst (lcm, dst);
 
   if ((u32) ~ 0 == si)
     return 0;
@@ -2871,28 +2874,14 @@ lisp_get_vni_from_buffer_ip (lisp_cp_main_t * lcm, vlib_buffer_t * b,
                             u8 version)
 {
   uword *vnip;
-  u32 vni = ~0, table_id = ~0, fib_index;
+  u32 vni = ~0, table_id = ~0;
 
-  if (version == IP4)
-    {
-      ip4_fib_t *fib;
-      ip4_main_t *im4 = &ip4_main;
-      fib_index = vec_elt (im4->fib_index_by_sw_if_index,
-                          vnet_buffer (b)->sw_if_index[VLIB_RX]);
-      fib = find_ip4_fib_by_table_index_or_id (im4, fib_index,
-                                              IP4_ROUTE_FLAG_FIB_INDEX);
-      table_id = fib->table_id;
-    }
-  else
-    {
-      ip6_fib_t *fib;
-      ip6_main_t *im6 = &ip6_main;
-      fib_index = vec_elt (im6->fib_index_by_sw_if_index,
-                          vnet_buffer (b)->sw_if_index[VLIB_RX]);
-      fib = find_ip6_fib_by_table_index_or_id (im6, fib_index,
-                                              IP6_ROUTE_FLAG_FIB_INDEX);
-      table_id = fib->table_id;
-    }
+  table_id =
+    fib_table_get_table_id_for_sw_if_index (vnet_buffer (b)->sw_if_index
+                                           [VLIB_RX],
+                                           (version ==
+                                            IP4 ? FIB_PROTOCOL_IP4 :
+                                            FIB_PROTOCOL_IP6));
 
   vnip = hash_get (lcm->vni_by_table_id, table_id);
   if (vnip)
@@ -2979,8 +2968,9 @@ get_src_and_dst_eids_from_buffer (lisp_cp_main_t * lcm, vlib_buffer_t * b,
 }
 
 static uword
-lisp_cp_lookup (vlib_main_t * vm, vlib_node_runtime_t * node,
-               vlib_frame_t * from_frame)
+lisp_cp_lookup_inline (vlib_main_t * vm,
+                      vlib_node_runtime_t * node,
+                      vlib_frame_t * from_frame, int overlay)
 {
   u32 *from, *to_next_drop, di, si;
   lisp_cp_main_t *lcm = vnet_lisp_cp_get_main ();
@@ -3010,6 +3000,7 @@ lisp_cp_lookup (vlib_main_t * vm, vlib_node_runtime_t * node,
 
          b0 = vlib_get_buffer (vm, pi0);
          b0->error = node->errors[LISP_CP_LOOKUP_ERROR_DROP];
+         vnet_buffer (b0)->lisp.overlay_afi = overlay;
 
          /* src/dst eid pair */
          get_src_and_dst_eids_from_buffer (lcm, b0, &src, &dst);
@@ -3070,10 +3061,45 @@ lisp_cp_lookup (vlib_main_t * vm, vlib_node_runtime_t * node,
   return from_frame->n_vectors;
 }
 
+/**
+ * Node function of "lisp-cp-lookup-ip4": hands the frame to
+ * lisp_cp_lookup_inline() with LISP_AFI_IP as the overlay argument.
+ */
+static uword
+lisp_cp_lookup_ip4 (vlib_main_t * vm,
+                   vlib_node_runtime_t * node, vlib_frame_t * from_frame)
+{
+  return (lisp_cp_lookup_inline (vm, node, from_frame, LISP_AFI_IP));
+}
+
+/**
+ * Node function of "lisp-cp-lookup-ip6": hands the frame to
+ * lisp_cp_lookup_inline() with LISP_AFI_IP6 as the overlay argument.
+ */
+static uword
+lisp_cp_lookup_ip6 (vlib_main_t * vm,
+                   vlib_node_runtime_t * node, vlib_frame_t * from_frame)
+{
+  return (lisp_cp_lookup_inline (vm, node, from_frame, LISP_AFI_IP6));
+}
+
+/*
+ * Registration of the "lisp-cp-lookup-ip4" graph node, whose node function
+ * is lisp_cp_lookup_ip4(). Its next nodes are error-drop, ip4-lookup and
+ * ip6-lookup.
+ */
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (lisp_cp_lookup_ip4_node) = {
+  .function = lisp_cp_lookup_ip4,
+  .name = "lisp-cp-lookup-ip4",
+  .vector_size = sizeof (u32),
+  .format_trace = format_lisp_cp_lookup_trace,
+  .type = VLIB_NODE_TYPE_INTERNAL,
+
+  .n_errors = LISP_CP_LOOKUP_N_ERROR,
+  .error_strings = lisp_cp_lookup_error_strings,
+
+  .n_next_nodes = LISP_CP_LOOKUP_N_NEXT,
+
+  .next_nodes = {
+      [LISP_CP_LOOKUP_NEXT_DROP] = "error-drop",
+      [LISP_CP_LOOKUP_NEXT_IP4_LOOKUP] = "ip4-lookup",
+      [LISP_CP_LOOKUP_NEXT_IP6_LOOKUP] = "ip6-lookup",
+  },
+};
+/* *INDENT-ON* */
+
 /* *INDENT-OFF* */
-VLIB_REGISTER_NODE (lisp_cp_lookup_node) = {
-  .function = lisp_cp_lookup,
-  .name = "lisp-cp-lookup",
+VLIB_REGISTER_NODE (lisp_cp_lookup_ip6_node) = {
+  .function = lisp_cp_lookup_ip6,
+  .name = "lisp-cp-lookup-ip6",
   .vector_size = sizeof (u32),
   .format_trace = format_lisp_cp_lookup_trace,
   .type = VLIB_NODE_TYPE_INTERNAL,
index 76590b2..02efd04 100644 (file)
@@ -149,7 +149,8 @@ typedef struct
 lisp_cp_main_t lisp_control_main;
 
 extern vlib_node_registration_t lisp_cp_input_node;
-extern vlib_node_registration_t lisp_cp_lookup_node;
+extern vlib_node_registration_t lisp_cp_lookup_ip4_node;
+extern vlib_node_registration_t lisp_cp_lookup_ip6_node;
 
 clib_error_t *lisp_cp_init ();
 
diff --git a/vnet/vnet/lisp-cp/lisp_cp_dpo.c b/vnet/vnet/lisp-cp/lisp_cp_dpo.c
new file mode 100644 (file)
index 0000000..0bb8098
--- /dev/null
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/dpo/dpo.h>
+#include <vnet/lisp-gpe/lisp_gpe.h>
+#include <vnet/lisp-cp/control.h>
+
+/**
+ * Return the index of the LISP-CP DPO for protocol proto. The index is the
+ * protocol value itself.
+ */
+index_t
+lisp_cp_dpo_get (fib_protocol_t proto)
+{
+    /*
+     * there are only two instances of this DPO type.
+     * we can use the protocol as the index
+     */
+    return (proto);
+}
+
+/**
+ * Format callback installed as dv_format in lisp_cp_vft. Appends
+ * "lisp-cp-punt-" followed by the DPO index rendered with
+ * format_fib_protocol. The second argument (an indent) is read from the
+ * va_list but not used.
+ */
+static u8*
+format_lisp_cp_dpo (u8 *s, va_list *args)
+{
+    index_t lcdi = va_arg (*args, index_t);
+    CLIB_UNUSED(u32 indent) = va_arg (*args, u32);
+
+    s = format(s, "lisp-cp-punt-%U", format_fib_protocol, lcdi);
+
+    return (s);
+}
+
+/**
+ * dv_lock callback of lisp_cp_vft. The body is empty.
+ * NOTE(review): presumably there is nothing to reference-count because the
+ * DPO index is just the protocol value - confirm.
+ */
+static void
+lisp_cp_dpo_lock (dpo_id_t *dpo)
+{
+}
+
+/**
+ * dv_unlock callback of lisp_cp_vft. The body is empty.
+ * NOTE(review): presumably there is nothing to release because the DPO
+ * index is just the protocol value - confirm.
+ */
+static void
+lisp_cp_dpo_unlock (dpo_id_t *dpo)
+{
+}
+
+/**
+ * Virtual function table (lock, unlock and format callbacks) handed to
+ * dpo_register() for the DPO_LISP_CP type.
+ */
+const static dpo_vft_t lisp_cp_vft = {
+    .dv_lock = lisp_cp_dpo_lock,
+    .dv_unlock = lisp_cp_dpo_unlock,
+    .dv_format = format_lisp_cp_dpo,
+};
+
+/**
+ * @brief The per-protocol VLIB graph nodes that are assigned to a LISP-CP
+ *        object.
+ *
+ * This means that these graph nodes are the ones from which a LISP-CP DPO
+ * is the parent object in the DPO-graph. Each list is NULL terminated.
+ */
+const static char* const lisp_cp_ip4_nodes[] =
+{
+    "lisp-cp-lookup-ip4",
+    NULL,
+};
+const static char* const lisp_cp_ip6_nodes[] =
+{
+    "lisp-cp-lookup-ip6",
+    NULL,
+};
+
+/**
+ * The node lists indexed by DPO protocol, as passed to dpo_register().
+ * There is no node list for MPLS.
+ */
+const static char* const * const lisp_cp_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = lisp_cp_ip4_nodes,
+    [DPO_PROTO_IP6]  = lisp_cp_ip6_nodes,
+    [DPO_PROTO_MPLS] = NULL,
+};
+
+/**
+ * Init function, hooked in through VLIB_INIT_FUNCTION, that registers the
+ * LISP-CP DPO type. Always returns NULL.
+ */
+clib_error_t *
+lisp_cp_dpo_module_init (vlib_main_t * vm)
+{
+    /*
+     * register the LISP-CP DPO type together with its virtual function
+     * table and the per-protocol lists of VLIB graph nodes (lisp_cp_nodes).
+     */
+    dpo_register(DPO_LISP_CP, &lisp_cp_vft, lisp_cp_nodes);
+
+    return (NULL);
+}
+
+VLIB_INIT_FUNCTION(lisp_cp_dpo_module_init);
diff --git a/vnet/vnet/lisp-cp/lisp_cp_dpo.h b/vnet/vnet/lisp-cp/lisp_cp_dpo.h
new file mode 100644 (file)
index 0000000..ea97711
--- /dev/null
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LISP_CP_DPO_H__
+#define __LISP_CP_DPO_H__
+
+#include <vnet/vnet.h>
+#include <vnet/fib/fib_types.h>
+#include <vnet/dpo/dpo.h>
+
+/**
+ * A representation of punt to the LISP control plane.
+ *
+ * NOTE(review): lisp_cp_dpo.c never instantiates this structure; it uses
+ * the protocol value itself as the DPO index (lisp_cp_dpo_get) - confirm
+ * whether the type is still needed.
+ */
+typedef struct lisp_cp_dpo_t
+{
+    /**
+     * The transport payload type.
+     */
+    fib_protocol_t lcd_proto;
+} lisp_cp_dpo_t;
+
+/**
+ * Get the index of the LISP-CP DPO for the given protocol.
+ */
+extern index_t lisp_cp_dpo_get(fib_protocol_t proto);
+
+/**
+ * Init function that registers the LISP-CP DPO type. The prototype matches
+ * the definition in lisp_cp_dpo.c, which takes the vlib main and returns a
+ * clib_error_t pointer (always NULL in that definition).
+ */
+extern clib_error_t *lisp_cp_dpo_module_init(vlib_main_t * vm);
+
+#endif
index b4fb1d9..a2edb48 100644 (file)
@@ -147,6 +147,8 @@ uword
 unformat_ip_address (unformat_input_t * input, va_list * args)
 {
   ip_address_t *a = va_arg (*args, ip_address_t *);
+
+  memset (a, 0, sizeof (*a));
   if (unformat (input, "%U", unformat_ip4_address, &ip_addr_v4 (a)))
     ip_addr_version (a) = IP4;
   else if (unformat_user (input, unformat_ip6_address, &ip_addr_v6 (a)))
@@ -331,8 +333,32 @@ unformat_negative_mapping_action (unformat_input_t * input, va_list * args)
   return 1;
 }
 
+/**
+ * Format a lisp_action_e (read from the va_list) as text. Any value other
+ * than LISP_NO_ACTION, LISP_FORWARD_NATIVE or LISP_SEND_MAP_REQUEST is
+ * printed as "drop".
+ */
+u8 *
+format_negative_mapping_action (u8 * s, va_list * args)
+{
+  lisp_action_e act = va_arg (*args, lisp_action_e);
+
+  if (LISP_NO_ACTION == act)
+    return (format (s, "no-action"));
+
+  if (LISP_FORWARD_NATIVE == act)
+    return (format (s, "natively-forward"));
+
+  if (LISP_SEND_MAP_REQUEST == act)
+    return (format (s, "send-map-request"));
+
+  /* LISP_DROP and every unrecognised value */
+  return (format (s, "drop"));
+}
+
 u16
-ip_address_size (ip_address_t * a)
+ip_address_size (const ip_address_t * a)
 {
   switch (ip_addr_version (a))
     {
@@ -653,7 +679,7 @@ gid_address_free (gid_address_t * a)
 }
 
 int
-ip_address_cmp (ip_address_t * ip1, ip_address_t * ip2)
+ip_address_cmp (const ip_address_t * ip1, const ip_address_t * ip2)
 {
   int res = 0;
   if (ip_addr_version (ip1) != ip_addr_version (ip2))
@@ -670,19 +696,19 @@ ip_address_cmp (ip_address_t * ip1, ip_address_t * ip2)
 }
 
 void
-ip_address_copy (ip_address_t * dst, ip_address_t * src)
+ip_address_copy (ip_address_t * dst, const ip_address_t * src)
 {
   clib_memcpy (dst, src, sizeof (ip_address_t));
 }
 
 void
-ip_address_copy_addr (void *dst, ip_address_t * src)
+ip_address_copy_addr (void *dst, const ip_address_t * src)
 {
   clib_memcpy (dst, src, ip_address_size (src));
 }
 
 void
-ip_address_set (ip_address_t * dst, void *src, u8 version)
+ip_address_set (ip_address_t * dst, const void *src, u8 version)
 {
   clib_memcpy (dst, src, ip_version_to_size (version));
   ip_addr_version (dst) = version;
index cb1b277..cd1d1b9 100644 (file)
@@ -42,10 +42,10 @@ typedef CLIB_PACKED(struct ip_address
 #define ip_addr_v6(_a) (_a)->ip.v6
 #define ip_addr_version(_a) (_a)->version
 
-int ip_address_cmp (ip_address_t * ip1, ip_address_t * ip2);
-void ip_address_copy (ip_address_t * dst, ip_address_t * src);
-void ip_address_copy_addr (void *dst, ip_address_t * src);
-void ip_address_set (ip_address_t * dst, void *src, u8 version);
+int ip_address_cmp (const ip_address_t * ip1, const ip_address_t * ip2);
+void ip_address_copy (ip_address_t * dst, const ip_address_t * src);
+void ip_address_copy_addr (void *dst, const ip_address_t * src);
+void ip_address_set (ip_address_t * dst, const void *src, u8 version);
 
 /* *INDENT-OFF* */
 typedef CLIB_PACKED(struct ip_prefix
@@ -63,6 +63,11 @@ typedef CLIB_PACKED(struct ip_prefix
 
 void ip_prefix_normalize (ip_prefix_t * a);
 
+extern void ip_address_to_fib_prefix (const ip_address_t * addr,
+                                     fib_prefix_t * prefix);
+extern void ip_prefix_to_fib_prefix (const ip_prefix_t * ipp,
+                                    fib_prefix_t * fibp);
+
 typedef enum
 {
   /* NOTE: ip addresses are left out on purpose. Use max masked ip-prefixes
@@ -107,6 +112,7 @@ typedef fid_address_t dp_address_t;
 #define fid_addr_ippref(_a) (_a)->ippref
 #define fid_addr_mac(_a) (_a)->mac
 #define fid_addr_type(_a) (_a)->type
+u8 *format_fid_address (u8 * s, va_list * args);
 
 typedef struct
 {
@@ -293,6 +299,7 @@ typedef struct
 
 uword
 unformat_negative_mapping_action (unformat_input_t * input, va_list * args);
+u8 *format_negative_mapping_action (u8 *, va_list * args);
 
 typedef struct locator_pair
 {
index abfdfdb..52db1eb 100644 (file)
 #include <vnet/ip/udp.h>
 #include <vnet/ethernet/ethernet.h>
 #include <vnet/lisp-gpe/lisp_gpe.h>
+#include <vnet/adj/adj.h>
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/fib/ip6_fib.h>
 
 #define foreach_lisp_gpe_tx_next        \
   _(DROP, "error-drop")                 \
@@ -56,147 +60,6 @@ format_lisp_gpe_tx_trace (u8 * s, va_list * args)
   return s;
 }
 
-always_inline void
-get_one_tunnel_inline (lisp_gpe_main_t * lgm, vlib_buffer_t * b0,
-                      lisp_gpe_tunnel_t ** t0, u8 is_v4)
-{
-  u32 adj_index0, tunnel_index0;
-  ip_adjacency_t *adj0;
-
-  /* Get adjacency and from it the tunnel_index */
-  adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
-
-  if (is_v4)
-    adj0 = ip_get_adjacency (lgm->lm4, adj_index0);
-  else
-    adj0 = ip_get_adjacency (lgm->lm6, adj_index0);
-
-  tunnel_index0 = adj0->if_address_index;
-  t0[0] = pool_elt_at_index (lgm->tunnels, tunnel_index0);
-
-  ASSERT (t0[0] != 0);
-}
-
-always_inline void
-encap_one_inline (lisp_gpe_main_t * lgm, vlib_buffer_t * b0,
-                 lisp_gpe_tunnel_t * t0, u32 * next0)
-{
-  ASSERT (sizeof (ip4_udp_lisp_gpe_header_t) == 36);
-  ASSERT (sizeof (ip6_udp_lisp_gpe_header_t) == 56);
-
-  lisp_gpe_sub_tunnel_t *st0;
-  u32 *sti0;
-
-  sti0 = vec_elt_at_index (t0->sub_tunnels_lbv,
-                          vnet_buffer (b0)->ip.flow_hash %
-                          t0->sub_tunnels_lbv_count);
-  st0 = vec_elt_at_index (t0->sub_tunnels, sti0[0]);
-  if (st0->is_ip4)
-    {
-      ip_udp_encap_one (lgm->vlib_main, b0, st0->rewrite, 36, 1);
-      next0[0] = LISP_GPE_TX_NEXT_IP4_LOOKUP;
-    }
-  else
-    {
-      ip_udp_encap_one (lgm->vlib_main, b0, st0->rewrite, 56, 0);
-      next0[0] = LISP_GPE_TX_NEXT_IP6_LOOKUP;
-    }
-
-  /* Reset to look up tunnel partner in the configured FIB */
-  vnet_buffer (b0)->sw_if_index[VLIB_TX] = t0->encap_fib_index;
-}
-
-always_inline void
-get_two_tunnels_inline (lisp_gpe_main_t * lgm, vlib_buffer_t * b0,
-                       vlib_buffer_t * b1, lisp_gpe_tunnel_t ** t0,
-                       lisp_gpe_tunnel_t ** t1, u8 is_v4)
-{
-  u32 adj_index0, adj_index1, tunnel_index0, tunnel_index1;
-  ip_adjacency_t *adj0, *adj1;
-
-  /* Get adjacency and from it the tunnel_index */
-  adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
-  adj_index1 = vnet_buffer (b1)->ip.adj_index[VLIB_TX];
-
-  if (is_v4)
-    {
-      adj0 = ip_get_adjacency (lgm->lm4, adj_index0);
-      adj1 = ip_get_adjacency (lgm->lm4, adj_index1);
-    }
-  else
-    {
-      adj0 = ip_get_adjacency (lgm->lm6, adj_index0);
-      adj1 = ip_get_adjacency (lgm->lm6, adj_index1);
-    }
-
-  tunnel_index0 = adj0->if_address_index;
-  tunnel_index1 = adj1->if_address_index;
-
-  t0[0] = pool_elt_at_index (lgm->tunnels, tunnel_index0);
-  t1[0] = pool_elt_at_index (lgm->tunnels, tunnel_index1);
-
-  ASSERT (t0[0] != 0);
-  ASSERT (t1[0] != 0);
-}
-
-always_inline void
-encap_two_inline (lisp_gpe_main_t * lgm, vlib_buffer_t * b0,
-                 vlib_buffer_t * b1, lisp_gpe_tunnel_t * t0,
-                 lisp_gpe_tunnel_t * t1, u32 * next0, u32 * next1)
-{
-  ASSERT (sizeof (ip4_udp_lisp_gpe_header_t) == 36);
-  ASSERT (sizeof (ip6_udp_lisp_gpe_header_t) == 56);
-
-  lisp_gpe_sub_tunnel_t *st0, *st1;
-  u32 *sti0, *sti1;
-  sti0 = vec_elt_at_index (t0->sub_tunnels_lbv,
-                          vnet_buffer (b0)->ip.flow_hash %
-                          t0->sub_tunnels_lbv_count);
-  sti1 =
-    vec_elt_at_index (t1->sub_tunnels_lbv,
-                     vnet_buffer (b1)->ip.flow_hash %
-                     t1->sub_tunnels_lbv_count);
-  st0 = vec_elt_at_index (t0->sub_tunnels, sti0[0]);
-  st1 = vec_elt_at_index (t1->sub_tunnels, sti1[0]);
-
-  if (PREDICT_TRUE (st0->is_ip4 == st1->is_ip4))
-    {
-      if (st0->is_ip4)
-       {
-         ip_udp_encap_one (lgm->vlib_main, b0, st0->rewrite, 36, 1);
-         ip_udp_encap_one (lgm->vlib_main, b1, st1->rewrite, 36, 1);
-         next0[0] = next1[0] = LISP_GPE_TX_NEXT_IP4_LOOKUP;
-       }
-      else
-       {
-         ip_udp_encap_one (lgm->vlib_main, b0, st0->rewrite, 56, 0);
-         ip_udp_encap_one (lgm->vlib_main, b1, st1->rewrite, 56, 0);
-         next0[0] = next1[0] = LISP_GPE_TX_NEXT_IP6_LOOKUP;
-       }
-    }
-  else
-    {
-      if (st0->is_ip4)
-       {
-         ip_udp_encap_one (lgm->vlib_main, b0, st0->rewrite, 36, 1);
-         ip_udp_encap_one (lgm->vlib_main, b1, st1->rewrite, 56, 1);
-         next0[0] = LISP_GPE_TX_NEXT_IP4_LOOKUP;
-         next1[0] = LISP_GPE_TX_NEXT_IP6_LOOKUP;
-       }
-      else
-       {
-         ip_udp_encap_one (lgm->vlib_main, b0, st0->rewrite, 56, 1);
-         ip_udp_encap_one (lgm->vlib_main, b1, st1->rewrite, 36, 1);
-         next0[0] = LISP_GPE_TX_NEXT_IP6_LOOKUP;
-         next1[0] = LISP_GPE_TX_NEXT_IP4_LOOKUP;
-       }
-    }
-
-  /* Reset to look up tunnel partner in the configured FIB */
-  vnet_buffer (b0)->sw_if_index[VLIB_TX] = t0->encap_fib_index;
-  vnet_buffer (b1)->sw_if_index[VLIB_TX] = t1->encap_fib_index;
-}
-
 #define is_v4_packet(_h) ((*(u8*) _h) & 0xF0) == 0x40
 
 /**
@@ -233,81 +96,12 @@ lisp_gpe_interface_tx (vlib_main_t * vm, vlib_node_runtime_t * node,
 
       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
 
-      while (n_left_from >= 4 && n_left_to_next >= 2)
-       {
-         u32 bi0, bi1;
-         vlib_buffer_t *b0, *b1;
-         u32 next0, next1;
-         lisp_gpe_tunnel_t *t0 = 0, *t1 = 0;
-         u8 is_v4_eid0, is_v4_eid1;
-
-         next0 = next1 = LISP_GPE_TX_NEXT_IP4_LOOKUP;
-
-         /* Prefetch next iteration. */
-         {
-           vlib_buffer_t *p2, *p3;
-
-           p2 = vlib_get_buffer (vm, from[2]);
-           p3 = vlib_get_buffer (vm, from[3]);
-
-           vlib_prefetch_buffer_header (p2, LOAD);
-           vlib_prefetch_buffer_header (p3, LOAD);
-
-           CLIB_PREFETCH (p2->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
-           CLIB_PREFETCH (p3->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
-         }
-
-         bi0 = from[0];
-         bi1 = from[1];
-         to_next[0] = bi0;
-         to_next[1] = bi1;
-         from += 2;
-         to_next += 2;
-         n_left_to_next -= 2;
-         n_left_from -= 2;
-
-         b0 = vlib_get_buffer (vm, bi0);
-         b1 = vlib_get_buffer (vm, bi1);
-
-         is_v4_eid0 = is_v4_packet (vlib_buffer_get_current (b0));
-         is_v4_eid1 = is_v4_packet (vlib_buffer_get_current (b1));
-
-         if (PREDICT_TRUE (is_v4_eid0 == is_v4_eid1))
-           {
-             get_two_tunnels_inline (lgm, b0, b1, &t0, &t1,
-                                     is_v4_eid0 ? 1 : 0);
-           }
-         else
-           {
-             get_one_tunnel_inline (lgm, b0, &t0, is_v4_eid0 ? 1 : 0);
-             get_one_tunnel_inline (lgm, b1, &t1, is_v4_eid1 ? 1 : 0);
-           }
-
-         encap_two_inline (lgm, b0, b1, t0, t1, &next0, &next1);
-
-         if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
-           {
-             lisp_gpe_tx_trace_t *tr = vlib_add_trace (vm, node, b0,
-                                                       sizeof (*tr));
-             tr->tunnel_index = t0 - lgm->tunnels;
-           }
-         if (PREDICT_FALSE (b1->flags & VLIB_BUFFER_IS_TRACED))
-           {
-             lisp_gpe_tx_trace_t *tr = vlib_add_trace (vm, node, b1,
-                                                       sizeof (*tr));
-             tr->tunnel_index = t1 - lgm->tunnels;
-           }
-
-         vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next,
-                                          n_left_to_next, bi0, bi1, next0,
-                                          next1);
-       }
-
       while (n_left_from > 0 && n_left_to_next > 0)
        {
+         u32 bi0, adj_index0, next0;
+         const ip_adjacency_t *adj0;
+         const dpo_id_t *dpo0;
          vlib_buffer_t *b0;
-         u32 bi0, next0 = LISP_GPE_TX_NEXT_IP4_LOOKUP;
-         lisp_gpe_tunnel_t *t0 = 0;
          u8 is_v4_0;
 
          bi0 = from[0];
@@ -319,16 +113,23 @@ lisp_gpe_interface_tx (vlib_main_t * vm, vlib_node_runtime_t * node,
 
          b0 = vlib_get_buffer (vm, bi0);
 
+         /* Fixup the checksum and len fields in the LISP tunnel encap
+          * that was applied at the midchain node */
          is_v4_0 = is_v4_packet (vlib_buffer_get_current (b0));
-         get_one_tunnel_inline (lgm, b0, &t0, is_v4_0 ? 1 : 0);
+         ip_udp_fixup_one (lgm->vlib_main, b0, is_v4_0);
 
-         encap_one_inline (lgm, b0, t0, &next0);
+         /* Follow the DPO on which the midchain is stacked */
+         adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
+         adj0 = adj_get (adj_index0);
+         dpo0 = &adj0->sub_type.midchain.next_dpo;
+         next0 = dpo0->dpoi_next_node;
+         vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
 
          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              lisp_gpe_tx_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                        sizeof (*tr));
-             tr->tunnel_index = t0 - lgm->tunnels;
+             tr->tunnel_index = adj_index0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
@@ -348,7 +149,7 @@ format_lisp_gpe_name (u8 * s, va_list * args)
 }
 
 /* *INDENT-OFF* */
-VNET_DEVICE_CLASS (lisp_gpe_device_class,static) = {
+VNET_DEVICE_CLASS (lisp_gpe_device_class) = {
   .name = "LISP_GPE",
   .format_device_name = format_lisp_gpe_name,
   .format_tx_trace = format_lisp_gpe_tx_trace,
@@ -394,133 +195,51 @@ VNET_HW_INTERFACE_CLASS (lisp_gpe_hw_class) = {
 };
 /* *INDENT-ON* */
 
-int
-add_del_ip_prefix_route (ip_prefix_t * dst_prefix, u32 table_id,
-                        ip_adjacency_t * add_adj, u8 is_add, u32 * adj_index)
+static void
+add_del_lisp_gpe_default_route (u32 table_id, fib_protocol_t proto, u8 is_add)
 {
-  uword *p;
+  fib_prefix_t prefix = {
+    .fp_proto = proto,
+  };
+  u32 fib_index;
 
-  if (ip_prefix_version (dst_prefix) == IP4)
+  if (is_add)
     {
-      ip4_main_t *im4 = &ip4_main;
-      ip4_add_del_route_args_t a;
-      ip4_address_t addr = ip_prefix_v4 (dst_prefix);
-
-      memset (&a, 0, sizeof (a));
-      a.flags = IP4_ROUTE_FLAG_TABLE_ID;
-      a.table_index_or_table_id = table_id;
-      a.adj_index = ~0;
-      a.dst_address_length = ip_prefix_len (dst_prefix);
-      a.dst_address = addr;
-      a.flags |= is_add ? IP4_ROUTE_FLAG_ADD : IP4_ROUTE_FLAG_DEL;
-      a.add_adj = add_adj;
-      a.n_add_adj = is_add ? 1 : 0;
-
-      ip4_add_del_route (im4, &a);
-
-      if (is_add)
-       {
-         p = ip4_get_route (im4, table_id, 0, addr.as_u8,
-                            ip_prefix_len (dst_prefix));
-         if (p == 0)
-           {
-             clib_warning ("Failed to insert route for eid %U!",
-                           format_ip4_address_and_length, addr.as_u8,
-                           ip_prefix_len (dst_prefix));
-             return -1;
-           }
-         adj_index[0] = p[0];
-       }
+      /*
+       * Add a default route that results in a control plane punt DPO
+       */
+      dpo_id_t cp_punt = DPO_NULL;
+
+      dpo_set (&cp_punt, DPO_LISP_CP, fib_proto_to_dpo (proto), proto);
+
+      fib_index =
+       fib_table_find_or_create_and_lock (prefix.fp_proto, table_id);
+      fib_table_entry_special_dpo_add (fib_index, &prefix, FIB_SOURCE_LISP,
+                                      FIB_ENTRY_FLAG_EXCLUSIVE, &cp_punt);
+      dpo_unlock (&cp_punt);
     }
   else
     {
-      ip6_main_t *im6 = &ip6_main;
-      ip6_add_del_route_args_t a;
-      ip6_address_t addr = ip_prefix_v6 (dst_prefix);
-
-      memset (&a, 0, sizeof (a));
-      a.flags = IP6_ROUTE_FLAG_TABLE_ID;
-      a.table_index_or_table_id = table_id;
-      a.adj_index = ~0;
-      a.dst_address_length = ip_prefix_len (dst_prefix);
-      a.dst_address = addr;
-      a.flags |= is_add ? IP6_ROUTE_FLAG_ADD : IP6_ROUTE_FLAG_DEL;
-      a.add_adj = add_adj;
-      a.n_add_adj = is_add ? 1 : 0;
-
-      ip6_add_del_route (im6, &a);
-
-      if (is_add)
-       {
-         adj_index[0] = ip6_get_route (im6, table_id, 0, &addr,
-                                       ip_prefix_len (dst_prefix));
-         if (adj_index[0] == 0)
-           {
-             clib_warning ("Failed to insert route for eid %U!",
-                           format_ip6_address_and_length, addr.as_u8,
-                           ip_prefix_len (dst_prefix));
-             return -1;
-           }
-       }
+      fib_index = fib_table_find (prefix.fp_proto, table_id);
+      fib_table_entry_special_remove (fib_index, &prefix, FIB_SOURCE_LISP);
+      fib_table_unlock (fib_index, prefix.fp_proto);
     }
-  return 0;
 }
 
-static void
-add_del_lisp_gpe_default_route (u32 table_id, u8 is_v4, u8 is_add)
+void
+lisp_gpe_iface_set_table (u32 sw_if_index, u32 table_id)
 {
-  lisp_gpe_main_t *lgm = &lisp_gpe_main;
-  ip_adjacency_t adj;
-  ip_prefix_t prefix;
-  u32 adj_index = 0;
-
-  /* setup adjacency */
-  memset (&adj, 0, sizeof (adj));
-
-  adj.n_adj = 1;
-  adj.explicit_fib_index = ~0;
-  adj.lookup_next_index = is_v4 ? lgm->ip4_lookup_next_lgpe_ip4_lookup :
-    lgm->ip6_lookup_next_lgpe_ip6_lookup;
-  /* default route has tunnel_index ~0 */
-  adj.rewrite_header.sw_if_index = ~0;
-
-  /* set prefix to 0/0 */
-  memset (&prefix, 0, sizeof (prefix));
-  ip_prefix_version (&prefix) = is_v4 ? IP4 : IP6;
-
-  /* add/delete route for prefix */
-  add_del_ip_prefix_route (&prefix, table_id, &adj, is_add, &adj_index);
-}
+  fib_node_index_t fib_index;
 
-static void
-lisp_gpe_iface_set_table (u32 sw_if_index, u32 table_id, u8 is_ip4)
-{
-  if (is_ip4)
-    {
-      ip4_main_t *im4 = &ip4_main;
-      ip4_fib_t *fib;
-      fib = find_ip4_fib_by_table_index_or_id (im4, table_id,
-                                              IP4_ROUTE_FLAG_TABLE_ID);
+  fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, table_id);
+  vec_validate (ip4_main.fib_index_by_sw_if_index, sw_if_index);
+  ip4_main.fib_index_by_sw_if_index[sw_if_index] = fib_index;
+  ip4_sw_interface_enable_disable (sw_if_index, 1);
 
-      /* fib's created if it doesn't exist */
-      ASSERT (fib != 0);
-
-      vec_validate (im4->fib_index_by_sw_if_index, sw_if_index);
-      im4->fib_index_by_sw_if_index[sw_if_index] = fib->index;
-    }
-  else
-    {
-      ip6_main_t *im6 = &ip6_main;
-      ip6_fib_t *fib;
-      fib = find_ip6_fib_by_table_index_or_id (im6, table_id,
-                                              IP6_ROUTE_FLAG_TABLE_ID);
-
-      /* fib's created if it doesn't exist */
-      ASSERT (fib != 0);
-
-      vec_validate (im6->fib_index_by_sw_if_index, sw_if_index);
-      im6->fib_index_by_sw_if_index[sw_if_index] = fib->index;
-    }
+  fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP6, table_id);
+  vec_validate (ip6_main.fib_index_by_sw_if_index, sw_if_index);
+  ip6_main.fib_index_by_sw_if_index[sw_if_index] = fib_index;
+  ip6_sw_interface_enable_disable (sw_if_index, 1);
 }
 
 #define foreach_l2_lisp_gpe_tx_next     \
@@ -605,71 +324,71 @@ l2_flow_hash (vlib_buffer_t * b0)
   return (u32) c;
 }
 
-always_inline void
-l2_process_one (lisp_gpe_main_t * lgm, vlib_buffer_t * b0, u32 ti0,
-               u32 * next0)
-{
-  lisp_gpe_tunnel_t *t0;
-
-  t0 = pool_elt_at_index (lgm->tunnels, ti0);
-  ASSERT (0 != t0);
-
-  if (PREDICT_TRUE (LISP_NO_ACTION == t0->action))
-    {
-      /* compute 'flow' hash */
-      if (PREDICT_TRUE (t0->sub_tunnels_lbv_count > 1))
-       vnet_buffer (b0)->ip.flow_hash = l2_flow_hash (b0);
-      encap_one_inline (lgm, b0, t0, next0);
-    }
-  else
-    {
-      l2_process_tunnel_action (b0, t0->action, next0);
-    }
-}
-
-always_inline void
-l2_process_two (lisp_gpe_main_t * lgm, vlib_buffer_t * b0, vlib_buffer_t * b1,
-               u32 ti0, u32 ti1, u32 * next0, u32 * next1)
-{
-  lisp_gpe_tunnel_t *t0, *t1;
-
-  t0 = pool_elt_at_index (lgm->tunnels, ti0);
-  t1 = pool_elt_at_index (lgm->tunnels, ti1);
-
-  ASSERT (0 != t0 && 0 != t1);
-
-  if (PREDICT_TRUE (LISP_NO_ACTION == t0->action
-                   && LISP_NO_ACTION == t1->action))
-    {
-      if (PREDICT_TRUE (t0->sub_tunnels_lbv_count > 1))
-       vnet_buffer (b0)->ip.flow_hash = l2_flow_hash (b0);
-      if (PREDICT_TRUE (t1->sub_tunnels_lbv_count > 1))
-       vnet_buffer (b1)->ip.flow_hash = l2_flow_hash (b1);
-      encap_two_inline (lgm, b0, b1, t0, t1, next0, next1);
-    }
-  else
-    {
-      if (LISP_NO_ACTION == t0->action)
-       {
-         if (PREDICT_TRUE (t0->sub_tunnels_lbv_count > 1))
-           vnet_buffer (b0)->ip.flow_hash = l2_flow_hash (b0);
-         encap_one_inline (lgm, b0, t0, next0);
-         l2_process_tunnel_action (b1, t1->action, next1);
-       }
-      else if (LISP_NO_ACTION == t1->action)
-       {
-         if (PREDICT_TRUE (t1->sub_tunnels_lbv_count > 1))
-           vnet_buffer (b1)->ip.flow_hash = l2_flow_hash (b1);
-         encap_one_inline (lgm, b1, t1, next1);
-         l2_process_tunnel_action (b0, t0->action, next0);
-       }
-      else
-       {
-         l2_process_tunnel_action (b0, t0->action, next0);
-         l2_process_tunnel_action (b1, t1->action, next1);
-       }
-    }
-}
+/* always_inline void */
+/* l2_process_one (lisp_gpe_main_t * lgm, vlib_buffer_t * b0, u32 ti0, */
+/*             u32 * next0) */
+/* { */
+/*   lisp_gpe_tunnel_t *t0; */
+
+/*   t0 = pool_elt_at_index (lgm->tunnels, ti0); */
+/*   ASSERT (0 != t0); */
+
+/*   if (PREDICT_TRUE (LISP_NO_ACTION == t0->action)) */
+/*     { */
+/*       /\* compute 'flow' hash *\/ */
+/*       if (PREDICT_TRUE (t0->sub_tunnels_lbv_count > 1)) */
+/*     vnet_buffer (b0)->ip.flow_hash = l2_flow_hash (b0); */
+/*       encap_one_inline (lgm, b0, t0, next0); */
+/*     } */
+/*   else */
+/*     { */
+/*       l2_process_tunnel_action (b0, t0->action, next0); */
+/*     } */
+/* } */
+
+/* always_inline void */
+/* l2_process_two (lisp_gpe_main_t * lgm, vlib_buffer_t * b0, vlib_buffer_t * b1, */
+/*             u32 ti0, u32 ti1, u32 * next0, u32 * next1) */
+/* { */
+/*   lisp_gpe_tunnel_t *t0, *t1; */
+
+/*   t0 = pool_elt_at_index (lgm->tunnels, ti0); */
+/*   t1 = pool_elt_at_index (lgm->tunnels, ti1); */
+
+/*   ASSERT (0 != t0 && 0 != t1); */
+
+/*   if (PREDICT_TRUE (LISP_NO_ACTION == t0->action */
+/*                 && LISP_NO_ACTION == t1->action)) */
+/*     { */
+/*       if (PREDICT_TRUE (t0->sub_tunnels_lbv_count > 1)) */
+/*     vnet_buffer (b0)->ip.flow_hash = l2_flow_hash (b0); */
+/*       if (PREDICT_TRUE (t1->sub_tunnels_lbv_count > 1)) */
+/*     vnet_buffer (b1)->ip.flow_hash = l2_flow_hash (b1); */
+/*       encap_two_inline (lgm, b0, b1, t0, t1, next0, next1); */
+/*     } */
+/*   else */
+/*     { */
+/*       if (LISP_NO_ACTION == t0->action) */
+/*     { */
+/*       if (PREDICT_TRUE (t0->sub_tunnels_lbv_count > 1)) */
+/*         vnet_buffer (b0)->ip.flow_hash = l2_flow_hash (b0); */
+/*       encap_one_inline (lgm, b0, t0, next0); */
+/*       l2_process_tunnel_action (b1, t1->action, next1); */
+/*     } */
+/*       else if (LISP_NO_ACTION == t1->action) */
+/*     { */
+/*       if (PREDICT_TRUE (t1->sub_tunnels_lbv_count > 1)) */
+/*         vnet_buffer (b1)->ip.flow_hash = l2_flow_hash (b1); */
+/*       encap_one_inline (lgm, b1, t1, next1); */
+/*       l2_process_tunnel_action (b0, t0->action, next0); */
+/*     } */
+/*       else */
+/*     { */
+/*       l2_process_tunnel_action (b0, t0->action, next0); */
+/*       l2_process_tunnel_action (b1, t1->action, next1); */
+/*     } */
+/*     } */
+/* } */
 
 /**
  * @brief LISP-GPE interface TX (encap) function for L2 overlays.
@@ -710,9 +429,9 @@ l2_lisp_gpe_interface_tx (vlib_main_t * vm, vlib_node_runtime_t * node,
        {
          u32 bi0, bi1;
          vlib_buffer_t *b0, *b1;
-         u32 next0, next1, ti0, ti1;
+         u32 next0, next1;
          lisp_gpe_tunnel_t *t0 = 0, *t1 = 0;
-         ethernet_header_t *e0, *e1;
+         //      ethernet_header_t *e0, *e1;
 
          next0 = next1 = L2_LISP_GPE_TX_NEXT_LISP_CP_LOOKUP;
 
@@ -742,49 +461,49 @@ l2_lisp_gpe_interface_tx (vlib_main_t * vm, vlib_node_runtime_t * node,
          b0 = vlib_get_buffer (vm, bi0);
          b1 = vlib_get_buffer (vm, bi1);
 
-         e0 = vlib_buffer_get_current (b0);
-         e1 = vlib_buffer_get_current (b1);
+         /* e0 = vlib_buffer_get_current (b0); */
+         /* e1 = vlib_buffer_get_current (b1); */
 
          /* lookup dst + src mac */
-         ti0 = lisp_l2_fib_lookup (lgm, vnet_buffer (b0)->l2.bd_index,
-                                   e0->src_address, e0->dst_address);
-         ti1 = lisp_l2_fib_lookup (lgm, vnet_buffer (b1)->l2.bd_index,
-                                   e1->src_address, e1->dst_address);
-
-         if (PREDICT_TRUE ((u32) ~ 0 != ti0) && (u32) ~ 0 != ti1)
-           {
-             /* process both tunnels */
-             l2_process_two (lgm, b0, b1, ti0, ti1, &next0, &next1);
-           }
-         else
-           {
-             if ((u32) ~ 0 != ti0)
-               {
-                 /* process tunnel for b0 */
-                 l2_process_one (lgm, b0, ti0, &next0);
-
-                 /* no tunnel found for b1, send to control plane */
-                 next1 = L2_LISP_GPE_TX_NEXT_LISP_CP_LOOKUP;
-                 vnet_buffer (b1)->lisp.overlay_afi = LISP_AFI_MAC;
-               }
-             else if ((u32) ~ 0 != ti1)
-               {
-                 /* process tunnel for b1 */
-                 l2_process_one (lgm, b1, ti1, &next1);
-
-                 /* no tunnel found b0, send to control plane */
-                 next0 = L2_LISP_GPE_TX_NEXT_LISP_CP_LOOKUP;
-                 vnet_buffer (b0)->lisp.overlay_afi = LISP_AFI_MAC;
-               }
-             else
-               {
-                 /* no tunnels found */
-                 next0 = L2_LISP_GPE_TX_NEXT_LISP_CP_LOOKUP;
-                 vnet_buffer (b0)->lisp.overlay_afi = LISP_AFI_MAC;
-                 next1 = L2_LISP_GPE_TX_NEXT_LISP_CP_LOOKUP;
-                 vnet_buffer (b1)->lisp.overlay_afi = LISP_AFI_MAC;
-               }
-           }
+         /* ti0 = lisp_l2_fib_lookup (lgm, vnet_buffer (b0)->l2.bd_index, */
+         /*                        e0->src_address, e0->dst_address); */
+         /* ti1 = lisp_l2_fib_lookup (lgm, vnet_buffer (b1)->l2.bd_index, */
+         /*                        e1->src_address, e1->dst_address); */
+
+         /* if (PREDICT_TRUE ((u32) ~ 0 != ti0) && (u32) ~ 0 != ti1) */
+         /*   { */
+         /*     /\* process both tunnels *\/ */
+         /*     l2_process_two (lgm, b0, b1, ti0, ti1, &next0, &next1); */
+         /*   } */
+         /* else */
+         /*   { */
+         /*     if ((u32) ~ 0 != ti0) */
+         /*       { */
+         /*         /\* process tunnel for b0 *\/ */
+         /*         l2_process_one (lgm, b0, ti0, &next0); */
+
+         /*         /\* no tunnel found for b1, send to control plane *\/ */
+         /*         next1 = L2_LISP_GPE_TX_NEXT_LISP_CP_LOOKUP; */
+         /*         vnet_buffer (b1)->lisp.overlay_afi = LISP_AFI_MAC; */
+         /*       } */
+         /*     else if ((u32) ~ 0 != ti1) */
+         /*       { */
+         /*         /\* process tunnel for b1 *\/ */
+         /*         l2_process_one (lgm, b1, ti1, &next1); */
+
+         /*         /\* no tunnel found b0, send to control plane *\/ */
+         /*         next0 = L2_LISP_GPE_TX_NEXT_LISP_CP_LOOKUP; */
+         /*         vnet_buffer (b0)->lisp.overlay_afi = LISP_AFI_MAC; */
+         /*       } */
+         /*     else */
+         /*       { */
+         /*         /\* no tunnels found *\/ */
+         /*         next0 = L2_LISP_GPE_TX_NEXT_LISP_CP_LOOKUP; */
+         /*         vnet_buffer (b0)->lisp.overlay_afi = LISP_AFI_MAC; */
+         /*         next1 = L2_LISP_GPE_TX_NEXT_LISP_CP_LOOKUP; */
+         /*         vnet_buffer (b1)->lisp.overlay_afi = LISP_AFI_MAC; */
+         /*       } */
+         /*   } */
 
          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
@@ -824,16 +543,16 @@ l2_lisp_gpe_interface_tx (vlib_main_t * vm, vlib_node_runtime_t * node,
          ti0 = lisp_l2_fib_lookup (lgm, vnet_buffer (b0)->l2.bd_index,
                                    e0->src_address, e0->dst_address);
 
-         if (PREDICT_TRUE ((u32) ~ 0 != ti0))
-           {
-             l2_process_one (lgm, b0, ti0, &next0);
-           }
-         else
-           {
-             /* no tunnel found send to control plane */
-             next0 = L2_LISP_GPE_TX_NEXT_LISP_CP_LOOKUP;
-             vnet_buffer (b0)->lisp.overlay_afi = LISP_AFI_MAC;
-           }
+         /* if (PREDICT_TRUE ((u32) ~ 0 != ti0)) */
+         /*   { */
+         /*     l2_process_one (lgm, b0, ti0, &next0); */
+         /*   } */
+         /* else */
+         /*   { */
+         /*     /\* no tunnel found send to control plane *\/ */
+         /*     next0 = L2_LISP_GPE_TX_NEXT_LISP_CP_LOOKUP; */
+         /*     vnet_buffer (b0)->lisp.overlay_afi = LISP_AFI_MAC; */
+         /*   } */
 
          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
@@ -973,7 +692,6 @@ lisp_gpe_add_del_l3_iface (lisp_gpe_main_t * lgm,
   vnet_main_t *vnm = lgm->vnet_main;
   tunnel_lookup_t *l3_ifaces = &lgm->l3_ifaces;
   vnet_hw_interface_t *hi;
-  u32 lookup_next_index4, lookup_next_index6;
   uword *hip, *si;
 
   hip = hash_get (l3_ifaces->hw_if_index_by_dp_table, a->table_id);
@@ -997,30 +715,10 @@ lisp_gpe_add_del_l3_iface (lisp_gpe_main_t * lgm,
       hi = create_lisp_gpe_iface (lgm, a->vni, a->table_id,
                                  &lisp_gpe_device_class, l3_ifaces);
 
-      /* set ingress arc from lgpe_ipX_lookup */
-      lookup_next_index4 = vlib_node_add_next (lgm->vlib_main,
-                                              lgpe_ip4_lookup_node.index,
-                                              hi->output_node_index);
-      lookup_next_index6 = vlib_node_add_next (lgm->vlib_main,
-                                              lgpe_ip6_lookup_node.index,
-                                              hi->output_node_index);
-      hash_set (lgm->lgpe_ip4_lookup_next_index_by_table_id, a->table_id,
-               lookup_next_index4);
-      hash_set (lgm->lgpe_ip6_lookup_next_index_by_table_id, a->table_id,
-               lookup_next_index6);
-
-      /* insert default routes that point to lgpe-ipx-lookup */
-      add_del_lisp_gpe_default_route (a->table_id, /* is_v4 */ 1, 1);
-      add_del_lisp_gpe_default_route (a->table_id, /* is_v4 */ 0, 1);
-
-      /* set egress arcs */
-#define _(sym,str) vlib_node_add_named_next_with_slot (vnm->vlib_main, \
-                    hi->tx_node_index, str, LISP_GPE_TX_NEXT_##sym);
-      foreach_lisp_gpe_tx_next
-#undef _
-       /* set interface in appropriate v4 and v6 FIBs */
-       lisp_gpe_iface_set_table (hi->sw_if_index, a->table_id, 1);
-      lisp_gpe_iface_set_table (hi->sw_if_index, a->table_id, 0);
+      /* insert default routes that point to lisp-cp lookup */
+      lisp_gpe_iface_set_table (hi->sw_if_index, a->table_id);
+      add_del_lisp_gpe_default_route (a->table_id, FIB_PROTOCOL_IP4, 1);
+      add_del_lisp_gpe_default_route (a->table_id, FIB_PROTOCOL_IP6, 1);
 
       /* enable interface */
       vnet_sw_interface_set_flags (vnm, hi->sw_if_index,
@@ -1037,11 +735,15 @@ lisp_gpe_add_del_l3_iface (lisp_gpe_main_t * lgm,
          return -1;
        }
 
+      hi = vnet_get_hw_interface (vnm, hip[0]);
+
       remove_lisp_gpe_iface (lgm, hip[0], a->table_id, &lgm->l3_ifaces);
 
       /* unset default routes */
-      add_del_lisp_gpe_default_route (a->table_id, /* is_v4 */ 1, 0);
-      add_del_lisp_gpe_default_route (a->table_id, /* is_v4 */ 0, 0);
+      ip4_sw_interface_enable_disable (hi->sw_if_index, 0);
+      ip6_sw_interface_enable_disable (hi->sw_if_index, 0);
+      add_del_lisp_gpe_default_route (a->table_id, FIB_PROTOCOL_IP4, 0);
+      add_del_lisp_gpe_default_route (a->table_id, FIB_PROTOCOL_IP6, 0);
     }
 
   return 0;
index bd9951a..8a24ec0 100644 (file)
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-/**
- *  @file
- *  @brief LISP-GPE overlay IP forwarding logic and lookup data structures.
- *
- *  Provides an implementation of a Source/Dest (SD) IP FIB that leverages the
- *  existing destination only FIB. Lookups are done in two stages, first the
- *  destination FIB looks up a packet's destination address and then if a
- *  an SD entry is hit, the destination adjacency will point to the second
- *  stage, the source FIB, where the packet's source is looked up. Note that a
- *  miss in the source FIB does not result in an overall SD lookup retry with
- *  a less specific entry from the destination FIB.
- */
-#include <vnet/lisp-gpe/lisp_gpe.h>
-
-/** Sets adj index for destination address in IP4 FIB. Similar to the function
- * in ip4_forward but this one avoids calling route callbacks */
-static void
-ip4_sd_fib_set_adj_index (lisp_gpe_main_t * lgm, ip4_fib_t * fib, u32 flags,
-                         u32 dst_address_u32, u32 dst_address_length,
-                         u32 adj_index)
-{
-  ip_lookup_main_t *lm = lgm->lm4;
-  uword *hash;
-
-  if (vec_bytes (fib->old_hash_values))
-    memset (fib->old_hash_values, ~0, vec_bytes (fib->old_hash_values));
-  if (vec_bytes (fib->new_hash_values))
-    memset (fib->new_hash_values, ~0, vec_bytes (fib->new_hash_values));
-  fib->new_hash_values[0] = adj_index;
-
-  /* Make sure adj index is valid. */
-  if (CLIB_DEBUG > 0)
-    (void) ip_get_adjacency (lm, adj_index);
-
-  hash = fib->adj_index_by_dst_address[dst_address_length];
-
-  hash = _hash_set3 (hash, dst_address_u32,
-                    fib->new_hash_values, fib->old_hash_values);
-
-  fib->adj_index_by_dst_address[dst_address_length] = hash;
-}
-
-/** Initialize the adjacency index by destination address vector for IP4 FIB.
- * Copied from ip4_forward since it's static */
-static void
-ip4_fib_init_adj_index_by_dst_address (ip_lookup_main_t * lm,
-                                      ip4_fib_t * fib, u32 address_length)
-{
-  hash_t *h;
-  uword max_index;
-
-  ASSERT (lm->fib_result_n_bytes >= sizeof (uword));
-  lm->fib_result_n_words = round_pow2 (lm->fib_result_n_bytes, sizeof (uword))
-    / sizeof (uword);
-
-  fib->adj_index_by_dst_address[address_length] =
-    hash_create (32 /* elts */ , lm->fib_result_n_words * sizeof (uword));
-
-  hash_set_flags (fib->adj_index_by_dst_address[address_length],
-                 HASH_FLAG_NO_AUTO_SHRINK);
-
-  h = hash_header (fib->adj_index_by_dst_address[address_length]);
-  max_index = (hash_value_bytes (h) / sizeof (fib->new_hash_values[0])) - 1;
-
-  /* Initialize new/old hash value vectors. */
-  vec_validate_init_empty (fib->new_hash_values, max_index, ~0);
-  vec_validate_init_empty (fib->old_hash_values, max_index, ~0);
-}
-
-/** Add/del src route to IP4 SD FIB. */
-static void
-ip4_sd_fib_add_del_src_route (lisp_gpe_main_t * lgm,
-                             ip4_add_del_route_args_t * a)
-{
-  ip_lookup_main_t *lm = lgm->lm4;
-  ip4_fib_t *fib;
-  u32 dst_address, dst_address_length, adj_index, old_adj_index;
-  uword *hash, is_del;
-
-  /* Either create new adjacency or use given one depending on arguments. */
-  if (a->n_add_adj > 0)
-    ip_add_adjacency (lm, a->add_adj, a->n_add_adj, &adj_index);
-  else
-    adj_index = a->adj_index;
-
-  dst_address = a->dst_address.data_u32;
-  dst_address_length = a->dst_address_length;
-
-  fib = pool_elt_at_index (lgm->ip4_src_fibs, a->table_index_or_table_id);
-
-  if (!fib->adj_index_by_dst_address[dst_address_length])
-    ip4_fib_init_adj_index_by_dst_address (lm, fib, dst_address_length);
-
-  hash = fib->adj_index_by_dst_address[dst_address_length];
-
-  is_del = (a->flags & IP4_ROUTE_FLAG_DEL) != 0;
-
-  if (is_del)
-    {
-      fib->old_hash_values[0] = ~0;
-      hash = _hash_unset (hash, dst_address, fib->old_hash_values);
-      fib->adj_index_by_dst_address[dst_address_length] = hash;
-    }
-  else
-    ip4_sd_fib_set_adj_index (lgm, fib, a->flags, dst_address,
-                             dst_address_length, adj_index);
-
-  old_adj_index = fib->old_hash_values[0];
-
-  /* Avoid spurious reference count increments */
-  if (old_adj_index == adj_index
-      && adj_index != ~0 && !(a->flags & IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY))
-    {
-      ip_adjacency_t *adj = ip_get_adjacency (lm, adj_index);
-      if (adj->share_count > 0)
-       adj->share_count--;
-    }
-
-  ip4_fib_mtrie_add_del_route (fib, a->dst_address, dst_address_length,
-                              is_del ? old_adj_index : adj_index, is_del);
-
-  /* Delete old adjacency index if present and changed. */
-  if (!(a->flags & IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY)
-      && old_adj_index != ~0 && old_adj_index != adj_index)
-    ip_del_adjacency (lm, old_adj_index);
-}
-
-/** Get src route from IP4 SD FIB. */
-static void *
-ip4_sd_get_src_route (lisp_gpe_main_t * lgm, u32 src_fib_index,
-                     ip4_address_t * src, u32 address_length)
-{
-  ip4_fib_t *fib = pool_elt_at_index (lgm->ip4_src_fibs, src_fib_index);
-  uword *hash, *p;
 
-  hash = fib->adj_index_by_dst_address[address_length];
-  p = hash_get (hash, src->as_u32);
-  return (void *) p;
-}
-
-/* *INDENT-OFF* */
-typedef CLIB_PACKED (struct ip4_route {
-  ip4_address_t address;
-  u32 address_length : 6;
-  u32 index : 26;
-}) ip4_route_t;
-/* *INDENT-ON* */
-
-/** Remove all routes from src IP4 FIB */
-void
-ip4_sd_fib_clear_src_fib (lisp_gpe_main_t * lgm, ip4_fib_t * fib)
-{
-  ip4_route_t *routes = 0, *r;
-  u32 i;
-
-  vec_reset_length (routes);
-
-  for (i = 0; i < ARRAY_LEN (fib->adj_index_by_dst_address); i++)
-    {
-      uword *hash = fib->adj_index_by_dst_address[i];
-      hash_pair_t *p;
-      ip4_route_t x;
-
-      x.address_length = i;
-      x.index = 0;             /* shut up coverity */
-
-      /* *INDENT-OFF* */
-      hash_foreach_pair (p, hash,
-      ({
-          x.address.data_u32 = p->key;
-          vec_add1 (routes, x);
-      }));
-      /* *INDENT-ON* */
-    }
-
-  vec_foreach (r, routes)
-  {
-    ip4_add_del_route_args_t a;
-
-    memset (&a, 0, sizeof (a));
-    a.flags = IP4_ROUTE_FLAG_FIB_INDEX | IP4_ROUTE_FLAG_DEL;
-    a.table_index_or_table_id = fib - lgm->ip4_src_fibs;
-    a.dst_address = r->address;
-    a.dst_address_length = r->address_length;
-    a.adj_index = ~0;
-
-    ip4_sd_fib_add_del_src_route (lgm, &a);
-  }
-}
-
-/** Test if IP4 FIB is empty */
-static u8
-ip4_fib_is_empty (ip4_fib_t * fib)
-{
-  u8 fib_is_empty;
-  int i;
-
-  fib_is_empty = 1;
-  for (i = ARRAY_LEN (fib->adj_index_by_dst_address) - 1; i >= 0; i--)
-    {
-      uword *hash = fib->adj_index_by_dst_address[i];
-      uword n_elts = hash_elts (hash);
-      if (n_elts)
-       {
-         fib_is_empty = 0;
-         break;
-       }
-    }
-  return fib_is_empty;
-}
+#include <vnet/lisp-gpe/lisp_gpe_adjacency.h>
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/fib_entry.h>
+#include <vnet/fib/ip6_fib.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/dpo/lookup_dpo.h>
+#include <vnet/dpo/load_balance.h>
 
 /**
- * @brief Add/del route to IP4 SD FIB.
- *
- * Adds/remove routes to both destination and source FIBs. Entries added
- * to destination FIB are associated to adjacencies that point to the source
- * FIB and store the index of the particular source FIB associated to the
- * destination. Source FIBs are locally managed (see @ref lgm->ip4_src_fibs
- * and @ref lgm->ip6_src_fibs), but the adjacencies are allocated out of the
- * global adjacency pool.
+ * @brief Add route to IP4 or IP6 Destination FIB.
  *
- * @param[in]   lgm             Reference to @ref lisp_gpe_main_t.
- * @param[out]  dst_prefix      Destination IP4 prefix.
- * @param[in]   src_prefix      Source IP4 prefix.
- * @param[in]   table_id        Table id.
- * @param[in]   add_adj         Pointer to the adjacency to be added.
- * @param[in]   is_add          Add/del flag.
+ * Add a route to the destination FIB that results in the lookup
+ * in the SRC FIB. The SRC FIB is created if it does not yet exist.
  *
- * @return 0 on success.
+ * @param[in]   dst_table_id    Destination FIB Table-ID
+ * @param[in]   dst_prefix      Destination IP prefix.
+ * @param[out]  src_fib_index   The index/ID of the SRC FIB created.
  */
-static int
-ip4_sd_fib_add_del_route (lisp_gpe_main_t * lgm, ip_prefix_t * dst_prefix,
-                         ip_prefix_t * src_prefix, u32 table_id,
-                         ip_adjacency_t * add_adj, u8 is_add)
-{
-  uword *p;
-  ip4_add_del_route_args_t a;
-  ip_adjacency_t *dst_adjp, dst_adj;
-  ip4_address_t dst = ip_prefix_v4 (dst_prefix), src;
-  u32 dst_address_length = ip_prefix_len (dst_prefix), src_address_length = 0;
-  ip4_fib_t *src_fib;
-
-  if (src_prefix)
-    {
-      src = ip_prefix_v4 (src_prefix);
-      src_address_length = ip_prefix_len (src_prefix);
-    }
-  else
-    memset (&src, 0, sizeof (src));
-
-  /* lookup dst adj */
-  p = ip4_get_route (lgm->im4, table_id, 0, dst.as_u8, dst_address_length);
-
-  if (is_add)
-    {
-      /* insert dst prefix to ip4 fib, if it's not in yet */
-      if (p == 0)
-       {
-         /* allocate and init src ip4 fib */
-         pool_get (lgm->ip4_src_fibs, src_fib);
-         ip4_mtrie_init (&src_fib->mtrie);
-
-         /* configure adjacency */
-         memset (&dst_adj, 0, sizeof (dst_adj));
-
-         /* reuse rewrite header to store pointer to src fib */
-         dst_adj.rewrite_header.sw_if_index = src_fib - lgm->ip4_src_fibs;
-
-         /* dst adj should point to lisp gpe lookup */
-         dst_adj.lookup_next_index = lgm->ip4_lookup_next_lgpe_ip4_lookup;
-
-         /* explicit_fib_index is used in IP6 FIB lookup, don't reuse it */
-         dst_adj.explicit_fib_index = ~0;
-         dst_adj.n_adj = 1;
-
-         /* make sure we have different signatures for adj in different tables
-          * but with the same lookup_next_index and for adj in the same table
-          * but associated to different destinations */
-         dst_adj.if_address_index = table_id;
-         dst_adj.indirect.next_hop.ip4 = dst;
-
-         memset (&a, 0, sizeof (a));
-         a.flags = IP4_ROUTE_FLAG_TABLE_ID;
-         a.table_index_or_table_id = table_id; /* vrf */
-         a.adj_index = ~0;
-         a.dst_address_length = dst_address_length;
-         a.dst_address = dst;
-         a.flags |= IP4_ROUTE_FLAG_ADD;
-         a.add_adj = &dst_adj;
-         a.n_add_adj = 1;
-
-         ip4_add_del_route (lgm->im4, &a);
-
-         /* lookup dst adj to obtain the adj index */
-         p = ip4_get_route (lgm->im4, table_id, 0, dst.as_u8,
-                            dst_address_length);
-
-         /* make sure insertion succeeded */
-         if (CLIB_DEBUG)
-           {
-             ASSERT (p != 0);
-             dst_adjp = ip_get_adjacency (lgm->lm4, p[0]);
-             ASSERT (dst_adjp->rewrite_header.sw_if_index
-                     == dst_adj.rewrite_header.sw_if_index);
-           }
-       }
-    }
-  else
-    {
-      if (p == 0)
-       {
-         clib_warning
-           ("Trying to delete inexistent dst route for %U. Aborting",
-            format_ip4_address_and_length, dst.as_u8, dst_address_length);
-         return -1;
-       }
-    }
-
-  dst_adjp = ip_get_adjacency (lgm->lm4, p[0]);
-
-  /* add/del src prefix to src fib */
-  memset (&a, 0, sizeof (a));
-  a.flags = IP4_ROUTE_FLAG_TABLE_ID;
-  a.table_index_or_table_id = dst_adjp->rewrite_header.sw_if_index;
-  a.adj_index = ~0;
-  a.flags |= is_add ? IP4_ROUTE_FLAG_ADD : IP4_ROUTE_FLAG_DEL;
-  a.add_adj = add_adj;
-  a.n_add_adj = is_add ? 1 : 0;
-  /* if src prefix is null, add 0/0 */
-  a.dst_address_length = src_address_length;
-  a.dst_address = src;
-  ip4_sd_fib_add_del_src_route (lgm, &a);
-
-  /* make sure insertion succeeded */
-  if (CLIB_DEBUG && is_add)
-    {
-      uword *sai;
-      ip_adjacency_t *src_adjp;
-      sai = ip4_sd_get_src_route (lgm, dst_adjp->rewrite_header.sw_if_index,
-                                 &src, src_address_length);
-      ASSERT (sai != 0);
-      src_adjp = ip_get_adjacency (lgm->lm4, sai[0]);
-      ASSERT (src_adjp->if_address_index == add_adj->if_address_index);
-    }
-
-  /* if a delete, check if there are elements left in the src fib */
-  if (!is_add)
-    {
-      src_fib = pool_elt_at_index (lgm->ip4_src_fibs,
-                                  dst_adjp->rewrite_header.sw_if_index);
-      if (!src_fib)
-       return 0;
-
-      /* if there's nothing left */
-      if (ip4_fib_is_empty (src_fib))
-       {
-         /* remove the src fib ..  */
-         pool_put (lgm->ip4_src_fibs, src_fib);
-
-         /* .. and remove dst route */
-         memset (&a, 0, sizeof (a));
-         a.flags = IP4_ROUTE_FLAG_TABLE_ID;
-         a.table_index_or_table_id = table_id; /* vrf */
-         a.adj_index = ~0;
-         a.dst_address_length = dst_address_length;
-         a.dst_address = dst;
-         a.flags |= IP4_ROUTE_FLAG_DEL;
-
-         ip4_add_del_route (lgm->im4, &a);
-       }
-    }
-
-  return 0;
-}
-
-/**
- * @brief Retrieve IP4 SD FIB entry.
- *
- * Looks up SD IP4 route by first looking up the destination in VPP's main FIB
- * and subsequently the source in the src FIB. The index of the source FIB is
- * stored in the dst adjacency's rewrite_header.sw_if_index. If source is 0
- * do search with 0/0 src.
- *
- * @param[in]   lgm             Reference to @ref lisp_gpe_main_t.
- * @param[out]  dst_prefix      Destination IP4 prefix.
- * @param[in]   src_prefix      Source IP4 prefix.
- * @param[in]   table_id        Table id.
- *
- * @return pointer to the adjacency if route found.
- */
-static void *
-ip4_sd_fib_get_route (lisp_gpe_main_t * lgm, ip_prefix_t * dst_prefix,
-                     ip_prefix_t * src_prefix, u32 table_id)
-{
-  uword *p;
-  ip4_address_t dst = ip_prefix_v4 (dst_prefix), src;
-  u32 dst_address_length = ip_prefix_len (dst_prefix), src_address_length = 0;
-  ip_adjacency_t *dst_adj;
-
-  if (src_prefix)
-    {
-      src = ip_prefix_v4 (src_prefix);
-      src_address_length = ip_prefix_len (src_prefix);
-    }
-  else
-    memset (&src, 0, sizeof (src));
-
-  /* lookup dst adj */
-  p = ip4_get_route (lgm->im4, table_id, 0, dst.as_u8, dst_address_length);
-  if (p == 0)
-    return p;
-
-  dst_adj = ip_get_adjacency (lgm->lm4, p[0]);
-  return ip4_sd_get_src_route (lgm, dst_adj->rewrite_header.sw_if_index, &src,
-                              src_address_length);
-}
-
-/** Get src route from IP6 SD FIB. */
-static u32
-ip6_sd_get_src_route (lisp_gpe_main_t * lgm, u32 src_fib_index,
-                     ip6_address_t * src, u32 address_length)
-{
-  int rv;
-  BVT (clib_bihash_kv) kv, value;
-  ip6_src_fib_t *fib = pool_elt_at_index (lgm->ip6_src_fibs, src_fib_index);
-
-  ip6_address_t *mask;
-
-  ASSERT (address_length <= 128);
-
-  mask = &fib->fib_masks[address_length];
-
-  kv.key[0] = src->as_u64[0] & mask->as_u64[0];
-  kv.key[1] = src->as_u64[1] & mask->as_u64[1];
-  kv.key[2] = address_length;
-
-  rv = BV (clib_bihash_search_inline_2) (&fib->ip6_lookup_table, &kv, &value);
-  if (rv == 0)
-    return value.value;
-
-  return 0;
-}
-
-static void
-compute_prefix_lengths_in_search_order (ip6_src_fib_t * fib)
-{
-  int i;
-  vec_reset_length (fib->prefix_lengths_in_search_order);
-  /* Note: bitmap reversed so this is in fact a longest prefix match */
-
-  /* *INDENT-OFF* */
-  clib_bitmap_foreach(i, fib->non_empty_dst_address_length_bitmap, ({
-    int dst_address_length = 128 - i;
-    vec_add1 (fib->prefix_lengths_in_search_order, dst_address_length);
-  }));
-  /* *INDENT-ON* */
-}
-
-/** Add/del src route to IP6 SD FIB. Rewrite of ip6_add_del_route() because
- * it uses im6 to find the FIB .*/
-static void
-ip6_sd_fib_add_del_src_route (lisp_gpe_main_t * lgm,
-                             ip6_add_del_route_args_t * a)
+u32
+ip_dst_fib_add_route (u32 dst_fib_index, const ip_prefix_t * dst_prefix)
 {
-  ip_lookup_main_t *lm = lgm->lm6;
-  ip6_src_fib_t *fib;
-  ip6_address_t dst_address;
-  u32 dst_address_length, adj_index;
-  uword is_del;
-  u32 old_adj_index = ~0;
-  BVT (clib_bihash_kv) kv, value;
-
-  vlib_smp_unsafe_warning ();
-
-  is_del = (a->flags & IP6_ROUTE_FLAG_DEL) != 0;
+  fib_node_index_t src_fib_index;
+  fib_prefix_t dst_fib_prefix;
+  fib_node_index_t dst_fei;
 
-  /* Either create new adjacency or use given one depending on arguments. */
-  if (a->n_add_adj > 0)
-    {
-      ip_add_adjacency (lm, a->add_adj, a->n_add_adj, &adj_index);
-    }
-  else
-    adj_index = a->adj_index;
+  ASSERT (NULL != dst_prefix);
 
-  dst_address = a->dst_address;
-  dst_address_length = a->dst_address_length;
-  fib = pool_elt_at_index (lgm->ip6_src_fibs, a->table_index_or_table_id);
+  ip_prefix_to_fib_prefix (dst_prefix, &dst_fib_prefix);
 
-  ASSERT (dst_address_length < ARRAY_LEN (fib->fib_masks));
-  ip6_address_mask (&dst_address, &fib->fib_masks[dst_address_length]);
+  /*
+   * lookup the destination prefix in the VRF table and retrieve the
+   * LISP associated data
+   */
+  dst_fei = fib_table_lookup_exact_match (dst_fib_index, &dst_fib_prefix);
 
-  /* refcount accounting */
-  if (is_del)
+  /*
+   * If the FIB entry is not present, or not LISP sourced, add it
+   */
+  if (dst_fei == FIB_NODE_INDEX_INVALID ||
+      NULL == fib_entry_get_source_data (dst_fei, FIB_SOURCE_LISP))
     {
-      ASSERT (fib->dst_address_length_refcounts[dst_address_length] > 0);
-      if (--fib->dst_address_length_refcounts[dst_address_length] == 0)
-       {
-         fib->non_empty_dst_address_length_bitmap =
-           clib_bitmap_set (fib->non_empty_dst_address_length_bitmap,
-                            128 - dst_address_length, 0);
-         compute_prefix_lengths_in_search_order (fib);
-       }
+      dpo_id_t src_lkup_dpo = DPO_NULL;
+
+      /* create a new src FIB.  */
+      src_fib_index =
+       fib_table_create_and_lock (dst_fib_prefix.fp_proto,
+                                  "LISP-src for [%d,%U]",
+                                  dst_fib_index,
+                                  format_fib_prefix, &dst_fib_prefix);
+
+      /*
+       * create a data-path object to perform the source address lookup
+       * in the SRC FIB
+       */
+      lookup_dpo_add_or_lock_w_fib_index (src_fib_index,
+                                         (ip_prefix_version (dst_prefix) ==
+                                          IP6 ? DPO_PROTO_IP6 :
+                                          DPO_PROTO_IP4),
+                                         LOOKUP_INPUT_SRC_ADDR,
+                                         LOOKUP_TABLE_FROM_CONFIG,
+                                         &src_lkup_dpo);
+
+      /*
+       * add the entry to the destination FIB that uses the lookup DPO
+       */
+      dst_fei = fib_table_entry_special_dpo_add (dst_fib_index,
+                                                &dst_fib_prefix,
+                                                FIB_SOURCE_LISP,
+                                                FIB_ENTRY_FLAG_EXCLUSIVE,
+                                                &src_lkup_dpo);
+
+      /*
+       * the DPO is locked by the FIB entry, and we have no further
+       * need for it.
+       */
+      dpo_unlock (&src_lkup_dpo);
+
+      /*
+       * save the SRC FIB index on the entry so we can retrieve it for
+       * subsequent routes.
+       */
+      fib_entry_set_source_data (dst_fei, FIB_SOURCE_LISP, &src_fib_index);
     }
   else
     {
-      fib->dst_address_length_refcounts[dst_address_length]++;
-
-      fib->non_empty_dst_address_length_bitmap =
-       clib_bitmap_set (fib->non_empty_dst_address_length_bitmap,
-                        128 - dst_address_length, 1);
-      compute_prefix_lengths_in_search_order (fib);
-    }
-
-  kv.key[0] = dst_address.as_u64[0];
-  kv.key[1] = dst_address.as_u64[1];
-  kv.key[2] = dst_address_length;
-
-  if (BV (clib_bihash_search) (&fib->ip6_lookup_table, &kv, &value) == 0)
-    old_adj_index = value.value;
-
-  if (is_del)
-    BV (clib_bihash_add_del) (&fib->ip6_lookup_table, &kv, 0 /* is_add */ );
-  else
-    {
-      /* Make sure adj index is valid. */
-      if (CLIB_DEBUG > 0)
-       (void) ip_get_adjacency (lm, adj_index);
-
-      kv.value = adj_index;
-
-      BV (clib_bihash_add_del) (&fib->ip6_lookup_table, &kv, 1 /* is_add */ );
+      /*
+       * destination FIB entry already present
+       */
+      src_fib_index = *(u32 *) fib_entry_get_source_data (dst_fei,
+                                                         FIB_SOURCE_LISP);
     }
 
-  /* Avoid spurious reference count increments */
-  if (old_adj_index == adj_index
-      && !(a->flags & IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY))
-    {
-      ip_adjacency_t *adj = ip_get_adjacency (lm, adj_index);
-      if (adj->share_count > 0)
-       adj->share_count--;
-    }
-
-  /* Delete old adjacency index if present and changed. */
-  {
-    if (!(a->flags & IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY)
-       && old_adj_index != ~0 && old_adj_index != adj_index)
-      ip_del_adjacency (lm, old_adj_index);
-  }
-}
-
-static void
-ip6_src_fib_init (ip6_src_fib_t * fib)
-{
-  uword i;
-
-  for (i = 0; i < ARRAY_LEN (fib->fib_masks); i++)
-    {
-      u32 j, i0, i1;
-
-      i0 = i / 32;
-      i1 = i % 32;
-
-      for (j = 0; j < i0; j++)
-       fib->fib_masks[i].as_u32[j] = ~0;
-
-      if (i1)
-       fib->fib_masks[i].as_u32[i0] =
-         clib_host_to_net_u32 (pow2_mask (i1) << (32 - i1));
-    }
-
-  if (fib->lookup_table_nbuckets == 0)
-    fib->lookup_table_nbuckets = IP6_FIB_DEFAULT_HASH_NUM_BUCKETS;
-
-  fib->lookup_table_nbuckets = 1 << max_log2 (fib->lookup_table_nbuckets);
-
-  if (fib->lookup_table_size == 0)
-    fib->lookup_table_size = IP6_FIB_DEFAULT_HASH_MEMORY_SIZE;
-
-  BV (clib_bihash_init) (&fib->ip6_lookup_table, "ip6 lookup table",
-                        fib->lookup_table_nbuckets, fib->lookup_table_size);
-
+  return (src_fib_index);
 }
 
 /**
- * @brief Add/del route to IP6 SD FIB.
- *
- * Adds/remove routes to both destination and source FIBs. Entries added
- * to destination FIB are associated to adjacencies that point to the source
- * FIB and store the index of the particular source FIB associated to the
- * destination. Source FIBs are locally managed (see @ref lgm->ip4_src_fibs
- * and @ref lgm->ip6_src_fibs), but the adjacencies are allocated out of the
- * global adjacency pool.
+ * @brief Delete a route from an IP4 or IP6 SD FIB.
  *
- * @param[in]   lgm             Reference to @ref lisp_gpe_main_t.
- * @param[out]  dst_prefix      Destination IP6 prefix.
- * @param[in]   src_prefix      Source IP6 prefix.
- * @param[in]   table_id        Table id.
- * @param[in]   add_adj         Pointer to the adjacency to be added.
- * @param[in]   is_add          Add/del flag.
+ * Remove routes from both destination and source FIBs.
  *
- * @return 0 on success.
+ * @param[in]   src_fib_index   The index/ID of the SRC FIB
+ * @param[in]   src_prefix      Source IP prefix.
+ * @param[in]   dst_fib_index   The index/ID of the DST FIB
+ * @param[in]   dst_prefix      Destination IP prefix.
  */
-static int
-ip6_sd_fib_add_del_route (lisp_gpe_main_t * lgm, ip_prefix_t * dst_prefix,
-                         ip_prefix_t * src_prefix, u32 table_id,
-                         ip_adjacency_t * add_adj, u8 is_add)
+void
+ip_src_dst_fib_del_route (u32 src_fib_index,
+                         const ip_prefix_t * src_prefix,
+                         u32 dst_fib_index, const ip_prefix_t * dst_prefix)
 {
-  u32 adj_index;
-  ip6_add_del_route_args_t a;
-  ip_adjacency_t *dst_adjp, dst_adj;
-  ip6_address_t dst = ip_prefix_v6 (dst_prefix), src;
-  u32 dst_address_length = ip_prefix_len (dst_prefix), src_address_length = 0;
-  ip6_src_fib_t *src_fib;
-
-  if (src_prefix)
-    {
-      src = ip_prefix_v6 (src_prefix);
-      src_address_length = ip_prefix_len (src_prefix);
-    }
-  else
-    memset (&src, 0, sizeof (src));
-
-  /* lookup dst adj and create it if it doesn't exist */
-  adj_index = ip6_get_route (lgm->im6, table_id, 0, &dst, dst_address_length);
+  fib_prefix_t dst_fib_prefix, src_fib_prefix;
 
-  if (is_add)
-    {
-      /* insert dst prefix to ip6 fib, if it's not in yet */
-      if (adj_index == 0)
-       {
-         /* allocate and init src ip6 fib */
-         pool_get (lgm->ip6_src_fibs, src_fib);
-         memset (src_fib, 0, sizeof (src_fib[0]));
-         ip6_src_fib_init (src_fib);
-
-         memset (&dst_adj, 0, sizeof (dst_adj));
-
-         /* reuse rewrite header to store pointer to src fib */
-         dst_adj.rewrite_header.sw_if_index = src_fib - lgm->ip6_src_fibs;
-
-         /* dst adj should point to lisp gpe ip lookup */
-         dst_adj.lookup_next_index = lgm->ip6_lookup_next_lgpe_ip6_lookup;
-
-         /* explicit_fib_index is used in IP6 FIB lookup, don't reuse it */
-         dst_adj.explicit_fib_index = ~0;
-         dst_adj.n_adj = 1;
-
-         /* make sure we have different signatures for adj in different tables
-          * but with the same lookup_next_index and for adj in the same table
-          * but associated to different destinations */
-         dst_adj.if_address_index = table_id;
-         dst_adj.indirect.next_hop.ip6 = dst;
-
-         memset (&a, 0, sizeof (a));
-         a.flags = IP6_ROUTE_FLAG_TABLE_ID;
-         a.table_index_or_table_id = table_id; /* vrf */
-         a.adj_index = ~0;
-         a.dst_address_length = dst_address_length;
-         a.dst_address = dst;
-         a.flags |= IP6_ROUTE_FLAG_ADD;
-         a.add_adj = &dst_adj;
-         a.n_add_adj = 1;
-
-         ip6_add_del_route (lgm->im6, &a);
-
-         /* lookup dst adj to obtain the adj index */
-         adj_index = ip6_get_route (lgm->im6, table_id, 0, &dst,
-                                    dst_address_length);
-
-         /* make sure insertion succeeded */
-         if (CLIB_DEBUG)
-           {
-             ASSERT (adj_index != 0);
-             dst_adjp = ip_get_adjacency (lgm->lm6, adj_index);
-             ASSERT (dst_adjp->rewrite_header.sw_if_index
-                     == dst_adj.rewrite_header.sw_if_index);
-           }
-       }
-    }
-  else
-    {
-      if (adj_index == 0)
-       {
-         clib_warning
-           ("Trying to delete inexistent dst route for %U. Aborting",
-            format_ip_prefix, dst_prefix);
-         return -1;
-       }
-    }
-
-  dst_adjp = ip_get_adjacency (lgm->lm6, adj_index);
-
-  /* add/del src prefix to src fib */
-  memset (&a, 0, sizeof (a));
-  a.flags = IP6_ROUTE_FLAG_TABLE_ID;
-  a.table_index_or_table_id = dst_adjp->rewrite_header.sw_if_index;
-  a.adj_index = ~0;
-  a.flags |= is_add ? IP6_ROUTE_FLAG_ADD : IP6_ROUTE_FLAG_DEL;
-  a.add_adj = add_adj;
-  a.n_add_adj = is_add ? 1 : 0;
-  /* if src prefix is null, add ::0 */
-  a.dst_address_length = src_address_length;
-  a.dst_address = src;
-  ip6_sd_fib_add_del_src_route (lgm, &a);
-
-  /* make sure insertion succeeded */
-  if (CLIB_DEBUG && is_add)
-    {
-      u32 sai;
-      ip_adjacency_t *src_adjp;
-      sai = ip6_sd_get_src_route (lgm, dst_adjp->rewrite_header.sw_if_index,
-                                 &src, src_address_length);
-      ASSERT (sai != 0);
-      src_adjp = ip_get_adjacency (lgm->lm6, sai);
-      ASSERT (src_adjp->if_address_index == add_adj->if_address_index);
-    }
-
-  /* if a delete, check if there are elements left in the src fib */
-  if (!is_add)
-    {
-      src_fib = pool_elt_at_index (lgm->ip6_src_fibs,
-                                  dst_adjp->rewrite_header.sw_if_index);
-      if (!src_fib)
-       return 0;
-
-      /* if there's nothing left */
-      if (clib_bitmap_count_set_bits
-         (src_fib->non_empty_dst_address_length_bitmap) == 0)
-       {
-         /* remove src fib .. */
-         pool_put (lgm->ip6_src_fibs, src_fib);
+  ASSERT (NULL != dst_prefix);
+  ASSERT (NULL != src_prefix);
 
-         /* .. and remove dst route */
-         memset (&a, 0, sizeof (a));
-         a.flags = IP6_ROUTE_FLAG_TABLE_ID;
-         a.table_index_or_table_id = table_id; /* vrf */
-         a.adj_index = ~0;
-         a.dst_address_length = dst_address_length;
-         a.dst_address = dst;
-         a.flags |= IP6_ROUTE_FLAG_DEL;
+  ip_prefix_to_fib_prefix (dst_prefix, &dst_fib_prefix);
+  ip_prefix_to_fib_prefix (src_prefix, &src_fib_prefix);
 
-         ip6_add_del_route (lgm->im6, &a);
-       }
-    }
-
-  return 0;
-}
+  fib_table_entry_delete (src_fib_index, &src_fib_prefix, FIB_SOURCE_LISP);
 
-/**
- * @brief Retrieve IP6 SD FIB entry.
- *
- * Looks up SD IP6 route by first looking up the destination in VPP's main FIB
- * and subsequently the source in the src FIB. The index of the source FIB is
- * stored in the dst adjacency's @ref rewrite_header.sw_if_index. If source is
- * 0 do search with ::/0 src.
- *
- * @param[in]   lgm             Reference to @ref lisp_gpe_main_t.
- * @param[out]  dst_prefix      Destination IP6 prefix.
- * @param[in]   src_prefix      Source IP6 prefix.
- * @param[in]   table_id        Table id.
- *
- * @return adjacency index if route found.
- */
-static u32
-ip6_sd_fib_get_route (lisp_gpe_main_t * lgm, ip_prefix_t * dst_prefix,
-                     ip_prefix_t * src_prefix, u32 table_id)
-{
-  u32 adj_index;
-  ip6_address_t dst = ip_prefix_v6 (dst_prefix), src;
-  u32 dst_address_length = ip_prefix_len (dst_prefix), src_address_length = 0;
-  ip_adjacency_t *dst_adj;
-
-  if (src_prefix)
+  if (0 == fib_table_get_num_entries (src_fib_index,
+                                     src_fib_prefix.fp_proto,
+                                     FIB_SOURCE_LISP))
     {
-      src = ip_prefix_v6 (src_prefix);
-      src_address_length = ip_prefix_len (src_prefix);
+      /*
+       * there's nothing left: remove the destination route and
+       * unlock the source FIB
+       */
+      fib_table_entry_special_remove (dst_fib_index,
+                                     &dst_fib_prefix, FIB_SOURCE_LISP);
+      fib_table_unlock (src_fib_index, src_fib_prefix.fp_proto);
     }
-  else
-    memset (&src, 0, sizeof (src));
-
-  /* lookup dst adj */
-  adj_index = ip6_get_route (lgm->im6, table_id, 0, &dst, dst_address_length);
-  if (adj_index == 0)
-    return adj_index;
-
-  dst_adj = ip_get_adjacency (lgm->lm6, adj_index);
-  return ip6_sd_get_src_route (lgm, dst_adj->rewrite_header.sw_if_index, &src,
-                              src_address_length);
-}
-
-/**
- * @brief Add/del route to IP4 or IP6 SD FIB.
- *
- * Adds/remove routes to both destination and source FIBs. Entries added
- * to destination FIB are associated to adjacencies that point to the source
- * FIB and store the index of the particular source FIB associated to the
- * destination. Source FIBs are locally managed (see @ref lgm->ip4_src_fibs
- * and @ref lgm->ip6_src_fibs), but the adjacencies are allocated out of the
- * global adjacency pool.
- *
- * @param[in]   lgm             Reference to @ref lisp_gpe_main_t.
- * @param[out]  dst_prefix      Destination IP prefix.
- * @param[in]   src_prefix      Source IP prefix.
- * @param[in]   table_id        Table id.
- * @param[in]   add_adj         Pointer to the adjacency to be added.
- * @param[in]   is_add          Add/del flag.
- *
- * @return 0 on success.
- */
-int
-ip_sd_fib_add_del_route (lisp_gpe_main_t * lgm, ip_prefix_t * dst_prefix,
-                        ip_prefix_t * src_prefix, u32 table_id,
-                        ip_adjacency_t * add_adj, u8 is_add)
-{
-  return (ip_prefix_version (dst_prefix) == IP4 ?
-         ip4_sd_fib_add_del_route : ip6_sd_fib_add_del_route) (lgm,
-                                                               dst_prefix,
-                                                               src_prefix,
-                                                               table_id,
-                                                               add_adj,
-                                                               is_add);
 }
 
 /**
- * @brief Retrieve IP4 or IP6 SD FIB entry.
+ * @brief Add route to IP4 or IP6 SRC FIB.
  *
- * Looks up SD IP route by first looking up the destination in VPP's main FIB
- * and subsequently the source in the src FIB. The index of the source FIB is
- * stored in the dst adjacency's @ref rewrite_header.sw_if_index. If source is
- * 0 do search with ::/0 src.
+ * Adds a route to the LISP SRC FIB with the result of the route
+ * being the DPO passed.
  *
- * @param[in]   lgm             Reference to @ref lisp_gpe_main_t.
- * @param[out]  dst_prefix      Destination IP prefix.
+ * @param[in]   src_fib_index   The index/ID of the SRC FIB
  * @param[in]   src_prefix      Source IP prefix.
- * @param[in]   table_id        Table id.
- *
- * @return adjacency index if route found.
+ * @param[in]   src_dpo         The DPO the route will link to.
  */
-u32
-ip_sd_fib_get_route (lisp_gpe_main_t * lgm, ip_prefix_t * dst_prefix,
-                    ip_prefix_t * src_prefix, u32 table_id)
-{
-  if (ip_prefix_version (dst_prefix) == IP4)
-    {
-      u32 *adj_index = ip4_sd_fib_get_route (lgm, dst_prefix, src_prefix,
-                                            table_id);
-      return (adj_index == 0) ? 0 : adj_index[0];
-    }
-  else
-    return ip6_sd_fib_get_route (lgm, dst_prefix, src_prefix, table_id);
-}
-
-always_inline void
-ip4_src_fib_lookup_one (lisp_gpe_main_t * lgm, u32 src_fib_index0,
-                       ip4_address_t * addr0, u32 * src_adj_index0)
-{
-  ip4_fib_mtrie_leaf_t leaf0, leaf1;
-  ip4_fib_mtrie_t *mtrie0;
-
-  /* if default route not hit in ip4 lookup */
-  if (PREDICT_TRUE (src_fib_index0 != (u32) ~ 0))
-    {
-      mtrie0 = &vec_elt_at_index (lgm->ip4_src_fibs, src_fib_index0)->mtrie;
-
-      leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
-      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 0);
-      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 1);
-      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 2);
-      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 3);
-
-      /* Handle default route. */
-      leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY) ?
-       mtrie0->default_leaf : leaf0;
-      src_adj_index0[0] = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
-    }
-  else
-    src_adj_index0[0] = ~0;
-}
-
-always_inline void
-ip4_src_fib_lookup_two (lisp_gpe_main_t * lgm, u32 src_fib_index0,
-                       u32 src_fib_index1, ip4_address_t * addr0,
-                       ip4_address_t * addr1, u32 * src_adj_index0,
-                       u32 * src_adj_index1)
+void
+ip_src_fib_add_route_w_dpo (u32 src_fib_index,
+                           const ip_prefix_t * src_prefix,
+                           const dpo_id_t * src_dpo)
 {
-  ip4_fib_mtrie_leaf_t leaf0, leaf1;
-  ip4_fib_mtrie_t *mtrie0, *mtrie1;
-
-  /* if default route not hit in ip4 lookup */
-  if (PREDICT_TRUE
-      (src_fib_index0 != (u32) ~ 0 && src_fib_index1 != (u32) ~ 0))
-    {
-      mtrie0 = &vec_elt_at_index (lgm->ip4_src_fibs, src_fib_index0)->mtrie;
-      mtrie1 = &vec_elt_at_index (lgm->ip4_src_fibs, src_fib_index1)->mtrie;
-
-      leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
-
-      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 0);
-      leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, addr1, 0);
+  fib_prefix_t src_fib_prefix;
 
-      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 1);
-      leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, addr1, 1);
+  ip_prefix_to_fib_prefix (src_prefix, &src_fib_prefix);
 
-      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 2);
-      leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, addr1, 2);
+  /*
+   * add the entry into the source fib.
+   */
+  fib_node_index_t src_fei;
 
-      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 3);
-      leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, addr1, 3);
+  src_fei = fib_table_lookup_exact_match (src_fib_index, &src_fib_prefix);
 
-      /* Handle default route. */
-      leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY) ?
-       mtrie0->default_leaf : leaf0;
-      leaf1 = (leaf1 == IP4_FIB_MTRIE_LEAF_EMPTY) ?
-       mtrie1->default_leaf : leaf1;
-      src_adj_index0[0] = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
-      src_adj_index1[0] = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
-    }
-  else
+  if (FIB_NODE_INDEX_INVALID == src_fei ||
+      !fib_entry_is_sourced (src_fei, FIB_SOURCE_LISP))
     {
-      ip4_src_fib_lookup_one (lgm, src_fib_index0, addr0, src_adj_index0);
-      ip4_src_fib_lookup_one (lgm, src_fib_index1, addr1, src_adj_index1);
+      fib_table_entry_special_dpo_add (src_fib_index,
+                                      &src_fib_prefix,
+                                      FIB_SOURCE_LISP,
+                                      FIB_ENTRY_FLAG_EXCLUSIVE, src_dpo);
     }
 }
 
-/**
- * @brief IPv4 src lookup node.
- * @node lgpe-ip4-lookup
- *
- * The LISP IPv4 source lookup dispatch node.
- *
- * This is the IPv4 source lookup dispatch node. It first looks up the
- * adjacency hit in the main (destination) FIB and then uses its
- * <code>rewrite_header.sw_if_index</code>to find the source FIB wherein
- * the source IP is subsequently looked up. Data in the resulting adjacency
- * is used to decide the next node (the lisp_gpe interface) and if a flow
- * hash must be computed, when traffic can be load balanced over multiple
- * tunnels.
- *
- *
- * @param[in]   vm      vlib_main_t corresponding to current thread.
- * @param[in]   node    vlib_node_runtime_t data for this node.
- * @param[in]   frame   vlib_frame_t whose contents should be dispatched.
- *
- * @return number of vectors in frame.
- */
-always_inline uword
-lgpe_ip4_lookup (vlib_main_t * vm, vlib_node_runtime_t * node,
-                vlib_frame_t * from_frame)
+static void
+ip_address_to_46 (const ip_address_t * addr,
+                 ip46_address_t * a, fib_protocol_t * proto)
 {
-  u32 n_left_from, next_index, *from, *to_next;
-  lisp_gpe_main_t *lgm = &lisp_gpe_main;
-
-  from = vlib_frame_vector_args (from_frame);
-  n_left_from = from_frame->n_vectors;
-
-  next_index = node->cached_next_index;
-
-  while (n_left_from > 0)
+  *proto = (IP4 == ip_addr_version (addr) ?
+           FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6);
+  switch (*proto)
     {
-      u32 n_left_to_next;
-
-      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
-
-      while (n_left_from >= 4 && n_left_to_next >= 2)
-       {
-         u32 bi0, bi1;
-         vlib_buffer_t *b0, *b1;
-         ip4_header_t *ip0, *ip1;
-         u32 dst_adj_index0, src_adj_index0, src_fib_index0;
-         u32 dst_adj_index1, src_adj_index1, src_fib_index1;
-         ip_adjacency_t *dst_adj0, *src_adj0, *dst_adj1, *src_adj1;
-         u32 next0, next1;
-
-         next0 = next1 = LGPE_IP4_LOOKUP_NEXT_LISP_CP_LOOKUP;
-
-         /* Prefetch next iteration. */
-         {
-           vlib_buffer_t *p2, *p3;
-
-           p2 = vlib_get_buffer (vm, from[2]);
-           p3 = vlib_get_buffer (vm, from[3]);
-
-           vlib_prefetch_buffer_header (p2, LOAD);
-           vlib_prefetch_buffer_header (p3, LOAD);
-
-           CLIB_PREFETCH (p2->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
-           CLIB_PREFETCH (p3->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
-         }
-
-         bi0 = from[0];
-         bi1 = from[1];
-         to_next[0] = bi0;
-         to_next[1] = bi1;
-         from += 2;
-         to_next += 2;
-         n_left_to_next -= 2;
-         n_left_from -= 2;
-
-         b0 = vlib_get_buffer (vm, bi0);
-         b1 = vlib_get_buffer (vm, bi1);
-
-         ip0 = vlib_buffer_get_current (b0);
-         ip1 = vlib_buffer_get_current (b1);
-
-         /* dst lookup was done by ip4 lookup */
-         dst_adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
-         dst_adj_index1 = vnet_buffer (b1)->ip.adj_index[VLIB_TX];
-
-         dst_adj0 = ip_get_adjacency (lgm->lm4, dst_adj_index0);
-         dst_adj1 = ip_get_adjacency (lgm->lm4, dst_adj_index1);
-
-         src_fib_index0 = dst_adj0->rewrite_header.sw_if_index;
-         src_fib_index1 = dst_adj1->rewrite_header.sw_if_index;
-
-         ip4_src_fib_lookup_two (lgm, src_fib_index0, src_fib_index1,
-                                 &ip0->src_address, &ip1->src_address,
-                                 &src_adj_index0, &src_adj_index1);
-
-         /* if a source fib exists */
-         if (PREDICT_TRUE ((u32) ~ 0 != src_adj_index0
-                           && (u32) ~ 0 != src_adj_index1))
-           {
-             vnet_buffer (b0)->ip.adj_index[VLIB_TX] = src_adj_index0;
-             vnet_buffer (b1)->ip.adj_index[VLIB_TX] = src_adj_index1;
-
-             src_adj0 = ip_get_adjacency (lgm->lm4, src_adj_index0);
-             src_adj1 = ip_get_adjacency (lgm->lm4, src_adj_index1);
-
-             next0 = src_adj0->explicit_fib_index;
-             next1 = src_adj1->explicit_fib_index;
-
-             /* prepare buffer for lisp-gpe output node */
-             vnet_buffer (b0)->sw_if_index[VLIB_TX] =
-               src_adj0->rewrite_header.sw_if_index;
-             vnet_buffer (b1)->sw_if_index[VLIB_TX] =
-               src_adj1->rewrite_header.sw_if_index;
-
-             /* if multipath: saved_lookup_next_index is reused to store
-              * nb of sub-tunnels. If greater than 1, multipath is on.
-              * Note that flow hash should be 0 after ipx lookup! */
-             if (PREDICT_TRUE (src_adj0->saved_lookup_next_index > 1))
-               vnet_buffer (b0)->ip.flow_hash =
-                 ip4_compute_flow_hash (ip0, IP_FLOW_HASH_DEFAULT);
-
-             if (PREDICT_TRUE (src_adj1->saved_lookup_next_index > 1))
-               vnet_buffer (b1)->ip.flow_hash =
-                 ip4_compute_flow_hash (ip1, IP_FLOW_HASH_DEFAULT);
-           }
-         else
-           {
-             if ((u32) ~ 0 != src_adj_index0)
-               {
-                 vnet_buffer (b0)->ip.adj_index[VLIB_TX] = src_adj_index0;
-                 src_adj0 = ip_get_adjacency (lgm->lm4, src_adj_index0);
-                 next0 = src_adj0->explicit_fib_index;
-                 vnet_buffer (b0)->sw_if_index[VLIB_TX] =
-                   src_adj0->rewrite_header.sw_if_index;
-
-                 if (PREDICT_TRUE (src_adj0->saved_lookup_next_index > 1))
-                   vnet_buffer (b0)->ip.flow_hash =
-                     ip4_compute_flow_hash (ip0, IP_FLOW_HASH_DEFAULT);
-               }
-             else
-               {
-                 next0 = LGPE_IP4_LOOKUP_NEXT_LISP_CP_LOOKUP;
-               }
-
-             if ((u32) ~ 0 != src_adj_index1)
-               {
-                 vnet_buffer (b1)->ip.adj_index[VLIB_TX] = src_adj_index1;
-                 src_adj1 = ip_get_adjacency (lgm->lm4, src_adj_index1);
-                 next1 = src_adj1->explicit_fib_index;
-                 vnet_buffer (b1)->sw_if_index[VLIB_TX] =
-                   src_adj1->rewrite_header.sw_if_index;
-                 if (PREDICT_TRUE (src_adj1->saved_lookup_next_index > 1))
-                   vnet_buffer (b1)->ip.flow_hash =
-                     ip4_compute_flow_hash (ip1, IP_FLOW_HASH_DEFAULT);
-               }
-             else
-               {
-                 next1 = LGPE_IP4_LOOKUP_NEXT_LISP_CP_LOOKUP;
-               }
-           }
-
-         /* mark the packets for CP lookup if needed */
-         if (PREDICT_FALSE (LGPE_IP4_LOOKUP_NEXT_LISP_CP_LOOKUP == next0))
-           vnet_buffer (b0)->lisp.overlay_afi = LISP_AFI_IP;
-         if (PREDICT_FALSE (LGPE_IP4_LOOKUP_NEXT_LISP_CP_LOOKUP == next1))
-           vnet_buffer (b1)->lisp.overlay_afi = LISP_AFI_IP;
-
-         vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next,
-                                          n_left_to_next, bi0, bi1, next0,
-                                          next1);
-       }
-
-      while (n_left_from > 0 && n_left_to_next > 0)
-       {
-         vlib_buffer_t *b0;
-         ip4_header_t *ip0;
-         u32 bi0, dst_adj_index0, src_adj_index0, src_fib_index0;
-         u32 next0 = LGPE_IP4_LOOKUP_NEXT_LISP_CP_LOOKUP;
-         ip_adjacency_t *dst_adj0, *src_adj0;
-
-         bi0 = from[0];
-         to_next[0] = bi0;
-         from += 1;
-         to_next += 1;
-         n_left_from -= 1;
-         n_left_to_next -= 1;
-
-         b0 = vlib_get_buffer (vm, bi0);
-         ip0 = vlib_buffer_get_current (b0);
-
-         /* dst lookup was done by ip4 lookup */
-         dst_adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
-         dst_adj0 = ip_get_adjacency (lgm->lm4, dst_adj_index0);
-         src_fib_index0 = dst_adj0->rewrite_header.sw_if_index;
-
-         /* do src lookup */
-         ip4_src_fib_lookup_one (lgm, src_fib_index0, &ip0->src_address,
-                                 &src_adj_index0);
-
-         /* if a source fib exists */
-         if (PREDICT_TRUE ((u32) ~ 0 != src_adj_index0))
-           {
-             vnet_buffer (b0)->ip.adj_index[VLIB_TX] = src_adj_index0;
-             src_adj0 = ip_get_adjacency (lgm->lm4, src_adj_index0);
-             next0 = src_adj0->explicit_fib_index;
-
-             /* prepare packet for lisp-gpe output node */
-             vnet_buffer (b0)->sw_if_index[VLIB_TX] =
-               src_adj0->rewrite_header.sw_if_index;
-
-             /* if multipath: saved_lookup_next_index is reused to store
-              * nb of sub-tunnels. If greater than 1, multipath is on */
-             if (PREDICT_TRUE (src_adj0->saved_lookup_next_index > 1))
-               vnet_buffer (b0)->ip.flow_hash =
-                 ip4_compute_flow_hash (ip0, IP_FLOW_HASH_DEFAULT);
-           }
-         else
-           {
-             next0 = LGPE_IP4_LOOKUP_NEXT_LISP_CP_LOOKUP;
-           }
-
-         if (PREDICT_FALSE (LGPE_IP4_LOOKUP_NEXT_LISP_CP_LOOKUP == next0))
-           vnet_buffer (b0)->lisp.overlay_afi = LISP_AFI_IP;
-
-         vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
-                                          n_left_to_next, bi0, next0);
-       }
-      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    case FIB_PROTOCOL_IP4:
+      a->ip4 = addr->ip.v4;
+      break;
+    case FIB_PROTOCOL_IP6:
+      a->ip6 = addr->ip.v6;
+      break;
+    default:
+      ASSERT (0);
+      break;
     }
-  return from_frame->n_vectors;
 }
 
-/* *INDENT-OFF* */
-VLIB_REGISTER_NODE (lgpe_ip4_lookup_node) = {
-  .function = lgpe_ip4_lookup,
-  .name = "lgpe-ip4-lookup",
-  .vector_size = sizeof (u32),
-
-  .type = VLIB_NODE_TYPE_INTERNAL,
-
-  .n_next_nodes = LGPE_IP4_LOOKUP_N_NEXT,
-  .next_nodes = {
-#define _(sym,str) [LGPE_IP4_LOOKUP_NEXT_##sym] = str,
-      foreach_lgpe_ip4_lookup_next
-#undef _
-  },
-};
-/* *INDENT-ON* */
-
-static u32
-ip6_src_fib_lookup (lisp_gpe_main_t * lgm, u32 src_fib_index,
-                   ip6_address_t * src)
+/**
+ * @brief Build the FIB route-paths for the best-priority LISP paths.
+ *
+ * Walks @p paths in order and converts every entry whose priority equals
+ * that of the first entry; the walk stops at the first entry with a
+ * different priority, so the vector is expected to be sorted by priority.
+ *
+ * @param[in]   paths   Non-empty vector of LISP forwarding paths.
+ *
+ * @return A new vector of route paths, to be vec_free'd by the caller.
+ */
+static fib_route_path_t *
+ip_src_fib_mk_paths (const lisp_fwd_path_t * paths)
 {
-  int i, len;
-  int rv;
-  BVT (clib_bihash_kv) kv, value;
-  ip6_src_fib_t *fib = pool_elt_at_index (lgm->ip6_src_fibs, src_fib_index);
+  const lisp_gpe_adjacency_t *ladj;
+  fib_route_path_t *rpaths = NULL;
+  u8 best_priority;
+  u32 ii;
 
-  len = vec_len (fib->prefix_lengths_in_search_order);
+  /* paths[0] is read below and vec_len () - 1 wraps on an empty vector */
+  ASSERT (0 != vec_len (paths));
+
+  vec_validate (rpaths, vec_len (paths) - 1);
 
-  for (i = 0; i < len; i++)
-    {
-      int dst_address_length = fib->prefix_lengths_in_search_order[i];
-      ip6_address_t *mask;
+  best_priority = paths[0].priority;
 
-      ASSERT (dst_address_length >= 0 && dst_address_length <= 128);
+  vec_foreach_index (ii, paths)
+  {
+    /* stop at the first path that is not of the best priority */
+    if (paths[ii].priority != best_priority)
+      break;
 
-      mask = &fib->fib_masks[dst_address_length];
+    ladj = lisp_gpe_adjacency_get (paths[ii].lisp_adj);
 
-      kv.key[0] = src->as_u64[0] & mask->as_u64[0];
-      kv.key[1] = src->as_u64[1] & mask->as_u64[1];
-      kv.key[2] = dst_address_length;
+    ip_address_to_46 (&ladj->remote_rloc,
+                     &rpaths[ii].frp_addr, &rpaths[ii].frp_proto);
 
-      rv =
-       BV (clib_bihash_search_inline_2) (&fib->ip6_lookup_table, &kv,
-                                         &value);
-      if (rv == 0)
-       return value.value;
-    }
-
-  return 0;
-}
+    rpaths[ii].frp_sw_if_index = ladj->sw_if_index;
+    /* a weight of zero is installed as a weight of one */
+    rpaths[ii].frp_weight = (paths[ii].weight ? paths[ii].weight : 1);
+    rpaths[ii].frp_label = MPLS_LABEL_INVALID;
+  }
 
-always_inline void
-ip6_src_fib_lookup_one (lisp_gpe_main_t * lgm, u32 src_fib_index0,
-                       ip6_address_t * addr0, u32 * src_adj_index0)
-{
-  /* if default route not hit in ip6 lookup */
-  if (PREDICT_TRUE (src_fib_index0 != (u32) ~ 0))
-    src_adj_index0[0] = ip6_src_fib_lookup (lgm, src_fib_index0, addr0);
-  else
-    src_adj_index0[0] = ~0;
-}
+  /* ii is the number of paths converted; drop the slots left unused */
+  _vec_len (rpaths) = ii;
+
+  ASSERT (0 != vec_len (rpaths));
 
-always_inline void
-ip6_src_fib_lookup_two (lisp_gpe_main_t * lgm, u32 src_fib_index0,
-                       u32 src_fib_index1, ip6_address_t * addr0,
-                       ip6_address_t * addr1, u32 * src_adj_index0,
-                       u32 * src_adj_index1)
-{
-  /* if default route not hit in ip6 lookup */
-  if (PREDICT_TRUE
-      (src_fib_index0 != (u32) ~ 0 && src_fib_index1 != (u32) ~ 0))
-    {
-      src_adj_index0[0] = ip6_src_fib_lookup (lgm, src_fib_index0, addr0);
-      src_adj_index1[0] = ip6_src_fib_lookup (lgm, src_fib_index1, addr1);
-    }
-  else
-    {
-      ip6_src_fib_lookup_one (lgm, src_fib_index0, addr0, src_adj_index0);
-      ip6_src_fib_lookup_one (lgm, src_fib_index1, addr1, src_adj_index1);
-    }
+  return (rpaths);
 }
 
 /**
- * @brief IPv6 src lookup node.
- * @node lgpe-ip6-lookup
- *
- * The LISP IPv6 source lookup dispatch node.
+ * @brief Add route to IP4 or IP6 SRC FIB.
  *
- * This is the IPv6 source lookup dispatch node. It first looks up the
- * adjacency hit in the main (destination) FIB and then uses its
- * <code>rewrite_header.sw_if_index</code>to find the source FIB wherein
- * the source IP is subsequently looked up. Data in the resulting adjacency
- * is used to decide the next node (the lisp_gpe interface) and if a flow
- * hash must be computed, when traffic can be load balanced over multiple
- * tunnels.
+ * Adds a route to the LISP SRC FIB for the tunnel.
  *
- * @param[in]   vm      vlib_main_t corresponding to current thread.
- * @param[in]   node    vlib_node_runtime_t data for this node.
- * @param[in]   frame   vlib_frame_t whose contents should be dispatched.
- *
- * @return number of vectors in frame.
+ * @param[in]   src_fib_index   The index/ID of the SRC FIB
+ * @param[in]   src_prefix      Source IP prefix.
+ * @param[in]   paths           The paths from which to construct the
+ *                              load balance
  */
-always_inline uword
-lgpe_ip6_lookup (vlib_main_t * vm, vlib_node_runtime_t * node,
-                vlib_frame_t * from_frame)
+void
+ip_src_fib_add_route (u32 src_fib_index,
+                     const ip_prefix_t * src_prefix,
+                     const lisp_fwd_path_t * paths)
 {
-  u32 n_left_from, next_index, *from, *to_next;
-  lisp_gpe_main_t *lgm = &lisp_gpe_main;
-
-  from = vlib_frame_vector_args (from_frame);
-  n_left_from = from_frame->n_vectors;
-
-  next_index = node->cached_next_index;
-
-  while (n_left_from > 0)
-    {
-      u32 n_left_to_next;
-
-      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
-
-      while (n_left_from >= 4 && n_left_to_next >= 2)
-       {
-         u32 bi0, bi1;
-         vlib_buffer_t *b0, *b1;
-         ip6_header_t *ip0, *ip1;
-         u32 dst_adj_index0, src_adj_index0, src_fib_index0, dst_adj_index1,
-           src_adj_index1, src_fib_index1;
-         ip_adjacency_t *dst_adj0, *src_adj0, *dst_adj1, *src_adj1;
-         u32 next0, next1;
-
-         next0 = next1 = LGPE_IP6_LOOKUP_NEXT_LISP_CP_LOOKUP;
-
-         /* Prefetch next iteration. */
-         {
-           vlib_buffer_t *p2, *p3;
-
-           p2 = vlib_get_buffer (vm, from[2]);
-           p3 = vlib_get_buffer (vm, from[3]);
-
-           vlib_prefetch_buffer_header (p2, LOAD);
-           vlib_prefetch_buffer_header (p3, LOAD);
-
-           CLIB_PREFETCH (p2->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
-           CLIB_PREFETCH (p3->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
-         }
-
-         bi0 = from[0];
-         bi1 = from[1];
-         to_next[0] = bi0;
-         to_next[1] = bi1;
-         from += 2;
-         to_next += 2;
-         n_left_to_next -= 2;
-         n_left_from -= 2;
-
-         b0 = vlib_get_buffer (vm, bi0);
-         b1 = vlib_get_buffer (vm, bi1);
-
-         ip0 = vlib_buffer_get_current (b0);
-         ip1 = vlib_buffer_get_current (b1);
-
-         /* dst lookup was done by ip6 lookup */
-         dst_adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
-         dst_adj_index1 = vnet_buffer (b1)->ip.adj_index[VLIB_TX];
-
-         dst_adj0 = ip_get_adjacency (lgm->lm6, dst_adj_index0);
-         dst_adj1 = ip_get_adjacency (lgm->lm6, dst_adj_index1);
-
-         src_fib_index0 = dst_adj0->rewrite_header.sw_if_index;
-         src_fib_index1 = dst_adj1->rewrite_header.sw_if_index;
-
-         ip6_src_fib_lookup_two (lgm, src_fib_index0, src_fib_index1,
-                                 &ip0->src_address, &ip1->src_address,
-                                 &src_adj_index0, &src_adj_index1);
-
-         /* if a source fib exists */
-         if (PREDICT_TRUE ((u32) ~ 0 != src_adj_index0
-                           && (u32) ~ 0 != src_adj_index1))
-           {
-             vnet_buffer (b0)->ip.adj_index[VLIB_TX] = src_adj_index0;
-             vnet_buffer (b1)->ip.adj_index[VLIB_TX] = src_adj_index1;
+  fib_prefix_t src_fib_prefix;
+  fib_route_path_t *rpaths;
 
-             src_adj0 = ip_get_adjacency (lgm->lm6, src_adj_index0);
-             src_adj1 = ip_get_adjacency (lgm->lm6, src_adj_index1);
+  ip_prefix_to_fib_prefix (src_prefix, &src_fib_prefix);
 
-             next0 = src_adj0->explicit_fib_index;
-             next1 = src_adj1->explicit_fib_index;
+  rpaths = ip_src_fib_mk_paths (paths);
 
-             /* prepare buffer for lisp-gpe output node */
-             vnet_buffer (b0)->sw_if_index[VLIB_TX] =
-               src_adj0->rewrite_header.sw_if_index;
-             vnet_buffer (b1)->sw_if_index[VLIB_TX] =
-               src_adj1->rewrite_header.sw_if_index;
-
-             /* if multipath: saved_lookup_next_index is reused to store
-              * nb of sub-tunnels. If greater than 1, multipath is on.
-              * Note that flow hash should be 0 after ipx lookup! */
-             if (PREDICT_TRUE (src_adj0->saved_lookup_next_index > 1))
-               vnet_buffer (b0)->ip.flow_hash =
-                 ip6_compute_flow_hash (ip0, IP_FLOW_HASH_DEFAULT);
-
-             if (PREDICT_TRUE (src_adj1->saved_lookup_next_index > 1))
-               vnet_buffer (b1)->ip.flow_hash =
-                 ip6_compute_flow_hash (ip1, IP_FLOW_HASH_DEFAULT);
-           }
-         else
-           {
-             if (src_adj_index0 != (u32) ~ 0)
-               {
-                 vnet_buffer (b0)->ip.adj_index[VLIB_TX] = src_adj_index0;
-                 src_adj0 = ip_get_adjacency (lgm->lm6, src_adj_index0);
-                 next0 = src_adj0->explicit_fib_index;
-                 vnet_buffer (b0)->sw_if_index[VLIB_TX] =
-                   src_adj0->rewrite_header.sw_if_index;
-
-                 if (PREDICT_TRUE (src_adj0->saved_lookup_next_index > 1))
-                   vnet_buffer (b0)->ip.flow_hash =
-                     ip6_compute_flow_hash (ip0, IP_FLOW_HASH_DEFAULT);
-               }
-             else
-               {
-                 next0 = LGPE_IP4_LOOKUP_NEXT_LISP_CP_LOOKUP;
-               }
-
-             if (src_adj_index1 != (u32) ~ 0)
-               {
-                 vnet_buffer (b1)->ip.adj_index[VLIB_TX] = src_adj_index1;
-                 src_adj1 = ip_get_adjacency (lgm->lm6, src_adj_index1);
-                 next1 = src_adj1->explicit_fib_index;
-                 vnet_buffer (b1)->sw_if_index[VLIB_TX] =
-                   src_adj1->rewrite_header.sw_if_index;
-
-                 if (PREDICT_TRUE (src_adj1->saved_lookup_next_index > 1))
-                   vnet_buffer (b1)->ip.flow_hash =
-                     ip6_compute_flow_hash (ip1, IP_FLOW_HASH_DEFAULT);
-               }
-             else
-               {
-                 next1 = LGPE_IP4_LOOKUP_NEXT_LISP_CP_LOOKUP;
-               }
-           }
-
-         /* mark the packets for CP lookup if needed */
-         if (PREDICT_FALSE (LGPE_IP4_LOOKUP_NEXT_LISP_CP_LOOKUP == next0))
-           vnet_buffer (b0)->lisp.overlay_afi = LISP_AFI_IP;
-         if (PREDICT_FALSE (LGPE_IP4_LOOKUP_NEXT_LISP_CP_LOOKUP == next1))
-           vnet_buffer (b1)->lisp.overlay_afi = LISP_AFI_IP;
-
-         vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next,
-                                          n_left_to_next, bi0, bi1, next0,
-                                          next1);
-       }
-
-      while (n_left_from > 0 && n_left_to_next > 0)
-       {
-         vlib_buffer_t *b0;
-         ip6_header_t *ip0;
-         u32 bi0, dst_adj_index0, src_adj_index0, src_fib_index0;
-         u32 next0 = LGPE_IP6_LOOKUP_NEXT_LISP_CP_LOOKUP;
-         ip_adjacency_t *dst_adj0, *src_adj0;
-
-         bi0 = from[0];
-         to_next[0] = bi0;
-         from += 1;
-         to_next += 1;
-         n_left_from -= 1;
-         n_left_to_next -= 1;
-
-         b0 = vlib_get_buffer (vm, bi0);
-         ip0 = vlib_buffer_get_current (b0);
-
-         /* dst lookup was done by ip6 lookup */
-         dst_adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
-         dst_adj0 = ip_get_adjacency (lgm->lm6, dst_adj_index0);
-         src_fib_index0 = dst_adj0->rewrite_header.sw_if_index;
-
-         /* do src lookup */
-         ip6_src_fib_lookup_one (lgm, src_fib_index0, &ip0->src_address,
-                                 &src_adj_index0);
-
-         /* if a source fib exists */
-         if (PREDICT_TRUE (src_adj_index0 != (u32) ~ 0))
-           {
-             vnet_buffer (b0)->ip.adj_index[VLIB_TX] = src_adj_index0;
-             src_adj0 = ip_get_adjacency (lgm->lm6, src_adj_index0);
-             next0 = src_adj0->explicit_fib_index;
-
-             /* prepare packet for lisp-gpe output node */
-             vnet_buffer (b0)->sw_if_index[VLIB_TX] =
-               src_adj0->rewrite_header.sw_if_index;
-
-             /* if multipath: saved_lookup_next_index is reused to store
-              * nb of sub-tunnels. If greater than 1, multipath is on */
-             if (PREDICT_TRUE (src_adj0->saved_lookup_next_index > 1))
-               vnet_buffer (b0)->ip.flow_hash =
-                 ip6_compute_flow_hash (ip0, IP_FLOW_HASH_DEFAULT);
-           }
-         else
-           {
-             next0 = LGPE_IP4_LOOKUP_NEXT_LISP_CP_LOOKUP;
-           }
-
-         /* mark the packets for CP lookup if needed */
-         if (PREDICT_FALSE (LGPE_IP4_LOOKUP_NEXT_LISP_CP_LOOKUP == next0))
-           vnet_buffer (b0)->lisp.overlay_afi = LISP_AFI_IP;
-
-         vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
-                                          n_left_to_next, bi0, next0);
-       }
-      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
-    }
-  return from_frame->n_vectors;
+  fib_table_entry_update (src_fib_index,
+                         &src_fib_prefix,
+                         FIB_SOURCE_LISP, FIB_ENTRY_FLAG_NONE, rpaths);
+  vec_free (rpaths);
 }
 
-/* *INDENT-OFF* */
-VLIB_REGISTER_NODE (lgpe_ip6_lookup_node) = {
-  .function = lgpe_ip6_lookup,
-  .name = "lgpe-ip6-lookup",
-  .vector_size = sizeof (u32),
-
-  .type = VLIB_NODE_TYPE_INTERNAL,
-
-  .n_next_nodes = LGPE_IP6_LOOKUP_N_NEXT,
-  .next_nodes = {
-#define _(sym,str) [LGPE_IP6_LOOKUP_NEXT_##sym] = str,
-      foreach_lgpe_ip6_lookup_next
-#undef _
-  },
-};
-/* *INDENT-ON* */
-
 /*
  * fd.io coding-style-patch-verification: ON
  *
index 579422b..f05c6a2 100644 (file)
  */
 
 #include <vnet/lisp-gpe/lisp_gpe.h>
-#include <vppinfra/math.h>
+#include <vnet/lisp-gpe/lisp_gpe_adjacency.h>
+#include <vnet/adj/adj_midchain.h>
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/fib_entry.h>
+#include <vnet/fib/fib_path_list.h>
+#include <vnet/dpo/drop_dpo.h>
+#include <vnet/dpo/load_balance.h>
 
 /** LISP-GPE global state */
 lisp_gpe_main_t lisp_gpe_main;
 
 /**
- * @brief Compute IP-UDP-GPE sub-tunnel encap/rewrite header.
- *
- * @param[in]   t       Parent of the sub-tunnel.
- * @param[in]   st      Sub-tunnel.
- * @param[in]   lp      Local and remote locators used in the encap header.
- *
- * @return 0 on success.
+ * @brief A Pool of all LISP forwarding entries
  */
-static int
-lisp_gpe_rewrite (lisp_gpe_tunnel_t * t, lisp_gpe_sub_tunnel_t * st,
-                 locator_pair_t * lp)
-{
-  u8 *rw = 0;
-  lisp_gpe_header_t *lisp0;
-  int len;
-
-  if (ip_addr_version (&lp->lcl_loc) == IP4)
-    {
-      ip4_header_t *ip0;
-      ip4_udp_lisp_gpe_header_t *h0;
-      len = sizeof (*h0);
+static lisp_fwd_entry_t *lisp_fwd_entry_pool;
 
-      vec_validate_aligned (rw, len - 1, CLIB_CACHE_LINE_BYTES);
+/**
+ * DB of all forwarding entries. The Key is:{l-EID,r-EID,vni}
+ * where the EID encodes L2 or L3
+ */
+static uword *lisp_gpe_fwd_entries;
 
-      h0 = (ip4_udp_lisp_gpe_header_t *) rw;
+/**
+ * @brief Install the FIB routes for a forwarding entry.
+ *
+ * Adds the remote EID prefix to the EID FIB and stores the index returned
+ * by ip_dst_fib_add_route () as the entry's SRC FIB index. The local EID
+ * prefix is then added to that SRC FIB: with a DPO selected by the entry's
+ * action for negative entries, with the entry's paths otherwise.
+ *
+ * @param[in,out] lfe   The forwarding entry; src_fib_index is written.
+ */
+static void
+create_fib_entries (lisp_fwd_entry_t * lfe)
+{
+  dpo_proto_t dproto;
 
-      /* Fixed portion of the (outer) ip4 header */
-      ip0 = &h0->ip4;
-      ip0->ip_version_and_header_length = 0x45;
-      ip0->ttl = 254;
-      ip0->protocol = IP_PROTOCOL_UDP;
+  /* NOTE(review): dproto is a dpo_proto_t but is assigned FIB_PROTOCOL_*
+   * values - confirm the two enums agree for IP4/IP6 */
+  dproto = (ip_prefix_version (&lfe->key->rmt.ippref) == IP4 ?
+           FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6);
 
-      /* we fix up the ip4 header length and checksum after-the-fact */
-      ip_address_copy_addr (&ip0->src_address, &lp->lcl_loc);
-      ip_address_copy_addr (&ip0->dst_address, &lp->rmt_loc);
-      ip0->checksum = ip4_header_checksum (ip0);
+  lfe->src_fib_index = ip_dst_fib_add_route (lfe->eid_fib_index,
+                                            &lfe->key->rmt.ippref);
 
-      /* UDP header, randomize src port on something, maybe? */
-      h0->udp.src_port = clib_host_to_net_u16 (4341);
-      h0->udp.dst_port = clib_host_to_net_u16 (UDP_DST_PORT_lisp_gpe);
+  if (LISP_FWD_ENTRY_TYPE_NEGATIVE == lfe->type)
+    {
+      dpo_id_t dpo = DPO_NULL;
 
-      /* LISP-gpe header */
-      lisp0 = &h0->lisp;
+      /* the first three actions share one body; only LISP_DROP differs.
+       * NOTE(review): there is no default case, so any other action value
+       * leaves dpo as DPO_NULL when the route is added */
+      switch (lfe->action)
+       {
+       case LISP_NO_ACTION:
+         /* TODO update timers? */
+       case LISP_FORWARD_NATIVE:
+         /* TODO check if route/next-hop for eid exists in fib and add
+          * more specific for the eid with the next-hop found */
+       case LISP_SEND_MAP_REQUEST:
+         /* insert tunnel that always sends map-request */
+         dpo_set (&dpo, DPO_LISP_CP, 0, dproto);
+         break;
+       case LISP_DROP:
+         /* for drop fwd entries, just add route, no need to add encap tunnel */
+         dpo_copy (&dpo, drop_dpo_get (dproto));
+         break;
+       }
+      ip_src_fib_add_route_w_dpo (lfe->src_fib_index,
+                                 &lfe->key->lcl.ippref, &dpo);
+      dpo_reset (&dpo);
     }
   else
     {
-      ip6_header_t *ip0;
-      ip6_udp_lisp_gpe_header_t *h0;
-      len = sizeof (*h0);
-
-      vec_validate_aligned (rw, len - 1, CLIB_CACHE_LINE_BYTES);
-
-      h0 = (ip6_udp_lisp_gpe_header_t *) rw;
-
-      /* Fixed portion of the (outer) ip6 header */
-      ip0 = &h0->ip6;
-      ip0->ip_version_traffic_class_and_flow_label =
-       clib_host_to_net_u32 (0x6 << 28);
-      ip0->hop_limit = 254;
-      ip0->protocol = IP_PROTOCOL_UDP;
-
-      /* we fix up the ip6 header length after-the-fact */
-      ip_address_copy_addr (&ip0->src_address, &lp->lcl_loc);
-      ip_address_copy_addr (&ip0->dst_address, &lp->rmt_loc);
-
-      /* UDP header, randomize src port on something, maybe? */
-      h0->udp.src_port = clib_host_to_net_u16 (4341);
-      h0->udp.dst_port = clib_host_to_net_u16 (UDP_DST_PORT_lisp_gpe);
-
-      /* LISP-gpe header */
-      lisp0 = &h0->lisp;
+      ip_src_fib_add_route (lfe->src_fib_index,
+                           &lfe->key->lcl.ippref, lfe->paths);
     }
-
-  lisp0->flags = t->flags;
-  lisp0->ver_res = t->ver_res;
-  lisp0->res = t->res;
-  lisp0->next_protocol = t->next_protocol;
-  lisp0->iid = clib_host_to_net_u32 (t->vni);
-
-  st->is_ip4 = ip_addr_version (&lp->lcl_loc) == IP4;
-  st->rewrite = rw;
-  return 0;
 }
 
-static int
-weight_cmp (normalized_sub_tunnel_weights_t * a,
-           normalized_sub_tunnel_weights_t * b)
+/**
+ * @brief Remove the FIB routes of a forwarding entry.
+ *
+ * Passes the entry's SRC FIB index and local EID prefix, together with its
+ * EID FIB index and remote EID prefix, to ip_src_dst_fib_del_route ().
+ *
+ * @param[in]   lfe     The forwarding entry whose routes are removed.
+ */
+static void
+delete_fib_entries (lisp_fwd_entry_t * lfe)
 {
-  int cmp = a->weight - b->weight;
-  return (cmp == 0
-         ? a->sub_tunnel_index - b->sub_tunnel_index : (cmp > 0 ? -1 : 1));
+  ip_src_dst_fib_del_route (lfe->src_fib_index,
+                           &lfe->key->lcl.ippref,
+                           lfe->eid_fib_index, &lfe->key->rmt.ippref);
 }
 
-/**
- * @brief Computes sub-tunnel load balancing vector.
- *
- * Algorithm is identical to that used for building unequal-cost multipath
- * adjacencies. Saves normalized sub-tunnel weights and builds load-balancing
- * vector consisting of list of sub-tunnel indexes replicated according to
- * weight.
- *
- * @param[in]   t       Tunnel for which load balancing vector is computed.
- */
+/**
+ * @brief Fill a dp_address_t from a gid_address_t.
+ *
+ * GID_ADDR_IP_PREFIX and GID_ADDR_SRC_DST addresses are copied as an IP
+ * prefix (FID_ADDR_IP_PREF). GID_ADDR_MAC, and every other GID type, is
+ * copied as a MAC address (FID_ADDR_MAC).
+ *
+ * @param[in]   g       The GID address to convert.
+ * @param[out]  d       The address to fill in.
+ */
 static void
-compute_sub_tunnels_balancing_vector (lisp_gpe_tunnel_t * t)
+gid_to_dp_address (gid_address_t * g, dp_address_t * d)
 {
-  uword n_sts, i, n_nsts, n_nsts_left;
-  f64 sum_weight, norm, error, tolerance;
-  normalized_sub_tunnel_weights_t *nsts = 0, *stp;
-  lisp_gpe_sub_tunnel_t *sts = t->sub_tunnels;
-  u32 *st_lbv = 0;
-
-  /* Accept 1% error */
-  tolerance = .01;
-
-  n_sts = vec_len (sts);
-  vec_validate (nsts, 2 * n_sts - 1);
-
-  sum_weight = 0;
-  for (i = 0; i < n_sts; i++)
+  switch (gid_address_type (g))
     {
-      /* Find total weight to normalize weights. */
-      sum_weight += sts[i].weight;
-
-      /* build normalized sub tunnels vector */
-      nsts[i].weight = sts[i].weight;
-      nsts[i].sub_tunnel_index = i;
-    }
-
-  n_nsts = n_sts;
-  if (n_sts == 1)
-    {
-      nsts[0].weight = 1;
-      _vec_len (nsts) = 1;
-      goto build_lbv;
+    case GID_ADDR_IP_PREFIX:
+    case GID_ADDR_SRC_DST:
+      ip_prefix_copy (&d->ippref, &gid_address_ippref (g));
+      d->type = FID_ADDR_IP_PREF;
+      break;
+    case GID_ADDR_MAC:
+      /* unrecognised GID types share the MAC handling below */
+    default:
+      mac_copy (&d->mac, &gid_address_mac (g));
+      d->type = FID_ADDR_MAC;
+      break;
     }
+}
 
-  /* Sort sub-tunnels by weight */
-  qsort (nsts, n_nsts, sizeof (u32), (void *) weight_cmp);
+/**
+ * @brief Build the key for a forwarding entry and look the entry up.
+ *
+ * @p key is zeroed and then filled from the remote EID, local EID and vni
+ * in @p a; it is written whether or not an entry is found. When the remote
+ * EID is an IP prefix, the IP version of the local EID in @p a is
+ * overwritten with that of the remote EID.
+ *
+ * @param[in]     lgm   Not used by this function.
+ * @param[in,out] a     The add/del arguments the key is derived from.
+ * @param[out]    key   The key that was looked up.
+ *
+ * @return The matching entry from the pool, or NULL if there is none.
+ */
+static lisp_fwd_entry_t *
+find_fwd_entry (lisp_gpe_main_t * lgm,
+               vnet_lisp_gpe_add_del_fwd_entry_args_t * a,
+               lisp_gpe_fwd_entry_key_t * key)
+{
+  uword *p;
 
-  /* Save copies of all next hop weights to avoid being overwritten in loop below. */
-  for (i = 0; i < n_nsts; i++)
-    nsts[n_nsts + i].weight = nsts[i].weight;
+  memset (key, 0, sizeof (*key));
 
-  /* Try larger and larger power of 2 sized blocks until we
-     find one where traffic flows to within 1% of specified weights. */
-  for (n_nsts = max_pow2 (n_sts);; n_nsts *= 2)
+  if (GID_ADDR_IP_PREFIX == gid_address_type (&a->rmt_eid))
     {
-      error = 0;
-
-      norm = n_nsts / sum_weight;
-      n_nsts_left = n_nsts;
-      for (i = 0; i < n_sts; i++)
-       {
-         f64 nf = nsts[n_sts + i].weight * norm;
-         word n = flt_round_nearest (nf);
-
-         n = n > n_nsts_left ? n_nsts_left : n;
-         n_nsts_left -= n;
-         error += fabs (nf - n);
-         nsts[i].weight = n;
-       }
-
-      nsts[0].weight += n_nsts_left;
-
-      /* Less than 5% average error per adjacency with this size adjacency block? */
-      if (error <= tolerance * n_nsts)
-       {
-         /* Truncate any next hops with zero weight. */
-         _vec_len (nsts) = i;
-         break;
-       }
+      /*
+       * the ip version of the source is not set to ip6 when the
+       * source is all zeros. force it.
+       */
+      ip_prefix_version (&gid_address_ippref (&a->lcl_eid)) =
+       ip_prefix_version (&gid_address_ippref (&a->rmt_eid));
     }
 
-build_lbv:
+  gid_to_dp_address (&a->rmt_eid, &key->rmt);
+  gid_to_dp_address (&a->lcl_eid, &key->lcl);
+  key->vni = a->vni;
 
-  /* build load balancing vector */
-  vec_foreach (stp, nsts)
-  {
-    for (i = 0; i < stp[0].weight; i++)
-      vec_add1 (st_lbv, stp[0].sub_tunnel_index);
-  }
+  p = hash_get_mem (lisp_gpe_fwd_entries, key);
 
-  t->sub_tunnels_lbv = st_lbv;
-  t->sub_tunnels_lbv_count = n_nsts;
-  t->norm_sub_tunnel_weights = nsts;
+  /* the hash stores the entry's index in lisp_fwd_entry_pool */
+  if (NULL != p)
+    {
+      return (pool_elt_at_index (lisp_fwd_entry_pool, p[0]));
+    }
+  return (NULL);
 }
 
-/** Create sub-tunnels and load-balancing vector for all locator pairs
- * associated to a tunnel.*/
-static void
-create_sub_tunnels (lisp_gpe_main_t * lgm, lisp_gpe_tunnel_t * t)
+/**
+ * @brief Comparator ordering forwarding paths by ascending priority value.
+ *
+ * @param[in]   a1      First path (a lisp_fwd_path_t).
+ * @param[in]   a2      Second path (a lisp_fwd_path_t).
+ *
+ * @return Negative, zero or positive as the first path's priority is
+ *         less than, equal to or greater than the second's.
+ */
+static int
+lisp_gpe_fwd_entry_path_sort (void *a1, void *a2)
 {
-  lisp_gpe_sub_tunnel_t st;
-  locator_pair_t *lp = 0;
-  int i;
-
-  /* create sub-tunnels for all locator pairs */
-  for (i = 0; i < vec_len (t->locator_pairs); i++)
-    {
-      lp = &t->locator_pairs[i];
-      st.locator_pair_index = i;
-      st.parent_index = t - lgm->tunnels;
-      st.weight = lp->weight;
-
-      /* compute rewrite for sub-tunnel */
-      lisp_gpe_rewrite (t, &st, lp);
-      vec_add1 (t->sub_tunnels, st);
-    }
+  const lisp_fwd_path_t *lhs = a1;
+  const lisp_fwd_path_t *rhs = a2;
+  int delta = lhs->priority - rhs->priority;
 
-  /* normalize weights and compute sub-tunnel load balancing vector */
-  compute_sub_tunnels_balancing_vector (t);
+  return (delta);
 }
 
-#define foreach_copy_field                      \
-_(encap_fib_index)                              \
-_(decap_fib_index)                              \
-_(decap_next_index)                             \
-_(vni)                                          \
-_(action)
-
 /**
- * @brief Create/delete IP encapsulated tunnel.
+ * @brief Add/Delete LISP IP forwarding entry.
  *
- * Builds GPE tunnel for L2 or L3 packets and populates tunnel pool
- * @ref lisp_gpe_tunnel_by_key in @ref lisp_gpe_main_t.
+ * creation of forwarding entries for IP LISP overlay:
  *
- * @param[in]   a               Tunnel parameters.
- * @param[in]   is_l2           Flag indicating if encapsulated content is l2.
- * @param[out]  tun_index_res   Tunnel index.
+ * @param[in]   lgm     Reference to @ref lisp_gpe_main_t.
+ * @param[in]   a       Parameters for building the forwarding entry.
  *
  * @return 0 on success.
  */
 static int
-add_del_ip_tunnel (vnet_lisp_gpe_add_del_fwd_entry_args_t * a, u8 is_l2,
-                  u32 * tun_index_res)
+add_ip_fwd_entry (lisp_gpe_main_t * lgm,
+                 vnet_lisp_gpe_add_del_fwd_entry_args_t * a)
 {
-  lisp_gpe_main_t *lgm = &lisp_gpe_main;
-  lisp_gpe_tunnel_t *t = 0;
-  lisp_gpe_tunnel_key_t key;
-  lisp_gpe_sub_tunnel_t *stp = 0;
-  uword *p;
-
-  /* prepare tunnel key */
-  memset (&key, 0, sizeof (key));
-
-  /* fill in the key's remote eid */
-  if (!is_l2)
-    ip_prefix_copy (&key.rmt.ippref, &gid_address_ippref (&a->rmt_eid));
-  else
-    mac_copy (&key.rmt.mac, &gid_address_mac (&a->rmt_eid));
-
-  key.vni = clib_host_to_net_u32 (a->vni);
-
-  p = mhash_get (&lgm->lisp_gpe_tunnel_by_key, &key);
-
-  if (a->is_add)
-    {
-      /* adding a tunnel: tunnel must not already exist */
-      if (p)
-       return VNET_API_ERROR_INVALID_VALUE;
-
-      if (a->decap_next_index >= LISP_GPE_INPUT_N_NEXT)
-       return VNET_API_ERROR_INVALID_DECAP_NEXT;
+  lisp_gpe_fwd_entry_key_t key;
+  lisp_fwd_entry_t *lfe;
+  fib_protocol_t fproto;
 
-      pool_get_aligned (lgm->tunnels, t, CLIB_CACHE_LINE_BYTES);
-      memset (t, 0, sizeof (*t));
+  lfe = find_fwd_entry (lgm, a, &key);
 
-      /* copy from arg structure */
-#define _(x) t->x = a->x;
-      foreach_copy_field;
-#undef _
+  if (NULL != lfe)
+    /* don't support updates */
+    return VNET_API_ERROR_INVALID_VALUE;
 
-      t->locator_pairs = vec_dup (a->locator_pairs);
+  pool_get (lisp_fwd_entry_pool, lfe);
+  memset (lfe, 0, sizeof (*lfe));
+  lfe->key = clib_mem_alloc (sizeof (key));
+  memcpy (lfe->key, &key, sizeof (key));
 
-      /* if vni is non-default */
-      if (a->vni)
-       t->flags = LISP_GPE_FLAGS_I;
+  hash_set_mem (lisp_gpe_fwd_entries, lfe->key, lfe - lisp_fwd_entry_pool);
 
-      /* work in lisp-gpe not legacy mode */
-      t->flags |= LISP_GPE_FLAGS_P;
+  fproto = (IP4 == ip_prefix_version (&fid_addr_ippref (&lfe->key->rmt)) ?
+           FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6);
 
-      /* next proto */
-      if (!is_l2)
-       t->next_protocol = ip_prefix_version (&key.rmt.ippref) == IP4 ?
-         LISP_GPE_NEXT_PROTO_IP4 : LISP_GPE_NEXT_PROTO_IP6;
-      else
-       t->next_protocol = LISP_GPE_NEXT_PROTO_ETHERNET;
-
-      /* build sub-tunnels for lowest priority locator-pairs */
-      if (!a->is_negative)
-       create_sub_tunnels (lgm, t);
-
-      mhash_set (&lgm->lisp_gpe_tunnel_by_key, &key, t - lgm->tunnels, 0);
+  lfe->type = (a->is_negative ?
+              LISP_FWD_ENTRY_TYPE_NEGATIVE : LISP_FWD_ENTRY_TYPE_NORMAL);
+  lfe->eid_table_id = a->table_id;
+  lfe->eid_fib_index = fib_table_find_or_create_and_lock (fproto,
+                                                         lfe->eid_table_id);
 
-      /* return tunnel index */
-      if (tun_index_res)
-       tun_index_res[0] = t - lgm->tunnels;
-    }
-  else
+  if (LISP_FWD_ENTRY_TYPE_NEGATIVE != lfe->type)
     {
-      /* deleting a tunnel: tunnel must exist */
-      if (!p)
-       {
-         clib_warning ("Tunnel for eid %U doesn't exist!",
-                       format_gid_address, &a->rmt_eid);
-         return VNET_API_ERROR_NO_SUCH_ENTRY;
-       }
-
-      t = pool_elt_at_index (lgm->tunnels, p[0]);
+      lisp_fwd_path_t *path;
+      u32 index;
 
-      mhash_unset (&lgm->lisp_gpe_tunnel_by_key, &key, 0);
+      vec_validate (lfe->paths, vec_len (a->locator_pairs) - 1);
 
-      vec_foreach (stp, t->sub_tunnels)
+      vec_foreach_index (index, a->locator_pairs)
       {
-       vec_free (stp->rewrite);
+       path = &lfe->paths[index];
+
+       path->priority = a->locator_pairs[index].priority;
+       path->weight = a->locator_pairs[index].weight;
+
+       path->lisp_adj =
+         lisp_gpe_adjacency_find_or_create_and_lock (&a->locator_pairs
+                                                     [index],
+                                                     lfe->eid_table_id,
+                                                     lfe->key->vni);
       }
-      vec_free (t->sub_tunnels);
-      vec_free (t->sub_tunnels_lbv);
-      vec_free (t->locator_pairs);
-      pool_put (lgm->tunnels, t);
+      vec_sort_with_function (lfe->paths, lisp_gpe_fwd_entry_path_sort);
     }
 
-  return 0;
+  create_fib_entries (lfe);
+
+  return (0);
 }
 
-/**
- * @brief Build IP adjacency for LISP Source/Dest FIB.
- *
- * Because LISP forwarding does not follow typical IP forwarding path, the
- * adjacency's fields are overloaded (i.e., hijacked) to carry LISP specific
- * data concerning the lisp-gpe interface the packets hitting the adjacency
- * should be sent to and the tunnel that should be used.
- *
- * @param[in]   lgm             Reference to @ref lisp_gpe_main_t.
- * @param[out]  adj             Adjacency to be populated.
- * @param[in]   table_id        VRF for adjacency.
- * @param[in]   vni             Virtual Network identifier (tenant id).
- * @param[in]   tun_index       Tunnel index.
- * @param[in]   n_sub_tun       Number of sub-tunnels.
- * @param[in]   is_negative     Flag to indicate if the adjacency is for a
- *                              negative mapping.
- * @param[in]   action          Action to be taken for negative mapping.
- * @param[in]   ip_ver          IP version for the adjacency.
- *
- * @return 0 on success.
- */
-static int
-build_ip_adjacency (lisp_gpe_main_t * lgm, ip_adjacency_t * adj, u32 table_id,
-                   u32 vni, u32 tun_index, u32 n_sub_tun, u8 is_negative,
-                   u8 action, u8 ip_ver)
+/**
+ * @brief Tear down a forwarding entry and return it to the pool.
+ *
+ * Unlocks the LISP adjacency of each path and frees the path vector,
+ * removes the entry's FIB routes, unlocks the EID FIB table, removes the
+ * entry from the lookup hash and frees its key.
+ *
+ * @param[in]   lfe     The forwarding entry to delete.
+ */
+static void
+del_ip_fwd_entry_i (lisp_fwd_entry_t * lfe)
 {
-  uword *lookup_next_index, *lgpe_sw_if_index, *lnip;
+  lisp_fwd_path_t *path;
+  fib_protocol_t fproto;
 
-  memset (adj, 0, sizeof (adj[0]));
-  adj->n_adj = 1;
-  /* fill in lookup_next_index with a 'legal' value to avoid problems */
-  adj->lookup_next_index = (ip_ver == IP4) ?
-    lgm->ip4_lookup_next_lgpe_ip4_lookup :
-    lgm->ip6_lookup_next_lgpe_ip6_lookup;
+  /* paths are only populated for non-negative entries */
+  if (LISP_FWD_ENTRY_TYPE_NEGATIVE != lfe->type)
+    {
+      vec_foreach (path, lfe->paths)
+      {
+       lisp_gpe_adjacency_unlock (path->lisp_adj);
+      }
+
+      /* the entry owns the path vector; without this it is leaked when
+       * the pool slot is zeroed on reuse */
+      vec_free (lfe->paths);
+    }
 
-  /* positive mapping */
-  if (!is_negative)
-    {
-      /* send packets that hit this adj to lisp-gpe interface output node in
-       * requested vrf. */
-      lnip = (ip_ver == IP4) ?
-       lgm->lgpe_ip4_lookup_next_index_by_table_id :
-       lgm->lgpe_ip6_lookup_next_index_by_table_id;
-      lookup_next_index = hash_get (lnip, table_id);
-      lgpe_sw_if_index = hash_get (lgm->l3_ifaces.sw_if_index_by_vni, vni);
-
-      /* the assumption is that the interface must've been created before
-       * programming the dp */
-      ASSERT (lookup_next_index != 0 && lgpe_sw_if_index != 0);
-
-      /* hijack explicit fib index to store lisp interface node index,
-       * if_address_index for the tunnel index and saved lookup next index
-       * for the number of sub tunnels */
-      adj->explicit_fib_index = lookup_next_index[0];
-      adj->if_address_index = tun_index;
-      adj->rewrite_header.sw_if_index = lgpe_sw_if_index[0];
-      adj->saved_lookup_next_index = n_sub_tun;
-    }
-  /* negative mapping */
-  else
-    {
-      adj->rewrite_header.sw_if_index = ~0;
-      adj->rewrite_header.next_index = ~0;
-      adj->if_address_index = tun_index;
+  delete_fib_entries (lfe);
 
-      switch (action)
-       {
-       case LISP_NO_ACTION:
-         /* TODO update timers? */
-       case LISP_FORWARD_NATIVE:
-         /* TODO check if route/next-hop for eid exists in fib and add
-          * more specific for the eid with the next-hop found */
-       case LISP_SEND_MAP_REQUEST:
-         /* insert tunnel that always sends map-request */
-         adj->explicit_fib_index = (ip_ver == IP4) ?
-           LGPE_IP4_LOOKUP_NEXT_LISP_CP_LOOKUP :
-           LGPE_IP6_LOOKUP_NEXT_LISP_CP_LOOKUP;
-         break;
-       case LISP_DROP:
-         /* for drop fwd entries, just add route, no need to add encap tunnel */
-         adj->explicit_fib_index = (ip_ver == IP4 ?
-                                    LGPE_IP4_LOOKUP_NEXT_DROP :
-                                    LGPE_IP6_LOOKUP_NEXT_DROP);
-         break;
-       default:
-         return -1;
-       }
-    }
-  return 0;
+  /* release the table lock taken when the entry was added */
+  fproto = (IP4 == ip_prefix_version (&fid_addr_ippref (&lfe->key->rmt)) ?
+           FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6);
+  fib_table_unlock (lfe->eid_fib_index, fproto);
+
+  /* unset the hash before freeing the key memory it points at */
+  hash_unset_mem (lisp_gpe_fwd_entries, lfe->key);
+  clib_mem_free (lfe->key);
+  pool_put (lisp_fwd_entry_pool, lfe);
 }
 
 /**
  * @brief Add/Delete LISP IP forwarding entry.
  *
- * Coordinates the creation/removal of forwarding entries for IP LISP overlay:
- * creates lisp-gpe tunnel, builds tunnel customized forwarding entry and
- * injects new route in Source/Dest FIB.
+ * removal of forwarding entries for IP LISP overlay:
  *
  * @param[in]   lgm     Reference to @ref lisp_gpe_main_t.
  * @param[in]   a       Parameters for building the forwarding entry.
@@ -455,63 +250,21 @@ build_ip_adjacency (lisp_gpe_main_t * lgm, ip_adjacency_t * adj, u32 table_id,
  * @return 0 on success.
  */
 static int
-add_del_ip_fwd_entry (lisp_gpe_main_t * lgm,
-                     vnet_lisp_gpe_add_del_fwd_entry_args_t * a)
+del_ip_fwd_entry (lisp_gpe_main_t * lgm,
+                 vnet_lisp_gpe_add_del_fwd_entry_args_t * a)
 {
-  ip_adjacency_t adj, *adjp;
-  lisp_gpe_tunnel_t *t;
-  u32 rv, tun_index = ~0, n_sub_tuns = 0;
-  ip_prefix_t *rmt_pref, *lcl_pref;
-  u8 ip_ver;
-
-  rmt_pref = &gid_address_ippref (&a->rmt_eid);
-  lcl_pref = &gid_address_ippref (&a->lcl_eid);
-  ip_ver = ip_prefix_version (rmt_pref);
-
-  /* add/del tunnel to tunnels pool and prepares rewrite */
-  if (0 != a->locator_pairs)
-    {
-      rv = add_del_ip_tunnel (a, 0 /* is_l2 */ , &tun_index);
-      if (rv)
-       {
-         clib_warning ("failed to build tunnel!");
-         return rv;
-       }
-      if (a->is_add)
-       {
-         t = pool_elt_at_index (lgm->tunnels, tun_index);
-         n_sub_tuns = t->sub_tunnels_lbv_count;
-       }
-    }
-
-  /* setup adjacency for eid */
-  rv = build_ip_adjacency (lgm, &adj, a->table_id, a->vni, tun_index,
-                          n_sub_tuns, a->is_negative, a->action, ip_ver);
-
-  /* add/delete route for eid */
-  rv |= ip_sd_fib_add_del_route (lgm, rmt_pref, lcl_pref, a->table_id, &adj,
-                                a->is_add);
-
-  if (rv)
-    {
-      clib_warning ("failed to insert route for tunnel!");
-      return rv;
-    }
+  lisp_gpe_fwd_entry_key_t key;
+  lisp_fwd_entry_t *lfe;
 
-  /* check that everything worked */
-  if (CLIB_DEBUG && a->is_add)
-    {
-      u32 adj_index;
-      adj_index = ip_sd_fib_get_route (lgm, rmt_pref, lcl_pref, a->table_id);
-      ASSERT (adj_index != 0);
+  lfe = find_fwd_entry (lgm, a, &key);
 
-      adjp = ip_get_adjacency ((ip_ver == IP4) ? lgm->lm4 : lgm->lm6,
-                              adj_index);
+  if (NULL == lfe)
+    /* no such entry */
+    return VNET_API_ERROR_INVALID_VALUE;
 
-      ASSERT (adjp != 0 && adjp->if_address_index == tun_index);
-    }
+  del_ip_fwd_entry_i (lfe);
 
-  return rv;
+  return (0);
 }
 
 static void
@@ -536,7 +289,7 @@ make_mac_fib_key (BVT (clib_bihash_kv) * kv, u16 bd_index, u8 src_mac[6],
  *
  * @return index of mapping matching the lookup key.
  */
-u32
+index_t
 lisp_l2_fib_lookup (lisp_gpe_main_t * lgm, u16 bd_index, u8 src_mac[6],
                    u8 dst_mac[6])
 {
@@ -555,7 +308,7 @@ lisp_l2_fib_lookup (lisp_gpe_main_t * lgm, u16 bd_index, u8 src_mac[6],
        return value.value;
     }
 
-  return ~0;
+  return lisp_gpe_main.l2_lb_miss;
 }
 
 /**
@@ -601,6 +354,12 @@ l2_fib_init (lisp_gpe_main_t * lgm)
   BV (clib_bihash_init) (&lgm->l2_fib, "l2 fib",
                         1 << max_log2 (L2_FIB_DEFAULT_HASH_NUM_BUCKETS),
                         L2_FIB_DEFAULT_HASH_MEMORY_SIZE);
+
+  /*
+   * the result from a 'miss' in a L2 Table
+   */
+  lgm->l2_lb_miss = load_balance_create (1, DPO_PROTO_IP4, 0);
+  load_balance_set_bucket (lgm->l2_lb_miss, 0, drop_dpo_get (DPO_PROTO_IP4));
 }
 
 /**
@@ -618,27 +377,75 @@ static int
 add_del_l2_fwd_entry (lisp_gpe_main_t * lgm,
                      vnet_lisp_gpe_add_del_fwd_entry_args_t * a)
 {
-  int rv;
-  u32 tun_index;
-  bd_main_t *bdm = &bd_main;
-  uword *bd_indexp;
-
-  /* create tunnel */
-  rv = add_del_ip_tunnel (a, 1 /* is_l2 */ , &tun_index);
-  if (rv)
-    return rv;
-
-  bd_indexp = hash_get (bdm->bd_index_by_bd_id, a->bd_id);
-  if (!bd_indexp)
-    {
-      clib_warning ("bridge domain %d doesn't exist", a->bd_id);
-      return -1;
-    }
-
-  /* add entry to l2 lisp fib */
-  lisp_l2_fib_add_del_entry (lgm, bd_indexp[0], gid_address_mac (&a->lcl_eid),
-                            gid_address_mac (&a->rmt_eid), tun_index,
-                            a->is_add);
+  /* lisp_gpe_fwd_entry_key_t key; */
+  /* lisp_fwd_entry_t *lfe; */
+  /* fib_protocol_t fproto; */
+  /* uword *bd_indexp; */
+
+  /* bd_indexp = hash_get (bdm->bd_index_by_bd_id, a->bd_id); */
+  /* if (!bd_indexp) */
+  /*   { */
+  /*     clib_warning ("bridge domain %d doesn't exist", a->bd_id); */
+  /*     return -1; */
+  /*   } */
+
+  /* lfe = find_fwd_entry(lgm, a, &key); */
+
+  /* if (NULL != lfe) */
+  /*   /\* don't support updates *\/ */
+  /*   return VNET_API_ERROR_INVALID_VALUE; */
+
+  /* int rv; */
+  /* u32 tun_index; */
+  /* fib_node_index_t old_path_list; */
+  /* bd_main_t *bdm = &bd_main; */
+  /* fib_route_path_t *rpaths; */
+  /* lisp_gpe_tunnel_t *t; */
+  /* const dpo_id_t *dpo; */
+  /* index_t lbi; */
+
+  /* /\* create tunnel *\/ */
+  /* rv = add_del_ip_tunnel (a, 1 /\* is_l2 *\/ , &tun_index, NULL); */
+  /* if (rv) */
+  /*   return rv; */
+
+  /* bd_indexp = hash_get (bdm->bd_index_by_bd_id, a->bd_id); */
+  /* if (!bd_indexp) */
+  /*   { */
+  /*     clib_warning ("bridge domain %d doesn't exist", a->bd_id); */
+  /*     return -1; */
+  /*   } */
+
+  /* t = pool_elt_at_index (lgm->tunnels, tun_index); */
+  /* old_path_list = t->l2_path_list; */
+
+  /* if (LISP_NO_ACTION == t->action) */
+  /*   { */
+  /*     rpaths = lisp_gpe_mk_paths_for_sub_tunnels (t); */
+
+  /*     t->l2_path_list = fib_path_list_create (FIB_PATH_LIST_FLAG_NONE, */
+  /*                                          rpaths); */
+
+  /*     vec_free (rpaths); */
+  /*     fib_path_list_lock (t->l2_path_list); */
+
+  /*     dpo = fib_path_list_contribute_forwarding (t->l2_path_list, */
+  /*                                             FIB_FORW_CHAIN_TYPE_UNICAST_IP); */
+  /*     lbi = dpo->dpoi_index; */
+  /*   } */
+  /* else if (LISP_SEND_MAP_REQUEST == t->action) */
+  /*   { */
+  /*     lbi = lgm->l2_lb_cp_lkup; */
+  /*   } */
+  /* else */
+  /*   { */
+  /*     lbi = lgm->l2_lb_miss; */
+  /*   } */
+  /* fib_path_list_unlock (old_path_list); */
+
+  /* /\* add entry to l2 lisp fib *\/ */
+  /* lisp_l2_fib_add_del_entry (lgm, bd_indexp[0], gid_address_mac (&a->lcl_eid), */
+  /*                         gid_address_mac (&a->rmt_eid), lbi, a->is_add); */
   return 0;
 }
 
@@ -669,7 +476,11 @@ vnet_lisp_gpe_add_del_fwd_entry (vnet_lisp_gpe_add_del_fwd_entry_args_t * a,
   switch (type)
     {
     case GID_ADDR_IP_PREFIX:
-      return add_del_ip_fwd_entry (lgm, a);
+      if (a->is_add)
+       return add_ip_fwd_entry (lgm, a);
+      else
+       return del_ip_fwd_entry (lgm, a);
+      break;
     case GID_ADDR_MAC:
       return add_del_l2_fwd_entry (lgm, a);
     default:
@@ -807,103 +618,77 @@ done:
 
 /* *INDENT-OFF* */
 VLIB_CLI_COMMAND (lisp_gpe_add_del_fwd_entry_command, static) = {
-  .path = "lisp gpe tunnel",
-  .short_help = "lisp gpe tunnel add/del vni <vni> vrf <vrf> [leid <leid>]"
+  .path = "lisp gpe entry",
+  .short_help = "lisp gpe entry add/del vni <vni> vrf <vrf> [leid <leid>]"
       "reid <reid> [loc-pair <lloc> <rloc> p <priority> w <weight>] "
       "[negative action <action>]",
   .function = lisp_gpe_add_del_fwd_entry_command_fn,
 };
 /* *INDENT-ON* */
 
-/** Format LISP-GPE next indexes. */
+/**
+ * @brief Format one forwarding path of a LISP forwarding entry.
+ *
+ * This is a '%U' callback: the formatter hands it a pointer to the caller's
+ * va_list, from which it consumes a single lisp_fwd_path_t pointer.
+ *
+ * @param[in]   s       Vector to append the text to.
+ * @param[in]   ap      Pointer to the argument list holding the path.
+ *
+ * @return the (possibly reallocated) vector.
+ */
 static u8 *
-format_decap_next (u8 * s, va_list * args)
+format_lisp_fwd_path (u8 * s, va_list * ap)
 {
-  u32 next_index = va_arg (*args, u32);
+  lisp_fwd_path_t *lfp = va_arg (*ap, lisp_fwd_path_t *);
 
-  switch (next_index)
-    {
-    case LISP_GPE_INPUT_NEXT_DROP:
-      return format (s, "drop");
-    case LISP_GPE_INPUT_NEXT_IP4_INPUT:
-      return format (s, "ip4");
-    case LISP_GPE_INPUT_NEXT_IP6_INPUT:
-      return format (s, "ip6");
-    default:
-      return format (s, "unknown %d", next_index);
-    }
-  return s;
+  s = format (s, "priority:%d weight:%d ", lfp->priority, lfp->weight);
+  s = format (s, "adj:[%U]\n",
+             format_lisp_gpe_adjacency,
+             lisp_gpe_adjacency_get (lfp->lisp_adj),
+             LISP_GPE_ADJ_FORMAT_FLAG_NONE);
+
+  return (s);
 }
 
-/** Format LISP-GPE tunnel. */
-u8 *
-format_lisp_gpe_tunnel (u8 * s, va_list * args)
+/**
+ * @brief Format a LISP forwarding entry.
+ *
+ * Prints the entry's VNI, table id and local/remote EIDs, followed either by
+ * the negative-mapping action (negative entries) or by each of the entry's
+ * paths (all other entries).
+ *
+ * This is a '%U' callback: the formatter hands it a pointer to the caller's
+ * va_list, from which it consumes a single lisp_fwd_entry_t pointer.
+ *
+ * @param[in]   s       Vector to append the text to.
+ * @param[in]   ap      Pointer to the argument list holding the entry.
+ *
+ * @return the (possibly reallocated) vector.
+ */
+static u8 *
+format_lisp_gpe_fwd_entry (u8 * s, va_list * ap)
 {
-  lisp_gpe_tunnel_t *t = va_arg (*args, lisp_gpe_tunnel_t *);
-  lisp_gpe_main_t *lgm = vnet_lisp_gpe_get_main ();
-  locator_pair_t *lp = 0;
-  normalized_sub_tunnel_weights_t *nstw;
-
-  s =
-    format (s, "tunnel %d vni %d (0x%x)\n", t - lgm->tunnels, t->vni, t->vni);
-  s =
-    format (s, " fibs: encap %d, decap %d decap next %U\n",
-           t->encap_fib_index, t->decap_fib_index, format_decap_next,
-           t->decap_next_index);
-  s = format (s, " lisp ver %d ", (t->ver_res >> 6));
-
-#define _(n,v) if (t->flags & v) s = format (s, "%s-bit ", #n);
-  foreach_lisp_gpe_flag_bit;
-#undef _
-
-  s = format (s, "next_protocol %d ver_res %x res %x\n",
-             t->next_protocol, t->ver_res, t->res);
-
-  s = format (s, " locator-pairs:\n");
-  vec_foreach (lp, t->locator_pairs)
-  {
-    s = format (s, "  local: %U remote: %U weight %d\n",
-               format_ip_address, &lp->lcl_loc, format_ip_address,
-               &lp->rmt_loc, lp->weight);
-  }
+  lisp_fwd_entry_t *lfe = va_arg (*ap, lisp_fwd_entry_t *);
 
-  s = format (s, " active sub-tunnels:\n");
-  vec_foreach (nstw, t->norm_sub_tunnel_weights)
-  {
-    lp = vec_elt_at_index (t->locator_pairs, nstw->sub_tunnel_index);
-    s = format (s, "  local: %U remote: %U weight %d\n", format_ip_address,
-               &lp->lcl_loc, format_ip_address, &lp->rmt_loc, nstw->weight);
-  }
-  return s;
+  s = format (s, "VNI:%d VRF:%d EID: %U -> %U",
+             lfe->key->vni, lfe->eid_table_id,
+             format_fid_address, &lfe->key->lcl,
+             format_fid_address, &lfe->key->rmt);
+  if (LISP_FWD_ENTRY_TYPE_NEGATIVE == lfe->type)
+    {
+      s = format (s, "\n Negative - action:%U",
+                 format_negative_mapping_action, lfe->action);
+    }
+  else
+    {
+      lisp_fwd_path_t *path;
+
+      s = format (s, "\n via:");
+      vec_foreach (path, lfe->paths)
+      {
+       s = format (s, "\n  %U", format_lisp_fwd_path, path);
+      }
+    }
+
+  return (s);
 }
 
-/** CLI command to show LISP-GPE tunnels. */
+/**
+ * @brief CLI handler that prints every LISP forwarding entry in the pool.
+ *
+ * NOTE(review): 'input' is never parsed here, although the short_help
+ * registered for "show lisp gpe entry" advertises vni/vrf/leid/reid
+ * arguments - confirm whether filtering is still to be implemented.
+ *
+ * @return always NULL (no error).
+ */
 static clib_error_t *
-show_lisp_gpe_tunnel_command_fn (vlib_main_t * vm,
-                                unformat_input_t * input,
-                                vlib_cli_command_t * cmd)
+lisp_gpe_fwd_entry_show (vlib_main_t * vm,
+                        unformat_input_t * input, vlib_cli_command_t * cmd)
 {
-  lisp_gpe_main_t *lgm = &lisp_gpe_main;
-  lisp_gpe_tunnel_t *t;
-
-  if (pool_elts (lgm->tunnels) == 0)
-    vlib_cli_output (vm, "No lisp-gpe tunnels configured...");
+  lisp_fwd_entry_t *lfe;
 
-  /* *INDENT-OFF* */
-  pool_foreach (t, lgm->tunnels,
+/* *INDENT-OFF* */
+  pool_foreach (lfe, lisp_fwd_entry_pool,
   ({
-    vlib_cli_output (vm, "%U", format_lisp_gpe_tunnel, t);
+    vlib_cli_output (vm, "%U", format_lisp_gpe_fwd_entry, lfe);
   }));
-  /* *INDENT-ON* */
+/* *INDENT-ON* */
 
-  return 0;
+  return (NULL);
 }
 
 /* *INDENT-OFF* */
-VLIB_CLI_COMMAND (show_lisp_gpe_tunnel_command, static) =
-{
-  .path = "show lisp gpe tunnel",
-  .function = show_lisp_gpe_tunnel_command_fn,
+VLIB_CLI_COMMAND (lisp_gpe_fwd_entry_show_command, static) = {
+  .path = "show lisp gpe entry",
+  .short_help = "show lisp gpe entry vni <vni> vrf <vrf> [leid <leid>] reid <reid>",
+  .function = lisp_gpe_fwd_entry_show,
 };
 /* *INDENT-ON* */
 
@@ -921,29 +706,9 @@ clib_error_t *
 vnet_lisp_gpe_enable_disable (vnet_lisp_gpe_enable_disable_args_t * a)
 {
   lisp_gpe_main_t *lgm = &lisp_gpe_main;
-  vnet_main_t *vnm = lgm->vnet_main;
 
   if (a->is_en)
     {
-      /* add lgpe_ip4_lookup as possible next_node for ip4 lookup */
-      if (lgm->ip4_lookup_next_lgpe_ip4_lookup == ~0)
-       {
-         lgm->ip4_lookup_next_lgpe_ip4_lookup =
-           vlib_node_add_next (vnm->vlib_main, ip4_lookup_node.index,
-                               lgpe_ip4_lookup_node.index);
-       }
-      /* add lgpe_ip6_lookup as possible next_node for ip6 lookup */
-      if (lgm->ip6_lookup_next_lgpe_ip6_lookup == ~0)
-       {
-         lgm->ip6_lookup_next_lgpe_ip6_lookup =
-           vlib_node_add_next (vnm->vlib_main, ip6_lookup_node.index,
-                               lgpe_ip6_lookup_node.index);
-       }
-      else
-       {
-         /* ask cp to re-add ifaces and defaults */
-       }
-
       lgm->is_en = 1;
     }
   else
@@ -951,37 +716,17 @@ vnet_lisp_gpe_enable_disable (vnet_lisp_gpe_enable_disable_args_t * a)
       CLIB_UNUSED (uword * val);
       hash_pair_t *p;
       u32 *dp_tables = 0, *dp_table;
-      lisp_gpe_tunnel_key_t *tunnels = 0, *tunnel;
-      vnet_lisp_gpe_add_del_fwd_entry_args_t _at, *at = &_at;
       vnet_lisp_gpe_add_del_iface_args_t _ai, *ai = &_ai;
+      lisp_fwd_entry_t *lfe;
 
-      /* remove all tunnels */
-
+      /* remove all entries */
       /* *INDENT-OFF* */
-      mhash_foreach(tunnel, val, &lgm->lisp_gpe_tunnel_by_key, ({
-        vec_add1(tunnels, tunnel[0]);
+      pool_foreach (lfe, lisp_fwd_entry_pool,
+      ({
+       del_ip_fwd_entry_i (lfe);
       }));
       /* *INDENT-ON* */
 
-      vec_foreach (tunnel, tunnels)
-      {
-       memset (at, 0, sizeof (at[0]));
-       at->is_add = 0;
-       if (tunnel->rmt.type == GID_ADDR_IP_PREFIX)
-         {
-           gid_address_type (&at->rmt_eid) = GID_ADDR_IP_PREFIX;
-           ip_prefix_copy (&gid_address_ippref (&at->rmt_eid),
-                           &tunnel->rmt.ippref);
-         }
-       else
-         {
-           gid_address_type (&at->rmt_eid) = GID_ADDR_MAC;
-           mac_copy (&gid_address_mac (&at->rmt_eid), &tunnel->rmt.mac);
-         }
-       vnet_lisp_gpe_add_del_fwd_entry (at, 0);
-      }
-      vec_free (tunnels);
-
       /* disable all l3 ifaces */
 
       /* *INDENT-OFF* */
@@ -1109,6 +854,7 @@ format_vnet_lisp_gpe_status (u8 * s, va_list * args)
   return format (s, "%s", lgm->is_en ? "enabled" : "disabled");
 }
 
+
 /** LISP-GPE init function. */
 clib_error_t *
 lisp_gpe_init (vlib_main_t * vm)
@@ -1128,11 +874,10 @@ lisp_gpe_init (vlib_main_t * vm)
   lgm->im6 = &ip6_main;
   lgm->lm4 = &ip4_main.lookup_main;
   lgm->lm6 = &ip6_main.lookup_main;
-  lgm->ip4_lookup_next_lgpe_ip4_lookup = ~0;
-  lgm->ip6_lookup_next_lgpe_ip6_lookup = ~0;
 
-  mhash_init (&lgm->lisp_gpe_tunnel_by_key, sizeof (uword),
-             sizeof (lisp_gpe_tunnel_key_t));
+  lisp_gpe_fwd_entries = hash_create_mem (0,
+                                         sizeof (lisp_gpe_fwd_entry_key_t),
+                                         sizeof (uword));
 
   l2_fib_init (lgm);
 
index 4a8bdfe..66009cc 100644 (file)
@@ -30,6 +30,7 @@
 #include <vnet/ip/udp.h>
 #include <vnet/lisp-cp/lisp_types.h>
 #include <vnet/lisp-gpe/lisp_gpe_packet.h>
+#include <vnet/adj/adj_types.h>
 
 /** IP4-UDP-LISP encap header */
 /* *INDENT-OFF* */
@@ -49,37 +50,6 @@ typedef CLIB_PACKED (struct {
 }) ip6_udp_lisp_gpe_header_t;
 /* *INDENT-ON* */
 
-/** LISP-GPE tunnel key */
-typedef struct
-{
-  union
-  {
-    struct
-    {
-      dp_address_t rmt;
-      dp_address_t lcl;
-      u32 vni;
-    };
-    u8 as_u8[40];
-  };
-} lisp_gpe_tunnel_key_t;
-
-typedef struct lisp_gpe_sub_tunnel
-{
-  /** Rewrite string. $$$$ embed vnet_rewrite header */
-  u8 *rewrite;
-  u32 parent_index;
-  u32 locator_pair_index;
-  u8 weight;
-  u8 is_ip4;
-} lisp_gpe_sub_tunnel_t;
-
-typedef struct nomalized_sub_tunnel
-{
-  u32 sub_tunnel_index;
-  u8 weight;
-} normalized_sub_tunnel_weights_t;
-
 /** LISP-GPE tunnel structure */
 typedef struct
 {
@@ -87,17 +57,7 @@ typedef struct
   locator_pair_t *locator_pairs;
 
   /** locator-pairs with best priority become sub-tunnels */
-  lisp_gpe_sub_tunnel_t *sub_tunnels;
-
-  /** sub-tunnels load balancing vector: contains list of sub-tunnel
-   * indexes replicated according to weight */
-  u32 *sub_tunnels_lbv;
-
-  /** number of entries in load balancing vector */
-  u32 sub_tunnels_lbv_count;
-
-  /** normalized sub tunnel weights */
-  normalized_sub_tunnel_weights_t *norm_sub_tunnel_weights;
+  u32 *sub_tunnels;
 
   /** decap next index */
   u32 decap_next_index;
@@ -109,10 +69,16 @@ typedef struct
   u32 encap_fib_index;         /* tunnel partner lookup here */
   u32 decap_fib_index;         /* inner IP lookup here */
 
+  /** index of the source address lookup FIB */
+  u32 src_fib_index;
+
   /** vnet intfc hw/sw_if_index */
   u32 hw_if_index;
   u32 sw_if_index;
 
+  /** L2 path-list */
+  fib_node_index_t l2_path_list;
+
   /** action for 'negative' tunnels */
   u8 action;
 
@@ -124,6 +90,112 @@ typedef struct
   u32 vni;
 } lisp_gpe_tunnel_t;
 
+/**
+ * @brief A path on which to forward LISP traffic
+ */
+typedef struct lisp_fwd_path_t_
+{
+  /**
+   * The adjacency constructed for the locator pair
+   */
+  index_t lisp_adj;
+
+  /**
+   * Priority. Only the paths with the best priority will be installed in FIB
+   */
+  u8 priority;
+
+  /**
+   * [UE]CMP weight for the path
+   */
+  u8 weight;
+
+} lisp_fwd_path_t;
+
+/**
+ * @brief A Forwarding entry can be 'normal' or 'negative'
+ * Negative implies we deliberately want to add a FIB entry for an EID
+ * that results in 'special' behaviour determined by an 'action'.
+ * 'normal' means send it down some tunnels.
+ */
+typedef enum lisp_fwd_entry_type_t_
+{
+  LISP_FWD_ENTRY_TYPE_NORMAL,
+  LISP_FWD_ENTRY_TYPE_NEGATIVE,
+} lisp_fwd_entry_type_t;
+
+/**
+ * @brief The action carried by a 'negative' forwarding entry.
+ *
+ * NOTE(review): presumably these mirror, value for value, the control
+ * plane's LISP_NO_ACTION / LISP_FORWARD_NATIVE / LISP_SEND_MAP_REQUEST /
+ * LISP_DROP - confirm against lisp-cp. The enumerators are unprefixed and
+ * live in a public header, so beware of name clashes.
+ */
+typedef enum
+{
+  NO_ACTION,
+  FORWARD_NATIVE,
+  SEND_MAP_REQUEST,
+  DROP
+} negative_fwd_actions_e;
+
+/**
+ * LISP-GPE fwd entry key: {remote EID, local EID, VNI}
+ *
+ * The forwarding-entry hash is created with sizeof (this struct) as its
+ * key size.
+ * NOTE(review): presumably keys are therefore hashed and compared as raw
+ * bytes, in which case a key must be fully zeroed (padding included) before
+ * it is filled in - confirm.
+ */
+typedef struct lisp_gpe_fwd_entry_key_t_
+{
+  dp_address_t rmt;
+  dp_address_t lcl;
+  u32 vni;
+} lisp_gpe_fwd_entry_key_t;
+
+/**
+ * @brief A LISP Forwarding Entry
+ *
+ * A forwarding entry is from a local EID to a remote EID over a set of rloc
+ * pairs
+ */
+typedef struct lisp_fwd_entry_t_
+{
+  /**
+   * The Entry's key: {l-EID, r-EID, vni}
+   */
+  lisp_gpe_fwd_entry_key_t *key;
+
+  /**
+   * The VRF (in the case of L3) or Bridge-Domain (for L2) index
+   */
+  union
+  {
+    u32 eid_table_id;
+    u32 eid_bd_index;
+  };
+
+  /**
+   * The forwarding entry type
+   */
+  lisp_fwd_entry_type_t type;
+
+  union
+  {
+    /**
+     * @brief When the type is 'normal'
+     *        The RLOC pairs that form the route's paths. i.e. where to send
+     *        packets for this route.
+     */
+    lisp_fwd_path_t *paths;
+
+    /**
+     * @brief When the type is negative. The action to take.
+     */
+    negative_fwd_actions_e action;
+  };
+
+  /**
+   * The FIB index for the overlay, i.e. the FIB in which the EIDs
+   * are present
+   */
+  u32 eid_fib_index;
+
+  /**
+   * The index of the SRC FIB created for adding source-route entries
+   */
+  u32 src_fib_index;
+} lisp_fwd_entry_t;
+
+
 #define foreach_lisp_gpe_ip_input_next          \
 _(DROP, "error-drop")                           \
 _(IP4_INPUT, "ip4-input")                       \
@@ -147,30 +219,6 @@ typedef enum
   LISP_GPE_N_ERROR,
 } lisp_gpe_error_t;
 
-/** IP4 source FIB.
- * As a first step, reuse v4 fib. The goal of the typedef is
- * to shield consumers from future updates that may result in the lisp ip4 fib
- * diverging from ip4 fib
- */
-typedef ip4_fib_t ip4_src_fib_t;
-
-/** IP6 source FIB */
-typedef struct ip6_src_fib
-{
-  BVT (clib_bihash) ip6_lookup_table;
-
-  /** bitmap/vector of mask widths to search */
-  uword *non_empty_dst_address_length_bitmap;
-  u8 *prefix_lengths_in_search_order;
-  ip6_address_t fib_masks[129];
-  i32 dst_address_length_refcounts[129];
-
-  /** ip6 lookup table config parameters */
-  u32 lookup_table_nbuckets;
-  uword lookup_table_size;
-} ip6_src_fib_t;
-
-/** Tunnel lookup structure for L2 and L3 tunnels */
 typedef struct tunnel_lookup
 {
   /** Lookup lisp-gpe interfaces by dp table (eg. vrf/bridge index) */
@@ -178,6 +226,8 @@ typedef struct tunnel_lookup
 
   /** lookup decap tunnel termination sw_if_index by vni and vice versa */
   uword *sw_if_index_by_vni;
+
+  // FIXME - Need this?
   uword *vni_by_sw_if_index;
 } tunnel_lookup_t;
 
@@ -187,9 +237,6 @@ typedef struct lisp_gpe_main
   /** pool of encap tunnel instances */
   lisp_gpe_tunnel_t *tunnels;
 
-  /** lookup tunnel by key */
-  mhash_t lisp_gpe_tunnel_by_key;
-
   /** Free vlib hw_if_indices */
   u32 *free_tunnel_hw_if_indices;
 
@@ -197,21 +244,8 @@ typedef struct lisp_gpe_main
 
   /* L3 data structures
    * ================== */
-
-  /** Pool of src fibs that are paired with dst fibs */
-  ip4_src_fib_t *ip4_src_fibs;
-  ip6_src_fib_t *ip6_src_fibs;
-
   tunnel_lookup_t l3_ifaces;
 
-  /** Lookup lgpe_ipX_lookup_next by vrf */
-  uword *lgpe_ip4_lookup_next_index_by_table_id;
-  uword *lgpe_ip6_lookup_next_index_by_table_id;
-
-  /** next node indexes that point ip4/6 lookup to lisp gpe ip lookup */
-  u32 ip4_lookup_next_lgpe_ip4_lookup;
-  u32 ip6_lookup_next_lgpe_ip6_lookup;
-
   /* L2 data structures
    * ================== */
 
@@ -220,6 +254,10 @@ typedef struct lisp_gpe_main
 
   tunnel_lookup_t l2_ifaces;
 
+  /** Load-balance for a miss in the table */
+  index_t l2_lb_miss;
+  index_t l2_lb_cp_lkup;
+
   /** convenience */
   vlib_main_t *vlib_main;
   vnet_main_t *vnet_main;
@@ -238,10 +276,10 @@ vnet_lisp_gpe_get_main ()
   return &lisp_gpe_main;
 }
 
-extern vlib_node_registration_t lgpe_ip4_lookup_node;
-extern vlib_node_registration_t lgpe_ip6_lookup_node;
+
 extern vlib_node_registration_t lisp_gpe_ip4_input_node;
 extern vlib_node_registration_t lisp_gpe_ip6_input_node;
+extern vnet_hw_interface_class_t lisp_gpe_hw_class;
 
 u8 *format_lisp_gpe_header_with_length (u8 * s, va_list * args);
 
@@ -291,7 +329,7 @@ typedef struct
   u8 is_negative;
 
   /** action for negative mappings */
-  u8 action;
+  negative_fwd_actions_e action;
 
   /** local eid */
   gid_address_t lcl_eid;
@@ -332,13 +370,23 @@ int
 vnet_lisp_gpe_add_del_fwd_entry (vnet_lisp_gpe_add_del_fwd_entry_args_t * a,
                                 u32 * hw_if_indexp);
 
-int
-ip_sd_fib_add_del_route (lisp_gpe_main_t * lgm, ip_prefix_t * dst_prefix,
-                        ip_prefix_t * src_prefix, u32 table_id,
-                        ip_adjacency_t * add_adj, u8 is_add);
-u32
-ip_sd_fib_get_route (lisp_gpe_main_t * lgm, ip_prefix_t * dst_prefix,
-                    ip_prefix_t * src_prefix, u32 table_id);
+extern void
+ip_src_fib_add_route (u32 src_fib_index,
+                     const ip_prefix_t * src_prefix,
+                     const lisp_fwd_path_t * paths);
+extern void
+ip_src_dst_fib_del_route (u32 src_fib_index,
+                         const ip_prefix_t * src_prefix,
+                         u32 dst_table_id, const ip_prefix_t * dst_prefix);
+extern void
+ip_src_fib_add_route_w_dpo (u32 src_fib_index,
+                           const ip_prefix_t * src_prefix,
+                           const dpo_id_t * src_dpo);
+extern u32
+ip_dst_fib_add_route (u32 dst_table_id, const ip_prefix_t * dst_prefix);
+
+extern fib_route_path_t *lisp_gpe_mk_paths_for_sub_tunnels (lisp_gpe_tunnel_t
+                                                           * t);
 
 #define foreach_lgpe_ip4_lookup_next    \
   _(DROP, "error-drop")                 \
diff --git a/vnet/vnet/lisp-gpe/lisp_gpe_adjacency.c b/vnet/vnet/lisp-gpe/lisp_gpe_adjacency.c
new file mode 100644 (file)
index 0000000..861f0dd
--- /dev/null
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @file
+ * @brief Common utility functions for IPv4, IPv6 and L2 LISP-GPE adjacencies.
+ *
+ */
+
+#include <vnet/dpo/dpo.h>
+#include <vnet/lisp-gpe/lisp_gpe_sub_interface.h>
+#include <vnet/lisp-gpe/lisp_gpe_adjacency.h>
+#include <vnet/lisp-gpe/lisp_gpe_tunnel.h>
+#include <vnet/fib/fib_entry.h>
+#include <vnet/adj/adj_midchain.h>
+
+/**
+ * Memory pool of all adjacencies
+ */
+static lisp_gpe_adjacency_t *lisp_adj_pool;
+
+/**
+ * Hash table of all adjacencies. key:{nh, itf}
+ * We never have an all zeros address since the interfaces are multi-access,
+ * therefore there is no ambiguity between a v4 and v6 next-hop, so we don't
+ * need to add the protocol to the key.
+ */
+static
+BVT (clib_bihash)
+  lisp_adj_db;
+
+/**
+ * Fill in the key of a bihash key/value pair from a next-hop and an
+ * interface.
+ *
+ * The full 16 bytes of the next-hop's address union are copied, whatever
+ * the address family.
+ * NOTE(review): presumably callers zero the unused bytes of an IPv4
+ * next-hop, otherwise equal addresses could yield different keys - confirm.
+ *
+ * Wrapped in do/while(0) so the macro behaves as a single statement (e.g.
+ * as the unbraced body of an if/else).
+ */
+#define LISP_ADJ_SET_KEY(_key, _itf, _nh)       \
+do {                                            \
+  (_key).key[0] = (_nh)->ip.v6.as_u64[0];       \
+  (_key).key[1] = (_nh)->ip.v6.as_u64[1];       \
+  (_key).key[2] = (_itf);                       \
+} while (0)
+
+/**
+ * @brief Look up an adjacency in the DB by {next-hop, interface}.
+ *
+ * @param[in]   addr            The next-hop address.
+ * @param[in]   sw_if_index     The interface part of the key.
+ *
+ * @return the value stored for the key, or INDEX_INVALID when the search
+ *         fails.
+ */
+static index_t
+lisp_adj_find (const ip_address_t * addr, u32 sw_if_index)
+{
+  BVT (clib_bihash_kv) kv;
+
+  LISP_ADJ_SET_KEY (kv, sw_if_index, addr);
+
+  /* kv is both the search key and the buffer the result is read from */
+  if (BV (clib_bihash_search) (&lisp_adj_db, &kv, &kv) >= 0)
+    return (kv.value);
+
+  return (INDEX_INVALID);
+}
+
+/**
+ * @brief Record the mapping {next-hop, interface} -> adjacency index in
+ * the DB.
+ */
+static void
+lisp_adj_insert (const ip_address_t * addr, u32 sw_if_index, index_t ai)
+{
+  BVT (clib_bihash_kv) entry;
+
+  entry.value = ai;
+  LISP_ADJ_SET_KEY (entry, sw_if_index, addr);
+
+  BV (clib_bihash_add_del) (&lisp_adj_db, &entry, 1 /* is_add */ );
+}
+
+/**
+ * @brief Delete the DB entry keyed by {next-hop, interface}.
+ */
+static void
+lisp_adj_remove (const ip_address_t * addr, u32 sw_if_index)
+{
+  BVT (clib_bihash_kv) entry;
+
+  LISP_ADJ_SET_KEY (entry, sw_if_index, addr);
+
+  BV (clib_bihash_add_del) (&lisp_adj_db, &entry, 0 /* is_add */ );
+}
+
+/**
+ * @brief Internal (non-const) accessor: pool index -> adjacency.
+ */
+static lisp_gpe_adjacency_t *
+lisp_gpe_adjacency_get_i (index_t lai)
+{
+  lisp_gpe_adjacency_t *ladj;
+
+  ladj = pool_elt_at_index (lisp_adj_pool, lai);
+
+  return (ladj);
+}
+
+/**
+ * @brief Select the FIB forwarding chain type matching the IP version of
+ * the adjacency's remote RLOC.
+ *
+ * An RLOC that is neither IP4 nor IP6 trips an ASSERT and falls back to the
+ * IP4 unicast chain.
+ */
+fib_forward_chain_type_t
+lisp_gpe_adj_get_fib_chain_type (const lisp_gpe_adjacency_t * ladj)
+{
+  const int version = ip_addr_version (&ladj->remote_rloc);
+
+  if (IP6 == version)
+    return (FIB_FORW_CHAIN_TYPE_UNICAST_IP6);
+
+  if (IP4 != version)
+    {
+      /* unknown address family */
+      ASSERT (0);
+    }
+
+  return (FIB_FORW_CHAIN_TYPE_UNICAST_IP4);
+}
+
+/**
+ * @brief (Re-)stack each of the adjacency's midchains on the forwarding
+ * contributed by the FIB entry of its tunnel.
+ */
+static void
+lisp_gpe_adj_stack (lisp_gpe_adjacency_t * ladj)
+{
+  const lisp_gpe_tunnel_2_t *tunnel;
+  dpo_id_t via_dpo = DPO_NULL;
+  fib_link_t link;
+
+  tunnel = lisp_gpe_tunnel_get (ladj->tunnel_index);
+
+  /* fetch the forwarding for the chain type matching the remote RLOC */
+  fib_entry_contribute_forwarding (tunnel->fib_entry_index,
+                                   lisp_gpe_adj_get_fib_chain_type (ladj),
+                                   &via_dpo);
+
+  FOR_EACH_FIB_IP_LINK (link)
+  {
+    adj_nbr_midchain_stack (ladj->adjs[link], &via_dpo);
+  }
+
+  /* done with the temporary DPO */
+  dpo_reset (&via_dpo);
+}
+
+/**
+ * @brief Map a FIB link type to the value handed to
+ * lisp_gpe_tunnel_build_rewrite as the payload protocol.
+ *
+ * NOTE(review): the declared return type is lisp_gpe_next_protocol_e, yet
+ * the values returned are LISP_GPE_INPUT_NEXT_* constants. Presumably the
+ * two sets happen to coincide numerically for IP4/IP6 - confirm, and
+ * switch to the next-protocol enumerators.
+ */
+static lisp_gpe_next_protocol_e
+lisp_gpe_adj_proto_from_fib_link_type (fib_link_t linkt)
+{
+  switch (linkt)
+    {
+    case FIB_LINK_IP4:
+      return (LISP_GPE_INPUT_NEXT_IP4_INPUT);
+    case FIB_LINK_IP6:
+      return (LISP_GPE_INPUT_NEXT_IP6_INPUT);
+    default:
+      /* only IP link types are expected here */
+      ASSERT (0);
+    }
+  return (LISP_GPE_INPUT_NEXT_DROP);
+}
+
+/**
+ * @brief Find the adjacency for a locator pair, creating it if it does not
+ * exist yet, and take a lock on it.
+ *
+ * @param[in]   pair                    The local/remote RLOC pair.
+ * @param[in]   overlay_table_id        Passed on to the sub-interface
+ *                                      find-or-create.
+ * @param[in]   vni                     The VNI.
+ *
+ * @return the pool index of the adjacency, whose lock count has been
+ *         incremented. Release it with lisp_gpe_adjacency_unlock().
+ */
+index_t
+lisp_gpe_adjacency_find_or_create_and_lock (const locator_pair_t * pair,
+                                           u32 overlay_table_id, u32 vni)
+{
+  const lisp_gpe_tunnel_2_t *lgt;
+  lisp_gpe_adjacency_t *ladj;
+  index_t lai, l3si;
+
+  /*
+   * first find the L3 sub-interface that corresponds to the local-rloc and vni
+   */
+  l3si = lisp_gpe_sub_interface_find_or_create_and_lock (&pair->lcl_loc,
+                                                        overlay_table_id,
+                                                        vni);
+
+  /*
+   * find an existing or create a new adj
+   */
+  lai = lisp_adj_find (&pair->rmt_loc, l3si);
+
+  if (INDEX_INVALID == lai)
+    {
+      const lisp_gpe_sub_interface_t *l3s;
+      u8 *rewrite = NULL;
+      fib_link_t linkt;
+      fib_prefix_t nh;
+
+      pool_get (lisp_adj_pool, ladj);
+      memset (ladj, 0, sizeof (*ladj));
+      lai = (ladj - lisp_adj_pool);
+
+      ladj->remote_rloc = pair->rmt_loc;
+      ladj->vni = vni;
+      /* transfer the lock to the adj */
+      ladj->lisp_l3_sub_index = l3si;
+
+      l3s = lisp_gpe_sub_interface_get (l3si);
+      ladj->sw_if_index = l3s->sw_if_index;
+
+      /* if vni is non-default */
+      if (ladj->vni)
+       ladj->flags = LISP_GPE_FLAGS_I;
+
+      /* work in lisp-gpe not legacy mode */
+      ladj->flags |= LISP_GPE_FLAGS_P;
+
+      /*
+       * find the tunnel that will provide the underlying transport
+       * and hence the rewrite.
+       * The RLOC FIB index is default table - always.
+       */
+      ladj->tunnel_index = lisp_gpe_tunnel_find_or_create_and_lock (pair, 0);
+
+      lgt = lisp_gpe_tunnel_get (ladj->tunnel_index);
+
+      /*
+       * become a child of the RLOC FIB entry so we are updated when
+       * its reachability changes, allowing us to re-stack the midchains
+       */
+      ladj->fib_entry_child_index = fib_entry_child_add (lgt->fib_entry_index,
+                                                        FIB_NODE_TYPE_LISP_ADJ,
+                                                        lai);
+      ip_address_to_fib_prefix (&pair->rmt_loc, &nh);
+
+      /*
+       * construct and stack the FIB midchain adjacencies
+       */
+      FOR_EACH_FIB_IP_LINK (linkt)
+      {
+       ladj->adjs[linkt] = adj_nbr_add_or_lock (nh.fp_proto,
+                                                linkt,
+                                                &nh.fp_addr,
+                                                ladj->sw_if_index);
+
+       rewrite =
+         lisp_gpe_tunnel_build_rewrite (lgt, ladj,
+                                        lisp_gpe_adj_proto_from_fib_link_type
+                                        (linkt));
+
+       adj_nbr_midchain_update_rewrite (ladj->adjs[linkt],
+                                        vnet_get_sup_hw_interface
+                                        (vnet_get_main (),
+                                         ladj->sw_if_index)->tx_node_index,
+                                        rewrite);
+
+       /* the rewrite vector is built afresh for each link type */
+       vec_free (rewrite);
+      }
+
+      lisp_gpe_adj_stack (ladj);
+
+      /* only now make the new adjacency findable */
+      lisp_adj_insert (&ladj->remote_rloc, ladj->lisp_l3_sub_index, lai);
+    }
+  else
+    {
+      /* unlock the interface from the find. */
+      lisp_gpe_sub_interface_unlock (l3si);
+      ladj = lisp_gpe_adjacency_get_i (lai);
+    }
+
+  /* one lock per caller, for new and existing adjacencies alike */
+  ladj->locks++;
+
+  return (lai);
+}
+
+/**
+ * @brief Get a pointer to the adjacency that embeds the given FIB node
+ */
+static lisp_gpe_adjacency_t *
+lisp_gpe_adjacency_from_fib_node (const fib_node_t * node)
+{
+  char *base = (char *) node;
+
+  /* step back from the embedded member to the start of the adjacency */
+  base -= STRUCT_OFFSET_OF (lisp_gpe_adjacency_t, fib_node);
+
+  return ((lisp_gpe_adjacency_t *) base);
+}
+
+/**
+ * @brief Destroy an adjacency once its last lock has been released.
+ *
+ * Removes the adjacency from the DB, detaches it from the RLOC FIB entry it
+ * became a child of at creation, releases the tunnel and sub-interface it
+ * holds and returns it to the pool.
+ *
+ * @param[in]   ladj    The adjacency; invalid once this function returns.
+ */
+static void
+lisp_gpe_adjacency_last_lock_gone (lisp_gpe_adjacency_t * ladj)
+{
+  const lisp_gpe_tunnel_2_t *lgt;
+
+  /*
+   * no children so we are not counting locks. no-op.
+   * at least not counting
+   */
+  lisp_adj_remove (&ladj->remote_rloc, ladj->lisp_l3_sub_index);
+
+  /*
+   * Stop being a child of the RLOC FIB entry; otherwise a later back-walk
+   * would be delivered to an adjacency that is back in the pool. Do this
+   * before releasing the tunnel, since the tunnel provides the entry index.
+   */
+  lgt = lisp_gpe_tunnel_get (ladj->tunnel_index);
+  fib_entry_child_remove (lgt->fib_entry_index, ladj->fib_entry_child_index);
+
+  /*
+   * unlock the resources this adj holds
+   */
+  lisp_gpe_tunnel_unlock (ladj->tunnel_index);
+  lisp_gpe_sub_interface_unlock (ladj->lisp_l3_sub_index);
+
+  /*
+   * NOTE(review): the locks taken by adj_nbr_add_or_lock on adjs[] at
+   * creation do not appear to be released anywhere in this file - confirm.
+   */
+  pool_put (lisp_adj_pool, ladj);
+}
+
+/**
+ * @brief Release one lock on an adjacency; the adjacency is destroyed when
+ * the lock count reaches zero.
+ *
+ * @param[in]   lai     Pool index of the adjacency.
+ */
+void
+lisp_gpe_adjacency_unlock (index_t lai)
+{
+  lisp_gpe_adjacency_t *ladj = lisp_gpe_adjacency_get_i (lai);
+
+  if (0 == --ladj->locks)
+    lisp_gpe_adjacency_last_lock_gone (ladj);
+}
+
+/**
+ * @brief Public, read-only accessor: pool index -> adjacency.
+ */
+const lisp_gpe_adjacency_t *
+lisp_gpe_adjacency_get (index_t lai)
+{
+  const lisp_gpe_adjacency_t *ladj;
+
+  ladj = pool_elt_at_index (lisp_adj_pool, lai);
+
+  return (ladj);
+}
+
+
+/**
+ * @brief LISP GPE adjacency back walk
+ *
+ * Invoked through the FIB node virtual function table when the adjacency is
+ * back-walked: re-stack its midchains on the current forwarding.
+ *
+ * @return always FIB_NODE_BACK_WALK_CONTINUE.
+ */
+static fib_node_back_walk_rc_t
+lisp_gpe_adjacency_back_walk (fib_node_t * node,
+                              fib_node_back_walk_ctx_t * ctx)
+{
+  lisp_gpe_adjacency_t *ladj = lisp_gpe_adjacency_from_fib_node (node);
+
+  lisp_gpe_adj_stack (ladj);
+
+  return (FIB_NODE_BACK_WALK_CONTINUE);
+}
+
+/**
+ * @brief FIB node virtual function: pool index -> embedded FIB node.
+ */
+static fib_node_t *
+lisp_gpe_adjacency_get_fib_node (fib_node_index_t index)
+{
+  return (&pool_elt_at_index (lisp_adj_pool, index)->fib_node);
+}
+
+/**
+ * @brief FIB node virtual function: the node's last lock has gone, so
+ * destroy the adjacency that embeds it.
+ */
+static void
+lisp_gpe_adjacency_last_fib_lock_gone (fib_node_t * node)
+{
+  lisp_gpe_adjacency_t *ladj = lisp_gpe_adjacency_from_fib_node (node);
+
+  lisp_gpe_adjacency_last_lock_gone (ladj);
+}
+
+/**
+ * The virtual function table registered with FIB for LISP adjacencies.
+ * (The 'tuennel' spelling is kept: the module init function refers to the
+ * table by this name.)
+ */
+static const fib_node_vft_t lisp_gpe_tuennel_vft = {
+  .fnv_back_walk = lisp_gpe_adjacency_back_walk,
+  .fnv_get = lisp_gpe_adjacency_get_fib_node,
+  .fnv_last_lock = lisp_gpe_adjacency_last_fib_lock_gone,
+};
+
+/**
+ * @brief Format a LISP GPE adjacency.
+ *
+ * A '%U' callback consuming two arguments from the caller's list:
+ *  - lisp_gpe_adjacency_t *: the adjacency to print;
+ *  - lisp_gpe_adjacency_format_flags_t: with
+ *    LISP_GPE_ADJ_FORMAT_FLAG_DETAIL set, the pool index, lock count,
+ *    sub-interface, tunnel and FIB adjacency indices are printed too;
+ *    otherwise only the sub-interface and tunnel indices are shown.
+ *
+ * @param[in]   s       Vector to append the text to.
+ * @param[in]   args    Pointer to the caller's argument list.
+ *
+ * @return the (possibly reallocated) vector.
+ */
+u8 *
+format_lisp_gpe_adjacency (u8 * s, va_list * args)
+{
+  /* args points at the va_list: dereference it for every va_arg */
+  lisp_gpe_adjacency_t *ladj = va_arg (*args, lisp_gpe_adjacency_t *);
+  lisp_gpe_adjacency_format_flags_t flags =
+    va_arg (*args, lisp_gpe_adjacency_format_flags_t);
+
+  if (flags & LISP_GPE_ADJ_FORMAT_FLAG_DETAIL)
+    {
+      s = format (s, "index %d locks:%d\n",
+                  ladj - lisp_adj_pool, ladj->locks);
+    }
+
+  s = format (s, " vni: %d,", ladj->vni);
+  s = format (s, " remote-RLOC: %U,", format_ip_address, &ladj->remote_rloc);
+
+  if (flags & LISP_GPE_ADJ_FORMAT_FLAG_DETAIL)
+    {
+      s = format (s, " %U\n",
+                  format_lisp_gpe_sub_interface,
+                  lisp_gpe_sub_interface_get (ladj->lisp_l3_sub_index));
+      s = format (s, " %U\n",
+                  format_lisp_gpe_tunnel,
+                  lisp_gpe_tunnel_get (ladj->tunnel_index));
+      s = format (s, " FIB adjacencies: IPV4:%d IPv6:%d\n",
+                  ladj->adjs[FIB_LINK_IP4], ladj->adjs[FIB_LINK_IP6]);
+    }
+  else
+    {
+      s = format (s, " LISP L3 sub-interface index: %d,",
+                  ladj->lisp_l3_sub_index);
+      s = format (s, " LISP tunnel index: %d", ladj->tunnel_index);
+    }
+
+  return (s);
+}
+
+/**
+ * @brief CLI handler for "show lisp gpe adjacency [<index>]".
+ *
+ * With an index, prints that adjacency in detail; an index that does not
+ * name an allocated pool element is reported rather than dereferenced.
+ * Without an index, prints a one-line summary of every adjacency.
+ *
+ * @return always 0 (no error).
+ */
+static clib_error_t *
+lisp_gpe_adjacency_show (vlib_main_t * vm,
+                         unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  lisp_gpe_adjacency_t *ladj;
+  index_t index;
+
+  if (pool_elts (lisp_adj_pool) == 0)
+    vlib_cli_output (vm, "No lisp-gpe Adjacencies");
+
+  if (unformat (input, "%d", &index))
+    {
+      /* the index comes straight from the user: validate before use */
+      if (pool_is_free_index (lisp_adj_pool, index))
+        {
+          vlib_cli_output (vm, "Invalid lisp-gpe adjacency index %d", index);
+          return 0;
+        }
+
+      ladj = lisp_gpe_adjacency_get_i (index);
+      vlib_cli_output (vm, "%U", format_lisp_gpe_adjacency, ladj,
+                       LISP_GPE_ADJ_FORMAT_FLAG_DETAIL);
+    }
+  else
+    {
+      /* *INDENT-OFF* */
+      pool_foreach (ladj, lisp_adj_pool,
+      ({
+        vlib_cli_output (vm, "[%d] %U\n",
+                         ladj - lisp_adj_pool,
+                         format_lisp_gpe_adjacency, ladj,
+                         LISP_GPE_ADJ_FORMAT_FLAG_NONE);
+      }));
+      /* *INDENT-ON* */
+    }
+
+  return 0;
+}
+
+/**
+ * CLI registration for "show lisp gpe adjacency". The registration is
+ * file-local and is named after the command it registers.
+ */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_lisp_gpe_adjacency_command, static) =
+{
+  .path = "show lisp gpe adjacency",
+  .short_help = "show lisp gpe adjacency [<index>]",
+  .function = lisp_gpe_adjacency_show,
+};
+/* *INDENT-ON* */
+
+#define LISP_ADJ_NBR_DEFAULT_HASH_NUM_BUCKETS (256)
+#define LISP_ADJ_NBR_DEFAULT_HASH_MEMORY_SIZE (1<<20)
+
+/**
+ * @brief Module init: create the adjacency DB and register the LISP
+ * adjacency node type, with its virtual function table, with FIB.
+ *
+ * @return always NULL (no error).
+ */
+static clib_error_t *
+lisp_gpe_adj_module_init (vlib_main_t * vm)
+{
+  const u32 n_buckets = LISP_ADJ_NBR_DEFAULT_HASH_NUM_BUCKETS;
+  const uword memory_size = LISP_ADJ_NBR_DEFAULT_HASH_MEMORY_SIZE;
+
+  BV (clib_bihash_init) (&lisp_adj_db,
+                         "Adjacency Neighbour table",
+                         n_buckets, memory_size);
+
+  fib_node_register_type (FIB_NODE_TYPE_LISP_ADJ, &lisp_gpe_tuennel_vft);
+
+  return (NULL);
+}
+
+VLIB_INIT_FUNCTION (lisp_gpe_adj_module_init)
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/vnet/vnet/lisp-gpe/lisp_gpe_adjacency.h b/vnet/vnet/lisp-gpe/lisp_gpe_adjacency.h
new file mode 100644 (file)
index 0000000..f6a66cd
--- /dev/null
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @file
+ * @brief Common utility functions for IPv4, IPv6 and L2 LISP-GPE adjacencies.
+ *
+ */
+
+#ifndef LISP_GPE_ADJACENCY_H__
+#define LISP_GPE_ADJACENCY_H__
+
+#include <vnet/fib/fib_node.h>
+#include <vnet/lisp-gpe/lisp_gpe.h>
+
+/**
+ * @brief A LISP GPE Adjacency.
+ *
+ * An adjacency represents a peer on an L3 sub-interface to which to send
+ * traffic. Adjacencies are thus present in the EID space.
+ * The peer is identified by the key:{remote-rloc, sub-interface}, which is
+ * equivalent to the usual adjacency key {next-hop, interface}. So, curiously,
+ * the RLOC address from the underlay is used as a next-hop address in the
+ * overlay.
+ * This is OK because:
+ *  1 - the RLOC is unique in the underlay AND there is only one underlay VRF
+ *      per overlay
+ *  2 - the RLOC may overlap with an address in the overlay, but we do not
+ *      create an adj-fib (i.e. a route in the overlay FIB for the RLOC)
+ */
+typedef struct lisp_gpe_adjacency_t_
+{
+  /**
+   * The LISP adj is a part of the FIB control plane graph.
+   */
+  fib_node_t fib_node;
+
+  /**
+   * The remote RLOC. This is the adjacency's next-hop.
+   */
+  ip_address_t remote_rloc;
+
+  /**
+   * The VNI. Used in combination with the local-rloc to get the sub-interface
+   */
+  u32 vni;
+
+  /**
+   * The number of locks/reference counts on the adjacency.
+   */
+  u32 locks;
+
+  /**
+   * The index of the LISP L3 sub-interface
+   */
+  u32 lisp_l3_sub_index;
+
+  /**
+   * The SW IF index of the sub-interface this adjacency uses.
+   * Cached for convenience from the LISP L3 sub-interface
+   */
+  u32 sw_if_index;
+
+  /**
+   * The index of the LISP GPE tunnel that provides the transport
+   * in the underlay.
+   */
+  u32 tunnel_index;
+
+  /**
+   * Per-link-type FIB adjacencies contributed.
+   * These will be used as a result of a FIB lookup.
+   */
+  adj_index_t adjs[FIB_LINK_NUM];
+
+  /**
+   * This adjacency is a child of the FIB entry to reach the RLOC.
+   * This is so that when the reachability of that RLOC changes, we can
+   * restack the FIB adjacencies.
+   */
+  u32 fib_entry_child_index;
+
+  /**
+   * LISP header fields in HOST byte order
+   */
+  u8 flags;
+  u8 ver_res;
+  u8 res;
+  u8 next_protocol;
+
+} lisp_gpe_adjacency_t;
+
+extern index_t lisp_gpe_adjacency_find_or_create_and_lock (const
+                                                          locator_pair_t *
+                                                          pair,
+                                                          u32 rloc_fib_index,
+                                                          u32 vni);
+
+extern void lisp_gpe_adjacency_unlock (index_t l3si);
+
+extern const lisp_gpe_adjacency_t *lisp_gpe_adjacency_get (index_t l3si);
+
+/**
+ * @brief Flags for displaying the adjacency
+ *
+ * Passed to format_lisp_gpe_adjacency after the adjacency pointer. The
+ * "show lisp gpe adjacency" CLI passes DETAIL when a single adjacency is
+ * requested by index and NONE when listing the whole pool.
+ */
+typedef enum lisp_gpe_adjacency_format_flags_t_
+{
+  LISP_GPE_ADJ_FORMAT_FLAG_NONE,
+  LISP_GPE_ADJ_FORMAT_FLAG_DETAIL,
+} lisp_gpe_adjacency_format_flags_t;
+
+extern u8 *format_lisp_gpe_adjacency (u8 * s, va_list * args);
+
+#endif
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/vnet/vnet/lisp-gpe/lisp_gpe_sub_interface.c b/vnet/vnet/lisp-gpe/lisp_gpe_sub_interface.c
new file mode 100644 (file)
index 0000000..220802b
--- /dev/null
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @file
+ * @brief LISP sub-interfaces.
+ *
+ */
+#include <vnet/lisp-gpe/lisp_gpe_sub_interface.h>
+#include <vnet/fib/fib_table.h>
+#include <vnet/interface.h>
+
+/**
+ * @brief Pool of all l3-sub-interfaces
+ */
+static lisp_gpe_sub_interface_t *lisp_gpe_sub_interface_pool;
+
+/**
+ * A DB of all LISP L3 sub-interfaces. The key is:{VNI,l-RLOC}; the value is
+ * the sub-interface's index in the pool.
+ */
+static uword *lisp_gpe_sub_interfaces;
+
+/**
+ * A DB of all VNET L3 sub-interfaces. The key is:{VNI,l-RLOC}; the value is
+ * the sub-interface's SW interface index.
+ * Used in the data-plane for interface lookup on decap.
+ */
+uword *lisp_gpe_sub_interfaces_sw_if_index;
+
+/**
+ * The next available sub-interface ID. It is only ever incremented, once per
+ * create attempt, and IDs are not reused. FIXME
+ */
+static u32 lisp_gpe_sub_interface_id;
+
+
+/**
+ * @brief Look up a sub-interface in the DB by its {local-RLOC, VNI} key.
+ *
+ * @param[in] lrloc  The local RLOC.
+ * @param[in] vni    The VNI in host byte order; it is converted to network
+ *                   byte order to build the key.
+ *
+ * @return The pool index stored for the key, or INDEX_INVALID if the key is
+ *         not in the DB.
+ */
+static index_t
+lisp_gpe_sub_interface_db_find (const ip_address_t * lrloc, u32 vni)
+{
+  lisp_gpe_sub_interface_key_t key;
+  uword *p;
+
+  /*
+   * Stored keys are zero-filled before their fields are populated (see
+   * lisp_gpe_sub_interface_find_or_create_and_lock). Do the same for the
+   * lookup key so that any padding bytes hold a known value.
+   */
+  memset (&key, 0, sizeof (key));
+  key.local_rloc = *lrloc;
+  key.vni = clib_host_to_net_u32 (vni);
+
+  p = hash_get_mem (lisp_gpe_sub_interfaces, &key);
+
+  if (NULL == p)
+    return (INDEX_INVALID);
+  else
+    return (p[0]);
+}
+
+/**
+ * @brief Add a sub-interface to both DBs.
+ *
+ * The hash key is the heap-allocated key the object points to, not the
+ * address of the pointer field inside the pool element: lookups hash the
+ * contents of a lisp_gpe_sub_interface_key_t, and pool elements may move.
+ *
+ * @param[in] l3s  The sub-interface; its key and sw_if_index must be set.
+ */
+static void
+lisp_gpe_sub_interface_db_insert (const lisp_gpe_sub_interface_t * l3s)
+{
+  hash_set_mem (lisp_gpe_sub_interfaces,
+               l3s->key, l3s - lisp_gpe_sub_interface_pool);
+  hash_set_mem (lisp_gpe_sub_interfaces_sw_if_index,
+               l3s->key, l3s->sw_if_index);
+}
+
+/**
+ * @brief Remove a sub-interface from both DBs.
+ *
+ * Must be called while l3s->key is still allocated, since the key's contents
+ * are what is looked up.
+ *
+ * @param[in] l3s  The sub-interface to remove.
+ */
+static void
+lisp_gpe_sub_interface_db_remove (const lisp_gpe_sub_interface_t * l3s)
+{
+  hash_unset_mem (lisp_gpe_sub_interfaces, l3s->key);
+  hash_unset_mem (lisp_gpe_sub_interfaces_sw_if_index, l3s->key);
+}
+
+/**
+ * @brief Get the mutable sub-interface object stored at a pool index.
+ */
+lisp_gpe_sub_interface_t *
+lisp_gpe_sub_interface_get_i (index_t l3si)
+{
+  lisp_gpe_sub_interface_t *l3s;
+
+  l3s = pool_elt_at_index (lisp_gpe_sub_interface_pool, l3si);
+
+  return (l3s);
+}
+
+/**
+ * @brief Bind an interface to the IPv4 and IPv6 FIBs of a table and enable
+ * both address families on it.
+ *
+ * @param[in] sw_if_index  The SW interface to bind.
+ * @param[in] table_id     The table-ID; each FIB is found or created, and
+ *                         locked.
+ */
+static void
+lisp_gpe_sub_interface_set_table (u32 sw_if_index, u32 table_id)
+{
+  fib_node_index_t fib4, fib6;
+
+  /* IPv4: find/create the table, record the binding, enable the interface */
+  fib4 = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, table_id);
+  ASSERT (FIB_NODE_INDEX_INVALID != fib4);
+
+  vec_validate (ip4_main.fib_index_by_sw_if_index, sw_if_index);
+  ip4_main.fib_index_by_sw_if_index[sw_if_index] = fib4;
+  // FIXME. enable When we get an adj
+  ip4_sw_interface_enable_disable (sw_if_index, 1);
+
+  /* IPv6: the same steps against the IPv6 table */
+  fib6 = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP6, table_id);
+  ASSERT (FIB_NODE_INDEX_INVALID != fib6);
+
+  vec_validate (ip6_main.fib_index_by_sw_if_index, sw_if_index);
+  ip6_main.fib_index_by_sw_if_index[sw_if_index] = fib6;
+  // FIXME. enable When we get an adj
+  ip6_sw_interface_enable_disable (sw_if_index, 1);
+}
+
+/**
+ * @brief Undo the interface's table binding: point it back at FIB index 0
+ * and disable IPv4 and IPv6 on it.
+ *
+ * NOTE(review): table_id is not used, and the locks taken by
+ * fib_table_find_or_create_and_lock in the set path do not appear to be
+ * released here - confirm whether an unlock is required.
+ *
+ * @param[in] sw_if_index  The SW interface to unbind.
+ * @param[in] table_id     Unused.
+ */
+static void
+lisp_gpe_sub_interface_unset_table (u32 sw_if_index, u32 table_id)
+{
+  const u32 reset_fib_index = 0;
+
+  /* IPv4 */
+  ip4_main.fib_index_by_sw_if_index[sw_if_index] = reset_fib_index;
+  ip4_sw_interface_enable_disable (sw_if_index, 0);
+
+  /* IPv6 */
+  ip6_main.fib_index_by_sw_if_index[sw_if_index] = reset_fib_index;
+  ip6_sw_interface_enable_disable (sw_if_index, 0);
+}
+
+/**
+ * @brief Find the L3 sub-interface for {local-RLOC, VNI}, creating it if it
+ * does not exist, and take a lock on it.
+ *
+ * On creation a VNET sub-interface is created beneath the main interface
+ * registered for the VNI, bound to the overlay table for IPv4 and IPv6, set
+ * admin-up and inserted into the DBs.
+ *
+ * @param[in] lrloc             The local RLOC.
+ * @param[in] overlay_table_id  Table-ID in the overlay to bind a newly
+ *                              created interface to; ignored if the
+ *                              interface already exists.
+ * @param[in] vni               The VNI, in host byte order.
+ *
+ * @return The pool index of the sub-interface, or INDEX_INVALID if there is
+ *         no main interface for the VNI or the sub-interface could not be
+ *         created. No lock is taken when INDEX_INVALID is returned.
+ */
+index_t
+lisp_gpe_sub_interface_find_or_create_and_lock (const ip_address_t * lrloc,
+                                               u32 overlay_table_id, u32 vni)
+{
+  lisp_gpe_sub_interface_t *l3s;
+  lisp_gpe_main_t *lgm = &lisp_gpe_main;
+  index_t l3si;
+
+  l3si = lisp_gpe_sub_interface_db_find (lrloc, vni);
+
+  if (INDEX_INVALID == l3si)
+    {
+      vnet_hw_interface_t *hi;
+      clib_error_t *error;
+      u32 sub_sw_if_index;
+      uword *p;
+
+      /*
+       * find the main interface from the VNI
+       */
+      p = hash_get (lgm->l3_ifaces.sw_if_index_by_vni, vni);
+
+      if (NULL == p)
+       return (INDEX_INVALID);
+
+      /*
+       * NOTE(review): the value comes from a table named sw_if_index_by_vni
+       * but is passed where a HW interface index is expected - confirm the
+       * two are interchangeable for the LISP main interface.
+       */
+      hi = vnet_get_hw_interface (vnet_get_main (), p[0]);
+
+      if (NULL == hi)
+       return (INDEX_INVALID);
+
+      vnet_sw_interface_t sub_itf_template = {
+       .type = VNET_SW_INTERFACE_TYPE_SUB,
+       .sup_sw_if_index = hi->sw_if_index,
+       .sub.id = lisp_gpe_sub_interface_id++,
+      };
+
+      error = vnet_create_sw_interface (vnet_get_main (),
+                                       &sub_itf_template, &sub_sw_if_index);
+
+      if (NULL != error)
+       {
+         /* the error is not propagated, so release it rather than leak it */
+         clib_error_free (error);
+         return (INDEX_INVALID);
+       }
+
+      pool_get (lisp_gpe_sub_interface_pool, l3s);
+      memset (l3s, 0, sizeof (*l3s));
+
+      /*
+       * The key lives on the heap, not in the pool element, so that the
+       * address handed to the hash tables stays valid if the pool moves.
+       * It is zero-filled so every byte of the key has a known value.
+       */
+      l3s->key = clib_mem_alloc (sizeof (*l3s->key));
+      memset (l3s->key, 0, sizeof (*l3s->key));
+
+      l3s->key->local_rloc = *lrloc;
+      l3s->key->vni = clib_host_to_net_u32 (vni);
+      l3s->main_sw_if_index = hi->sw_if_index;
+      l3s->sw_if_index = sub_sw_if_index;
+      l3s->eid_table_id = overlay_table_id;
+
+      l3si = (l3s - lisp_gpe_sub_interface_pool);
+
+      lisp_gpe_sub_interface_set_table (l3s->sw_if_index, l3s->eid_table_id);
+      vnet_sw_interface_set_flags (vnet_get_main (),
+                                  l3s->sw_if_index,
+                                  VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+
+      lisp_gpe_sub_interface_db_insert (l3s);
+    }
+  else
+    {
+      l3s = lisp_gpe_sub_interface_get_i (l3si);
+    }
+
+  l3s->locks++;
+
+  return (l3si);
+}
+
+/**
+ * @brief Release one lock on a sub-interface; the last unlock tears the
+ * interface down.
+ *
+ * @param[in] l3si  Pool index of the sub-interface.
+ */
+void
+lisp_gpe_sub_interface_unlock (index_t l3si)
+{
+  lisp_gpe_sub_interface_t *l3s = lisp_gpe_sub_interface_get_i (l3si);
+
+  l3s->locks--;
+
+  /* still in use by someone else */
+  if (0 != l3s->locks)
+    return;
+
+  /* last user gone: unbind, bring down and delete the VNET interface ... */
+  lisp_gpe_sub_interface_unset_table (l3s->sw_if_index, l3s->eid_table_id);
+
+  vnet_sw_interface_set_flags (vnet_get_main (), l3s->sw_if_index, 0);
+  vnet_delete_sub_interface (l3s->sw_if_index);
+
+  /* ... then drop it from the DBs before its key and storage are freed */
+  lisp_gpe_sub_interface_db_remove (l3s);
+
+  clib_mem_free (l3s->key);
+  pool_put (lisp_gpe_sub_interface_pool, l3s);
+}
+
+/**
+ * @brief Read-only accessor for the sub-interface at a pool index.
+ */
+const lisp_gpe_sub_interface_t *
+lisp_gpe_sub_interface_get (index_t l3si)
+{
+  const lisp_gpe_sub_interface_t *l3s = lisp_gpe_sub_interface_get_i (l3si);
+
+  return (l3s);
+}
+
+/**
+ * @brief Format one sub-interface as a table row: interface name, VNI (host
+ * byte order), SW interface index and local RLOC.
+ *
+ * NOTE(review): this takes the va_list by value whereas the other format
+ * functions in this change take a 'va_list *' - confirm this is intended
+ * before relying on it on every target ABI.
+ *
+ * @param[in] s   The vector to append to.
+ * @param[in] ap  Argument list; one lisp_gpe_sub_interface_t pointer is read.
+ *
+ * @return The vector with the row appended.
+ */
+u8 *
+format_lisp_gpe_sub_interface (u8 * s, va_list ap)
+{
+  lisp_gpe_sub_interface_t *l3s = va_arg (ap, lisp_gpe_sub_interface_t *);
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_sw_interface_t *swif;
+  u32 host_vni;
+
+  swif = vnet_get_sw_interface (vnm, l3s->sw_if_index);
+  host_vni = clib_net_to_host_u32 (l3s->key->vni);
+
+  s = format (s, "%=16U", format_vnet_sw_interface_name, vnm, swif);
+  s = format (s, "%=10d", host_vni);
+  s = format (s, "%=12d", l3s->sw_if_index);
+  s = format (s, "%U", format_ip_address, &l3s->key->local_rloc);
+
+  return (s);
+}
+
+/**
+ * @brief CLI handler for "show lisp gpe sub-interface".
+ *
+ * Prints a column header followed by one row per sub-interface in the pool.
+ *
+ * @return NULL (no error) always.
+ */
+static clib_error_t *
+lisp_gpe_sub_interface_show (vlib_main_t * vm,
+                            unformat_input_t * input,
+                            vlib_cli_command_t * cmd)
+{
+  lisp_gpe_sub_interface_t *l3s;
+
+  /* column widths match those used by format_lisp_gpe_sub_interface */
+  vlib_cli_output (vm, "%=16s%=10s%=12s%s",
+                  "Name", "VNI", "SW IF Index", "local RLOC");
+
+  /* *INDENT-OFF* */
+  pool_foreach (l3s, lisp_gpe_sub_interface_pool,
+  ({
+    vlib_cli_output (vm, "%U", format_lisp_gpe_sub_interface, l3s);
+  }));
+  /* *INDENT-ON* */
+
+  return (NULL);
+}
+
+/* *INDENT-OFF* */
+/** CLI registration binding the path below to lisp_gpe_sub_interface_show. */
+VLIB_CLI_COMMAND (lisp_gpe_sub_interface_command) = {
+  .path = "show lisp gpe sub-interface",
+  .function = lisp_gpe_sub_interface_show,
+  .short_help = "show lisp gpe sub-interface",
+};
+/* *INDENT-ON* */
+
+/**
+ * @brief Module init: create the two sub-interface DBs.
+ *
+ * Both hashes are keyed on a lisp_gpe_sub_interface_key_t and hold a uword
+ * value.
+ *
+ * @return NULL (no error) always.
+ */
+static clib_error_t *
+lisp_gpe_sub_interface_module_init (vlib_main_t * vm)
+{
+  const uword key_bytes = sizeof (lisp_gpe_sub_interface_key_t);
+  const uword value_bytes = sizeof (uword);
+
+  lisp_gpe_sub_interfaces = hash_create_mem (0, key_bytes, value_bytes);
+  lisp_gpe_sub_interfaces_sw_if_index =
+    hash_create_mem (0, key_bytes, value_bytes);
+
+  return (NULL);
+}
+
+VLIB_INIT_FUNCTION (lisp_gpe_sub_interface_module_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/vnet/vnet/lisp-gpe/lisp_gpe_sub_interface.h b/vnet/vnet/lisp-gpe/lisp_gpe_sub_interface.h
new file mode 100644 (file)
index 0000000..ad942f4
--- /dev/null
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @file
+ * @brief LISP sub-interfaces.
+ *
+ */
+
+#ifndef __LISP_GPE_SUB_INTERFACE_H__
+#define __LISP_GPE_SUB_INTERFACE_H__
+
+#include <vnet/lisp-gpe/lisp_gpe.h>
+
+/**
+ * A Key for lookup in the L3 sub-interface DB
+ */
+typedef struct lisp_gpe_sub_interface_key_t_
+{
+    /**
+     * The local-RLOC. This is the interface's 'source' address.
+     */
+  ip_address_t local_rloc;
+
+    /**
+     * The VNI. In network byte order!
+     */
+  u32 vni;
+} lisp_gpe_sub_interface_key_t;
+
+/**
+ * @brief A LISP L3 sub-interface
+ *
+ * A LISP sub-interface is a multi-access interface, whose local address is a
+ * single local-RLOC. Adjacencies that form on this sub-interface represent
+ * remote RLOCs.
+ * This is analogous to an ethernet interface.
+ * As with all interface types it can only be present in one VRF, hence a
+ * LISP sub-interface is per-local-rloc and per-VNI.
+ */
+typedef struct lisp_gpe_sub_interface_t_
+{
+  /**
+   * The interface's key in the DB; rloc & vni.
+   * The key is allocated from the heap so it can be used in the hash-table.
+   * If it were part of the object it would be subject to realloc (the object
+   * lives in a pool), which would invalidate the address the hash-table
+   * holds.
+   */
+  lisp_gpe_sub_interface_key_t *key;
+
+  /**
+   * The Table-ID in the overlay that this interface is bound to.
+   */
+  u32 eid_table_id;
+
+  /**
+   * A reference counting lock on the number of users of this interface.
+   * When this count drops to 0 the interface is deleted.
+   */
+  u32 locks;
+
+  /**
+   * The SW if index assigned to this sub-interface
+   */
+  u32 sw_if_index;
+
+  /**
+   * The SW IF index assigned to the main interface of which this is a sub.
+   */
+  u32 main_sw_if_index;
+} lisp_gpe_sub_interface_t;
+
+extern index_t lisp_gpe_sub_interface_find_or_create_and_lock (const
+                                                              ip_address_t *
+                                                              lrloc,
+                                                              u32
+                                                              eid_table_id,
+                                                              u32 vni);
+
+extern u8 *format_lisp_gpe_sub_interface (u8 * s, va_list ap);
+
+extern void lisp_gpe_sub_interface_unlock (index_t itf);
+
+extern const lisp_gpe_sub_interface_t *lisp_gpe_sub_interface_get (index_t
+                                                                  itf);
+
+/**
+ * A DB of all L3 sub-interfaces. The key is:{VNI,l-RLOC}, held as a
+ * lisp_gpe_sub_interface_key_t; the value is the sub-interface's SW
+ * interface index.
+ */
+extern uword *lisp_gpe_sub_interfaces_sw_if_index;
+
+/**
+ * @brief
+ *  Get a VNET L3 interface matching the local-RLOC and VNI
+ *  Called from the data-plane
+ *
+ * @param[in] addr  The IPv6 local RLOC.
+ * @param[in] vni   The VNI, stored in the key as given. Keys in the DB hold
+ *                  the VNI in network byte order, so the caller presumably
+ *                  passes it in that order - TODO confirm.
+ *
+ * @return The value stored in the DB for the key (the SW interface index),
+ *         or INDEX_INVALID if there is no match.
+ */
+always_inline u32
+lisp_gpe_sub_interface_find_ip6 (const ip6_address_t * addr, u32 vni)
+{
+  lisp_gpe_sub_interface_key_t key;
+  const uword *p;
+
+  /*
+   * Start from an all-zero key so that bytes not written below (padding)
+   * have a known value and cannot make the lookup miss.
+   */
+  memset (&key, 0, sizeof (key));
+  key.local_rloc.ip.v6.as_u64[0] = addr->as_u64[0];
+  key.local_rloc.ip.v6.as_u64[1] = addr->as_u64[1];
+  key.local_rloc.version = IP6;
+  key.vni = vni;
+
+  /* pass the hash table itself, not the address of the variable holding it */
+  p = hash_get_mem (lisp_gpe_sub_interfaces_sw_if_index, &key);
+
+  if (NULL != p)
+    return p[0];
+
+  return (INDEX_INVALID);
+}
+
+/**
+ * @brief
+ *  Get a VNET L3 interface matching the local-RLOC and VNI
+ *  Called from the data-plane
+ *
+ * @param[in] addr  The IPv4 local RLOC.
+ * @param[in] vni   The VNI, stored in the key as given. Keys in the DB hold
+ *                  the VNI in network byte order, so the caller presumably
+ *                  passes it in that order - TODO confirm.
+ *
+ * @return The value stored in the DB for the key (the SW interface index),
+ *         or INDEX_INVALID if there is no match.
+ */
+always_inline index_t
+lisp_gpe_sub_interface_find_ip4 (const ip4_address_t * addr, u32 vni)
+{
+  lisp_gpe_sub_interface_key_t key;
+  const uword *p;
+
+  /*
+   * Start from an all-zero key: an IPv4 address fills only part of the
+   * address field, and the remaining bytes must have a known value or the
+   * lookup can miss.
+   */
+  memset (&key, 0, sizeof (key));
+  key.local_rloc.ip.v4.as_u32 = addr->as_u32;
+  key.local_rloc.version = IP4;
+  key.vni = vni;
+
+  /* pass the hash table itself, not the address of the variable holding it */
+  p = hash_get_mem (lisp_gpe_sub_interfaces_sw_if_index, &key);
+
+  if (NULL != p)
+    return p[0];
+
+  return (INDEX_INVALID);
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
+
+#endif
diff --git a/vnet/vnet/lisp-gpe/lisp_gpe_tunnel.c b/vnet/vnet/lisp-gpe/lisp_gpe_tunnel.c
new file mode 100644 (file)
index 0000000..0aecc0a
--- /dev/null
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @file
+ * @brief Common utility functions for IPv4, IPv6 and L2 LISP-GPE tunnels.
+ *
+ */
+#include <vnet/lisp-gpe/lisp_gpe.h>
+#include <vnet/lisp-gpe/lisp_gpe_tunnel.h>
+#include <vnet/lisp-gpe/lisp_gpe_adjacency.h>
+
+#include <vnet/fib/fib_table.h>
+
+/**
+ * @brief Pool of all LISP tunnels
+ */
+static lisp_gpe_tunnel_2_t *lisp_gpe_tunnel_pool;
+
+/**
+ * @brief A DB of all tunnels. Looked up with a lisp_gpe_tunnel_key_t; the
+ * value is the tunnel's index in the pool.
+ */
+static uword *lisp_gpe_tunnel_db;
+
+/**
+ * @brief Compute the IP-UDP-GPE encap/rewrite header for a tunnel.
+ *
+ * Builds an outer IPv4 or IPv6 header (chosen by the version of the tunnel's
+ * local RLOC), followed by a UDP header and a LISP-GPE header. The LISP
+ * ver_res and res fields are always written as 0; only the flags and the VNI
+ * are taken from the adjacency.
+ *
+ * @param[in]   lgt            The tunnel; its key supplies the local and
+ *                             remote RLOCs used as the outer source and
+ *                             destination addresses.
+ * @param[in]   ladj           The adjacency; supplies the LISP flags and the
+ *                             VNI (converted to network byte order).
+ * @param[in]   payload_proto  Written to the LISP header's next-protocol
+ *                             field.
+ *
+ * @return A newly allocated vector holding the rewrite. The caller
+ *         presumably takes ownership of it - TODO confirm.
+ */
+u8 *
+lisp_gpe_tunnel_build_rewrite (const lisp_gpe_tunnel_2_t * lgt,
+                              const lisp_gpe_adjacency_t * ladj,
+                              lisp_gpe_next_protocol_e payload_proto)
+{
+  lisp_gpe_header_t *lisp0;
+  u8 *rw = 0;
+  int len;
+
+  if (IP4 == ip_addr_version (&lgt->key->lcl))
+    {
+      ip4_udp_lisp_gpe_header_t *h0;
+      ip4_header_t *ip0;
+
+      len = sizeof (*h0);
+
+      vec_validate_aligned (rw, len - 1, CLIB_CACHE_LINE_BYTES);
+
+      h0 = (ip4_udp_lisp_gpe_header_t *) rw;
+
+      /* Fixed portion of the (outer) ip4 header */
+      ip0 = &h0->ip4;
+      ip0->ip_version_and_header_length = 0x45;
+      ip0->ttl = 254;
+      ip0->protocol = IP_PROTOCOL_UDP;
+
+      /* we fix up the ip4 header length and checksum after-the-fact */
+      ip_address_copy_addr (&ip0->src_address, &lgt->key->lcl);
+      ip_address_copy_addr (&ip0->dst_address, &lgt->key->rmt);
+      ip0->checksum = ip4_header_checksum (ip0);
+
+      /* UDP header, randomize src port on something, maybe? */
+      h0->udp.src_port = clib_host_to_net_u16 (4341);
+      h0->udp.dst_port = clib_host_to_net_u16 (UDP_DST_PORT_lisp_gpe);
+
+      /* LISP-gpe header */
+      lisp0 = &h0->lisp;
+    }
+  else
+    {
+      ip6_udp_lisp_gpe_header_t *h0;
+      ip6_header_t *ip0;
+
+      len = sizeof (*h0);
+
+      vec_validate_aligned (rw, len - 1, CLIB_CACHE_LINE_BYTES);
+
+      h0 = (ip6_udp_lisp_gpe_header_t *) rw;
+
+      /* Fixed portion of the (outer) ip6 header */
+      ip0 = &h0->ip6;
+      ip0->ip_version_traffic_class_and_flow_label =
+       clib_host_to_net_u32 (0x6 << 28);
+      ip0->hop_limit = 254;
+      ip0->protocol = IP_PROTOCOL_UDP;
+
+      /* we fix up the ip6 header length after-the-fact */
+      ip_address_copy_addr (&ip0->src_address, &lgt->key->lcl);
+      ip_address_copy_addr (&ip0->dst_address, &lgt->key->rmt);
+
+      /* UDP header, randomize src port on something, maybe? */
+      h0->udp.src_port = clib_host_to_net_u16 (4341);
+      h0->udp.dst_port = clib_host_to_net_u16 (UDP_DST_PORT_lisp_gpe);
+
+      /* LISP-gpe header */
+      lisp0 = &h0->lisp;
+    }
+
+  lisp0->flags = ladj->flags;
+  lisp0->ver_res = 0;
+  lisp0->res = 0;
+  lisp0->next_protocol = payload_proto;
+  lisp0->iid = clib_host_to_net_u32 (ladj->vni);
+
+  return (rw);
+}
+
+/**
+ * @brief Look up a tunnel in the DB.
+ *
+ * @param[in] key  The tunnel key to search for.
+ *
+ * @return The tunnel from the pool, or NULL if the key is not in the DB.
+ */
+static lisp_gpe_tunnel_2_t *
+lisp_gpe_tunnel_db_find (const lisp_gpe_tunnel_key_t * key)
+{
+  uword *slot = hash_get_mem (lisp_gpe_tunnel_db, (void *) key);
+
+  if (NULL == slot)
+    return (NULL);
+
+  /* the DB stores the tunnel's pool index */
+  return (pool_elt_at_index (lisp_gpe_tunnel_pool, slot[0]));
+}
+
+/**
+ * @brief Get the mutable tunnel object stored at a pool index.
+ */
+lisp_gpe_tunnel_2_t *
+lisp_gpe_tunnel_get_i (index_t lgti)
+{
+  lisp_gpe_tunnel_2_t *lgt;
+
+  lgt = pool_elt_at_index (lisp_gpe_tunnel_pool, lgti);
+
+  return (lgt);
+}
+
+/**
+ * @brief Find the tunnel for {local RLOC, remote RLOC, RLOC FIB}, creating
+ * it if it does not exist, and take a lock on it.
+ *
+ * On creation the remote RLOC is added to the RLOC FIB as a special entry
+ * with source FIB_SOURCE_RR and the tunnel is inserted into the DB.
+ *
+ * @param[in] pair            The local/remote locator pair.
+ * @param[in] rloc_fib_index  The FIB index in which the RLOCs are reachable.
+ *
+ * @return The pool index of the tunnel.
+ */
+index_t
+lisp_gpe_tunnel_find_or_create_and_lock (const locator_pair_t * pair,
+                                        u32 rloc_fib_index)
+{
+  lisp_gpe_tunnel_key_t key;
+  lisp_gpe_tunnel_2_t *lgt;
+  fib_prefix_t pfx;
+
+  /*
+   * Stored keys are zero-filled before their fields are populated (below).
+   * Build the lookup key the same way so any padding bytes match.
+   */
+  memset (&key, 0, sizeof (key));
+  key.lcl = pair->lcl_loc;
+  key.rmt = pair->rmt_loc;
+  key.fib_index = rloc_fib_index;
+
+  lgt = lisp_gpe_tunnel_db_find (&key);
+
+  if (NULL == lgt)
+    {
+      pool_get (lisp_gpe_tunnel_pool, lgt);
+      memset (lgt, 0, sizeof (*lgt));
+
+      /* heap-allocated so the address given to the hash outlives pool moves */
+      lgt->key = clib_mem_alloc (sizeof (*lgt->key));
+      memset (lgt->key, 0, sizeof (*lgt->key));
+
+      lgt->key->rmt = pair->rmt_loc;
+      lgt->key->lcl = pair->lcl_loc;
+      lgt->key->fib_index = rloc_fib_index;
+
+      /*
+       * source the FIB entry for the RLOC so we can track its forwarding
+       * chain
+       */
+      ip_address_to_fib_prefix (&lgt->key->rmt, &pfx);
+
+      lgt->fib_entry_index = fib_table_entry_special_add (rloc_fib_index,
+                                                         &pfx,
+                                                         FIB_SOURCE_RR,
+                                                         FIB_ENTRY_FLAG_NONE,
+                                                         ADJ_INDEX_INVALID);
+
+      /*
+       * The hash key is the key structure itself, which is what
+       * lisp_gpe_tunnel_db_find looks up - not the address of the pointer
+       * field inside the pool element.
+       */
+      hash_set_mem (lisp_gpe_tunnel_db, lgt->key,
+                   (lgt - lisp_gpe_tunnel_pool));
+    }
+
+  lgt->locks++;
+
+  return (lgt - lisp_gpe_tunnel_pool);
+}
+
+/**
+ * @brief Release one lock on a tunnel; the last unlock removes it from the
+ * DB and frees it.
+ *
+ * NOTE(review): the FIB entry added when the tunnel was created is not
+ * removed here - confirm whether fib_entry_index needs releasing.
+ *
+ * @param[in] lgti  Pool index of the tunnel.
+ */
+void
+lisp_gpe_tunnel_unlock (index_t lgti)
+{
+  lisp_gpe_tunnel_2_t *lgt;
+
+  lgt = lisp_gpe_tunnel_get_i (lgti);
+  lgt->locks--;
+
+  if (0 == lgt->locks)
+    {
+      /* unset with the same key pointer the entry was set with */
+      hash_unset_mem (lisp_gpe_tunnel_db, lgt->key);
+      clib_mem_free (lgt->key);
+      pool_put (lisp_gpe_tunnel_pool, lgt);
+    }
+}
+
+/**
+ * @brief Read-only accessor for the tunnel at a pool index.
+ */
+const lisp_gpe_tunnel_2_t *
+lisp_gpe_tunnel_get (index_t lgti)
+{
+  const lisp_gpe_tunnel_2_t *lgt = lisp_gpe_tunnel_get_i (lgti);
+
+  return (lgt);
+}
+
+/**
+ * @brief Format a LISP-GPE tunnel: pool index, FIB index, lock count,
+ * locator pair and the FIB entry index for the remote RLOC.
+ *
+ * @param[in] s     The vector to append to.
+ * @param[in] args  Argument list; one lisp_gpe_tunnel_2_t pointer is read.
+ *
+ * @return The vector with the description appended.
+ */
+u8 *
+format_lisp_gpe_tunnel (u8 * s, va_list * args)
+{
+  lisp_gpe_tunnel_2_t *lgt = va_arg (*args, lisp_gpe_tunnel_2_t *);
+  lisp_gpe_tunnel_key_t *key = lgt->key;
+
+  /* identity and reference count */
+  s = format (s, "tunnel %d\n", lgt - lisp_gpe_tunnel_pool);
+  s = format (s, " fib-index: %d, locks:%d \n", key->fib_index, lgt->locks);
+  s = format (s, " lisp ver 0\n");
+
+  /* the RLOC pair and the FIB entry through which the remote is reached */
+  s = format (s, " locator-pair:\n");
+  s = format (s, "  local: %U remote: %U\n",
+             format_ip_address, &key->lcl, format_ip_address, &key->rmt);
+  s = format (s, " RLOC FIB entry: %d\n", lgt->fib_entry_index);
+
+  return s;
+}
+
+/**
+ * CLI command to show LISP-GPE tunnels.
+ *
+ * With a numeric argument the tunnel at that pool index is printed; without
+ * one every tunnel in the pool is printed.
+ *
+ * @param[in] vm     vlib main, used for CLI output.
+ * @param[in] input  Unparsed remainder of the command line.
+ * @param[in] cmd    The CLI command registration (unused).
+ *
+ * @return 0 (no error) always; problems are reported as CLI output.
+ */
+static clib_error_t *
+show_lisp_gpe_tunnel_command_fn (vlib_main_t * vm,
+                                unformat_input_t * input,
+                                vlib_cli_command_t * cmd)
+{
+  lisp_gpe_tunnel_2_t *lgt;
+  index_t index;
+
+  if (pool_elts (lisp_gpe_tunnel_pool) == 0)
+    vlib_cli_output (vm, "No lisp-gpe tunnels configured...");
+
+  if (unformat (input, "%d", &index))
+    {
+      /*
+       * The index comes straight from the user: refuse anything that does
+       * not name a live pool element instead of dereferencing it.
+       */
+      if (pool_is_free_index (lisp_gpe_tunnel_pool, index))
+       {
+         vlib_cli_output (vm, "Invalid lisp-gpe tunnel index: %d", index);
+         return 0;
+       }
+
+      lgt = lisp_gpe_tunnel_get_i (index);
+      vlib_cli_output (vm, "%U", format_lisp_gpe_tunnel, lgt);
+    }
+  else
+    {
+      /* *INDENT-OFF* */
+      pool_foreach (lgt, lisp_gpe_tunnel_pool,
+      ({
+       vlib_cli_output (vm, "%U", format_lisp_gpe_tunnel, lgt);
+      }));
+      /* *INDENT-ON* */
+    }
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+/** CLI registration for the tunnel show command; file-local (static). */
+VLIB_CLI_COMMAND (show_lisp_gpe_tunnel_command, static) =
+{
+  .path = "show lisp gpe tunnel",
+  .short_help = "show lisp gpe tunnel [<index>]",
+  .function = show_lisp_gpe_tunnel_command_fn,
+};
+/* *INDENT-ON* */
+
+/**
+ * @brief Module init: create the tunnel DB.
+ *
+ * The DB is looked up with lisp_gpe_tunnel_key_t objects, so that is the key
+ * size the hash must be created with.
+ *
+ * @param[in] vm  vlib main (unused).
+ *
+ * @return NULL (no error) always.
+ */
+static clib_error_t *
+lisp_gpe_tunnel_module_init (vlib_main_t * vm)
+{
+  lisp_gpe_tunnel_db = hash_create_mem (0,
+                                       sizeof (lisp_gpe_tunnel_key_t),
+                                       sizeof (uword));
+
+  return (NULL);
+}
+
+VLIB_INIT_FUNCTION (lisp_gpe_tunnel_module_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/vnet/vnet/lisp-gpe/lisp_gpe_tunnel.h b/vnet/vnet/lisp-gpe/lisp_gpe_tunnel.h
new file mode 100644 (file)
index 0000000..d417fa9
--- /dev/null
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @file
+ * @brief Common utility functions for IPv4, IPv6 and L2 LISP-GPE tunnels.
+ *
+ */
+
+#ifndef LISP_GPE_TUNNEL_H__
+#define LISP_GPE_TUNNEL_H__
+
+#include <vnet/lisp-gpe/lisp_gpe.h>
+#include <vnet/lisp-gpe/lisp_gpe_packet.h>
+
+/**
+ * Forward declaration, so the adjacency can be named in the prototypes
+ * below without including its header.
+ */
+struct lisp_gpe_adjacency_t_;
+
+/**
+ * A Key for a tunnel: the remote RLOC (rmt), the local RLOC (lcl) and the
+ * index of the FIB in which the RLOCs are reachable (fib_index).
+ */
+typedef struct lisp_gpe_tunnel_key_t_
+{
+  ip_address_t rmt;
+  ip_address_t lcl;
+  u32 fib_index;
+} lisp_gpe_tunnel_key_t;
+
+/**
+ * @brief A LISP GPE Tunnel.
+ *
+ * A tunnel represents an association between a local and remote RLOC.
+ * As such it represents a unique LISP rewrite.
+ */
+typedef struct lisp_gpe_tunnel_2_t_
+{
+  /**
+   * RLOC pair and rloc fib_index. This is the tunnel's key.
+   */
+  lisp_gpe_tunnel_key_t *key;
+
+  /**
+   * number of reference counting locks
+   */
+  u32 locks;
+
+  /**
+   * the FIB entry through which the remote rloc is reachable
+   */
+  fib_node_index_t fib_entry_index;
+} lisp_gpe_tunnel_2_t;
+
+extern index_t lisp_gpe_tunnel_find_or_create_and_lock (const locator_pair_t *
+                                                       pair,
+                                                       u32 rloc_fib_index);
+
+extern void lisp_gpe_tunnel_unlock (index_t lgti);
+
+extern const lisp_gpe_tunnel_2_t *lisp_gpe_tunnel_get (index_t lgti);
+
+extern u8 *lisp_gpe_tunnel_build_rewrite (const lisp_gpe_tunnel_2_t * lgt,
+                                         const struct lisp_gpe_adjacency_t_
+                                         *ladj,
+                                         lisp_gpe_next_protocol_e
+                                         payload_proto);
+extern u8 *format_lisp_gpe_tunnel (u8 * s, va_list * args);
+
+#endif
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
index 5b5bae5..74a9905 100644 (file)
  * limitations under the License.
  */
 
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/ip6_fib.h>
+#include <vnet/adj/adj.h>
+#include <vnet/map/map_dpo.h>
+
 #include "map.h"
 
 #ifndef __SSE4_2__
@@ -159,15 +164,12 @@ map_create_domain (ip4_address_t * ip4_prefix,
                   u8 psid_offset,
                   u8 psid_length, u32 * map_domain_index, u16 mtu, u8 flags)
 {
+  u8 suffix_len, suffix_shift;
   map_main_t *mm = &map_main;
-  ip4_main_t *im4 = &ip4_main;
-  ip6_main_t *im6 = &ip6_main;
+  dpo_id_t dpo_v4 = DPO_NULL;
+  dpo_id_t dpo_v6 = DPO_NULL;
+  fib_node_index_t fei;
   map_domain_t *d;
-  ip_adjacency_t adj;
-  ip4_add_del_route_args_t args4;
-  ip6_add_del_route_args_t args6;
-  u8 suffix_len, suffix_shift;
-  uword *p;
 
   /* Sanity check on the src prefix length */
   if (flags & MAP_DOMAIN_TRANSLATION)
@@ -236,73 +238,82 @@ map_create_domain (ip4_address_t * ip4_prefix,
   d->psid_mask = (1 << d->psid_length) - 1;
   d->ea_shift = 64 - ip6_prefix_len - suffix_len - d->psid_length;
 
-  /* Init IP adjacency */
-  memset (&adj, 0, sizeof (adj));
-  adj.explicit_fib_index = ~0;
-  adj.lookup_next_index =
-    (d->flags & MAP_DOMAIN_TRANSLATION) ? IP_LOOKUP_NEXT_MAP_T :
-    IP_LOOKUP_NEXT_MAP;
-  p = (uword *) & adj.rewrite_data[0];
-  *p = (uword) (*map_domain_index);
+  /* MAP data-plane object */
+  if (d->flags & MAP_DOMAIN_TRANSLATION)
+    map_t_dpo_create (DPO_PROTO_IP4, *map_domain_index, &dpo_v4);
+  else
+    map_dpo_create (DPO_PROTO_IP4, *map_domain_index, &dpo_v4);
+
+  /* Create ip4 route */
+  fib_prefix_t pfx = {
+    .fp_proto = FIB_PROTOCOL_IP4,
+    .fp_len = d->ip4_prefix_len,
+    .fp_addr = {
+               .ip4 = d->ip4_prefix,
+               }
+    ,
+  };
+  fib_table_entry_special_dpo_add (0, &pfx,
+                                  FIB_SOURCE_MAP,
+                                  FIB_ENTRY_FLAG_EXCLUSIVE, &dpo_v4);
+  dpo_reset (&dpo_v4);
 
-  if (ip4_get_route (im4, 0, 0, (u8 *) ip4_prefix, ip4_prefix_len))
+  /*
+   * Multiple MAP domains may share same source IPv6 TEP.
+   * In this case the route will exist and be MAP sourced.
+   * Find the adj (if any) already contributed and modify it
+   */
+  fib_prefix_t pfx6 = {
+    .fp_proto = FIB_PROTOCOL_IP6,
+    .fp_len = d->ip6_src_len,
+    .fp_addr = {
+               .ip6 = d->ip6_src,
+               }
+    ,
+  };
+  fei = fib_table_lookup_exact_match (0, &pfx6);
+
+  if (FIB_NODE_INDEX_INVALID != fei)
     {
-      clib_warning ("IPv4 route already defined: %U/%d", format_ip4_address,
-                   ip4_prefix, ip4_prefix_len);
-      pool_put (mm->domains, d);
-      return -1;
-    }
+      dpo_id_t dpo = DPO_NULL;
 
-  /* Create ip4 adjacency */
-  memset (&args4, 0, sizeof (args4));
-  args4.table_index_or_table_id = 0;
-  args4.flags = IP4_ROUTE_FLAG_ADD;
-  args4.dst_address.as_u32 = ip4_prefix->as_u32;
-  args4.dst_address_length = ip4_prefix_len;
+      if (fib_entry_get_dpo_for_source (fei, FIB_SOURCE_MAP, &dpo))
+       {
+         /*
+          * modify the existing MAP to indicate it's shared
+          * skip to route add.
+          */
+         const dpo_id_t *md_dpo;
+         map_dpo_t *md;
 
-  args4.adj_index = ~0;
-  args4.add_adj = &adj;
-  args4.n_add_adj = 1;
-  ip4_add_del_route (im4, &args4);
+         ASSERT (DPO_LOAD_BALANCE == dpo.dpoi_type);
 
-  /* Multiple MAP domains may share same source IPv6 TEP */
-  u32 ai = ip6_get_route (im6, 0, 0, ip6_src, ip6_src_len);
-  if (ai > 0)
-    {
-      ip_lookup_main_t *lm6 = &ip6_main.lookup_main;
-      ip_adjacency_t *adj6 = ip_get_adjacency (lm6, ai);
-      if (adj6->lookup_next_index != IP_LOOKUP_NEXT_MAP &&
-         adj6->lookup_next_index != IP_LOOKUP_NEXT_MAP_T)
-       {
-         clib_warning ("BR source address already assigned: %U",
-                       format_ip6_address, ip6_src);
-         pool_put (mm->domains, d);
-         return -1;
-       }
-      /* Shared source */
-      p = (uword *) & adj6->rewrite_data[0];
-      p[0] = ~0;
+         md_dpo = load_balance_get_bucket (dpo.dpoi_index, 0);
+         md = map_dpo_get (md_dpo->dpoi_index);
 
-      /*
-       *  Add refcount, so we don't accidentially delete the route
-       *  underneath someone
-       */
-      p[1]++;
+         md->md_domain = ~0;
+         dpo_copy (&dpo_v6, md_dpo);
+         dpo_reset (&dpo);
+
+         goto route_add;
+       }
     }
+
+  if (d->flags & MAP_DOMAIN_TRANSLATION)
+    map_t_dpo_create (DPO_PROTO_IP6, *map_domain_index, &dpo_v6);
   else
-    {
-      /* Create ip6 adjacency. */
-      memset (&args6, 0, sizeof (args6));
-      args6.table_index_or_table_id = 0;
-      args6.flags = IP6_ROUTE_FLAG_ADD;
-      args6.dst_address.as_u64[0] = ip6_src->as_u64[0];
-      args6.dst_address.as_u64[1] = ip6_src->as_u64[1];
-      args6.dst_address_length = ip6_src_len;
-      args6.adj_index = ~0;
-      args6.add_adj = &adj;
-      args6.n_add_adj = 1;
-      ip6_add_del_route (im6, &args6);
-    }
+    map_dpo_create (DPO_PROTO_IP6, *map_domain_index, &dpo_v6);
+
+route_add:
+  /*
+   * Create ip6 route. This is a reference counted add. If the prefix
+   * already exists and is MAP sourced, it is now MAP source n+1 times
+   * and will need to be removed n+1 times.
+   */
+  fib_table_entry_special_dpo_add (0, &pfx6,
+                                  FIB_SOURCE_MAP,
+                                  FIB_ENTRY_FLAG_EXCLUSIVE, &dpo_v6);
+  dpo_reset (&dpo_v6);
 
   /* Validate packet/byte counters */
   map_domain_counter_lock (mm);
@@ -332,12 +343,7 @@ int
 map_delete_domain (u32 map_domain_index)
 {
   map_main_t *mm = &map_main;
-  ip4_main_t *im4 = &ip4_main;
-  ip6_main_t *im6 = &ip6_main;
   map_domain_t *d;
-  ip_adjacency_t adj;
-  ip4_add_del_route_args_t args4;
-  ip6_add_del_route_args_t args6;
 
   if (pool_is_free_index (mm->domains, map_domain_index))
     {
@@ -348,47 +354,26 @@ map_delete_domain (u32 map_domain_index)
 
   d = pool_elt_at_index (mm->domains, map_domain_index);
 
-  memset (&adj, 0, sizeof (adj));
-  adj.explicit_fib_index = ~0;
-  adj.lookup_next_index =
-    (d->flags & MAP_DOMAIN_TRANSLATION) ? IP_LOOKUP_NEXT_MAP_T :
-    IP_LOOKUP_NEXT_MAP;
-
-  /* Delete ip4 adjacency */
-  memset (&args4, 0, sizeof (args4));
-  args4.table_index_or_table_id = 0;
-  args4.flags = IP4_ROUTE_FLAG_DEL;
-  args4.dst_address.as_u32 = d->ip4_prefix.as_u32;
-  args4.dst_address_length = d->ip4_prefix_len;
-  args4.adj_index = 0;
-  args4.add_adj = &adj;
-  args4.n_add_adj = 0;
-  ip4_add_del_route (im4, &args4);
-
-  /* Delete ip6 adjacency */
-  u32 ai = ip6_get_route (im6, 0, 0, &d->ip6_src, d->ip6_src_len);
-  if (ai > 0)
-    {
-      ip_lookup_main_t *lm6 = &ip6_main.lookup_main;
-      ip_adjacency_t *adj6 = ip_get_adjacency (lm6, ai);
-
-      uword *p = (uword *) & adj6->rewrite_data[0];
-      /* Delete route when no other domains use this source */
-      if (p[1] == 0)
-       {
-         memset (&args6, 0, sizeof (args6));
-         args6.table_index_or_table_id = 0;
-         args6.flags = IP6_ROUTE_FLAG_DEL;
-         args6.dst_address.as_u64[0] = d->ip6_src.as_u64[0];
-         args6.dst_address.as_u64[1] = d->ip6_src.as_u64[1];
-         args6.dst_address_length = d->ip6_src_len;
-         args6.adj_index = 0;
-         args6.add_adj = &adj;
-         args6.n_add_adj = 0;
-         ip6_add_del_route (im6, &args6);
-       }
-      p[1]--;
-    }
+  fib_prefix_t pfx = {
+    .fp_proto = FIB_PROTOCOL_IP4,
+    .fp_len = d->ip4_prefix_len,
+    .fp_addr = {
+               .ip4 = d->ip4_prefix,
+               }
+    ,
+  };
+  fib_table_entry_special_remove (0, &pfx, FIB_SOURCE_MAP);
+
+  fib_prefix_t pfx6 = {
+    .fp_proto = FIB_PROTOCOL_IP6,
+    .fp_len = d->ip6_src_len,
+    .fp_addr = {
+               .ip6 = d->ip6_src,
+               }
+    ,
+  };
+  fib_table_entry_special_remove (0, &pfx6, FIB_SOURCE_MAP);
+
   /* Deleting rules */
   if (d->rules)
     clib_mem_free (d->rules);
@@ -448,17 +433,18 @@ static void
 map_pre_resolve (ip4_address_t * ip4, ip6_address_t * ip6)
 {
   map_main_t *mm = &map_main;
-  ip4_main_t *im4 = &ip4_main;
   ip6_main_t *im6 = &ip6_main;
 
   if (ip6->as_u64[0] != 0 || ip6->as_u64[1] != 0)
     {
-      mm->adj6_index = ip6_fib_lookup_with_table (im6, 0, ip6);
+      // FIXME: the lookup result is NOT an adjacency index
+      mm->adj6_index = ip6_fib_table_fwding_lookup (im6, 0, ip6);
       clib_warning ("FIB lookup results in: %u", mm->adj6_index);
     }
   if (ip4->as_u32 != 0)
     {
-      mm->adj4_index = ip4_fib_lookup_with_table (im4, 0, ip4, 0);
+      // FIXME: the lookup result is NOT an adjacency index
+      mm->adj4_index = ip4_fib_table_lookup_lb (0, ip4);
       clib_warning ("FIB lookup results in: %u", mm->adj4_index);
     }
 }
@@ -2156,6 +2142,8 @@ map_init (vlib_main_t * vm)
   mm->ip6_reass_fifo_last = MAP_REASS_INDEX_NONE;
   map_ip6_reass_reinit (NULL, NULL);
 
+  map_dpo_module_init ();
+
   return 0;
 }
 
index fb53229..b76891b 100644 (file)
 #include <vnet/vnet.h>
 #include <vnet/ip/ip.h>
 #include <vlib/vlib.h>
+#include <vnet/fib/fib_types.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/adj/adj.h>
+#include <vnet/map/map_dpo.h>
+#include <vnet/dpo/load_balance.h>
 
 #define MAP_SKIP_IP6_LOOKUP 1
 
@@ -105,6 +110,9 @@ typedef struct
   u8 ip4_prefix_len;
 } map_domain_t;
 
+_Static_assert ((sizeof (map_domain_t) <= CLIB_CACHE_LINE_BYTES),
+               "MAP domain fits in one cacheline");
+
 #define MAP_REASS_INDEX_NONE ((u16)0xffff)
 
 /*
@@ -381,16 +389,17 @@ map_get_ip4 (ip6_address_t *addr)
  * Get the MAP domain from an IPv4 lookup adjacency.
  */
 static_always_inline map_domain_t *
-ip4_map_get_domain (u32 adj_index, u32 *map_domain_index)
+ip4_map_get_domain (u32 mdi,
+                   u32 *map_domain_index)
 {
   map_main_t *mm = &map_main;
-  ip_lookup_main_t *lm = &ip4_main.lookup_main;
-  ip_adjacency_t *adj = ip_get_adjacency(lm, adj_index);
-  ASSERT(adj);
-  uword *p = (uword *)adj->rewrite_data;
-  ASSERT(p);
-  *map_domain_index = p[0];
-  return pool_elt_at_index(mm->domains, p[0]);
+  map_dpo_t *md;
+
+  md = map_dpo_get(mdi);
+
+  ASSERT(md);
+  *map_domain_index = md->md_domain;
+  return pool_elt_at_index(mm->domains, *map_domain_index);
 }
 
 /*
@@ -399,36 +408,34 @@ ip4_map_get_domain (u32 adj_index, u32 *map_domain_index)
  * The IPv4 address is used otherwise.
  */
 static_always_inline map_domain_t *
-ip6_map_get_domain (u32 adj_index, ip4_address_t *addr,
+ip6_map_get_domain (u32 mdi, ip4_address_t *addr,
                     u32 *map_domain_index, u8 *error)
 {
   map_main_t *mm = &map_main;
-  ip4_main_t *im4 = &ip4_main;
-  ip_lookup_main_t *lm4 = &ip4_main.lookup_main;
+  map_dpo_t *md;
 
   /*
    * Disable direct MAP domain lookup on decap, until the security check is updated to verify IPv4 SA.
    * (That's done implicitly when MAP domain is looked up in the IPv4 FIB)
    */
 #ifdef MAP_NONSHARED_DOMAIN_ENABLED
-  ip_lookup_main_t *lm6 = &ip6_main.lookup_main;
-  ip_adjacency_t *adj = ip_get_adjacency(lm6, adj_index);
-  ASSERT(adj);
-  uword *p = (uword *)adj->rewrite_data;
-  ASSERT(p);
-  *map_domain_index = p[0];
-  if (p[0] != ~0)
-    return pool_elt_at_index(mm->domains, p[0]);
-#endif
+  md = map_dpo_get(mdi);
 
-  u32 ai = ip4_fib_lookup_with_table(im4, 0, addr, 0);
-  ip_adjacency_t *adj4 = ip_get_adjacency (lm4, ai);
-  if (PREDICT_TRUE(adj4->lookup_next_index == IP_LOOKUP_NEXT_MAP ||
-                  adj4->lookup_next_index == IP_LOOKUP_NEXT_MAP_T)) {
-    uword *p = (uword *)adj4->rewrite_data;
-    *map_domain_index = p[0];
+  ASSERT(md);
+  *map_domain_index = md->md_domain;
+  if (*map_domain_index != ~0)
     return pool_elt_at_index(mm->domains, *map_domain_index);
-  }
+#endif
+
+  u32 lbi = ip4_fib_forwarding_lookup(0, addr);
+  const dpo_id_t *dpo = load_balance_get_bucket(lbi, 0);
+  if (PREDICT_TRUE(dpo->dpoi_type == map_dpo_type ||
+                  dpo->dpoi_type == map_t_dpo_type))
+    {
+      md = map_dpo_get(dpo->dpoi_index);
+     *map_domain_index = md->md_domain;
+      return pool_elt_at_index(mm->domains, *map_domain_index);
+    }
   *error = MAP_ERROR_NO_DOMAIN;
   return NULL;
 }
diff --git a/vnet/vnet/map/map_dpo.c b/vnet/vnet/map/map_dpo.c
new file mode 100644 (file)
index 0000000..df2b5fa
--- /dev/null
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/map/map_dpo.h>
+
+/**
+ * pool of all MAP DPOs
+ */
+map_dpo_t *map_dpo_pool;
+
+/**
+ * The registered MAP and MAP-T DPO types
+ */
+dpo_type_t map_dpo_type;
+dpo_type_t map_t_dpo_type;
+
+static map_dpo_t *
+map_dpo_alloc (void)
+{
+    map_dpo_t *md;
+
+    pool_get_aligned(map_dpo_pool, md, CLIB_CACHE_LINE_BYTES);
+    memset(md, 0, sizeof(*md));
+
+    return (md);
+}
+
+static index_t
+map_dpo_get_index (map_dpo_t *md)
+{
+    return (md - map_dpo_pool);
+}
+
+void
+map_dpo_create (dpo_proto_t dproto,
+               u32 domain_index,
+               dpo_id_t *dpo)
+{
+    map_dpo_t *md;
+
+    md = map_dpo_alloc();
+    md->md_domain = domain_index;
+    md->md_proto = dproto;
+
+    dpo_set(dpo,
+           map_dpo_type,
+           dproto,
+           map_dpo_get_index(md));
+}
+
+void
+map_t_dpo_create (dpo_proto_t dproto,
+                 u32 domain_index,
+                 dpo_id_t *dpo)
+{
+    map_dpo_t *md;
+
+    md = map_dpo_alloc();
+    md->md_domain = domain_index;
+    md->md_proto = dproto;
+
+    dpo_set(dpo,
+           map_t_dpo_type,
+           dproto,
+           map_dpo_get_index(md));
+}
+
+
+u8*
+format_map_dpo (u8 *s, va_list *args)
+{
+    index_t index = va_arg (*args, index_t);
+    CLIB_UNUSED(u32 indent) = va_arg (*args, u32);
+    map_dpo_t *md;
+
+    md = map_dpo_get(index);
+
+    return (format(s, "map:[%d]:%U domain:%d",
+                  index,
+                   format_dpo_proto, md->md_proto,
+                  md->md_domain));
+}
+
+u8*
+format_map_t_dpo (u8 *s, va_list *args)
+{
+    index_t index = va_arg (*args, index_t);
+    CLIB_UNUSED(u32 indent) = va_arg (*args, u32);
+    map_dpo_t *md;
+
+    md = map_dpo_get(index);
+
+    return (format(s, "map-t:[%d]:%U domain:%d",
+                  index,
+                   format_dpo_proto, md->md_proto,
+                  md->md_domain));
+}
+
+
+static void
+map_dpo_lock (dpo_id_t *dpo)
+{
+    map_dpo_t *md;
+
+    md = map_dpo_get(dpo->dpoi_index);
+
+    md->md_locks++;
+}
+
+static void
+map_dpo_unlock (dpo_id_t *dpo)
+{
+    map_dpo_t *md;
+
+    md = map_dpo_get(dpo->dpoi_index);
+
+    md->md_locks--;
+
+    if (0 == md->md_locks)
+    {
+       pool_put(map_dpo_pool, md);
+    }
+}
+
+const static dpo_vft_t md_vft = {
+    .dv_lock = map_dpo_lock,
+    .dv_unlock = map_dpo_unlock,
+    .dv_format = format_map_dpo,
+};
+
+const static char* const map_ip4_nodes[] =
+{
+    "ip4-map",
+    NULL,
+};
+const static char* const map_ip6_nodes[] =
+{
+    "ip6-map",
+    NULL,
+};
+
+const static char* const * const map_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = map_ip4_nodes,
+    [DPO_PROTO_IP6]  = map_ip6_nodes,
+    [DPO_PROTO_MPLS] = NULL,
+};
+
+const static dpo_vft_t md_t_vft = {
+    .dv_lock = map_dpo_lock,
+    .dv_unlock = map_dpo_unlock,
+    .dv_format = format_map_t_dpo,
+};
+
+const static char* const map_t_ip4_nodes[] =
+{
+    "ip4-map-t",
+    NULL,
+};
+const static char* const map_t_ip6_nodes[] =
+{
+    "ip6-map-t",
+    NULL,
+};
+
+const static char* const * const map_t_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = map_t_ip4_nodes,
+    [DPO_PROTO_IP6]  = map_t_ip6_nodes,
+    [DPO_PROTO_MPLS] = NULL,
+};
+
+void
+map_dpo_module_init (void)
+{
+    map_dpo_type = dpo_register_new_type(&md_vft, map_nodes);
+    map_t_dpo_type = dpo_register_new_type(&md_t_vft, map_t_nodes);
+}
diff --git a/vnet/vnet/map/map_dpo.h b/vnet/vnet/map/map_dpo.h
new file mode 100644 (file)
index 0000000..be510db
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __MAP_DPO_H__
+#define __MAP_DPO_H__
+
+#include <vnet/vnet.h>
+#include <vnet/dpo/dpo.h>
+
+/**
+ * A representation of a MAP DPO
+ */
+typedef struct map_dpo_t
+{
+    /**
+     * The data-plane protocol
+     */
+    dpo_proto_t md_proto;
+
+    /**
+     * the MAP domain index
+     */
+    u32 md_domain;
+
+    /**
+     * Number of locks/users of this MAP DPO
+     */
+    u16 md_locks;
+} map_dpo_t;
+
+extern void map_dpo_create (dpo_proto_t dproto,
+                           u32 domain_index,
+                           dpo_id_t *dpo);
+extern void map_t_dpo_create (dpo_proto_t dproto,
+                             u32 domain_index,
+                             dpo_id_t *dpo);
+
+extern u8* format_map_dpo(u8 *s, va_list *args);
+
+/*
+ * Encapsulation violation for fast data-path access
+ */
+extern map_dpo_t *map_dpo_pool;
+extern dpo_type_t map_dpo_type;
+extern dpo_type_t map_t_dpo_type;
+
+static inline map_dpo_t *
+map_dpo_get (index_t index)
+{
+    return (pool_elt_at_index(map_dpo_pool, index));
+}
+
+extern void map_dpo_module_init(void);
+
+#endif
index 4561d7c..be80c9f 100644 (file)
@@ -40,91 +40,91 @@ mcast_test_command_fn (vlib_main_t * vm,
                 unformat_input_t * input,
                 vlib_cli_command_t * cmd)
 {
-  u8 *rewrite_data;
-  mcast_test_main_t * mtm = &mcast_test_main;
-  mcast_main_t * mcm = mtm->mcast_main;
-  ip_adjacency_t adj;
-  u32 adj_index;
-  mcast_group_t * g;
-  mcast_group_member_t * member;
-  unformat_input_t _line_input, * line_input = &_line_input;
-  ip4_address_t dst_addr, zero;
-  ip4_main_t * im = &ip4_main;
-  ip_lookup_main_t * lm = &im->lookup_main;
-
-  /* Get a line of input. */
-  if (! unformat_user (input, unformat_line_input, line_input))
-    return 0;
-
-  pool_get (mcm->groups, g);
-  memset (g, 0, sizeof (*g));
-
-  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
-    {
-      vnet_hw_interface_t *hw;
-      u32 next, sw_if_index;
-
-      if (unformat (line_input, "%U", unformat_vnet_sw_interface, 
-                    mtm->vnet_main, &sw_if_index)) 
-        {
-          vec_add2 (g->members, member, 1);
-          member->tx_sw_if_index = sw_if_index;
+  /* u8 *rewrite_data; */
+  /* mcast_test_main_t * mtm = &mcast_test_main; */
+  /* mcast_main_t * mcm = mtm->mcast_main; */
+  /* ip_adjacency_t adj; */
+  /* u32 adj_index; */
+  /* mcast_group_t * g; */
+  /* mcast_group_member_t * member; */
+  /* unformat_input_t _line_input, * line_input = &_line_input; */
+  /* ip4_address_t dst_addr, zero; */
+  /* ip4_main_t * im = &ip4_main; */
+  /* ip_lookup_main_t * lm = &im->lookup_main; */
+
+  /* /\* Get a line of input. *\/ */
+  /* if (! unformat_user (input, unformat_line_input, line_input)) */
+  /*   return 0; */
+
+  /* pool_get (mcm->groups, g); */
+  /* memset (g, 0, sizeof (*g)); */
+
+  /* while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) */
+  /*   { */
+  /*     vnet_hw_interface_t *hw; */
+  /*     u32 next, sw_if_index; */
+
+  /*     if (unformat (line_input, "%U", unformat_vnet_sw_interface,  */
+  /*                   mtm->vnet_main, &sw_if_index))  */
+  /*       { */
+  /*         vec_add2 (g->members, member, 1); */
+  /*         member->tx_sw_if_index = sw_if_index; */
           
-          hw = vnet_get_sup_hw_interface (mtm->vnet_main, 
-                                          sw_if_index);
+  /*         hw = vnet_get_sup_hw_interface (mtm->vnet_main,  */
+  /*                                         sw_if_index); */
           
-          next = vlib_node_add_next (mtm->vlib_main, 
-                                     mcast_prep_node.index,
-                                     hw->output_node_index);
+  /*         next = vlib_node_add_next (mtm->vlib_main,  */
+  /*                                    mcast_prep_node.index, */
+  /*                                    hw->output_node_index); */
           
-          /* Required to be the same next index... */
-          vlib_node_add_next_with_slot (mtm->vlib_main,
-                                        mcast_recycle_node.index,
-                                        hw->output_node_index, next);
-          member->prep_and_recycle_node_next_index = next;
-        }
-      else
-        {
-          return unformat_parse_error (line_input);
-        }
-    }
-
-  if (vec_len (g->members) == 0)
-    {
-      pool_put (mcm->groups, g);
-      vlib_cli_output (vm, "no group members specified");
-      return 0;
-    }
-
-
-  adj.lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
-  adj.mcast_group_index = g - mcm->groups;
-  rewrite_data = format (0, "abcdefg");
-
-  vnet_rewrite_for_tunnel
-    (mtm->vnet_main,
-     (u32)~0, /* tx_sw_if_index, we dont know yet */
-     ip4_rewrite_node.index,
-     mcast_prep_node.index,
-     &adj.rewrite_header,
-     rewrite_data, vec_len(rewrite_data));
-
-  ip_add_adjacency (lm, &adj, 1 /* one adj */,
-                    &adj_index);
+  /*         /\* Required to be the same next index... *\/ */
+  /*         vlib_node_add_next_with_slot (mtm->vlib_main, */
+  /*                                       mcast_recycle_node.index, */
+  /*                                       hw->output_node_index, next); */
+  /*         member->prep_and_recycle_node_next_index = next; */
+  /*       } */
+  /*     else */
+  /*       { */
+  /*         return unformat_parse_error (line_input); */
+  /*       } */
+  /*   } */
+
+  /* if (vec_len (g->members) == 0) */
+  /*   { */
+  /*     pool_put (mcm->groups, g); */
+  /*     vlib_cli_output (vm, "no group members specified"); */
+  /*     return 0; */
+  /*   } */
+
+
+  /* adj.lookup_next_index = IP_LOOKUP_NEXT_REWRITE; */
+  /* adj.mcast_group_index = g - mcm->groups; */
+  /* rewrite_data = format (0, "abcdefg"); */
+
+  /* vnet_rewrite_for_tunnel */
+  /*   (mtm->vnet_main, */
+  /*    (u32)~0, /\* tx_sw_if_index, we dont know yet *\/ */
+  /*    ip4_rewrite_node.index, */
+  /*    mcast_prep_node.index, */
+  /*    &adj.rewrite_header, */
+  /*    rewrite_data, vec_len(rewrite_data)); */
+
+  /* ip_add_adjacency (lm, &adj, 1 /\* one adj *\/, */
+  /*                   &adj_index); */
   
-  dst_addr.as_u32 = clib_host_to_net_u32 (0x0a000002);
-  zero.as_u32 = 0;
-
-  ip4_add_del_route_next_hop (im,
-                              IP4_ROUTE_FLAG_ADD,
-                              &dst_addr,
-                              24 /* mask width */,
-                              &zero /* no next hop */,
+  /* dst_addr.as_u32 = clib_host_to_net_u32 (0x0a000002); */
+  /* zero.as_u32 = 0; */
+
+  /* ip4_add_del_route_next_hop (im, */
+  /*                             IP4_ROUTE_FLAG_ADD, */
+  /*                             &dst_addr, */
+  /*                             24 /\* mask width *\/, */
+  /*                             &zero /\* no next hop *\/, */
                           
-                              0, // next hop sw if index
-                              1, // weight
-                              adj_index,
-                              0 /* explicit fib 0 */);
+  /*                             0, // next hop sw if index */
+  /*                             1, // weight */
+  /*                             adj_index, */
+  /*                             0 /\* explicit fib 0 *\/); */
 
   return 0;
 }
index c0729f7..4c8c4ca 100644 (file)
@@ -38,6 +38,7 @@
  */
 
 #include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
 
 vnet_main_t vnet_main;
 
@@ -79,6 +80,9 @@ vnet_main_init (vlib_main_t * vm)
   if ((error = vlib_call_init_function (vm, vnet_interface_init)))
     return error;
 
+  if ((error = vlib_call_init_function (vm, fib_module_init)))
+    return error;
+
   if ((error = vlib_call_init_function (vm, ip_main_init)))
     return error;
 
@@ -88,6 +92,9 @@ vnet_main_init (vlib_main_t * vm)
   if ((error = vlib_call_init_function (vm, ip6_lookup_init)))
     return error;
 
+  if ((error = vlib_call_init_function (vm, mpls_init)))
+    return error;
+
   vnm->vlib_main = vm;
 
   hw_if_index = vnet_register_interface
@@ -98,6 +105,11 @@ vnet_main_init (vlib_main_t * vm)
   vnm->local_interface_hw_if_index = hw_if_index;
   vnm->local_interface_sw_if_index = hw->sw_if_index;
 
+  /* The local interface is used as an input interface when decapping from
+   * an IPsec tunnel, so it needs to be IP enabled. */
+  ip4_sw_interface_enable_disable (hw->sw_if_index, 1);
+  ip6_sw_interface_enable_disable (hw->sw_if_index, 1);
+
   return 0;
 }
 
diff --git a/vnet/vnet/mpls-gre/node.c b/vnet/vnet/mpls-gre/node.c
deleted file mode 100644 (file)
index 474e2e2..0000000
+++ /dev/null
@@ -1,363 +0,0 @@
-/*
- * node.c: mpls-o-gre decap processing
- *
- * Copyright (c) 2012-2014 Cisco and/or its affiliates.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <vlib/vlib.h>
-#include <vnet/pg/pg.h>
-#include <vnet/mpls-gre/mpls.h>
-
-typedef struct {
-  u32 next_index;
-  u32 decap_index;
-  u32 tx_fib_index;
-  u32 label_host_byte_order;
-} mpls_rx_trace_t;
-
-u8 * format_mpls_rx_trace (u8 * s, va_list * args)
-{
-  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
-  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
-  mpls_rx_trace_t * t = va_arg (*args, mpls_rx_trace_t *);
-  char * next_name;
-
-  next_name = "BUG!";
-
-#define _(a,b) if (t->next_index == MPLS_INPUT_NEXT_##a) next_name = b;
-  foreach_mpls_input_next;
-#undef _
-  
-  s = format (s, "MPLS: next %s, lookup fib index %d, decap index %d\n",
-              next_name, t->next_index, t->tx_fib_index, t->decap_index);
-  if (t->decap_index != ~0)
-    {
-      s = format (s, "    label %d", 
-                  vnet_mpls_uc_get_label(t->label_host_byte_order));
-    }
-  return s;
-}
-
-vlib_node_registration_t mpls_input_node;
-
-typedef struct {
-  u32 last_label;
-  u32 last_inner_fib_index;
-  u32 last_outer_fib_index;
-  mpls_main_t * mpls_main;
-} mpls_input_runtime_t;
-
-static inline uword
-mpls_input_inline (vlib_main_t * vm,
-                   vlib_node_runtime_t * node,
-                   vlib_frame_t * from_frame, int is_mpls_o_gre)
-{
-  u32 n_left_from, next_index, * from, * to_next;
-  ip4_main_t * im = &ip4_main;
-  from = vlib_frame_vector_args (from_frame);
-  n_left_from = from_frame->n_vectors;
-  mpls_input_runtime_t * rt;
-  mpls_main_t * mm;
-
-  rt = vlib_node_get_runtime_data (vm, mpls_input_node.index);
-  mm = rt->mpls_main;
-  /* 
-   * Force an initial lookup every time, in case the control-plane
-   * changed the label->FIB mapping.
-   */
-  rt->last_label = ~0;
-
-  next_index = node->cached_next_index;
-
-  while (n_left_from > 0)
-    {
-      u32 n_left_to_next;
-
-      vlib_get_next_frame (vm, node, next_index,
-                          to_next, n_left_to_next);
-
-#if 0
-      while (n_left_from >= 4 && n_left_to_next >= 2)
-       {
-         u32 bi0, bi1;
-         vlib_buffer_t * b0, * b1;
-         mpls_unicast_header_t * h0, * h1;
-          int li0, li1;
-          u64 key0, key1;
-          u32 label0, label1;
-         u32 next0, next1;
-         uword * p0, * p1;
-          u32 fib_index0, fib_index1;
-
-         /* Prefetch next iteration. */
-         {
-           vlib_buffer_t * p2, * p3;
-
-           p2 = vlib_get_buffer (vm, from[2]);
-           p3 = vlib_get_buffer (vm, from[3]);
-
-           vlib_prefetch_buffer_header (p2, LOAD);
-           vlib_prefetch_buffer_header (p3, LOAD);
-
-           CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
-           CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
-         }
-
-         bi0 = from[0];
-         bi1 = from[1];
-         to_next[0] = bi0;
-         to_next[1] = bi1;
-         from += 2;
-         to_next += 2;
-         n_left_to_next -= 2;
-         n_left_from -= 2;
-
-         b0 = vlib_get_buffer (vm, bi0);
-         b1 = vlib_get_buffer (vm, bi1);
-
-          /* $$$$$ dual loop me */
-
-          vlib_buffer_advance (b0, sizeof (*h0));
-          vlib_buffer_advance (b1, sizeof (*h1));
-
-         vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
-                                          to_next, n_left_to_next,
-                                          bi0, bi1, next0, next1);
-       }
-    
-#endif
-
-      while (n_left_from > 0 && n_left_to_next > 0)
-       {
-         u32 bi0;
-         vlib_buffer_t * b0;
-         mpls_unicast_header_t * h0;
-          u32 label0;
-         u32 next0;
-          u64 key0;
-         uword * p0;
-          u32 rx_fib_index0;
-          mpls_decap_t *d0;
-
-         bi0 = from[0];
-         to_next[0] = bi0;
-         from += 1;
-         to_next += 1;
-         n_left_from -= 1;
-         n_left_to_next -= 1;
-
-         b0 = vlib_get_buffer (vm, bi0);
-          h0 = vlib_buffer_get_current (b0);
-
-          if (is_mpls_o_gre)
-            {
-              rx_fib_index0 = vec_elt (im->fib_index_by_sw_if_index, 
-                                       vnet_buffer(b0)->sw_if_index[VLIB_RX]);
-            }
-          else
-            {
-#if 0
-              /* If separate RX numbering spaces are required... */
-              rx_fib_index0 = vec_elt (mm->fib_index_by_sw_if_index, 
-                                       vnet_buffer(b0)->sw_if_index[VLIB_RX]);
-#endif
-              rx_fib_index0 = 0;
-            }
-          
-          next0 = ~0;
-          d0 = 0;
-
-          /* 
-           * Expect the control-plane team to squeal like pigs.
-           * If they don't program a decap label entry for each
-           * and every label in the stack, packets go into the trash...
-           */
-
-          do
-            {
-              label0 = clib_net_to_host_u32 (h0->label_exp_s_ttl);
-              /* TTL expired? */
-              if (PREDICT_FALSE(vnet_mpls_uc_get_ttl (label0) == 0))
-                {
-                  next0 = MPLS_INPUT_NEXT_DROP;
-                  b0->error = node->errors[MPLS_ERROR_TTL_EXPIRED];
-                  break;
-                }
-              
-              key0 = ((u64)rx_fib_index0<<32) 
-                | ((u64)vnet_mpls_uc_get_label (label0)<<12) 
-                | ((u64)vnet_mpls_uc_get_s (label0)<<8);
-
-              /* 
-               * The architecture crew claims that we won't need
-               * separate ip4, ip6, mpls-o-ethernet label numbering
-               * spaces. Use the low 8 key bits as a discriminator.
-               */
-
-              p0 = hash_get (mm->mpls_decap_by_rx_fib_and_label, key0);
-              if (p0 == 0)
-                {
-                  next0 = MPLS_INPUT_NEXT_DROP;
-                  b0->error = node->errors[MPLS_ERROR_BAD_LABEL];
-                  break;
-                }
-              d0 = pool_elt_at_index (mm->decaps, p0[0]);
-              next0 = d0->next_index;
-              vnet_buffer(b0)->sw_if_index[VLIB_TX] = d0->tx_fib_index;
-              vlib_buffer_advance (b0, sizeof (*h0));
-              h0 = vlib_buffer_get_current (b0);
-            } while (!vnet_mpls_uc_get_s(label0));
-
-          if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) 
-            {
-              mpls_rx_trace_t *tr = vlib_add_trace (vm, node, 
-                                                   b0, sizeof (*tr));
-              tr->next_index = next0;
-              tr->decap_index = d0 ? d0 - mm->decaps : ~0;
-              tr->tx_fib_index = vnet_buffer(b0)->sw_if_index[VLIB_TX];
-              tr->label_host_byte_order = label0;
-            }
-
-         vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
-                                          to_next, n_left_to_next,
-                                          bi0, next0);
-       }
-
-      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
-    }
-  vlib_node_increment_counter (vm, mpls_input_node.index,
-                               MPLS_ERROR_PKTS_DECAP, from_frame->n_vectors);
-  return from_frame->n_vectors;
-}
-
-static uword
-mpls_input (vlib_main_t * vm,
-            vlib_node_runtime_t * node,
-            vlib_frame_t * from_frame)
-{
-  return mpls_input_inline (vm, node, from_frame, 1 /* is mpls-o-gre */);
-}
-
-static char * mpls_error_strings[] = {
-#define mpls_error(n,s) s,
-#include "error.def"
-#undef mpls_error
-};
-
-VLIB_REGISTER_NODE (mpls_input_node) = {
-  .function = mpls_input,
-  .name = "mpls-gre-input",
-  /* Takes a vector of packets. */
-  .vector_size = sizeof (u32),
-
-  .runtime_data_bytes = sizeof(mpls_input_runtime_t),
-
-  .n_errors = MPLS_N_ERROR,
-  .error_strings = mpls_error_strings,
-
-  .n_next_nodes = MPLS_INPUT_N_NEXT,
-  .next_nodes = {
-#define _(s,n) [MPLS_INPUT_NEXT_##s] = n,
-    foreach_mpls_input_next
-#undef _
-  },
-
-  .format_buffer = format_mpls_gre_header_with_length,
-  .format_trace = format_mpls_rx_trace,
-  .unformat_buffer = unformat_mpls_gre_header,
-};
-
-VLIB_NODE_FUNCTION_MULTIARCH (mpls_input_node, mpls_input)
-
-static uword
-mpls_ethernet_input (vlib_main_t * vm,
-                     vlib_node_runtime_t * node,
-                     vlib_frame_t * from_frame)
-{
-  return mpls_input_inline (vm, node, from_frame, 0 /* is mpls-o-gre */);
-}
-
-
-VLIB_REGISTER_NODE (mpls_ethernet_input_node) = {
-  .function = mpls_ethernet_input,
-  .name = "mpls-ethernet-input",
-  /* Takes a vector of packets. */
-  .vector_size = sizeof (u32),
-
-  .runtime_data_bytes = sizeof(mpls_input_runtime_t),
-
-  .n_errors = MPLS_N_ERROR,
-  .error_strings = mpls_error_strings,
-
-  .n_next_nodes = MPLS_INPUT_N_NEXT,
-  .next_nodes = {
-#define _(s,n) [MPLS_INPUT_NEXT_##s] = n,
-    foreach_mpls_input_next
-#undef _
-  },
-
-  .format_buffer = format_mpls_eth_header_with_length,
-  .format_trace = format_mpls_rx_trace,
-  .unformat_buffer = unformat_mpls_gre_header,
-};
-
-VLIB_NODE_FUNCTION_MULTIARCH (mpls_ethernet_input_node, mpls_ethernet_input)
-
-static void
-mpls_setup_nodes (vlib_main_t * vm)
-{
-  vlib_node_t * n = vlib_get_node (vm, mpls_input_node.index);
-  pg_node_t * pn = pg_get_node (mpls_input_node.index);
-  mpls_input_runtime_t * rt;
-
-  n->format_buffer = format_mpls_gre_header_with_length;
-  n->unformat_buffer = unformat_mpls_gre_header;
-  pn->unformat_edit = unformat_pg_mpls_header;
-
-  rt = vlib_node_get_runtime_data (vm, mpls_input_node.index);
-  rt->last_label = (u32) ~0;
-  rt->last_inner_fib_index = 0;
-  rt->last_outer_fib_index = 0;
-  rt->mpls_main = &mpls_main;
-
-  n = vlib_get_node (vm, mpls_ethernet_input_node.index);
-
-  n->format_buffer = format_mpls_eth_header_with_length;
-
-  n->unformat_buffer = 0; /* unformat_mpls_ethernet_header; */
-
-  rt = vlib_node_get_runtime_data (vm, mpls_ethernet_input_node.index);
-  rt->last_label = (u32) ~0;
-  rt->last_inner_fib_index = 0;
-  rt->last_outer_fib_index = 0;
-  rt->mpls_main = &mpls_main;
-
-  ethernet_register_input_type (vm, ETHERNET_TYPE_MPLS_UNICAST,
-                                mpls_ethernet_input_node.index);
-}
-
-static clib_error_t * mpls_input_init (vlib_main_t * vm)
-{
-  clib_error_t * error; 
-
-  error = vlib_call_init_function (vm, mpls_init);
-  if (error)
-    clib_error_report (error);
-
-  mpls_setup_nodes (vm);
-
-  return 0;
-}
-
-VLIB_INIT_FUNCTION (mpls_input_init);
diff --git a/vnet/vnet/mpls-gre/packet.h b/vnet/vnet/mpls-gre/packet.h
deleted file mode 100644 (file)
index baa0181..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-#ifndef included_vnet_mpls_packet_h
-#define included_vnet_mpls_packet_h
-
-/*
- * MPLS packet format
- *
- * Copyright (c) 2012 Cisco and/or its affiliates.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-typedef struct {
-    /* Label: top 20 bits [in network byte order] */
-    /* Experimental: 3 bits ... */
-    /* S (bottom of label stack): 1 bit */
-    /* TTL: 8 bits */
-    u32 label_exp_s_ttl;
-} mpls_unicast_header_t;
-
-static inline u32 vnet_mpls_uc_get_label (u32 label_exp_s_ttl)
-{
-    return (label_exp_s_ttl>>12);
-}
-
-static inline u32 vnet_mpls_uc_get_exp (u32 label_exp_s_ttl)
-{
-    return ((label_exp_s_ttl>>9) & 0x7);
-}
-
-static inline u32 vnet_mpls_uc_get_s (u32 label_exp_s_ttl)
-{
-    return ((label_exp_s_ttl>>8) & 0x1);
-}
-
-static inline u32 vnet_mpls_uc_get_ttl (u32 label_exp_s_ttl)
-{
-    return (label_exp_s_ttl & 0xff);
-}
-
-#endif /* included_vnet_mpls_packet_h */
similarity index 91%
rename from vnet/vnet/mpls-gre/error.def
rename to vnet/vnet/mpls/error.def
index 424ab50..de8b966 100644 (file)
@@ -26,3 +26,6 @@ mpls_error (S_NOT_SET, "MPLS-GRE s-bit not set")
 mpls_error (BAD_LABEL, "invalid FIB id in label")
 mpls_error (NOT_IP4, "non-ip4 packets dropped")
 mpls_error (DISALLOWED_FIB, "disallowed FIB id")
+mpls_error (NOT_ENABLED, "MPLS not enabled")
+mpls_error (DROP, "MPLS DROP DPO")
+mpls_error (PUNT, "MPLS PUNT DPO")
similarity index 69%
rename from vnet/vnet/mpls-gre/interface.c
rename to vnet/vnet/mpls/interface.c
index dd61a80..9ef4c29 100644 (file)
 #include <vnet/vnet.h>
 #include <vnet/pg/pg.h>
 #include <vnet/gre/gre.h>
-#include <vnet/mpls-gre/mpls.h>
+#include <vnet/mpls/mpls.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/adj/adj_midchain.h>
+#include <vnet/dpo/classify_dpo.h>
 
 static uword mpls_gre_set_rewrite (vnet_main_t * vnm,
                               u32 sw_if_index,
@@ -525,24 +528,23 @@ VNET_HW_INTERFACE_CLASS (mpls_eth_hw_interface_class) = {
   .set_rewrite = mpls_eth_set_rewrite,
 };
 
-#define foreach_mpls_post_rewrite_next \
-  _ (IP4_LOOKUP, "ip4-lookup")
-
-typedef enum {
-#define _(s,n) MPLS_POST_REWRITE_NEXT_##s,
-  foreach_mpls_post_rewrite_next
-#undef _
-  MPLS_POST_REWRITE_N_NEXT,
-} mpls_post_rewrite_next_t;
-
+/**
+ * A conversion of DPO next object types to VLIB graph next nodes from
+ * the mpls_post_rewrite node
+ */
+static const int dpo_next_2_mpls_post_rewrite[DPO_LAST] = {
+    [DPO_LOAD_BALANCE] = IP_LOOKUP_NEXT_LOAD_BALANCE,
+};
 
 static uword
 mpls_post_rewrite (vlib_main_t * vm,
                    vlib_node_runtime_t * node,
                    vlib_frame_t * from_frame)
 {
+  ip4_main_t * im = &ip4_main;
+  ip_lookup_main_t * lm = &im->lookup_main;
   u32 n_left_from, next_index, * from, * to_next;
-  u16 old_l0 = 0, old_l1 = 0;
+  u16 old_l0 = 0; //, old_l1 = 0;
 
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
@@ -556,78 +558,103 @@ mpls_post_rewrite (vlib_main_t * vm,
       vlib_get_next_frame (vm, node, next_index,
                           to_next, n_left_to_next);
 
-      while (n_left_from >= 4 && n_left_to_next >= 2)
-       {
-         u32 bi0, bi1;
-         vlib_buffer_t * b0, * b1;
-          ip4_header_t * ip0, * ip1;
-         u32 next0 = MPLS_POST_REWRITE_NEXT_IP4_LOOKUP;
-         u32 next1 = MPLS_POST_REWRITE_NEXT_IP4_LOOKUP;
-          u16 new_l0, new_l1;
-          ip_csum_t sum0, sum1;
-
-         /* Prefetch next iteration. */
-         {
-           vlib_buffer_t * p2, * p3;
+      /* while (n_left_from >= 4 && n_left_to_next >= 2) */
+      /*       { */
+      /*         u32 bi0, bi1; */
+      /*         vlib_buffer_t * b0, * b1; */
+      /*     ip4_header_t * ip0, * ip1; */
+      /*         u32 next0; */
+      /*         u32 next1; */
+      /*     u16 new_l0, new_l1, adj_index0, adj_index1; */
+      /*     ip_csum_t sum0, sum1; */
+      /*         ip_adjacency_t *adj0, *adj1; */
+
+      /*         /\* Prefetch next iteration. *\/ */
+      /*         { */
+      /*           vlib_buffer_t * p2, * p3; */
+
+      /*           p2 = vlib_get_buffer (vm, from[2]); */
+      /*           p3 = vlib_get_buffer (vm, from[3]); */
+
+      /*           vlib_prefetch_buffer_header (p2, LOAD); */
+      /*           vlib_prefetch_buffer_header (p3, LOAD); */
+
+      /*           CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD); */
+      /*           CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD); */
+      /*         } */
+
+      /*         bi0 = from[0]; */
+      /*         bi1 = from[1]; */
+      /*         to_next[0] = bi0; */
+      /*         to_next[1] = bi1; */
+      /*         from += 2; */
+      /*         to_next += 2; */
+      /*         n_left_to_next -= 2; */
+      /*         n_left_from -= 2; */
+
+
+      /*         b0 = vlib_get_buffer (vm, bi0); */
+      /*         b1 = vlib_get_buffer (vm, bi1); */
+      /*     ip0 = vlib_buffer_get_current (b0); */
+      /*     ip1 = vlib_buffer_get_current (b1); */
+          
+      /*     /\* Note: the tunnel rewrite sets up sw_if_index[VLIB_TX] *\/ */
 
-           p2 = vlib_get_buffer (vm, from[2]);
-           p3 = vlib_get_buffer (vm, from[3]);
+      /*     /\* set the GRE (outer) ip packet length, fix the bloody checksum *\/ */
+      /*     sum0 = ip0->checksum; */
+      /*     sum1 = ip1->checksum; */
 
-           vlib_prefetch_buffer_header (p2, LOAD);
-           vlib_prefetch_buffer_header (p3, LOAD);
+      /*     /\* old_l0, old_l1 always 0, see the rewrite setup *\/ */
+      /*     new_l0 =  */
+      /*       clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)); */
+      /*     new_l1 =  */
+      /*       clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1)); */
+          
+      /*     sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t, */
+      /*                            length /\* changed member *\/); */
+      /*     sum1 = ip_csum_update (sum1, old_l1, new_l1, ip4_header_t, */
+      /*                            length /\* changed member *\/); */
+      /*     ip0->checksum = ip_csum_fold (sum0); */
+      /*     ip1->checksum = ip_csum_fold (sum1); */
+      /*     ip0->length = new_l0; */
+      /*     ip1->length = new_l1; */
 
-           CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
-           CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
-         }
+      /*         /\* replace the TX adj in the packet with the next in the chain *\/ */
+      /*         adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX]; */
+      /*         adj_index1 = vnet_buffer (b1)->ip.adj_index[VLIB_TX]; */
 
-         bi0 = from[0];
-         bi1 = from[1];
-         to_next[0] = bi0;
-         to_next[1] = bi1;
-         from += 2;
-         to_next += 2;
-         n_left_to_next -= 2;
-         n_left_from -= 2;
+      /*         adj0 = ip_get_adjacency (lm, adj_index0); */
+      /*         adj1 = ip_get_adjacency (lm, adj_index1); */
 
+      /*         ASSERT(adj0->sub_type.midchain.adj_index != ADJ_INDEX_INVALID); */
+      /*         ASSERT(adj1->sub_type.midchain.adj_index != ADJ_INDEX_INVALID); */
 
-         b0 = vlib_get_buffer (vm, bi0);
-         b1 = vlib_get_buffer (vm, bi1);
-          ip0 = vlib_buffer_get_current (b0);
-          ip1 = vlib_buffer_get_current (b1);
-          
-          /* Note: the tunnel rewrite sets up sw_if_index[VLIB_TX] */
+      /*         adj_index0 = adj0->sub_type.midchain.adj_index; */
+      /*         adj_index1 = adj1->sub_type.midchain.adj_index; */
 
-          /* set the GRE (outer) ip packet length, fix the bloody checksum */
-          sum0 = ip0->checksum;
-          sum1 = ip1->checksum;
+      /*         vnet_buffer (b0)->ip.adj_index[VLIB_TX] = adj_index0; */
+      /*         vnet_buffer (b1)->ip.adj_index[VLIB_TX] = adj_index1; */
 
-          /* old_l0, old_l1 always 0, see the rewrite setup */
-          new_l0 = 
-            clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
-          new_l1 = 
-            clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1));
-          
-          sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t,
-                                 length /* changed member */);
-          sum1 = ip_csum_update (sum1, old_l1, new_l1, ip4_header_t,
-                                 length /* changed member */);
-          ip0->checksum = ip_csum_fold (sum0);
-          ip1->checksum = ip_csum_fold (sum1);
-          ip0->length = new_l0;
-          ip1->length = new_l1;
+      /*         /\* get the next adj in the chain to determine the next graph node *\/ */
+      /*         adj0 = ip_get_adjacency (lm, adj_index0); */
+      /*         adj1 = ip_get_adjacency (lm, adj_index1); */
+
+      /*         next0 = adj0->lookup_next_index; */
+      /*         next1 = adj1->lookup_next_index; */
+
+      /*         vlib_validate_buffer_enqueue_x2 (vm, node, next_index, */
+      /*                                          to_next, n_left_to_next, */
+      /*                                          bi0, bi1, next0, next1); */
+      /*       } */
 
-         vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
-                                          to_next, n_left_to_next,
-                                          bi0, bi1, next0, next1);
-       }
-    
       while (n_left_from > 0 && n_left_to_next > 0)
        {
+         ip_adjacency_t * adj0;
          u32 bi0;
          vlib_buffer_t * b0;
           ip4_header_t * ip0;
-         u32 next0 = MPLS_POST_REWRITE_NEXT_IP4_LOOKUP;
-          u16 new_l0;
+         u32 next0;
+          u16 new_l0, adj_index0;
           ip_csum_t sum0;
 
          bi0 = from[0];
@@ -653,6 +680,20 @@ mpls_post_rewrite (vlib_main_t * vm,
           ip0->checksum = ip_csum_fold (sum0);
           ip0->length = new_l0;
 
+         /* replace the TX adj in the packet with the next in the chain */
+         adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
+
+          ASSERT(adj_index0);
+
+         adj0 = ip_get_adjacency (lm, adj_index0);
+         ASSERT(adj0->sub_type.midchain.next_dpo.dpoi_index != ADJ_INDEX_INVALID);
+         adj_index0 = adj0->sub_type.midchain.next_dpo.dpoi_index;
+         vnet_buffer (b0)->ip.adj_index[VLIB_TX] = adj_index0;
+
+         /* get the next adj in the chain to determine the next graph node */
+         ASSERT(0);
+         next0 = 0; //adj0->sub_type.midchain.next_dpo.dpoi_next;
+
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                           to_next, n_left_to_next,
                                           bi0, next0);
@@ -673,12 +714,8 @@ VLIB_REGISTER_NODE (mpls_post_rewrite_node) = {
 
   .runtime_data_bytes = 0,
 
-  .n_next_nodes = MPLS_POST_REWRITE_N_NEXT,
-  .next_nodes = {
-#define _(s,n) [MPLS_POST_REWRITE_NEXT_##s] = n,
-    foreach_mpls_post_rewrite_next
-#undef _
-  },
+  .n_next_nodes = IP_LOOKUP_N_NEXT,
+  .next_nodes = IP4_LOOKUP_NEXT_NODES,
 };
 
 VLIB_NODE_FUNCTION_MULTIARCH (mpls_post_rewrite_node, mpls_post_rewrite)
@@ -725,237 +762,512 @@ static u8 * mpls_gre_rewrite (mpls_main_t *mm, mpls_gre_tunnel_t * t)
   return (rewrite_data);
 }
 
-int vnet_mpls_gre_add_del_tunnel (ip4_address_t *src,
-                                  ip4_address_t *dst,
-                                  ip4_address_t *intfc,
-                                  u32 mask_width,
-                                  u32 inner_fib_id, u32 outer_fib_id,
-                                  u32 * tunnel_sw_if_index,
-                                  u8 l2_only,
-                                  u8 is_add)
+u8
+mpls_sw_interface_is_enabled (u32 sw_if_index)
 {
-  ip4_main_t * im = &ip4_main;
-  ip_lookup_main_t * lm = &im->lookup_main;
-  mpls_main_t * mm = &mpls_main;
-  vnet_main_t * vnm = vnet_get_main();
-  ip4_address_t zero;
-  mpls_gre_tunnel_t *tp;
-  int need_route_add_del = 1;
-  u32 inner_fib_index = 0;
-  u32 outer_fib_index = 0;
-  ip_adjacency_t adj;
-  u32 adj_index;
-  u8 * rewrite_data;
-  int found_tunnel = 0;
-  mpls_encap_t * e = 0;
-  u32 hw_if_index = ~0;
-  vnet_hw_interface_t * hi;
-  u32 slot;
-  u32 dummy;
-  
-  zero.as_u32 = 0;
-  
-  /* No questions, no answers */
-  if (tunnel_sw_if_index == 0)
-    tunnel_sw_if_index = &dummy;
+    mpls_main_t * mm = &mpls_main;
 
-  *tunnel_sw_if_index = ~0;
+    if (vec_len(mm->mpls_enabled_by_sw_if_index) <= sw_if_index)
+        return (0); /* index not covered by the vector: never enabled */
 
-  if (inner_fib_id != (u32)~0)
+    return (mm->mpls_enabled_by_sw_if_index[sw_if_index]);
+}
+
+void
+mpls_sw_interface_enable_disable (mpls_main_t * mm,
+                                  u32 sw_if_index,
+                                  u8 is_enable)
+{
+  mpls_interface_state_change_callback_t *callback;
+  vlib_main_t * vm = vlib_get_main();
+  ip_config_main_t * cm = &mm->rx_config_mains;
+  vnet_config_main_t * vcm = &cm->config_main;
+  u32 lookup_feature_index;
+  fib_node_index_t lfib_index;
+  u32 ci;
+
+  vec_validate_init_empty (mm->mpls_enabled_by_sw_if_index, sw_if_index, 0);
+
+  /*
+   * enable/disable only on the 1<->0 transition
+   */
+  if (is_enable)
     {
-      uword * p;
-      
-      p = hash_get (im->fib_index_by_table_id, inner_fib_id);
-      if (! p)
-        return VNET_API_ERROR_NO_SUCH_INNER_FIB;
-      inner_fib_index = p[0];
-    }
+      if (1 != ++mm->mpls_enabled_by_sw_if_index[sw_if_index])
+        return;
 
-  if (outer_fib_id != 0)
+      lfib_index = fib_table_find_or_create_and_lock(FIB_PROTOCOL_MPLS,
+                                                    MPLS_FIB_DEFAULT_TABLE_ID);
+      vec_validate(mm->fib_index_by_sw_if_index, sw_if_index); /* cover the slot stored below */
+      mm->fib_index_by_sw_if_index[sw_if_index] = lfib_index;
+    }
+  else
     {
-      uword * p;
-      
-      p = hash_get (im->fib_index_by_table_id, outer_fib_id);
-      if (! p)
-        return VNET_API_ERROR_NO_SUCH_FIB;
-      outer_fib_index = p[0];
+      ASSERT(mm->mpls_enabled_by_sw_if_index[sw_if_index] > 0);
+      if (0 != --mm->mpls_enabled_by_sw_if_index[sw_if_index])
+        return;
+
+      fib_table_unlock(mm->fib_index_by_sw_if_index[sw_if_index],
+                      FIB_PROTOCOL_MPLS);
     }
 
-  /* suppress duplicate mpls interface generation. */
-  pool_foreach (tp, mm->gre_tunnels, 
-  ({
-    /* 
-     * If we have a tunnel which matches (src, dst, intfc/mask)
-     * AND the expected route is in the FIB, it's a dup 
+  vec_validate_init_empty (cm->config_index_by_sw_if_index, sw_if_index, ~0);
+  ci = cm->config_index_by_sw_if_index[sw_if_index];
+
+  lookup_feature_index = mm->mpls_rx_feature_lookup;
+
+  if (is_enable)
+    ci = vnet_config_add_feature (vm, vcm,
+                                  ci,
+                                  lookup_feature_index,
+                                  /* config data */ 0,
+                                  /* # bytes of config data */ 0);
+  else
+    ci = vnet_config_del_feature (vm, vcm, ci,
+                                  lookup_feature_index,
+                                  /* config data */ 0,
+                                  /* # bytes of config data */ 0);
+
+  cm->config_index_by_sw_if_index[sw_if_index] = ci;
+
+  /*
+   * notify all interested clients of the change of state.
+   */
+  vec_foreach(callback, mm->mpls_interface_state_change_callbacks)
+  {
+      (*callback)(sw_if_index, is_enable);
+  }
+}
+
+static mpls_gre_tunnel_t *
+mpls_gre_tunnel_from_fib_node (fib_node_t *node)
+{
+#if (CLIB_DEBUG > 0)
+    ASSERT(FIB_NODE_TYPE_MPLS_GRE_TUNNEL == node->fn_type);
+#endif
+    return ((mpls_gre_tunnel_t*)node);
+}
+
+/*
+ * mpls_gre_tunnel_stack
+ *
+ * 'stack' (resolve the recursion for) the tunnel's midchain adjacency
+ */
+static void
+mpls_gre_tunnel_stack (mpls_gre_tunnel_t *mgt)
+{
+    /*
+     * find the adjacency that is contributed by the FIB entry
+     * that this tunnel resolves via, and use it as the next adj
+     * in the midchain
      */
-    if (!memcmp (&tp->tunnel_src, src, sizeof (*src))
-        && !memcmp (&tp->tunnel_dst, dst, sizeof (*dst))
-        && !memcmp (&tp->intfc_address, intfc, sizeof (*intfc))
-        && tp->inner_fib_index == inner_fib_index) 
-      {
-        ip4_fib_t * fib = vec_elt_at_index (im->fibs, inner_fib_index);
-        uword * hash = fib->adj_index_by_dst_address[mask_width];
-        uword key = intfc->as_u32 & im->fib_masks[mask_width];
-        uword *p = hash_get (hash, key);
+    adj_nbr_midchain_stack(mgt->adj_index,
+                          fib_entry_contribute_ip_forwarding(mgt->fei));
+}
 
-        found_tunnel = 1;
+/**
+ * Function definition to backwalk a FIB node
+ */
+static fib_node_back_walk_rc_t
+mpls_gre_tunnel_back_walk (fib_node_t *node,
+                          fib_node_back_walk_ctx_t *ctx)
+{
+    mpls_gre_tunnel_stack(mpls_gre_tunnel_from_fib_node(node));
 
-        if (is_add)
-          {
-            /* A dup, and the route is in the fib. Done */
-            if (p || l2_only)
-              return 1;
-            else
-              {
-                /* Reinstall the route (and other stuff) */
-                e = mpls_encap_by_fib_and_dest (mm, inner_fib_index, 
-                                                dst->as_u32);
-                if (e == 0)
-                  return VNET_API_ERROR_NO_SUCH_LABEL;
-                goto reinstall_it;
-              }
-          }
-        else
-          {
-            /* Delete, the route is already gone? */
-            if (!p)
-              need_route_add_del = 0;
-            goto add_del_route;
-          }
+    return (FIB_NODE_BACK_WALK_CONTINUE);
+}
 
-      }
-  }));
-    
-  /* Delete, and we can't find the tunnel */
-  if (is_add == 0 && found_tunnel == 0)
-    return VNET_API_ERROR_NO_SUCH_ENTRY;
+/**
+ * Function definition to get a FIB node from its index
+ */
+static fib_node_t*
+mpls_gre_tunnel_fib_node_get (fib_node_index_t index)
+{
+    mpls_gre_tunnel_t * mgt;
+    mpls_main_t * mm;
 
-  e = mpls_encap_by_fib_and_dest (mm, inner_fib_index, dst->as_u32);
-  if (e == 0)
-    return VNET_API_ERROR_NO_SUCH_LABEL;
+    mm  = &mpls_main;
+    mgt = pool_elt_at_index(mm->gre_tunnels, index);
 
-  pool_get(mm->gre_tunnels, tp);
-  memset (tp, 0, sizeof (*tp));
+    return (&mgt->mgt_node);
+}
 
-  if (vec_len (mm->free_gre_sw_if_indices) > 0)
+/**
+ * Function definition to inform the FIB node that its last lock has gone.
+ */
+static void
+mpls_gre_tunnel_last_lock_gone (fib_node_t *node)
+{
+    /*
+     * The MPLS GRE tunnel is a root of the graph. As such
+     * it never has children and thus is never locked.
+     */
+    ASSERT(0);
+}
+
+/*
+ * Virtual function table registered by MPLS GRE tunnels
+ * for participation in the FIB object graph.
+ */
+const static fib_node_vft_t mpls_gre_vft = {
+    .fnv_get = mpls_gre_tunnel_fib_node_get,
+    .fnv_last_lock = mpls_gre_tunnel_last_lock_gone,
+    .fnv_back_walk = mpls_gre_tunnel_back_walk,
+};
+static mpls_gre_tunnel_t *
+mpls_gre_tunnel_find (ip4_address_t *src,
+                     ip4_address_t *dst,
+                     ip4_address_t *intfc,
+                     u32 inner_fib_index)
+{
+    mpls_main_t * mm = &mpls_main;
+    mpls_gre_tunnel_t *tp;
+    int found_tunnel = 0;
+
+    /* suppress duplicate mpls interface generation. */
+    pool_foreach (tp, mm->gre_tunnels, 
+    ({
+       /* 
+        * A tunnel is a duplicate when it matches on
+        * (src, dst, intfc, inner_fib_index); the FIB is not consulted here.
+        */
+       if (!memcmp (&tp->tunnel_src, src, sizeof (*src))
+           && !memcmp (&tp->tunnel_dst, dst, sizeof (*dst))
+           && !memcmp (&tp->intfc_address, intfc, sizeof (*intfc))
+           && tp->inner_fib_index == inner_fib_index) 
+       {
+           found_tunnel = 1;
+           goto found;
+       }
+    }));
+
+found:
+    if (found_tunnel)
     {
-      hw_if_index = 
-        mm->free_gre_sw_if_indices[vec_len(mm->free_gre_sw_if_indices)-1];
-      _vec_len (mm->free_gre_sw_if_indices) -= 1;
-      hi = vnet_get_hw_interface (vnm, hw_if_index);
-      hi->dev_instance = tp - mm->gre_tunnels;
-      hi->hw_instance = tp - mm->gre_tunnels;
+       return (tp);
     }
-  else 
+    return (NULL);
+}
+
+int mpls_gre_tunnel_add (ip4_address_t *src,
+                        ip4_address_t *dst,
+                        ip4_address_t *intfc,
+                        u32 mask_width,
+                        u32 inner_fib_index,
+                        u32 outer_fib_index,
+                        u32 * tunnel_sw_if_index,
+                        u8 l2_only)
+{
+    mpls_main_t * mm = &mpls_main;
+    gre_main_t * gm = &gre_main;
+    vnet_main_t * vnm = vnet_get_main();
+    mpls_gre_tunnel_t *tp;
+    ip_adjacency_t adj;
+    u8 * rewrite_data;
+    mpls_encap_t * e = 0;
+    u32 hw_if_index = ~0;
+    vnet_hw_interface_t * hi;
+    u32 slot;
+    const ip46_address_t zero_nh = {
+       .ip4.as_u32 = 0,
+    };
+
+    tp = mpls_gre_tunnel_find(src,dst,intfc,inner_fib_index);
+
+    /* Add, duplicate */
+    if (NULL != tp)
+       return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+    e = mpls_encap_by_fib_and_dest (mm, inner_fib_index, dst->as_u32);
+    if (e == 0)
+       return VNET_API_ERROR_NO_SUCH_LABEL;
+
+    pool_get(mm->gre_tunnels, tp);
+    memset (tp, 0, sizeof (*tp));
+    fib_node_init(&tp->mgt_node,
+                 FIB_NODE_TYPE_MPLS_GRE_TUNNEL);
+
+    if (vec_len (mm->free_gre_sw_if_indices) > 0)
     {
-      hw_if_index = vnet_register_interface
-        (vnm, mpls_gre_device_class.index, tp - mm->gre_tunnels,
-         mpls_gre_hw_interface_class.index,
-         tp - mm->gre_tunnels);
-      hi = vnet_get_hw_interface (vnm, hw_if_index);
+       hw_if_index = 
+           mm->free_gre_sw_if_indices[vec_len(mm->free_gre_sw_if_indices)-1];
+       _vec_len (mm->free_gre_sw_if_indices) -= 1;
+       hi = vnet_get_hw_interface (vnm, hw_if_index);
+       hi->dev_instance = tp - mm->gre_tunnels;
+       hi->hw_instance = tp - mm->gre_tunnels;
+    }
+    else 
+    {
+       hw_if_index = vnet_register_interface
+           (vnm, mpls_gre_device_class.index, tp - mm->gre_tunnels,
+            mpls_gre_hw_interface_class.index,
+            tp - mm->gre_tunnels);
+       hi = vnet_get_hw_interface (vnm, hw_if_index);
+
+       /* ... to make the IP and L2 x-connect cases identical */
+       slot = vlib_node_add_named_next_with_slot
+           (vnm->vlib_main, hi->tx_node_index, 
+            "mpls-post-rewrite", MPLS_GRE_OUTPUT_NEXT_POST_REWRITE);
+
+       ASSERT (slot == MPLS_GRE_OUTPUT_NEXT_POST_REWRITE);
+    }
+  
+    *tunnel_sw_if_index = hi->sw_if_index;
+    vnet_sw_interface_set_flags (vnm, hi->sw_if_index, 
+                                VNET_SW_INTERFACE_FLAG_ADMIN_UP);      
+    vec_validate(ip4_main.fib_index_by_sw_if_index, *tunnel_sw_if_index);
+    ip4_main.fib_index_by_sw_if_index[*tunnel_sw_if_index] = outer_fib_index;
+
+    tp->hw_if_index = hw_if_index;
+
+    /* bind the MPLS and IPv4 FIBs to the interface and enable */
+    vec_validate(mm->fib_index_by_sw_if_index, hi->sw_if_index);
+    mm->fib_index_by_sw_if_index[hi->sw_if_index] = inner_fib_index;
+    mpls_sw_interface_enable_disable(mm, hi->sw_if_index, 1);
+    ip4_main.fib_index_by_sw_if_index[hi->sw_if_index] = inner_fib_index;
+    ip4_sw_interface_enable_disable(hi->sw_if_index, 1);
+
+    tp->tunnel_src.as_u32 = src->as_u32;
+    tp->tunnel_dst.as_u32 = dst->as_u32;
+    tp->intfc_address.as_u32 = intfc->as_u32;
+    tp->mask_width = mask_width;
+    tp->inner_fib_index = inner_fib_index;
+    tp->outer_fib_index = outer_fib_index;
+    tp->encap_index = e - mm->encaps;
+    tp->l2_only = l2_only;
+
+    /* Add the tunnel to the hash table of all GRE tunnels */
+    u64 key = (u64)src->as_u32 << 32 | (u64)dst->as_u32;
+
+    ASSERT(NULL == hash_get (gm->tunnel_by_key, key));
+    hash_set (gm->tunnel_by_key, key, tp - mm->gre_tunnels);
+
+    /* Create the adjacency and add to v4 fib */
+    memset(&adj, 0, sizeof (adj));
+    adj.lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
+    
+    rewrite_data = mpls_gre_rewrite (mm, tp);
+    if (rewrite_data == 0)
+    {
+       if (*tunnel_sw_if_index != ~0)
+       {
+           hi = vnet_get_hw_interface (vnm, tp->hw_if_index);
+           vnet_sw_interface_set_flags (vnm, hi->sw_if_index, 
+                                        0 /* admin down */);
+           vec_add1 (mm->free_gre_sw_if_indices, tp->hw_if_index);
+       }
+       pool_put (mm->gre_tunnels, tp);
+       return VNET_API_ERROR_NO_SUCH_LABEL;
+    }
 
-      /* ... to make the IP and L2 x-connect cases identical */
-      slot = vlib_node_add_named_next_with_slot
-        (vnm->vlib_main, hi->tx_node_index, 
-         "mpls-post-rewrite", MPLS_GRE_OUTPUT_NEXT_POST_REWRITE);
+    /* Save a copy of the rewrite data for L2 x-connect */
+    vec_free (tp->rewrite_data);
 
-      ASSERT (slot == MPLS_GRE_OUTPUT_NEXT_POST_REWRITE);
+    tp->rewrite_data = rewrite_data;
+  
+    if (!l2_only)
+    {
+       /*
+        * source the FIB entry for the tunnel's destination
+        * and become a child thereof. The tunnel will then get poked
+        * when the forwarding for the entry updates, and the tunnel can
+        * re-stack accordingly
+        */
+       const fib_prefix_t tun_dst_pfx = {
+           .fp_len = 32,
+           .fp_proto = FIB_PROTOCOL_IP4,
+           .fp_addr = {
+               .ip4 = *dst,
+           }
+       };
+
+       tp->fei = fib_table_entry_special_add(outer_fib_index,
+                                             &tun_dst_pfx,
+                                             FIB_SOURCE_RR,
+                                             FIB_ENTRY_FLAG_NONE,
+                                             ADJ_INDEX_INVALID);
+       tp->sibling_index = fib_entry_child_add(tp->fei,
+                                               FIB_NODE_TYPE_MPLS_GRE_TUNNEL,
+                                               tp - mm->gre_tunnels);
+
+       /*
+        * create and update the midchain adj this tunnel sources.
+        * This is the adj the route we add below will resolve to.
+        */
+       tp->adj_index = adj_nbr_add_or_lock(FIB_PROTOCOL_IP4,
+                                           FIB_LINK_IP4,
+                                           &zero_nh,
+                                           hi->sw_if_index);
+
+       adj_nbr_midchain_update_rewrite(tp->adj_index,
+                                       mpls_post_rewrite_node.index,
+                                       rewrite_data);
+       mpls_gre_tunnel_stack(tp);
+
+       /*
+        * Update the route for the tunnel's subnet to point through the tunnel
+        */
+       const fib_prefix_t tun_sub_net_pfx = {
+           .fp_len = tp->mask_width,
+           .fp_proto = FIB_PROTOCOL_IP4,
+           .fp_addr = {
+               .ip4 = tp->intfc_address,
+           },
+       };
+
+       fib_table_entry_update_one_path(inner_fib_index,
+                                       &tun_sub_net_pfx,
+                                       FIB_SOURCE_INTERFACE,
+                                       (FIB_ENTRY_FLAG_CONNECTED |
+                                        FIB_ENTRY_FLAG_ATTACHED),
+                                       FIB_PROTOCOL_IP4,
+                                       &zero_nh,
+                                       hi->sw_if_index,
+                                       ~0, // invalid fib index
+                                       1,
+                                       MPLS_LABEL_INVALID,
+                                       FIB_ROUTE_PATH_FLAG_NONE);
     }
+
+    return 0;
+}
+
+static int
+mpls_gre_tunnel_del (ip4_address_t *src,
+                    ip4_address_t *dst,
+                    ip4_address_t *intfc,
+                    u32 mask_width,
+                    u32 inner_fib_index,
+                    u32 outer_fib_index,
+                    u32 * tunnel_sw_if_index,
+                    u8 l2_only)
+{
+    mpls_main_t * mm = &mpls_main;
+    vnet_main_t * vnm = vnet_get_main();
+    gre_main_t * gm = &gre_main;
+    mpls_gre_tunnel_t *tp;
+    vnet_hw_interface_t * hi;
   
-  *tunnel_sw_if_index = hi->sw_if_index;
-  vnet_sw_interface_set_flags (vnm, hi->sw_if_index, 
-                               VNET_SW_INTERFACE_FLAG_ADMIN_UP);      
+    tp = mpls_gre_tunnel_find(src,dst,intfc,inner_fib_index);
 
-  tp->hw_if_index = hw_if_index;
-    
- reinstall_it:
-  tp->tunnel_src.as_u32 = src->as_u32;
-  tp->tunnel_dst.as_u32 = dst->as_u32;
-  tp->intfc_address.as_u32 = intfc->as_u32;
-  tp->mask_width = mask_width;
-  tp->inner_fib_index = inner_fib_index;
-  tp->outer_fib_index = outer_fib_index;
-  tp->encap_index = e - mm->encaps;
-  tp->l2_only = l2_only;
+    /* Delete, and we can't find the tunnel */
+    if (NULL == tp)
+       return VNET_API_ERROR_NO_SUCH_ENTRY;
 
-  /* Create the adjacency and add to v4 fib */
-  memset(&adj, 0, sizeof (adj));
-  adj.explicit_fib_index = ~0;
-  adj.lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
-    
-  rewrite_data = mpls_gre_rewrite (mm, tp);
-  if (rewrite_data == 0)
+    hi = vnet_get_hw_interface (vnm, tp->hw_if_index);
+
+    if (!l2_only)
     {
-      if (*tunnel_sw_if_index != ~0)
-        {
-          hi = vnet_get_hw_interface (vnm, tp->hw_if_index);
-          vnet_sw_interface_set_flags (vnm, hi->sw_if_index, 
-                                       0 /* admin down */);
-          vec_add1 (mm->free_gre_sw_if_indices, tp->hw_if_index);
-      }
-      pool_put (mm->gre_tunnels, tp);
-      return VNET_API_ERROR_NO_SUCH_LABEL;
+       /*
+        * unsource the FIB entry for the tunnel's destination
+        */
+       const fib_prefix_t tun_dst_pfx = {
+           .fp_len = 32,
+           .fp_proto = FIB_PROTOCOL_IP4,
+           .fp_addr = {
+               .ip4 = *dst,
+           }
+       };
+
+       fib_entry_child_remove(tp->fei,
+                              tp->sibling_index);
+       fib_table_entry_special_remove(outer_fib_index,
+                                      &tun_dst_pfx,
+                                      FIB_SOURCE_RR);
+       tp->fei = FIB_NODE_INDEX_INVALID;
+       adj_unlock(tp->adj_index);
+       /*
+        * unsource the route for the tunnel's subnet
+        */
+       const fib_prefix_t tun_sub_net_pfx = {
+           .fp_len = tp->mask_width,
+           .fp_proto = FIB_PROTOCOL_IP4,
+           .fp_addr = {
+               .ip4 = tp->intfc_address,
+           },
+       };
+
+       fib_table_entry_delete(inner_fib_index,
+                              &tun_sub_net_pfx,
+                              FIB_SOURCE_INTERFACE);
     }
-  
-  /* Save a copy of the rewrite data for L2 x-connect */
-  vec_free (tp->rewrite_data);
 
-  tp->rewrite_data = rewrite_data;
+    u64 key = ((u64)tp->tunnel_src.as_u32 << 32 |
+               (u64)tp->tunnel_dst.as_u32); /* same (src,dst) key as built on add */
 
-  vnet_rewrite_for_tunnel
-    (vnm,
-     outer_fib_index /* tx_sw_if_index, aka outer fib ID */,
-     ip4_rewrite_node.index,
-     mpls_post_rewrite_node.index,
-     &adj.rewrite_header,
-     rewrite_data, vec_len(rewrite_data));
-  
-  if (!l2_only)
-      ip_add_adjacency (lm, &adj, 1 /* one adj */,
-                        &adj_index);
+    hash_unset (gm->tunnel_by_key, key);
+    mpls_sw_interface_enable_disable(mm, hi->sw_if_index, 0);
+    ip4_sw_interface_enable_disable(hi->sw_if_index, 0);
+
+    vnet_sw_interface_set_flags (vnm, hi->sw_if_index, 
+                                0 /* admin down */);
+    vec_add1 (mm->free_gre_sw_if_indices, tp->hw_if_index);
+    vec_free (tp->rewrite_data);
+    fib_node_deinit(&tp->mgt_node);
+    pool_put (mm->gre_tunnels, tp);
+
+    return 0;
+}
+
+int
+vnet_mpls_gre_add_del_tunnel (ip4_address_t *src,
+                             ip4_address_t *dst,
+                             ip4_address_t *intfc,
+                             u32 mask_width,
+                             u32 inner_fib_id, u32 outer_fib_id,
+                             u32 * tunnel_sw_if_index,
+                             u8 l2_only,
+                             u8 is_add)
+{
+    u32 inner_fib_index = 0;
+    u32 outer_fib_index = 0;
+    u32 dummy;
+    ip4_main_t * im = &ip4_main;
   
- add_del_route:
+    /* No questions, no answers */
+    if (NULL == tunnel_sw_if_index)
+       tunnel_sw_if_index = &dummy;
 
-  if (need_route_add_del && !l2_only)
+    *tunnel_sw_if_index = ~0;
+
+    if (inner_fib_id != (u32)~0)
     {
-      if (is_add)
-        ip4_add_del_route_next_hop (im,
-                                    IP4_ROUTE_FLAG_ADD,
-                                    &tp->intfc_address,
-                                    tp->mask_width,
-                                    &zero /* no next hop */,
-                                    (u32)~0 /* next_hop_sw_if_index */,
-                                    1 /* weight */, 
-                                    adj_index,
-                                    tp->inner_fib_index);
-      else
-        {
-          ip4_add_del_route_args_t a;
-          memset (&a, 0, sizeof (a));
-
-          a.flags = IP4_ROUTE_FLAG_FIB_INDEX | IP4_ROUTE_FLAG_DEL;
-          a.table_index_or_table_id = tp->inner_fib_index;
-          a.dst_address = tp->intfc_address;
-          a.dst_address_length = tp->mask_width;
-          a.adj_index = ~0;
-
-          ip4_add_del_route (im, &a);
-          ip4_maybe_remap_adjacencies (im, tp->inner_fib_index, 
-                                       IP4_ROUTE_FLAG_FIB_INDEX);
-        }
+       uword * p;
+      
+       p = hash_get (im->fib_index_by_table_id, inner_fib_id);
+       if (! p)
+           return VNET_API_ERROR_NO_SUCH_INNER_FIB;
+       inner_fib_index = p[0];
     }
 
-  if (is_add == 0 && found_tunnel)
+    if (outer_fib_id != 0)
     {
-      hi = vnet_get_hw_interface (vnm, tp->hw_if_index);
-      vnet_sw_interface_set_flags (vnm, hi->sw_if_index, 
-                                   0 /* admin down */);
-      vec_add1 (mm->free_gre_sw_if_indices, tp->hw_if_index);
-      vec_free (tp->rewrite_data);
-      pool_put (mm->gre_tunnels, tp);
+       uword * p;
+      
+       p = hash_get (im->fib_index_by_table_id, outer_fib_id);
+       if (! p)
+           return VNET_API_ERROR_NO_SUCH_FIB;
+       outer_fib_index = p[0];
     }
 
-  return 0;
+    if (is_add)
+    {
+       return (mpls_gre_tunnel_add(src,dst,intfc, mask_width,
+                                   inner_fib_index,
+                                   outer_fib_index,
+                                   tunnel_sw_if_index,
+                                   l2_only));
+    }
+    else
+    {
+       return (mpls_gre_tunnel_del(src,dst,intfc, mask_width,
+                                   inner_fib_index,
+                                   outer_fib_index,
+                                   tunnel_sw_if_index,
+                                   l2_only));
+    }
 }
 
 /*
@@ -963,21 +1275,17 @@ int vnet_mpls_gre_add_del_tunnel (ip4_address_t *src,
  */
 int vnet_mpls_gre_delete_fib_tunnels (u32 fib_id)
 {
-  ip4_main_t * im = &ip4_main;
   mpls_main_t * mm = &mpls_main;
   vnet_main_t * vnm = mm->vnet_main;
   mpls_gre_tunnel_t *tp;
   u32 fib_index = 0;
-  uword * p;
   u32 * tunnels_to_delete = 0;
   vnet_hw_interface_t * hi;
-  ip4_fib_t * fib;
   int i;
 
-  p = hash_get (im->fib_index_by_table_id, fib_id);
-  if (! p)
+  fib_index = ip4_fib_index_from_table_id(fib_id);
+  if (~0 == fib_index)
       return VNET_API_ERROR_NO_SUCH_INNER_FIB;
-  fib_index = p[0];
 
   pool_foreach (tp, mm->gre_tunnels, 
     ({
@@ -985,28 +1293,40 @@ int vnet_mpls_gre_delete_fib_tunnels (u32 fib_id)
         vec_add1 (tunnels_to_delete, tp - mm->gre_tunnels);
     }));
   
-  fib = vec_elt_at_index (im->fibs, fib_index);
-  
   for (i = 0; i < vec_len(tunnels_to_delete); i++) {
       tp = pool_elt_at_index (mm->gre_tunnels, tunnels_to_delete[i]);
-      uword * hash = fib->adj_index_by_dst_address[tp->mask_width];
-      uword key = tp->intfc_address.as_u32 & im->fib_masks[tp->mask_width];
-      uword *p = hash_get (hash, key);
-      ip4_add_del_route_args_t a;
 
       /* Delete, the route if not already gone */
-      if (p && !tp->l2_only) 
-        {
-          memset (&a, 0, sizeof (a));
-          a.flags = IP4_ROUTE_FLAG_FIB_INDEX | IP4_ROUTE_FLAG_DEL;
-          a.table_index_or_table_id = tp->inner_fib_index;
-          a.dst_address = tp->intfc_address;
-          a.dst_address_length = tp->mask_width;
-          a.adj_index = ~0;
-          ip4_add_del_route (im, &a);
-          ip4_maybe_remap_adjacencies (im, tp->inner_fib_index, 
-                                       IP4_ROUTE_FLAG_FIB_INDEX);
-        }
+      if (FIB_NODE_INDEX_INVALID != tp->fei && !tp->l2_only) 
+      {
+         const fib_prefix_t tun_dst_pfx = {
+             .fp_len = 32,
+             .fp_proto = FIB_PROTOCOL_IP4,
+             .fp_addr = {
+                 .ip4 = tp->tunnel_dst,
+             }
+         };
+
+         fib_entry_child_remove(tp->fei,
+                                tp->sibling_index);
+         fib_table_entry_special_remove(tp->outer_fib_index,
+                                        &tun_dst_pfx,
+                                        FIB_SOURCE_RR);
+         tp->fei = FIB_NODE_INDEX_INVALID;
+         adj_unlock(tp->adj_index);
+         const fib_prefix_t tun_sub_net_pfx = {
+             .fp_len = tp->mask_width,
+             .fp_proto = FIB_PROTOCOL_IP4,
+             .fp_addr = {
+                 .ip4 = tp->intfc_address,
+             },
+         };
+
+         fib_table_entry_delete(tp->inner_fib_index,
+                                &tun_sub_net_pfx,
+                                FIB_SOURCE_INTERFACE);
+      }
       
       hi = vnet_get_hw_interface (vnm, tp->hw_if_index);
       vnet_sw_interface_set_flags (vnm, hi->sw_if_index, 
@@ -1229,11 +1549,15 @@ VLIB_CLI_COMMAND (show_mpls_tunnel_command, static) = {
     .function = show_mpls_tunnel_command_fn,
 };
 
+
 /* force inclusion from application's main.c */
 clib_error_t *mpls_interface_init (vlib_main_t *vm)
 {
   clib_error_t * error;
 
+  fib_node_register_type(FIB_NODE_TYPE_MPLS_GRE_TUNNEL,
+                        &mpls_gre_vft);
+
   if ((error = vlib_call_init_function (vm, mpls_policy_encap_init)))
       return error;
 
@@ -1286,9 +1610,7 @@ int vnet_mpls_ethernet_add_del_tunnel (u8 *dst,
   ip_lookup_main_t * lm = &im->lookup_main;
   mpls_main_t * mm = &mpls_main;
   vnet_main_t * vnm = vnet_get_main();
-  ip4_address_t zero;
   mpls_eth_tunnel_t *tp;
-  int need_route_add_del = 1;
   u32 inner_fib_index = 0;
   ip_adjacency_t adj;
   u32 adj_index;
@@ -1300,8 +1622,6 @@ int vnet_mpls_ethernet_add_del_tunnel (u8 *dst,
   u32 slot;
   u32 dummy;
   
-  zero.as_u32 = 0;
-  
   if (tunnel_sw_if_index == 0)
     tunnel_sw_if_index = &dummy;
 
@@ -1326,18 +1646,14 @@ int vnet_mpls_ethernet_add_del_tunnel (u8 *dst,
      */
     if (!memcmp (&tp->tunnel_dst, dst, sizeof (*dst))
         && !memcmp (&tp->intfc_address, intfc, sizeof (*intfc))
-        && tp->inner_fib_index == inner_fib_index) 
+        && tp->inner_fib_index == inner_fib_index
+       && FIB_NODE_INDEX_INVALID != tp->fei)
       {
-        ip4_fib_t * fib = vec_elt_at_index (im->fibs, inner_fib_index);
-        uword * hash = fib->adj_index_by_dst_address[mask_width];
-        uword key = intfc->as_u32 & im->fib_masks[mask_width];
-        uword *p = hash_get (hash, key);
-
         found_tunnel = 1;
 
         if (is_add)
           {
-            if (p || l2_only)
+            if (l2_only)
               return 1;
             else
               {
@@ -1351,9 +1667,7 @@ int vnet_mpls_ethernet_add_del_tunnel (u8 *dst,
           }
         else
           {
-            /* Delete, the route is already gone? */
-            if (!p)
-              need_route_add_del = 0;
+            /* Delete */
             goto add_del_route;
           }
 
@@ -1413,7 +1727,6 @@ int vnet_mpls_ethernet_add_del_tunnel (u8 *dst,
 
   /* Create the adjacency and add to v4 fib */
   memset(&adj, 0, sizeof (adj));
-  adj.explicit_fib_index = ~0;
   adj.lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
     
   rewrite_data = mpls_ethernet_rewrite (mm, tp);
@@ -1465,33 +1778,26 @@ int vnet_mpls_ethernet_add_del_tunnel (u8 *dst,
   
  add_del_route:
 
-  if (need_route_add_del && !l2_only)
+  if (!l2_only)
     {
+      const fib_prefix_t pfx = {
+         .fp_addr = {
+             .ip4 = tp->intfc_address,
+         },
+         .fp_len = tp->mask_width,
+         .fp_proto = FIB_PROTOCOL_IP4,
+      };
       if (is_add)
-        ip4_add_del_route_next_hop (im,
-                                    IP4_ROUTE_FLAG_ADD,
-                                    &tp->intfc_address,
-                                    tp->mask_width,
-                                    &zero /* no next hop */,
-                                    (u32)~0 /* next_hop_sw_if_index */,
-                                    1 /* weight */, 
-                                    adj_index,
-                                    tp->inner_fib_index);
+         tp->fei = fib_table_entry_special_add(tp->inner_fib_index,
+                                               &pfx,
+                                               FIB_SOURCE_API,
+                                               FIB_ENTRY_FLAG_NONE,
+                                               adj_index);
       else
         {
-          ip4_add_del_route_args_t a;
-          memset (&a, 0, sizeof (a));
-
-          a.flags = IP4_ROUTE_FLAG_FIB_INDEX | IP4_ROUTE_FLAG_DEL;
-          a.table_index_or_table_id = tp->inner_fib_index;
-          a.dst_address = tp->intfc_address;
-          a.dst_address_length = tp->mask_width;
-          a.adj_index = ~0;
-
-          ip4_add_del_route (im, &a);
-          ip4_maybe_remap_adjacencies (im, tp->inner_fib_index, 
-                                       IP4_ROUTE_FLAG_FIB_INDEX);
-        }
+         fib_table_entry_delete(tp->inner_fib_index, &pfx, FIB_SOURCE_API);
+         tp->fei = FIB_NODE_INDEX_INVALID;
+       }
     }
   if (is_add == 0 && found_tunnel)
     {
@@ -1667,15 +1973,10 @@ int vnet_mpls_ethernet_add_del_policy_tunnel (u8 *dst,
                                               u8 is_add)
 {
   ip4_main_t * im = &ip4_main;
-  ip_lookup_main_t * lm = &im->lookup_main;
   mpls_main_t * mm = &mpls_main;
   vnet_main_t * vnm = vnet_get_main();
-  ip4_address_t zero;
   mpls_eth_tunnel_t *tp;
-  int need_route_add_del = 1;
   u32 inner_fib_index = 0;
-  ip_adjacency_t adj;
-  u32 adj_index;
   int found_tunnel = 0;
   mpls_encap_t * e = 0;
   u32 hw_if_index = ~0;
@@ -1683,8 +1984,6 @@ int vnet_mpls_ethernet_add_del_policy_tunnel (u8 *dst,
   u32 slot;
   u32 dummy;
   
-  zero.as_u32 = 0;
-  
   if (tunnel_sw_if_index == 0)
     tunnel_sw_if_index = &dummy;
 
@@ -1709,18 +2008,14 @@ int vnet_mpls_ethernet_add_del_policy_tunnel (u8 *dst,
      */
     if (!memcmp (&tp->tunnel_dst, dst, sizeof (*dst))
         && !memcmp (&tp->intfc_address, intfc, sizeof (*intfc))
-        && tp->inner_fib_index == inner_fib_index) 
+        && tp->inner_fib_index == inner_fib_index
+       && FIB_NODE_INDEX_INVALID != tp->fei)
       {
-        ip4_fib_t * fib = vec_elt_at_index (im->fibs, inner_fib_index);
-        uword * hash = fib->adj_index_by_dst_address[mask_width];
-        uword key = intfc->as_u32 & im->fib_masks[mask_width];
-        uword *p = hash_get (hash, key);
-
         found_tunnel = 1;
 
         if (is_add)
           {
-            if (p || l2_only)
+            if (l2_only)
               return 1;
             else
               {
@@ -1729,9 +2024,7 @@ int vnet_mpls_ethernet_add_del_policy_tunnel (u8 *dst,
           }
         else
           {
-            /* Delete, the route is already gone? */
-            if (!p)
-              need_route_add_del = 0;
+            /* Delete */
             goto add_del_route;
           }
 
@@ -1784,49 +2077,44 @@ int vnet_mpls_ethernet_add_del_policy_tunnel (u8 *dst,
   tp->encap_index = e - mm->encaps;
   tp->tx_sw_if_index = tx_sw_if_index;
   tp->l2_only = l2_only;
+  tp->fei = FIB_NODE_INDEX_INVALID;
 
   if (new_tunnel_index)
     *new_tunnel_index = tp - mm->eth_tunnels;
 
-  /* Create the classify adjacency and add to v4 fib */
-  memset(&adj, 0, sizeof (adj));
-  adj.explicit_fib_index = ~0;
-  adj.lookup_next_index = IP_LOOKUP_NEXT_CLASSIFY;
-  adj.classify.table_index = classify_table_index;
-    
-  if (!l2_only)
-    ip_add_adjacency (lm, &adj, 1 /* one adj */,
-                      &adj_index);
-  
  add_del_route:
 
-  if (need_route_add_del && !l2_only)
+  if (!l2_only)
     {
+      const fib_prefix_t pfx = {
+         .fp_addr = {
+             .ip4 = tp->intfc_address,
+         },
+         .fp_len = tp->mask_width,
+         .fp_proto = FIB_PROTOCOL_IP4,
+      };
+      dpo_id_t dpo = DPO_NULL;
+
       if (is_add)
-        ip4_add_del_route_next_hop (im,
-                                    IP4_ROUTE_FLAG_ADD,
-                                    &tp->intfc_address,
-                                    tp->mask_width,
-                                    &zero /* no next hop */,
-                                    (u32)~0 /* next_hop_sw_if_index */,
-                                    1 /* weight */, 
-                                    adj_index,
-                                    tp->inner_fib_index);
-      else
         {
-          ip4_add_del_route_args_t a;
-          memset (&a, 0, sizeof (a));
-
-          a.flags = IP4_ROUTE_FLAG_FIB_INDEX | IP4_ROUTE_FLAG_DEL;
-          a.table_index_or_table_id = tp->inner_fib_index;
-          a.dst_address = tp->intfc_address;
-          a.dst_address_length = tp->mask_width;
-          a.adj_index = ~0;
-
-          ip4_add_del_route (im, &a);
-          ip4_maybe_remap_adjacencies (im, tp->inner_fib_index, 
-                                       IP4_ROUTE_FLAG_FIB_INDEX);
+          dpo_set(&dpo,
+                  DPO_CLASSIFY,
+                  DPO_PROTO_IP4,
+                  classify_dpo_create(FIB_PROTOCOL_IP4,
+                                      classify_table_index));
+
+          tp->fei = fib_table_entry_special_dpo_add(tp->inner_fib_index,
+                                                    &pfx,
+                                                    FIB_SOURCE_API,
+                                                    FIB_ENTRY_FLAG_EXCLUSIVE,
+                                                    &dpo);
+          dpo_reset(&dpo);
         }
+      else
+        {
+         fib_table_entry_delete(tp->inner_fib_index, &pfx, FIB_SOURCE_API);
+         tp->fei = FIB_NODE_INDEX_INVALID;
+       }
     }
   if (is_add == 0 && found_tunnel)
     {
@@ -1945,3 +2233,44 @@ VLIB_CLI_COMMAND (create_mpls_ethernet_policy_tunnel_command, static) = {
   " classify-table-index <nn>",
   .function = create_mpls_ethernet_policy_tunnel_command_fn,
 };
+
+static clib_error_t *
+mpls_interface_enable_disable (vlib_main_t * vm,
+                               unformat_input_t * input,
+                               vlib_cli_command_t * cmd)
+{
+  vnet_main_t * vnm = vnet_get_main();
+  clib_error_t * error = 0;
+  u32 sw_if_index, enable;
+
+  sw_if_index = ~0;
+
+  if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+    {
+      error = clib_error_return (0, "unknown interface `%U'",
+                                format_unformat_error, input);
+      goto done;
+    }
+
+  if (unformat (input, "enable"))
+      enable = 1;
+  else if (unformat (input, "disable"))
+      enable = 0;
+  else
+    {
+      error = clib_error_return (0, "expected 'enable' or 'disable'",
+                                format_unformat_error, input);
+      goto done;
+    }
+
+  mpls_sw_interface_enable_disable(&mpls_main, sw_if_index, enable);
+
+ done:
+  return error;
+}
+
+VLIB_CLI_COMMAND (set_interface_ip_table_command, static) = {
+  .path = "set interface mpls",
+  .function = mpls_interface_enable_disable,
+  .short_help = "Enable/Disable an interface for MPLS forwarding",
+};
similarity index 74%
rename from vnet/vnet/mpls-gre/mpls.c
rename to vnet/vnet/mpls/mpls.c
index d914b4c..be5e882 100644 (file)
  */
 
 #include <vnet/vnet.h>
-#include <vnet/mpls-gre/mpls.h>
+#include <vnet/mpls/mpls.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/fib/mpls_fib.h>
+
+const static char* mpls_eos_bit_names[] = MPLS_EOS_BITS;
 
 mpls_main_t mpls_main;
 
+u8 * format_mpls_unicast_label (u8 * s, va_list * args)
+{
+  mpls_label_t label = va_arg (*args, mpls_label_t);
+
+  switch (label) {
+  case MPLS_IETF_IPV4_EXPLICIT_NULL_LABEL:
+      s = format (s, "%s", MPLS_IETF_IPV4_EXPLICIT_NULL_STRING);
+      break;
+  case MPLS_IETF_ROUTER_ALERT_LABEL:
+      s = format (s, "%s", MPLS_IETF_ROUTER_ALERT_STRING);
+      break;
+  case MPLS_IETF_IPV6_EXPLICIT_NULL_LABEL:
+      s = format (s, "%s", MPLS_IETF_IPV6_EXPLICIT_NULL_STRING);
+      break;
+  case MPLS_IETF_IMPLICIT_NULL_LABEL:
+      s = format (s, "%s", MPLS_IETF_IMPLICIT_NULL_STRING);
+      break;
+  case MPLS_IETF_ELI_LABEL:
+      s = format (s, "%s", MPLS_IETF_ELI_STRING);
+      break;
+  case MPLS_IETF_GAL_LABEL:
+      s = format (s, "%s", MPLS_IETF_GAL_STRING);
+      break;
+  default:
+      s = format (s, "%d", label);
+      break;
+  }
+  return s;
+}
+
+uword unformat_mpls_unicast_label (unformat_input_t * input, va_list * args)
+{
+  mpls_label_t *label = va_arg (*args, mpls_label_t*);
+  
+  if (unformat (input, MPLS_IETF_IPV4_EXPLICIT_NULL_STRING))
+      *label = MPLS_IETF_IPV4_EXPLICIT_NULL_LABEL;
+  else if (unformat (input, MPLS_IETF_IPV6_EXPLICIT_NULL_STRING))
+      *label = MPLS_IETF_IPV6_EXPLICIT_NULL_LABEL;
+  else if (unformat (input, MPLS_IETF_ROUTER_ALERT_STRING))
+      *label = MPLS_IETF_ROUTER_ALERT_LABEL;
+  else if (unformat (input, MPLS_IETF_IMPLICIT_NULL_STRING))
+      *label = MPLS_IETF_IMPLICIT_NULL_LABEL;
+  else if (unformat (input, "%d", label))
+      ;
+
+  return (1);
+}
+
+u8 * format_mpls_eos_bit (u8 * s, va_list * args)
+{
+  mpls_eos_bit_t eb = va_arg (*args, mpls_eos_bit_t);
+
+  ASSERT(eb <= MPLS_EOS);
+
+  s = format(s, "%s", mpls_eos_bit_names[eb]);
+
+  return (s);
+}
+
+u8 * format_mpls_header (u8 * s, va_list * args)
+{
+  mpls_unicast_header_t hdr = va_arg (*args, mpls_unicast_header_t);
+
+  return (format(s, "[%U:%d:%d:%U]",
+                format_mpls_unicast_label, 
+                vnet_mpls_uc_get_label(hdr.label_exp_s_ttl),
+                vnet_mpls_uc_get_ttl(hdr.label_exp_s_ttl),
+                vnet_mpls_uc_get_exp(hdr.label_exp_s_ttl),
+                format_mpls_eos_bit,
+                vnet_mpls_uc_get_s(hdr.label_exp_s_ttl)));
+}
+
 u8 * format_mpls_gre_tx_trace (u8 * s, va_list * args)
 {
   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
@@ -203,8 +279,9 @@ int vnet_mpls_add_del_encap (ip4_address_t *dest, u32 fib_id,
           
           /* Reformat label into mpls_unicast_header_t */
           label_host_byte_order <<= 12;
-          if (i == vec_len(labels_host_byte_order) - 1)
-            label_host_byte_order |= 1<<8;            /* S=1 */
+         // FIXME NEOS AND EOS
+          //if (i == vec_len(labels_host_byte_order) - 1)
+          //  label_host_byte_order |= 1<<8;            /* S=1 */
           label_host_byte_order |= 0xff;            /* TTL=FF */
           label_net_byte_order = clib_host_to_net_u32 (label_host_byte_order);
           h.label_exp_s_ttl = label_net_byte_order;
@@ -385,7 +462,7 @@ int vnet_mpls_add_del_decap (u32 rx_fib_id,
   rx_fib_index = p[0];
 
   /* L3 decap => transform fib ID to fib index */
-  if (next_index == MPLS_INPUT_NEXT_IP4_INPUT)
+  if (next_index == MPLS_LOOKUP_NEXT_IP4_INPUT)
     {
       p = hash_get (im->fib_index_by_table_id, tx_fib_id);
       if (! p)
@@ -437,12 +514,12 @@ unformat_mpls_gre_input_next (unformat_input_t * input, va_list * args)
 
   if (unformat (input, "lookup"))
     {
-      *result = MPLS_INPUT_NEXT_IP4_INPUT;
+      *result = MPLS_LOOKUP_NEXT_IP4_INPUT;
       rv = 1;
     }
   else if (unformat (input, "output"))
     {
-      *result = MPLS_INPUT_NEXT_L2_OUTPUT;
+      *result = MPLS_LOOKUP_NEXT_L2_OUTPUT;
       rv = 1;
     }
   return rv;
@@ -614,10 +691,7 @@ show_mpls_fib_command_fn (vlib_main_t * vm,
   show_mpls_fib_t *records = 0;
   show_mpls_fib_t *s;
   mpls_main_t * mm = &mpls_main;
-  ip4_main_t * im = &ip4_main;
-  ip4_fib_t * rx_fib, * tx_fib;
-  u32 tx_table_id;
-  char *swif_tag;
+  ip4_fib_t * rx_fib;
 
   hash_foreach (key, value, mm->mpls_encap_by_fib_and_dest, 
   ({
@@ -630,7 +704,6 @@ show_mpls_fib_command_fn (vlib_main_t * vm,
   if (!vec_len(records))
     {
       vlib_cli_output (vm, "MPLS encap table empty");
-      goto decap_table;
     }
   /* sort output by dst address within fib */
   vec_sort_with_function (records, mpls_dest_cmp);
@@ -639,65 +712,174 @@ show_mpls_fib_command_fn (vlib_main_t * vm,
   vlib_cli_output (vm, "%=6s%=16s%=16s", "Table", "Dest address", "Labels");
   vec_foreach (s, records)
     {
-      rx_fib = vec_elt_at_index (im->fibs, s->fib_index);
+      rx_fib = ip4_fib_get (s->fib_index);
       vlib_cli_output (vm, "%=6d%=16U%=16U", rx_fib->table_id, 
                        format_ip4_address, &s->dest,
                        format_mpls_encap_index, mm, s->entry_index);
     }
 
- decap_table:
-  vec_reset_length(records);
+  vec_free(records);
+  return 0;
+}
 
-  hash_foreach (key, value, mm->mpls_decap_by_rx_fib_and_label, 
-  ({
-    vec_add2 (records, s, 1);
-    s->fib_index = (u32)(key>>32);
-    s->entry_index = (u32) value;
-    s->label = ((u32) key)>>12;
-    s->s_bit = (key & (1<<8)) != 0;
-  }));
-  
-  if (!vec_len(records))
-    {
-      vlib_cli_output (vm, "MPLS decap table empty");
-      goto out;
-    }
+VLIB_CLI_COMMAND (show_mpls_fib_command, static) = {
+    .path = "show mpls encap",
+    .short_help = "show mpls encap",
+    .function = show_mpls_fib_command_fn,
+};
 
-  vec_sort_with_function (records, mpls_label_cmp);
+static clib_error_t *
+vnet_mpls_local_label (vlib_main_t * vm,
+                       unformat_input_t * input,
+                       vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, * line_input = &_line_input;
+  fib_route_path_t *rpaths = NULL, rpath;
+  clib_error_t * error = 0;
+  u32 table_id, is_del, is_ip;
+  fib_prefix_t pfx;
+  mpls_label_t local_label;
+  mpls_eos_bit_t eos;
+
+  is_ip = 0;
+  table_id = 0;
+  eos = MPLS_EOS;
+
+   /* Get a line of input. */
+  if (! unformat_user (input, unformat_line_input, line_input))
+    return 0;
 
-  vlib_cli_output (vm, "MPLS decap table");
-  vlib_cli_output (vm, "%=10s%=15s%=6s%=6s", "RX Table", "TX Table/Intfc", 
-                   "Label", "S-bit");
-  vec_foreach (s, records)
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
     {
-      mpls_decap_t * d;
-      d = pool_elt_at_index (mm->decaps, s->entry_index);
-      if (d->next_index == MPLS_INPUT_NEXT_IP4_INPUT)
-        {
-          tx_fib = vec_elt_at_index (im->fibs, d->tx_fib_index);
-          tx_table_id = tx_fib->table_id;
-          swif_tag = "     ";
-        }
+      memset(&rpath, 0, sizeof(rpath));
+      memset(&pfx, 0, sizeof(pfx));
+
+      if (unformat (line_input, "table %d", &table_id))
+       ;
+      else if (unformat (line_input, "del"))
+       is_del = 1;
+      else if (unformat (line_input, "add"))
+       is_del = 0;
+      else if (unformat (line_input, "eos"))
+       eos = MPLS_EOS;
+      else if (unformat (line_input, "non-eos"))
+       eos = MPLS_NON_EOS;
+      else if (unformat (line_input, "%U/%d",
+                        unformat_ip4_address,
+                        &pfx.fp_addr.ip4,
+                        &pfx.fp_len))
+      {
+         pfx.fp_proto = FIB_PROTOCOL_IP4;
+          is_ip = 1;
+      }
+      else if (unformat (line_input, "%U/%d",
+                        unformat_ip6_address,
+                        &pfx.fp_addr.ip6,
+                        &pfx.fp_len))
+      {
+         pfx.fp_proto = FIB_PROTOCOL_IP6;
+          is_ip = 1;
+      }
+      else if (unformat (line_input, "%d", &local_label))
+       ;
+      else if (unformat (line_input,
+                        "ip4-lookup-in-table %d",
+                        &rpath.frp_fib_index))
+      {
+         rpath.frp_label = MPLS_LABEL_INVALID;
+          rpath.frp_proto = FIB_PROTOCOL_IP4;
+          rpath.frp_sw_if_index = FIB_NODE_INDEX_INVALID;
+         vec_add1(rpaths, rpath);
+      }
+      else if (unformat (line_input,
+                        "ip6-lookup-in-table %d",
+                        &rpath.frp_fib_index))
+      {
+         rpath.frp_label = MPLS_LABEL_INVALID;
+          rpath.frp_proto = FIB_PROTOCOL_IP6;
+          rpath.frp_sw_if_index = FIB_NODE_INDEX_INVALID;
+         vec_add1(rpaths, rpath);
+      }
+      else if (unformat (line_input,
+                        "mpls-lookup-in-table %d",
+                        &rpath.frp_fib_index))
+      {
+         rpath.frp_label = MPLS_LABEL_INVALID;
+          rpath.frp_proto = FIB_PROTOCOL_IP4;
+          rpath.frp_sw_if_index = FIB_NODE_INDEX_INVALID;
+         vec_add1(rpaths, rpath);
+      }
       else
-        {
-          tx_table_id = d->tx_fib_index;
-          swif_tag = "(i)  ";
-        }
-      rx_fib = vec_elt_at_index (im->fibs, s->fib_index);
+      {
+          error = clib_error_return (0, "unkown input: %U",
+                                     format_unformat_error, input);
+          goto done;
+      }
 
-      vlib_cli_output (vm, "%=10d%=10d%=5s%=6d%=6d", rx_fib->table_id, 
-                       tx_table_id, swif_tag, s->label, s->s_bit);
     }
 
- out:
-  vec_free(records);
-  return 0;
+  if (is_ip)
+  {
+      u32 fib_index = fib_table_find(pfx.fp_proto, table_id);
+
+      if (FIB_NODE_INDEX_INVALID == fib_index)
+      {
+          error = clib_error_return (0, "%U table-id %d does not exist",
+                                     format_fib_protocol, pfx.fp_proto, table_id);
+          goto done;
+      }
+
+      if (is_del)
+      {
+          fib_table_entry_local_label_remove(fib_index, &pfx, local_label);
+      }
+      else
+      {
+          fib_table_entry_local_label_add(fib_index, &pfx, local_label);
+      }
+  }
+  else
+  {
+      fib_node_index_t lfe, fib_index;
+      fib_prefix_t prefix = {
+         .fp_proto = FIB_PROTOCOL_MPLS,
+         .fp_label = local_label,
+         .fp_eos = eos,
+      };
+
+      fib_index = mpls_fib_index_from_table_id(table_id);
+
+      if (FIB_NODE_INDEX_INVALID == fib_index)
+      {
+          error = clib_error_return (0, "MPLS table-id %d does not exist",
+                                     table_id);
+          goto done;
+      }
+
+      lfe = fib_table_entry_path_add2(fib_index,
+                                     &prefix,
+                                     FIB_SOURCE_CLI,
+                                     FIB_ENTRY_FLAG_NONE,
+                                     rpaths);
+
+      if (FIB_NODE_INDEX_INVALID == lfe)
+      {
+          error = clib_error_return (0, "Failed to create %U-%U in MPLS table-id %d",
+                                     format_mpls_unicast_label, local_label,
+                                     format_mpls_eos_bit, eos,
+                                     table_id);
+          goto done;
+      }
+  }
+
+done:
+  return error;
 }
 
-VLIB_CLI_COMMAND (show_mpls_fib_command, static) = {
-    .path = "show mpls fib",
-    .short_help = "show mpls fib",
-    .function = show_mpls_fib_command_fn,
+VLIB_CLI_COMMAND (mpls_local_label_command, static) = {
+  .path = "mpls local-label",
+  .function = vnet_mpls_local_label,
+  .short_help = "Create/Delete MPL local labels",
 };
 
 int mpls_fib_reset_labels (u32 fib_id)
@@ -764,7 +946,6 @@ static clib_error_t * mpls_init (vlib_main_t * vm)
   mpls_main_t * mm = &mpls_main;
   clib_error_t * error;
 
-  memset (mm, 0, sizeof (mm[0]));
   mm->vlib_main = vm;
   mm->vnet_main = vnet_get_main();
 
similarity index 65%
rename from vnet/vnet/mpls-gre/mpls.h
rename to vnet/vnet/mpls/mpls.h
index d8ffca2..2aeae49 100644 (file)
 
 #include <vnet/vnet.h>
 #include <vnet/gre/gre.h>
-#include <vnet/mpls-gre/packet.h>
+#include <vnet/mpls/packet.h>
+#include <vnet/mpls/mpls_types.h>
 #include <vnet/ip/ip4_packet.h>
 #include <vnet/ethernet/ethernet.h>
+#include <vnet/fib/fib_node.h>
+#include <vnet/adj/adj.h>
 
 typedef CLIB_PACKED (struct {
   ip4_header_t ip4;             /* 20 bytes */
@@ -31,7 +34,7 @@ extern vnet_hw_interface_class_t mpls_gre_hw_interface_class;
 
 typedef enum {
 #define mpls_error(n,s) MPLS_ERROR_##n,
-#include <vnet/mpls-gre/error.def>
+#include <vnet/mpls/error.def>
 #undef mpls_error
   MPLS_N_ERROR,
 } mpls_gre_error_t;
@@ -42,6 +45,7 @@ typedef enum {
  */
 
 typedef struct {
+  fib_node_t mgt_node;
   ip4_address_t tunnel_src;
   ip4_address_t tunnel_dst;
   ip4_address_t intfc_address;
@@ -52,6 +56,9 @@ typedef struct {
   u32 hw_if_index;              /* L2 x-connect capable tunnel intfc */
   u8 * rewrite_data;
   u8 l2_only;
+  fib_node_index_t fei; /* FIB Entry index for the tunnel's destination */
+  adj_index_t adj_index; /* The midchain adj this tunnel creates */
+  u32 sibling_index;
 } mpls_gre_tunnel_t;
 
 typedef struct {
@@ -64,6 +71,7 @@ typedef struct {
   u32 hw_if_index;
   u8 * rewrite_data;
   u8 l2_only;
+  fib_node_index_t fei;
 } mpls_eth_tunnel_t;
 
 typedef struct {
@@ -78,7 +86,53 @@ typedef struct {
   u32 next_index;               /* e.g. ip4/6-input, l2-input */
 } mpls_decap_t;
 
+#define MPLS_FIB_DEFAULT_TABLE_ID 0
+
+/**
+ * The type is exposed here to give the data-plane (DP) fast/inlined access
+ */
+#define MPLS_FIB_KEY_SIZE 21
+#define MPLS_FIB_DB_SIZE (1 << (MPLS_FIB_KEY_SIZE-1))
+
+typedef struct mpls_fib_t_
+{
+  /**
+   * A hash table of entries. 21 bit key
+   * Hash table for reduced memory footprint
+   */
+  uword * mf_entries; 
+
+  /**
+   * The load-balance indices keyed by 21 bit label+eos bit.
+   * A flat array for maximum lookup performance.
+   */
+  index_t mf_lbs[MPLS_FIB_DB_SIZE];
+} mpls_fib_t;
+
+/**
+ * @brief Definition of a callback for receiving MPLS interface state change
+ * notifications
+ */
+typedef void (*mpls_interface_state_change_callback_t)(u32 sw_if_index,
+                                                       u32 is_enable);
+
 typedef struct {
+  /* MPLS FIB index for each software interface */
+  u32 *fib_index_by_sw_if_index;
+
+  /**  A pool of all the MPLS FIBs */
+  struct fib_table_t_ *fibs;
+
+  /** A hash table to lookup the mpls_fib by table ID */
+  uword *fib_index_by_table_id;
+
+  /* rx/tx interface/feature configuration. */
+  ip_config_main_t rx_config_mains, tx_config_main;
+
+  /* Built-in unicast feature path indices, see ip_feature_init_cast(...)  */
+  u32 mpls_rx_feature_lookup;
+  u32 mpls_rx_feature_not_enabled;
+
   /* pool of gre tunnel instances */
   mpls_gre_tunnel_t *gre_tunnels;
   u32 * free_gre_sw_if_indices;
@@ -99,23 +153,53 @@ typedef struct {
   u32 ip4_classify_mpls_policy_encap_next_index;
   u32 ip6_classify_mpls_policy_encap_next_index;
 
+  /* feature path configuration lists */
+  vnet_ip_feature_registration_t * next_feature;
+
+  /* Save feature results for show command */
+  char **feature_nodes;
+
+  /* MPLS enabled count by software interface */
+  u8 * mpls_enabled_by_sw_if_index;
+
+  /* Functions to call when MPLS state on an interface changes. */
+  mpls_interface_state_change_callback_t * mpls_interface_state_change_callbacks;
+
   /* convenience */
   vlib_main_t * vlib_main;
   vnet_main_t * vnet_main;
 } mpls_main_t;
 
-mpls_main_t mpls_main;
+extern mpls_main_t mpls_main;
+
+#define VNET_MPLS_FEATURE_INIT(x,...)                           \
+  __VA_ARGS__ vnet_ip_feature_registration_t uc_##x;            \
+static void __vnet_add_feature_registration_uc_##x (void)       \
+  __attribute__((__constructor__)) ;                            \
+static void __vnet_add_feature_registration_uc_##x (void)       \
+{                                                               \
+  mpls_main_t * mm = &mpls_main;                                \
+  uc_##x.next = mm->next_feature;                               \
+  mm->next_feature = &uc_##x;                                   \
+}                                                               \
+__VA_ARGS__ vnet_ip_feature_registration_t uc_##x
+
+extern clib_error_t * mpls_feature_init(vlib_main_t * vm);
 
 format_function_t format_mpls_protocol;
-format_function_t format_mpls_header;
-format_function_t format_mpls_header_with_length;
 format_function_t format_mpls_gre_header_with_length;
 format_function_t format_mpls_eth_header_with_length;
-format_function_t format_mpls_unicast_label;
 format_function_t format_mpls_encap_index;
 
+format_function_t format_mpls_eos_bit;
+format_function_t format_mpls_unicast_header_net_byte_order;
+format_function_t format_mpls_unicast_label;
+format_function_t format_mpls_header;
+
 extern vlib_node_registration_t mpls_input_node;
 extern vlib_node_registration_t mpls_policy_encap_node;
+extern vlib_node_registration_t mpls_output_node;
+extern vlib_node_registration_t mpls_midchain_node;
 
 extern vnet_device_class_t mpls_gre_device_class;
 
@@ -126,6 +210,7 @@ unformat_function_t unformat_mpls_protocol_net_byte_order;
 unformat_function_t unformat_mpls_label_net_byte_order;
 unformat_function_t unformat_mpls_gre_header;
 unformat_function_t unformat_pg_mpls_gre_header;
+unformat_function_t unformat_mpls_unicast_label;
 
 /* Parse mpls header. */
 unformat_function_t unformat_mpls_header;
@@ -135,6 +220,12 @@ unformat_function_t unformat_pg_mpls_header;
 #define MPLS_GRE_OUTPUT_NEXT_LOOKUP    1
 #define MPLS_GRE_OUTPUT_NEXT_DROP      VNET_INTERFACE_TX_NEXT_DROP
 
+void mpls_sw_interface_enable_disable (mpls_main_t * mm,
+                                      u32 sw_if_index,
+                                      u8 is_enable);
+
+u8 mpls_sw_interface_is_enabled (u32 sw_if_index);
+
 mpls_encap_t * 
 mpls_encap_by_fib_and_dest (mpls_main_t * mm, u32 rx_fib, u32 dst_address);
 
@@ -176,6 +267,7 @@ int vnet_mpls_add_del_encap (ip4_address_t *dest, u32 fib_id,
 int vnet_mpls_policy_tunnel_add_rewrite (mpls_main_t * mm, 
                                          mpls_encap_t * e, 
                                          u32 policy_tunnel_index);
+
 typedef struct {
   u32 lookup_miss;
 
@@ -198,8 +290,7 @@ u8 * format_mpls_gre_header (u8 * s, va_list * args);
 
 #define foreach_mpls_input_next                        \
 _(DROP, "error-drop")                           \
-_(IP4_INPUT, "ip4-input")                       \
-_(L2_OUTPUT, "l2-output")
+_(LOOKUP, "mpls-lookup")
 
 typedef enum {
 #define _(s,n) MPLS_INPUT_NEXT_##s,
@@ -208,6 +299,28 @@ typedef enum {
   MPLS_INPUT_N_NEXT,
 } mpls_input_next_t;
 
+#define foreach_mpls_lookup_next               \
+_(DROP, "error-drop")                           \
+_(IP4_INPUT, "ip4-input")                       \
+_(L2_OUTPUT, "l2-output")
+
+// FIXME remove.
+typedef enum {
+#define _(s,n) MPLS_LOOKUP_NEXT_##s,
+  foreach_mpls_lookup_next
+#undef _
+  MPLS_LOOKUP_N_NEXT,
+} mpls_lookup_next_t;
+
+#define foreach_mpls_output_next               \
+_(DROP, "error-drop")                           
+
+typedef enum {
+#define _(s,n) MPLS_OUTPUT_NEXT_##s,
+  foreach_mpls_output_next
+#undef _
+  MPLS_OUTPUT_N_NEXT,
+} mpls_output_next_t;
 
 typedef struct {
   u32 lookup_miss;
diff --git a/vnet/vnet/mpls/mpls_features.c b/vnet/vnet/mpls/mpls_features.c
new file mode 100644 (file)
index 0000000..d3a726a
--- /dev/null
@@ -0,0 +1,254 @@
+/*
+ * mpls_features.c: MPLS input and output features
+ *
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/mpls/mpls.h>
+
+/*
+ * Shared body of the mpls-punt, mpls-drop and mpls-not-enabled node
+ * functions in this file.
+ *
+ * Passes every buffer index in the frame to vlib_error_drop_buffers()
+ * using next index 0 of the calling node, tagging them with error_code
+ * and with mpls_input_node.index as the node argument.
+ *
+ * Returns the number of packets that were in the frame.
+ */
+always_inline uword
+mpls_terminate (vlib_main_t * vm,
+                vlib_node_runtime_t * node,
+                vlib_frame_t * frame,
+                int error_code)
+{
+  u32 * buffers = vlib_frame_vector_args (frame);
+  uword n_packets = frame->n_vectors;
+
+  vlib_error_drop_buffers (vm, node,
+                           buffers,
+                           /* stride */ 1,
+                           n_packets,
+                           /* next */ 0,
+                           mpls_input_node.index,
+                           error_code);
+
+  return n_packets;
+}
+
+static uword
+mpls_punt (vlib_main_t * vm,
+           vlib_node_runtime_t * node,
+           vlib_frame_t * frame)
+{
+    return (mpls_terminate(vm, node, frame, MPLS_ERROR_PUNT));
+}
+
+VLIB_REGISTER_NODE (mpls_punt_node) = {
+  .function = mpls_punt,
+  .name = "mpls-punt",
+  .vector_size = sizeof (u32),
+
+  .n_next_nodes = 1,
+  .next_nodes = {
+    [0] = "error-punt",
+  },
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (mpls_punt_node, mpls_punt)
+
+static uword
+mpls_drop (vlib_main_t * vm,
+           vlib_node_runtime_t * node,
+           vlib_frame_t * frame)
+{
+    return (mpls_terminate(vm, node, frame, MPLS_ERROR_DROP));
+}
+
+VLIB_REGISTER_NODE (mpls_drop_node) = {
+  .function = mpls_drop,
+  .name = "mpls-drop",
+  .vector_size = sizeof (u32),
+
+  .n_next_nodes = 1,
+  .next_nodes = {
+    [0] = "error-drop",
+  },
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (mpls_drop_node, mpls_drop)
+
+static uword
+mpls_not_enabled (vlib_main_t * vm,
+                  vlib_node_runtime_t * node,
+                  vlib_frame_t * frame)
+{
+    return (mpls_terminate(vm, node, frame, MPLS_ERROR_NOT_ENABLED));
+}
+
+VLIB_REGISTER_NODE (mpls_not_enabled_node) = {
+  .function = mpls_not_enabled,
+  .name = "mpls-not-enabled",
+  .vector_size = sizeof (u32),
+
+  .n_next_nodes = 1,
+  .next_nodes = {
+    [0] = "error-drop",
+  },
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (mpls_not_enabled_node, mpls_not_enabled)
+
+VNET_MPLS_FEATURE_INIT (mpls_lookup, static) = {
+  .node_name = "mpls-lookup",
+  .runs_before = ORDER_CONSTRAINTS {"mpls-not-enabled", 0},
+  .feature_index = &mpls_main.mpls_rx_feature_lookup,
+};
+
+VNET_MPLS_FEATURE_INIT (mpls_not_enabled, static) = {
+  .node_name = "mpls-not-enabled",
+  .runs_before = ORDER_CONSTRAINTS {0}, /* not before any other features */
+  .feature_index = &mpls_main.mpls_rx_feature_not_enabled,
+};
+
+static char * feature_start_nodes[] =
+{
+    "mpls-input",
+};
+
+clib_error_t *
+mpls_feature_init (vlib_main_t * vm)
+{
+  ip_config_main_t * cm = &mpls_main.rx_config_mains;
+  vnet_config_main_t * vcm = &cm->config_main;
+
+  return (ip_feature_init_cast (vm, cm, vcm,
+                                feature_start_nodes,
+                                ARRAY_LEN(feature_start_nodes),
+                                VNET_IP_RX_UNICAST_FEAT,
+                                VNET_L3_PACKET_TYPE_MPLS_UNICAST));
+}
+
+/*
+ * Software-interface add/delete callback for the MPLS RX feature config.
+ *
+ * Makes sure the per-interface vectors (enabled flag, fib index, feature
+ * config index) are long enough for sw_if_index, then adds (is_add != 0)
+ * or removes (is_add == 0) the "mpls-not-enabled" feature on the
+ * interface's config string and stores the resulting config index.
+ * On delete the interface's MPLS-enabled flag is also cleared.
+ *
+ * Always returns 0 (no error).
+ */
+static clib_error_t *
+mpls_sw_interface_add_del (vnet_main_t * vnm,
+                           u32 sw_if_index,
+                           u32 is_add)
+{
+  vlib_main_t * vm = vnm->vlib_main;
+  mpls_main_t * mm = &mpls_main;
+  ip_config_main_t * cm = &mm->rx_config_mains;
+  vnet_config_main_t * vcm = &cm->config_main;
+  u32 drop_feature_index;
+  u32 ci;
+
+  /* new slots default to: not enabled, fib index 0, no config (~0) */
+  vec_validate_init_empty (mm->mpls_enabled_by_sw_if_index, sw_if_index, 0);
+  vec_validate_init_empty (mm->fib_index_by_sw_if_index, sw_if_index, 0);
+  vec_validate_init_empty (cm->config_index_by_sw_if_index, sw_if_index, ~0);
+  ci = cm->config_index_by_sw_if_index[sw_if_index];
+
+  drop_feature_index = mm->mpls_rx_feature_not_enabled;
+
+  if (is_add)
+    ci = vnet_config_add_feature (vm, vcm, ci,
+                                  drop_feature_index,
+                                  /* config data */ 0,
+                                  /* # bytes of config data */ 0);
+  else
+   {
+     ci = vnet_config_del_feature (vm, vcm, ci,
+                                   drop_feature_index,
+                                   /* config data */ 0,
+                                   /* # bytes of config data */ 0);
+     mm->mpls_enabled_by_sw_if_index[sw_if_index] = 0;
+   }
+
+  cm->config_index_by_sw_if_index[sw_if_index] = ci;
+
+  return /* no error */ 0;
+}
+
+VNET_SW_INTERFACE_ADD_DEL_FUNCTION (mpls_sw_interface_add_del);
+
+static clib_error_t *
+show_mpls_features_command_fn (vlib_main_t * vm,
+                               unformat_input_t * input,
+                               vlib_cli_command_t * cmd)
+{
+  mpls_main_t * mm = &mpls_main;
+  int i;
+  char ** features;
+
+  vlib_cli_output (vm, "Available MPLS feature nodes");
+
+  do {
+    features = mm->feature_nodes;
+    for (i = 0; i < vec_len(features); i++)
+      vlib_cli_output (vm, "  %s\n", features[i]);
+  } while(0);
+
+  return 0;
+}
+
+/* CLI: "show mpls features" - lists the names in mpls_main.feature_nodes. */
+VLIB_CLI_COMMAND (show_mpls_features_command, static) = {
+  .path = "show mpls features",
+  .short_help = "show mpls features",
+  .function = show_mpls_features_command_fn,
+};
+
+static clib_error_t *
+show_mpls_interface_features_command_fn (vlib_main_t * vm,
+                                         unformat_input_t * input,
+                                         vlib_cli_command_t * cmd)
+{
+  vnet_main_t * vnm = vnet_get_main();
+  mpls_main_t * mm = &mpls_main;
+
+  ip_config_main_t * cm;
+  vnet_config_main_t * vcm;
+  vnet_config_t * cfg;
+  u32 cfg_index;
+  vnet_config_feature_t * feat;
+  vlib_node_t * n;
+  u32 sw_if_index;
+  u32 node_index;
+  u32 current_config_index;
+  int i;
+
+  if (! unformat (input, "%U", unformat_vnet_sw_interface,
+                  vnm, &sw_if_index))
+    return clib_error_return (0, "Interface not specified...");
+
+  vlib_cli_output (vm, "MPLS feature paths configured on %U...",
+                   format_vnet_sw_if_index_name, vnm, sw_if_index);
+
+  cm = &mm->rx_config_mains;
+  vcm = &cm->config_main;
+
+  current_config_index = vec_elt (cm->config_index_by_sw_if_index,
+                                  sw_if_index);
+
+  ASSERT(current_config_index
+         < vec_len (vcm->config_pool_index_by_user_index));
+
+  cfg_index =
+      vcm->config_pool_index_by_user_index[current_config_index];
+  cfg = pool_elt_at_index (vcm->config_pool, cfg_index);
+
+  for (i = 0; i < vec_len(cfg->features); i++)
+  {
+      feat = cfg->features + i;
+      node_index = feat->node_index;
+      n = vlib_get_node (vm, node_index);
+      vlib_cli_output (vm, "  %v", n->name);
+  }
+
+  return 0;
+}
+
+VLIB_CLI_COMMAND (show_mpls_interface_features_command, static) = {
+  .path = "show mpls interface features",
+  .short_help = "show mpls interface features <intfc>",
+  .function = show_mpls_interface_features_command_fn,
+};
+
diff --git a/vnet/vnet/mpls/mpls_lookup.c b/vnet/vnet/mpls/mpls_lookup.c
new file mode 100644 (file)
index 0000000..31ad68c
--- /dev/null
@@ -0,0 +1,278 @@
+/*
+ * mpls_lookup.c: MPLS lookup and load-balance graph nodes
+ *
+ * Copyright (c) 2012-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/mpls/mpls.h>
+#include <vnet/fib/mpls_fib.h>
+#include <vnet/dpo/load_balance.h>
+
+vlib_node_registration_t mpls_lookup_node;
+
+typedef struct {
+  u32 next_index;
+  u32 lb_index;
+  u32 lfib_index;
+  u32 label_net_byte_order;
+} mpls_lookup_trace_t;
+
+/*
+ * Trace formatter for the mpls-lookup node.
+ *
+ * Expects (vlib_main_t *, vlib_node_t *, mpls_lookup_trace_t *) on the
+ * va_list and appends the next index, lookup fib index, load-balance
+ * index, label and EOS bit to s. Returns the extended vector.
+ */
+static u8 *
+format_mpls_lookup_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  mpls_lookup_trace_t * t = va_arg (*args, mpls_lookup_trace_t *);
+  /*
+   * The header word is recorded in network byte order. Convert it once so
+   * that the label AND the EOS bit are both decoded from host byte order.
+   */
+  u32 hdr = clib_net_to_host_u32(t->label_net_byte_order);
+
+  s = format (s, "MPLS: next [%d], lookup fib index %d, LB index %d "
+              "label %d eos %d", 
+              t->next_index, t->lfib_index, t->lb_index,
+              vnet_mpls_uc_get_label(hdr),
+              vnet_mpls_uc_get_s(hdr));
+  return s;
+}
+
+/*
+ * Compute flow hash.
+ * We'll use it to select which load-balance bucket to use for this flow.
+ *
+ * hdr points at the MPLS header as it sits in the packet, i.e. its
+ * label_exp_s_ttl word is in network byte order; it is converted to host
+ * order before the label is extracted. flow_hash_config is currently
+ * unused.
+ *
+ * Returns the label value of the header.
+ */
+always_inline u32
+mpls_compute_flow_hash (const mpls_unicast_header_t * hdr,
+                        flow_hash_config_t flow_hash_config)
+{
+    // FIXME: hash on more than the top label and honour flow_hash_config.
+    return (vnet_mpls_uc_get_label(
+                clib_net_to_host_u32(hdr->label_exp_s_ttl)));
+}
+
+static inline uword
+mpls_lookup (vlib_main_t * vm,
+             vlib_node_runtime_t * node,
+             vlib_frame_t * from_frame)
+{
+  vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters;
+  u32 n_left_from, next_index, * from, * to_next;
+  mpls_main_t * mm = &mpls_main;
+  u32 cpu_index = os_get_cpu_number();
+
+  from = vlib_frame_vector_args (from_frame);
+  n_left_from = from_frame->n_vectors;
+  next_index = node->cached_next_index;
+
+  while (n_left_from > 0)
+    {
+      u32 n_left_to_next;
+
+      vlib_get_next_frame (vm, node, next_index,
+                           to_next, n_left_to_next);
+
+      while (n_left_from > 0 && n_left_to_next > 0)
+      {
+          u32 lbi0, next0, lfib_index0, bi0, hash_c0;
+          const mpls_unicast_header_t * h0;
+          const load_balance_t *lb0;
+          const dpo_id_t *dpo0;
+          vlib_buffer_t * b0;
+
+          bi0 = from[0];
+          to_next[0] = bi0;
+          from += 1;
+          to_next += 1;
+          n_left_from -= 1;
+          n_left_to_next -= 1;
+
+          b0 = vlib_get_buffer (vm, bi0);
+          h0 = vlib_buffer_get_current (b0);
+
+          lfib_index0 = vec_elt(mm->fib_index_by_sw_if_index,
+                                vnet_buffer(b0)->sw_if_index[VLIB_RX]);
+
+          lbi0 = mpls_fib_table_forwarding_lookup (lfib_index0, h0);
+         lb0 = load_balance_get(lbi0);
+
+          hash_c0 = vnet_buffer(b0)->ip.flow_hash = 0;
+          if (PREDICT_FALSE(lb0->lb_n_buckets > 1))
+          {
+              hash_c0 = vnet_buffer (b0)->ip.flow_hash = 
+                  mpls_compute_flow_hash(h0, lb0->lb_hash_config);
+          }
+
+         ASSERT (lb0->lb_n_buckets > 0);
+         ASSERT (is_pow2 (lb0->lb_n_buckets));
+
+         dpo0 = load_balance_get_bucket_i(lb0,
+                                           (hash_c0 &
+                                            (lb0->lb_n_buckets_minus_1)));
+
+         next0 = dpo0->dpoi_next_node;
+         vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+
+         vlib_increment_combined_counter 
+              (cm, cpu_index, lbi0, 1,
+               vlib_buffer_length_in_chain (vm, b0));
+
+          /*
+           * pop the label that was just used in the lookup
+           */
+          vlib_buffer_advance(b0, sizeof(*h0));
+
+          if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+          {
+              mpls_lookup_trace_t *tr = vlib_add_trace (vm, node,
+                                                       b0, sizeof (*tr));
+              tr->next_index = next0;
+              tr->lb_index = lbi0;
+              tr->lfib_index = lfib_index0;
+              tr->label_net_byte_order = h0->label_exp_s_ttl;
+          }
+
+          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+                                           to_next, n_left_to_next,
+                                           bi0, next0);
+        }
+
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+  vlib_node_increment_counter (vm, mpls_lookup_node.index,
+                               MPLS_ERROR_PKTS_DECAP, from_frame->n_vectors);
+  return from_frame->n_vectors;
+}
+
+static char * mpls_error_strings[] = {
+#define mpls_error(n,s) s,
+#include "error.def"
+#undef mpls_error
+};
+
+VLIB_REGISTER_NODE (mpls_lookup_node) = {
+  .function = mpls_lookup,
+  .name = "mpls-lookup",
+  /* Takes a vector of packets. */
+  .vector_size = sizeof (u32),
+  .n_errors = MPLS_N_ERROR,
+  .error_strings = mpls_error_strings,
+
+  .sibling_of = "ip4-lookup",
+
+  .format_buffer = format_mpls_gre_header_with_length,
+  .format_trace = format_mpls_lookup_trace,
+  .unformat_buffer = unformat_mpls_gre_header,
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (mpls_lookup_node, mpls_lookup)
+
+typedef struct {
+  u32 next_index;
+  u32 lb_index;
+} mpls_load_balance_trace_t;
+
+static u8 *
+format_mpls_load_balance_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  mpls_load_balance_trace_t * t = va_arg (*args, mpls_load_balance_trace_t *);
+
+  s = format (s, "MPLS: next [%d], LB index %d ", 
+              t->next_index, t->lb_index);
+  return s;
+}
+
+/*
+ * Node function for mpls-load-balance.
+ *
+ * For each packet: reads the load-balance index left in
+ * ip.adj_index[VLIB_TX], computes and stores the MPLS flow hash, picks the
+ * bucket (hash & lb_n_buckets_minus_1), writes the bucket's DPO index back
+ * to ip.adj_index[VLIB_TX] and forwards the packet to the bucket's next
+ * node. The load-balance "via" combined counter is bumped per packet.
+ * Traced packets get an mpls_load_balance_trace_t record, which is what
+ * the node's registered format_trace function decodes.
+ *
+ * Returns the number of packets in the frame.
+ */
+always_inline uword
+mpls_load_balance (vlib_main_t * vm,
+                 vlib_node_runtime_t * node,
+                 vlib_frame_t * frame)
+{
+  vlib_combined_counter_main_t * cm = &load_balance_main.lbm_via_counters;
+  u32 n_left_from, n_left_to_next, * from, * to_next;
+  ip_lookup_next_t next;
+  u32 cpu_index = os_get_cpu_number();
+
+  from = vlib_frame_vector_args (frame);
+  n_left_from = frame->n_vectors;
+  next = node->cached_next_index;
+
+  while (n_left_from > 0)
+    {
+      vlib_get_next_frame (vm, node, next,
+                          to_next, n_left_to_next);
+
+
+      while (n_left_from > 0 && n_left_to_next > 0)
+       {
+         const mpls_unicast_header_t *hdr0;
+         const load_balance_t *lb0;
+         u32 pi0, lbi0, hc0, next0;
+         const dpo_id_t *dpo0;
+         vlib_buffer_t * p0;
+
+         /* speculatively enqueue to the current next frame */
+         pi0 = from[0];
+         to_next[0] = pi0;
+
+         p0 = vlib_get_buffer (vm, pi0);
+
+         hdr0 = vlib_buffer_get_current (p0);
+         lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
+
+         lb0 = load_balance_get(lbi0);
+         hc0 = lb0->lb_hash_config;
+         vnet_buffer(p0)->ip.flow_hash = mpls_compute_flow_hash(hdr0, hc0);
+
+         dpo0 = load_balance_get_bucket_i(lb0,
+                                          vnet_buffer(p0)->ip.flow_hash &
+                                          (lb0->lb_n_buckets_minus_1));
+
+         next0 = dpo0->dpoi_next_node;
+         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+
+         vlib_increment_combined_counter
+              (cm, cpu_index, lbi0, 1,
+               vlib_buffer_length_in_chain (vm, p0));
+
+         /* record what was decided, for format_mpls_load_balance_trace */
+         if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
+           {
+             mpls_load_balance_trace_t *tr =
+                 vlib_add_trace (vm, node, p0, sizeof (*tr));
+             tr->next_index = next0;
+             tr->lb_index = lbi0;
+           }
+
+         from += 1;
+         to_next += 1;
+         n_left_to_next -= 1;
+         n_left_from -= 1;
+
+         /* wrong guess: undo the speculative enqueue and switch frames */
+         if (PREDICT_FALSE (next0 != next))
+           {
+             n_left_to_next += 1;
+             vlib_put_next_frame (vm, node, next, n_left_to_next);
+             next = next0;
+             vlib_get_next_frame (vm, node, next,
+                                  to_next, n_left_to_next);
+             to_next[0] = pi0;
+             to_next += 1;
+             n_left_to_next -= 1;
+           }
+       }
+
+      vlib_put_next_frame (vm, node, next, n_left_to_next);
+    }
+
+  return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (mpls_load_balance_node) = {
+  .function = mpls_load_balance,
+  .name = "mpls-load-balance",
+  .vector_size = sizeof (u32),
+  .sibling_of = "mpls-lookup",
+
+  .format_trace = format_mpls_load_balance_trace,
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (mpls_load_balance_node, mpls_load_balance)
diff --git a/vnet/vnet/mpls/mpls_output.c b/vnet/vnet/mpls/mpls_output.c
new file mode 100644 (file)
index 0000000..932fcb8
--- /dev/null
@@ -0,0 +1,343 @@
+/*
+ * mpls_output.c: MPLS Adj rewrite
+ *
+ * Copyright (c) 2012-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/mpls/mpls.h>
+
+typedef struct {
+  /* Adjacency taken. */
+  u32 adj_index;
+  u32 flow_hash;
+
+  /* Packet data, possibly *after* rewrite. */
+  u8 packet_data[64 - 1*sizeof(u32)];
+} mpls_output_trace_t;
+
+static u8 *
+format_mpls_output_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  mpls_output_trace_t * t = va_arg (*args, mpls_output_trace_t *);
+  vnet_main_t * vnm = vnet_get_main();
+  uword indent = format_get_indent (s);
+
+  s = format (s, "adj-idx %d : %U flow hash: 0x%08x",
+              t->adj_index,
+              format_ip_adjacency, vnm, t->adj_index, FORMAT_IP_ADJACENCY_NONE,
+             t->flow_hash);
+  s = format (s, "\n%U%U",
+              format_white_space, indent,
+              format_ip_adjacency_packet_data,
+              vnm, t->adj_index,
+              t->packet_data, sizeof (t->packet_data));
+  return s;
+}
+
+/*
+ * Common worker behind the mpls-output node function (and the
+ * mpls_midchain wrapper defined in this file).
+ *
+ * For each packet: fetches the adjacency named by ip.adj_index[VLIB_TX],
+ * writes the adjacency's rewrite in front of the current MPLS header,
+ * checks the packet length against the adjacency's max_l3_packet_bytes
+ * and then either exposes the rewrite and forwards the packet to the
+ * adjacency's next node, or sends it to MPLS_OUTPUT_NEXT_DROP.
+ * The whole frame is counted as MPLS_ERROR_PKTS_ENCAP on mpls-output.
+ *
+ * Returns the number of packets in the frame.
+ */
+static inline uword
+mpls_output_inline (vlib_main_t * vm,
+                    vlib_node_runtime_t * node,
+                    vlib_frame_t * from_frame)
+{
+  u32 n_left_from, next_index, * from, * to_next, cpu_index;
+  vlib_node_runtime_t * error_node;
+
+  cpu_index = os_get_cpu_number();
+  /* errors are always attributed to mpls-output, whichever node runs this */
+  error_node = vlib_node_get_runtime (vm, mpls_output_node.index);
+  from = vlib_frame_vector_args (from_frame);
+  n_left_from = from_frame->n_vectors;
+  next_index = node->cached_next_index;
+
+  while (n_left_from > 0)
+    {
+      u32 n_left_to_next;
+
+      vlib_get_next_frame (vm, node, next_index,
+                           to_next, n_left_to_next);
+
+      while (n_left_from > 0 && n_left_to_next > 0)
+        {
+         ip_adjacency_t * adj0;
+          mpls_unicast_header_t *hdr0;
+         vlib_buffer_t * p0;
+         u32 pi0, rw_len0, adj_index0, next0, error0;
+
+         pi0 = to_next[0] = from[0];
+
+         p0 = vlib_get_buffer (vm, pi0);
+
+         adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
+
+          /* We should never rewrite a pkt using the MISS adjacency */
+          ASSERT(adj_index0);
+
+         adj0 = adj_get(adj_index0);
+         hdr0 = vlib_buffer_get_current (p0);
+
+         /* Guess we are only writing a simple Ethernet header. */
+          vnet_rewrite_one_header (adj0[0], hdr0, 
+                                   sizeof (ethernet_header_t));
+          
+          /* Update packet buffer attributes/set output interface. */
+          rw_len0 = adj0[0].rewrite_header.data_bytes;
+          
+          /* Rewrites longer than an Ethernet header: account for the
+           * extra bytes in the adjacency counter (no packet increment). */
+          if (PREDICT_FALSE (rw_len0 > sizeof(ethernet_header_t)))
+              vlib_increment_combined_counter 
+                  (&adjacency_counters,
+                   cpu_index, adj_index0, 
+                   /* packet increment */ 0,
+                   /* byte increment */ rw_len0-sizeof(ethernet_header_t));
+          
+          /* Check MTU of outgoing interface. */
+          /* NOTE(review): error0 takes IP4_ERROR_* values but indexes the
+           * error table of mpls-output, which is registered with
+           * MPLS_N_ERROR / mpls_error_strings - confirm the indices are
+           * meant to line up. */
+          error0 = (vlib_buffer_length_in_chain (vm, p0) 
+                    > adj0[0].rewrite_header.max_l3_packet_bytes
+                    ? IP4_ERROR_MTU_EXCEEDED
+                    : IP4_ERROR_NONE);
+
+         p0->error = error_node->errors[error0];
+
+          /* Only move the buffer start back over the rewrite when the MTU
+           * check passed; on error the buffer is left pointing at the
+           * MPLS header and the packet goes to the drop next.
+           * NOTE(review): the original comment here spoke of a "ttl issue"
+           * and the icmp-error node wanting to see the IP header; no TTL
+           * check is made in this function. */
+          if (PREDICT_TRUE(error0 == IP4_ERROR_NONE))
+            {
+              p0->current_data -= rw_len0;
+              p0->current_length += rw_len0;
+
+              vnet_buffer (p0)->sw_if_index[VLIB_TX] =
+                  adj0[0].rewrite_header.sw_if_index;
+              next0 = adj0[0].rewrite_header.next_index;
+            }
+          else
+            {
+              next0 = MPLS_OUTPUT_NEXT_DROP;
+            }
+
+         from += 1;
+         n_left_from -= 1;
+         to_next += 1;
+         n_left_to_next -= 1;
+      
+          /* NOTE(review): tr->packet_data is not filled in here although
+           * format_mpls_output_trace prints it. */
+          if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) 
+            {
+              mpls_output_trace_t *tr = vlib_add_trace (vm, node, 
+                                                        p0, sizeof (*tr));
+              tr->adj_index = vnet_buffer(p0)->ip.adj_index[VLIB_TX];
+              tr->flow_hash = vnet_buffer(p0)->ip.flow_hash;
+            }
+
+         vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+                                          to_next, n_left_to_next,
+                                          pi0, next0);
+        }
+
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+  vlib_node_increment_counter (vm, mpls_output_node.index,
+                               MPLS_ERROR_PKTS_ENCAP,
+                               from_frame->n_vectors);
+
+  return from_frame->n_vectors;
+}
+
+static char * mpls_error_strings[] = {
+#define mpls_error(n,s) s,
+#include "error.def"
+#undef mpls_error
+};
+
+static inline uword
+mpls_output (vlib_main_t * vm,
+             vlib_node_runtime_t * node,
+             vlib_frame_t * from_frame)
+{
+    return (mpls_output_inline(vm, node, from_frame));
+}
+
+VLIB_REGISTER_NODE (mpls_output_node) = {
+  .function = mpls_output,
+  .name = "mpls-output",
+  /* Takes a vector of packets. */
+  .vector_size = sizeof (u32),
+  .n_errors = MPLS_N_ERROR,
+  .error_strings = mpls_error_strings,
+
+  .n_next_nodes = MPLS_OUTPUT_N_NEXT,
+  .next_nodes = {
+#define _(s,n) [MPLS_OUTPUT_NEXT_##s] = n,
+    foreach_mpls_output_next
+#undef _
+  },
+
+  .format_trace = format_mpls_output_trace,
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (mpls_output_node, mpls_output)
+
+static inline uword
+mpls_midchain (vlib_main_t * vm,
+               vlib_node_runtime_t * node,
+               vlib_frame_t * from_frame)
+{
+    return (mpls_output_inline(vm, node, from_frame));
+}
+
+/*
+ * mpls-midchain: shares its next nodes with mpls-output (sibling_of) and
+ * uses the same trace format. Its node function is mpls_midchain, the
+ * same function the multiarch declaration for this node names.
+ */
+VLIB_REGISTER_NODE (mpls_midchain_node) = {
+  .function = mpls_midchain,
+  .name = "mpls-midchain",
+  .vector_size = sizeof (u32),
+
+  .format_trace = format_mpls_output_trace,
+
+  .sibling_of = "mpls-output",
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (mpls_midchain_node, mpls_midchain)
+
+/**
+ * @brief Next index values from the MPLS incomplete adj node
+ */
+#define foreach_mpls_adj_incomplete_next               \
+_(DROP, "error-drop")                   \
+_(IP4,  "ip4-arp")                      \
+_(IP6,  "ip6-discover-neighbor")
+
+typedef enum {
+#define _(s,n) MPLS_ADJ_INCOMPLETE_NEXT_##s,
+  foreach_mpls_adj_incomplete_next
+#undef _
+  MPLS_ADJ_INCOMPLETE_N_NEXT,
+} mpls_adj_incomplete_next_t;
+
+/**
+ * @brief A struct to hold tracing information for the MPLS incomplete
+ * adjacency node.
+ */
+typedef struct mpls_adj_incomplete_trace_t_
+{
+    /* next index the packet was sent to */
+    u32 next;
+} mpls_adj_incomplete_trace_t;
+
+
+/**
+ * @brief Graph node for incomplete MPLS adjacency.
+ * This node will push traffic to either the v4-arp or v6-nd node
+ * based on the next-hop proto of the adj.
+ * We pay a cost for this 'routing' node, but an incomplete adj is the
+ * exception case.
+ */
+static inline uword
+mpls_adj_incomplete (vlib_main_t * vm,
+                     vlib_node_runtime_t * node,
+                     vlib_frame_t * from_frame)
+{
+    u32 n_left_from, next_index, * from, * to_next;
+
+  from = vlib_frame_vector_args (from_frame);
+  n_left_from = from_frame->n_vectors;
+  next_index = node->cached_next_index;
+
+  while (n_left_from > 0)
+    {
+      u32 n_left_to_next;
+
+      vlib_get_next_frame (vm, node, next_index,
+                           to_next, n_left_to_next);
+
+      while (n_left_from > 0 && n_left_to_next > 0)
+        {
+          u32 pi0, next0, adj_index0;
+         ip_adjacency_t * adj0;
+         vlib_buffer_t * p0;
+
+         pi0 = to_next[0] = from[0];
+         p0 = vlib_get_buffer (vm, pi0);
+         from += 1;
+         n_left_from -= 1;
+         to_next += 1;
+         n_left_to_next -= 1;
+
+          adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
+          ASSERT(adj_index0);
+
+         adj0 = adj_get(adj_index0);
+
+          if (PREDICT_TRUE(FIB_PROTOCOL_IP4 == adj0->ia_nh_proto))
+          {
+              next0 = MPLS_ADJ_INCOMPLETE_NEXT_IP4;
+          }
+          else
+          {
+              next0 = MPLS_ADJ_INCOMPLETE_NEXT_IP6;
+          }              
+
+         if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) 
+         {
+             mpls_adj_incomplete_trace_t *tr =
+                 vlib_add_trace (vm, node, p0, sizeof (*tr));
+             tr->next = next0;
+         }
+
+         vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+                                          to_next, n_left_to_next,
+                                          pi0, next0);
+        }
+
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+
+  return from_frame->n_vectors;
+}
+
+static u8 *
+format_mpls_adj_incomplete_trace (u8 * s, va_list * args)
+{
+    CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+    CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+    mpls_adj_incomplete_trace_t * t;
+    uword indent;
+
+    t = va_arg (*args, mpls_adj_incomplete_trace_t *);
+    indent = format_get_indent (s);
+
+    s = format (s, "%Unext:%d",
+                format_white_space, indent,
+                t->next);
+    return (s);
+}
+
+/*
+ * mpls-adj-incomplete: steers packets that hit an incomplete adjacency to
+ * the next nodes listed in foreach_mpls_adj_incomplete_next.
+ */
+VLIB_REGISTER_NODE (mpls_adj_incomplete_node) = {
+  .function = mpls_adj_incomplete,
+  .name = "mpls-adj-incomplete",
+  /*
+   * Trace records written by this node are mpls_adj_incomplete_trace_t, so
+   * this must be the only format_trace initialiser: a later one would
+   * silently override it.
+   */
+  .format_trace = format_mpls_adj_incomplete_trace,
+  /* Takes a vector of packets. */
+  .vector_size = sizeof (u32),
+  .n_errors = MPLS_N_ERROR,
+  .error_strings = mpls_error_strings,
+
+  .n_next_nodes = MPLS_ADJ_INCOMPLETE_N_NEXT,
+  .next_nodes = {
+#define _(s,n) [MPLS_ADJ_INCOMPLETE_NEXT_##s] = n,
+    foreach_mpls_adj_incomplete_next
+#undef _
+  },
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (mpls_adj_incomplete_node,
+                              mpls_adj_incomplete)
diff --git a/vnet/vnet/mpls/mpls_types.h b/vnet/vnet/mpls/mpls_types.h
new file mode 100644 (file)
index 0000000..d7c629d
--- /dev/null
@@ -0,0 +1,39 @@
+#ifndef __MPLS_TYPES_H__
+#define __MPLS_TYPES_H__
+
+#define MPLS_IETF_MIN_LABEL                  0x00000
+#define MPLS_IETF_MAX_LABEL                  0xfffff
+
+#define MPLS_IETF_MIN_RESERVED_LABEL         0x00000
+#define MPLS_IETF_MAX_RESERVED_LABEL         0x0000f
+
+#define MPLS_IETF_MIN_UNRES_LABEL            0x00010
+#define MPLS_IETF_MAX_UNRES_LABEL            0xfffff
+
+#define MPLS_IETF_IPV4_EXPLICIT_NULL_LABEL   0x00000
+#define MPLS_IETF_ROUTER_ALERT_LABEL         0x00001
+#define MPLS_IETF_IPV6_EXPLICIT_NULL_LABEL   0x00002
+#define MPLS_IETF_IMPLICIT_NULL_LABEL        0x00003
+#define MPLS_IETF_ELI_LABEL                  0x00007
+#define MPLS_IETF_GAL_LABEL                  0x0000D
+
+#define MPLS_IETF_IPV4_EXPLICIT_NULL_STRING          "ip4-explicit-null"
+#define MPLS_IETF_IPV4_EXPLICIT_NULL_BRIEF_STRING    "e-nul"
+#define MPLS_IETF_IMPLICIT_NULL_STRING               "implicit-null"
+#define MPLS_IETF_IMPLICIT_NULL_BRIEF_STRING         "i-nul"
+#define MPLS_IETF_ROUTER_ALERT_STRING                "router-alert"
+#define MPLS_IETF_ROUTER_ALERT_BRIEF_STRING          "r-alt"
+#define MPLS_IETF_IPV6_EXPLICIT_NULL_STRING          "ipv6-explicit-null"
+#define MPLS_IETF_IPV6_EXPLICIT_NULL_BRIEF_STRING    "v6enl"
+#define MPLS_IETF_ELI_STRING                         "entropy-label-indicator"
+#define MPLS_IETF_ELI_BRIEF_STRING                   "eli"
+#define MPLS_IETF_GAL_STRING                         "gal"
+#define MPLS_IETF_GAL_BRIEF_STRING                   "gal"
+
+#define MPLS_LABEL_INVALID (MPLS_IETF_MAX_LABEL+1)
+
+/*
+ * True when _lbl lies in the unreserved label range, i.e.
+ * MPLS_IETF_MIN_UNRES_LABEL .. MPLS_IETF_MAX_UNRES_LABEL, both inclusive.
+ * The reserved labels and MPLS_LABEL_INVALID yield false.
+ */
+#define MPLS_LABEL_IS_REAL(_lbl) \
+    (((_lbl) >= MPLS_IETF_MIN_UNRES_LABEL) &&  \
+     ((_lbl) <= MPLS_IETF_MAX_UNRES_LABEL))
+
+#endif
diff --git a/vnet/vnet/mpls/node.c b/vnet/vnet/mpls/node.c
new file mode 100644 (file)
index 0000000..6801cc7
--- /dev/null
@@ -0,0 +1,223 @@
+/*
+ * node.c: MPLS input node ("mpls-input") processing
+ *
+ * Copyright (c) 2012-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/mpls/mpls.h>
+
+typedef struct {
+  u32 next_index;              /* next-node index chosen for the traced packet */
+  u32 label_host_byte_order;   /* label/exp/s/ttl header word after net-to-host conversion */
+} mpls_input_trace_t;
+/* Trace formatter for mpls-input: prints the next node name/index plus the label and TTL of the traced header word. */
+static u8 *
+format_mpls_input_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  mpls_input_trace_t * t = va_arg (*args, mpls_input_trace_t *);
+  char * next_name;
+
+  next_name = "BUG!"; /* stays if next_index matches none of the known next nodes */
+
+#define _(a,b) if (t->next_index == MPLS_INPUT_NEXT_##a) next_name = b;
+  foreach_mpls_input_next;
+#undef _
+  
+  s = format (s, "MPLS: next %s[%d]  label %d ttl %d", 
+              next_name, t->next_index,
+             vnet_mpls_uc_get_label(t->label_host_byte_order),
+             vnet_mpls_uc_get_ttl(t->label_host_byte_order));
+
+  return s;
+}
+
+vlib_node_registration_t mpls_input_node;
+/* Runtime data of the mpls-input node. NOTE(review): the last_* fields are written but never read in node.c - confirm still needed. */
+typedef struct {
+  u32 last_label;              /* set to ~0 at setup and at the start of every frame */
+  u32 last_inner_fib_index;    /* set to 0 at setup */
+  u32 last_outer_fib_index;    /* set to 0 at setup */
+  mpls_main_t * mpls_main;     /* set to &mpls_main at setup; source of rx_config_mains */
+} mpls_input_runtime_t;
+/* mpls-input worker: per packet, drop when TTL is 0, otherwise take the next node from the RX interface's config. Returns the frame's vector count. */
+static inline uword
+mpls_input_inline (vlib_main_t * vm,
+                   vlib_node_runtime_t * node,
+                   vlib_frame_t * from_frame)
+{
+  u32 n_left_from, next_index, * from, * to_next;
+  mpls_input_runtime_t * rt;
+  mpls_main_t * mm;
+  u32 cpu_index = os_get_cpu_number();
+  vlib_simple_counter_main_t * cm;
+  vnet_main_t * vnm = vnet_get_main();
+
+  from = vlib_frame_vector_args (from_frame);
+  n_left_from = from_frame->n_vectors;
+  rt = vlib_node_get_runtime_data (vm, mpls_input_node.index);
+  mm = rt->mpls_main;
+  /* 
+   * Invalidate the cached label at the start of every frame.
+   * NOTE(review): last_label is never read in this file - confirm still needed.
+   */
+  rt->last_label = ~0;
+
+  next_index = node->cached_next_index;
+  /* Software-interface simple counter selected by VNET_INTERFACE_COUNTER_MPLS. */
+  cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+                         VNET_INTERFACE_COUNTER_MPLS);
+
+  while (n_left_from > 0)
+    {
+      u32 n_left_to_next;
+
+      vlib_get_next_frame (vm, node, next_index,
+                          to_next, n_left_to_next);
+
+      while (n_left_from > 0 && n_left_to_next > 0)
+       {
+         u32 bi0;
+         vlib_buffer_t * b0;
+         mpls_unicast_header_t * h0;
+          u32 label0;
+         u32 next0;
+         ip_config_main_t * cm0;
+          u32 sw_if_index0;
+
+         bi0 = from[0];
+         to_next[0] = bi0;
+         from += 1;
+         to_next += 1;
+         n_left_from -= 1;
+         n_left_to_next -= 1;
+
+         b0 = vlib_get_buffer (vm, bi0);
+          h0 = vlib_buffer_get_current (b0);
+         sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+         /* Start from the config index stored for the RX interface. */
+         cm0 = &mm->rx_config_mains;
+         b0->current_config_index = vec_elt (cm0->config_index_by_sw_if_index,
+                                             sw_if_index0);
+
+         label0 = clib_net_to_host_u32 (h0->label_exp_s_ttl);
+         /* TTL of the header at the buffer's current position is 0: drop. */
+         if (PREDICT_FALSE(vnet_mpls_uc_get_ttl (label0) == 0))
+           {
+              next0 = MPLS_INPUT_NEXT_DROP;
+              b0->error = node->errors[MPLS_ERROR_TTL_EXPIRED];
+            }
+         else
+            {
+              vnet_get_config_data (&cm0->config_main,
+                                   &b0->current_config_index,
+                                   &next0,
+                                   /* # bytes of config data */ 0);
+              vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
+            }
+          /* For traced buffers, record the chosen next index and the host-order header word. */
+          if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) 
+            {
+              mpls_input_trace_t *tr = vlib_add_trace (vm, node, 
+                                                      b0, sizeof (*tr));
+              tr->next_index = next0;
+              tr->label_host_byte_order = label0;
+            }
+
+         vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+                                          to_next, n_left_to_next,
+                                          bi0, next0);
+       }
+
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+  vlib_node_increment_counter (vm, mpls_input_node.index,
+                               MPLS_ERROR_PKTS_DECAP, from_frame->n_vectors); /* NOTE(review): counts the whole frame, TTL drops included */
+  return from_frame->n_vectors;
+}
+/* Node function registered for "mpls-input"; forwards straight to mpls_input_inline. */
+static uword
+mpls_input (vlib_main_t * vm,
+            vlib_node_runtime_t * node,
+            vlib_frame_t * from_frame)
+{
+  return mpls_input_inline (vm, node, from_frame);
+}
+/* Error strings for the node, generated by expanding each mpls_error(n,s) entry of error.def to s. */
+static char * mpls_error_strings[] = {
+#define mpls_error(n,s) s,
+#include "error.def"
+#undef mpls_error
+};
+/* Graph node registration for "mpls-input": errors from mpls_error_strings, next nodes from foreach_mpls_input_next. */
+VLIB_REGISTER_NODE (mpls_input_node) = {
+  .function = mpls_input,
+  .name = "mpls-input",
+  /* Takes a vector of packets. */
+  .vector_size = sizeof (u32),
+
+  .runtime_data_bytes = sizeof(mpls_input_runtime_t),
+
+  .n_errors = MPLS_N_ERROR,
+  .error_strings = mpls_error_strings,
+
+  .n_next_nodes = MPLS_INPUT_N_NEXT,
+  .next_nodes = {
+#define _(s,n) [MPLS_INPUT_NEXT_##s] = n,
+    foreach_mpls_input_next
+#undef _
+  },
+
+  .format_buffer = format_mpls_unicast_header_net_byte_order,
+  .format_trace = format_mpls_input_trace,
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (mpls_input_node, mpls_input)
+/* One-time setup for mpls-input: packet-generator edit parser, runtime data, and the ethernet MPLS-unicast registration. */
+static void
+mpls_setup_nodes (vlib_main_t * vm)
+{
+  mpls_input_runtime_t * rt;
+  pg_node_t * pn;
+
+  pn = pg_get_node (mpls_input_node.index);
+  pn->unformat_edit = unformat_pg_mpls_header;
+  /* Initialise the node's runtime data. */
+  rt = vlib_node_get_runtime_data (vm, mpls_input_node.index);
+  rt->last_label = (u32) ~0;
+  rt->last_inner_fib_index = 0;
+  rt->last_outer_fib_index = 0;
+  rt->mpls_main = &mpls_main;
+  /* Register mpls-input as the ethernet input node for ETHERNET_TYPE_MPLS_UNICAST. */
+  ethernet_register_input_type (vm, ETHERNET_TYPE_MPLS_UNICAST,
+                                mpls_input_node.index);
+}
+/* Init function: calls mpls_init first, sets up the input node, and returns the result of mpls_feature_init. */
+static clib_error_t * mpls_input_init (vlib_main_t * vm)
+{
+  clib_error_t * error; 
+
+  error = vlib_call_init_function (vm, mpls_init);
+  if (error)
+    clib_error_report (error); /* error is reported, not returned; the code below still runs */
+
+  mpls_setup_nodes (vm);
+
+  return (mpls_feature_init(vm));
+}
+
+VLIB_INIT_FUNCTION (mpls_input_init);
diff --git a/vnet/vnet/mpls/packet.h b/vnet/vnet/mpls/packet.h
new file mode 100644 (file)
index 0000000..bc67445
--- /dev/null
@@ -0,0 +1,125 @@
+#ifndef included_vnet_mpls_packet_h
+#define included_vnet_mpls_packet_h
+
+/*
+ * MPLS packet format
+ *
+ * Copyright (c) 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A label value only, i.e. 20bits.
+ */
+typedef u32 mpls_label_t;
+
+typedef struct {
+    /* Label: top 20 bits [in network byte order] */
+    /* Experimental: 3 bits ... */
+    /* S (bottom of label stack): 1 bit */
+    /* TTL: 8 bits */
+    mpls_label_t label_exp_s_ttl;
+} mpls_unicast_header_t;
+/** Values of the S (bottom of label stack) bit; also used as array designators by MPLS_EOS_BITS. */
+typedef enum mpls_eos_bit_t_
+{
+    MPLS_NON_EOS = 0,
+    MPLS_EOS     = 1,
+} mpls_eos_bit_t;
+
+#define MPLS_EOS_BITS {                                \
+    [MPLS_NON_EOS] = "neos",                   \
+    [MPLS_EOS] = "eos",                                \
+}
+
+#define FOR_EACH_MPLS_EOS_BIT(_eos) \
+    for (_eos = MPLS_NON_EOS; _eos <= MPLS_EOS; _eos++)
+
+#define MPLS_ENTRY_LABEL_OFFSET        0
+#define MPLS_ENTRY_LABEL_SHIFT 12
+#define MPLS_ENTRY_LABEL_MASK  0x000fffff
+#define MPLS_ENTRY_LABEL_BITS  \
+    (MPLS_ENTRY_LABEL_MASK << MPLS_ENTRY_LABEL_SHIFT)
+
+#define MPLS_ENTRY_EXP_OFFSET   2       /* byte offset to EXP bits */
+#define MPLS_ENTRY_EXP_SHIFT   9
+#define MPLS_ENTRY_EXP_MASK    0x07
+#define MPLS_ENTRY_EXP(mpls)   \
+    (((mpls)>>MPLS_ENTRY_EXP_SHIFT) & MPLS_ENTRY_EXP_MASK)
+#define MPLS_ENTRY_EXP_BITS    \
+    (MPLS_ENTRY_EXP_MASK << MPLS_ENTRY_EXP_SHIFT)
+
+#define MPLS_ENTRY_EOS_OFFSET   2       /* byte offset to EOS bit */
+#define MPLS_ENTRY_EOS_SHIFT   8
+#define MPLS_ENTRY_EOS_MASK    0x01    /* EOS bit in its byte */
+#define        MPLS_ENTRY_EOS(mpls)    \
+    (((mpls) >> MPLS_ENTRY_EOS_SHIFT) & MPLS_ENTRY_EOS_MASK)
+#define MPLS_ENTRY_EOS_BIT     (MPLS_ENTRY_EOS_MASK << MPLS_ENTRY_EOS_SHIFT)
+
+#define MPLS_ENTRY_TTL_OFFSET  3  /* byte offset to ttl field */
+#define MPLS_ENTRY_TTL_SHIFT   0
+#define MPLS_ENTRY_TTL_MASK    0xff
+#define MPLS_ENTRY_TTL(mpls)   \
+    (((mpls) >> MPLS_ENTRY_TTL_SHIFT) & MPLS_ENTRY_TTL_MASK)
+#define MPLS_ENTRY_TTL_BITS    \
+    (MPLS_ENTRY_TTL_MASK << MPLS_ENTRY_TTL_SHIFT)
+/** Returns the word shifted right by MPLS_ENTRY_LABEL_SHIFT (12): the 20-bit label when the word is a host-order value. */
+static inline u32 vnet_mpls_uc_get_label (mpls_label_t label_exp_s_ttl)
+{
+    return (label_exp_s_ttl>>MPLS_ENTRY_LABEL_SHIFT);
+}
+/** Returns the 3-bit EXP field: bits 11..9 of the word. */
+static inline u32 vnet_mpls_uc_get_exp (mpls_label_t label_exp_s_ttl)
+{
+    return (MPLS_ENTRY_EXP(label_exp_s_ttl));
+}
+/** Returns the S (end-of-stack) bit: bit 8 of the word. */
+static inline u32 vnet_mpls_uc_get_s (mpls_label_t label_exp_s_ttl)
+{
+    return (MPLS_ENTRY_EOS(label_exp_s_ttl));
+}
+/** Returns the 8-bit TTL field: bits 7..0 of the word. */
+static inline u32 vnet_mpls_uc_get_ttl (mpls_label_t label_exp_s_ttl)
+{
+    return (MPLS_ENTRY_TTL(label_exp_s_ttl));
+}
+/** Replaces bits 31..12 of *label_exp_s_ttl with the low 20 bits of value; other bits are kept. */
+static inline void vnet_mpls_uc_set_label (mpls_label_t *label_exp_s_ttl,
+                                           u32 value)
+{
+    *label_exp_s_ttl = (((*label_exp_s_ttl) & ~(MPLS_ENTRY_LABEL_BITS)) |
+                        ((value  & MPLS_ENTRY_LABEL_MASK) << MPLS_ENTRY_LABEL_SHIFT));
+}
+/** Replaces bits 11..9 of *label_exp_s_ttl with the low 3 bits of exp; other bits are kept. */
+static inline void vnet_mpls_uc_set_exp (mpls_label_t *label_exp_s_ttl,
+                                         u32 exp)
+{
+    *label_exp_s_ttl = (((*label_exp_s_ttl) & ~(MPLS_ENTRY_EXP_BITS)) |
+                        ((exp & MPLS_ENTRY_EXP_MASK) << MPLS_ENTRY_EXP_SHIFT));
+}
+static inline void vnet_mpls_uc_set_s (mpls_label_t *label_exp_s_ttl,
+                                       u32 eos)
+{ /* Replaces bit 8 (the S bit) of *label_exp_s_ttl with the low bit of eos; other bits are kept. */
+    *label_exp_s_ttl = (((*label_exp_s_ttl) & ~(MPLS_ENTRY_EOS_BIT)) |
+                        ((eos & MPLS_ENTRY_EOS_MASK) << MPLS_ENTRY_EOS_SHIFT));
+}
+static inline void vnet_mpls_uc_set_ttl (mpls_label_t *label_exp_s_ttl,
+                                         u32 ttl)
+{ /* Replaces bits 7..0 (TTL) of *label_exp_s_ttl with the low 8 bits of ttl; no shift is applied since MPLS_ENTRY_TTL_SHIFT is 0. */
+    *label_exp_s_ttl = (((*label_exp_s_ttl) & ~(MPLS_ENTRY_TTL_BITS)) |
+                        ((ttl & MPLS_ENTRY_TTL_MASK)));
+}
+
+#endif /* included_vnet_mpls_packet_h */
similarity index 98%
rename from vnet/vnet/mpls-gre/pg.c
rename to vnet/vnet/mpls/pg.c
index 6b6a101..f04b530 100644 (file)
@@ -18,7 +18,7 @@
 #include <vlib/vlib.h>
 #include <vnet/pg/pg.h>
 #include <vnet/gre/gre.h>
-#include <vnet/mpls-gre/mpls.h>
+#include <vnet/mpls/mpls.h>
 
 typedef struct {
   pg_edit_t label;
similarity index 99%
rename from vnet/vnet/mpls-gre/policy_encap.c
rename to vnet/vnet/mpls/policy_encap.c
index 0ea051f..278e8e6 100644 (file)
@@ -17,7 +17,7 @@
 
 #include <vlib/vlib.h>
 #include <vnet/pg/pg.h>
-#include <vnet/mpls-gre/mpls.h>
+#include <vnet/mpls/mpls.h>
 
 typedef struct {
   u32 next_index;
index 9f7e9e8..b66fb74 100644 (file)
@@ -40,6 +40,8 @@
 #include <vnet/vnet.h>
 #include <vnet/pg/pg.h>
 #include <vnet/ethernet/ethernet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/mpls/mpls.h>
 
 /* Mark stream active or inactive. */
 void
@@ -186,6 +188,10 @@ pg_interface_add_or_get (pg_main_t * pg, uword if_id)
       pi->sw_if_index = hi->sw_if_index;
 
       hash_set (pg->if_index_by_if_id, if_id, i);
+
+      ip4_sw_interface_enable_disable (pi->hw_if_index, 1);
+      ip6_sw_interface_enable_disable (pi->hw_if_index, 1);
+      mpls_sw_interface_enable_disable (&mpls_main, pi->hw_if_index, 1);
     }
 
   return i;
index 0dcec40..42d0688 100644 (file)
@@ -70,27 +70,25 @@ format_vnet_rewrite (u8 * s, va_list * args)
   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
   vnet_rewrite_header_t *rw = va_arg (*args, vnet_rewrite_header_t *);
   u32 max_data_bytes = va_arg (*args, u32);
+  CLIB_UNUSED (uword indent) = va_arg (*args, u32);
   vnet_main_t *vnm = vnet_get_main ();
   vlib_node_t *next;
-  uword indent;
 
   next = vlib_get_next_node (vm, rw->node_index, rw->next_index);
 
-  indent = format_get_indent (s);
-
   if (rw->sw_if_index != ~0)
     {
       vnet_sw_interface_t *si;
       si = vnet_get_sw_interface (vnm, rw->sw_if_index);
-      s = format (s, "%U", format_vnet_sw_interface_name, vnm, si);
+      s = format (s, "%U", format_vnet_sw_interface_name, vnm, si);
     }
   else
-    s = format (s, "%v", next->name);
+    s = format (s, "%v", next->name);
 
   /* Format rewrite string. */
   if (rw->data_bytes > 0)
-    s = format (s, "\n%U%U",
-               format_white_space, indent,
+
+    s = format (s, "%U",
                next->format_buffer ? next->format_buffer : format_hex_bytes,
                rw->data + max_data_bytes - rw->data_bytes, rw->data_bytes);
 
index 9c2d591..086cbe9 100644 (file)
  */
 #include <vnet/vnet.h>
 #include <vnet/sr/sr.h>
+#include <vnet/fib/ip6_fib.h>
+#include <vnet/dpo/dpo.h>
 
 #include <openssl/hmac.h>
 
 ip6_sr_main_t sr_main;
 static vlib_node_registration_t sr_local_node;
 
+/**
+ * @brief Dynamically added SR DPO type
+ */
+static dpo_type_t sr_dpo_type;
+
 /**
  * @brief Use passed HMAC key in ip6_sr_header_t in OpenSSL HMAC routines
  *
@@ -319,16 +326,12 @@ format_sr_rewrite_trace (u8 * s, va_list * args)
   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
   sr_rewrite_trace_t *t = va_arg (*args, sr_rewrite_trace_t *);
-  ip6_main_t *im = &ip6_main;
   ip6_sr_main_t *sm = &sr_main;
   ip6_sr_tunnel_t *tun = pool_elt_at_index (sm->tunnels, t->tunnel_index);
   ip6_fib_t *rx_fib, *tx_fib;
 
-  rx_fib = find_ip6_fib_by_table_index_or_id (im, tun->rx_fib_index,
-                                             IP6_ROUTE_FLAG_FIB_INDEX);
-
-  tx_fib = find_ip6_fib_by_table_index_or_id (im, tun->tx_fib_index,
-                                             IP6_ROUTE_FLAG_FIB_INDEX);
+  rx_fib = ip6_fib_get (tun->rx_fib_index);
+  tx_fib = ip6_fib_get (tun->tx_fib_index);
 
   s = format
     (s, "SR-REWRITE: next %s ip6 src %U dst %U len %u\n"
@@ -733,38 +736,18 @@ VLIB_NODE_FUNCTION_MULTIARCH (sr_rewrite_node, sr_rewrite)
                                              u32 dst_address_length,
                                              u32 rx_table_id)
 {
-  ip6_add_del_route_args_t a;
-  ip6_address_t dst_address;
-  ip6_fib_t *fib;
-  ip6_main_t *im6 = &ip6_main;
-  BVT (clib_bihash_kv) kv, value;
-
-  fib = find_ip6_fib_by_table_index_or_id (im6, rx_table_id,
-                                          IP6_ROUTE_FLAG_TABLE_ID);
-  memset (&a, 0, sizeof (a));
-  a.flags |= IP4_ROUTE_FLAG_DEL;
-  a.dst_address_length = dst_address_length;
-
-  dst_address = *dst_address_arg;
-
-  ip6_address_mask (&dst_address, &im6->fib_masks[dst_address_length]);
-
-  kv.key[0] = dst_address.as_u64[0];
-  kv.key[1] = dst_address.as_u64[1];
-  kv.key[2] = ((u64) ((fib - im6->fibs)) << 32) | dst_address_length;
-
-  if (BV (clib_bihash_search) (&im6->ip6_lookup_table, &kv, &value) < 0)
-    {
-      clib_warning ("%U/%d not in FIB",
-                   format_ip6_address, &a.dst_address, a.dst_address_length);
-      return -10;
-    }
+  fib_prefix_t pfx = {
+    .fp_len = dst_address_length,
+    .fp_proto = FIB_PROTOCOL_IP6,
+    .fp_addr = {
+               .ip6 = *dst_address_arg,
+               }
+  };
 
-  a.adj_index = value.value;
-  a.dst_address = dst_address;
+  fib_table_entry_delete (fib_table_id_find_fib_index (FIB_PROTOCOL_IP6,
+                                                      rx_table_id),
+                         &pfx, FIB_SOURCE_SR);
 
-  ip6_add_del_route (im6, &a);
-  ip6_maybe_remap_adjacencies (im6, rx_table_id, IP6_ROUTE_FLAG_TABLE_ID);
   return 0;
 }
 
@@ -837,23 +820,20 @@ int
 ip6_sr_add_del_tunnel (ip6_sr_add_del_tunnel_args_t * a)
 {
   ip6_main_t *im = &ip6_main;
-  ip_lookup_main_t *lm = &im->lookup_main;
   ip6_sr_tunnel_key_t key;
   ip6_sr_tunnel_t *t;
   uword *p, *n;
   ip6_sr_header_t *h = 0;
   u32 header_length;
   ip6_address_t *addrp, *this_address;
-  ip_adjacency_t adj, *ap, *add_adj = 0;
-  u32 adj_index;
   ip6_sr_main_t *sm = &sr_main;
   u8 *key_copy;
   u32 rx_fib_index, tx_fib_index;
-  ip6_add_del_route_args_t aa;
   u32 hmac_key_index_u32;
   u8 hmac_key_index = 0;
   ip6_sr_policy_t *pt;
   int i;
+  dpo_id_t dpo = DPO_NULL;
 
   /* Make sure that the rx FIB exists */
   p = hash_get (im->fib_index_by_table_id, a->rx_table_id);
@@ -1057,15 +1037,6 @@ ip6_sr_add_del_tunnel (ip6_sr_add_del_tunnel_args_t * a)
   clib_memcpy (key_copy, &key, sizeof (ip6_sr_tunnel_key_t));
   hash_set_mem (sm->tunnel_index_by_key, key_copy, t - sm->tunnels);
 
-  memset (&adj, 0, sizeof (adj));
-
-  /* Create an adjacency and add to v6 fib */
-  adj.lookup_next_index = sm->ip6_lookup_sr_next_index;
-  adj.explicit_fib_index = ~0;
-
-  ap = ip_add_adjacency (lm, &adj, 1 /* one adj */ ,
-                        &adj_index);
-
   /*
    * Stick the tunnel index into the rewrite header.
    *
@@ -1077,22 +1048,20 @@ ip6_sr_add_del_tunnel (ip6_sr_add_del_tunnel_args_t * a)
    * We don't handle ugly RFC-related cases yet, but I'm sure PL will complain
    * at some point...
    */
-  ap->rewrite_header.sw_if_index = t - sm->tunnels;
-
-  vec_add1 (add_adj, ap[0]);
-
-  clib_memcpy (aa.dst_address.as_u8, a->dst_address,
-              sizeof (aa.dst_address.as_u8));
-  aa.dst_address_length = a->dst_mask_width;
+  dpo_set (&dpo, sr_dpo_type, DPO_PROTO_IP6, t - sm->tunnels);
 
-  aa.flags = (a->is_del ? IP6_ROUTE_FLAG_DEL : IP6_ROUTE_FLAG_ADD);
-  aa.flags |= IP6_ROUTE_FLAG_FIB_INDEX;
-  aa.table_index_or_table_id = rx_fib_index;
-  aa.add_adj = add_adj;
-  aa.adj_index = adj_index;
-  aa.n_add_adj = 1;
-  ip6_add_del_route (im, &aa);
-  vec_free (add_adj);
+  fib_prefix_t pfx = {
+    .fp_proto = FIB_PROTOCOL_IP6,
+    .fp_len = a->dst_mask_width,
+    .fp_addr = {
+               .ip6 = *a->dst_address,
+               }
+  };
+  fib_table_entry_special_dpo_add (rx_fib_index,
+                                  &pfx,
+                                  FIB_SOURCE_SR,
+                                  FIB_ENTRY_FLAG_EXCLUSIVE, &dpo);
+  dpo_reset (&dpo);
 
   if (a->policy_name)
     {
@@ -1125,6 +1094,48 @@ ip6_sr_add_del_tunnel (ip6_sr_add_del_tunnel_args_t * a)
   return 0;
 }
 
+/**
+ * @brief no-op lock function.
+ * The lifetime of the SR entry is managed by the control plane
+ */
+static void
+sr_dpo_lock (dpo_id_t * dpo)
+{
+}
+
+/**
+ * @brief no-op unlock function.
+ * The lifetime of the SR entry is managed by the control plane
+ */
+static void
+sr_dpo_unlock (dpo_id_t * dpo)
+{
+}
+/** @brief Format an SR DPO as "SR: tunnel:[<index>]"; the indent argument is consumed but unused. */
+u8 *
+format_sr_dpo (u8 * s, va_list * args)
+{
+  index_t index = va_arg (*args, index_t);
+  CLIB_UNUSED (u32 indent) = va_arg (*args, u32);
+
+  return (format (s, "SR: tunnel:[%d]", index));
+}
+/** @brief Function table for the SR DPO type: no-op lock/unlock plus format_sr_dpo. */
+const static dpo_vft_t sr_vft = {
+  .dv_lock = sr_dpo_lock,
+  .dv_unlock = sr_dpo_unlock,
+  .dv_format = format_sr_dpo,
+};
+/** @brief NULL-terminated list of graph node names used for the IP6 slot of sr_nodes. */
+const static char *const sr_ip6_nodes[] = {
+  "sr-rewrite",
+  NULL,
+};
+/** @brief Node lists indexed by DPO protocol, passed to dpo_register_new_type(); only the IP6 slot is set. */
+const static char *const *const sr_nodes[DPO_PROTO_NUM] = {
+  [DPO_PROTO_IP6] = sr_ip6_nodes,
+};
+
 /**
  * @brief CLI parser for Add or Delete a Segment Routing tunnel.
  *
@@ -1315,16 +1326,12 @@ VLIB_CLI_COMMAND (sr_tunnel_command, static) = {
 void
 ip6_sr_tunnel_display (vlib_main_t * vm, ip6_sr_tunnel_t * t)
 {
-  ip6_main_t *im = &ip6_main;
   ip6_sr_main_t *sm = &sr_main;
   ip6_fib_t *rx_fib, *tx_fib;
   ip6_sr_policy_t *pt;
 
-  rx_fib = find_ip6_fib_by_table_index_or_id (im, t->rx_fib_index,
-                                             IP6_ROUTE_FLAG_FIB_INDEX);
-
-  tx_fib = find_ip6_fib_by_table_index_or_id (im, t->tx_fib_index,
-                                             IP6_ROUTE_FLAG_FIB_INDEX);
+  rx_fib = ip6_fib_get (t->rx_fib_index);
+  tx_fib = ip6_fib_get (t->tx_fib_index);
 
   if (t->name)
     vlib_cli_output (vm, "sr tunnel name: %s", (char *) t->name);
@@ -1678,13 +1685,8 @@ int
 ip6_sr_add_del_multicastmap (ip6_sr_add_del_multicastmap_args_t * a)
 {
   uword *p;
-  ip6_main_t *im = &ip6_main;
-  ip_lookup_main_t *lm = &im->lookup_main;
   ip6_sr_tunnel_t *t;
-  ip_adjacency_t adj, *ap, *add_adj = 0;
-  u32 adj_index;
   ip6_sr_main_t *sm = &sr_main;
-  ip6_add_del_route_args_t aa;
   ip6_sr_policy_t *pt;
 
   if (a->is_del)
@@ -1714,16 +1716,6 @@ ip6_sr_add_del_multicastmap (ip6_sr_add_del_multicastmap_args_t * a)
 
   t = pool_elt_at_index (sm->tunnels, pt->tunnel_indices[0]);
 
-  /* Construct a FIB entry for multicast using the rx/tx fib from the first tunnel */
-  memset (&adj, 0, sizeof (adj));
-
-  /* Create an adjacency and add to v6 fib */
-  adj.lookup_next_index = sm->ip6_lookup_sr_replicate_index;
-  adj.explicit_fib_index = ~0;
-
-  ap = ip_add_adjacency (lm, &adj, 1 /* one adj */ ,
-                        &adj_index);
-
   /*
    * Stick the tunnel index into the rewrite header.
    *
@@ -1735,22 +1727,23 @@ ip6_sr_add_del_multicastmap (ip6_sr_add_del_multicastmap_args_t * a)
    * We don't handle ugly RFC-related cases yet, but I'm sure PL will complain
    * at some point...
    */
-  ap->rewrite_header.sw_if_index = t - sm->tunnels;
-
-  vec_add1 (add_adj, ap[0]);
+  dpo_id_t dpo = DPO_NULL;
 
-  memcpy (aa.dst_address.as_u8, a->multicast_address,
-         sizeof (aa.dst_address.as_u8));
-  aa.dst_address_length = 128;
+  dpo_set (&dpo, sr_dpo_type, DPO_PROTO_IP6, t - sm->tunnels);
 
-  aa.flags = (a->is_del ? IP6_ROUTE_FLAG_DEL : IP6_ROUTE_FLAG_ADD);
-  aa.flags |= IP6_ROUTE_FLAG_FIB_INDEX;
-  aa.table_index_or_table_id = t->rx_fib_index;
-  aa.add_adj = add_adj;
-  aa.adj_index = adj_index;
-  aa.n_add_adj = 1;
-  ip6_add_del_route (im, &aa);
-  vec_free (add_adj);
+  /* Construct a FIB entry for multicast using the rx/tx fib from the first tunnel */
+  fib_prefix_t pfx = {
+    .fp_proto = FIB_PROTOCOL_IP6,
+    .fp_len = 128,
+    .fp_addr = {
+               .ip6 = *a->multicast_address,
+               }
+  };
+  fib_table_entry_special_dpo_add (t->rx_fib_index,
+                                  &pfx,
+                                  FIB_SOURCE_SR,
+                                  FIB_ENTRY_FLAG_EXCLUSIVE, &dpo);
+  dpo_reset (&dpo);
 
   u8 *mcast_copy = 0;
   mcast_copy = vec_new (ip6_address_t, 1);
@@ -2224,10 +2217,6 @@ VLIB_NODE_FUNCTION_MULTIARCH (sr_fix_dst_addr_node, sr_fix_dst_addr)
   ip6_rewrite_node = vlib_get_node_by_name (vm, (u8 *) "ip6-rewrite");
   ASSERT (ip6_rewrite_node);
 
-  /* Add a disposition to ip6_lookup for the sr rewrite node */
-  sm->ip6_lookup_sr_next_index =
-    vlib_node_add_next (vm, ip6_lookup_node->index, sr_rewrite_node.index);
-
 #if DPDK > 0                   /* Cannot run replicate without DPDK */
   /* Add a disposition to sr_replicate for the sr multicast replicate node */
   sm->ip6_lookup_sr_replicate_index =
@@ -2244,6 +2233,8 @@ VLIB_NODE_FUNCTION_MULTIARCH (sr_fix_dst_addr_node, sr_fix_dst_addr)
   sm->md = (void *) EVP_get_digestbyname ("sha1");
   sm->hmac_ctx = clib_mem_alloc (sizeof (HMAC_CTX));
 
+  sr_dpo_type = dpo_register_new_type (&sr_vft, sr_nodes);
+
   return error;
 }
 
@@ -2884,41 +2875,48 @@ static clib_error_t *
 set_ip6_sr_rewrite_fn (vlib_main_t * vm,
                       unformat_input_t * input, vlib_cli_command_t * cmd)
 {
-  ip6_address_t a;
-  ip6_main_t *im = &ip6_main;
-  ip_lookup_main_t *lm = &im->lookup_main;
+  fib_prefix_t pfx = {
+    .fp_proto = FIB_PROTOCOL_IP6,
+    .fp_len = 128,
+  };
   u32 fib_index = 0;
   u32 fib_id = 0;
   u32 adj_index;
-  uword *p;
   ip_adjacency_t *adj;
   vnet_hw_interface_t *hi;
   u32 sw_if_index;
   ip6_sr_main_t *sm = &sr_main;
   vnet_main_t *vnm = vnet_get_main ();
+  fib_node_index_t fei;
 
-  if (!unformat (input, "%U", unformat_ip6_address, &a))
+  if (!unformat (input, "%U", unformat_ip6_address, &pfx.fp_addr.ip6))
     return clib_error_return (0, "ip6 address missing in '%U'",
                              format_unformat_error, input);
 
   if (unformat (input, "rx-table-id %d", &fib_id))
     {
-      p = hash_get (im->fib_index_by_table_id, fib_id);
-      if (p == 0)
-       return clib_error_return (0, "fib-id %d not found");
-      fib_index = p[0];
+      fib_index = fib_table_id_find_fib_index (FIB_PROTOCOL_IP6, fib_id);
+      if (fib_index == ~0)
+       return clib_error_return (0, "fib-id %d not found", fib_id);
     }
 
-  adj_index = ip6_fib_lookup_with_table (im, fib_index, &a);
+  fei = fib_table_lookup_exact_match (fib_index, &pfx);
+
+  if (FIB_NODE_INDEX_INVALID == fei)
+    return clib_error_return (0, "no match for %U",
+                             format_ip6_address, &pfx.fp_addr.ip6);
+
+  adj_index = fib_entry_get_adj_for_source (fei, FIB_SOURCE_SR);
 
-  if (adj_index == lm->miss_adj_index)
-    return clib_error_return (0, "no match for %U", format_ip6_address, &a);
+  if (ADJ_INDEX_INVALID == adj_index)
+    return clib_error_return (0, "%U not SR sourced",
+                             format_ip6_address, &pfx.fp_addr.ip6);
 
-  adj = ip_get_adjacency (lm, adj_index);
+  adj = adj_get (adj_index);
 
   if (adj->lookup_next_index != IP_LOOKUP_NEXT_REWRITE)
     return clib_error_return (0, "%U unresolved (not a rewrite adj)",
-                             format_ip6_address, &a);
+                             format_ip6_address, &pfx.fp_addr.ip6);
 
   adj->rewrite_header.next_index = sm->ip6_rewrite_sr_next_index;
 
index bd8fa8e..610b369 100644 (file)
@@ -199,9 +199,6 @@ typedef struct
   /** multicast address to policy mapping */
   uword *policy_index_by_multicast_address;
 
-  /** ip6-lookup next index for imposition FIB entries */
-  u32 ip6_lookup_sr_next_index;
-
   /** hmac key id by shared secret */
   uword *hmac_key_by_shared_secret;
 
index 37c3944..5fd9ef0 100644 (file)
@@ -32,6 +32,7 @@
 #include <vnet/devices/dpdk/dpdk.h>
 #include <vnet/dpdk_replication.h>
 #include <vnet/ip/ip.h>
+#include <vnet/fib/ip6_fib.h>
 
 #include <vppinfra/hash.h>
 #include <vppinfra/error.h>
@@ -76,16 +77,12 @@ format_sr_replicate_trace (u8 * s, va_list * args)
   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
   sr_replicate_trace_t *t = va_arg (*args, sr_replicate_trace_t *);
-  ip6_main_t *im = &ip6_main;
   ip6_sr_main_t *sm = &sr_main;
   ip6_sr_tunnel_t *tun = pool_elt_at_index (sm->tunnels, t->tunnel_index);
   ip6_fib_t *rx_fib, *tx_fib;
 
-  rx_fib = find_ip6_fib_by_table_index_or_id (im, tun->rx_fib_index,
-                                             IP6_ROUTE_FLAG_FIB_INDEX);
-
-  tx_fib = find_ip6_fib_by_table_index_or_id (im, tun->tx_fib_index,
-                                             IP6_ROUTE_FLAG_FIB_INDEX);
+  rx_fib = ip6_fib_get (tun->rx_fib_index);
+  tx_fib = ip6_fib_get (tun->tx_fib_index);
 
   s = format
     (s, "SR-REPLICATE: next %s ip6 src %U dst %U len %u\n"
index a2b8978..fae481c 100644 (file)
@@ -18,6 +18,7 @@
  *
 */
 #include <vnet/vxlan-gpe/vxlan_gpe.h>
+#include <vnet/fib/fib.h>
 #include <vnet/ip/format.h>
 
 vxlan_gpe_main_t vxlan_gpe_main;
@@ -419,56 +420,6 @@ int vnet_vxlan_gpe_add_del_tunnel
   return 0;
 }
 
-/**
- * @brief Find the IPv4 FIB index from the FIB ID
- *
- * @param fib_id
- *
- * @return fib_index
- *
- */
-static u32 fib4_index_from_fib_id (u32 fib_id)
-{
-  ip4_main_t * im = &ip4_main;
-  uword * p;
-
-  p = hash_get (im->fib_index_by_table_id, fib_id);
-  if (!p)
-    return ~0;
-
-  return p[0];
-}
-
-/**
- * @brief Find the IPv4 FIB index from the FIB ID
- *
- * @param fib_id
- *
- * @return fib_index
- *
- */
-static u32 fib6_index_from_fib_id (u32 fib_id)
-{
-  ip6_main_t * im = &ip6_main;
-  uword * p;
-
-  p = hash_get (im->fib_index_by_table_id, fib_id);
-  if (!p)
-    return ~0;
-
-  return p[0];
-}
-
-/**
- * @brief CLI function for Add/Del of IPv4/IPv6 VXLAN GPE tunnel
- *
- * @param *vm
- * @param *input
- * @param *cmd
- *
- * @return error
- *
- */
 static clib_error_t *
 vxlan_gpe_add_del_tunnel_command_fn (vlib_main_t * vm,
                                    unformat_input_t * input,
@@ -525,20 +476,19 @@ vxlan_gpe_add_del_tunnel_command_fn (vlib_main_t * vm,
     else if (unformat (line_input, "encap-vrf-id %d", &tmp))
       {
         if (ipv6_set)
-          encap_fib_index = fib6_index_from_fib_id (tmp);
+          encap_fib_index = ip6_fib_index_from_table_id (tmp);
         else
-          encap_fib_index = fib4_index_from_fib_id (tmp);
+          encap_fib_index =  ip4_fib_index_from_table_id (tmp);
 
         if (encap_fib_index == ~0)
           return clib_error_return (0, "nonexistent encap fib id %d", tmp);
       }
     else if (unformat (line_input, "decap-vrf-id %d", &tmp))
       {
-
         if (ipv6_set)
-          decap_fib_index = fib6_index_from_fib_id (tmp);
+          decap_fib_index = ip6_fib_index_from_table_id (tmp);
         else
-          decap_fib_index = fib4_index_from_fib_id (tmp);
+          decap_fib_index = ip4_fib_index_from_table_id (tmp);
 
         if (decap_fib_index == ~0)
           return clib_error_return (0, "nonexistent decap fib id %d", tmp);
index 32ad753..da359a8 100644 (file)
@@ -348,11 +348,13 @@ int vnet_vxlan_add_del_tunnel
       vnet_sw_interface_set_flags (vnm, sw_if_index, 
                                    VNET_SW_INTERFACE_FLAG_ADMIN_UP);
       if (!a->is_ip6) {
-      vec_validate (im4->fib_index_by_sw_if_index, sw_if_index);
-      im4->fib_index_by_sw_if_index[sw_if_index] = t->encap_fib_index;
+        vec_validate (im4->fib_index_by_sw_if_index, sw_if_index);
+        im4->fib_index_by_sw_if_index[sw_if_index] = t->encap_fib_index;
+        ip4_sw_interface_enable_disable(sw_if_index, 1);
       } else {
         vec_validate (im6->fib_index_by_sw_if_index, sw_if_index);
         im6->fib_index_by_sw_if_index[sw_if_index] = t->encap_fib_index;
+        ip6_sw_interface_enable_disable(sw_if_index, 1);
       }
     }
   else
@@ -375,13 +377,16 @@ int vnet_vxlan_add_del_tunnel
         = L2OUTPUT_NEXT_DEL_TUNNEL;
 
       if (!a->is_ip6)
-        hash_unset (vxm->vxlan4_tunnel_by_key, key4.as_u64);
+        {
+          hash_unset (vxm->vxlan4_tunnel_by_key, key4.as_u64);
+          ip4_sw_interface_enable_disable(sw_if_index, 1);
+       }
       else
         {
          hash_unset_mem (vxm->vxlan6_tunnel_by_key, t->key6);
          clib_mem_free (t->key6);
+          ip6_sw_interface_enable_disable(sw_if_index, 1);
        }
-
       vec_free (t->rewrite);
       pool_put (vxm->tunnels, t);
     }
index 7b5b434..f2fb6ff 100644 (file)
@@ -35,7 +35,7 @@
 #include <vnet/l2/l2_vtr.h>
 #include <vnet/classify/input_acl.h>
 #include <vnet/classify/policer_classify.h>
-#include <vnet/mpls-gre/mpls.h>
+#include <vnet/mpls/mpls.h>
 #if DPDK > 0
 #include <vnet/ipsec/ipsec.h>
 #include <vnet/ipsec/ikev2.h>
@@ -5346,6 +5346,7 @@ api_ip_add_del_route (vat_main_t * vam)
   u32 random_seed = 0xdeaddabe;
   u32 classify_table_index = ~0;
   u8 is_classify = 0;
+  u8 resolve_host, resolve_attached;
 
   /* Parse args required to build the message */
   while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT)
@@ -5401,6 +5402,10 @@ api_ip_add_del_route (vat_main_t * vam)
        is_add = 1;
       else if (unformat (i, "not-last"))
        not_last = 1;
+      else if (unformat (i, "resolve-via-host"))
+       resolve_host = 1;
+      else if (unformat (i, "resolve-via-attached"))
+       resolve_attached = 1;
       else if (unformat (i, "multipath"))
        is_multipath = 1;
       else if (unformat (i, "vrf %d", &vrf_id))
@@ -5497,6 +5502,8 @@ api_ip_add_del_route (vat_main_t * vam)
       mp->is_local = is_local;
       mp->is_classify = is_classify;
       mp->is_multipath = is_multipath;
+      mp->is_resolve_host = resolve_host;
+      mp->is_resolve_attached = resolve_attached;
       mp->not_last = not_last;
       mp->next_hop_weight = next_hop_weight;
       mp->dst_address_length = dst_address_length;
@@ -7860,7 +7867,6 @@ out:
 }
 
 #define foreach_ip_next                         \
-_(miss, MISS)                                   \
 _(drop, DROP)                                   \
 _(local, LOCAL)                                 \
 _(rewrite, REWRITE)
@@ -12571,7 +12577,7 @@ api_lisp_add_del_remote_mapping (vat_main_t * vam)
        {
          is_add = 1;
        }
-      else if (unformat (input, "eid %U", unformat_lisp_eid_vat, eid))
+      else if (unformat (input, "deid %U", unformat_lisp_eid_vat, eid))
        {
          eid_set = 1;
        }
index b9f7968..3b24c26 100644 (file)
@@ -14,6 +14,8 @@
  */
 #include <vnet/ip/ip.h>
 #include <vnet/ethernet/ethernet.h>
+#include <vnet/adj/adj.h>
+#include <vnet/fib/fib_table.h>
 
 typedef struct
 {
@@ -27,20 +29,25 @@ virtual_ip_cmd_fn_command_fn (vlib_main_t * vm,
 {
   unformat_input_t _line_input, *line_input = &_line_input;
   vnet_main_t *vnm = vnet_get_main ();
-  ip4_main_t *im = &ip4_main;
-  ip_lookup_main_t *lm = &im->lookup_main;
-  ip4_address_t ip_addr, next_hop;
+  ip46_address_t next_hop, *next_hops;
+  fib_route_path_t *rpaths;
+  fib_prefix_t prefix;
   u8 mac_addr[6];
   mac_addr_t *mac_addrs = 0;
   u32 sw_if_index;
-  u32 i, f;
+  u32 i;
+
+  next_hops = NULL;
+  rpaths = NULL;
+  prefix.fp_len = 32;
+  prefix.fp_proto = FIB_PROTOCOL_IP4;
 
   /* Get a line of input. */
   if (!unformat_user (input, unformat_line_input, line_input))
     return 0;
 
   if (!unformat (line_input, "%U %U",
-                unformat_ip4_address, &ip_addr,
+                unformat_ip4_address, &prefix.fp_addr.ip4,
                 unformat_vnet_sw_interface, vnm, &sw_if_index))
     goto barf;
 
@@ -53,6 +60,11 @@ virtual_ip_cmd_fn_command_fn (vlib_main_t * vm,
          vec_add2 (mac_addrs, ma, 1);
          clib_memcpy (ma, mac_addr, sizeof (mac_addr));
        }
+      else if (unformat (line_input, "next-hop %U",
+                        unformat_ip4_address, &next_hop.ip4))
+       {
+         vec_add1 (next_hops, next_hop);
+       }
       else
        {
        barf:
@@ -60,37 +72,37 @@ virtual_ip_cmd_fn_command_fn (vlib_main_t * vm,
                                    format_unformat_error, input);
        }
     }
-  if (vec_len (mac_addrs) == 0)
+  if (vec_len (mac_addrs) == 0 ||
+      vec_len (next_hops) == 0 || vec_len (mac_addrs) != vec_len (next_hops))
     goto barf;
 
   /* Create / delete special interface route /32's */
-  next_hop.as_u32 = 0;
 
   for (i = 0; i < vec_len (mac_addrs); i++)
     {
-      ip_adjacency_t adj;
-      u32 adj_index;
-
-      memset (&adj, 0, sizeof (adj));
-      adj.lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
-
-      vnet_rewrite_for_sw_interface (vnm, VNET_L3_PACKET_TYPE_IP4, sw_if_index, ip4_rewrite_node.index, &mac_addrs[i], /* destination address */
-                                    &adj.rewrite_header,
-                                    sizeof (adj.rewrite_data));
-
-      ip_add_adjacency (lm, &adj, 1 /* one adj */ ,
-                       &adj_index);
-
-      f =
-       (i + 1 < vec_len (mac_addrs)) ? IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP : 0;
-      ip4_add_del_route_next_hop (im, IP4_ROUTE_FLAG_ADD | f, &ip_addr,
-                                 32 /* insert /32's */ ,
-                                 &next_hop, sw_if_index, 1 /* weight */ ,
-                                 adj_index,
-                                 (u32) ~ 0 /* explicit fib index */ );
+      fib_route_path_t *rpath;
+
+      adj_nbr_add_or_lock_w_rewrite (FIB_PROTOCOL_IP4,
+                                    FIB_LINK_IP4,
+                                    &next_hops[i],
+                                    sw_if_index, mac_addrs[i].mac_addr);
+
+      vec_add2 (rpaths, rpath, 1);
+
+      rpath->frp_proto = FIB_PROTOCOL_IP4;
+      rpath->frp_addr = next_hops[i];
+      rpath->frp_sw_if_index = sw_if_index;
+      rpath->frp_fib_index = ~0;
+      rpath->frp_weight = 1;
+      rpath->frp_label = MPLS_LABEL_INVALID;
     }
 
+  fib_table_entry_path_add2 (0,        // default FIB table
+                            &prefix,
+                            FIB_SOURCE_CLI, FIB_ENTRY_FLAG_NONE, rpaths);
+
   vec_free (mac_addrs);
+  vec_free (next_hops);
 
   return 0;
 }
index 823e064..ee9a6a6 100644 (file)
@@ -15,6 +15,9 @@
 #include <stats/stats.h>
 #include <signal.h>
 #include <vlib/threads.h>
+#include <vnet/fib/fib_entry.h>
+#include <vnet/fib/fib_table.h>
+#include <vnet/dpo/load_balance.h>
 
 #define STATS_DEBUG 0
 
@@ -250,7 +253,7 @@ do_ip4_fibs (stats_main_t * sm)
   unix_shared_memory_queue_t *q = shmem_hdr->vl_input_queue;
   static ip4_route_t *routes;
   ip4_route_t *r;
-  ip4_fib_t *fib;
+  fib_table_t *fib;
   ip_lookup_main_t *lm = &im4->lookup_main;
   static uword *results;
   vl_api_vnet_ip4_fib_counters_t *mp = 0;
@@ -260,8 +263,9 @@ do_ip4_fibs (stats_main_t * sm)
   int i;
 
 again:
-  vec_foreach (fib, im4->fibs)
-  {
+  /* *INDENT-OFF* */
+  pool_foreach (fib, im4->fibs,
+  ({
     /* We may have bailed out due to control-plane activity */
     while ((fib - im4->fibs) < start_at_fib_index)
       continue;
@@ -274,14 +278,14 @@ again:
           items_this_message * sizeof (vl_api_ip4_fib_counter_t));
        mp->_vl_msg_id = ntohs (VL_API_VNET_IP4_FIB_COUNTERS);
        mp->count = 0;
-       mp->vrf_id = ntohl (fib->table_id);
+       mp->vrf_id = ntohl (fib->ft_table_id);
        ctrp = (vl_api_ip4_fib_counter_t *) mp->c;
       }
     else
       {
        /* happens if the last FIB was empty... */
        ASSERT (mp->count == 0);
-       mp->vrf_id = ntohl (fib->table_id);
+       mp->vrf_id = ntohl (fib->ft_table_id);
       }
 
     dslock (sm, 0 /* release hint */ , 1 /* tag */ );
@@ -289,15 +293,14 @@ again:
     vec_reset_length (routes);
     vec_reset_length (results);
 
-    for (i = 0; i < ARRAY_LEN (fib->adj_index_by_dst_address); i++)
+    for (i = 0; i < ARRAY_LEN (fib->v4.fib_entry_by_dst_address); i++)
       {
-       uword *hash = fib->adj_index_by_dst_address[i];
+       uword *hash = fib->v4.fib_entry_by_dst_address[i];
        hash_pair_t *p;
        ip4_route_t x;
 
        x.address_length = i;
 
-        /* *INDENT-OFF* */
         hash_foreach_pair (p, hash,
         ({
           x.address.data_u32 = p->key;
@@ -321,114 +324,71 @@ again:
               goto again;
             }
         }));
-        /* *INDENT-ON* */
       }
 
     vec_foreach (r, routes)
-    {
-      vlib_counter_t c, sum;
-      uword i, j, n_left, n_nhs, adj_index, *result = 0;
-      ip_adjacency_t *adj;
-      ip_multipath_next_hop_t *nhs, tmp_nhs[1];
-
-      adj_index = r->index;
-      if (lm->fib_result_n_words > 1)
-       {
-         result = vec_elt_at_index (results, adj_index);
-         adj_index = result[0];
-       }
-
-      adj = ip_get_adjacency (lm, adj_index);
-      if (adj->n_adj == 1)
-       {
-         nhs = &tmp_nhs[0];
-         nhs[0].next_hop_adj_index = ~0;       /* not used */
-         nhs[0].weight = 1;
-         n_nhs = 1;
-       }
-      else
-       {
-         ip_multipath_adjacency_t *madj;
-         madj = vec_elt_at_index (lm->multipath_adjacencies,
-                                  adj->heap_handle);
-         nhs = heap_elt_at_index
-           (lm->next_hop_heap, madj->normalized_next_hops.heap_offset);
-         n_nhs = madj->normalized_next_hops.count;
-       }
+      {
+        vlib_counter_t c;
+
+        vlib_get_combined_counter (&load_balance_main.lbm_to_counters,
+                                   r->index, &c);
+        /*
+         * If it has actually
+         * seen at least one packet, send it.
+         */
+        if (c.packets > 0)
+          {
 
-      n_left = nhs[0].weight;
-      vlib_counter_zero (&sum);
-      for (i = j = 0; i < adj->n_adj; i++)
-       {
-         n_left -= 1;
-         vlib_get_combined_counter (&lm->adjacency_counters,
-                                    adj_index + i, &c);
-         vlib_counter_add (&sum, &c);
-         /*
-          * If we're done with this adj and it has actually
-          * seen at least one packet, send it.
-          */
-         if (n_left == 0 && sum.packets > 0)
-           {
-
-             /* already in net byte order */
-             ctrp->address = r->address.as_u32;
-             ctrp->address_length = r->address_length;
-             ctrp->packets = clib_host_to_net_u64 (sum.packets);
-             ctrp->bytes = clib_host_to_net_u64 (sum.bytes);
-             mp->count++;
-             ctrp++;
-
-             if (mp->count == items_this_message)
-               {
-                 mp->count = htonl (items_this_message);
-                 /*
-                  * If the main thread's input queue is stuffed,
-                  * drop the data structure lock (which the main thread
-                  * may want), and take a pause.
-                  */
-                 unix_shared_memory_queue_lock (q);
-                 if (unix_shared_memory_queue_is_full (q))
-                   {
-                     dsunlock (sm);
-                     vl_msg_api_send_shmem_nolock (q, (u8 *) & mp);
-                     unix_shared_memory_queue_unlock (q);
-                     mp = 0;
-                     ip46_fib_stats_delay (sm, 0 /* sec */ ,
-                                           STATS_RELEASE_DELAY_NS);
-                     goto again;
-                   }
-                 vl_msg_api_send_shmem_nolock (q, (u8 *) & mp);
-                 unix_shared_memory_queue_unlock (q);
-
-                 items_this_message = IP4_FIB_COUNTER_BATCH_SIZE;
-                 mp = vl_msg_api_alloc_as_if_client
-                   (sizeof (*mp) +
-                    items_this_message * sizeof (vl_api_ip4_fib_counter_t));
-                 mp->_vl_msg_id = ntohs (VL_API_VNET_IP4_FIB_COUNTERS);
-                 mp->count = 0;
-                 mp->vrf_id = ntohl (fib->table_id);
-                 ctrp = (vl_api_ip4_fib_counter_t *) mp->c;
-               }
-
-             j++;
-             if (j < n_nhs)
-               {
-                 n_left = nhs[j].weight;
-                 vlib_counter_zero (&sum);
-               }
-           }
-       }                       /* for each (mp or single) adj */
-      if (sm->data_structure_lock->release_hint)
-       {
-         start_at_fib_index = fib - im4->fibs;
-         dsunlock (sm);
-         ip46_fib_stats_delay (sm, 0 /* sec */ , STATS_RELEASE_DELAY_NS);
-         mp->count = 0;
-         ctrp = (vl_api_ip4_fib_counter_t *) mp->c;
-         goto again;
-       }
-    }                          /* vec_foreach (routes) */
+            /* already in net byte order */
+            ctrp->address = r->address.as_u32;
+            ctrp->address_length = r->address_length;
+            ctrp->packets = clib_host_to_net_u64 (c.packets);
+            ctrp->bytes = clib_host_to_net_u64 (c.bytes);
+            mp->count++;
+            ctrp++;
+
+            if (mp->count == items_this_message)
+              {
+                mp->count = htonl (items_this_message);
+                /*
+                 * If the main thread's input queue is stuffed,
+                 * drop the data structure lock (which the main thread
+                 * may want), and take a pause.
+                 */
+                unix_shared_memory_queue_lock (q);
+                if (unix_shared_memory_queue_is_full (q))
+                  {
+                    dsunlock (sm);
+                    vl_msg_api_send_shmem_nolock (q, (u8 *) & mp);
+                    unix_shared_memory_queue_unlock (q);
+                    mp = 0;
+                    ip46_fib_stats_delay (sm, 0 /* sec */ ,
+                                          STATS_RELEASE_DELAY_NS);
+                    goto again;
+                  }
+                vl_msg_api_send_shmem_nolock (q, (u8 *) & mp);
+                unix_shared_memory_queue_unlock (q);
+
+                items_this_message = IP4_FIB_COUNTER_BATCH_SIZE;
+                mp = vl_msg_api_alloc_as_if_client
+                  (sizeof (*mp) +
+                   items_this_message * sizeof (vl_api_ip4_fib_counter_t));
+                mp->_vl_msg_id = ntohs (VL_API_VNET_IP4_FIB_COUNTERS);
+                mp->count = 0;
+                mp->vrf_id = ntohl (fib->ft_table_id);
+                ctrp = (vl_api_ip4_fib_counter_t *) mp->c;
+              }
+          }                    /* if (c.packets > 0) */
+        if (sm->data_structure_lock->release_hint)
+          {
+            start_at_fib_index = fib - im4->fibs;
+            dsunlock (sm);
+            ip46_fib_stats_delay (sm, 0 /* sec */ , STATS_RELEASE_DELAY_NS);
+            mp->count = 0;
+            ctrp = (vl_api_ip4_fib_counter_t *) mp->c;
+            goto again;
+          }
+      }                                /* vec_foreach (routes) */
 
     dsunlock (sm);
 
@@ -439,7 +399,9 @@ again:
        vl_msg_api_send_shmem (q, (u8 *) & mp);
        mp = 0;
       }
-  }                            /* vec_foreach (fib) */
+  }));
+  /* *INDENT-ON* */
+
   /* If e.g. the last FIB had no reportable routes, free the buffer */
   if (mp)
     vl_msg_api_free (mp);
@@ -489,19 +451,19 @@ do_ip6_fibs (stats_main_t * sm)
   unix_shared_memory_queue_t *q = shmem_hdr->vl_input_queue;
   static ip6_route_t *routes;
   ip6_route_t *r;
-  ip6_fib_t *fib;
-  ip_lookup_main_t *lm = &im6->lookup_main;
+  fib_table_t *fib;
   static uword *results;
   vl_api_vnet_ip6_fib_counters_t *mp = 0;
   u32 items_this_message;
   vl_api_ip6_fib_counter_t *ctrp = 0;
   u32 start_at_fib_index = 0;
-  BVT (clib_bihash) * h = &im6->ip6_lookup_table;
+  BVT (clib_bihash) * h = &im6->ip6_table[IP6_FIB_TABLE_FWDING].ip6_hash;
   add_routes_in_fib_arg_t _a, *a = &_a;
 
 again:
-  vec_foreach (fib, im6->fibs)
-  {
+  /* *INDENT-OFF* */
+  pool_foreach (fib, im6->fibs,
+  ({
     /* We may have bailed out due to control-plane activity */
     while ((fib - im6->fibs) < start_at_fib_index)
       continue;
@@ -514,7 +476,7 @@ again:
           items_this_message * sizeof (vl_api_ip6_fib_counter_t));
        mp->_vl_msg_id = ntohs (VL_API_VNET_IP6_FIB_COUNTERS);
        mp->count = 0;
-       mp->vrf_id = ntohl (fib->table_id);
+       mp->vrf_id = ntohl (fib->ft_table_id);
        ctrp = (vl_api_ip6_fib_counter_t *) mp->c;
       }
 
@@ -544,105 +506,67 @@ again:
 
     vec_foreach (r, routes)
     {
-      vlib_counter_t c, sum;
-      uword i, j, n_left, n_nhs, adj_index, *result = 0;
-      ip_adjacency_t *adj;
-      ip_multipath_next_hop_t *nhs, tmp_nhs[1];
-
-      adj_index = r->index;
-      if (lm->fib_result_n_words > 1)
-       {
-         result = vec_elt_at_index (results, adj_index);
-         adj_index = result[0];
-       }
-
-      adj = ip_get_adjacency (lm, adj_index);
-      if (adj->n_adj == 1)
-       {
-         nhs = &tmp_nhs[0];
-         nhs[0].next_hop_adj_index = ~0;       /* not used */
-         nhs[0].weight = 1;
-         n_nhs = 1;
-       }
-      else
-       {
-         ip_multipath_adjacency_t *madj;
-         madj = vec_elt_at_index (lm->multipath_adjacencies,
-                                  adj->heap_handle);
-         nhs = heap_elt_at_index
-           (lm->next_hop_heap, madj->normalized_next_hops.heap_offset);
-         n_nhs = madj->normalized_next_hops.count;
-       }
+        vlib_counter_t c;
+
+        vlib_get_combined_counter (&load_balance_main.lbm_to_counters,
+                                   r->index, &c);
+        /*
+         * If it has actually
+         * seen at least one packet, send it.
+         */
+        if (c.packets > 0)
+          {
+            /* already in net byte order */
+            ctrp->address[0] = r->address.as_u64[0];
+            ctrp->address[1] = r->address.as_u64[1];
+            ctrp->address_length = (u8) r->address_length;
+            ctrp->packets = clib_host_to_net_u64 (c.packets);
+            ctrp->bytes = clib_host_to_net_u64 (c.bytes);
+            mp->count++;
+            ctrp++;
+
+            if (mp->count == items_this_message)
+              {
+                mp->count = htonl (items_this_message);
+                /*
+                 * If the main thread's input queue is stuffed,
+                 * drop the data structure lock (which the main thread
+                 * may want), and take a pause.
+                 */
+                unix_shared_memory_queue_lock (q);
+                if (unix_shared_memory_queue_is_full (q))
+                  {
+                    dsunlock (sm);
+                    vl_msg_api_send_shmem_nolock (q, (u8 *) & mp);
+                    unix_shared_memory_queue_unlock (q);
+                    mp = 0;
+                    ip46_fib_stats_delay (sm, 0 /* sec */ ,
+                                          STATS_RELEASE_DELAY_NS);
+                    goto again;
+                  }
+                vl_msg_api_send_shmem_nolock (q, (u8 *) & mp);
+                unix_shared_memory_queue_unlock (q);
+
+                items_this_message = IP6_FIB_COUNTER_BATCH_SIZE;
+                mp = vl_msg_api_alloc_as_if_client
+                  (sizeof (*mp) +
+                   items_this_message * sizeof (vl_api_ip6_fib_counter_t));
+                mp->_vl_msg_id = ntohs (VL_API_VNET_IP6_FIB_COUNTERS);
+                mp->count = 0;
+                mp->vrf_id = ntohl (fib->ft_table_id);
+                ctrp = (vl_api_ip6_fib_counter_t *) mp->c;
+              }
+          }
 
-      n_left = nhs[0].weight;
-      vlib_counter_zero (&sum);
-      for (i = j = 0; i < adj->n_adj; i++)
-       {
-         n_left -= 1;
-         vlib_get_combined_counter (&lm->adjacency_counters,
-                                    adj_index + i, &c);
-         vlib_counter_add (&sum, &c);
-         if (n_left == 0 && sum.packets > 0)
-           {
-
-             /* already in net byte order */
-             ctrp->address[0] = r->address.as_u64[0];
-             ctrp->address[1] = r->address.as_u64[1];
-             ctrp->address_length = (u8) r->address_length;
-             ctrp->packets = clib_host_to_net_u64 (sum.packets);
-             ctrp->bytes = clib_host_to_net_u64 (sum.bytes);
-             mp->count++;
-             ctrp++;
-
-             if (mp->count == items_this_message)
-               {
-                 mp->count = htonl (items_this_message);
-                 /*
-                  * If the main thread's input queue is stuffed,
-                  * drop the data structure lock (which the main thread
-                  * may want), and take a pause.
-                  */
-                 unix_shared_memory_queue_lock (q);
-                 if (unix_shared_memory_queue_is_full (q))
-                   {
-                     dsunlock (sm);
-                     vl_msg_api_send_shmem_nolock (q, (u8 *) & mp);
-                     unix_shared_memory_queue_unlock (q);
-                     mp = 0;
-                     ip46_fib_stats_delay (sm, 0 /* sec */ ,
-                                           STATS_RELEASE_DELAY_NS);
-                     goto again;
-                   }
-                 vl_msg_api_send_shmem_nolock (q, (u8 *) & mp);
-                 unix_shared_memory_queue_unlock (q);
-
-                 items_this_message = IP6_FIB_COUNTER_BATCH_SIZE;
-                 mp = vl_msg_api_alloc_as_if_client
-                   (sizeof (*mp) +
-                    items_this_message * sizeof (vl_api_ip6_fib_counter_t));
-                 mp->_vl_msg_id = ntohs (VL_API_VNET_IP6_FIB_COUNTERS);
-                 mp->count = 0;
-                 mp->vrf_id = ntohl (fib->table_id);
-                 ctrp = (vl_api_ip6_fib_counter_t *) mp->c;
-               }
-
-             j++;
-             if (j < n_nhs)
-               {
-                 n_left = nhs[j].weight;
-                 vlib_counter_zero (&sum);
-               }
-           }
-       }                       /* for each (mp or single) adj */
-      if (sm->data_structure_lock->release_hint)
-       {
-         start_at_fib_index = fib - im6->fibs;
-         dsunlock (sm);
-         ip46_fib_stats_delay (sm, 0 /* sec */ , STATS_RELEASE_DELAY_NS);
-         mp->count = 0;
-         ctrp = (vl_api_ip6_fib_counter_t *) mp->c;
-         goto again;
-       }
+        if (sm->data_structure_lock->release_hint)
+          {
+            start_at_fib_index = fib - im6->fibs;
+            dsunlock (sm);
+            ip46_fib_stats_delay (sm, 0 /* sec */ , STATS_RELEASE_DELAY_NS);
+            mp->count = 0;
+            ctrp = (vl_api_ip6_fib_counter_t *) mp->c;
+            goto again;
+          }
     }                          /* vec_foreach (routes) */
 
     dsunlock (sm);
@@ -654,7 +578,9 @@ again:
        vl_msg_api_send_shmem (q, (u8 *) & mp);
        mp = 0;
       }
-  }                            /* vec_foreach (fib) */
+  }));
+  /* *INDENT-ON* */
+
   /* If e.g. the last FIB had no reportable routes, free the buffer */
   if (mp)
     vl_msg_api_free (mp);
index 6630960..1a46eb9 100644 (file)
@@ -54,7 +54,7 @@
 #include <vnet/ip/ip6.h>
 #include <vnet/unix/tuntap.h>
 #include <vnet/unix/tapcli.h>
-#include <vnet/mpls-gre/mpls.h>
+#include <vnet/mpls/mpls.h>
 #include <vnet/dhcp/proxy.h>
 #include <vnet/dhcp/client.h>
 #if IPV6SR > 0
 #include <vnet/l2/l2_bd.h>
 #include <vpp-api/vpe_msg_enum.h>
 
+#include <vnet/fib/ip6_fib.h>
+#include <vnet/fib/ip4_fib.h>
+#include <vnet/dpo/drop_dpo.h>
+#include <vnet/dpo/receive_dpo.h>
+#include <vnet/dpo/lookup_dpo.h>
+#include <vnet/dpo/classify_dpo.h>
+
 #define f64_endian(a)
 #define f64_print(a,b)
 
@@ -244,6 +251,7 @@ _(IP_ADD_DEL_ROUTE, ip_add_del_route)                                   \
 _(IS_ADDRESS_REACHABLE, is_address_reachable)                           \
 _(SW_INTERFACE_ADD_DEL_ADDRESS, sw_interface_add_del_address)           \
 _(SW_INTERFACE_SET_TABLE, sw_interface_set_table)                       \
+_(SW_INTERFACE_SET_MPLS_ENABLE, sw_interface_set_mpls_enable)           \
 _(SW_INTERFACE_SET_VPATH, sw_interface_set_vpath)                       \
 _(SW_INTERFACE_SET_L2_XCONNECT, sw_interface_set_l2_xconnect)           \
 _(SW_INTERFACE_SET_L2_BRIDGE, sw_interface_set_l2_bridge)               \
@@ -979,251 +987,165 @@ VLIB_REGISTER_NODE (vpe_resolver_process_node,static) = {
 /* *INDENT-ON* */
 
 static int
-ip4_add_del_route_t_handler (vl_api_ip_add_del_route_t * mp)
+ip_add_del_route_t_handler (vl_api_ip_add_del_route_t * mp,
+                           u32 fib_index,
+                           const fib_prefix_t * prefix,
+                           const ip46_address_t * next_hop,
+                           u32 next_hop_sw_if_index,
+                           u32 next_hop_fib_index, u32 next_hop_weight)
 {
-  ip4_main_t *im = &ip4_main;
-  ip_lookup_main_t *lm = &im->lookup_main;
   vnet_classify_main_t *cm = &vnet_classify_main;
+  fib_protocol_t proto = prefix->fp_proto;
   stats_main_t *sm = &stats_main;
-  ip4_add_del_route_args_t a;
-  ip4_address_t next_hop_address;
-  u32 fib_index;
-  vpe_api_main_t *vam = &vpe_api_main;
-  vnet_main_t *vnm = vam->vnet_main;
-  vlib_main_t *vm = vlib_get_main ();
-  pending_route_t *pr;
-  vl_api_ip_add_del_route_t *adr;
-  uword *p;
-  clib_error_t *e;
-  u32 ai;
-  ip_adjacency_t *adj;
-
-  p = hash_get (im->fib_index_by_table_id, ntohl (mp->vrf_id));
-  if (!p)
-    {
-      if (mp->create_vrf_if_needed)
-       {
-         ip4_fib_t *f;
-         f = find_ip4_fib_by_table_index_or_id (im, ntohl (mp->vrf_id),
-                                                0 /* flags */ );
-         fib_index = f->index;
-       }
-      else
-       {
-         /* No such VRF, and we weren't asked to create one */
-         return VNET_API_ERROR_NO_SUCH_FIB;
-       }
-    }
-  else
-    {
-      fib_index = p[0];
-    }
-
-  if (~0 != mp->next_hop_sw_if_index &&
-      pool_is_free_index (vnm->interface_main.sw_interfaces,
-                         ntohl (mp->next_hop_sw_if_index)))
-    return VNET_API_ERROR_NO_MATCHING_INTERFACE;
-
-  clib_memcpy (next_hop_address.data, mp->next_hop_address,
-              sizeof (next_hop_address.data));
-
-  /* Arp for the next_hop if necessary */
-  if (mp->is_add && mp->resolve_if_needed && ~0 != mp->next_hop_sw_if_index)
-    {
-      u32 lookup_result;
-      ip_adjacency_t *adj;
-
-      lookup_result = ip4_fib_lookup_with_table
-       (im, fib_index, &next_hop_address, 1 /* disable default route */ );
-
-      adj = ip_get_adjacency (lm, lookup_result);
-
-      if (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP)
-       {
-         pool_get (vam->pending_routes, pr);
-         pr->resolve_type = RESOLVE_IP4_ADD_DEL_ROUTE;
-         adr = &pr->r;
-         clib_memcpy (adr, mp, sizeof (*adr));
-         /* recursion block, "just in case" */
-         adr->resolve_if_needed = 0;
-         adr->resolve_attempts = ntohl (mp->resolve_attempts);
-         vnet_register_ip4_arp_resolution_event
-           (vnm, &next_hop_address, vpe_resolver_process_node.index,
-            RESOLUTION_EVENT, pr - vam->pending_routes);
-
-         vlib_process_signal_event
-           (vm, vpe_resolver_process_node.index,
-            RESOLUTION_PENDING_EVENT, 0 /* data */ );
-
-         /* The interface may be down, etc. */
-         e = ip4_probe_neighbor
-           (vm, (ip4_address_t *) & (mp->next_hop_address),
-            ntohl (mp->next_hop_sw_if_index));
-
-         if (e)
-           clib_error_report (e);
-
-         return VNET_API_ERROR_IN_PROGRESS;
-       }
-    }
 
   if (mp->is_multipath)
     {
-      u32 flags;
+      fib_route_path_flags_t path_flags = FIB_ROUTE_PATH_FLAG_NONE;
 
       dslock (sm, 1 /* release hint */ , 10 /* tag */ );
 
+      if (mp->is_resolve_host)
+       path_flags |= FIB_ROUTE_PATH_RESOLVE_VIA_HOST;
+      if (mp->is_resolve_attached)
+       path_flags |= FIB_ROUTE_PATH_RESOLVE_VIA_ATTACHED;
+
       if (mp->is_add)
-       flags = IP4_ROUTE_FLAG_ADD;
+       fib_table_entry_path_add (fib_index,
+                                 prefix,
+                                 FIB_SOURCE_API,
+                                 FIB_ENTRY_FLAG_NONE,
+                                 prefix->fp_proto,
+                                 next_hop,
+                                 next_hop_sw_if_index,
+                                 next_hop_fib_index,
+                                 next_hop_weight,
+                                 MPLS_LABEL_INVALID, path_flags);
       else
-       flags = IP4_ROUTE_FLAG_DEL;
-
-      if (mp->not_last)
-       flags |= IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP;
-
-      ip4_add_del_route_next_hop (im, flags,
-                                 (ip4_address_t *) mp->dst_address,
-                                 (u32) mp->dst_address_length,
-                                 (ip4_address_t *) mp->next_hop_address,
-                                 ntohl (mp->next_hop_sw_if_index),
-                                 (u32) mp->next_hop_weight,
-                                 ~0 /* adj_index */ ,
-                                 fib_index);
+       fib_table_entry_path_remove (fib_index,
+                                    prefix,
+                                    FIB_SOURCE_API,
+                                    prefix->fp_proto,
+                                    next_hop,
+                                    next_hop_sw_if_index,
+                                    next_hop_fib_index,
+                                    next_hop_weight, path_flags);
+
       dsunlock (sm);
       return 0;
     }
 
-  memset (&a, 0, sizeof (a));
-  clib_memcpy (a.dst_address.data, mp->dst_address,
-              sizeof (a.dst_address.data));
-
-  a.dst_address_length = mp->dst_address_length;
-
-  a.flags = (mp->is_add ? IP4_ROUTE_FLAG_ADD : IP4_ROUTE_FLAG_DEL);
-  a.flags |= IP4_ROUTE_FLAG_FIB_INDEX;
-  a.table_index_or_table_id = fib_index;
-  a.add_adj = 0;
-  a.n_add_adj = 0;
-
-  if (mp->not_last)
-    a.flags |= IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP;
-
   dslock (sm, 1 /* release hint */ , 2 /* tag */ );
 
-  if (mp->is_add)
+  if (mp->is_drop || mp->is_local || mp->is_classify || mp->lookup_in_vrf)
     {
-      if (mp->is_drop)
-       ai = lm->drop_adj_index;
-      else if (mp->is_local)
-       ai = lm->local_adj_index;
-      else if (mp->is_classify)
+      /*
+       * special route types that link directly to the adj
+       */
+      if (mp->is_add)
        {
-         if (pool_is_free_index
-             (cm->tables, ntohl (mp->classify_table_index)))
+         dpo_id_t dpo = DPO_NULL;
+         dpo_proto_t dproto;
+
+         dproto = fib_proto_to_dpo (prefix->fp_proto);
+
+         if (mp->is_drop)
+           dpo_copy (&dpo, drop_dpo_get (dproto));
+         else if (mp->is_local)
+           receive_dpo_add_or_lock (dproto, ~0, NULL, &dpo);
+         else if (mp->is_classify)
            {
-             dsunlock (sm);
-             return VNET_API_ERROR_NO_SUCH_TABLE;
-           }
-         adj = ip_add_adjacency (lm,
-                                 /* template */ 0,
-                                 /* block size */ 1,
-                                 &ai);
+             if (pool_is_free_index (cm->tables,
+                                     ntohl (mp->classify_table_index)))
+               {
+                 dsunlock (sm);
+                 return VNET_API_ERROR_NO_SUCH_TABLE;
+               }
 
-         adj->lookup_next_index = IP_LOOKUP_NEXT_CLASSIFY;
-         adj->classify.table_index = ntohl (mp->classify_table_index);
-       }
-      else if (mp->lookup_in_vrf)
-       {
-         p = hash_get (im->fib_index_by_table_id, ntohl (mp->lookup_in_vrf));
-         if (p)
+             dpo_set (&dpo, DPO_CLASSIFY, dproto,	/* DPO-space protocol, as used by the sibling drop/receive/lookup DPOs */
+                      classify_dpo_create (proto,
+                                           ntohl
+                                           (mp->classify_table_index)));
+           }
+         else if (mp->lookup_in_vrf)
            {
-             adj = ip_add_adjacency (lm,
-                                     /* template */ 0,
-                                     /* block size */ 1,
-                                     &ai);
-             adj->explicit_fib_index = p[0];
+             next_hop_fib_index =
+               fib_table_id_find_fib_index (dproto,
+                                            ntohl (mp->lookup_in_vrf));
+             if (~0 == next_hop_fib_index)
+               {
+                 dsunlock (sm);
+                 return VNET_API_ERROR_NO_SUCH_INNER_FIB;
+               }
+
+             lookup_dpo_add_or_lock_w_fib_index (next_hop_fib_index,
+                                                 dproto,
+                                                 LOOKUP_INPUT_DST_ADDR,
+                                                 LOOKUP_TABLE_FROM_CONFIG,
+                                                 &dpo);
            }
          else
            {
              dsunlock (sm);
-             return VNET_API_ERROR_NO_SUCH_INNER_FIB;
+             return VNET_API_ERROR_NO_SUCH_TABLE;
            }
+
+         fib_table_entry_special_dpo_add (fib_index,
+                                          prefix,
+                                          FIB_SOURCE_API,
+                                          FIB_ENTRY_FLAG_EXCLUSIVE, &dpo);
+         dpo_reset (&dpo);
        }
       else
-       ai = ip4_route_get_next_hop_adj (im,
-                                        fib_index,
-                                        &next_hop_address,
-                                        ntohl (mp->next_hop_sw_if_index),
-                                        fib_index);
-
-      if (ai == lm->miss_adj_index)
        {
-         dsunlock (sm);
-         return VNET_API_ERROR_NO_SUCH_INNER_FIB;
+         fib_table_entry_special_remove (fib_index, prefix, FIB_SOURCE_API);
        }
     }
   else
     {
-      ip_adjacency_t *adj;
-      int disable_default_route = 1;
-
-      /* Trying to delete the default route? */
-      if (a.dst_address.as_u32 == 0 && a.dst_address_length == 0)
-       disable_default_route = 0;
-
-      ai = ip4_fib_lookup_with_table
-       (im, fib_index, &a.dst_address, disable_default_route);
-      if (ai == lm->miss_adj_index)
+      if (mp->is_add)
        {
-         dsunlock (sm);
-         return VNET_API_ERROR_UNKNOWN_DESTINATION;
+         fib_route_path_flags_t path_flags = FIB_ROUTE_PATH_FLAG_NONE;
+
+         if (mp->is_resolve_host)
+           path_flags |= FIB_ROUTE_PATH_RESOLVE_VIA_HOST;
+         if (mp->is_resolve_attached)
+           path_flags |= FIB_ROUTE_PATH_RESOLVE_VIA_ATTACHED;
+
+         fib_table_entry_update_one_path (fib_index,
+                                          prefix,
+                                          FIB_SOURCE_API,
+                                          FIB_ENTRY_FLAG_NONE,
+                                          prefix->fp_proto,
+                                          next_hop,
+                                          next_hop_sw_if_index,
+                                          next_hop_fib_index,
+                                          next_hop_weight,
+                                          MPLS_LABEL_INVALID, path_flags);
        }
-
-      adj = ip_get_adjacency (lm, ai);
-      if (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP)
+      else
        {
-         dsunlock (sm);
-         return VNET_API_ERROR_ADDRESS_MATCHES_INTERFACE_ADDRESS;
+         fib_table_entry_delete (fib_index, prefix, FIB_SOURCE_API);
        }
     }
 
-  a.adj_index = ai;
-  ip4_add_del_route (im, &a);
-
   dsunlock (sm);
-  return 0;
+  return (0);
 }
 
 static int
-ip6_add_del_route_t_handler (vl_api_ip_add_del_route_t * mp)
+ip4_add_del_route_t_handler (vl_api_ip_add_del_route_t * mp)
 {
-  ip6_main_t *im = &ip6_main;
-  ip_lookup_main_t *lm = &im->lookup_main;
-  vnet_main_t *vnm = vnet_get_main ();
-  vlib_main_t *vm = vlib_get_main ();
   vpe_api_main_t *vam = &vpe_api_main;
-  stats_main_t *sm = &stats_main;
-  ip6_add_del_route_args_t a;
-  ip6_address_t next_hop_address;
-  pending_route_t *pr;
-  vl_api_ip_add_del_route_t *adr;
-
+  vnet_main_t *vnm = vam->vnet_main;
   u32 fib_index;
-  uword *p;
-  clib_error_t *e;
-  ip_adjacency_t *adj = 0;
-  u32 ai;
-
-  p = hash_get (im->fib_index_by_table_id, ntohl (mp->vrf_id));
 
-  if (!p)
+  fib_index = ip4_fib_index_from_table_id (ntohl (mp->vrf_id));
+  if (~0 == fib_index)
     {
       if (mp->create_vrf_if_needed)
        {
-         ip6_fib_t *f;
-         f = find_ip6_fib_by_table_index_or_id (im, ntohl (mp->vrf_id),
-                                                0 /* flags */ );
-         fib_index = f->index;
+         fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4,
+                                                        ntohl (mp->vrf_id));
        }
       else
        {
@@ -1231,160 +1153,66 @@ ip6_add_del_route_t_handler (vl_api_ip_add_del_route_t * mp)
          return VNET_API_ERROR_NO_SUCH_FIB;
        }
     }
-  else
-    {
-      fib_index = p[0];
-    }
 
-  if (~0 != mp->next_hop_sw_if_index &&
+  if (~0 != ntohl (mp->next_hop_sw_if_index) &&
       pool_is_free_index (vnm->interface_main.sw_interfaces,
                          ntohl (mp->next_hop_sw_if_index)))
     return VNET_API_ERROR_NO_MATCHING_INTERFACE;
 
-  clib_memcpy (next_hop_address.as_u8, mp->next_hop_address,
-              sizeof (next_hop_address.as_u8));
-
-  /* Arp for the next_hop if necessary */
-  if (mp->is_add && mp->resolve_if_needed && ~0 != mp->next_hop_sw_if_index)
-    {
-      u32 lookup_result;
-      ip_adjacency_t *adj;
-
-      lookup_result = ip6_fib_lookup_with_table
-       (im, fib_index, &next_hop_address);
-
-      adj = ip_get_adjacency (lm, lookup_result);
-
-      if (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP)
-       {
-         pool_get (vam->pending_routes, pr);
-         adr = &pr->r;
-         pr->resolve_type = RESOLVE_IP6_ADD_DEL_ROUTE;
-         clib_memcpy (adr, mp, sizeof (*adr));
-         /* recursion block, "just in case" */
-         adr->resolve_if_needed = 0;
-         adr->resolve_attempts = ntohl (mp->resolve_attempts);
-         vnet_register_ip6_neighbor_resolution_event
-           (vnm, &next_hop_address, vpe_resolver_process_node.index,
-            RESOLUTION_EVENT, pr - vam->pending_routes);
-
-         vlib_process_signal_event
-           (vm, vpe_resolver_process_node.index,
-            RESOLUTION_PENDING_EVENT, 0 /* data */ );
-
-         /* The interface may be down, etc. */
-         e = ip6_probe_neighbor
-           (vm, (ip6_address_t *) & (mp->next_hop_address),
-            ntohl (mp->next_hop_sw_if_index));
-
-         if (e)
-           clib_error_report (e);
-
-         return VNET_API_ERROR_IN_PROGRESS;
-       }
-    }
-
-  if (mp->is_multipath)
-    {
-      u32 flags;
-
-      dslock (sm, 1 /* release hint */ , 11 /* tag */ );
-
-      if (mp->is_add)
-       flags = IP6_ROUTE_FLAG_ADD;
-      else
-       flags = IP6_ROUTE_FLAG_DEL;
-
-      if (mp->not_last)
-       flags |= IP6_ROUTE_FLAG_NOT_LAST_IN_GROUP;
-
-      ip6_add_del_route_next_hop (im, flags,
-                                 (ip6_address_t *) mp->dst_address,
-                                 (u32) mp->dst_address_length,
-                                 (ip6_address_t *) mp->next_hop_address,
-                                 ntohl (mp->next_hop_sw_if_index),
-                                 (u32) mp->next_hop_weight,
-                                 ~0 /* adj_index */ ,
-                                 fib_index);
-      dsunlock (sm);
-      return 0;
-    }
-
-  memset (&a, 0, sizeof (a));
-  clib_memcpy (a.dst_address.as_u8, mp->dst_address,
-              sizeof (a.dst_address.as_u8));
-
-  a.dst_address_length = mp->dst_address_length;
+  fib_prefix_t pfx = {
+    .fp_len = mp->dst_address_length,
+    .fp_proto = FIB_PROTOCOL_IP4,
+  };
+  clib_memcpy (&pfx.fp_addr.ip4, mp->dst_address, sizeof (pfx.fp_addr.ip4));
 
-  a.flags = (mp->is_add ? IP6_ROUTE_FLAG_ADD : IP6_ROUTE_FLAG_DEL);
-  a.flags |= IP6_ROUTE_FLAG_FIB_INDEX;
-  a.table_index_or_table_id = fib_index;
-  a.add_adj = 0;
-  a.n_add_adj = 0;
+  ip46_address_t nh;
+  memset (&nh, 0, sizeof (nh));
+  memcpy (&nh.ip4, mp->next_hop_address, sizeof (nh.ip4));
 
-  if (mp->not_last)
-    a.flags |= IP6_ROUTE_FLAG_NOT_LAST_IN_GROUP;
+  return (ip_add_del_route_t_handler (mp, fib_index, &pfx, &nh,
+                                     ntohl (mp->next_hop_sw_if_index),
+                                     fib_index, (u32) mp->next_hop_weight));
+}
 
-  dslock (sm, 1 /* release hint */ , 3 /* tag */ );
+static int
+ip6_add_del_route_t_handler (vl_api_ip_add_del_route_t * mp)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  u32 fib_index;
 
-  if (mp->is_add)
+  fib_index = ip6_fib_index_from_table_id (ntohl (mp->vrf_id));
+  if (~0 == fib_index)
     {
-      if (mp->is_drop)
-       ai = lm->drop_adj_index;
-      else if (mp->is_local)
-       ai = lm->local_adj_index;
-      else if (mp->lookup_in_vrf)
+      if (mp->create_vrf_if_needed)
        {
-         p = hash_get (im->fib_index_by_table_id, ntohl (mp->lookup_in_vrf));
-         if (p)
-           {
-             adj = ip_add_adjacency (lm,
-                                     /* template */ 0,
-                                     /* block size */ 1,
-                                     &ai);
-             adj->explicit_fib_index = p[0];
-           }
-         else
-           {
-             dsunlock (sm);
-             return VNET_API_ERROR_NO_SUCH_INNER_FIB;
-           }
+         fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP6,
+                                                        ntohl (mp->vrf_id));
        }
       else
-       ai = ip6_route_get_next_hop_adj (im,
-                                        fib_index,
-                                        &next_hop_address,
-                                        ntohl (mp->next_hop_sw_if_index),
-                                        fib_index);
-      if (ai == lm->miss_adj_index)
        {
-         dsunlock (sm);
-         return VNET_API_ERROR_NEXT_HOP_NOT_IN_FIB;
+         /* No such VRF, and we weren't asked to create one */
+         return VNET_API_ERROR_NO_SUCH_FIB;
        }
     }
-  else
-    {
-      ip_adjacency_t *adj;
 
-      ai = ip6_fib_lookup_with_table (im, fib_index, &a.dst_address);
-      if (ai == lm->miss_adj_index)
-       {
-         dsunlock (sm);
-         return VNET_API_ERROR_UNKNOWN_DESTINATION;
-       }
-      adj = ip_get_adjacency (lm, ai);
-      if (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP)
-       {
-         dsunlock (sm);
-         return VNET_API_ERROR_ADDRESS_MATCHES_INTERFACE_ADDRESS;
-       }
-    }
+  if (~0 != ntohl (mp->next_hop_sw_if_index) &&
+      pool_is_free_index (vnm->interface_main.sw_interfaces,
+                         ntohl (mp->next_hop_sw_if_index)))
+    return VNET_API_ERROR_NO_MATCHING_INTERFACE;
+
+  fib_prefix_t pfx = {
+    .fp_len = mp->dst_address_length,
+    .fp_proto = FIB_PROTOCOL_IP6,
+  };
+  clib_memcpy (&pfx.fp_addr.ip6, mp->dst_address, sizeof (pfx.fp_addr.ip6));
 
-  a.adj_index = ai;
-  ip6_add_del_route (im, &a);
+  ip46_address_t nh;
+  memset (&nh, 0, sizeof (nh));
+  memcpy (&nh.ip6, mp->next_hop_address, sizeof (nh.ip6));
 
-  dsunlock (sm);
-  return 0;
+  return (ip_add_del_route_t_handler (mp, fib_index, &pfx,
+                                     &nh, ntohl (mp->next_hop_sw_if_index),
+                                     fib_index, (u32) mp->next_hop_weight));
 }
 
 void
@@ -1406,48 +1234,6 @@ vl_api_ip_add_del_route_t_handler (vl_api_ip_add_del_route_t * mp)
   REPLY_MACRO (VL_API_IP_ADD_DEL_ROUTE_REPLY);
 }
 
-void
-api_config_default_ip_route (u8 is_ipv6, u8 is_add, u32 vrf_id,
-                            u32 sw_if_index, u8 * next_hop_addr)
-{
-  vl_api_ip_add_del_route_t mp;
-  int rv;
-
-  memset (&mp, 0, sizeof (vl_api_ip_add_del_route_t));
-
-  /*
-   * Configure default IP route:
-   *  - ip route add 0.0.0.0/1 via <GW IP>
-   *  - ip route add 128.0.0.0/1 via <GW IP>
-   */
-  mp.next_hop_sw_if_index = ntohl (sw_if_index);
-  mp.vrf_id = vrf_id;
-  mp.resolve_attempts = ~0;
-  mp.resolve_if_needed = 1;
-  mp.is_add = is_add;
-  mp.is_ipv6 = is_ipv6;
-  mp.next_hop_weight = 1;
-
-  clib_memcpy (&mp.next_hop_address[0], next_hop_addr, 16);
-
-  if (is_ipv6)
-    rv = ip6_add_del_route_t_handler (&mp);
-  else
-    {
-      mp.dst_address_length = 1;
-
-      mp.dst_address[0] = 0;
-      rv = ip4_add_del_route_t_handler (&mp);
-
-      mp.dst_address[0] = 128;
-      rv |= ip4_add_del_route_t_handler (&mp);
-    }
-
-  if (rv)
-    clib_error_return (0, "failed to config default IP route");
-
-}
-
 static void
   vl_api_sw_interface_add_del_address_t_handler
   (vl_api_sw_interface_add_del_address_t * mp)
@@ -1485,6 +1271,7 @@ vl_api_sw_interface_set_table_t_handler (vl_api_sw_interface_set_table_t * mp)
   u32 sw_if_index = ntohl (mp->sw_if_index);
   vl_api_sw_interface_set_table_reply_t *rmp;
   stats_main_t *sm = &stats_main;
+  u32 fib_index;
 
   VALIDATE_SW_IF_INDEX (mp);
 
@@ -1492,35 +1279,20 @@ vl_api_sw_interface_set_table_t_handler (vl_api_sw_interface_set_table_t * mp)
 
   if (mp->is_ipv6)
     {
-      ip6_main_t *im = &ip6_main;
-      ip6_fib_t *fib = find_ip6_fib_by_table_index_or_id (im, table_id,
-                                                         IP6_ROUTE_FLAG_TABLE_ID);
-      if (fib)
-       {
-         vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
-         im->fib_index_by_sw_if_index[sw_if_index] = fib->index;
-       }
-      else
-       {
-         rv = VNET_API_ERROR_NO_SUCH_FIB;
-       }
+      fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP6,
+                                                    table_id);
+
+      vec_validate (ip6_main.fib_index_by_sw_if_index, sw_if_index);
+      ip6_main.fib_index_by_sw_if_index[sw_if_index] = fib_index;
     }
   else
     {
-      ip4_main_t *im = &ip4_main;
-      ip4_fib_t *fib = find_ip4_fib_by_table_index_or_id
-       (im, table_id, IP4_ROUTE_FLAG_TABLE_ID);
 
-      /* Truthfully this can't fail */
-      if (fib)
-       {
-         vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
-         im->fib_index_by_sw_if_index[sw_if_index] = fib->index;
-       }
-      else
-       {
-         rv = VNET_API_ERROR_NO_SUCH_FIB;
-       }
+      fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4,
+                                                    table_id);
+
+      vec_validate (ip4_main.fib_index_by_sw_if_index, sw_if_index);
+      ip4_main.fib_index_by_sw_if_index[sw_if_index] = fib_index;
     }
   dsunlock (sm);
 
@@ -2303,10 +2075,11 @@ static int mpls_ethernet_add_del_tunnel_2_t_handler
   if (inner_fib_index == outer_fib_index)
     return VNET_API_ERROR_INVALID_VALUE;
 
-  lookup_result = ip4_fib_lookup_with_table
-    (im, outer_fib_index,
-     (ip4_address_t *) mp->next_hop_ip4_address_in_outer_vrf,
-     1 /* disable default route */ );
+  // FIXME: lookup result is not an ADJ index but is used as one below
+  lookup_result = ip4_fib_table_lookup_lb (ip4_fib_get (outer_fib_index),
+                                          (ip4_address_t *)
+                                          mp->
+                                          next_hop_ip4_address_in_outer_vrf);
 
   adj = ip_get_adjacency (lm, lookup_result);
   tx_sw_if_index = adj->rewrite_header.sw_if_index;
@@ -2489,14 +2262,18 @@ vl_api_ip_neighbor_add_del_t_handler (vl_api_ip_neighbor_add_del_t * mp,
 {
   vl_api_ip_neighbor_add_del_reply_t *rmp;
   vnet_main_t *vnm = vnet_get_main ();
-  u32 fib_index;
-  int rv = 0;
   stats_main_t *sm = &stats_main;
+  int rv = 0;
 
   VALIDATE_SW_IF_INDEX (mp);
 
   dslock (sm, 1 /* release hint */ , 7 /* tag */ );
 
+  /*
+   * there's no validation here of the ND/ARP entry being added.
+   * The expectation is that the FIB will ensure that nothing bad
+   * will come of adding bogus entries.
+   */
   if (mp->is_ipv6)
     {
       if (mp->is_add)
@@ -2512,56 +2289,21 @@ vl_api_ip_neighbor_add_del_t_handler (vl_api_ip_neighbor_add_del_t * mp,
     }
   else
     {
-      ip4_main_t *im = &ip4_main;
-      ip_lookup_main_t *lm = &im->lookup_main;
       ethernet_arp_ip4_over_ethernet_address_t a;
-      u32 ai;
-      ip_adjacency_t *nh_adj;
-
-      uword *p = hash_get (im->fib_index_by_table_id, ntohl (mp->vrf_id));
-      if (!p)
-       {
-         rv = VNET_API_ERROR_NO_SUCH_FIB;
-         goto out;
-       }
-      fib_index = p[0];
-
-      /*
-       * Unfortunately, folks have a penchant for
-       * adding interface addresses to the ARP cache, and
-       * wondering why the forwarder eventually ASSERTs...
-       */
-      ai = ip4_fib_lookup_with_table
-       (im, fib_index, (ip4_address_t *) (mp->dst_address),
-        1 /* disable default route */ );
-
-      if (ai != 0)
-       {
-         nh_adj = ip_get_adjacency (lm, ai);
-         /* Never allow manipulation of a local adj! */
-         if (nh_adj->lookup_next_index == IP_LOOKUP_NEXT_LOCAL)
-           {
-             clib_warning ("%U matches local adj",
-                           format_ip4_address,
-                           (ip4_address_t *) (mp->dst_address));
-             rv = VNET_API_ERROR_ADDRESS_MATCHES_INTERFACE_ADDRESS;
-             goto out;
-           }
-       }
 
       clib_memcpy (&a.ethernet, mp->mac_address, 6);
       clib_memcpy (&a.ip4, mp->dst_address, 4);
 
       if (mp->is_add)
        rv = vnet_arp_set_ip4_over_ethernet (vnm, ntohl (mp->sw_if_index),
-                                            fib_index, &a, mp->is_static);
+                                            &a, mp->is_static);
       else
-       rv = vnet_arp_unset_ip4_over_ethernet (vnm, ntohl (mp->sw_if_index),
-                                              fib_index, &a);
+       rv =
+         vnet_arp_unset_ip4_over_ethernet (vnm, ntohl (mp->sw_if_index), &a);
     }
 
   BAD_SW_IF_INDEX_LABEL;
-out:
+
   dsunlock (sm);
   REPLY_MACRO (VL_API_IP_NEIGHBOR_ADD_DEL_REPLY);
 }
@@ -2604,6 +2346,7 @@ vl_api_is_address_reachable_t_handler (vl_api_is_address_reachable_t * mp)
   else
     {
       lm = &im4->lookup_main;
+      // FIXME: ip4_fib_lookup () result is NOT an ADJ index
       adj_index = ip4_fib_lookup (im4, sw_if_index, &addr.ip4);
     }
   if (adj_index == ~0)
@@ -2671,6 +2414,22 @@ vl_api_sw_interface_set_flags_t_handler (vl_api_sw_interface_set_flags_t * mp)
   REPLY_MACRO (VL_API_SW_INTERFACE_SET_FLAGS_REPLY);
 }
 
+static void
+  vl_api_sw_interface_set_mpls_enable_t_handler
+  (vl_api_sw_interface_set_mpls_enable_t * mp)
+{
+  vl_api_sw_interface_set_mpls_enable_reply_t *rmp;
+  int rv = 0;
+
+  VALIDATE_SW_IF_INDEX (mp);
+
+  mpls_sw_interface_enable_disable (&mpls_main,
+                                   ntohl (mp->sw_if_index), mp->enable);
+
+  BAD_SW_IF_INDEX_LABEL;
+  REPLY_MACRO (VL_API_SW_INTERFACE_SET_MPLS_ENABLE_REPLY);
+}
+
 static void
 vl_api_sw_interface_clear_stats_t_handler (vl_api_sw_interface_clear_stats_t *
                                           mp)
@@ -2999,10 +2758,9 @@ ip4_reset_fib_t_handler (vl_api_reset_fib_t * mp)
   vnet_main_t *vnm = vnet_get_main ();
   vnet_interface_main_t *im = &vnm->interface_main;
   ip4_main_t *im4 = &ip4_main;
-  static ip4_route_t *routes;
   static u32 *sw_if_indices_to_shut;
   stats_main_t *sm = &stats_main;
-  ip4_route_t *r;
+  fib_table_t *fib_table;
   ip4_fib_t *fib;
   u32 sw_if_index;
   int i;
@@ -3011,9 +2769,11 @@ ip4_reset_fib_t_handler (vl_api_reset_fib_t * mp)
 
   dslock (sm, 1 /* release hint */ , 8 /* tag */ );
 
-  vec_foreach (fib, im4->fibs)
-  {
-    vnet_sw_interface_t *si;
+  /* *INDENT-OFF* */
+  pool_foreach (fib_table, im4->fibs,
+  ({
+    fib = &fib_table->v4;
+    vnet_sw_interface_t * si;
 
     if (fib->table_id != target_fib_id)
       continue;
@@ -3033,100 +2793,37 @@ ip4_reset_fib_t_handler (vl_api_reset_fib_t * mp)
     vec_reset_length (sw_if_indices_to_shut);
 
     /* Shut down interfaces in this FIB / clean out intfc routes */
-    /* *INDENT-OFF* */
     pool_foreach (si, im->sw_interfaces,
     ({
       u32 sw_if_index = si->sw_if_index;
 
       if (sw_if_index < vec_len (im4->fib_index_by_sw_if_index)
           && (im4->fib_index_by_sw_if_index[si->sw_if_index] ==
-              fib - im4->fibs))
+              fib->index))
         vec_add1 (sw_if_indices_to_shut, si->sw_if_index);
     }));
-    /* *INDENT-ON* */
 
-    for (i = 0; i < vec_len (sw_if_indices_to_shut); i++)
-      {
-       sw_if_index = sw_if_indices_to_shut[i];
-       // vec_foreach (sw_if_index, sw_if_indices_to_shut) {
-
-       u32 flags = vnet_sw_interface_get_flags (vnm, sw_if_index);
-       flags &= ~(VNET_SW_INTERFACE_FLAG_ADMIN_UP);
-       vnet_sw_interface_set_flags (vnm, sw_if_index, flags);
-      }
+    for (i = 0; i < vec_len (sw_if_indices_to_shut); i++) {
+      sw_if_index = sw_if_indices_to_shut[i];
+      // vec_foreach (sw_if_index, sw_if_indices_to_shut) {
 
-    vec_reset_length (routes);
-
-    for (i = 0; i < ARRAY_LEN (fib->adj_index_by_dst_address); i++)
-      {
-       uword *hash = fib->adj_index_by_dst_address[i];
-       hash_pair_t *p;
-       ip4_route_t x;
-
-       x.address_length = i;
-
-        /* *INDENT-OFF* */
-        hash_foreach_pair (p, hash,
-        ({
-          x.address.data_u32 = p->key;
-          vec_add1 (routes, x);
-        }));
-        /* *INDENT-ON* */
-      }
-
-    vec_foreach (r, routes)
-    {
-      ip4_add_del_route_args_t a;
+      u32 flags = vnet_sw_interface_get_flags (vnm, sw_if_index);
+      flags &= ~(VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+      vnet_sw_interface_set_flags (vnm, sw_if_index, flags);
+    }
 
-      memset (&a, 0, sizeof (a));
-      a.flags = IP4_ROUTE_FLAG_FIB_INDEX | IP4_ROUTE_FLAG_DEL;
-      a.table_index_or_table_id = fib - im4->fibs;
-      a.dst_address = r->address;
-      a.dst_address_length = r->address_length;
-      a.adj_index = ~0;
+    fib_table_flush(fib->index, FIB_PROTOCOL_IP4, FIB_SOURCE_API);
+    fib_table_flush(fib->index, FIB_PROTOCOL_IP4, FIB_SOURCE_INTERFACE);
 
-      ip4_add_del_route (im4, &a);
-      ip4_maybe_remap_adjacencies (im4, fib - im4->fibs,
-                                  IP4_ROUTE_FLAG_FIB_INDEX);
-    }
     rv = 0;
     break;
-  }                            /* vec_foreach (fib) */
+    })); /* pool_foreach (fib) */
+    /* *INDENT-ON* */
 
   dsunlock (sm);
   return rv;
 }
 
-typedef struct
-{
-  ip6_address_t address;
-  u32 address_length;
-  u32 index;
-} ip6_route_t;
-
-typedef struct
-{
-  u32 fib_index;
-  ip6_route_t **routep;
-} add_routes_in_fib_arg_t;
-
-static void
-add_routes_in_fib (clib_bihash_kv_24_8_t * kvp, void *arg)
-{
-  add_routes_in_fib_arg_t *ap = arg;
-
-  if (kvp->key[2] >> 32 == ap->fib_index)
-    {
-      ip6_address_t *addr;
-      ip6_route_t *r;
-      addr = (ip6_address_t *) kvp;
-      vec_add2 (*ap->routep, r, 1);
-      r->address = addr[0];
-      r->address_length = kvp->key[2] & 0xFF;
-      r->index = kvp->value;
-    }
-}
-
 static int
 ip6_reset_fib_t_handler (vl_api_reset_fib_t * mp)
 {
@@ -3134,22 +2831,21 @@ ip6_reset_fib_t_handler (vl_api_reset_fib_t * mp)
   vnet_interface_main_t *im = &vnm->interface_main;
   ip6_main_t *im6 = &ip6_main;
   stats_main_t *sm = &stats_main;
-  static ip6_route_t *routes;
   static u32 *sw_if_indices_to_shut;
-  ip6_route_t *r;
+  fib_table_t *fib_table;
   ip6_fib_t *fib;
   u32 sw_if_index;
   int i;
   int rv = VNET_API_ERROR_NO_SUCH_FIB;
   u32 target_fib_id = ntohl (mp->vrf_id);
-  add_routes_in_fib_arg_t _a, *a = &_a;
-  clib_bihash_24_8_t *h = &im6->ip6_lookup_table;
 
   dslock (sm, 1 /* release hint */ , 9 /* tag */ );
 
-  vec_foreach (fib, im6->fibs)
-  {
-    vnet_sw_interface_t *si;
+  /* *INDENT-OFF* */
+  pool_foreach (fib_table, im6->fibs,
+  ({
+    vnet_sw_interface_t * si;
+    fib = &(fib_table->v6);
 
     if (fib->table_id != target_fib_id)
       continue;
@@ -3157,52 +2853,29 @@ ip6_reset_fib_t_handler (vl_api_reset_fib_t * mp)
     vec_reset_length (sw_if_indices_to_shut);
 
     /* Shut down interfaces in this FIB / clean out intfc routes */
-    /* *INDENT-OFF* */
     pool_foreach (si, im->sw_interfaces,
-    ({
-      if (im6->fib_index_by_sw_if_index[si->sw_if_index] ==
-          fib - im6->fibs)
-        vec_add1 (sw_if_indices_to_shut, si->sw_if_index);
-    }));
-    /* *INDENT-ON* */
+                  ({
+                    if (im6->fib_index_by_sw_if_index[si->sw_if_index] ==
+                        fib->index)
+                      vec_add1 (sw_if_indices_to_shut, si->sw_if_index);
+                  }));
 
-    for (i = 0; i < vec_len (sw_if_indices_to_shut); i++)
-      {
-       sw_if_index = sw_if_indices_to_shut[i];
-       // vec_foreach (sw_if_index, sw_if_indices_to_shut) {
-
-       u32 flags = vnet_sw_interface_get_flags (vnm, sw_if_index);
-       flags &= ~(VNET_SW_INTERFACE_FLAG_ADMIN_UP);
-       vnet_sw_interface_set_flags (vnm, sw_if_index, flags);
-      }
+    for (i = 0; i < vec_len (sw_if_indices_to_shut); i++) {
+      sw_if_index = sw_if_indices_to_shut[i];
+      // vec_foreach (sw_if_index, sw_if_indices_to_shut) {
 
-    vec_reset_length (routes);
-
-    a->fib_index = fib - im6->fibs;
-    a->routep = &routes;
-
-    clib_bihash_foreach_key_value_pair_24_8 (h, add_routes_in_fib, a);
-
-    vec_foreach (r, routes)
-    {
-      ip6_add_del_route_args_t a;
+      u32 flags = vnet_sw_interface_get_flags (vnm, sw_if_index);
+      flags &= ~(VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+      vnet_sw_interface_set_flags (vnm, sw_if_index, flags);
+    }
 
-      memset (&a, 0, sizeof (a));
-      a.flags = IP6_ROUTE_FLAG_FIB_INDEX | IP6_ROUTE_FLAG_DEL;
-      a.table_index_or_table_id = fib - im6->fibs;
-      a.dst_address = r->address;
-      a.dst_address_length = r->address_length;
-      a.adj_index = ~0;
+    fib_table_flush(fib->index, FIB_PROTOCOL_IP6, FIB_SOURCE_API);
+    fib_table_flush(fib->index, FIB_PROTOCOL_IP6, FIB_SOURCE_INTERFACE);
 
-      ip6_add_del_route (im6, &a);
-      ip6_maybe_remap_adjacencies (im6, fib - im6->fibs,
-                                  IP6_ROUTE_FLAG_FIB_INDEX);
-    }
     rv = 0;
-    /* Reinstall the neighbor / router discovery routes */
-    vnet_ip6_fib_init (im6, fib - im6->fibs);
     break;
-  }                            /* vec_foreach (fib) */
+  })); /* pool_foreach (fib) */
+  /* *INDENT-ON* */
 
   dsunlock (sm);
   return rv;
@@ -3519,7 +3192,7 @@ set_ip4_flow_hash (vl_api_set_ip_flow_hash_t * mp)
   vl_api_set_ip_flow_hash_reply_t *rmp;
   int rv;
   u32 table_id;
-  u32 flow_hash_config = 0;
+  flow_hash_config_t flow_hash_config = 0;
 
   table_id = ntohl (mp->vrf_id);
 
@@ -3580,11 +3253,15 @@ static void vl_api_sw_interface_set_unnumbered_t_handler
     {
       si->flags |= VNET_SW_INTERFACE_FLAG_UNNUMBERED;
       si->unnumbered_sw_if_index = sw_if_index;
+      ip4_sw_interface_enable_disable (sw_if_index, 1);
+      ip6_sw_interface_enable_disable (sw_if_index, 1);
     }
   else
     {
       si->flags &= ~(VNET_SW_INTERFACE_FLAG_UNNUMBERED);
       si->unnumbered_sw_if_index = (u32) ~ 0;
+      ip4_sw_interface_enable_disable (sw_if_index, 0);
+      ip6_sw_interface_enable_disable (sw_if_index, 0);
     }
 
 done:
@@ -3769,7 +3446,7 @@ vl_api_set_arp_neighbor_limit_t_handler (vl_api_set_arp_neighbor_limit_t * mp)
 static void vl_api_sr_tunnel_add_del_t_handler
   (vl_api_sr_tunnel_add_del_t * mp)
 {
-#if IPV6SR == 0
+#if IP6SR == 0
   clib_warning ("unimplemented");
 #else
   ip6_sr_add_del_tunnel_args_t _a, *a = &_a;
@@ -3832,7 +3509,7 @@ out:
 static void vl_api_sr_policy_add_del_t_handler
   (vl_api_sr_policy_add_del_t * mp)
 {
-#if IPV6SR == 0
+#if IP6SR == 0
   clib_warning ("unimplemented");
 #else
   ip6_sr_add_del_policy_args_t _a, *a = &_a;
@@ -3886,7 +3563,7 @@ out:
 static void vl_api_sr_multicast_map_add_del_t_handler
   (vl_api_sr_multicast_map_add_del_t * mp)
 {
-#if IPV6SR == 0
+#if IP6SR == 0
   clib_warning ("unimplemented");
 #else
   ip6_sr_add_del_multicastmap_args_t _a, *a = &_a;
@@ -4906,13 +4583,13 @@ static void send_vxlan_tunnel_details
     {
       memcpy (rmp->src_address, &(t->src.ip6), 16);
       memcpy (rmp->dst_address, &(t->dst.ip6), 16);
-      rmp->encap_vrf_id = htonl (im6->fibs[t->encap_fib_index].table_id);
+      rmp->encap_vrf_id = htonl (im6->fibs[t->encap_fib_index].ft_table_id);
     }
   else
     {
       memcpy (rmp->src_address, &(t->src.ip4), 4);
       memcpy (rmp->dst_address, &(t->dst.ip4), 4);
-      rmp->encap_vrf_id = htonl (im4->fibs[t->encap_fib_index].table_id);
+      rmp->encap_vrf_id = htonl (im4->fibs[t->encap_fib_index].ft_table_id);
     }
   rmp->vni = htonl (t->vni);
   rmp->decap_next_index = htonl (t->decap_next_index);
@@ -5018,7 +4695,7 @@ static void send_gre_tunnel_details
   rmp->_vl_msg_id = ntohs (VL_API_GRE_TUNNEL_DETAILS);
   clib_memcpy (rmp->src_address, &(t->tunnel_src), 4);
   clib_memcpy (rmp->dst_address, &(t->tunnel_dst), 4);
-  rmp->outer_fib_id = htonl (im->fibs[t->outer_fib_index].table_id);
+  rmp->outer_fib_id = htonl (im->fibs[t->outer_fib_index].ft_table_id);
   rmp->teb = t->teb;
   rmp->sw_if_index = htonl (t->sw_if_index);
   rmp->context = context;
@@ -5178,15 +4855,15 @@ static void send_vxlan_gpe_tunnel_details
     {
       memcpy (rmp->local, &(t->local.ip6), 16);
       memcpy (rmp->remote, &(t->remote.ip6), 16);
-      rmp->encap_vrf_id = htonl (im6->fibs[t->encap_fib_index].table_id);
-      rmp->decap_vrf_id = htonl (im6->fibs[t->decap_fib_index].table_id);
+      rmp->encap_vrf_id = htonl (im6->fibs[t->encap_fib_index].ft_table_id);
+      rmp->decap_vrf_id = htonl (im6->fibs[t->decap_fib_index].ft_table_id);
     }
   else
     {
       memcpy (rmp->local, &(t->local.ip4), 4);
       memcpy (rmp->remote, &(t->remote.ip4), 4);
-      rmp->encap_vrf_id = htonl (im4->fibs[t->encap_fib_index].table_id);
-      rmp->decap_vrf_id = htonl (im4->fibs[t->decap_fib_index].table_id);
+      rmp->encap_vrf_id = htonl (im4->fibs[t->encap_fib_index].ft_table_id);
+      rmp->decap_vrf_id = htonl (im4->fibs[t->decap_fib_index].ft_table_id);
     }
   rmp->vni = htonl (t->vni);
   rmp->protocol = t->protocol;
@@ -7630,7 +7307,6 @@ vl_api_mpls_fib_encap_dump_t_handler (vl_api_mpls_fib_encap_dump_t * mp)
   show_mpls_fib_t *records = 0;
   show_mpls_fib_t *s;
   mpls_main_t *mm = &mpls_main;
-  ip4_main_t *im = &ip4_main;
   ip4_fib_t *rx_fib;
 
   q = vl_api_client_index_to_input_queue (mp->client_index);
@@ -7660,7 +7336,7 @@ vl_api_mpls_fib_encap_dump_t_handler (vl_api_mpls_fib_encap_dump_t * mp)
   vlib_cli_output (vm, "%=6s%=16s%=16s", "Table", "Dest address", "Labels");
   vec_foreach (s, records)
   {
-    rx_fib = vec_elt_at_index (im->fibs, s->fib_index);
+    rx_fib = ip4_fib_get (s->fib_index);
     vlib_cli_output (vm, "%=6d%=16U%=16U", rx_fib->table_id,
                     format_ip4_address, &s->dest, format_mpls_encap_index,
                     mm, s->entry_index);
@@ -7715,7 +7391,6 @@ vl_api_mpls_fib_decap_dump_t_handler (vl_api_mpls_fib_decap_dump_t * mp)
   show_mpls_fib_t *records = 0;
   show_mpls_fib_t *s;
   mpls_main_t *mm = &mpls_main;
-  ip4_main_t *im = &ip4_main;
   ip4_fib_t *rx_fib;
   ip4_fib_t *tx_fib;
   u32 tx_table_id;
@@ -7750,9 +7425,9 @@ vl_api_mpls_fib_decap_dump_t_handler (vl_api_mpls_fib_decap_dump_t * mp)
   {
     mpls_decap_t *d;
     d = pool_elt_at_index (mm->decaps, s->entry_index);
-    if (d->next_index == MPLS_INPUT_NEXT_IP4_INPUT)
+    if (d->next_index == MPLS_LOOKUP_NEXT_IP4_INPUT)
       {
-       tx_fib = vec_elt_at_index (im->fibs, d->tx_fib_index);
+       tx_fib = ip4_fib_get (d->tx_fib_index);
        tx_table_id = tx_fib->table_id;
        swif_tag = "     ";
       }
@@ -7761,7 +7436,7 @@ vl_api_mpls_fib_decap_dump_t_handler (vl_api_mpls_fib_decap_dump_t * mp)
        tx_table_id = d->tx_fib_index;
        swif_tag = "(i)  ";
       }
-    rx_fib = vec_elt_at_index (im->fibs, s->fib_index);
+    rx_fib = ip4_fib_get (s->fib_index);
 
     vlib_cli_output (vm, "%=10d%=10d%=5s%=6d%=6d", rx_fib->table_id,
                     tx_table_id, swif_tag, s->label, s->s_bit);
@@ -8104,7 +7779,7 @@ vl_api_ipfix_exporter_dump_t_handler (vl_api_ipfix_exporter_dump_t * mp)
   if (frm->fib_index == ~0)
     vrf_id = ~0;
   else
-    vrf_id = im->fibs[frm->fib_index].table_id;
+    vrf_id = im->fibs[frm->fib_index].ft_table_id;
   rmp->vrf_id = htonl (vrf_id);
   rmp->path_mtu = htonl (frm->path_mtu);
   rmp->template_interval = htonl (frm->template_interval);
index cc37368..98598ce 100644 (file)
@@ -20,7 +20,7 @@
 #include <vnet/vnet.h>
 #include <vnet/ip/ip.h>
 #include <vnet/unix/tuntap.h>
-#include <vnet/mpls-gre/mpls.h>
+#include <vnet/mpls/mpls.h>
 #include <vnet/dhcp/proxy.h>
 #include <vnet/dhcpv6/proxy.h>
 #include <vnet/l2tp/l2tp.h>
@@ -1413,6 +1413,9 @@ static void *vl_api_vxlan_add_del_tunnel_t_print
 
   s = format (s, "vni %d ", ntohl (mp->vni));
 
+  if (mp->is_add == 0)
+    s = format (s, "del ");
+
   if (mp->is_add == 0)
     s = format (s, "del ");
 
index 2434517..5c75986 100644 (file)
@@ -215,6 +215,30 @@ define sw_interface_set_table_reply
   i32 retval;
 };
 
+/** \brief Enable or Disable MPLS on an interface
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param sw_if_index - index of the interface
+    @param enable - if non-zero enable, else disable
+*/
+define sw_interface_set_mpls_enable
+{
+  u32 client_index;
+  u32 context;
+  u32 sw_if_index;
+  u8 enable;
+};
+
+/** \brief Reply for MPLS state on an interface
+    @param context - returned sender context, to match reply w/ request
+    @param retval - return code
+*/
+define sw_interface_set_mpls_enable_reply
+{
+  u32 context;
+  i32 retval;
+};
+
 /** \brief Initialize a new tap interface with the given paramters 
     @param client_index - opaque cookie to identify the sender
     @param context - sender context, to match reply w/ request
@@ -381,8 +405,10 @@ define ip_add_del_route
   u8 is_ipv6;
   u8 is_local;
   u8 is_classify;
-  /* Is last/not-last message in group of multiple add/del messages. */
   u8 is_multipath;
+  u8 is_resolve_host;
+  u8 is_resolve_attached;
+  /* Is last/not-last message in group of multiple add/del messages. */
   u8 not_last;
   u8 next_hop_weight;
   u8 dst_address_length;
index 19771e2..bbfa605 100644 (file)
@@ -243,3 +243,11 @@ main (int argc, char **argv)
  * eval: (c-set-style "gnu")
  * End:
  */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
index e97a37d..17e99bf 100644 (file)
@@ -32,7 +32,7 @@ typedef struct
 } clib_bihash_kv_24_8_t;
 
 static inline int
-clib_bihash_is_free_24_8 (clib_bihash_kv_24_8_t * v)
+clib_bihash_is_free_24_8 (const clib_bihash_kv_24_8_t * v)
 {
   /* Free values are memset to 0xff, check a bit... */
   if (v->key[0] == ~0ULL && v->value == ~0ULL)
@@ -50,9 +50,9 @@ crc_u32 (u32 data, u32 value)
 }
 
 static inline u64
-clib_bihash_hash_24_8 (clib_bihash_kv_24_8_t * v)
+clib_bihash_hash_24_8 (const clib_bihash_kv_24_8_t * v)
 {
-  u32 *dp = (u32 *) & v->key[0];
+  const u32 *dp = (const u32 *) &v->key[0];
   u32 value = 0;
 
   value = crc_u32 (dp[0], value);
@@ -66,7 +66,7 @@ clib_bihash_hash_24_8 (clib_bihash_kv_24_8_t * v)
 }
 #else
 static inline u64
-clib_bihash_hash_24_8 (clib_bihash_kv_24_8_t * v)
+clib_bihash_hash_24_8 (const clib_bihash_kv_24_8_t * v)
 {
   u64 tmp = v->key[0] ^ v->key[1] ^ v->key[2];
   return clib_xxhash (tmp);
@@ -84,7 +84,7 @@ format_bihash_kvp_24_8 (u8 * s, va_list * args)
 }
 
 static inline int
-clib_bihash_key_compare_24_8 (u64 * a, u64 * b)
+clib_bihash_key_compare_24_8 (const u64 * a, const u64 * b)
 {
   return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0;
 }
index 2ad8293..4b0b425 100644 (file)
@@ -321,7 +321,7 @@ unlock:
 }
 
 int BV (clib_bihash_search)
-  (BVT (clib_bihash) * h,
+  (const BVT (clib_bihash) * h,
    BVT (clib_bihash_kv) * search_key, BVT (clib_bihash_kv) * valuep)
 {
   u64 hash;
index a8bb27f..f70190c 100644 (file)
@@ -88,7 +88,7 @@ typedef struct
 } BVT (clib_bihash);
 
 
-static inline void *BV (clib_bihash_get_value) (BVT (clib_bihash) * h,
+static inline void *BV (clib_bihash_get_value) (const BVT (clib_bihash) * h,
                                                uword offset)
 {
   u8 *hp = h->mheap;
@@ -97,7 +97,7 @@ static inline void *BV (clib_bihash_get_value) (BVT (clib_bihash) * h,
   return (void *) vp;
 }
 
-static inline uword BV (clib_bihash_get_offset) (BVT (clib_bihash) * h,
+static inline uword BV (clib_bihash_get_offset) (const BVT (clib_bihash) * h,
                                                 void *v)
 {
   u8 *hp, *vp;
@@ -116,7 +116,7 @@ void BV (clib_bihash_free) (BVT (clib_bihash) * h);
 
 int BV (clib_bihash_add_del) (BVT (clib_bihash) * h,
                              BVT (clib_bihash_kv) * add_v, int is_add);
-int BV (clib_bihash_search) (BVT (clib_bihash) * h,
+int BV (clib_bihash_search) (const BVT (clib_bihash) * h,
                             BVT (clib_bihash_kv) * search_v,
                             BVT (clib_bihash_kv) * return_v);
 
@@ -128,7 +128,7 @@ format_function_t BV (format_bihash_kvp);
 
 
 static inline int BV (clib_bihash_search_inline)
-  (BVT (clib_bihash) * h, BVT (clib_bihash_kv) * kvp)
+  (const BVT (clib_bihash) * h, BVT (clib_bihash_kv) * kvp)
 {
   u64 hash;
   u32 bucket_index;
@@ -163,7 +163,7 @@ static inline int BV (clib_bihash_search_inline)
 }
 
 static inline int BV (clib_bihash_search_inline_2)
-  (BVT (clib_bihash) * h,
+  (const BVT (clib_bihash) * h,
    BVT (clib_bihash_kv) * search_key, BVT (clib_bihash_kv) * valuep)
 {
   u64 hash;
index 0da3502..78e52e9 100644 (file)
@@ -383,7 +383,7 @@ done:
 }
 
 u8 *
-va_format (u8 * s, char *fmt, va_list * va)
+va_format (u8 * s, const char *fmt, va_list * va)
 {
   u8 *f = (u8 *) fmt, *g;
   u8 c;
@@ -415,7 +415,7 @@ va_format (u8 * s, char *fmt, va_list * va)
 }
 
 u8 *
-format (u8 * s, char *fmt, ...)
+format (u8 * s, const char *fmt, ...)
 {
   va_list va;
   va_start (va, fmt);
index c91cc74..bc0d6d1 100644 (file)
@@ -47,8 +47,8 @@
 
 typedef u8 *(format_function_t) (u8 * s, va_list * args);
 
-u8 *va_format (u8 * s, char *format, va_list * args);
-u8 *format (u8 * s, char *format, ...);
+u8 *va_format (u8 * s, const char *format, va_list * args);
+u8 *format (u8 * s, const char *format, ...);
 
 #ifdef CLIB_UNIX
 
index 4db5a57..3f0efaa 100644 (file)
@@ -683,6 +683,11 @@ unformat_function_t unformat_hash_string;
 /* Main test routine. */
 int test_hash_main (unformat_input_t * input);
 
+static inline void
+hash_delete (void *bob)
+{                              /* Empty body: bob is ignored and nothing is done. */
+}
+
 #endif /* included_hash_h */
 
 /*
index 353a06b..eed96d6 100644 (file)
@@ -892,6 +892,26 @@ do {                                               \
   (_v(cmp) < 0 ? -1 : (_v(cmp) > 0 ? +1 : 0));         \
 })
 
+/** \brief Search a vector for the index of the first entry that matches.
+
+    @param v Pointer to a vector (expression is evaluated more than once)
+    @param E Entry to match, compared with == (evaluated once per element)
+    @return index of the first match, or ~0 if there is none
+*/
+#define vec_search(v,E)                                        \
+({                                                     \
+  word _v(i) = 0;                                      \
+  while (_v(i) < vec_len(v))                           \
+  {                                                    \
+    if ((v)[_v(i)] == (E))                             \
+      break;                                           \
+    _v(i)++;                                           \
+  }                                                    \
+  if (_v(i) == vec_len(v))                             \
+    _v(i) = ~0;                                        \
+  _v(i);                                               \
+})
+
 /** \brief Sort a vector using the supplied element comparison function
 
     @param vec vector to sort