MTRIE Optimisations 2
[vpp.git] / src / vnet / ip / ip4_mtrie.c
index 317d8f1..adc9512 100644 (file)
  */
 
 #include <vnet/ip/ip.h>
-#include <vnet/fib/fib_entry.h>
+#include <vnet/ip/ip4_mtrie.h>
+#include <vnet/fib/ip4_fib.h>
+
+
+/**
+ * Global pool of IPv4 8bit PLYs
+ */
+ip4_fib_mtrie_8_ply_t *ip4_ply_pool;
 
 always_inline u32
-ip4_fib_mtrie_leaf_is_non_empty (ip4_fib_mtrie_ply_t * p, u8 dst_byte)
+ip4_fib_mtrie_leaf_is_non_empty (ip4_fib_mtrie_8_ply_t * p, u8 dst_byte)
 {
   /*
    * It's 'non-empty' if the length of the leaf stored is greater than the
@@ -84,61 +91,83 @@ ip4_fib_mtrie_leaf_set_next_ply_index (u32 i)
   return l;
 }
 
-static void
-ply_init (ip4_fib_mtrie_ply_t * p,
-         ip4_fib_mtrie_leaf_t init, u32 prefix_len, u32 ply_base_len)
-{
-  /*
-   * A leaf is 'empty' if it represents a leaf from the covering PLY
-   * i.e. if the prefix length of the leaf is less than or equal to
-   * the prefix length of the PLY
-   */
-  p->n_non_empty_leafs = (prefix_len > ply_base_len ?
-                         ARRAY_LEN (p->leaves) : 0);
-  memset (p->dst_address_bits_of_leaves, prefix_len,
-         sizeof (p->dst_address_bits_of_leaves));
-  p->dst_address_bits_base = ply_base_len;
-
-  /* Initialize leaves. */
-#ifdef CLIB_HAVE_VEC128
-  {
-    u32x4 *l, init_x4;
-
 #ifndef __ALTIVEC__
-    init_x4 = u32x4_splat (init);
+#define PLY_X4_SPLAT_INIT(init_x4, init) \
+  init_x4 = u32x4_splat (init);
 #else
-    {
-      u32x4_union_t y;
-      y.as_u32[0] = init;
-      y.as_u32[1] = init;
-      y.as_u32[2] = init;
-      y.as_u32[3] = init;
-      init_x4 = y.as_u32x4;
-    }
+#define PLY_X4_SPLAT_INIT(init_x4, init)                                \
+{                                                                       \
+  u32x4_union_t y;                                                      \
+  y.as_u32[0] = init;                                                   \
+  y.as_u32[1] = init;                                                   \
+  y.as_u32[2] = init;                                                   \
+  y.as_u32[3] = init;                                                   \
+  init_x4 = y.as_u32x4;                                                 \
+}
 #endif
 
-    for (l = p->leaves_as_u32x4;
-        l < p->leaves_as_u32x4 + ARRAY_LEN (p->leaves_as_u32x4); l += 4)
-      {
-       l[0] = init_x4;
-       l[1] = init_x4;
-       l[2] = init_x4;
-       l[3] = init_x4;
-      }
-  }
+#ifdef CLIB_HAVE_VEC128
+#define PLY_INIT_LEAVES(p)                                              \
+{                                                                       \
+    u32x4 *l, init_x4;                                                  \
+                                                                        \
+    PLY_X4_SPLAT_INIT(init_x4, init);                                   \
+    for (l = p->leaves_as_u32x4;                                        \
+        l < p->leaves_as_u32x4 + ARRAY_LEN (p->leaves_as_u32x4);       \
+         l += 4)                                                        \
+      {                                                                 \
+       l[0] = init_x4;                                                 \
+       l[1] = init_x4;                                                 \
+       l[2] = init_x4;                                                 \
+       l[3] = init_x4;                                                 \
+      }                                                                 \
+}
 #else
-  {
-    u32 *l;
-
-    for (l = p->leaves; l < p->leaves + ARRAY_LEN (p->leaves); l += 4)
-      {
-       l[0] = init;
-       l[1] = init;
-       l[2] = init;
-       l[3] = init;
-      }
-  }
+#define PLY_INIT_LEAVES(p)                                              \
+{                                                                       \
+  u32 *l;                                                               \
+                                                                        \
+  for (l = p->leaves; l < p->leaves + ARRAY_LEN (p->leaves); l += 4)    \
+    {                                                                   \
+      l[0] = init;                                                      \
+      l[1] = init;                                                      \
+      l[2] = init;                                                      \
+      l[3] = init;                                                      \
+      }                                                                 \
+}
 #endif
+
+#define PLY_INIT(p, init, prefix_len, ply_base_len)                     \
+{                                                                       \
+  /*                                                                    \
+   * A leaf is 'empty' if it represents a leaf from the covering PLY    \
+   * i.e. if the prefix length of the leaf is less than or equal to     \
+   * the prefix length of the PLY                                       \
+   */                                                                   \
+  p->n_non_empty_leafs = (prefix_len > ply_base_len ?                   \
+                         ARRAY_LEN (p->leaves) : 0);                   \
+  memset (p->dst_address_bits_of_leaves, prefix_len,                    \
+         sizeof (p->dst_address_bits_of_leaves));                      \
+  p->dst_address_bits_base = ply_base_len;                              \
+                                                                        \
+  /* Initialize leaves. */                                              \
+  PLY_INIT_LEAVES(p);                                                   \
+}
+
+static void
+ply_8_init (ip4_fib_mtrie_8_ply_t * p,
+           ip4_fib_mtrie_leaf_t init, uword prefix_len, u32 ply_base_len)
+{
+  PLY_INIT (p, init, prefix_len, ply_base_len);
+}
+
+static void
+ply_16_init (ip4_fib_mtrie_16_ply_t * p,
+            ip4_fib_mtrie_leaf_t init, uword prefix_len)
+{
+  memset (p->dst_address_bits_of_leaves, prefix_len,
+         sizeof (p->dst_address_bits_of_leaves));
+  PLY_INIT_LEAVES (p);
 }
 
 static ip4_fib_mtrie_leaf_t
@@ -146,49 +175,43 @@ ply_create (ip4_fib_mtrie_t * m,
            ip4_fib_mtrie_leaf_t init_leaf,
            u32 leaf_prefix_len, u32 ply_base_len)
 {
-  ip4_fib_mtrie_ply_t *p;
+  ip4_fib_mtrie_8_ply_t *p;
 
   /* Get cache aligned ply. */
-  pool_get_aligned (m->ply_pool, p, sizeof (p[0]));
+  pool_get_aligned (ip4_ply_pool, p, CLIB_CACHE_LINE_BYTES);
 
-  ply_init (p, init_leaf, leaf_prefix_len, ply_base_len);
-  return ip4_fib_mtrie_leaf_set_next_ply_index (p - m->ply_pool);
+  ply_8_init (p, init_leaf, leaf_prefix_len, ply_base_len);
+  return ip4_fib_mtrie_leaf_set_next_ply_index (p - ip4_ply_pool);
 }
 
-always_inline ip4_fib_mtrie_ply_t *
+always_inline ip4_fib_mtrie_8_ply_t *
 get_next_ply_for_leaf (ip4_fib_mtrie_t * m, ip4_fib_mtrie_leaf_t l)
 {
   uword n = ip4_fib_mtrie_leaf_get_next_ply_index (l);
-  /* It better not be the root ply. */
-  ASSERT (n != 0);
-  return pool_elt_at_index (m->ply_pool, n);
+
+  return pool_elt_at_index (ip4_ply_pool, n);
 }
 
-static void
-ply_free (ip4_fib_mtrie_t * m, ip4_fib_mtrie_ply_t * p)
+void
+ip4_mtrie_free (ip4_fib_mtrie_t * m)
 {
-  uword i, is_root;
-
-  is_root = p - m->ply_pool == 0;
-
-  for (i = 0; i < ARRAY_LEN (p->leaves); i++)
+  /* the root ply is embedded so the is nothing to do,
+   * the assumption being that the IP4 FIB table has emptied the trie
+   * before deletion.
+   */
+#if CLIB_DEBUG > 0
+  int i;
+  for (i = 0; i < ARRAY_LEN (m->root_ply.leaves); i++)
     {
-      ip4_fib_mtrie_leaf_t l = p->leaves[i];
-      if (ip4_fib_mtrie_leaf_is_next_ply (l))
-       ply_free (m, get_next_ply_for_leaf (m, l));
+      ASSERT (!ip4_fib_mtrie_leaf_is_next_ply (m->root_ply.leaves[i]));
     }
-
-  if (is_root)
-    ply_init (p, IP4_FIB_MTRIE_LEAF_EMPTY, /* prefix_len */ 0, 0);
-  else
-    pool_put (m->ply_pool, p);
+#endif
 }
 
 void
-ip4_fib_free (ip4_fib_mtrie_t * m)
+ip4_mtrie_init (ip4_fib_mtrie_t * m)
 {
-  ip4_fib_mtrie_ply_t *root_ply = pool_elt_at_index (m->ply_pool, 0);
-  ply_free (m, root_ply);
+  ply_16_init (&m->root_ply, IP4_FIB_MTRIE_LEAF_EMPTY, 0);
 }
 
 typedef struct
@@ -202,7 +225,7 @@ typedef struct
 
 static void
 set_ply_with_more_specific_leaf (ip4_fib_mtrie_t * m,
-                                ip4_fib_mtrie_ply_t * ply,
+                                ip4_fib_mtrie_8_ply_t * ply,
                                 ip4_fib_mtrie_leaf_t new_leaf,
                                 uword new_leaf_dst_address_bits)
 {
@@ -218,7 +241,8 @@ set_ply_with_more_specific_leaf (ip4_fib_mtrie_t * m,
       /* Recurse into sub plies. */
       if (!ip4_fib_mtrie_leaf_is_terminal (old_leaf))
        {
-         ip4_fib_mtrie_ply_t *sub_ply = get_next_ply_for_leaf (m, old_leaf);
+         ip4_fib_mtrie_8_ply_t *sub_ply =
+           get_next_ply_for_leaf (m, old_leaf);
          set_ply_with_more_specific_leaf (m, sub_ply, new_leaf,
                                           new_leaf_dst_address_bits);
        }
@@ -237,16 +261,20 @@ set_ply_with_more_specific_leaf (ip4_fib_mtrie_t * m,
 
 static void
 set_leaf (ip4_fib_mtrie_t * m,
-         ip4_fib_mtrie_set_unset_leaf_args_t * a,
+         const ip4_fib_mtrie_set_unset_leaf_args_t * a,
          u32 old_ply_index, u32 dst_address_byte_index)
 {
   ip4_fib_mtrie_leaf_t old_leaf, new_leaf;
   i32 n_dst_bits_next_plies;
   u8 dst_byte;
+  ip4_fib_mtrie_8_ply_t *old_ply;
+
+  old_ply = pool_elt_at_index (ip4_ply_pool, old_ply_index);
 
   ASSERT (a->dst_address_length >= 0 && a->dst_address_length <= 32);
   ASSERT (dst_address_byte_index < ARRAY_LEN (a->dst_address.as_u8));
 
+  /* how many bits of the destination address are in the next PLY */
   n_dst_bits_next_plies =
     a->dst_address_length - BITS (u8) * (dst_address_byte_index + 1);
 
@@ -255,30 +283,36 @@ set_leaf (ip4_fib_mtrie_t * m,
   /* Number of bits next plies <= 0 => insert leaves this ply. */
   if (n_dst_bits_next_plies <= 0)
     {
+      /* The mask length of the address to insert maps to this ply */
       uword i, n_dst_bits_this_ply, old_leaf_is_terminal;
 
+      /* The number of bits, and hence slots/buckets, we will fill */
       n_dst_bits_this_ply = clib_min (8, -n_dst_bits_next_plies);
       ASSERT ((a->dst_address.as_u8[dst_address_byte_index] &
               pow2_mask (n_dst_bits_this_ply)) == 0);
 
+      /* Starting at the value of the byte at this section of the v4 address
+       * fill the buckets/slots of the ply */
       for (i = dst_byte; i < dst_byte + (1 << n_dst_bits_this_ply); i++)
        {
-         ip4_fib_mtrie_ply_t *old_ply, *new_ply;
-
-         old_ply = pool_elt_at_index (m->ply_pool, old_ply_index);
+         ip4_fib_mtrie_8_ply_t *new_ply;
 
          old_leaf = old_ply->leaves[i];
          old_leaf_is_terminal = ip4_fib_mtrie_leaf_is_terminal (old_leaf);
 
-         /* Is leaf to be inserted more specific? */
          if (a->dst_address_length >= old_ply->dst_address_bits_of_leaves[i])
            {
+             /* The new leaf is more or equally specific than the one currently
+              * occupying the slot */
              new_leaf = ip4_fib_mtrie_leaf_set_adj_index (a->adj_index);
 
              if (old_leaf_is_terminal)
                {
+                 /* The current leaf is terminal, we can replace it with
+                  * the new one */
                  old_ply->n_non_empty_leafs -=
                    ip4_fib_mtrie_leaf_is_non_empty (old_ply, i);
+
                  old_ply->dst_address_bits_of_leaves[i] =
                    a->dst_address_length;
                  __sync_val_compare_and_swap (&old_ply->leaves[i], old_leaf,
@@ -292,32 +326,42 @@ set_leaf (ip4_fib_mtrie_t * m,
                }
              else
                {
-                 /* Existing leaf points to another ply.  We need to place new_leaf into all
-                    more specific slots. */
+                 /* Existing leaf points to another ply.  We need to place
+                  * new_leaf into all more specific slots. */
                  new_ply = get_next_ply_for_leaf (m, old_leaf);
                  set_ply_with_more_specific_leaf (m, new_ply, new_leaf,
                                                   a->dst_address_length);
                }
            }
-
          else if (!old_leaf_is_terminal)
            {
+             /* The current leaf is less specific and not termial (i.e. a ply),
+              * recurse on down the trie */
              new_ply = get_next_ply_for_leaf (m, old_leaf);
-             set_leaf (m, a, new_ply - m->ply_pool,
+             set_leaf (m, a, new_ply - ip4_ply_pool,
                        dst_address_byte_index + 1);
            }
+         /*
+          * else
+          *  the route we are adding is less specific than the leaf currently
+          *  occupying this slot. leave it there
+          */
        }
     }
   else
     {
-      ip4_fib_mtrie_ply_t *old_ply, *new_ply;
+      /* The address to insert requires us to move down at a lower level of
+       * the trie - recurse on down */
+      ip4_fib_mtrie_8_ply_t *new_ply;
       u8 ply_base_len;
 
       ply_base_len = 8 * (dst_address_byte_index + 1);
-      old_ply = pool_elt_at_index (m->ply_pool, old_ply_index);
+
       old_leaf = old_ply->leaves[dst_byte];
+
       if (ip4_fib_mtrie_leaf_is_terminal (old_leaf))
        {
+         /* There is a leaf occupying the slot. Replace it with a new ply */
          old_ply->n_non_empty_leafs -=
            ip4_fib_mtrie_leaf_is_non_empty (old_ply, dst_byte);
 
@@ -328,28 +372,143 @@ set_leaf (ip4_fib_mtrie_t * m,
          new_ply = get_next_ply_for_leaf (m, new_leaf);
 
          /* Refetch since ply_create may move pool. */
-         old_ply = pool_elt_at_index (m->ply_pool, old_ply_index);
+         old_ply = pool_elt_at_index (ip4_ply_pool, old_ply_index);
 
          __sync_val_compare_and_swap (&old_ply->leaves[dst_byte], old_leaf,
                                       new_leaf);
          ASSERT (old_ply->leaves[dst_byte] == new_leaf);
          old_ply->dst_address_bits_of_leaves[dst_byte] = ply_base_len;
 
-         /* Account for the ply we just created. */
-         old_ply->n_non_empty_leafs += 1;
+         old_ply->n_non_empty_leafs +=
+           ip4_fib_mtrie_leaf_is_non_empty (old_ply, dst_byte);
          ASSERT (old_ply->n_non_empty_leafs >= 0);
        }
       else
        new_ply = get_next_ply_for_leaf (m, old_leaf);
 
-      set_leaf (m, a, new_ply - m->ply_pool, dst_address_byte_index + 1);
+      set_leaf (m, a, new_ply - ip4_ply_pool, dst_address_byte_index + 1);
+    }
+}
+
+static void
+set_root_leaf (ip4_fib_mtrie_t * m,
+              const ip4_fib_mtrie_set_unset_leaf_args_t * a)
+{
+  ip4_fib_mtrie_leaf_t old_leaf, new_leaf;
+  ip4_fib_mtrie_16_ply_t *old_ply;
+  i32 n_dst_bits_next_plies;
+  u16 dst_byte;
+
+  old_ply = &m->root_ply;
+
+  ASSERT (a->dst_address_length >= 0 && a->dst_address_length <= 32);
+
+  /* how many bits of the destination address are in the next PLY */
+  n_dst_bits_next_plies = a->dst_address_length - BITS (u16);
+
+  dst_byte = a->dst_address.as_u16[0];
+
+  /* Number of bits next plies <= 0 => insert leaves this ply. */
+  if (n_dst_bits_next_plies <= 0)
+    {
+      /* The mask length of the address to insert maps to this ply */
+      uword i, n_dst_bits_this_ply, old_leaf_is_terminal;
+
+      /* The number of bits, and hence slots/buckets, we will fill */
+      n_dst_bits_this_ply = 16 - a->dst_address_length;
+      ASSERT ((clib_host_to_net_u16 (a->dst_address.as_u16[0]) &
+              pow2_mask (n_dst_bits_this_ply)) == 0);
+
+      /* Starting at the value of the byte at this section of the v4 address
+       * fill the buckets/slots of the ply */
+      for (i = 0; i < (1 << n_dst_bits_this_ply); i++)
+       {
+         ip4_fib_mtrie_8_ply_t *new_ply;
+         u16 slot;
+
+         slot = clib_net_to_host_u16 (dst_byte);
+         slot += i;
+         slot = clib_host_to_net_u16 (slot);
+
+         old_leaf = old_ply->leaves[slot];
+         old_leaf_is_terminal = ip4_fib_mtrie_leaf_is_terminal (old_leaf);
+
+         if (a->dst_address_length >=
+             old_ply->dst_address_bits_of_leaves[slot])
+           {
+             /* The new leaf is more or equally specific than the one currently
+              * occupying the slot */
+             new_leaf = ip4_fib_mtrie_leaf_set_adj_index (a->adj_index);
+
+             if (old_leaf_is_terminal)
+               {
+                 /* The current leaf is terminal, we can replace it with
+                  * the new one */
+                 old_ply->dst_address_bits_of_leaves[slot] =
+                   a->dst_address_length;
+                 __sync_val_compare_and_swap (&old_ply->leaves[slot],
+                                              old_leaf, new_leaf);
+                 ASSERT (old_ply->leaves[slot] == new_leaf);
+               }
+             else
+               {
+                 /* Existing leaf points to another ply.  We need to place
+                  * new_leaf into all more specific slots. */
+                 new_ply = get_next_ply_for_leaf (m, old_leaf);
+                 set_ply_with_more_specific_leaf (m, new_ply, new_leaf,
+                                                  a->dst_address_length);
+               }
+           }
+         else if (!old_leaf_is_terminal)
+           {
+             /* The current leaf is less specific and not termial (i.e. a ply),
+              * recurse on down the trie */
+             new_ply = get_next_ply_for_leaf (m, old_leaf);
+             set_leaf (m, a, new_ply - ip4_ply_pool, 2);
+           }
+         /*
+          * else
+          *  the route we are adding is less specific than the leaf currently
+          *  occupying this slot. leave it there
+          */
+       }
+    }
+  else
+    {
+      /* The address to insert requires us to move down at a lower level of
+       * the trie - recurse on down */
+      ip4_fib_mtrie_8_ply_t *new_ply;
+      u8 ply_base_len;
+
+      ply_base_len = 16;
+
+      old_leaf = old_ply->leaves[dst_byte];
+
+      if (ip4_fib_mtrie_leaf_is_terminal (old_leaf))
+       {
+         /* There is a leaf occupying the slot. Replace it with a new ply */
+         new_leaf = ply_create (m, old_leaf,
+                                clib_max (old_ply->dst_address_bits_of_leaves
+                                          [dst_byte], ply_base_len),
+                                ply_base_len);
+         new_ply = get_next_ply_for_leaf (m, new_leaf);
+
+         __sync_val_compare_and_swap (&old_ply->leaves[dst_byte], old_leaf,
+                                      new_leaf);
+         ASSERT (old_ply->leaves[dst_byte] == new_leaf);
+         old_ply->dst_address_bits_of_leaves[dst_byte] = ply_base_len;
+       }
+      else
+       new_ply = get_next_ply_for_leaf (m, old_leaf);
+
+      set_leaf (m, a, new_ply - ip4_ply_pool, 2);
     }
 }
 
 static uword
 unset_leaf (ip4_fib_mtrie_t * m,
-           ip4_fib_mtrie_set_unset_leaf_args_t * a,
-           ip4_fib_mtrie_ply_t * old_ply, u32 dst_address_byte_index)
+           const ip4_fib_mtrie_set_unset_leaf_args_t * a,
+           ip4_fib_mtrie_8_ply_t * old_ply, u32 dst_address_byte_index)
 {
   ip4_fib_mtrie_leaf_t old_leaf, del_leaf;
   i32 n_dst_bits_next_plies;
@@ -397,7 +556,7 @@ unset_leaf (ip4_fib_mtrie_t * m,
          ASSERT (old_ply->n_non_empty_leafs >= 0);
          if (old_ply->n_non_empty_leafs == 0 && dst_address_byte_index > 0)
            {
-             pool_put (m->ply_pool, old_ply);
+             pool_put (ip4_ply_pool, old_ply);
              /* Old ply was deleted. */
              return 1;
            }
@@ -419,106 +578,120 @@ unset_leaf (ip4_fib_mtrie_t * m,
   return 0;
 }
 
-void
-ip4_mtrie_init (ip4_fib_mtrie_t * m)
+static void
+unset_root_leaf (ip4_fib_mtrie_t * m,
+                const ip4_fib_mtrie_set_unset_leaf_args_t * a)
 {
-  ip4_fib_mtrie_leaf_t root;
-  memset (m, 0, sizeof (m[0]));
-  root = ply_create (m, IP4_FIB_MTRIE_LEAF_EMPTY, 0, 0);
-  ASSERT (ip4_fib_mtrie_leaf_get_next_ply_index (root) == 0);
-}
+  ip4_fib_mtrie_leaf_t old_leaf, del_leaf;
+  i32 n_dst_bits_next_plies;
+  i32 i, n_dst_bits_this_ply, old_leaf_is_terminal;
+  u16 dst_byte;
+  ip4_fib_mtrie_16_ply_t *old_ply;
 
-void
-ip4_fib_mtrie_add_del_route (ip4_fib_t * fib,
-                            ip4_address_t dst_address,
-                            u32 dst_address_length,
-                            u32 adj_index, u32 is_del)
-{
-  ip4_fib_mtrie_t *m = &fib->mtrie;
-  ip4_fib_mtrie_ply_t *root_ply;
-  ip4_fib_mtrie_set_unset_leaf_args_t a;
-  ip4_main_t *im = &ip4_main;
+  ASSERT (a->dst_address_length >= 0 && a->dst_address_length <= 32);
 
-  ASSERT (m->ply_pool != 0);
+  old_ply = &m->root_ply;
+  n_dst_bits_next_plies = a->dst_address_length - BITS (u16);
 
-  root_ply = pool_elt_at_index (m->ply_pool, 0);
+  dst_byte = a->dst_address.as_u16[0];
 
-  /* Honor dst_address_length. Fib masks are in network byte order */
-  dst_address.as_u32 &= im->fib_masks[dst_address_length];
-  a.dst_address = dst_address;
-  a.dst_address_length = dst_address_length;
-  a.adj_index = adj_index;
+  n_dst_bits_this_ply = (n_dst_bits_next_plies <= 0 ?
+                        (16 - a->dst_address_length) : 0);
 
-  if (!is_del)
-    {
-      set_leaf (m, &a, /* ply_index */ 0, /* dst_address_byte_index */ 0);
-    }
-  else
+  del_leaf = ip4_fib_mtrie_leaf_set_adj_index (a->adj_index);
+
+  /* Starting at the value of the byte at this section of the v4 address
+   * fill the buckets/slots of the ply */
+  for (i = 0; i < (1 << n_dst_bits_this_ply); i++)
     {
-      ip4_main_t *im = &ip4_main;
+      u16 slot;
+
+      slot = clib_net_to_host_u16 (dst_byte);
+      slot += i;
+      slot = clib_host_to_net_u16 (slot);
 
-      if (dst_address_length)
+      old_leaf = old_ply->leaves[slot];
+      old_leaf_is_terminal = ip4_fib_mtrie_leaf_is_terminal (old_leaf);
+
+      if (old_leaf == del_leaf
+         || (!old_leaf_is_terminal
+             && unset_leaf (m, a, get_next_ply_for_leaf (m, old_leaf), 2)))
        {
-         word i;
+         old_ply->leaves[slot] =
+           ip4_fib_mtrie_leaf_set_adj_index (a->cover_adj_index);
+         old_ply->dst_address_bits_of_leaves[slot] = a->cover_address_length;
+       }
+    }
+}
 
-         /* If the ply was not deleted, then we need to fill the
-          * bucket just reset will the leaf from the less specfic
-          * cover.
-          * Find next less specific route and insert into mtrie. */
-         for (i = dst_address_length - 1; i >= 0; i--)
-           {
-             uword *p;
-             index_t lbi;
-             ip4_address_t key;
+void
+ip4_fib_mtrie_route_add (ip4_fib_mtrie_t * m,
+                        const ip4_address_t * dst_address,
+                        u32 dst_address_length, u32 adj_index)
+{
+  ip4_fib_mtrie_set_unset_leaf_args_t a;
+  ip4_main_t *im = &ip4_main;
 
-             if (!fib->fib_entry_by_dst_address[i])
-               continue;
+  /* Honor dst_address_length. Fib masks are in network byte order */
+  a.dst_address.as_u32 = (dst_address->as_u32 &
+                         im->fib_masks[dst_address_length]);
+  a.dst_address_length = dst_address_length;
+  a.adj_index = adj_index;
 
-             key.as_u32 = dst_address.as_u32 & im->fib_masks[i];
-             p = hash_get (fib->fib_entry_by_dst_address[i], key.as_u32);
-             if (p)
-               {
-                 lbi = fib_entry_contribute_ip_forwarding (p[0])->dpoi_index;
-                 if (INDEX_INVALID == lbi)
-                   continue;
+  set_root_leaf (m, &a);
+}
 
-                 a.cover_adj_index = lbi;
-                 a.cover_address_length = i;
+void
+ip4_fib_mtrie_route_del (ip4_fib_mtrie_t * m,
+                        const ip4_address_t * dst_address,
+                        u32 dst_address_length,
+                        u32 adj_index,
+                        u32 cover_address_length, u32 cover_adj_index)
+{
+  ip4_fib_mtrie_set_unset_leaf_args_t a;
+  ip4_main_t *im = &ip4_main;
 
-                 break;
-               }
-           }
-       }
-      else
-       {
-         a.cover_adj_index = 0;
-         a.cover_address_length = 0;
-       }
+  /* Honor dst_address_length. Fib masks are in network byte order */
+  a.dst_address.as_u32 = (dst_address->as_u32 &
+                         im->fib_masks[dst_address_length]);
+  a.dst_address_length = dst_address_length;
+  a.adj_index = adj_index;
+  a.cover_adj_index = cover_adj_index;
+  a.cover_address_length = cover_address_length;
 
-      /* the top level ply is never removed, so we can ignore the return code */
-      unset_leaf (m, &a, root_ply, 0);
-    }
+  /* the top level ply is never removed */
+  unset_root_leaf (m, &a);
 }
 
 /* Returns number of bytes of memory used by mtrie. */
 static uword
-mtrie_memory_usage (ip4_fib_mtrie_t * m, ip4_fib_mtrie_ply_t * p)
+mtrie_ply_memory_usage (ip4_fib_mtrie_t * m, ip4_fib_mtrie_8_ply_t * p)
 {
   uword bytes, i;
 
-  if (!p)
-    {
-      if (pool_is_free_index (m->ply_pool, 0))
-       return 0;
-      p = pool_elt_at_index (m->ply_pool, 0);
-    }
-
   bytes = sizeof (p[0]);
   for (i = 0; i < ARRAY_LEN (p->leaves); i++)
     {
       ip4_fib_mtrie_leaf_t l = p->leaves[i];
       if (ip4_fib_mtrie_leaf_is_next_ply (l))
-       bytes += mtrie_memory_usage (m, get_next_ply_for_leaf (m, l));
+       bytes += mtrie_ply_memory_usage (m, get_next_ply_for_leaf (m, l));
+    }
+
+  return bytes;
+}
+
+/* Returns number of bytes of memory used by mtrie. */
+static uword
+mtrie_memory_usage (ip4_fib_mtrie_t * m)
+{
+  uword bytes, i;
+
+  bytes = sizeof (*m);
+  for (i = 0; i < ARRAY_LEN (m->root_ply.leaves); i++)
+    {
+      ip4_fib_mtrie_leaf_t l = m->root_ply.leaves[i];
+      if (ip4_fib_mtrie_leaf_is_next_ply (l))
+       bytes += mtrie_ply_memory_usage (m, get_next_ply_for_leaf (m, l));
     }
 
   return bytes;
@@ -536,47 +709,49 @@ format_ip4_fib_mtrie_leaf (u8 * s, va_list * va)
   return s;
 }
 
+#define FORMAT_PLY(s, _p, _i, _base_address, _ply_max_len, _indent)     \
+({                                                                      \
+  u32 a, ia_length;                                                     \
+  ip4_address_t ia;                                                     \
+  ip4_fib_mtrie_leaf_t _l = p->leaves[(_i)];                            \
+                                                                        \
+  a = (_base_address) + ((_i) << (32 - (_ply_max_len)));                \
+  ia.as_u32 = clib_host_to_net_u32 (a);                                 \
+  ia_length = (_p)->dst_address_bits_of_leaves[(_i)];                   \
+  s = format (s, "\n%U%20U %U",                                         \
+              format_white_space, (_indent) + 2,                        \
+              format_ip4_address_and_length, &ia, ia_length,            \
+              format_ip4_fib_mtrie_leaf, _l);                           \
+                                                                        \
+  if (ip4_fib_mtrie_leaf_is_next_ply (_l))                              \
+    s = format (s, "\n%U%U",                                            \
+                format_white_space, (_indent) + 2,                      \
+                format_ip4_fib_mtrie_ply, m, a,                         \
+                ip4_fib_mtrie_leaf_get_next_ply_index (_l));            \
+  s;                                                                    \
+})
+
 static u8 *
 format_ip4_fib_mtrie_ply (u8 * s, va_list * va)
 {
   ip4_fib_mtrie_t *m = va_arg (*va, ip4_fib_mtrie_t *);
   u32 base_address = va_arg (*va, u32);
   u32 ply_index = va_arg (*va, u32);
-  u32 dst_address_byte_index = va_arg (*va, u32);
-  ip4_fib_mtrie_ply_t *p;
-  uword i, indent;
+  ip4_fib_mtrie_8_ply_t *p;
+  uword indent;
+  int i;
 
-  p = pool_elt_at_index (m->ply_pool, ply_index);
+  p = pool_elt_at_index (ip4_ply_pool, ply_index);
   indent = format_get_indent (s);
-  s =
-    format (s, "ply index %d, %d non-empty leaves", ply_index,
-           p->n_non_empty_leafs);
+  s = format (s, "ply index %d, %d non-empty leaves", ply_index,
+             p->n_non_empty_leafs);
+
   for (i = 0; i < ARRAY_LEN (p->leaves); i++)
     {
-      ip4_fib_mtrie_leaf_t l = p->leaves[i];
-
       if (ip4_fib_mtrie_leaf_is_non_empty (p, i))
        {
-         u32 a, ia_length;
-         ip4_address_t ia;
-
-         a = base_address + (i << (24 - 8 * dst_address_byte_index));
-         ia.as_u32 = clib_host_to_net_u32 (a);
-         if (ip4_fib_mtrie_leaf_is_terminal (l))
-           ia_length = p->dst_address_bits_of_leaves[i];
-         else
-           ia_length = 8 * (1 + dst_address_byte_index);
-         s = format (s, "\n%U%20U %U",
-                     format_white_space, indent + 2,
-                     format_ip4_address_and_length, &ia, ia_length,
-                     format_ip4_fib_mtrie_leaf, l);
-
-         if (ip4_fib_mtrie_leaf_is_next_ply (l))
-           s = format (s, "\n%U%U",
-                       format_white_space, indent + 2,
-                       format_ip4_fib_mtrie_ply, m, a,
-                       ip4_fib_mtrie_leaf_get_next_ply_index (l),
-                       dst_address_byte_index + 1);
+         FORMAT_PLY (s, p, i, base_address,
+                     p->dst_address_bits_base + 8, indent);
        }
     }
 
@@ -587,22 +762,44 @@ u8 *
 format_ip4_fib_mtrie (u8 * s, va_list * va)
 {
   ip4_fib_mtrie_t *m = va_arg (*va, ip4_fib_mtrie_t *);
+  ip4_fib_mtrie_16_ply_t *p;
+  u32 base_address = 0;
+  int i;
 
-  s = format (s, "%d plies, memory usage %U",
-             pool_elts (m->ply_pool),
-             format_memory_size, mtrie_memory_usage (m, 0));
+  s = format (s, "%d plies, memory usage %U\n",
+             pool_elts (ip4_ply_pool),
+             format_memory_size, mtrie_memory_usage (m));
+  s = format (s, "root-ply");
+  p = &m->root_ply;
 
-  if (pool_elts (m->ply_pool) > 0)
+  for (i = 0; i < ARRAY_LEN (p->leaves); i++)
     {
-      ip4_address_t base_address;
-      base_address.as_u32 = 0;
-      s =
-       format (s, "\n  %U", format_ip4_fib_mtrie_ply, m, base_address, 0, 0);
+      u16 slot;
+
+      slot = clib_host_to_net_u16 (i);
+
+      if (p->dst_address_bits_of_leaves[slot] > 0)
+       {
+         FORMAT_PLY (s, p, slot, base_address, 16, 2);
+       }
     }
 
   return s;
 }
 
+static clib_error_t *
+ip4_mtrie_module_init (vlib_main_t * vm)
+{
+  /* Burn one ply so index 0 is taken */
+  CLIB_UNUSED (ip4_fib_mtrie_8_ply_t * p);
+
+  pool_get (ip4_ply_pool, p);
+
+  return (NULL);
+}
+
+VLIB_INIT_FUNCTION (ip4_mtrie_module_init);
+
 /*
  * fd.io coding-style-patch-verification: ON
  *