Prevent re-entrant walks on an adjacency.
authorNeale Ranns <[email protected]>
Wed, 7 Dec 2016 15:38:14 +0000 (15:38 +0000)
committerDamjan Marion <[email protected]>
Wed, 7 Dec 2016 17:17:30 +0000 (17:17 +0000)
The re-entrant walks were caused when the walk from an IP adj updated a fib_entry with an MPLS adj which in turn triggers a walk of the IP adj. Re-entrant walks do unnecessary work.
Also fixed a walk-merge issue where the encountered walk should only be checked for equivalence with the most recent walk, not any in the list. Otherwise an UP,DOWN,UP becomes (2*)UP,DOWN.

Change-Id: Ib8b27f055dc6c1366d33740276d1c26cd314220a
Signed-off-by: Neale Ranns <[email protected]>
vnet/vnet/adj/adj.c
vnet/vnet/adj/adj_nbr.c
vnet/vnet/ethernet/arp.c
vnet/vnet/fib/fib_walk.c
vnet/vnet/ip/ip6_neighbor.c
vnet/vnet/ip/lookup.h

index 2741c88..e740c4c 100644 (file)
@@ -66,6 +66,7 @@ adj_alloc (fib_protocol_t proto)
     fib_node_init(&adj->ia_node,
                   FIB_NODE_TYPE_ADJ);
     adj->ia_nh_proto = proto;
+    adj->ia_flags = 0;
 
     ip4_main.lookup_main.adjacency_heap = adj_pool;
     ip6_main.lookup_main.adjacency_heap = adj_pool;
index 95d1254..1344bb6 100644 (file)
@@ -297,9 +297,11 @@ adj_nbr_update_rewrite_internal (ip_adjacency_t *adj,
                                 u32 next_node,
                                 u8 *rewrite)
 {
+    ip_adjacency_t *walk_adj;
     adj_index_t walk_ai;
     vlib_main_t * vm;
     u32 old_next;
+    int do_walk;
 
     vm = vlib_get_main();
     old_next = adj->lookup_next_index;
@@ -318,6 +320,39 @@ adj_nbr_update_rewrite_internal (ip_adjacency_t *adj,
                                adj->rewrite_header.sw_if_index);
     }
 
+    /*
+     * Don't call the walk re-entrantly
+     */
+    if (ADJ_INDEX_INVALID != walk_ai)
+    {
+        walk_adj = adj_get(walk_ai);
+        if (IP_ADJ_SYNC_WALK_ACTIVE & walk_adj->ia_flags)
+        {
+            do_walk = 0;
+        }
+        else
+        {
+            /*
+             * Prevent re-entrant walk of the same adj
+             */
+            walk_adj->ia_flags |= IP_ADJ_SYNC_WALK_ACTIVE;
+            do_walk = 1;
+        }
+    }
+    else
+    {
+        do_walk = 0;
+    }
+
+    /*
+     * lock the adjacencies that are affected by updates this walk will provoke.
+     * Since the aim of the walk is to update children to link to a different
+     * DPO, this adj will no longer be in use and its lock count will drop to 0.
+     * We don't want it to be deleted as part of this endeavour.
+     */
+    adj_lock(adj_get_index(adj));
+    adj_lock(walk_ai);
+
     /*
      * Updating a rewrite string is not atomic;
      *  - the rewrite string is too long to write in one instruction
@@ -346,7 +381,8 @@ adj_nbr_update_rewrite_internal (ip_adjacency_t *adj,
      *  1 - mac change
      *  2 - adj type change
      */
-    if (old_next != adj_next_index &&
+    if (do_walk &&
+        old_next != adj_next_index &&
         ADJ_INDEX_INVALID != walk_ai)
     {
         /*
@@ -407,8 +443,9 @@ adj_nbr_update_rewrite_internal (ip_adjacency_t *adj,
      */
     vlib_worker_thread_barrier_release(vm);
 
-    if (old_next != adj->lookup_next_index &&
-        ADJ_INDEX_INVALID != walk_ai)
+    if (do_walk &&
+        (old_next != adj->lookup_next_index) &&
+        (ADJ_INDEX_INVALID != walk_ai))
     {
         /*
          * backwalk to the children so they can stack on the now updated
@@ -420,6 +457,16 @@ adj_nbr_update_rewrite_internal (ip_adjacency_t *adj,
 
        fib_walk_sync(FIB_NODE_TYPE_ADJ, walk_ai, &bw_ctx);
     }
+    /*
+     * Walk complete: clear the re-entrancy guard on the walked adj
+     */
+    if (do_walk)
+    {
+        walk_adj->ia_flags &= ~IP_ADJ_SYNC_WALK_ACTIVE;
+    }
+
+    adj_unlock(adj_get_index(adj));
+    adj_unlock(walk_ai);
 }
 
 typedef struct adj_db_count_ctx_t_ {
index ec13858..d020c8a 100644 (file)
@@ -374,13 +374,15 @@ arp_mk_complete (adj_index_t ai, ethernet_arp_ip4_entry_t * e)
 }
 
 static void
-arp_mk_incomplete (adj_index_t ai, ethernet_arp_ip4_entry_t * e)
+arp_mk_incomplete (adj_index_t ai)
 {
+  ip_adjacency_t *adj = adj_get (ai);
+
   adj_nbr_update_rewrite
     (ai,
      ADJ_NBR_REWRITE_FLAG_INCOMPLETE,
      ethernet_build_rewrite (vnet_get_main (),
-                            e->sw_if_index,
+                            adj->rewrite_header.sw_if_index,
                             VNET_LINK_ARP,
                             VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST));
 }
@@ -417,9 +419,7 @@ arp_mk_complete_walk (adj_index_t ai, void *ctx)
 static adj_walk_rc_t
 arp_mk_incomplete_walk (adj_index_t ai, void *ctx)
 {
-  ethernet_arp_ip4_entry_t *e = ctx;
-
-  arp_mk_incomplete (ai, e);
+  arp_mk_incomplete (ai);
 
   return (ADJ_WALK_RC_CONTINUE);
 }
@@ -1620,9 +1620,10 @@ vnet_arp_unset_ip4_over_ethernet_internal (vnet_main_t * vnm,
 
   if (NULL != e)
     {
-      adj_nbr_walk_nh4 (e->sw_if_index,
-                       &e->ip4_address, arp_mk_incomplete_walk, e);
       arp_entry_free (eai, e);
+
+      adj_nbr_walk_nh4 (e->sw_if_index,
+                       &e->ip4_address, arp_mk_incomplete_walk, NULL);
     }
 
   return 0;
index e7376bf..938f7b8 100644 (file)
@@ -177,6 +177,7 @@ static u64 fib_walk_hist_vists_per_walk[HISTOGRAM_VISITS_PER_WALK_N_BUCKETS];
  * @brief History of state for the last 128 walks
  */
 #define HISTORY_N_WALKS 128
+#define MAX_HISTORY_REASONS 16
 static u32 history_last_walk_pos;
 typedef struct fib_walk_history_t_ {
     u32 fwh_n_visits;
@@ -184,7 +185,7 @@ typedef struct fib_walk_history_t_ {
     f64 fwh_completed;
     fib_node_ptr_t fwh_parent;
     fib_walk_flags_t fwh_flags;
-    fib_node_bw_reason_flag_t fwh_reason;
+    fib_node_bw_reason_flag_t fwh_reason[MAX_HISTORY_REASONS];
 } fib_walk_history_t;
 static fib_walk_history_t fib_walk_history[HISTORY_N_WALKS];
 
@@ -241,8 +242,7 @@ fib_walk_queue_get_front (fib_walk_priority_t prio)
 static void
 fib_walk_destroy (fib_walk_t *fwalk)
 {
-    fib_node_back_walk_ctx_t *ctx;
-    u32 bucket;
+    u32 bucket, ii;
 
     if (FIB_NODE_INDEX_INVALID != fwalk->fw_prio_sibling)
     {
@@ -277,16 +277,19 @@ fib_walk_destroy (fib_walk_t *fwalk)
     fib_walk_history[history_last_walk_pos].fwh_flags =
        fwalk->fw_flags;
 
-    fib_walk_history[history_last_walk_pos].fwh_reason = 0;
-    vec_foreach(ctx, fwalk->fw_ctx)
+    vec_foreach_index(ii, fwalk->fw_ctx)
     {
-        fib_walk_history[history_last_walk_pos].fwh_reason |=
-            ctx->fnbw_reason;
+        if (ii < MAX_HISTORY_REASONS)
+        {
+            fib_walk_history[history_last_walk_pos].fwh_reason[ii] =
+                fwalk->fw_ctx[ii].fnbw_reason;
+        }
     }
 
     history_last_walk_pos = (history_last_walk_pos + 1) % HISTORY_N_WALKS;
 
     fib_node_deinit(&fwalk->fw_node);
+    vec_free(fwalk->fw_ctx);
     pool_put(fib_walk_pool, fwalk);
 }
 
@@ -315,7 +318,7 @@ typedef enum fib_walk_advance_rc_t_
 static fib_walk_advance_rc_t
 fib_walk_advance (fib_node_index_t fwi)
 {
-    fib_node_back_walk_ctx_t *ctx;
+    fib_node_back_walk_ctx_t *ctx, *old;
     fib_node_back_walk_rc_t wrc;
     fib_node_ptr_t sibling;
     fib_walk_t *fwalk;
@@ -332,6 +335,8 @@ fib_walk_advance (fib_node_index_t fwi)
 
     if (more_elts)
     {
+        old = fwalk->fw_ctx;
+
        vec_foreach(ctx, fwalk->fw_ctx)
        {
            wrc = fib_node_back_walk_one(&sibling, ctx);
@@ -347,6 +352,14 @@ fib_walk_advance (fib_node_index_t fwi)
                 */
                return (FIB_WALK_ADVANCE_MERGE);
            }
+            if (old != fwalk->fw_ctx)
+            {
+                /*
+                 * nasty re-entrant addition of a walk has realloc'd the vector
+                 * break out
+                 */
+               return (FIB_WALK_ADVANCE_MERGE);
+           }
        }
        /*
         * move foward to the next node to visit
@@ -565,7 +578,7 @@ fib_walk_alloc (fib_node_type_t parent_type,
     /*
      * make a copy of the backwalk context so the depth count remains
      * the same for each sibling visitsed. This is important in the case
-     * where a parents has a loop via one child, but all the others are not.
+     * where a parent has a loop via one child, but all the others are not.
      * if the looped child were visited first, the depth count would exceed, the
      * max and the walk would terminate before it reached the other siblings.
      */
@@ -801,42 +814,40 @@ static fib_node_back_walk_rc_t
 fib_walk_back_walk_notify (fib_node_t *node,
                           fib_node_back_walk_ctx_t *ctx)
 {
-    fib_node_back_walk_ctx_t *old;
+    fib_node_back_walk_ctx_t *last;
     fib_walk_t *fwalk;
 
     fwalk = fib_walk_get_from_node(node);
 
     /*
-     * check whether the walk context can be merge with another,
-     * or whether it needs to be appended.
+     * check whether the walk context can be merged with the most recent.
+     * the most recent was the one last added and is thus at the back of the vector.
+     * we can merge walks if the reason for the walk is the same.
      */
-    vec_foreach(old, fwalk->fw_ctx)
+    last = vec_end(fwalk->fw_ctx) - 1;
+
+    if (last->fnbw_reason == ctx->fnbw_reason)
     {
-       /*
-        * we can merge walks if the reason for the walk is the same.
-        */
-       if (old->fnbw_reason == ctx->fnbw_reason)
-       {
-           /*
-            * copy the largest of the depth values. in the presence of a loop,
-            * the same walk will merge with itself. if we take the smaller depth
-            * then it will never end.
-            */
-           old->fnbw_depth = ((old->fnbw_depth >= ctx->fnbw_depth) ?
-                               old->fnbw_depth :
-                               ctx->fnbw_depth);
-           goto out;
-       }
+        /*
+         * copy the largest of the depth values. in the presence of a loop,
+         * the same walk will merge with itself. if we take the smaller depth
+         * then it will never end.
+         */
+        last->fnbw_depth = ((last->fnbw_depth >= ctx->fnbw_depth) ?
+                            last->fnbw_depth :
+                            ctx->fnbw_depth);
+    }
+    else
+    {
+        /*
+         * walks could not be merged, this means that the walk in front needs to
+         * perform different action to this one that has caught up. the one in
+         * front was scheduled first so append the new walk context to the back
+         * of the list.
+         */
+        vec_add1(fwalk->fw_ctx, *ctx);
     }
 
-    /*
-     * walks could not be merged, this means that the walk infront needs to
-     * perform different action to this one that has caught up. the one in front
-     * was scheduled first so append the new walk context to the back of the list.
-     */
-    vec_add1(fwalk->fw_ctx, *ctx);
-
-out:
     return (FIB_NODE_BACK_WALK_MERGE);
 }
 
@@ -979,10 +990,11 @@ fib_walk_show (vlib_main_t * vm,
 
     while (ii != history_last_walk_pos)
     {
-       if (0 != fib_walk_history[ii].fwh_reason)
+       if (0 != fib_walk_history[ii].fwh_reason[0])
        {
             fib_node_back_walk_reason_t reason;
             u8 *s = NULL;
+            u32 jj;
 
            s = format(s, "[@%d]: %s:%d visits:%d duration:%.2f completed:%.2f ",
                        ii, fib_node_type_get_name(fib_walk_history[ii].fwh_parent.fnp_type),
@@ -996,10 +1008,15 @@ fib_walk_show (vlib_main_t * vm,
                 s = format(s, "async, ");
 
             s = format(s, "reason:");
-            FOR_EACH_FIB_NODE_BW_REASON(reason) {
-                if ((1<<reason) & fib_walk_history[ii].fwh_reason) {
-                    s = format (s, "%s,", fib_node_bw_reason_names[reason]);
+            jj = 0;
+            while (0 != fib_walk_history[ii].fwh_reason[jj])
+            {
+                FOR_EACH_FIB_NODE_BW_REASON(reason) {
+                    if ((1<<reason) & fib_walk_history[ii].fwh_reason[jj]) {
+                        s = format (s, "%s,", fib_node_bw_reason_names[reason]);
+                    }
                 }
+                jj++;
             }
             vlib_cli_output(vm, "%v", s);
        }
index 92417a4..15b3f76 100644 (file)
@@ -402,13 +402,15 @@ ip6_nd_mk_complete (adj_index_t ai, ip6_neighbor_t * nbr)
 }
 
 static void
-ip6_nd_mk_incomplete (adj_index_t ai, ip6_neighbor_t * nbr)
+ip6_nd_mk_incomplete (adj_index_t ai)
 {
+  ip_adjacency_t *adj = adj_get(ai);
+
   adj_nbr_update_rewrite (
       ai,
       ADJ_NBR_REWRITE_FLAG_INCOMPLETE,
       ethernet_build_rewrite (vnet_get_main (),
-                             nbr->key.sw_if_index,
+                              adj->rewrite_header.sw_if_index,
                              adj_get_link_type(ai),
                              VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST));
 }
@@ -452,9 +454,7 @@ ip6_nd_mk_complete_walk (adj_index_t ai, void *ctx)
 static adj_walk_rc_t
 ip6_nd_mk_incomplete_walk (adj_index_t ai, void *ctx)
 {
-  ip6_neighbor_t *nbr = ctx;
-
-  ip6_nd_mk_incomplete (ai, nbr);
+  ip6_nd_mk_incomplete (ai);
 
   return (ADJ_WALK_RC_CONTINUE);
 }
@@ -683,13 +683,13 @@ vnet_unset_ip6_ethernet_neighbor (vlib_main_t * vm,
     }
   
   n = pool_elt_at_index (nm->neighbor_pool, p[0]);
+  mhash_unset (&nm->neighbor_index_by_key, &n->key, 0);
 
   adj_nbr_walk_nh6 (sw_if_index,
                    &n->key.ip6_address,
                    ip6_nd_mk_incomplete_walk,
-                   n);
+                   NULL);
 
-  mhash_unset (&nm->neighbor_index_by_key, &n->key, 0);
   fib_table_entry_delete_index (n->fib_entry_index,  FIB_SOURCE_ADJ);
   pool_put (nm->neighbor_pool, n);
   
index a609e2f..4b6aaa1 100644 (file)
@@ -168,6 +168,17 @@ typedef void (*adj_midchain_fixup_t)(vlib_main_t * vm,
                                     struct ip_adjacency_t_ *adj,
                                     vlib_buffer_t * b0);
 
+/**
+ * @brief Flags on an IP adjacency
+ */
+typedef enum ip_adjacency_flags_t_
+{
+    /**
+     * Currently a sync walk is active. Used to prevent re-entrant walking
+     */
+    IP_ADJ_SYNC_WALK_ACTIVE = (1 << 0),
+} ip_adjacency_flags_t;
+
 /** @brief IP unicast adjacency.
     @note cache aligned.
 */
@@ -254,6 +265,12 @@ typedef struct ip_adjacency_t_ {
    * remaining cachelines
    */
   fib_node_t ia_node;
+
+  /**
+   * Flags on the adjacency
+   */
+  ip_adjacency_flags_t ia_flags;
+
 } ip_adjacency_t;
 
 STATIC_ASSERT((STRUCT_OFFSET_OF(ip_adjacency_t, cacheline0) == 0),