From ad95b5df2728f9061f8cd8c3d06a41a9c2c943bd Mon Sep 17 00:00:00 2001 From: Neale Ranns Date: Thu, 10 Nov 2016 20:35:14 +0000 Subject: [PATCH] pseudo atomic adjacency updates When an adjacency changes from incomplete to complete (arp to rewrite) and vice-versa, that update must produce valid behavour w.r.t. the packets that continue to encounter it. Change-Id: Icc2709aff9807c5d24f91ef7649268991ad9a45d Signed-off-by: Neale Ranns --- vnet/vnet/adj/adj_midchain.c | 9 ---- vnet/vnet/adj/adj_nbr.c | 124 +++++++++++++++++++++++++++++-------------- vnet/vnet/dpo/dpo.c | 27 +++++++++- vnet/vnet/dpo/dpo.h | 13 ++++- vnet/vnet/fib/fib_entry.c | 6 +++ vnet/vnet/fib/fib_node.h | 43 ++++++++++++--- vnet/vnet/fib/fib_path.c | 16 +++++- vnet/vnet/fib/fib_types.c | 20 ------- vnet/vnet/fib/fib_types.h | 1 - vnet/vnet/fib/fib_walk.c | 65 +++++++++++++++++++---- 10 files changed, 230 insertions(+), 94 deletions(-) diff --git a/vnet/vnet/adj/adj_midchain.c b/vnet/vnet/adj/adj_midchain.c index 92ea1ea9510..8c6ab5aa17b 100644 --- a/vnet/vnet/adj/adj_midchain.c +++ b/vnet/vnet/adj/adj_midchain.c @@ -429,15 +429,6 @@ adj_nbr_midchain_update_rewrite (adj_index_t adj_index, adj_get_midchain_node(adj->ia_link), adj->sub_type.midchain.tx_function_node, rewrite); - - /* - * time for walkies fido. - */ - fib_node_back_walk_ctx_t bw_ctx = { - .fnbw_reason = FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE, - }; - - fib_walk_sync(FIB_NODE_TYPE_ADJ, adj_get_index(adj), &bw_ctx); } /** diff --git a/vnet/vnet/adj/adj_nbr.c b/vnet/vnet/adj/adj_nbr.c index 8d0511061de..4245990585e 100644 --- a/vnet/vnet/adj/adj_nbr.c +++ b/vnet/vnet/adj/adj_nbr.c @@ -252,12 +252,10 @@ adj_nbr_update_rewrite (adj_index_t adj_index, u8 *rewrite) { ip_adjacency_t *adj; - u32 old_next; ASSERT(ADJ_INDEX_INVALID != adj_index); adj = adj_get(adj_index); - old_next = adj->lookup_next_index; if (flags & ADJ_NBR_REWRITE_FLAG_COMPLETE) { @@ -281,34 +279,6 @@ adj_nbr_update_rewrite (adj_index_t adj_index, adj->rewrite_header.sw_if_index), rewrite); } - - if (old_next != adj->lookup_next_index) - { - /* - * time for walkies fido. - * The link type MPLS Adj never has children. So if it is this adj - * that is updated, we need to walk from its IP sibling. - */ - if (VNET_LINK_MPLS == adj->ia_link) - { - adj_index = adj_nbr_find(adj->ia_nh_proto, - fib_proto_to_link(adj->ia_nh_proto), - &adj->sub_type.nbr.next_hop, - adj->rewrite_header.sw_if_index); - - ASSERT(ADJ_INDEX_INVALID != adj_index); - } - - fib_node_back_walk_ctx_t bw_ctx = { - .fnbw_reason = FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE, - /* - * This walk only needs to go back one level, but there is no control - * here. the first receiving fib_entry_t will quash the walk - */ - }; - - fib_walk_sync(FIB_NODE_TYPE_ADJ, adj_index, &bw_ctx); - } } /** @@ -325,13 +295,32 @@ adj_nbr_update_rewrite_internal (ip_adjacency_t *adj, u32 next_node, u8 *rewrite) { - vlib_main_t * vm = vlib_get_main(); + adj_index_t walk_ai; + vlib_main_t * vm; + u32 old_next; + + vm = vlib_get_main(); + old_next = adj->lookup_next_index; + + walk_ai = adj_get_index(adj); + if (VNET_LINK_MPLS == adj->ia_link) + { + /* + * The link type MPLS has no children in the control plane graph, it only + * has children in the data-palne graph. The backwalk is up the former. + * So we need to walk from its IP cousin. + */ + walk_ai = adj_nbr_find(adj->ia_nh_proto, + fib_proto_to_link(adj->ia_nh_proto), + &adj->sub_type.nbr.next_hop, + adj->rewrite_header.sw_if_index); + } /* * Updating a rewrite string is not atomic; * - the rewrite string is too long to write in one instruction * - when swapping from incomplete to complete, we also need to update - * the VLIB graph next-index. + * the VLIB graph next-index of the adj. * ideally we would only want to suspend forwarding via this adj whilst we * do this, but we do not have that level of granularity - it's suspend all * worker threads or nothing. @@ -340,9 +329,50 @@ adj_nbr_update_rewrite_internal (ip_adjacency_t *adj, * from the set. * - update the next_node index of this adj to point to error-drop * both of which will mean for MAC change we will drop for this adj - * which is not acceptable. - * So the pause all threads is preferable. We don't update MAC addresses often - * so it's no big deal. + * which is not acceptable. However, when the adj changes type (from + * complete to incomplete and vice-versa) the child DPOs, which have the + * VLIB graph next node index, will be sending packets to the wrong graph + * node. So from the options above, updating the next_node of the adj to + * be drop will work, but it relies on each graph node v4/v6/mpls, rewrite/ + * arp/midchain always be valid w.r.t. a mis-match of adj type and node type + * (i.e. a rewrite adj in the arp node). This is not enforcable. Getting it + * wrong will lead to hard to find bugs since its a race condition. So we + * choose the more reliable method of updating the children to use the drop, + * then switching adj's type, then updating the children again. Did I mention + * that this doesn't happen often... + * So we need to distinguish between the two cases: + * 1 - mac change + * 2 - adj type change + */ + if (old_next != adj_next_index && + ADJ_INDEX_INVALID != walk_ai) + { + /* + * the adj is changing type. we need to fix all children so that they + * stack momentarily on a drop, while the adj changes. If we don't do + * this the children will send packets to a VLIB graph node that does + * not correspond to the adj's type - and it goes downhill from there. + */ + fib_node_back_walk_ctx_t bw_ctx = { + .fnbw_reason = FIB_NODE_BW_REASON_FLAG_ADJ_DOWN, + /* + * force this walk to be synchrous. if we don't and a node in the graph + * (a heavily shared path-list) chooses to back-ground the walk (make it + * async) then it will pause and we will do the adj update below, before + * all the children are updated. not good. + */ + .fnbw_flags = FIB_NODE_BW_FLAG_FORCE_SYNC, + }; + + fib_walk_sync(FIB_NODE_TYPE_ADJ, walk_ai, &bw_ctx); + } + + /* + * If we are just updating the MAC string of the adj (which we also can't + * do atomically), then we need to stop packets switching through the adj. + * We can't do that on a per-adj basis, so it's all the packets. + * If we are updating the type, and we walked back to the children above, + * then this barrier serves to flush the queues/frames. */ vlib_worker_thread_barrier_sync(vm); @@ -358,12 +388,6 @@ adj_nbr_update_rewrite_internal (ip_adjacency_t *adj, sizeof(adj->rewrite_data), rewrite, vec_len(rewrite)); - - adj->rewrite_header.node_index = this_node; - adj->rewrite_header.next_index = vlib_node_add_next (vlib_get_main(), - this_node, - next_node); - vec_free(rewrite); } else @@ -371,11 +395,29 @@ adj_nbr_update_rewrite_internal (ip_adjacency_t *adj, vnet_rewrite_clear_data_internal(&adj->rewrite_header, sizeof(adj->rewrite_data)); } + adj->rewrite_header.node_index = this_node; + adj->rewrite_header.next_index = vlib_node_add_next(vlib_get_main(), + this_node, + next_node); /* * done with the rewirte update - let the workers loose. */ vlib_worker_thread_barrier_release(vm); + + if (old_next != adj->lookup_next_index && + ADJ_INDEX_INVALID != walk_ai) + { + /* + * backwalk to the children so they can stack on the now updated + * adjacency + */ + fib_node_back_walk_ctx_t bw_ctx = { + .fnbw_reason = FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE, + }; + + fib_walk_sync(FIB_NODE_TYPE_ADJ, walk_ai, &bw_ctx); + } } typedef struct adj_db_count_ctx_t_ { diff --git a/vnet/vnet/dpo/dpo.c b/vnet/vnet/dpo/dpo.c index 3542b3f1f85..688d2892412 100644 --- a/vnet/vnet/dpo/dpo.c +++ b/vnet/vnet/dpo/dpo.c @@ -84,6 +84,26 @@ static u32 ****dpo_edges; */ static dpo_type_t dpo_dynamic = DPO_LAST; +dpo_proto_t +vnet_link_to_dpo_proto (vnet_link_t linkt) +{ + switch (linkt) + { + case VNET_LINK_IP6: + return (DPO_PROTO_IP6); + case VNET_LINK_IP4: + return (DPO_PROTO_IP4); + case VNET_LINK_MPLS: + return (DPO_PROTO_MPLS); + case VNET_LINK_ETHERNET: + return (DPO_PROTO_ETHERNET); + case VNET_LINK_ARP: + break; + } + ASSERT(0); + return (0); +} + u8 * format_dpo_type (u8 * s, va_list * args) { @@ -170,7 +190,12 @@ dpo_set (dpo_id_t *dpo, void dpo_reset (dpo_id_t *dpo) { - dpo_set(dpo, DPO_FIRST, DPO_PROTO_NONE, INDEX_INVALID); + dpo_id_t tmp = DPO_INVALID; + + /* + * use the atomic copy operation. + */ + dpo_copy(dpo, &tmp); } /** diff --git a/vnet/vnet/dpo/dpo.h b/vnet/vnet/dpo/dpo.h index 78b657d65f2..1efcbc8834b 100644 --- a/vnet/vnet/dpo/dpo.h +++ b/vnet/vnet/dpo/dpo.h @@ -159,9 +159,16 @@ STATIC_ASSERT(sizeof(dpo_id_t) <= sizeof(u64), "atomic updates need to be revisited"); /** - * @brief An initialiser for DPos declared on the stack. + * @brief An initialiser for DPOs declared on the stack. + * Thenext node is set to 0 since VLIB graph nodes should set 0 index to drop. */ -#define DPO_INVALID {0} +#define DPO_INVALID \ +{ \ + .dpoi_type = DPO_FIRST, \ + .dpoi_proto = DPO_PROTO_NONE, \ + .dpoi_index = INDEX_INVALID, \ + .dpoi_next_node = 0, \ +} /** * @brief Return true if the DPO object is valid, i.e. has been initialised. @@ -173,6 +180,8 @@ dpo_id_is_valid (const dpo_id_t *dpoi) dpoi->dpoi_index != INDEX_INVALID); } +extern dpo_proto_t vnet_link_to_dpo_proto(vnet_link_t linkt); + /** * @brief * Take a reference counting lock on the DPO diff --git a/vnet/vnet/fib/fib_entry.c b/vnet/vnet/fib/fib_entry.c index 1047c50e1d2..4080b7622d1 100644 --- a/vnet/vnet/fib/fib_entry.c +++ b/vnet/vnet/fib/fib_entry.c @@ -393,6 +393,7 @@ fib_entry_back_walk_notify (fib_node_t *node, if (FIB_NODE_BW_REASON_FLAG_EVALUATE & ctx->fnbw_reason || FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE & ctx->fnbw_reason || + FIB_NODE_BW_REASON_FLAG_ADJ_DOWN & ctx->fnbw_reason || FIB_NODE_BW_REASON_FLAG_INTERFACE_UP & ctx->fnbw_reason || FIB_NODE_BW_REASON_FLAG_INTERFACE_DOWN & ctx->fnbw_reason || FIB_NODE_BW_REASON_FLAG_INTERFACE_DELETE & ctx->fnbw_reason) @@ -410,6 +411,11 @@ fib_entry_back_walk_notify (fib_node_t *node, */ ctx->fnbw_reason = FIB_NODE_BW_REASON_FLAG_EVALUATE; + /* + * ... and nothing is forced sync from now on. + */ + ctx->fnbw_flags &= ~FIB_NODE_BW_FLAG_FORCE_SYNC; + /* * propagate the backwalk further if we haven't already reached the * maximum depth. diff --git a/vnet/vnet/fib/fib_node.h b/vnet/vnet/fib/fib_node.h index 791d63b9591..33f2203e170 100644 --- a/vnet/vnet/fib/fib_node.h +++ b/vnet/vnet/fib/fib_node.h @@ -97,21 +97,31 @@ typedef enum fib_node_back_walk_reason_t_ { * a unipath adjacency changes */ FIB_NODE_BW_REASON_ADJ_UPDATE, + /** + * Walk to update children to inform them the adjacency is now down. + */ + FIB_NODE_BW_REASON_ADJ_DOWN, /** * Marker. Add new before and update */ - FIB_NODE_BW_REASON_LAST = FIB_NODE_BW_REASON_EVALUATE, + FIB_NODE_BW_REASON_LAST = FIB_NODE_BW_REASON_ADJ_DOWN, } fib_node_back_walk_reason_t; -#define FIB_NODE_BW_REASONS { \ - [FIB_NODE_BW_REASON_RESOLVE] = "resolve" \ - [FIB_NODE_BW_REASON_EVALUATE] = "evaluate" \ - [FIB_NODE_BW_REASON_INTERFACE_UP] = "if-up" \ - [FIB_NODE_BW_REASON_INTERFACE_DOWN] = "if-down" \ - [FIB_NODE_BW_REASON_INTERFACE_DELETE] = "if-delete" \ - [FIB_NODE_BW_REASON_ADJ_UPDATE] = "adj-update" \ +#define FIB_NODE_BW_REASONS { \ + [FIB_NODE_BW_REASON_RESOLVE] = "resolve", \ + [FIB_NODE_BW_REASON_EVALUATE] = "evaluate", \ + [FIB_NODE_BW_REASON_INTERFACE_UP] = "if-up", \ + [FIB_NODE_BW_REASON_INTERFACE_DOWN] = "if-down", \ + [FIB_NODE_BW_REASON_INTERFACE_DELETE] = "if-delete", \ + [FIB_NODE_BW_REASON_ADJ_UPDATE] = "adj-update", \ + [FIB_NODE_BW_REASON_ADJ_DOWN] = "adj-down", \ } +#define FOR_EACH_FIB_NODE_BW_REASON(_item) \ + for (_item = FIB_NODE_BW_REASON_FIRST; \ + _item <= FIB_NODE_BW_REASON_LAST; \ + _item++) + /** * Flags enum constructed from the reaons */ @@ -123,11 +133,23 @@ typedef enum fib_node_bw_reason_flag_t_ { FIB_NODE_BW_REASON_FLAG_INTERFACE_DOWN = (1 << FIB_NODE_BW_REASON_INTERFACE_DOWN), FIB_NODE_BW_REASON_FLAG_INTERFACE_DELETE = (1 << FIB_NODE_BW_REASON_INTERFACE_DELETE), FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE = (1 << FIB_NODE_BW_REASON_ADJ_UPDATE), + FIB_NODE_BW_REASON_FLAG_ADJ_DOWN = (1 << FIB_NODE_BW_REASON_ADJ_DOWN), } __attribute__ ((packed)) fib_node_bw_reason_flag_t; STATIC_ASSERT(sizeof(fib_node_bw_reason_flag_t) < 2, "BW Reason enum < 2 byte. Consequences for cover_upd_res_t"); +/** + * Flags on the walk + */ +typedef enum fib_node_bw_flags_t_ +{ + /** + * Force the walk to be synchronous + */ + FIB_NODE_BW_FLAG_FORCE_SYNC = (1 << 0), +} fib_node_bw_flags_t; + /** * Forward eclarations */ @@ -165,6 +187,11 @@ typedef struct fib_node_back_walk_ctx_t_ { */ fib_node_bw_reason_flag_t fnbw_reason; + /** + * additional flags for the walk + */ + fib_node_bw_flags_t fnbw_flags; + /** * the number of levels the walk has already traversed. * this value is maintained by the walk infra, tp limit the depth of diff --git a/vnet/vnet/fib/fib_path.c b/vnet/vnet/fib/fib_path.c index 7f5aac77c43..9fe653dc8d0 100644 --- a/vnet/vnet/fib/fib_path.c +++ b/vnet/vnet/fib/fib_path.c @@ -757,7 +757,8 @@ fib_path_back_walk_notify (fib_node_t *node, fib_path_proto_to_chain_type(path->fp_nh_proto), &path->fp_dpo); } - if (FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE & ctx->fnbw_reason) + if ((FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE & ctx->fnbw_reason) || + (FIB_NODE_BW_REASON_FLAG_ADJ_DOWN & ctx->fnbw_reason)) { /* * ADJ updates (complete<->incomplete) do not need to propagate to @@ -810,6 +811,12 @@ FIXME comment */ adj_index_t ai; + if (vnet_sw_interface_is_admin_up(vnet_get_main(), + path->attached_next_hop.fp_interface)) + { + path->fp_oper_flags |= FIB_PATH_OPER_FLAG_RESOLVED; + } + ai = fib_path_attached_next_hop_get_adj( path, fib_proto_to_link(path->fp_nh_proto)); @@ -819,6 +826,13 @@ FIXME comment ai); adj_unlock(ai); } + if (FIB_NODE_BW_REASON_FLAG_ADJ_DOWN & ctx->fnbw_reason) + { + /* + * the adj has gone down. the path is no longer resolved. + */ + path->fp_oper_flags &= ~FIB_PATH_OPER_FLAG_RESOLVED; + } break; case FIB_PATH_TYPE_ATTACHED: /* diff --git a/vnet/vnet/fib/fib_types.c b/vnet/vnet/fib/fib_types.c index 383d48e647c..f52de7b8be0 100644 --- a/vnet/vnet/fib/fib_types.c +++ b/vnet/vnet/fib/fib_types.c @@ -214,26 +214,6 @@ fib_proto_to_dpo (fib_protocol_t fib_proto) return (0); } -dpo_proto_t -vnet_link_to_dpo_proto (vnet_link_t linkt) -{ - switch (linkt) - { - case VNET_LINK_IP6: - return (DPO_PROTO_IP6); - case VNET_LINK_IP4: - return (DPO_PROTO_IP4); - case VNET_LINK_MPLS: - return (DPO_PROTO_MPLS); - case VNET_LINK_ETHERNET: - return (DPO_PROTO_ETHERNET); - case VNET_LINK_ARP: - break; - } - ASSERT(0); - return (0); -} - fib_protocol_t dpo_proto_to_fib (dpo_proto_t dpo_proto) { diff --git a/vnet/vnet/fib/fib_types.h b/vnet/vnet/fib/fib_types.h index ea5a6d8cae1..1e70a2a98d8 100644 --- a/vnet/vnet/fib/fib_types.h +++ b/vnet/vnet/fib/fib_types.h @@ -210,7 +210,6 @@ extern u8 * format_fib_prefix(u8 * s, va_list * args); extern u8 * format_fib_forw_chain_type(u8 * s, va_list * args); extern dpo_proto_t fib_proto_to_dpo(fib_protocol_t fib_proto); -extern dpo_proto_t vnet_link_to_dpo_proto(vnet_link_t linkt); extern fib_protocol_t dpo_proto_to_fib(dpo_proto_t dpo_proto); /** diff --git a/vnet/vnet/fib/fib_walk.c b/vnet/vnet/fib/fib_walk.c index 83f09e33a56..215d21dc273 100644 --- a/vnet/vnet/fib/fib_walk.c +++ b/vnet/vnet/fib/fib_walk.c @@ -124,6 +124,10 @@ typedef enum fib_walk_queue_stats_t_ * The names of the walk stats */ static const char * const fib_walk_queue_stats_names[] = FIB_WALK_QUEUE_STATS; +/** + * The names of the walk reasons + */ +static const char * const fib_node_bw_reason_names[] = FIB_NODE_BW_REASONS; /** * A represenation of one queue of walk @@ -178,6 +182,8 @@ typedef struct fib_walk_history_t_ { u32 fwh_n_visits; f64 fwh_duration; fib_node_ptr_t fwh_parent; + fib_walk_flags_t fwh_flags; + fib_node_bw_reason_flag_t fwh_reason; } fib_walk_history_t; static fib_walk_history_t fib_walk_history[HISTORY_N_WALKS]; @@ -234,6 +240,7 @@ fib_walk_queue_get_front (fib_walk_priority_t prio) static void fib_walk_destroy (fib_walk_t *fwalk) { + fib_node_back_walk_ctx_t *ctx; u32 bucket; if (FIB_NODE_INDEX_INVALID != fwalk->fw_prio_sibling) @@ -263,6 +270,15 @@ fib_walk_destroy (fib_walk_t *fwalk) vlib_time_now(vlib_get_main()) - fwalk->fw_start_time; fib_walk_history[history_last_walk_pos].fwh_parent = fwalk->fw_parent; + fib_walk_history[history_last_walk_pos].fwh_flags = + fwalk->fw_flags; + + fib_walk_history[history_last_walk_pos].fwh_reason = 0; + vec_foreach(ctx, fwalk->fw_ctx) + { + fib_walk_history[history_last_walk_pos].fwh_reason |= + ctx->fnbw_reason; + } history_last_walk_pos = (history_last_walk_pos + 1) % HISTORY_N_WALKS; @@ -599,6 +615,15 @@ fib_walk_async (fib_node_type_t parent_type, */ return; } + if (ctx->fnbw_flags & FIB_NODE_BW_FLAG_FORCE_SYNC) + { + /* + * the originator of the walk wanted it to be synchronous, but the + * parent object chose async - denied. + */ + return (fib_walk_sync(parent_type, parent_index, ctx)); + } + fwalk = fib_walk_alloc(parent_type, parent_index, @@ -928,22 +953,40 @@ fib_walk_show (vlib_main_t * vm, vlib_cli_output(vm, "Brief History (last %d walks):", HISTORY_N_WALKS); - ii = history_last_walk_pos; - do + ii = history_last_walk_pos - 1; + if (ii < 0) + ii = HISTORY_N_WALKS - 1; + + while (ii != history_last_walk_pos) { if (0 != fib_walk_history[ii].fwh_n_visits) { - vlib_cli_output( - vm, " %s:%d visits:%d duration:%.2f ", - fib_node_type_get_name(fib_walk_history[ii].fwh_parent.fnp_type), - fib_walk_history[ii].fwh_parent.fnp_index, - fib_walk_history[ii].fwh_n_visits, - fib_walk_history[ii].fwh_duration); + fib_node_back_walk_reason_t reason; + u8 *s = NULL; + + s = format(s, " %s:%d visits:%d duration:%.2f ", + fib_node_type_get_name(fib_walk_history[ii].fwh_parent.fnp_type), + fib_walk_history[ii].fwh_parent.fnp_index, + fib_walk_history[ii].fwh_n_visits, + fib_walk_history[ii].fwh_duration); + if (FIB_WALK_FLAG_SYNC & fib_walk_history[ii].fwh_flags) + s = format(s, "sync, "); + if (FIB_WALK_FLAG_ASYNC & fib_walk_history[ii].fwh_flags) + s = format(s, "async, "); + + s = format(s, "reason:"); + FOR_EACH_FIB_NODE_BW_REASON(reason) { + if ((1<