2 * Copyright (c) 2016 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
16 #include <vnet/adj/adj_nbr.h>
17 #include <vnet/adj/adj_internal.h>
18 #include <vnet/ethernet/arp_packet.h>
19 #include <vnet/fib/fib_walk.h>
22 * Vector Hash tables of neighbour (traditional) adjacencies
* Key: interface (for the vector index), address (and its proto),
24 * link-type/ether-type.
/**
 * Per-FIB-protocol vector, indexed by sw_if_index, of pointers to the
 * bi-hash table holding that interface's neighbour adjacencies.
 * Tables are allocated lazily on first insert (see adj_nbr_insert).
 */
static BVT(clib_bihash) **adj_nbr_tables[FIB_PROTOCOL_MAX];

// FIXME SIZE APPROPRIATELY. ASK DAVEB.
#define ADJ_NBR_DEFAULT_HASH_NUM_BUCKETS (64 * 64)
#define ADJ_NBR_DEFAULT_HASH_MEMORY_SIZE (32<<20)
/*
 * Build the bi-hash key from the next-hop address (two u64 words) and
 * the link type. NOTE(review): some continuation lines of this macro
 * are not visible in this extract.
 */
#define ADJ_NBR_SET_KEY(_key, _lt, _nh)   \
    _key.key[0] = (_nh)->as_u64[0];       \
    _key.key[1] = (_nh)->as_u64[1];       \
    _key.key[2] = (_lt);                  \
/**
 * True iff a neighbour table exists for interface _itf under protocol
 * _proto, i.e. lookups/removals/walks against it are safe.
 *
 * Fix: the second term previously referenced the caller's variable
 * `sw_if_index` instead of the macro parameter `_itf`; that only worked
 * by accident when the argument happened to be named `sw_if_index`.
 */
#define ADJ_NBR_ITF_OK(_proto, _itf)                    \
    (((_itf) < vec_len(adj_nbr_tables[_proto])) &&      \
     (NULL != adj_nbr_tables[_proto][(_itf)]))
/**
 * Add the (next-hop address, link-type) -> adj_index mapping to the
 * per-interface neighbour DB, allocating and initialising the
 * interface's hash table on first use.
 */
adj_nbr_insert (fib_protocol_t nh_proto,
                const ip46_address_t *nh_addr,
                adj_index_t adj_index)
    BVT(clib_bihash_kv) kv;

    /* grow the per-protocol vector to cover this interface */
    if (sw_if_index >= vec_len(adj_nbr_tables[nh_proto]))
        vec_validate(adj_nbr_tables[nh_proto], sw_if_index);

    /* lazily allocate and initialise this interface's bihash table */
    if (NULL == adj_nbr_tables[nh_proto][sw_if_index])
        adj_nbr_tables[nh_proto][sw_if_index] =
            clib_mem_alloc_aligned(sizeof(BVT(clib_bihash)),
                                   CLIB_CACHE_LINE_BYTES);
        memset(adj_nbr_tables[nh_proto][sw_if_index],
               sizeof(BVT(clib_bihash)));
        BV(clib_bihash_init) (adj_nbr_tables[nh_proto][sw_if_index],
                              "Adjacency Neighbour table",
                              ADJ_NBR_DEFAULT_HASH_NUM_BUCKETS,
                              ADJ_NBR_DEFAULT_HASH_MEMORY_SIZE);

    ADJ_NBR_SET_KEY(kv, link_type, nh_addr);

    /* is_add=1: insert/overwrite the key */
    BV(clib_bihash_add_del) (adj_nbr_tables[nh_proto][sw_if_index], &kv, 1);
/**
 * Remove the (next-hop address, link-type) entry from the interface's
 * neighbour DB. No-op if the interface has no table.
 */
adj_nbr_remove (fib_protocol_t nh_proto,
                const ip46_address_t *nh_addr,
    BVT(clib_bihash_kv) kv;

    if (!ADJ_NBR_ITF_OK(nh_proto, sw_if_index))

    ADJ_NBR_SET_KEY(kv, link_type, nh_addr);

    /* is_add=0: delete the key */
    BV(clib_bihash_add_del) (adj_nbr_tables[nh_proto][sw_if_index], &kv, 0);
/**
 * Look up the adjacency index keyed by (protocol, link-type, next-hop,
 * interface). Returns ADJ_INDEX_INVALID when the interface has no table
 * or the entry is absent.
 */
adj_nbr_find (fib_protocol_t nh_proto,
              const ip46_address_t *nh_addr,
    BVT(clib_bihash_kv) kv;

    ADJ_NBR_SET_KEY(kv, link_type, nh_addr);

    if (!ADJ_NBR_ITF_OK(nh_proto, sw_if_index))
        return (ADJ_INDEX_INVALID);

    if (BV(clib_bihash_search)(adj_nbr_tables[nh_proto][sw_if_index],
        return (ADJ_INDEX_INVALID);
/**
 * The VLIB graph node used to resolve (ARP/ND) an incomplete adjacency
 * of the given protocol. Non-IP6 cases return the IP4 ARP node;
 * NOTE(review): the exact MPLS-case fallthrough is not fully visible here.
 */
adj_get_nd_node (fib_protocol_t proto)
    case FIB_PROTOCOL_IP4:
        return (ip4_arp_node.index);
    case FIB_PROTOCOL_IP6:
        return (ip6_discover_neighbor_node.index);
    case FIB_PROTOCOL_MPLS:
    return (ip4_arp_node.index);
/**
 * Allocate a new neighbour adjacency, insert it into the DB, and
 * initialise it as incomplete (next index = ARP) since it has no
 * rewrite string yet.
 */
static ip_adjacency_t*
adj_nbr_alloc (fib_protocol_t nh_proto,
               fib_link_t link_type,
               const ip46_address_t *nh_addr,
    adj = adj_alloc(nh_proto);

    adj_nbr_insert(nh_proto, link_type, nh_addr,

    /*
     * since we just added the ADJ we have no rewrite string for it,
     */
    adj->lookup_next_index = IP_LOOKUP_NEXT_ARP;
    adj->sub_type.nbr.next_hop = *nh_addr;
    adj->ia_link = link_type;
    adj->ia_nh_proto = nh_proto;
    adj->rewrite_header.sw_if_index = sw_if_index;
    /* clear the midchain stacking state shared via the union */
    memset(&adj->sub_type.midchain.next_dpo, 0,
           sizeof(adj->sub_type.midchain.next_dpo));
/*
 * Add an adjacency for the neighbour requested.
 *
 * The key for an adj is:
 *  - the Next-hops protocol (i.e. v4 or v6)
 *  - the address of the next-hop
 *  - the interface the next-hop is reachable through
 */
adj_nbr_add_or_lock (fib_protocol_t nh_proto,
                     fib_link_t link_type,
                     const ip46_address_t *nh_addr,
    adj_index_t adj_index;

    adj_index = adj_nbr_find(nh_proto, link_type, nh_addr, sw_if_index);

    /* not found: create and initialise a new adjacency */
    if (ADJ_INDEX_INVALID == adj_index)
        vnm = vnet_get_main();
        adj = adj_nbr_alloc(nh_proto, link_type, nh_addr, sw_if_index);
        adj_index = adj_get_index(adj);

        /* seed the rewrite header with the arc to the interface TX node */
        vnet_rewrite_init(vnm, sw_if_index,
                          adj_get_nd_node(nh_proto),
                          vnet_tx_node_index_for_sw_interface(vnm, sw_if_index),
                          &adj->rewrite_header);

        /*
         * we need a rewrite where the destination IP address is converted
         * to the appropriate link-layer address. This is interface specific.
         * So ask the interface to do it.
         */
        vnet_update_adjacency_for_sw_interface(vnm, sw_if_index, adj_index);
/**
 * Find-or-create a neighbour adjacency and install the supplied,
 * already-complete rewrite string on it. Returns the locked adj index.
 */
adj_nbr_add_or_lock_w_rewrite (fib_protocol_t nh_proto,
                               fib_link_t link_type,
                               const ip46_address_t *nh_addr,
    adj_index_t adj_index;

    adj_index = adj_nbr_find(nh_proto, link_type, nh_addr, sw_if_index);

    if (ADJ_INDEX_INVALID == adj_index)
        adj = adj_nbr_alloc(nh_proto, link_type, nh_addr, sw_if_index);
        adj->rewrite_header.sw_if_index = sw_if_index;
        /* (else branch) fetch the existing adjacency */
        adj = adj_get(adj_index);

    /* take a reference, then mark the rewrite complete */
    adj_lock(adj_get_index(adj));
    adj_nbr_update_rewrite(adj_get_index(adj),
                           ADJ_NBR_REWRITE_FLAG_COMPLETE,

    return (adj_get_index(adj));
/**
 * adj_nbr_update_rewrite
 *
 * Update the adjacency's rewrite string. A NULL string implies the
 * rewrite is reset (i.e. when the ARP/ND entry is gone).
 * NB: the adj being updated may be handling traffic in the DP.
 */
adj_nbr_update_rewrite (adj_index_t adj_index,
                        adj_nbr_rewrite_flag_t flags,
    ASSERT(ADJ_INDEX_INVALID != adj_index);

    adj = adj_get(adj_index);
    /* remember the lookup-next so we can detect a state transition below */
    old_next = adj->lookup_next_index;

    if (flags & ADJ_NBR_REWRITE_FLAG_COMPLETE)
        /*
         * update the adj's rewrite string and build the arc
         * from the rewrite node to the interface's TX node
         */
        adj_nbr_update_rewrite_internal(adj, IP_LOOKUP_NEXT_REWRITE,
                                        adj_get_rewrite_node(adj->ia_link),
                                        vnet_tx_node_index_for_sw_interface(
                                            adj->rewrite_header.sw_if_index),
        /* incomplete: point back at the ARP/ND resolution node */
        adj_nbr_update_rewrite_internal(adj, IP_LOOKUP_NEXT_ARP,
                                        adj_get_nd_node(adj->ia_nh_proto),
                                        vnet_tx_node_index_for_sw_interface(
                                            adj->rewrite_header.sw_if_index),

    if (old_next != adj->lookup_next_index)
        /*
         * time for walkies fido.
         * The link type MPLS Adj never has children. So if it is this adj
         * that is updated, we need to walk from its IP sibling.
         */
        if (FIB_LINK_MPLS == adj->ia_link)
            adj_index = adj_nbr_find(adj->ia_nh_proto,
                                     fib_proto_to_link(adj->ia_nh_proto),
                                     &adj->sub_type.nbr.next_hop,
                                     adj->rewrite_header.sw_if_index);

            ASSERT(ADJ_INDEX_INVALID != adj_index);

        fib_node_back_walk_ctx_t bw_ctx = {
            .fnbw_reason = FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE,
            /*
             * This walk only needs to go back one level, but there is no control
             * here. the first receiving fib_entry_t will quash the walk
             */

        fib_walk_sync(FIB_NODE_TYPE_ADJ, adj_index, &bw_ctx);
/**
 * adj_nbr_update_rewrite_internal
 *
 * Update the adjacency's rewrite string. A NULL string implies the
 * rewrite is reset (i.e. when the ARP/ND entry is gone).
 * NB: the adj being updated may be handling traffic in the DP.
 */
adj_nbr_update_rewrite_internal (ip_adjacency_t *adj,
    vlib_main_t * vm = vlib_get_main();

    /*
     * Updating a rewrite string is not atomic;
     *  - the rewrite string is too long to write in one instruction
     *  - when swapping from incomplete to complete, we also need to update
     *    the VLIB graph next-index.
     * ideally we would only want to suspend forwarding via this adj whilst we
     * do this, but we do not have that level of granularity - it's suspend all
     * worker threads or nothing.
     * The other choices are:
     *  - to mark the adj down and back walk so child load-balances drop this adj
     *  - update the next_node index of this adj to point to error-drop
     * both of which will mean for MAC change we will drop for this adj
     * which is not acceptable.
     * So the pause all threads is preferable. We don't update MAC addresses often
     * so it's no big deal.
     */
    vlib_worker_thread_barrier_sync(vm);

    adj->lookup_next_index = adj_next_index;

    /*
     * new rewrite provided.
     * fill in the adj's rewrite string, and build the VLIB graph arc.
     */
    vnet_rewrite_set_data_internal(&adj->rewrite_header,
                                   sizeof(adj->rewrite_data),

    adj->rewrite_header.node_index = this_node;
    adj->rewrite_header.next_index = vlib_node_add_next (vlib_get_main(),

    /* NULL rewrite: clear the data back to the incomplete state */
    vnet_rewrite_clear_data_internal(&adj->rewrite_header,
                                     sizeof(adj->rewrite_data));

    /*
     * done with the rewrite update - let the workers loose.
     */
    vlib_worker_thread_barrier_release(vm);
/* context for counting entries during a DB walk */
typedef struct adj_db_count_ctx_t_ {
} adj_db_count_ctx_t;

/* bihash walk callback: accumulate the entry count into the context */
adj_db_count (BVT(clib_bihash_kv) * kvp,
    adj_db_count_ctx_t * ctx = arg;

/**
 * Total number of neighbour adjacencies across all IP4/IP6 interface
 * tables.
 */
adj_nbr_db_size (void)
    adj_db_count_ctx_t ctx = {
    fib_protocol_t proto;

    for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
        vec_foreach_index(sw_if_index, adj_nbr_tables[proto])
            if (NULL != adj_nbr_tables[proto][sw_if_index])
                BV(clib_bihash_foreach_key_value_pair) (
                    adj_nbr_tables[proto][sw_if_index],
/**
 * @brief Context for a walk of the adjacency neighbour DB
 */
typedef struct adj_walk_ctx_t_
    adj_walk_cb_t awc_cb;

/* bihash walk callback: hand each stored adj index to the user callback */
adj_nbr_walk_cb (BVT(clib_bihash_kv) * kvp,
    adj_walk_ctx_t *ctx = arg;

    // FIXME: can't stop early...
    ctx->awc_cb(kvp->value, ctx->awc_ctx);

/**
 * Walk all neighbour adjacencies of one protocol on one interface.
 * No-op if the interface has no table.
 */
adj_nbr_walk (u32 sw_if_index,
              fib_protocol_t adj_nh_proto,
    if (!ADJ_NBR_ITF_OK(adj_nh_proto, sw_if_index))

    adj_walk_ctx_t awc = {

    BV(clib_bihash_foreach_key_value_pair) (
        adj_nbr_tables[adj_nh_proto][sw_if_index],
/**
 * @brief Context for a walk of the adjacency neighbour DB
 */
typedef struct adj_walk_nh_ctx_t_
    adj_walk_cb_t awc_cb;
    /* the next-hop address to filter the walk by */
    const ip46_address_t *awc_nh;

/* bihash walk callback: invoke the user cb only for matching next-hops */
adj_nbr_walk_nh_cb (BVT(clib_bihash_kv) * kvp,
    adj_walk_nh_ctx_t *ctx = arg;

    adj = adj_get(kvp->value);

    /* ip46_address_cmp() returns 0 on equality */
    if (!ip46_address_cmp(&adj->sub_type.nbr.next_hop, ctx->awc_nh))
        ctx->awc_cb(kvp->value, ctx->awc_ctx);
/**
 * @brief Walk adjacencies on a link with a given v4 next-hop.
 * that is visit the adjacencies with different link types.
 */
adj_nbr_walk_nh4 (u32 sw_if_index,
                  const ip4_address_t *addr,
    if (!ADJ_NBR_ITF_OK(FIB_PROTOCOL_IP4, sw_if_index))

    /* widen the v4 address to the ip46 form used as the DB key */
    ip46_address_t nh = {

    adj_walk_nh_ctx_t awc = {

    BV(clib_bihash_foreach_key_value_pair) (
        adj_nbr_tables[FIB_PROTOCOL_IP4][sw_if_index],
/**
 * @brief Walk adjacencies on a link with a given v6 next-hop.
 * that is visit the adjacencies with different link types.
 */
adj_nbr_walk_nh6 (u32 sw_if_index,
                  const ip6_address_t *addr,
    if (!ADJ_NBR_ITF_OK(FIB_PROTOCOL_IP6, sw_if_index))

    /* wrap the v6 address in the ip46 form used as the DB key */
    ip46_address_t nh = {

    adj_walk_nh_ctx_t awc = {

    BV(clib_bihash_foreach_key_value_pair) (
        adj_nbr_tables[FIB_PROTOCOL_IP6][sw_if_index],
/**
 * @brief Walk adjacencies on a link with a given next-hop.
 * that is visit the adjacencies with different link types.
 */
adj_nbr_walk_nh (u32 sw_if_index,
                 fib_protocol_t adj_nh_proto,
                 const ip46_address_t *nh,
    if (!ADJ_NBR_ITF_OK(adj_nh_proto, sw_if_index))

    adj_walk_nh_ctx_t awc = {

    BV(clib_bihash_foreach_key_value_pair) (
        adj_nbr_tables[adj_nh_proto][sw_if_index],
/**
 * Context for the state change walk of the DB
 */
typedef struct adj_nbr_interface_state_change_ctx_t_
    /**
     * Flags passed from the vnet notify function
     */
} adj_nbr_interface_state_change_ctx_t;

/* per-adj walk callback: back-walk the FIB graph for an admin up/down */
adj_nbr_interface_state_change_one (adj_index_t ai,
    /*
     * Back walk the graph to inform the forwarding entries
     * that this interface state has changed.
     */
    adj_nbr_interface_state_change_ctx_t *ctx = arg;

    fib_node_back_walk_ctx_t bw_ctx = {
        .fnbw_reason = (ctx->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP ?
                        FIB_NODE_BW_REASON_FLAG_INTERFACE_UP :
                        FIB_NODE_BW_REASON_FLAG_INTERFACE_DOWN),

    fib_walk_sync(FIB_NODE_TYPE_ADJ, ai, &bw_ctx);

    return (ADJ_WALK_RC_CONTINUE);
/* vnet callback: interface admin up/down -> walk its adjacencies */
static clib_error_t *
adj_nbr_interface_state_change (vnet_main_t * vnm,
    fib_protocol_t proto;

    /*
     * walk each adj on the interface and trigger a walk from that adj
     */
    for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
        adj_nbr_interface_state_change_ctx_t ctx = {

        adj_nbr_walk(sw_if_index, proto,
                     adj_nbr_interface_state_change_one,

VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION(adj_nbr_interface_state_change);
/* per-adj walk callback used when an interface is deleted */
adj_nbr_interface_delete_one (adj_index_t ai,
    /*
     * Back walk the graph to inform the forwarding entries
     * that this interface has been deleted.
     */
    fib_node_back_walk_ctx_t bw_ctx = {
        .fnbw_reason = FIB_NODE_BW_REASON_FLAG_INTERFACE_DELETE,

    fib_walk_sync(FIB_NODE_TYPE_ADJ, ai, &bw_ctx);

    return (ADJ_WALK_RC_CONTINUE);
/**
 * adj_nbr_interface_add_del
 *
 * Registered to receive interface Add and delete notifications
 */
static clib_error_t *
adj_nbr_interface_add_del (vnet_main_t * vnm,
    fib_protocol_t proto;

    /*
     * not interested in interface additions. we will not back walk
     * to resolve paths through newly added interfaces. Why? The control
     * plane should have the brains to add interfaces first, then routes.
     * So the case where there are paths with a interface that matches
     * one just created is the case where the path resolved through an
     * interface that was deleted, and still has not been removed. The
     * new interface added, is NO GUARANTEE that the interface being
     * added now, even though it may have the same sw_if_index, is the
     * same interface that the path needs. So tough!
     * If the control plane wants these routes to resolve it needs to
     * remove and add them again.
     */
    /* delete case: back-walk every adj on the interface */
    for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
        adj_nbr_walk(sw_if_index, proto,
                     adj_nbr_interface_delete_one,

VNET_SW_INTERFACE_ADD_DEL_FUNCTION(adj_nbr_interface_add_del);
/* CLI walk callback: print one adjacency in brief (non-detail) format */
adj_nbr_show_one (adj_index_t ai,
    vlib_cli_output (arg, "[@%d] %U",
                     format_ip_adjacency, ai,
                     FORMAT_IP_ADJACENCY_NONE);

    return (ADJ_WALK_RC_CONTINUE);
/**
 * CLI handler for "show adj nbr": with an adj index show that adj in
 * detail; with an interface show its adjacencies; otherwise show all.
 */
static clib_error_t *
adj_nbr_show (vlib_main_t * vm,
              unformat_input_t * input,
              vlib_cli_command_t * cmd)
    adj_index_t ai = ADJ_INDEX_INVALID;
    u32 sw_if_index = ~0;

    /* parse either an adj index or an interface name */
    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
        if (unformat (input, "%d", &ai))
        else if (unformat (input, "%U",
                           unformat_vnet_sw_interface, vnet_get_main(),

    if (ADJ_INDEX_INVALID != ai)
        vlib_cli_output (vm, "[@%d] %U",
                         format_ip_adjacency, ai,
                         FORMAT_IP_ADJACENCY_DETAIL);
    else if (~0 != sw_if_index)
        fib_protocol_t proto;

        for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
            adj_nbr_walk(sw_if_index, proto,
        /* no filter: dump every interface's table for both protocols */
        fib_protocol_t proto;

        for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
            vec_foreach_index(sw_if_index, adj_nbr_tables[proto])
                adj_nbr_walk(sw_if_index, proto,

/*
 * Show all neighbour adjacencies.
 *
 * @cliexstart{sh adj nbr}
 * [@2] ipv4 via 1.0.0.2 loop0: IP4: 00:00:22:aa:bb:cc -> 00:00:11:aa:bb:cc
 * [@3] mpls via 1.0.0.2 loop0: MPLS_UNICAST: 00:00:22:aa:bb:cc -> 00:00:11:aa:bb:cc
 * [@4] ipv4 via 1.0.0.3 loop0: IP4: 00:00:22:aa:bb:cc -> 00:00:11:aa:bb:cc
 * [@5] mpls via 1.0.0.3 loop0: MPLS_UNICAST: 00:00:22:aa:bb:cc -> 00:00:11:aa:bb:cc
 */
VLIB_CLI_COMMAND (ip4_show_fib_command, static) = {
    .path = "show adj nbr",
    .short_help = "show adj nbr [<adj_index>] [interface]",
    .function = adj_nbr_show,
/*
 * Map a FIB protocol to the ip46 address type used when formatting the
 * next-hop; non-IP protocols fall back to IP4.
 */
adj_proto_to_46 (fib_protocol_t proto)
    case FIB_PROTOCOL_IP4:
        return (IP46_TYPE_IP4);
    case FIB_PROTOCOL_IP6:
        return (IP46_TYPE_IP6);
    return (IP46_TYPE_IP4);
    return (IP46_TYPE_IP4);
/* format an incomplete (awaiting ARP/ND) neighbour adjacency:
 * "arp-<link>: via <next-hop> <interface>" */
format_adj_nbr_incomplete (u8* s, va_list *ap)
    index_t index = va_arg(ap, index_t);
    CLIB_UNUSED(u32 indent) = va_arg(ap, u32);
    vnet_main_t * vnm = vnet_get_main();
    ip_adjacency_t * adj = adj_get(index);

    s = format (s, "arp-%U", format_fib_link, adj->ia_link);
    s = format (s, ": via %U",
                format_ip46_address, &adj->sub_type.nbr.next_hop,
                adj_proto_to_46(adj->ia_nh_proto));
    s = format (s, " %U",
                format_vnet_sw_interface_name,
                vnet_get_sw_interface(vnm,
                                      adj->rewrite_header.sw_if_index));
/* format a complete neighbour adjacency: "<link> via <next-hop> <rewrite>" */
format_adj_nbr (u8* s, va_list *ap)
    index_t index = va_arg(ap, index_t);
    CLIB_UNUSED(u32 indent) = va_arg(ap, u32);
    vnet_main_t * vnm = vnet_get_main();
    ip_adjacency_t * adj = adj_get(index);

    s = format (s, "%U", format_fib_link, adj->ia_link);
    s = format (s, " via %U ",
                format_ip46_address, &adj->sub_type.nbr.next_hop,
                adj_proto_to_46(adj->ia_nh_proto));
    /* append the rewrite string itself */
    vnm->vlib_main, &adj->rewrite_header, sizeof (adj->rewrite_data), 0);
/* DPO vft: take a reference on the adjacency */
adj_dpo_lock (dpo_id_t *dpo)
    adj_lock(dpo->dpoi_index);

/* DPO vft: release a reference on the adjacency */
adj_dpo_unlock (dpo_id_t *dpo)
    adj_unlock(dpo->dpoi_index);

    /* report adjacency pool memory usage to the fib memory CLI */
    fib_show_memory_usage("Adjacency",
                          sizeof(ip_adjacency_t));
/* virtual function table for complete neighbour-adjacency DPOs */
const static dpo_vft_t adj_nbr_dpo_vft = {
    .dv_lock = adj_dpo_lock,
    .dv_unlock = adj_dpo_unlock,
    .dv_format = format_adj_nbr,
    .dv_mem_show = adj_mem_show,
/* virtual function table for incomplete (unresolved) adjacency DPOs */
const static dpo_vft_t adj_nbr_incompl_dpo_vft = {
    .dv_lock = adj_dpo_lock,
    .dv_unlock = adj_dpo_unlock,
    .dv_format = format_adj_nbr_incomplete,
/**
 * @brief The per-protocol VLIB graph nodes that are assigned to an adjacency
 *
 * this means that these graph nodes are ones from which a nbr is the
 * parent object in the DPO-graph.
 */
const static char* const nbr_ip4_nodes[] =
    "ip4-rewrite-transit",
const static char* const nbr_ip6_nodes[] =
const static char* const nbr_mpls_nodes[] =
const static char* const nbr_ethernet_nodes[] =
/* per-DPO-protocol table of the node-name lists above */
const static char* const * const nbr_nodes[DPO_PROTO_NUM] =
    [DPO_PROTO_IP4]  = nbr_ip4_nodes,
    [DPO_PROTO_IP6]  = nbr_ip6_nodes,
    [DPO_PROTO_MPLS] = nbr_mpls_nodes,
    [DPO_PROTO_ETHERNET] = nbr_ethernet_nodes,

/* graph nodes parented on an INCOMPLETE adjacency (resolution paths) */
const static char* const nbr_incomplete_ip4_nodes[] =
const static char* const nbr_incomplete_ip6_nodes[] =
    "ip6-discover-neighbor",
const static char* const nbr_incomplete_mpls_nodes[] =
    "mpls-adj-incomplete",
/* per-DPO-protocol table of the incomplete-adjacency node-name lists */
const static char* const * const nbr_incomplete_nodes[DPO_PROTO_NUM] =
    [DPO_PROTO_IP4]  = nbr_incomplete_ip4_nodes,
    [DPO_PROTO_IP6]  = nbr_incomplete_ip6_nodes,
    [DPO_PROTO_MPLS] = nbr_incomplete_mpls_nodes,
/* module init: register complete and incomplete adjacency DPO types */
adj_nbr_module_init (void)
    dpo_register(DPO_ADJACENCY,
    dpo_register(DPO_ADJACENCY_INCOMPLETE,
                 &adj_nbr_incompl_dpo_vft,
                 nbr_incomplete_nodes);