ip: Path MTU 35/30535/25
authorNeale Ranns <neale@graphiant.com>
Mon, 21 Dec 2020 08:29:34 +0000 (08:29 +0000)
committerOle Trøan <otroan@employees.org>
Mon, 15 Feb 2021 17:27:48 +0000 (17:27 +0000)
Type: feature

Support setting the MTU for a peer on an interface. The minimum value of
the path and interface MTU is used at forwarding time.

The path MTU is specified for a given peer, by address and table-ID.
In the forwarding plane the MTU is enforced either:
 1 - if the peer is attached, then the MTU is set on the peer's
adjacency
 2 - if the peer is not attached, it is remote, then a DPO is added to
the peer's FIB entry to perform the necessary fragmentation.

Signed-off-by: Neale Ranns <neale@graphiant.com>
Change-Id: I8b9ea6a07868b50e97e2561f18d9335407dea7ae

26 files changed:
src/plugins/unittest/fib_test.c
src/vnet/CMakeLists.txt
src/vnet/adj/adj.c
src/vnet/adj/adj.h
src/vnet/adj/adj_glean.c
src/vnet/adj/adj_internal.h
src/vnet/adj/adj_mcast.c
src/vnet/adj/adj_midchain.c
src/vnet/adj/adj_nbr.c
src/vnet/adj/adj_nbr.h
src/vnet/dpo/dpo.c
src/vnet/dpo/dpo.h
src/vnet/dpo/load_balance.c
src/vnet/dpo/mpls_label_dpo.c
src/vnet/fib/fib_entry.c
src/vnet/fib/fib_node.h
src/vnet/fib/fib_path.c
src/vnet/gre/gre.c
src/vnet/ip/ip.api
src/vnet/ip/ip_api.c
src/vnet/ip/ip_path_mtu.c [new file with mode: 0644]
src/vnet/ip/ip_path_mtu.h [new file with mode: 0644]
src/vnet/ip/ip_path_mtu_node.c [new file with mode: 0644]
test/test_ip4.py
test/test_ip6.py
test/vpp_ip.py

index b9b77ba..c6291fb 100644 (file)
@@ -5869,10 +5869,14 @@ fib_test_ae (void)
 static int
 fib_test_pref (void)
 {
-    test_main_t *tm = &test_main;
-    int res;
+    test_main_t *tm;
+    ip4_main_t *im;
+    int res, i;
 
+    tm = &test_main;
+    im = &ip4_main;
     res = 0;
+
     const fib_prefix_t pfx_1_1_1_1_s_32 = {
         .fp_len = 32,
         .fp_proto = FIB_PROTOCOL_IP4,
@@ -5883,6 +5887,11 @@ fib_test_pref (void)
         },
     };
 
+    vec_validate(im->fib_index_by_sw_if_index, tm->hw[2]->sw_if_index);
+
+    for (i = 0; i <= 2; i++)
+        im->fib_index_by_sw_if_index[tm->hw[i]->sw_if_index] = 0;
+
     /*
      * 2 high, 2 medium and 2 low preference non-recursive paths
      */
@@ -9049,12 +9058,25 @@ static int
 fib_test_inherit (void)
 {
     fib_node_index_t fei;
+    int n_feis, res, i;
     test_main_t *tm;
-    int n_feis, res;
+    ip4_main_t *im4;
+    ip6_main_t *im6;
 
+    tm = &test_main;
+    im4 = &ip4_main;
+    im6 = &ip6_main;
     res = 0;
+
+    vec_validate(im4->fib_index_by_sw_if_index, tm->hw[2]->sw_if_index);
+    vec_validate(im6->fib_index_by_sw_if_index, tm->hw[2]->sw_if_index);
+
+    for (i = 0; i <= 2; i++)
+    {
+        im4->fib_index_by_sw_if_index[tm->hw[i]->sw_if_index] = 0;
+        im6->fib_index_by_sw_if_index[tm->hw[i]->sw_if_index] = 0;
+    }
     n_feis = fib_entry_pool_size();
-    tm = &test_main;
 
     const ip46_address_t nh_10_10_10_1 = {
         .ip4.as_u32 = clib_host_to_net_u32(0x0a0a0a01),
index 0e1d9c4..38aeda5 100644 (file)
@@ -415,6 +415,8 @@ list(APPEND VNET_SOURCES
   ip/ip_interface.c
   ip/ip_init.c
   ip/ip_in_out_acl.c
+  ip/ip_path_mtu.c
+  ip/ip_path_mtu_node.c
   ip/ip_punt_drop.c
   ip/ip_types.c
   ip/lookup.c
@@ -437,6 +439,7 @@ list(APPEND VNET_MULTIARCH_SOURCES
   ip/ip6_punt_drop.c
   ip/punt_node.c
   ip/ip_in_out_acl.c
+  ip/ip_path_mtu_node.c
 )
 
 list(APPEND VNET_HEADERS
index d389022..8808294 100644 (file)
@@ -20,6 +20,7 @@
 #include <vnet/adj/adj_mcast.h>
 #include <vnet/adj/adj_delegate.h>
 #include <vnet/fib/fib_node_list.h>
+#include <vnet/fib/fib_walk.h>
 
 /* Adjacency packet/byte counters indexed by adjacency index. */
 vlib_combined_counter_main_t adjacency_counters = {
@@ -326,6 +327,16 @@ adj_dpo_get_urpf (const dpo_id_t *dpo)
     return (adj->rewrite_header.sw_if_index);
 }
 
+u16
+adj_dpo_get_mtu (const dpo_id_t *dpo)
+{
+    ip_adjacency_t *adj;
+
+    adj = adj_get(dpo->dpoi_index);
+
+    return (adj->rewrite_header.max_l3_packet_bytes);
+}
+
 void
 adj_lock (adj_index_t adj_index)
 {
@@ -465,6 +476,19 @@ adj_mtu_update_walk_cb (adj_index_t ai,
 
     vnet_rewrite_update_mtu (vnet_get_main(), adj->ia_link,
                              &adj->rewrite_header);
+    adj_delegate_adj_modified(adj);
+
+    /**
+     * Backwalk to all Path MTU trackers, casual like ..
+     */
+    {
+       fib_node_back_walk_ctx_t bw_ctx = {
+           .fnbw_reason = FIB_NODE_BW_REASON_FLAG_ADJ_MTU,
+       };
+
+       fib_walk_async(FIB_NODE_TYPE_ADJ, ai,
+                       FIB_WALK_PRIORITY_LOW, &bw_ctx);
+    }
 
     return (ADJ_WALK_RC_CONTINUE);
 }
index 44bb2bd..c1922c7 100644 (file)
@@ -373,6 +373,9 @@ STATIC_ASSERT ((STRUCT_OFFSET_OF (ip_adjacency_t, cacheline3) ==
 /* An adj fits into 4 cachelines on your average machine */
 STATIC_ASSERT_SIZEOF (ip_adjacency_t, 4 * 64);
 #endif
+STATIC_ASSERT ((STRUCT_OFFSET_OF (ip_adjacency_t, sub_type.nbr.next_hop) ==
+                STRUCT_OFFSET_OF (ip_adjacency_t, sub_type.midchain.next_hop)),
+              "IP adjacency nbr and midchain offsets don't match");
 
 /**
  * @brief
index c52e3d0..e956318 100644 (file)
@@ -467,6 +467,7 @@ const static dpo_vft_t adj_glean_dpo_vft = {
     .dv_unlock = adj_dpo_unlock,
     .dv_format = format_adj_glean,
     .dv_get_urpf = adj_dpo_get_urpf,
+    .dv_get_mtu = adj_dpo_get_mtu,
 };
 
 /**
index 6639d32..253c1e9 100644 (file)
@@ -126,6 +126,7 @@ extern void adj_mcast_remove(fib_protocol_t proto,
 extern void adj_midchain_teardown(ip_adjacency_t *adj);
 
 extern u32 adj_dpo_get_urpf(const dpo_id_t *dpo);
+extern u16 adj_dpo_get_mtu(const dpo_id_t *dpo);
 
 /*
  * Adj BFD
index 5906522..a20f61f 100644 (file)
@@ -388,12 +388,14 @@ const static dpo_vft_t adj_mcast_dpo_vft = {
     .dv_unlock = adj_dpo_unlock,
     .dv_format = format_adj_mcast,
     .dv_get_urpf = adj_dpo_get_urpf,
+    .dv_get_mtu = adj_dpo_get_mtu,
 };
 const static dpo_vft_t adj_mcast_midchain_dpo_vft = {
     .dv_lock = adj_dpo_lock,
     .dv_unlock = adj_dpo_unlock,
     .dv_format = format_adj_mcast_midchain,
     .dv_get_urpf = adj_dpo_get_urpf,
+    .dv_get_mtu = adj_dpo_get_mtu,
 };
 
 /**
index a21cd21..3d879e9 100644 (file)
@@ -744,6 +744,7 @@ const static dpo_vft_t adj_midchain_dpo_vft = {
     .dv_unlock = adj_dpo_unlock,
     .dv_format = format_adj_midchain,
     .dv_get_urpf = adj_dpo_get_urpf,
+    .dv_get_mtu = adj_dpo_get_mtu,
 };
 
 /**
index 921588a..811d0b8 100644 (file)
@@ -222,6 +222,27 @@ adj_nbr_alloc (fib_protocol_t nh_proto,
     return (adj);
 }
 
+void
+adj_nbr_set_mtu (adj_index_t adj_index, u16 mtu)
+{
+    ip_adjacency_t *adj;
+
+    ASSERT(ADJ_INDEX_INVALID != adj_index);
+
+    adj = adj_get(adj_index);
+
+    if (0 == mtu)
+        vnet_rewrite_update_mtu(vnet_get_main(), adj->ia_link,
+                                &adj->rewrite_header);
+    else
+    {
+        vnet_rewrite_update_mtu(vnet_get_main(), adj->ia_link,
+                                &adj->rewrite_header);
+        adj->rewrite_header.max_l3_packet_bytes =
+            clib_min (adj->rewrite_header.max_l3_packet_bytes, mtu);
+    }
+}
+
 /*
  * adj_nbr_add_or_lock
  *
@@ -268,13 +289,13 @@ adj_nbr_add_or_lock (fib_protocol_t nh_proto,
         * So ask the interface to do it.
         */
        vnet_update_adjacency_for_sw_interface(vnm, sw_if_index, adj_index);
+        adj_delegate_adj_created(adj_get(adj_index));
     }
     else
     {
        adj_lock(adj_index);
     }
 
-    adj_delegate_adj_created(adj_get(adj_index));
     return (adj_index);
 }
 
@@ -1055,12 +1076,14 @@ const static dpo_vft_t adj_nbr_dpo_vft = {
     .dv_format = format_adj_nbr,
     .dv_mem_show = adj_mem_show,
     .dv_get_urpf = adj_dpo_get_urpf,
+    .dv_get_mtu = adj_dpo_get_mtu,
 };
 const static dpo_vft_t adj_nbr_incompl_dpo_vft = {
     .dv_lock = adj_dpo_lock,
     .dv_unlock = adj_dpo_unlock,
     .dv_format = format_adj_nbr_incomplete,
     .dv_get_urpf = adj_dpo_get_urpf,
+    .dv_get_mtu = adj_dpo_get_mtu,
 };
 
 /**
index 3a89dc8..4874e73 100644 (file)
@@ -75,6 +75,13 @@ extern adj_index_t adj_nbr_add_or_lock_w_rewrite(fib_protocol_t nh_proto,
                                                 const ip46_address_t *nh_addr,
                                                 u32 sw_if_index,
                                                 u8 *rewrite);
+
+/**
+ * Set the MTU on an adjacency
+ *
+ */
+extern void adj_nbr_set_mtu(adj_index_t ai, u16 mtu);
+
 /**
  * @brief When adding a rewrite to an adjacency these are flags that
  * apply to that rewrite
index 1331b55..d8342ff 100644 (file)
@@ -23,6 +23,8 @@
  * The VLIB graph nodes are graph of types, the DPO graph is a graph of instances.
  */
 
+// clang-format off
+
 #include <vnet/dpo/dpo.h>
 #include <vnet/ip/lookup.h>
 #include <vnet/ip/format.h>
@@ -395,6 +397,18 @@ dpo_get_urpf(const dpo_id_t *dpo)
     return (~0);
 }
 
+u16
+dpo_get_mtu(const dpo_id_t *dpo)
+{
+    if (dpo_id_is_valid(dpo) &&
+        (NULL != dpo_vfts[dpo->dpoi_type].dv_get_mtu))
+    {
+        return (dpo_vfts[dpo->dpoi_type].dv_get_mtu(dpo));
+    }
+
+    return (0xffff);
+}
+
 static u32
 dpo_get_next_node (dpo_type_t child_type,
                    dpo_proto_t child_proto,
@@ -649,3 +663,5 @@ VLIB_CLI_COMMAND (show_fib_memory, static) = {
     .short_help = "show dpo memory",
 };
 /* *INDENT-ON* */
+
+// clang-format on
index ee4990d..e9976c2 100644 (file)
@@ -24,6 +24,8 @@
  * instances.
  */
 
+// clang-format off
+
 #ifndef __DPO_H__
 #define __DPO_H__
 
@@ -361,6 +363,16 @@ extern void dpo_stack_from_node(u32 child_node,
  */
 extern u32 dpo_get_urpf(const dpo_id_t *dpo);
 
+/**
+ * Get the MTU DPO
+ *
+ * @param dpo
+ *  The DPO from which to get the MTU
+ *
+ * @return MTU (0xffff if no more useful value was available)
+ */
+extern u16 dpo_get_mtu(const dpo_id_t *dpo);
+
 /**
  * @brief  A lock function registered for a DPO type
  */
@@ -388,6 +400,11 @@ typedef u32* (*dpo_get_next_node_t)(const dpo_id_t *dpo);
  */
 typedef u32 (*dpo_get_urpf_t)(const dpo_id_t *dpo);
 
+/**
+ * @brief Given a DPO instance return the MTU
+ */
+typedef u16 (*dpo_get_mtu_t)(const dpo_id_t *dpo);
+
 /**
  * @brief Called during FIB interposition when the originally
  * registered DPO is used to 'clone' an instance for interposition
@@ -432,6 +449,10 @@ typedef struct dpo_vft_t_
      * Get uRPF interface
      */
     dpo_get_urpf_t dv_get_urpf;
+    /**
+     * Get MTU
+     */
+    dpo_get_mtu_t dv_get_mtu;
     /**
      * Signal on an interposed child that the parent has changed
      */
@@ -548,3 +569,5 @@ do {                                                                    \
     if ((YESNO)) vlib_worker_thread_barrier_release((VM));
 
 #endif
+
+// clang-format on
index fb876a0..a212532 100644 (file)
@@ -25,6 +25,8 @@
 #include <vnet/ip/ip4_inlines.h>
 #include <vnet/ip/ip6_inlines.h>
 
+// clang-format off
+
 /*
  * distribution error tolerance for load-balancing
  */
@@ -918,11 +920,30 @@ load_balance_mem_show (void)
     load_balance_map_show_mem();
 }
 
+static u16
+load_balance_dpo_get_mtu (const dpo_id_t *dpo)
+{
+    const dpo_id_t *buckets;
+    load_balance_t *lb;
+    u16 i, mtu = 0xffff;
+
+    lb = load_balance_get(dpo->dpoi_index);
+    buckets = load_balance_get_buckets(lb);
+
+    for (i = 0; i < lb->lb_n_buckets; i++)
+    {
+        mtu = clib_min (mtu, dpo_get_mtu (&buckets[i]));
+    }
+
+    return (mtu);
+}
+
 const static dpo_vft_t lb_vft = {
     .dv_lock = load_balance_lock,
     .dv_unlock = load_balance_unlock,
     .dv_format = format_load_balance_dpo,
     .dv_mem_show = load_balance_mem_show,
+    .dv_get_mtu = load_balance_dpo_get_mtu,
 };
 
 /**
@@ -1323,3 +1344,5 @@ VLIB_REGISTER_NODE (bier_load_balance_node) = {
   .format_trace = format_bier_load_balance_trace,
   .sibling_of = "mpls-load-balance",
 };
+
+// clang-format on
index 683b544..b87cb1e 100644 (file)
@@ -18,6 +18,8 @@
 #include <vnet/mpls/mpls.h>
 #include <vnet/dpo/drop_dpo.h>
 
+// clang-format off
+
 #ifndef CLIB_MARCH_VARIANT
 /*
  * pool of all MPLS Label DPOs
@@ -1213,12 +1215,25 @@ mpls_label_interpose (const dpo_id_t *original,
             mpls_label_dpo_get_index(mld_clone));
 }
 
+static u16
+mpls_label_dpo_get_mtu (const dpo_id_t *dpo)
+{
+    mpls_label_dpo_t *mld;
+
+    mld = mpls_label_dpo_get(dpo->dpoi_index);
+
+    /* return the parent's MTU minus the amount of header
+     * this DPO imposes */
+    return (dpo_get_mtu (&mld->mld_dpo) - sizeof(mpls_label_t) * mld->mld_n_labels);
+}
+
 const static dpo_vft_t mld_vft = {
     .dv_lock = mpls_label_dpo_lock,
     .dv_unlock = mpls_label_dpo_unlock,
     .dv_format = format_mpls_label_dpo,
     .dv_mem_show = mpls_label_dpo_mem_show,
     .dv_mk_interpose = mpls_label_interpose,
+    .dv_get_mtu = mpls_label_dpo_get_mtu,
 };
 
 const static char* const mpls_label_imp_pipe_ip4_nodes[] =
@@ -1337,3 +1352,5 @@ mpls_label_dpo_get_type (mpls_label_dpo_flags_t flags)
     return (mpls_label_dpo_types[flags]);
 }
 #endif /* CLIB_MARCH_VARIANT */
+
+// clang-format on
index 6edf31b..119a7ac 100644 (file)
@@ -1362,7 +1362,7 @@ fib_entry_cover_updated (fib_node_index_t fib_entry_index)
        if (0 == index)
        {
            /*
-            * only the best source gets to set the back walk flags
+            * only the best source gets to set the install result
             */
            res = fib_entry_src_action_cover_update(fib_entry, esrc);
             bflags = fib_entry_src_get_flags(esrc);
@@ -1370,7 +1370,23 @@ fib_entry_cover_updated (fib_node_index_t fib_entry_index)
        }
        else
        {
-           fib_entry_src_action_cover_update(fib_entry, esrc);
+           /*
+            * contributing sources can set backwalk flags
+            */
+            if (esrc->fes_flags & FIB_ENTRY_SRC_FLAG_CONTRIBUTING)
+            {
+                fib_entry_src_cover_res_t tmp = {
+                    .install = !0,
+                    .bw_reason = FIB_NODE_BW_REASON_FLAG_NONE,
+                };
+
+                tmp = fib_entry_src_action_cover_update(fib_entry, esrc);
+                res.bw_reason |= tmp.bw_reason;
+            }
+            else
+            {
+                fib_entry_src_action_cover_update(fib_entry, esrc);
+            }
        }
        index++;
     }));
index 5cf9182..27e67b1 100644 (file)
@@ -118,6 +118,10 @@ typedef enum fib_node_back_walk_reason_t_ {
      * a unipath adjacency changes
      */
     FIB_NODE_BW_REASON_ADJ_UPDATE,
+    /**
+     * Walk to update the adjacency MTU
+     */
+    FIB_NODE_BW_REASON_ADJ_MTU,
     /**
      * Walk to update children to inform them the adjacency is now down.
      */
@@ -135,6 +139,7 @@ typedef enum fib_node_back_walk_reason_t_ {
     [FIB_NODE_BW_REASON_INTERFACE_DOWN] = "if-down",        \
     [FIB_NODE_BW_REASON_INTERFACE_DELETE] = "if-delete",    \
     [FIB_NODE_BW_REASON_ADJ_UPDATE] = "adj-update",         \
+    [FIB_NODE_BW_REASON_ADJ_MTU] = "adj-mtu",               \
     [FIB_NODE_BW_REASON_ADJ_DOWN] = "adj-down",             \
 }
 
@@ -154,6 +159,7 @@ typedef enum fib_node_bw_reason_flag_t_ {
     FIB_NODE_BW_REASON_FLAG_INTERFACE_DOWN = (1 << FIB_NODE_BW_REASON_INTERFACE_DOWN),
     FIB_NODE_BW_REASON_FLAG_INTERFACE_DELETE = (1 << FIB_NODE_BW_REASON_INTERFACE_DELETE),
     FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE = (1 << FIB_NODE_BW_REASON_ADJ_UPDATE),
+    FIB_NODE_BW_REASON_FLAG_ADJ_MTU = (1 << FIB_NODE_BW_REASON_ADJ_MTU),
     FIB_NODE_BW_REASON_FLAG_ADJ_DOWN = (1 << FIB_NODE_BW_REASON_ADJ_DOWN),
 } __attribute__ ((packed)) fib_node_bw_reason_flag_t;
 
index f48b644..01140d5 100644 (file)
@@ -999,6 +999,7 @@ fib_path_back_walk_notify (fib_node_t *node,
                &path->fp_dpo);
        }
        if ((FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE & ctx->fnbw_reason) ||
+            (FIB_NODE_BW_REASON_FLAG_ADJ_MTU    & ctx->fnbw_reason) ||
             (FIB_NODE_BW_REASON_FLAG_ADJ_DOWN   & ctx->fnbw_reason))
        {
            /*
index 0669c67..fcdf9c0 100644 (file)
@@ -495,8 +495,13 @@ mgre_update_adj (vnet_main_t * vnm, u32 sw_if_index, adj_index_t ai)
                           adj->ia_nh_proto, &adj->sub_type.nbr.next_hop);
 
   if (NULL == ne)
-    // no NHRP entry to provide the next-hop
-    return;
+    {
+      // no TEIB entry to provide the next-hop
+      adj_nbr_midchain_update_rewrite (
+       ai, gre_get_fixup (t->tunnel_dst.fp_proto, adj_get_link_type (ai)),
+       uword_to_pointer (t->flags, void *), ADJ_FLAG_NONE, NULL);
+      return;
+    }
 
   mgre_walk_ctx_t ctx = {
     .t = t,
index 3072e3e..f49fc16 100644 (file)
@@ -704,6 +704,63 @@ autoreply define ip_reassembly_enable_disable
   vl_api_ip_reass_type_t type;
 };
 
+/**
+    @brief Set a Path MTU value, i.e. an MTU value for a given neighbour.
+           The neighbour can be described as attached (w/ interface and next-hop)
+           or remote (w/ table_id and next-hop);
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param table_id - table-ID for next-hop
+    @param nh - Next hop
+    @param path_mtu - value to set, 0 is disable.
+*/
+typedef ip_path_mtu
+{
+  u32 client_index;
+  u32 context;
+  u32 table_id;
+  vl_api_address_t nh;
+  u16 path_mtu;
+};
+autoreply define ip_path_mtu_update
+{
+  u32 client_index;
+  u32 context;
+  vl_api_ip_path_mtu_t pmtu;
+};
+define ip_path_mtu_get
+{
+  u32 client_index;
+  u32 context;
+  u32 cursor;
+};
+define ip_path_mtu_get_reply
+{
+  u32 context;
+  i32 retval;
+  u32 cursor;
+};
+define ip_path_mtu_details
+{
+  u32 context;
+  vl_api_ip_path_mtu_t pmtu;
+};
+service {
+  rpc ip_path_mtu_get returns ip_path_mtu_get_reply
+    stream ip_path_mtu_details;
+};
+
+autoreply define ip_path_mtu_replace_begin
+{
+  u32 client_index;
+  u32 context;
+};
+autoreply define ip_path_mtu_replace_end
+{
+  u32 client_index;
+  u32 context;
+};
+
 /*
  * Local Variables:
  * eval: (c-set-style "gnu")
index 3bf404b..5b87f7c 100644 (file)
@@ -28,6 +28,7 @@
 #include <vnet/ip/ip_types_api.h>
 #include <vnet/ip/ip_punt_drop.h>
 #include <vnet/ip/ip_types_api.h>
+#include <vnet/ip/ip_path_mtu.h>
 #include <vnet/fib/fib_table.h>
 #include <vnet/fib/fib_api.h>
 #include <vnet/ethernet/arp_packet.h>
   _ (IP_REASSEMBLY_SET, ip_reassembly_set)                                    \
   _ (IP_REASSEMBLY_GET, ip_reassembly_get)                                    \
   _ (IP_REASSEMBLY_ENABLE_DISABLE, ip_reassembly_enable_disable)              \
-  _ (IP_PUNT_REDIRECT_DUMP, ip_punt_redirect_dump)
+  _ (IP_PUNT_REDIRECT_DUMP, ip_punt_redirect_dump)                            \
+  _ (IP_PATH_MTU_UPDATE, ip_path_mtu_update)                                  \
+  _ (IP_PATH_MTU_REPLACE_BEGIN, ip_path_mtu_replace_begin)                    \
+  _ (IP_PATH_MTU_REPLACE_END, ip_path_mtu_replace_end)                        \
+  _ (IP_PATH_MTU_GET, ip_path_mtu_get)
 
 static void
   vl_api_sw_interface_ip6_enable_disable_t_handler
@@ -1134,18 +1139,18 @@ static void
   REPLY_MACRO (VL_API_IP_CONTAINER_PROXY_ADD_DEL_REPLY);
 }
 
-typedef struct ip_container_proxy_walk_ctx_t_
+typedef struct ip_walk_ctx_t_
 {
   vl_api_registration_t *reg;
   u32 context;
-} ip_container_proxy_walk_ctx_t;
+} ip_walk_ctx_t;
 
 static int
 ip_container_proxy_send_details (const fib_prefix_t * pfx, u32 sw_if_index,
                                 void *args)
 {
   vl_api_ip_container_proxy_details_t *mp;
-  ip_container_proxy_walk_ctx_t *ctx = args;
+  ip_walk_ctx_t *ctx = args;
 
   mp = vl_msg_api_alloc (sizeof (*mp));
   if (!mp)
@@ -1173,7 +1178,7 @@ vl_api_ip_container_proxy_dump_t_handler (vl_api_ip_container_proxy_dump_t *
   if (!reg)
     return;
 
-  ip_container_proxy_walk_ctx_t ctx = {
+  ip_walk_ctx_t ctx = {
     .context = mp->context,
     .reg = reg,
   };
@@ -1624,21 +1629,15 @@ void
   REPLY_MACRO (VL_API_IP_REASSEMBLY_ENABLE_DISABLE_REPLY);
 }
 
-typedef struct ip_punt_redirect_walk_ctx_t_
-{
-  vl_api_registration_t *reg;
-  u32 context;
-} ip_punt_redirect_walk_ctx_t;
-
 static walk_rc_t
 send_ip_punt_redirect_details (u32 rx_sw_if_index,
                               const ip_punt_redirect_rx_t * ipr, void *arg)
 {
-  ip_punt_redirect_walk_ctx_t *ctx = arg;
   vl_api_ip_punt_redirect_details_t *mp;
   fib_path_encode_ctx_t path_ctx = {
     .rpaths = NULL,
   };
+  ip_walk_ctx_t *ctx = arg;
 
   mp = vl_msg_api_alloc (sizeof (*mp));
   if (!mp)
@@ -1676,7 +1675,7 @@ vl_api_ip_punt_redirect_dump_t_handler (vl_api_ip_punt_redirect_dump_t * mp)
   if (mp->is_ipv6 == 1)
     fproto = FIB_PROTOCOL_IP6;
 
-  ip_punt_redirect_walk_ctx_t ctx = {
+  ip_walk_ctx_t ctx = {
     .reg = reg,
     .context = mp->context,
   };
@@ -1699,6 +1698,73 @@ vl_api_ip_punt_redirect_dump_t_handler (vl_api_ip_punt_redirect_dump_t * mp)
     ip_punt_redirect_walk (fproto, send_ip_punt_redirect_details, &ctx);
 }
 
+void
+vl_api_ip_path_mtu_update_t_handler (vl_api_ip_path_mtu_update_t *mp)
+{
+  vl_api_ip_path_mtu_update_reply_t *rmp;
+  ip_address_t nh;
+  int rv = 0;
+
+  ip_address_decode2 (&mp->pmtu.nh, &nh);
+
+  rv = ip_path_mtu_update (&nh, ntohl (mp->pmtu.table_id),
+                          ntohs (mp->pmtu.path_mtu));
+
+  REPLY_MACRO (VL_API_IP_PATH_MTU_UPDATE_REPLY);
+}
+
+void
+vl_api_ip_path_mtu_replace_begin_t_handler (
+  vl_api_ip_path_mtu_replace_begin_t *mp)
+{
+  vl_api_ip_path_mtu_replace_begin_reply_t *rmp;
+  int rv;
+
+  rv = ip_path_mtu_replace_begin ();
+
+  REPLY_MACRO (VL_API_IP_PATH_MTU_REPLACE_BEGIN_REPLY);
+}
+
+void
+vl_api_ip_path_mtu_replace_end_t_handler (vl_api_ip_path_mtu_replace_end_t *mp)
+{
+  vl_api_ip_path_mtu_replace_end_reply_t *rmp;
+  int rv;
+
+  rv = ip_path_mtu_replace_end ();
+
+  REPLY_MACRO (VL_API_IP_PATH_MTU_REPLACE_END_REPLY);
+}
+
+static void
+send_ip_path_mtu_details (index_t ipti, vl_api_registration_t *rp, u32 context)
+{
+  vl_api_ip_path_mtu_details_t *rmp;
+  ip_address_t ip;
+  ip_pmtu_t *ipt;
+
+  ipt = ip_path_mtu_get (ipti);
+
+  REPLY_MACRO_DETAILS4 (VL_API_IP_PATH_MTU_DETAILS, rp, context, ({
+                         ip_pmtu_get_ip (ipt, &ip);
+                         ip_address_encode2 (&ip, &rmp->pmtu.nh);
+                         rmp->pmtu.table_id =
+                           htonl (ip_pmtu_get_table_id (ipt));
+                         rmp->pmtu.path_mtu = htons (ipt->ipt_cfg_pmtu);
+                       }));
+}
+
+static void
+vl_api_ip_path_mtu_get_t_handler (vl_api_ip_path_mtu_get_t *mp)
+{
+  vl_api_ip_path_mtu_get_reply_t *rmp;
+  i32 rv = 0;
+
+  REPLY_AND_DETAILS_MACRO (
+    VL_API_IP_PATH_MTU_GET_REPLY, ip_pmtu_pool,
+    ({ send_ip_path_mtu_details (cursor, rp, mp->context); }));
+}
+
 #define vl_msg_name_crc_list
 #include <vnet/ip/ip.api.h>
 #undef vl_msg_name_crc_list
diff --git a/src/vnet/ip/ip_path_mtu.c b/src/vnet/ip/ip_path_mtu.c
new file mode 100644 (file)
index 0000000..38adb44
--- /dev/null
@@ -0,0 +1,883 @@
+/*
+ *------------------------------------------------------------------
+ * ip_path_mtu.c
+ *
+ * Copyright (c) 2021 Graphiant.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vnet/ip/ip_path_mtu.h>
+#include <vnet/ip/ip_frag.h>
+#include <vnet/adj/adj_delegate.h>
+#include <vnet/adj/adj_nbr.h>
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/fib_entry_track.h>
+
+#include <vnet/dpo/drop_dpo.h>
+
+/**
+ * Path MTU
+ *
+ * A path is a peer. A peer is known by an IP address (in a table).
+ * Insert a DPO in the forwarding chain for the peer to perform the
+ * fragmentation.
+ * For attached peers, all traffic will use the peer's adjacency, there
+ * is already an MTU check in the adjacency (for the link's MTU) so as an
+ * optimisation, instead of using a DPO, we add a delegate to the adjacency
+ * to set the adjacency's MTU to the path MTU.
+ */
+
+/**
+ * the logger
+ */
+static vlib_log_class_t ip_pmtu_logger;
+
+static adj_delegate_type_t ip_pmtu_adj_delegate_type;
+static fib_source_t ip_pmtu_source;
+
+/**
+ * DPO pool
+ */
+ip_pmtu_dpo_t *ip_pmtu_dpo_pool;
+
+/**
+ * DPO type registered for the path MTU DPOs
+ */
+static dpo_type_t ip_pmtu_dpo_type;
+
+/**
+ * Fib node type for the tracker
+ */
+static fib_node_type_t ip_pmtu_fib_type;
+
+/**
+ * Path MTU tracker pool
+ */
+ip_pmtu_t *ip_pmtu_pool;
+
+/**
+ * Delegate added to adjacencies to track path MTU
+ */
+typedef struct ip_path_mtu_adj_delegate_t_
+{
+  u16 pmtu;
+} ip_path_mtu_adj_delegate_t;
+
+static ip_path_mtu_adj_delegate_t *ip_path_mtu_adj_delegate_pool;
+
+/* DB of all FIB PMTU settings */
+typedef struct ip_pmtu_key_t_
+{
+  ip46_address_t nh;
+  u32 table_id;
+  fib_protocol_t fproto;
+} __clib_packed ip_pmtu_key_t;
+
+static uword *ip_pmtu_db;
+
+#define IP_PMTU_TRKR_DBG(_ipt, _fmt, _args...)                                \
+  {                                                                           \
+    vlib_log_debug (ip_pmtu_logger, "[%U]: " _fmt ": ", format_ip_pmtu,       \
+                   _ipt - ip_pmtu_pool, ##_args);                            \
+  }
+#define IP_PMTU_DBG(_fmt, _args...)                                           \
+  {                                                                           \
+    vlib_log_debug (ip_pmtu_logger, _fmt ": ", ##_args);                      \
+  }
+
+static u8 *
+format_ip_pmtu_flags (u8 *s, va_list *ap)
+{
+  ip_pmtu_flags_t f = va_arg (*ap, ip_pmtu_flags_t);
+
+  if (0)
+    ;
+#define _(a, b, c) else if (f & IP_PMTU_FLAG_##a) s = format (s, "%s ", c);
+  foreach_ip_pmtu_flag
+#undef _
+
+    return (s);
+}
+
+u32
+ip_pmtu_get_table_id (const ip_pmtu_t *ipt)
+{
+  const fib_prefix_t *pfx;
+  u32 fib_index;
+
+  pfx = fib_entry_get_prefix (ipt->ipt_fib_entry);
+  fib_index = fib_entry_get_fib_index (ipt->ipt_fib_entry);
+
+  return (fib_table_get_table_id (fib_index, pfx->fp_proto));
+}
+
+void
+ip_pmtu_get_ip (const ip_pmtu_t *ipt, ip_address_t *ip)
+{
+  const fib_prefix_t *pfx;
+
+  pfx = fib_entry_get_prefix (ipt->ipt_fib_entry);
+  ip_address_from_46 (&pfx->fp_addr, pfx->fp_proto, ip);
+}
+
+static u8 *
+format_ip_pmtu (u8 *s, va_list *ap)
+{
+  ip_pmtu_t *ipt;
+  index_t ipti = va_arg (*ap, index_t);
+  const fib_prefix_t *pfx;
+  u32 fib_index;
+
+  ipt = pool_elt_at_index (ip_pmtu_pool, ipti);
+  pfx = fib_entry_get_prefix (ipt->ipt_fib_entry);
+  fib_index = fib_entry_get_fib_index (ipt->ipt_fib_entry);
+
+  s =
+    format (s, "[%d] [tbl:[%d:%d]] %U pmtu:[cfg:%d, oper:%d, parent:%d] [%U]",
+           ipti, ip_pmtu_get_table_id (ipt), fib_index, format_fib_prefix,
+           pfx, ipt->ipt_cfg_pmtu, ipt->ipt_oper_pmtu, ipt->ipt_parent_pmtu,
+           format_ip_pmtu_flags, ipt->ipt_flags);
+
+  return (s);
+}
+
+static u8 *
+format_ip_path_mtu_adj_delegate (const adj_delegate_t *aed, u8 *s)
+{
+  ip_path_mtu_adj_delegate_t *ip_adj;
+
+  ip_adj = pool_elt_at_index (ip_path_mtu_adj_delegate_pool, aed->ad_index);
+
+  s = format (s, "IP path-MTU: %d", ip_adj->pmtu);
+
+  return (s);
+}
+
+static void
+ip_pmtu_adj_delegate_adj_created (adj_index_t ai)
+{
+  ip_path_mtu_adj_delegate_t *ipp_ad;
+  const ip_pmtu_t *ipt;
+  ip_adjacency_t *adj;
+  u32 table_id;
+  uword *p;
+
+  adj = adj_get (ai);
+
+  switch (adj->lookup_next_index)
+    {
+    case IP_LOOKUP_NEXT_DROP:
+    case IP_LOOKUP_NEXT_PUNT:
+    case IP_LOOKUP_NEXT_LOCAL:
+    case IP_LOOKUP_NEXT_GLEAN:
+    case IP_LOOKUP_NEXT_MCAST:
+    case IP_LOOKUP_NEXT_BCAST:
+    case IP_LOOKUP_NEXT_MCAST_MIDCHAIN:
+    case IP_LOOKUP_NEXT_ICMP_ERROR:
+    case IP_LOOKUP_N_NEXT:
+      return;
+
+    case IP_LOOKUP_NEXT_ARP:
+    case IP_LOOKUP_NEXT_REWRITE:
+    case IP_LOOKUP_NEXT_MIDCHAIN:
+      break;
+    }
+
+  table_id = fib_table_get_table_id_for_sw_if_index (
+    adj->ia_nh_proto, adj->rewrite_header.sw_if_index);
+
+  ip_pmtu_key_t key = {
+    .nh = adj->sub_type.nbr.next_hop,
+    .table_id = table_id,
+    .fproto = adj->ia_nh_proto,
+  };
+
+  p = hash_get_mem (ip_pmtu_db, &key);
+
+  if (NULL == p)
+    return;
+
+  ipt = pool_elt_at_index (ip_pmtu_pool, p[0]);
+
+  pool_get (ip_path_mtu_adj_delegate_pool, ipp_ad);
+  ipp_ad->pmtu = ipt->ipt_cfg_pmtu;
+
+  adj_delegate_add (adj, ip_pmtu_adj_delegate_type,
+                   ipp_ad - ip_path_mtu_adj_delegate_pool);
+
+  adj_nbr_set_mtu (ai, ipp_ad->pmtu);
+
+  IP_PMTU_TRKR_DBG (ipt, "adj-added:", ai);
+}
+
+static void
+ip_pmtu_adj_delegate_adj_deleted (adj_delegate_t *ad)
+{
+  pool_put_index (ip_path_mtu_adj_delegate_pool, ad->ad_index);
+}
+
+static void
+ip_pmtu_adj_delegate_adj_modified (adj_delegate_t *ad)
+{
+  ip_path_mtu_adj_delegate_t *ipp_ad;
+
+  ipp_ad = pool_elt_at_index (ip_path_mtu_adj_delegate_pool, ad->ad_index);
+
+  adj_nbr_set_mtu (ad->ad_adj_index, ipp_ad->pmtu);
+}
+
+const adj_delegate_vft_t ip_path_adj_delegate_vft = {
+  .adv_format = format_ip_path_mtu_adj_delegate,
+  .adv_adj_deleted = ip_pmtu_adj_delegate_adj_deleted,
+  .adv_adj_modified = ip_pmtu_adj_delegate_adj_modified,
+  .adv_adj_created = ip_pmtu_adj_delegate_adj_created,
+};
+
+static bool
+ip_path_mtu_value_invalid (u16 pmtu)
+{
+  return (pmtu == 0 || pmtu == 0xffff);
+}
+
+static adj_walk_rc_t
+ip_ptmu_adj_walk_remove (adj_index_t ai, void *ctx)
+{
+  adj_delegate_t *ad;
+
+  ad = adj_delegate_get (adj_get (ai), ip_pmtu_adj_delegate_type);
+
+  if (ad)
+    {
+      adj_nbr_set_mtu (ai, 0);
+
+      pool_put_index (ip_path_mtu_adj_delegate_pool, ad->ad_index);
+      adj_delegate_remove (ai, ip_pmtu_adj_delegate_type);
+    }
+  return (ADJ_WALK_RC_CONTINUE);
+}
+
+static adj_walk_rc_t
+ip_ptmu_adj_walk_update (adj_index_t ai, void *ctx)
+{
+  ip_path_mtu_adj_delegate_t *ipp_ad;
+  adj_delegate_t *ad;
+  u16 *pmtup;
+
+  pmtup = ctx;
+  ad = adj_delegate_get (adj_get (ai), ip_pmtu_adj_delegate_type);
+
+  if (ad)
+    ipp_ad = pool_elt_at_index (ip_path_mtu_adj_delegate_pool, ad->ad_index);
+  else
+    {
+      pool_get (ip_path_mtu_adj_delegate_pool, ipp_ad);
+
+      adj_delegate_add (adj_get (ai), ip_pmtu_adj_delegate_type,
+                       ipp_ad - ip_path_mtu_adj_delegate_pool);
+    }
+
+  ipp_ad->pmtu = *pmtup;
+
+  adj_nbr_set_mtu (ai, ipp_ad->pmtu);
+
+  return (ADJ_WALK_RC_CONTINUE);
+}
+
+static ip_pmtu_dpo_t *
+ip_pmtu_dpo_alloc (void)
+{
+  ip_pmtu_dpo_t *ipm;
+
+  pool_get_aligned_zero (ip_pmtu_dpo_pool, ipm, sizeof (ip_pmtu_dpo_t));
+
+  return (ipm);
+}
+
+static ip_pmtu_dpo_t *
+ip_pmtu_dpo_get_from_dpo (const dpo_id_t *dpo)
+{
+  ASSERT (ip_pmtu_dpo_type == dpo->dpoi_type);
+
+  return (ip_pmtu_dpo_get (dpo->dpoi_index));
+}
+
+static index_t
+ip_pmtu_dpo_get_index (ip_pmtu_dpo_t *ipm)
+{
+  return (ipm - ip_pmtu_dpo_pool);
+}
+
+static void
+ip_pmtu_dpo_lock (dpo_id_t *dpo)
+{
+  ip_pmtu_dpo_t *ipm;
+
+  ipm = ip_pmtu_dpo_get_from_dpo (dpo);
+  ipm->ipm_locks++;
+}
+
+static void
+ip_pmtu_dpo_unlock (dpo_id_t *dpo)
+{
+  ip_pmtu_dpo_t *ipm;
+
+  ipm = ip_pmtu_dpo_get_from_dpo (dpo);
+  ipm->ipm_locks--;
+
+  if (0 == ipm->ipm_locks)
+    {
+      dpo_reset (&ipm->ipm_dpo);
+      pool_put (ip_pmtu_dpo_pool, ipm);
+    }
+}
+
+static u32
+ip_pmtu_dpo_get_urpf (const dpo_id_t *dpo)
+{
+  ip_pmtu_dpo_t *ipm;
+
+  ipm = ip_pmtu_dpo_get_from_dpo (dpo);
+
+  return (dpo_get_urpf (&ipm->ipm_dpo));
+}
+
+void
+ip_pmtu_dpo_add_or_lock (fib_protocol_t fproto, u16 pmtu, dpo_id_t *dpo)
+{
+  ip_pmtu_dpo_t *ipm;
+  dpo_id_t parent = DPO_INVALID;
+
+  ipm = ip_pmtu_dpo_alloc ();
+
+  ipm->ipm_proto = fib_proto_to_dpo (fproto);
+  ipm->ipm_pmtu = pmtu;
+
+  dpo_copy (&parent, drop_dpo_get (ipm->ipm_proto));
+  dpo_stack (ip_pmtu_dpo_type, ipm->ipm_proto, &ipm->ipm_dpo, &parent);
+  dpo_set (dpo, ip_pmtu_dpo_type, ipm->ipm_proto, ip_pmtu_dpo_get_index (ipm));
+}
+
+u8 *
+format_ip_pmtu_dpo (u8 *s, va_list *ap)
+{
+  index_t index = va_arg (*ap, index_t);
+  u32 indent = va_arg (*ap, u32);
+  ip_pmtu_dpo_t *ipm = ip_pmtu_dpo_get (index);
+
+  s = format (s, "ip-pmtu-dpo: %U, mtu:%d", format_dpo_proto, ipm->ipm_proto,
+             ipm->ipm_pmtu);
+  s = format (s, "\n%U", format_white_space, indent + 2);
+  s = format (s, "%U", format_dpo_id, &ipm->ipm_dpo, indent + 4);
+
+  return (s);
+}
+
+/**
+ * Interpose a path MTU DPO
+ */
+static void
+ip_pmtu_dpo_interpose (const dpo_id_t *original, const dpo_id_t *parent,
+                      dpo_id_t *clone)
+{
+  ip_pmtu_dpo_t *ipm, *ipm_clone;
+
+  ipm_clone = ip_pmtu_dpo_alloc ();
+  ipm = ip_pmtu_dpo_get (original->dpoi_index);
+
+  ipm_clone->ipm_proto = ipm->ipm_proto;
+  ipm_clone->ipm_pmtu = ipm->ipm_pmtu;
+
+  dpo_stack (ip_pmtu_dpo_type, ipm_clone->ipm_proto, &ipm_clone->ipm_dpo,
+            parent);
+  dpo_set (clone, ip_pmtu_dpo_type, ipm_clone->ipm_proto,
+          ip_pmtu_dpo_get_index (ipm_clone));
+}
+
+static u16
+ip_pmtu_dpo_get_mtu (const dpo_id_t *dpo)
+{
+  ip_pmtu_dpo_t *ipd;
+
+  ipd = pool_elt_at_index (ip_pmtu_dpo_pool, dpo->dpoi_index);
+
+  return (ipd->ipm_pmtu);
+}
+
+const static dpo_vft_t ip_pmtu_dpo_vft = {
+  .dv_lock = ip_pmtu_dpo_lock,
+  .dv_unlock = ip_pmtu_dpo_unlock,
+  .dv_format = format_ip_pmtu_dpo,
+  .dv_get_urpf = ip_pmtu_dpo_get_urpf,
+  .dv_mk_interpose = ip_pmtu_dpo_interpose,
+  .dv_get_mtu = ip_pmtu_dpo_get_mtu,
+};
+
+/**
+ * @brief The per-protocol VLIB graph nodes that are assigned to a path-MTU
+ *        DPO object.
+ *
+ * this means that these graph nodes are ones from which a path-MTU DPO is
+ * the parent object in the DPO-graph.
+ */
+const static char *const ip_pmtu_dpo_ip4_nodes[] = {
+  "ip4-pmtu-dpo",
+  NULL,
+};
+
+const static char *const ip_pmtu_dpo_ip6_nodes[] = {
+  "ip6-pmtu-dpo",
+  NULL,
+};
+
+const static char *const *const ip_pmtu_dpo_nodes[DPO_PROTO_NUM] = {
+  [DPO_PROTO_IP4] = ip_pmtu_dpo_ip4_nodes,
+  [DPO_PROTO_IP6] = ip_pmtu_dpo_ip6_nodes,
+};
+
+static bool
+ip_mtu_fib_entry_is_attached (fib_node_index_t fib_entry)
+{
+  const fib_prefix_t *pfx;
+  u32 cover, fib_index;
+
+  fib_index = fib_entry_get_fib_index (fib_entry);
+  pfx = fib_entry_get_prefix (fib_entry);
+
+  /*
+   * If the tracked prefix's cover is attached, then all packets that
+   * are forwarded to this neighbour will use the adjacency, this is a
+   * more efficient place to perform the MTU check and fragging
+   */
+  cover = fib_table_get_less_specific (fib_index, pfx);
+
+  return (FIB_ENTRY_FLAG_ATTACHED & fib_entry_get_flags (cover) ||
+         FIB_ENTRY_FLAG_ATTACHED & fib_entry_get_flags (fib_entry));
+}
+
+static index_t
+ip_pmtu_alloc (u32 fib_index, const fib_prefix_t *pfx,
+              const ip_pmtu_key_t *key, u16 pmtu)
+{
+  dpo_id_t ip_dpo = DPO_INVALID;
+  ip_pmtu_t *ipt;
+  fib_node_index_t cover;
+  const dpo_id_t *lb_dpo;
+  index_t ipti;
+
+  pool_get (ip_pmtu_pool, ipt);
+  fib_node_init (&(ipt->ipt_node), ip_pmtu_fib_type);
+
+  ipti = ipt - ip_pmtu_pool;
+  hash_set_mem_alloc (&ip_pmtu_db, key, ipti);
+
+  ipt->ipt_cfg_pmtu = pmtu;
+  ipt->ipt_fib_entry = fib_entry_track (fib_index, pfx, ip_pmtu_fib_type, ipti,
+                                       &ipt->ipt_sibling);
+
+  /*
+   * If the tracked prefix's cover is attached, then all packets that
+   * are forwarded to this neighbour will use the adjacency, this is a
+   * more efficient place to perform the MTU check and fragging
+   */
+  cover = fib_table_get_less_specific (fib_index, pfx);
+
+  if (ip_mtu_fib_entry_is_attached (ipt->ipt_fib_entry))
+    {
+      u32 sw_if_index;
+
+      ipt->ipt_flags |= IP_PMTU_FLAG_ATTACHED;
+      ipt->ipt_oper_pmtu = ipt->ipt_cfg_pmtu;
+
+      sw_if_index = fib_entry_get_resolving_interface (cover);
+
+      /* walk all adjs to add/update delegate */
+      adj_nbr_walk_nh (sw_if_index, pfx->fp_proto, &pfx->fp_addr,
+                      ip_ptmu_adj_walk_update, &ipt->ipt_oper_pmtu);
+    }
+  else
+    {
+      ipt->ipt_flags |= IP_PMTU_FLAG_REMOTE;
+
+      lb_dpo = fib_entry_contribute_ip_forwarding (ipt->ipt_fib_entry);
+
+      ipt->ipt_oper_pmtu = clib_min (dpo_get_mtu (lb_dpo), ipt->ipt_cfg_pmtu);
+
+      /*
+       * interpose a policy DPO from the nh so that MTU is applied
+       */
+      ip_pmtu_dpo_add_or_lock (pfx->fp_proto, ipt->ipt_oper_pmtu, &ip_dpo);
+
+      fib_table_entry_special_dpo_add (fib_index, pfx, ip_pmtu_source,
+                                      FIB_ENTRY_FLAG_INTERPOSE, &ip_dpo);
+      dpo_reset (&ip_dpo);
+    }
+
+  IP_PMTU_TRKR_DBG (ipt, "create");
+
+  return (ipti);
+}
+
+static void
+ip_pmtu_stack (ip_pmtu_t *ipt)
+{
+  bool was_attached, is_attached;
+  const fib_prefix_t *pfx;
+  u32 fib_index;
+
+  pfx = fib_entry_get_prefix (ipt->ipt_fib_entry);
+  fib_index = fib_entry_get_fib_index (ipt->ipt_fib_entry);
+
+  was_attached = !!(ipt->ipt_flags & IP_PMTU_FLAG_ATTACHED);
+  is_attached = ip_mtu_fib_entry_is_attached (ipt->ipt_fib_entry);
+
+  if (was_attached && !is_attached)
+    {
+      /* transition from attached to remote - walk all adjs to remove delegate
+       */
+      adj_nbr_walk_nh (fib_entry_get_resolving_interface (ipt->ipt_fib_entry),
+                      pfx->fp_proto, &pfx->fp_addr, ip_ptmu_adj_walk_remove,
+                      &ipt->ipt_oper_pmtu);
+      ipt->ipt_flags &= ~IP_PMTU_FLAG_ATTACHED;
+    }
+  if (!was_attached && is_attached)
+    {
+      /* transition from remote to attached - remove the DPO */
+      fib_table_entry_special_remove (fib_index, pfx, ip_pmtu_source);
+      ipt->ipt_flags &= ~IP_PMTU_FLAG_REMOTE;
+    }
+
+  if (is_attached)
+    {
+      /* walk all adjs to add/update delegate */
+      ipt->ipt_oper_pmtu = ipt->ipt_cfg_pmtu;
+      adj_nbr_walk_nh (fib_entry_get_resolving_interface (ipt->ipt_fib_entry),
+                      pfx->fp_proto, &pfx->fp_addr, ip_ptmu_adj_walk_update,
+                      &ipt->ipt_oper_pmtu);
+      ipt->ipt_flags |= IP_PMTU_FLAG_ATTACHED;
+    }
+  else
+    {
+      const dpo_id_t *lb_dpo;
+      u16 dpo_mtu;
+
+      fib_table_entry_special_remove (fib_index, pfx, ip_pmtu_source);
+
+      ipt->ipt_flags |= IP_PMTU_FLAG_REMOTE;
+      lb_dpo = fib_entry_contribute_ip_forwarding (ipt->ipt_fib_entry);
+      dpo_mtu = dpo_get_mtu (lb_dpo);
+
+      ipt->ipt_oper_pmtu = clib_min (dpo_mtu, ipt->ipt_cfg_pmtu);
+
+      /*
+       * if the configured path-MTU is less that the egress/interface then
+       * interpose a policy DPO from the nh so that MTU is applied
+       */
+      if (ipt->ipt_oper_pmtu < dpo_mtu)
+       {
+         dpo_id_t ip_dpo = DPO_INVALID;
+
+         ip_pmtu_dpo_add_or_lock (pfx->fp_proto, ipt->ipt_oper_pmtu, &ip_dpo);
+
+         fib_table_entry_special_dpo_update (
+           fib_index, pfx, ip_pmtu_source, FIB_ENTRY_FLAG_INTERPOSE, &ip_dpo);
+         dpo_reset (&ip_dpo);
+       }
+    }
+  IP_PMTU_TRKR_DBG (ipt, "stack");
+}
+
+static void
+ip_pmtu_update (index_t ipti, u16 pmtu)
+{
+  ip_pmtu_t *ipt;
+
+  ipt = pool_elt_at_index (ip_pmtu_pool, ipti);
+  ipt->ipt_flags &= ~IP_PMTU_FLAG_STALE;
+  ipt->ipt_cfg_pmtu = pmtu;
+
+  ip_pmtu_stack (ipt);
+}
+
+static index_t
+ip_pmtu_destroy (index_t ipti, const ip_pmtu_key_t *key)
+{
+  ip_pmtu_t *ipt;
+  const fib_prefix_t *pfx;
+
+  ipt = pool_elt_at_index (ip_pmtu_pool, ipti);
+  pfx = fib_entry_get_prefix (ipt->ipt_fib_entry);
+
+  IP_PMTU_TRKR_DBG (ipt, "destroy");
+
+  if (ipt->ipt_flags & IP_PMTU_FLAG_REMOTE)
+    fib_table_entry_special_remove (
+      fib_entry_get_fib_index (ipt->ipt_fib_entry), pfx, ip_pmtu_source);
+  else
+    /* remove the delegate from all the adjacencies */
+    adj_nbr_walk_nh (fib_entry_get_resolving_interface (ipt->ipt_fib_entry),
+                    pfx->fp_proto, &pfx->fp_addr, ip_ptmu_adj_walk_remove,
+                    NULL);
+
+  /*
+   * Drop the fib entry we're tracking
+   */
+  fib_entry_untrack (ipt->ipt_fib_entry, ipt->ipt_sibling);
+
+  /*
+   * remove from DB and return to pool
+   */
+  hash_unset_mem_free (&ip_pmtu_db, key);
+  pool_put (ip_pmtu_pool, ipt);
+
+  return (ipti);
+}
+
+int
+ip_path_mtu_update (const ip_address_t *nh, u32 table_id, u16 pmtu)
+{
+  fib_prefix_t pfx;
+  u32 fib_index;
+  uword *p;
+
+  ip_address_to_fib_prefix (nh, &pfx);
+  fib_index = fib_table_find (pfx.fp_proto, table_id);
+
+  if (~0 == fib_index)
+    return (VNET_API_ERROR_NO_SUCH_TABLE);
+
+  ip_pmtu_key_t key = {
+    .fproto = pfx.fp_proto,
+    .table_id = table_id,
+    .nh = pfx.fp_addr,
+  };
+
+  p = hash_get_mem (ip_pmtu_db, &key);
+
+  if (!ip_path_mtu_value_invalid (pmtu))
+    {
+      /* Add or update of path MTU */
+      if (NULL == p)
+       ip_pmtu_alloc (fib_index, &pfx, &key, pmtu);
+      else
+       ip_pmtu_update (p[0], pmtu);
+    }
+  else
+    {
+      if (NULL != p)
+       ip_pmtu_destroy (p[0], &key);
+    }
+
+  return (0);
+}
+
+static walk_rc_t
+ip_path_mtu_walk_mark (index_t ipti, void *ctx)
+{
+  ip_pmtu_t *ipt;
+
+  ipt = ip_path_mtu_get (ipti);
+
+  ipt->ipt_flags |= IP_PMTU_FLAG_STALE;
+
+  return (WALK_CONTINUE);
+}
+
+typedef struct ip_path_mtu_walk_sweep_ctx_t_
+{
+  index_t *indicies;
+} ip_path_mtu_walk_sweep_ctx_t;
+
+static walk_rc_t
+ip_path_mtu_walk_sweep (index_t ipti, void *arg)
+{
+  ip_path_mtu_walk_sweep_ctx_t *ctx = arg;
+  ip_pmtu_t *ipt;
+
+  ipt = ip_path_mtu_get (ipti);
+
+  if (ipt->ipt_flags & IP_PMTU_FLAG_STALE)
+    {
+      vec_add1 (ctx->indicies, ipti);
+    }
+
+  return (WALK_CONTINUE);
+}
+
+int
+ip_path_mtu_replace_begin (void)
+{
+  IP_PMTU_DBG ("replace-begin");
+
+  ip_path_mtu_walk (ip_path_mtu_walk_mark, NULL);
+
+  return (0);
+}
+
+int
+ip_path_mtu_replace_end (void)
+{
+  index_t *ipti;
+
+  IP_PMTU_DBG ("replace-end");
+
+  /*
+   * not safe to walk the pool whilst deleting, so create
+   * temporary storage of stale entries
+   */
+  ip_path_mtu_walk_sweep_ctx_t ctx = {
+    .indicies = NULL,
+  };
+
+  ip_path_mtu_walk (ip_path_mtu_walk_sweep, &ctx);
+
+  vec_foreach (ipti, ctx.indicies)
+    {
+      ip_pmtu_t *ipt;
+      ip_address_t ip;
+
+      ipt = ip_path_mtu_get (*ipti);
+      ip_pmtu_get_ip (ipt, &ip);
+      ip_path_mtu_update (&ip, ip_pmtu_get_table_id (ipt), 0);
+    }
+
+  vec_free (ctx.indicies);
+
+  return (0);
+}
+
+void
+ip_path_mtu_walk (ip_path_mtu_walk_t fn, void *ctx)
+{
+  index_t ipmi;
+
+  pool_foreach_index (ipmi, ip_pmtu_pool)
+    {
+      if (WALK_STOP == fn (ipmi, ctx))
+       break;
+    }
+}
+
+static fib_node_t *
+ip_pmtu_get_node (fib_node_index_t index)
+{
+  ip_pmtu_t *ipt;
+
+  ipt = pool_elt_at_index (ip_pmtu_pool, index);
+
+  return (&(ipt->ipt_node));
+}
+
+static ip_pmtu_t *
+ip_pmtu_get_from_node (fib_node_t *node)
+{
+  return (
+    (ip_pmtu_t *) (((char *) node) - STRUCT_OFFSET_OF (ip_pmtu_t, ipt_node)));
+}
+
+static void
+ip_pmtu_last_lock_gone (fib_node_t *node)
+{
+  /*
+   * the lifetime of the entry is managed by the API.
+   */
+  ASSERT (0);
+}
+
+/*
+ * A back walk has reached this path-MTU tracker
+ */
+static fib_node_back_walk_rc_t
+ip_pmtu_back_walk_notify (fib_node_t *node, fib_node_back_walk_ctx_t *ctx)
+{
+  /*
+   * the tracked fib entry has changed; re-evaluate the MTU and restack
+   */
+  ip_pmtu_t *ipr = ip_pmtu_get_from_node (node);
+
+  ip_pmtu_stack (ipr);
+
+  /*
+   * no need to propagate further up the graph, since there's nothing there
+   */
+  return (FIB_NODE_BACK_WALK_CONTINUE);
+}
+
+static const fib_node_vft_t ip_ptmu_fib_node_vft = {
+  .fnv_get = ip_pmtu_get_node,
+  .fnv_last_lock = ip_pmtu_last_lock_gone,
+  .fnv_back_walk = ip_pmtu_back_walk_notify,
+};
+
+static clib_error_t *
+ip_path_module_init (vlib_main_t *vm)
+{
+  ip_pmtu_adj_delegate_type =
+    adj_delegate_register_new_type (&ip_path_adj_delegate_vft);
+  ip_pmtu_source = fib_source_allocate ("path-mtu", FIB_SOURCE_PRIORITY_HI,
+                                       FIB_SOURCE_BH_SIMPLE);
+  ip_pmtu_fib_type = fib_node_register_new_type (&ip_ptmu_fib_node_vft);
+
+  ip_pmtu_db = hash_create_mem (0, sizeof (ip_pmtu_key_t), sizeof (index_t));
+  ip_pmtu_logger = vlib_log_register_class ("ip", "pmtu");
+  ip_pmtu_dpo_type =
+    dpo_register_new_type (&ip_pmtu_dpo_vft, ip_pmtu_dpo_nodes);
+
+  return (NULL);
+}
+
+VLIB_INIT_FUNCTION (ip_path_module_init);
+
+static clib_error_t *
+show_ip_pmtu_command (vlib_main_t *vm, unformat_input_t *input,
+                     vlib_cli_command_t *cmd)
+{
+  index_t ipti;
+
+  if (unformat (input, "%d", &ipti))
+    {
+      /*
+       * show one in detail
+       */
+      if (!pool_is_free_index (ip_pmtu_pool, ipti))
+       vlib_cli_output (vm, "%U", format_ip_pmtu, ipti);
+      else
+       vlib_cli_output (vm, "entry %d invalid", ipti);
+    }
+  else
+    {
+      /*
+       * show all
+       */
+      pool_foreach_index (ipti, ip_pmtu_pool)
+       {
+         vlib_cli_output (vm, "%U", format_ip_pmtu, ipti);
+       }
+    }
+
+  return (NULL);
+}
+
+VLIB_CLI_COMMAND (show_fib_entry, static) = {
+  .path = "show ip pmtu",
+  .function = show_ip_pmtu_command,
+  .short_help = "show ip path MTU",
+};
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/ip/ip_path_mtu.h b/src/vnet/ip/ip_path_mtu.h
new file mode 100644 (file)
index 0000000..2c54fcd
--- /dev/null
@@ -0,0 +1,126 @@
+/*
+ *------------------------------------------------------------------
+ * ip_path_mtu.h
+ *
+ * Copyright (c) 2021 Graphiant.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vnet/ip/ip.h>
+
+/**
+ * @brief
+ * The Path MTU DPO, interposed in the forwarding chain of the host prefix.
+ */
+typedef struct ip_pmtu_dpo_t_
+{
+  /**
+   * The protocol of packets using this DPO
+   */
+  dpo_proto_t ipm_proto;
+
+  u8 __pad8;
+
+  /**
+   * Configured Path Mtu
+   */
+  u16 ipm_pmtu;
+
+  /**
+   * number of locks.
+   */
+  u16 ipm_locks;
+
+  /**
+   * Stacked DPO
+   */
+  dpo_id_t ipm_dpo;
+} ip_pmtu_dpo_t;
+
+/*
+ * PMTU DPOs are accessed in the data-path so they should not straddle a cache
+ * line. Align to a integer factor of a cacheline
+ */
+STATIC_ASSERT_SIZEOF (ip_pmtu_dpo_t, 2 * sizeof (u64));
+
+#define foreach_ip_pmtu_flag                                                  \
+  _ (ATTACHED, 0, "attached")                                                 \
+  _ (REMOTE, 1, "remote")                                                     \
+  _ (STALE, 2, "stale")
+
+typedef enum ip_pmtu_flags_t_
+{
+#define _(a, b, c) IP_PMTU_FLAG_##a = (1 << b),
+  foreach_ip_pmtu_flag
+#undef _
+} ip_pmtu_flags_t;
+
+/**
+ * Remote Path MTU tracking object
+ */
+typedef struct ip_pmtu_t_
+{
+  /** linkage into the FIB graph */
+  fib_node_t ipt_node;
+
+  /** Track fib entry */
+  fib_node_index_t ipt_fib_entry;
+  u32 ipt_sibling;
+  ip_pmtu_flags_t ipt_flags;
+
+  /** Configured MTU */
+  u16 ipt_cfg_pmtu;
+
+  /** MTU learned from the parent (tracked) fib entry */
+  u16 ipt_parent_pmtu;
+
+  /** operational MTU; the minimum value of the cfg and parent MTU */
+  u16 ipt_oper_pmtu;
+} ip_pmtu_t;
+
+extern int ip_path_mtu_update (const ip_address_t *nh, u32 table_id, u16 pmtu);
+
+typedef walk_rc_t (*ip_path_mtu_walk_t) (index_t ipti, void *ctx);
+
+extern void ip_path_mtu_walk (ip_path_mtu_walk_t fn, void *ctx);
+extern int ip_path_mtu_replace_begin (void);
+extern int ip_path_mtu_replace_end (void);
+
+extern u32 ip_pmtu_get_table_id (const ip_pmtu_t *ipt);
+extern void ip_pmtu_get_ip (const ip_pmtu_t *ipt, ip_address_t *ip);
+
+/**
+ * Data-plane accessor functions
+ */
+extern ip_pmtu_dpo_t *ip_pmtu_dpo_pool;
+static_always_inline ip_pmtu_dpo_t *
+ip_pmtu_dpo_get (index_t index)
+{
+  return (pool_elt_at_index (ip_pmtu_dpo_pool, index));
+}
+
+extern ip_pmtu_t *ip_pmtu_pool;
+static_always_inline ip_pmtu_t *
+ip_path_mtu_get (index_t index)
+{
+  return (pool_elt_at_index (ip_pmtu_pool, index));
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/ip/ip_path_mtu_node.c b/src/vnet/ip/ip_path_mtu_node.c
new file mode 100644 (file)
index 0000000..b13f9de
--- /dev/null
@@ -0,0 +1,206 @@
+/*
+ *------------------------------------------------------------------
+ * ip_path_mtu_node.c
+ *
+ * Copyright (c) 2020 Graphiant.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vnet/ip/ip_path_mtu.h>
+#include <vnet/ip/ip_frag.h>
+
+typedef enum
+{
+  IP_PMTU_DROP,
+  IP_PMTU_N_NEXT,
+} ip_pmtu_next_t;
+
+typedef struct ip_pmtu_trace_t_
+{
+  u16 pmtu;
+  u16 packet_size;
+} ip_pmtu_trace_t;
+
+static u8 *
+format_ip_pmtu_trace (u8 *s, va_list *args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  ip_pmtu_trace_t *t = va_arg (*args, ip_pmtu_trace_t *);
+
+  s = format (s, "path mtu:%d packet size:%d", t->pmtu, t->packet_size);
+
+  return s;
+}
+
+static inline uword
+ip_pmtu_dpo_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
+                   vlib_frame_t *frame, ip_address_family_t af)
+{
+  u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+  u32 frag_sent = 0, small_packets = 0;
+
+  from = vlib_frame_vector_args (frame);
+  n_left_from = frame->n_vectors;
+  next_index = node->cached_next_index;
+
+  u32 *buffer = 0;
+
+  while (n_left_from > 0)
+    {
+      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+      while (n_left_from > 0 && n_left_to_next > 0)
+       {
+         const ip_pmtu_dpo_t *ipm0;
+         u32 pi0, *frag_from, frag_left;
+         vlib_buffer_t *p0;
+         ip_frag_error_t error0;
+         u16 next0;
+
+         /*
+          * Note: The packet is not enqueued now. It is instead put
+          * in a vector where other fragments will be put as well.
+          */
+         pi0 = from[0];
+         from += 1;
+         n_left_from -= 1;
+
+         p0 = vlib_get_buffer (vm, pi0);
+         ipm0 = ip_pmtu_dpo_get (vnet_buffer (p0)->ip.adj_index[VLIB_TX]);
+         vnet_buffer (p0)->ip.adj_index[VLIB_TX] = ipm0->ipm_dpo.dpoi_index;
+         next0 = ipm0->ipm_dpo.dpoi_next_node;
+
+         if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
+           {
+             ip_pmtu_trace_t *t;
+             t = vlib_add_trace (vm, node, p0, sizeof (*t));
+             t->pmtu = ipm0->ipm_pmtu;
+             t->packet_size = vlib_buffer_length_in_chain (vm, p0);
+           }
+
+         if (AF_IP6 == af)
+           error0 =
+             ip6_frag_do_fragment (vm, pi0, ipm0->ipm_pmtu, 0, &buffer);
+         else
+           error0 =
+             ip4_frag_do_fragment (vm, pi0, ipm0->ipm_pmtu, 0, &buffer);
+
+         if (AF_IP4 == af && error0 == IP_FRAG_ERROR_DONT_FRAGMENT_SET)
+           {
+             icmp4_error_set_vnet_buffer (
+               p0, ICMP4_destination_unreachable,
+               ICMP4_destination_unreachable_fragmentation_needed_and_dont_fragment_set,
+               ipm0->ipm_pmtu);
+             next0 = IP_FRAG_NEXT_ICMP_ERROR;
+           }
+         else
+           {
+             next0 =
+               (error0 == IP_FRAG_ERROR_NONE ? next0 : IP_FRAG_NEXT_DROP);
+           }
+
+         if (error0 == IP_FRAG_ERROR_NONE)
+           {
+             /* Free original buffer chain */
+             frag_sent += vec_len (buffer);
+             small_packets += (vec_len (buffer) == 1);
+             vlib_buffer_free_one (vm, pi0); /* Free original packet */
+           }
+         else
+           {
+             vlib_error_count (vm, node->node_index, error0, 1);
+             vec_add1 (buffer, pi0); /* Get rid of the original buffer */
+           }
+
+         /* Send fragments that were added in the frame */
+         frag_from = buffer;
+         frag_left = vec_len (buffer);
+
+         while (frag_left > 0)
+           {
+             while (frag_left > 0 && n_left_to_next > 0)
+               {
+                 u32 i;
+                 i = to_next[0] = frag_from[0];
+                 frag_from += 1;
+                 frag_left -= 1;
+                 to_next += 1;
+                 n_left_to_next -= 1;
+
+                 vlib_get_buffer (vm, i)->error = node->errors[error0];
+                 vlib_validate_buffer_enqueue_x1 (
+                   vm, node, next_index, to_next, n_left_to_next, i, next0);
+               }
+             vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+             vlib_get_next_frame (vm, node, next_index, to_next,
+                                  n_left_to_next);
+           }
+         vec_reset_length (buffer);
+       }
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+  vec_free (buffer);
+
+  return frame->n_vectors;
+}
+
+// clang-format off
+
+VLIB_NODE_FN (ip4_ip_pmtu_dpo_node) (vlib_main_t *vm,
+                                     vlib_node_runtime_t *node,
+                                     vlib_frame_t *from_frame)
+{
+  return (ip_pmtu_dpo_inline (vm, node, from_frame, 0));
+}
+
+VLIB_NODE_FN (ip6_ip_pmtu_dpo_node) (vlib_main_t *vm,
+                                     vlib_node_runtime_t *node,
+                                     vlib_frame_t *from_frame)
+{
+  return (ip_pmtu_dpo_inline (vm, node, from_frame, 1));
+}
+
+VLIB_REGISTER_NODE (ip4_ip_pmtu_dpo_node) = {
+  .name = "ip4-pmtu-dpo",
+  .vector_size = sizeof (u32),
+  .format_trace = format_ip_pmtu_trace,
+  .n_errors = 0,
+  .n_next_nodes = IP_PMTU_N_NEXT,
+  .next_nodes =
+  {
+   [IP_PMTU_DROP] = "ip4-drop",
+  }
+};
+VLIB_REGISTER_NODE (ip6_ip_pmtu_dpo_node) = {
+  .name = "ip6-pmtu-dpo",
+  .vector_size = sizeof (u32),
+  .format_trace = format_ip_pmtu_trace,
+  .n_errors = 0,
+  .n_next_nodes = IP_PMTU_N_NEXT,
+  .next_nodes =
+  {
+   [IP_PMTU_DROP] = "ip6-drop",
+  }
+};
+
+// clang-format on
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
index c89d546..f9fa33c 100644 (file)
@@ -18,7 +18,7 @@ from vpp_ip_route import VppIpRoute, VppRoutePath, VppIpMRoute, \
     VppMRoutePath, VppMplsIpBind, \
     VppMplsTable, VppIpTable, FibPathType, find_route, \
     VppIpInterfaceAddress, find_route_in_dump, find_mroute_in_dump
-from vpp_ip import VppIpPuntPolicer, VppIpPuntRedirect
+from vpp_ip import VppIpPuntPolicer, VppIpPuntRedirect, VppIpPathMtu
 from vpp_sub_interface import VppSubInterface, VppDot1QSubint, VppDot1ADSubint
 from vpp_papi import VppEnum
 from vpp_neighbor import VppNeighbor
@@ -2565,5 +2565,173 @@ class TestIP4Replace(VppTestCase):
             self.assertTrue(pfx.query_vpp_config())
 
 
+class TestIPv4PathMTU(VppTestCase):
+    """ IPv4 Path MTU """
+
+    @classmethod
+    def setUpClass(cls):
+        super(TestIPv4PathMTU, cls).setUpClass()
+
+        cls.create_pg_interfaces(range(2))
+
+        # setup all interfaces
+        for i in cls.pg_interfaces:
+            i.admin_up()
+            i.config_ip4()
+            i.resolve_arp()
+
+    @classmethod
+    def tearDownClass(cls):
+        super(TestIPv4PathMTU, cls).tearDownClass()
+
+    def test_path_mtu(self):
+        """ Path MTU """
+
+        #
+        # The goal here is not to test that fragmentation works correctly,
+        # that's done elsewhere, the intent is to ensure that the Path MTU
+        # settings are honoured.
+        #
+        self.vapi.cli("adjacency counters enable")
+
+        # set the interface MTU to a reasonable value
+        self.vapi.sw_interface_set_mtu(self.pg1.sw_if_index,
+                                       [1800, 0, 0, 0])
+
+        self.pg1.generate_remote_hosts(4)
+
+        p_2k = (Ether(dst=self.pg0.local_mac,
+                      src=self.pg0.remote_mac) /
+                IP(src=self.pg0.remote_ip4,
+                   dst=self.pg1.remote_ip4) /
+                UDP(sport=1234, dport=5678) /
+                Raw(b'0xa' * 640))
+        p_1k = (Ether(dst=self.pg0.local_mac,
+                      src=self.pg0.remote_mac) /
+                IP(src=self.pg0.remote_ip4,
+                   dst=self.pg1.remote_ip4) /
+                UDP(sport=1234, dport=5678) /
+                Raw(b'0xa' * 320))
+
+        nbr = VppNeighbor(self,
+                          self.pg1.sw_if_index,
+                          self.pg1.remote_mac,
+                          self.pg1.remote_ip4).add_vpp_config()
+
+        # this is now the interface MTU frags
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=2)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1)
+
+        # drop the path MTU for this neighbour to below the interface MTU
+        # expect more frags
+        pmtu = VppIpPathMtu(self, self.pg1.remote_ip4, 900).add_vpp_config()
+
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=3)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1, n_rx=2)
+
+        # print/format the adj delegate
+        self.logger.info(self.vapi.cli("sh adj 5"))
+
+        # increase the path MTU to more than the interface
+        # expect to use the interface MTU
+        pmtu.modify(8192)
+
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=2)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1)
+
+        # go back to an MTU from the path
+        # wrap the call in mark-n-sweep to ensure updates clear stale state
+        self.vapi.ip_path_mtu_replace_begin()
+        pmtu.modify(900)
+        self.vapi.ip_path_mtu_replace_end()
+
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=3)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1, n_rx=2)
+
+        # raise the interface's MTU
+        # should still use that of the path
+        self.vapi.sw_interface_set_mtu(self.pg1.sw_if_index,
+                                       [2000, 0, 0, 0])
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=3)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1, n_rx=2)
+
+        # set path high and interface low
+        pmtu.modify(2000)
+        self.vapi.sw_interface_set_mtu(self.pg1.sw_if_index,
+                                       [900, 0, 0, 0])
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=3)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1, n_rx=2)
+
+        # remove the path MTU using the mark-n-sweep semantics
+        self.vapi.sw_interface_set_mtu(self.pg1.sw_if_index,
+                                       [1800, 0, 0, 0])
+        self.vapi.ip_path_mtu_replace_begin()
+        self.vapi.ip_path_mtu_replace_end()
+
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=2)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1)
+
+        #
+        # set path MTU for a neighbour that doesn't exist, yet
+        #
+        pmtu2 = VppIpPathMtu(self,
+                             self.pg1.remote_hosts[2].ip4,
+                             900).add_vpp_config()
+
+        p_2k = (Ether(dst=self.pg0.local_mac,
+                      src=self.pg0.remote_mac) /
+                IP(src=self.pg0.remote_ip4,
+                   dst=self.pg1.remote_hosts[2].ip4) /
+                UDP(sport=1234, dport=5678) /
+                Raw(b'0xa' * 640))
+        p_1k = (Ether(dst=self.pg0.local_mac,
+                      src=self.pg0.remote_mac) /
+                IP(src=self.pg0.remote_ip4,
+                   dst=self.pg1.remote_hosts[2].ip4) /
+                UDP(sport=1234, dport=5678) /
+                Raw(b'0xa' * 320))
+
+        nbr2 = VppNeighbor(self,
+                           self.pg1.sw_if_index,
+                           self.pg1.remote_hosts[2].mac,
+                           self.pg1.remote_hosts[2].ip4).add_vpp_config()
+
+        # should frag to the path MTU
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=3)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1, n_rx=2)
+
+        # remove and re-add the neighbour
+        nbr2.remove_vpp_config()
+        nbr2.add_vpp_config()
+
+        # should frag to the path MTU
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=3)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1, n_rx=2)
+
+        #
+        # set PMTUs for many peers
+        #
+        N_HOSTS = 16
+        self.pg1.generate_remote_hosts(16)
+        self.pg1.configure_ipv4_neighbors()
+
+        for h in range(N_HOSTS):
+            pmtu = VppIpPathMtu(self, self.pg1.remote_hosts[h].ip4, 900)
+            pmtu.add_vpp_config()
+            self.assertTrue(pmtu.query_vpp_config())
+
+        self.logger.info(self.vapi.cli("sh ip pmtu"))
+        dump = list(self.vapi.vpp.details_iter(self.vapi.ip_path_mtu_get))
+        self.assertEqual(N_HOSTS, len(dump))
+
+        for h in range(N_HOSTS):
+            p_2k[IP].dst = self.pg1.remote_hosts[h].ip4
+            p_1k[IP].dst = self.pg1.remote_hosts[h].ip4
+
+            # should frag to the path MTU
+            self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=3)
+            self.send_and_expect(self.pg0, [p_1k], self.pg1, n_rx=2)
+
+
 if __name__ == '__main__':
     unittest.main(testRunner=VppTestRunner)
index 8a2b332..1672679 100644 (file)
@@ -7,6 +7,7 @@ import unittest
 from parameterized import parameterized
 import scapy.compat
 import scapy.layers.inet6 as inet6
+from scapy.layers.inet import UDP
 from scapy.contrib.mpls import MPLS
 from scapy.layers.inet6 import IPv6, ICMPv6ND_NS, ICMPv6ND_RS, \
     ICMPv6ND_RA, ICMPv6NDOptMTU, ICMPv6NDOptSrcLLAddr, ICMPv6NDOptPrefixInfo, \
@@ -22,13 +23,14 @@ from six import moves
 from framework import VppTestCase, VppTestRunner, tag_run_solo
 from util import ppp, ip6_normalize, mk_ll_addr
 from vpp_papi import VppEnum
-from vpp_ip import DpoProto, VppIpPuntPolicer, VppIpPuntRedirect
+from vpp_ip import DpoProto, VppIpPuntPolicer, VppIpPuntRedirect, VppIpPathMtu
 from vpp_ip_route import VppIpRoute, VppRoutePath, find_route, VppIpMRoute, \
     VppMRoutePath, VppMplsIpBind, \
     VppMplsRoute, VppMplsTable, VppIpTable, FibPathType, FibPathProto, \
     VppIpInterfaceAddress, find_route_in_dump, find_mroute_in_dump, \
     VppIp6LinkLocalAddress
 from vpp_neighbor import find_nbr, VppNeighbor
+from vpp_ipip_tun_interface import VppIpIpTunInterface
 from vpp_pg_interface import is_ipv6_misc
 from vpp_sub_interface import VppSubInterface, VppDot1QSubint
 from vpp_policer import VppPolicer, PolicerAction
@@ -3036,5 +3038,236 @@ class TestIP6LinkLocal(VppTestCase):
         self.send_and_expect(self.pg1, [p_echo_request_3], self.pg1)
 
 
+class TestIPv6PathMTU(VppTestCase):
+    """ IPv6 Path MTU """
+
+    def setUp(self):
+        """Create and bring up two pg interfaces with IPv6 configured."""
+        super(TestIPv6PathMTU, self).setUp()
+
+        self.create_pg_interfaces(range(2))
+
+        # setup all interfaces
+        for i in self.pg_interfaces:
+            i.admin_up()
+            i.config_ip6()
+            i.resolve_ndp()
+
+    def tearDown(self):
+        """Undo the interface configuration applied in setUp."""
+        super(TestIPv6PathMTU, self).tearDown()
+        for i in self.pg_interfaces:
+            i.unconfig_ip6()
+            i.admin_down()
+
+    def test_path_mtu_local(self):
+        """ Path MTU for attached neighbour """
+
+        self.vapi.cli("set log class ip level debug")
+        #
+        # The goal here is not to test that fragmentation works correctly,
+        # that's done elsewhere, the intent is to ensure that the Path MTU
+        # settings are honoured.
+        #
+
+        #
+        # IPv6 will only frag locally generated packets, so use tunnelled
+        # packets post encap
+        #
+        tun = VppIpIpTunInterface(
+            self,
+            self.pg1,
+            self.pg1.local_ip6,
+            self.pg1.remote_ip6)
+        tun.add_vpp_config()
+        tun.admin_up()
+        tun.config_ip6()
+
+        # set the interface MTU to a reasonable value
+        self.vapi.sw_interface_set_mtu(self.pg1.sw_if_index,
+                                       [2800, 0, 0, 0])
+
+        # two payload sizes: ~3000 bytes (exceeds the 2800 interface MTU)
+        # and ~1800 bytes (fits in 2800, exceeds a 1300 path MTU)
+        p_2k = (Ether(dst=self.pg0.local_mac,
+                      src=self.pg0.remote_mac) /
+                IPv6(src=self.pg0.remote_ip6,
+                     dst=tun.remote_ip6) /
+                UDP(sport=1234, dport=5678) /
+                Raw(b'0xa' * 1000))
+        p_1k = (Ether(dst=self.pg0.local_mac,
+                      src=self.pg0.remote_mac) /
+                IPv6(src=self.pg0.remote_ip6,
+                     dst=tun.remote_ip6) /
+                UDP(sport=1234, dport=5678) /
+                Raw(b'0xa' * 600))
+
+        # keep the returned object alive; presumably add_vpp_config
+        # registers the neighbour for automatic cleanup — see VppNeighbor
+        nbr = VppNeighbor(self,
+                          self.pg1.sw_if_index,
+                          self.pg1.remote_mac,
+                          self.pg1.remote_ip6).add_vpp_config()
+
+        # this is now the interface MTU frags
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=2)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1)
+
+        # drop the path MTU for this neighbour to below the interface MTU
+        # expect more frags
+        pmtu = VppIpPathMtu(self, self.pg1.remote_ip6, 1300).add_vpp_config()
+
+        # print/format the adj delegate and trackers
+        # NOTE(review): adjacency index 7 is assumed from the fixed
+        # configuration order above — confirm if the setup changes
+        self.logger.info(self.vapi.cli("sh ip pmtu"))
+        self.logger.info(self.vapi.cli("sh adj 7"))
+
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=3)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1, n_rx=2)
+
+        # increase the path MTU to more than the interface
+        # expect to use the interface MTU
+        pmtu.modify(8192)
+
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=2)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1)
+
+        # go back to an MTU from the path
+        pmtu.modify(1300)
+
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=3)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1, n_rx=2)
+
+        # raise the interface's MTU
+        # should still use that of the path
+        self.vapi.sw_interface_set_mtu(self.pg1.sw_if_index,
+                                       [2000, 0, 0, 0])
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=3)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1, n_rx=2)
+
+        # set path high and interface low
+        # the lower of the two (the interface MTU) should be enforced
+        pmtu.modify(2000)
+        self.vapi.sw_interface_set_mtu(self.pg1.sw_if_index,
+                                       [1300, 0, 0, 0])
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=3)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1, n_rx=2)
+
+        # remove the path MTU (a value of 0 unsets it)
+        self.vapi.sw_interface_set_mtu(self.pg1.sw_if_index,
+                                       [2800, 0, 0, 0])
+        pmtu.modify(0)
+
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=2)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1)
+
+    def test_path_mtu_remote(self):
+        """ Path MTU for remote neighbour """
+
+        self.vapi.cli("set log class ip level debug")
+        #
+        # The goal here is not to test that fragmentation works correctly,
+        # that's done elsewhere, the intent is to ensure that the Path MTU
+        # settings are honoured.
+        #
+        tun_dst = "2001::1"
+
+        # route to the non-attached (remote) tunnel destination via pg1
+        route = VppIpRoute(
+            self, tun_dst, 64,
+            [VppRoutePath(self.pg1.remote_ip6,
+                          self.pg1.sw_if_index)]).add_vpp_config()
+
+        #
+        # IPv6 will only frag locally generated packets, so use tunnelled
+        # packets post encap
+        #
+        tun = VppIpIpTunInterface(
+            self,
+            self.pg1,
+            self.pg1.local_ip6,
+            tun_dst)
+        tun.add_vpp_config()
+        tun.admin_up()
+        tun.config_ip6()
+
+        # set the interface MTU to a reasonable value
+        self.vapi.sw_interface_set_mtu(self.pg1.sw_if_index,
+                                       [2800, 0, 0, 0])
+
+        # two payload sizes: ~3000 bytes (exceeds the 2800 interface MTU)
+        # and ~1800 bytes (fits in 2800, exceeds a 1300 path MTU)
+        p_2k = (Ether(dst=self.pg0.local_mac,
+                      src=self.pg0.remote_mac) /
+                IPv6(src=self.pg0.remote_ip6,
+                     dst=tun.remote_ip6) /
+                UDP(sport=1234, dport=5678) /
+                Raw(b'0xa' * 1000))
+        p_1k = (Ether(dst=self.pg0.local_mac,
+                      src=self.pg0.remote_mac) /
+                IPv6(src=self.pg0.remote_ip6,
+                     dst=tun.remote_ip6) /
+                UDP(sport=1234, dport=5678) /
+                Raw(b'0xa' * 600))
+
+        # keep the returned object alive; presumably add_vpp_config
+        # registers the neighbour for automatic cleanup — see VppNeighbor
+        nbr = VppNeighbor(self,
+                          self.pg1.sw_if_index,
+                          self.pg1.remote_mac,
+                          self.pg1.remote_ip6).add_vpp_config()
+
+        # this is now the interface MTU frags
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=2)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1)
+
+        # drop the path MTU for this neighbour to below the interface MTU
+        # expect more frags
+        pmtu = VppIpPathMtu(self, tun_dst, 1300).add_vpp_config()
+
+        # print/format the fib entry/dpo
+        self.logger.info(self.vapi.cli("sh ip6 fib 2001::1"))
+
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=3)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1, n_rx=2)
+
+        # increase the path MTU to more than the interface
+        # expect to use the interface MTU
+        pmtu.modify(8192)
+
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=2)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1)
+
+        # go back to an MTU from the path
+        pmtu.modify(1300)
+
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=3)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1, n_rx=2)
+
+        # raise the interface's MTU
+        # should still use that of the path
+        self.vapi.sw_interface_set_mtu(self.pg1.sw_if_index,
+                                       [2000, 0, 0, 0])
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=3)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1, n_rx=2)
+
+        # turn the tun_dst into an attached neighbour
+        # the path MTU should follow: enforced on the adjacency now
+        route.modify([VppRoutePath("::",
+                                   self.pg1.sw_if_index)])
+        nbr2 = VppNeighbor(self,
+                           self.pg1.sw_if_index,
+                           self.pg1.remote_mac,
+                           tun_dst).add_vpp_config()
+
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=3)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1, n_rx=2)
+
+        # add back to not attached
+        nbr2.remove_vpp_config()
+        route.modify([VppRoutePath(self.pg1.remote_ip6,
+                                   self.pg1.sw_if_index)])
+
+        # set path high and interface low
+        # the lower of the two (the interface MTU) should be enforced
+        pmtu.modify(2000)
+        self.vapi.sw_interface_set_mtu(self.pg1.sw_if_index,
+                                       [1300, 0, 0, 0])
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=3)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1, n_rx=2)
+
+        # remove the path MTU
+        self.vapi.sw_interface_set_mtu(self.pg1.sw_if_index,
+                                       [2800, 0, 0, 0])
+        pmtu.remove_vpp_config()
+        self.send_and_expect(self.pg0, [p_2k], self.pg1, n_rx=2)
+        self.send_and_expect(self.pg0, [p_1k], self.pg1)
+
+
 if __name__ == '__main__':
     unittest.main(testRunner=VppTestRunner)
index e2367a3..81305b2 100644 (file)
@@ -181,3 +181,49 @@ class VppIpPuntRedirect(VppObject):
         if self.get_vpp_config():
             return True
         return False
+
+
+class VppIpPathMtu(VppObject):
+    def __init__(self, test, nh, pmtu, table_id=0):
+        self._test = test
+        self.nh = nh
+        self.pmtu = pmtu
+        self.table_id = table_id
+
+    def add_vpp_config(self):
+        self._test.vapi.ip_path_mtu_update(pmtu={'nh': self.nh,
+                                                 'table_id': self.table_id,
+                                                 'path_mtu': self.pmtu})
+        self._test.registry.register(self, self._test.logger)
+        return self
+
+    def modify(self, pmtu):
+        self.pmtu = pmtu
+        self._test.vapi.ip_path_mtu_update(pmtu={'nh': self.nh,
+                                                 'table_id': self.table_id,
+                                                 'path_mtu': self.pmtu})
+        return self
+
+    def remove_vpp_config(self):
+        self._test.vapi.ip_path_mtu_update(pmtu={'nh': self.nh,
+                                                 'table_id': self.table_id,
+                                                 'path_mtu': 0})
+
+    def query_vpp_config(self):
+        ds = list(self._test.vapi.vpp.details_iter(
+            self._test.vapi.ip_path_mtu_get))
+
+        for d in ds:
+            if self.nh == str(d.pmtu.nh) \
+               and self.table_id == d.pmtu.table_id \
+               and self.pmtu == d.pmtu.path_mtu:
+                return True
+        return False
+
+    def object_id(self):
+        return ("ip-path-mtu-%d-%s-%d" % (self.table_id,
+                                          self.nh,
+                                          self.pmtu))
+
+    def __str__(self):
+        return self.object_id()