linux-cp: Basic MPLS support.
[vpp.git] / src / plugins / linux-cp / lcp_router.c
index af01b54..f53ec4f 100644 (file)
@@ -15,6 +15,7 @@
 
 #include <sys/socket.h>
 #include <linux/if.h>
+#include <linux/mpls.h>
 
 //#include <vlib/vlib.h>
 #include <vlib/unix/plugin.h>
@@ -27,6 +28,7 @@
 #include <netlink/route/link.h>
 #include <netlink/route/route.h>
 #include <netlink/route/neighbour.h>
+#include <netlink/route/nexthop.h>
 #include <netlink/route/addr.h>
 #include <netlink/route/link/vlan.h>
 
@@ -272,6 +274,25 @@ lcp_router_link_mtu (struct rtnl_link *rl, u32 sw_if_index)
     vnet_sw_interface_set_mtu (vnm, sw->sw_if_index, mtu);
 }
 
+static walk_rc_t
+lcp_router_link_addr_adj_upd_cb (vnet_main_t *vnm, u32 sw_if_index, void *arg)
+{
+  lcp_itf_pair_t *lip;
+
+  lip = lcp_itf_pair_get (lcp_itf_pair_find_by_phy (sw_if_index));
+  if (!lip)
+    {
+      return WALK_CONTINUE;
+    }
+
+  vnet_update_adjacency_for_sw_interface (vnm, lip->lip_phy_sw_if_index,
+                                         lip->lip_phy_adjs.adj_index[AF_IP4]);
+  vnet_update_adjacency_for_sw_interface (vnm, lip->lip_phy_sw_if_index,
+                                         lip->lip_phy_adjs.adj_index[AF_IP6]);
+
+  return WALK_CONTINUE;
+}
+
 static void
 lcp_router_link_addr (struct rtnl_link *rl, lcp_itf_pair_t *lip)
 {
@@ -301,10 +322,8 @@ lcp_router_link_addr (struct rtnl_link *rl, lcp_itf_pair_t *lip)
                                          mac_addr_bytes);
 
   /* mcast adjacencies need to be updated */
-  vnet_update_adjacency_for_sw_interface (vnm, lip->lip_phy_sw_if_index,
-                                         lip->lip_phy_adjs.adj_index[AF_IP4]);
-  vnet_update_adjacency_for_sw_interface (vnm, lip->lip_phy_sw_if_index,
-                                         lip->lip_phy_adjs.adj_index[AF_IP6]);
+  vnet_hw_interface_walk_sw (vnm, hw->hw_if_index,
+                            lcp_router_link_addr_adj_upd_cb, NULL);
 }
 
 static void lcp_router_table_flush (lcp_router_table_t *nlt,
@@ -419,6 +438,10 @@ lcp_router_link_add (struct rtnl_link *rl, void *ctx)
                               lip->lip_phy_sw_if_index);
              return;
            }
+
+         /* pool could grow during the previous operation */
+         lip = lcp_itf_pair_get (lipi);
+
          /* create the vlan interface on the parent host */
          if (vnet_create_sub_interface (lip->lip_host_sw_if_index, vlan, 18,
                                         0, vlan, &sub_host_sw_if_index))
@@ -575,9 +598,18 @@ VNET_HW_INTERFACE_LINK_UP_DOWN_FUNCTION (lcp_router_link_up_down);
 static fib_protocol_t
 lcp_router_proto_k2f (uint32_t k)
 {
-  if (AF_INET6 == k)
-    return (FIB_PROTOCOL_IP6);
-  return (FIB_PROTOCOL_IP4);
+  switch (k)
+    {
+    case AF_INET6:
+      return FIB_PROTOCOL_IP6;
+    case AF_INET:
+      return FIB_PROTOCOL_IP4;
+    case AF_MPLS:
+      return FIB_PROTOCOL_MPLS;
+    default:
+      ASSERT (0);
+      return FIB_PROTOCOL_NONE;
+    }
 }
 
 static void
@@ -587,6 +619,7 @@ lcp_router_mk_addr (const struct nl_addr *rna, ip_address_t *ia)
 
   ip_address_reset (ia);
   fproto = lcp_router_proto_k2f (nl_addr_get_family (rna));
+  ASSERT (FIB_PROTOCOL_MPLS != fproto);
 
   ip_address_set (ia, nl_addr_get_binary_addr (rna),
                  FIB_PROTOCOL_IP4 == fproto ? AF_IP4 : AF_IP6);
@@ -598,6 +631,8 @@ lcp_router_mk_addr46 (const struct nl_addr *rna, ip46_address_t *ia)
   fib_protocol_t fproto;
 
   fproto = lcp_router_proto_k2f (nl_addr_get_family (rna));
+  ASSERT (FIB_PROTOCOL_MPLS != fproto);
+
   ip46_address_reset (ia);
   if (FIB_PROTOCOL_IP4 == fproto)
     memcpy (&ia->ip4, nl_addr_get_binary_addr (rna), nl_addr_get_len (rna));
@@ -662,10 +697,25 @@ lcp_router_link_addr_add (struct rtnl_addr *la)
   lcp_router_link_addr_add_del (la, 0);
 }
 
+static walk_rc_t
+lcp_router_address_mark (index_t index, void *ctx)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+
+  lcp_itf_pair_t *lip = lcp_itf_pair_get (index);
+  if (!lip)
+    return WALK_CONTINUE;
+
+  ip_interface_address_mark_one_interface (
+    vnm, vnet_get_sw_interface (vnm, lip->lip_phy_sw_if_index), 0);
+
+  return WALK_CONTINUE;
+}
+
 static void
 lcp_router_link_addr_sync_begin (void)
 {
-  ip_interface_address_mark ();
+  lcp_itf_pair_walk (lcp_router_address_mark, 0);
 
   LCP_ROUTER_INFO ("Begin synchronization of interface addresses");
 }
@@ -695,8 +745,11 @@ lcp_router_neigh_del (struct rtnl_neigh *rn)
     {
       ip_address_t nh;
       int rv;
+      struct nl_addr *rna;
 
-      lcp_router_mk_addr (rtnl_neigh_get_dst (rn), &nh);
+      if ((rna = rtnl_neigh_get_dst (rn)) == NULL)
+       return;
+      lcp_router_mk_addr (rna, &nh);
 
       if (ip46_address_is_multicast (&ip_addr_46 (&nh)))
        {
@@ -744,8 +797,11 @@ lcp_router_neigh_add (struct rtnl_neigh *rn)
       struct nl_addr *ll;
       ip_address_t nh;
       int state;
+      struct nl_addr *rna;
 
-      lcp_router_mk_addr (rtnl_neigh_get_dst (rn), &nh);
+      if ((rna = rtnl_neigh_get_dst (rn)) == NULL)
+       return;
+      lcp_router_mk_addr (rna, &nh);
 
       if (ip46_address_is_multicast (&ip_addr_46 (&nh)))
        {
@@ -795,11 +851,23 @@ lcp_router_neigh_add (struct rtnl_neigh *rn)
                     rtnl_neigh_get_ifindex (rn));
 }
 
+static walk_rc_t
+lcp_router_neighbor_mark (index_t index, void *ctx)
+{
+  lcp_itf_pair_t *lip = lcp_itf_pair_get (index);
+  if (!lip)
+    return WALK_CONTINUE;
+
+  ip_neighbor_walk (AF_IP4, lip->lip_phy_sw_if_index, ip_neighbor_mark_one, 0);
+  ip_neighbor_walk (AF_IP6, lip->lip_phy_sw_if_index, ip_neighbor_mark_one, 0);
+
+  return WALK_CONTINUE;
+}
+
 static void
 lcp_router_neigh_sync_begin (void)
 {
-  ip_neighbor_mark (AF_IP4);
-  ip_neighbor_mark (AF_IP6);
+  lcp_itf_pair_walk (lcp_router_neighbor_mark, 0);
 
   LCP_ROUTER_INFO ("Begin synchronization of neighbors");
 }
@@ -934,9 +1002,31 @@ static void
 lcp_router_route_mk_prefix (struct rtnl_route *r, fib_prefix_t *p)
 {
   const struct nl_addr *addr = rtnl_route_get_dst (r);
+  u32 *baddr = nl_addr_get_binary_addr (addr);
+  u32 blen = nl_addr_get_len (addr);
+  ip46_address_t *paddr = &p->fp_addr;
+  u32 entry;
+
+  p->fp_proto = lcp_router_proto_k2f (nl_addr_get_family (addr));
+
+  switch (p->fp_proto)
+    {
+    case FIB_PROTOCOL_MPLS:
+      entry = ntohl (*baddr);
+      p->fp_label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT;
+      p->fp_len = 21;
+      p->fp_eos = MPLS_NON_EOS;
+      return;
+    case FIB_PROTOCOL_IP4:
+      ip46_address_reset (paddr);
+      memcpy (&paddr->ip4, baddr, blen);
+      break;
+    case FIB_PROTOCOL_IP6:
+      memcpy (&paddr->ip6, baddr, blen);
+      break;
+    }
 
   p->fp_len = nl_addr_get_prefixlen (addr);
-  p->fp_proto = lcp_router_mk_addr46 (addr, &p->fp_addr);
 }
 
 static void
@@ -954,6 +1044,37 @@ lcp_router_route_mk_mprefix (struct rtnl_route *r, mfib_prefix_t *p)
     p->fp_proto = lcp_router_mk_addr46 (addr, &p->fp_src_addr);
 }
 
+static int
+lcp_router_mpls_nladdr_to_path (fib_route_path_t *path, struct nl_addr *addr)
+{
+  if (!addr)
+    return 0;
+
+  struct mpls_label *stack = nl_addr_get_binary_addr (addr);
+  u32 entry, label;
+  u8 exp, ttl;
+  int label_count = 0;
+
+  while (1)
+    {
+      entry = ntohl (stack[label_count++].entry);
+      label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT;
+      exp = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT;
+      ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
+
+      fib_mpls_label_t fml = {
+       .fml_value = label,
+       .fml_exp = exp,
+       .fml_ttl = ttl,
+      };
+      vec_add1 (path->frp_label_stack, fml);
+
+      if (entry & MPLS_LS_S_MASK)
+       break;
+    }
+  return label_count;
+}
+
 typedef struct lcp_router_route_path_parse_t_
 {
   fib_route_path_t *paths;
@@ -969,6 +1090,7 @@ lcp_router_route_path_parse (struct rtnl_nexthop *rnh, void *arg)
   lcp_router_route_path_parse_t *ctx = arg;
   fib_route_path_t *path;
   u32 sw_if_index;
+  int label_count = 0;
 
   sw_if_index = lcp_router_intf_h2p (rtnl_route_nh_get_ifindex (rnh));
 
@@ -981,19 +1103,53 @@ lcp_router_route_path_parse (struct rtnl_nexthop *rnh, void *arg)
 
       path->frp_flags = FIB_ROUTE_PATH_FLAG_NONE | ctx->type_flags;
       path->frp_sw_if_index = sw_if_index;
-      path->frp_weight = rtnl_route_nh_get_weight (rnh);
       path->frp_preference = ctx->preference;
 
+      /*
+       * FIB Path Weight of 0 is meaningless and replaced with 1 further along.
+       * See fib_path_create. fib_path_cmp_w_route_path would fail to match
+       * such a fib_route_path_t with any fib_path_t, because a fib_path_t's
+       * fp_weight can never be 0.
+       */
+      path->frp_weight = clib_max (1, rtnl_route_nh_get_weight (rnh));
+
       addr = rtnl_route_nh_get_gateway (rnh);
+      if (!addr)
+       addr = rtnl_route_nh_get_via (rnh);
 
       if (addr)
-       fproto = lcp_router_mk_addr46 (rtnl_route_nh_get_gateway (rnh),
-                                      &path->frp_addr);
+       fproto = lcp_router_mk_addr46 (addr, &path->frp_addr);
       else
        fproto = ctx->route_proto;
 
       path->frp_proto = fib_proto_to_dpo (fproto);
 
+      if (ctx->route_proto == FIB_PROTOCOL_MPLS)
+       {
+         addr = rtnl_route_nh_get_newdst (rnh);
+         label_count = lcp_router_mpls_nladdr_to_path (path, addr);
+         if (label_count)
+           {
+             LCP_ROUTER_DBG (" is label swap to %u",
+                             path->frp_label_stack[0].fml_value);
+           }
+         else
+           {
+             fib_mpls_label_t fml = {
+               .fml_value = MPLS_LABEL_POP,
+             };
+             vec_add1 (path->frp_label_stack, fml);
+             LCP_ROUTER_DBG (" is label pop");
+           }
+       }
+
+#ifdef NL_CAPABILITY_VERSION_3_6_0
+      addr = rtnl_route_nh_get_encap_mpls_dst (rnh);
+      label_count = lcp_router_mpls_nladdr_to_path (path, addr);
+      if (label_count)
+       LCP_ROUTER_DBG (" has encap mpls, %d labels", label_count);
+#endif
+
       if (ctx->is_mcast)
        path->frp_mitf_flags = MFIB_ITF_FLAG_FORWARD;
 
@@ -1116,11 +1272,20 @@ lcp_router_route_del (struct rtnl_route *rr)
 
       fib_src = lcp_router_proto_fib_source (rproto);
 
-      if (pfx.fp_proto == FIB_PROTOCOL_IP6)
-       fib_table_entry_delete (nlt->nlt_fib_index, &pfx, fib_src);
-      else
-       fib_table_entry_path_remove2 (nlt->nlt_fib_index, &pfx, fib_src,
-                                     np.paths);
+      switch (pfx.fp_proto)
+       {
+       case FIB_PROTOCOL_IP6:
+         fib_table_entry_delete (nlt->nlt_fib_index, &pfx, fib_src);
+         break;
+       case FIB_PROTOCOL_MPLS:
+         fib_table_entry_path_remove2 (nlt->nlt_fib_index, &pfx, fib_src,
+                                       np.paths);
+         /* delete the EOS route in addition to NEOS - fallthrough */
+         pfx.fp_eos = MPLS_EOS;
+       default:
+         fib_table_entry_path_remove2 (nlt->nlt_fib_index, &pfx, fib_src,
+                                       np.paths);
+       }
     }
 
   vec_free (np.paths);
@@ -1128,8 +1293,28 @@ lcp_router_route_del (struct rtnl_route *rr)
   lcp_router_table_unlock (nlt);
 }
 
+static fib_route_path_t *
+lcp_router_fib_route_path_dup (fib_route_path_t *old)
+{
+  int idx;
+  fib_route_path_t *p;
+
+  fib_route_path_t *new = vec_dup (old);
+  if (!new)
+    return NULL;
+
+  for (idx = 0; idx < vec_len (new); idx++)
+    {
+      p = &new[idx];
+      if (p->frp_label_stack)
+       p->frp_label_stack = vec_dup (p->frp_label_stack);
+    }
+
+  return new;
+}
+
 static void
-lcp_router_route_add (struct rtnl_route *rr)
+lcp_router_route_add (struct rtnl_route *rr, int is_replace)
 {
   fib_entry_flag_t entry_flags;
   uint32_t table_id;
@@ -1158,61 +1343,89 @@ lcp_router_route_add (struct rtnl_route *rr)
       LCP_ROUTER_DBG ("route skip: %d:%U %U", rtnl_route_get_table (rr),
                      format_fib_prefix, &pfx, format_fib_entry_flags,
                      entry_flags);
+      return;
     }
-  else
-    {
-      LCP_ROUTER_DBG ("route add: %d:%U %U", rtnl_route_get_table (rr),
-                     format_fib_prefix, &pfx, format_fib_entry_flags,
-                     entry_flags);
+  LCP_ROUTER_DBG ("route %s: %d:%U %U", is_replace ? "replace" : "add",
+                 rtnl_route_get_table (rr), format_fib_prefix, &pfx,
+                 format_fib_entry_flags, entry_flags);
 
-      lcp_router_route_path_parse_t np = {
-       .route_proto = pfx.fp_proto,
-       .is_mcast = (rtype == RTN_MULTICAST),
-       .type_flags = lcp_router_route_type_frpflags[rtype],
-       .preference = (u8) rtnl_route_get_priority (rr),
-      };
+  lcp_router_route_path_parse_t np = {
+    .route_proto = pfx.fp_proto,
+    .is_mcast = (rtype == RTN_MULTICAST),
+    .type_flags = lcp_router_route_type_frpflags[rtype],
+    .preference = (u8) rtnl_route_get_priority (rr),
+  };
 
-      rtnl_route_foreach_nexthop (rr, lcp_router_route_path_parse, &np);
-      lcp_router_route_path_add_special (rr, &np);
+  rtnl_route_foreach_nexthop (rr, lcp_router_route_path_parse, &np);
+  lcp_router_route_path_add_special (rr, &np);
 
-      if (0 != vec_len (np.paths))
+  if (0 != vec_len (np.paths))
+    {
+      if (rtype == RTN_MULTICAST)
        {
-         if (rtype == RTN_MULTICAST)
-           {
-             /* it's not clear to me how linux expresses the RPF paramters
-              * so we'll allow from all interfaces and hope for the best */
-             mfib_prefix_t mpfx = {};
+         /* it's not clear to me how linux expresses the RPF paramters
+          * so we'll allow from all interfaces and hope for the best */
+         mfib_prefix_t mpfx = {};
 
-             lcp_router_route_mk_mprefix (rr, &mpfx);
+         lcp_router_route_mk_mprefix (rr, &mpfx);
 
-             mfib_table_entry_update (
-               nlt->nlt_mfib_index, &mpfx, MFIB_SOURCE_PLUGIN_LOW,
-               MFIB_RPF_ID_NONE, MFIB_ENTRY_FLAG_ACCEPT_ALL_ITF);
+         mfib_table_entry_update (nlt->nlt_mfib_index, &mpfx,
+                                  MFIB_SOURCE_PLUGIN_LOW, MFIB_RPF_ID_NONE,
+                                  MFIB_ENTRY_FLAG_ACCEPT_ALL_ITF);
 
-             mfib_table_entry_paths_update (nlt->nlt_mfib_index, &mpfx,
-                                            MFIB_SOURCE_PLUGIN_LOW,
-                                            MFIB_ENTRY_FLAG_NONE, np.paths);
-           }
-         else
+         mfib_table_entry_paths_update (nlt->nlt_mfib_index, &mpfx,
+                                        MFIB_SOURCE_PLUGIN_LOW,
+                                        MFIB_ENTRY_FLAG_NONE, np.paths);
+       }
+      else
+       {
+         fib_source_t fib_src;
+         const fib_route_path_t *rpath;
+
+         vec_foreach (rpath, np.paths)
            {
-             fib_source_t fib_src;
+             if (fib_route_path_is_attached (rpath))
+               {
+                 entry_flags |= FIB_ENTRY_FLAG_ATTACHED;
+                 break;
+               }
+           }
 
-             fib_src = lcp_router_proto_fib_source (rproto);
+         fib_src = lcp_router_proto_fib_source (rproto);
 
-             if (pfx.fp_proto == FIB_PROTOCOL_IP6)
-               fib_table_entry_path_add2 (nlt->nlt_fib_index, &pfx, fib_src,
-                                          entry_flags, np.paths);
-             else
+         if (pfx.fp_proto == FIB_PROTOCOL_MPLS)
+           {
+             /* in order to avoid double-frees, we duplicate the paths. */
+             fib_route_path_t *pathdup =
+               lcp_router_fib_route_path_dup (np.paths);
+             if (is_replace)
                fib_table_entry_update (nlt->nlt_fib_index, &pfx, fib_src,
-                                       entry_flags, np.paths);
+                                       entry_flags, pathdup);
+             else
+               fib_table_entry_path_add2 (nlt->nlt_fib_index, &pfx, fib_src,
+                                          entry_flags, pathdup);
+             vec_free (pathdup);
+
+             /* install EOS route in addition to NEOS */
+             pfx.fp_eos = MPLS_EOS;
+             pfx.fp_payload_proto = np.paths[0].frp_proto;
            }
+
+         if (is_replace)
+           fib_table_entry_update (nlt->nlt_fib_index, &pfx, fib_src,
+                                   entry_flags, np.paths);
+         else
+           fib_table_entry_path_add2 (nlt->nlt_fib_index, &pfx, fib_src,
+                                      entry_flags, np.paths);
        }
-      else
-       LCP_ROUTER_DBG ("no paths for route add: %d:%U %U",
-                       rtnl_route_get_table (rr), format_fib_prefix, &pfx,
-                       format_fib_entry_flags, entry_flags);
-      vec_free (np.paths);
     }
+  else
+    {
+      LCP_ROUTER_DBG ("no paths for route: %d:%U %U",
+                     rtnl_route_get_table (rr), format_fib_prefix, &pfx,
+                     format_fib_entry_flags, entry_flags);
+    }
+  vec_free (np.paths);
 }
 
 static void