ip: functional interface to ip fragmentation 24/22624/13
author Ole Troan <ot@cisco.com>
Wed, 9 Oct 2019 11:33:19 +0000 (13:33 +0200)
committer Neale Ranns <nranns@cisco.com>
Mon, 11 Nov 2019 12:33:36 +0000 (12:33 +0000)
This provides a functional interface to IP fragmentation,
allowing external features to fragment packets. It supports
an arbitrary encap size, e.g. for MPLS or inner fragmentation
of tunnels.

This also removes the dual loop in MAP, which was fundamentally broken.

Type: fix
Signed-off-by: Ole Troan <ot@cisco.com>
Change-Id: Ia89ecec8ee3cbe2416edbe87630fdb714898c2a8
Signed-off-by: Ole Troan <ot@cisco.com>
12 files changed:
src/plugins/map/ip4_map.c
src/plugins/map/ip4_map_t.c
src/plugins/map/ip6_map.c
src/plugins/map/ip6_map_t.c
src/plugins/map/test/test_map.py
src/vnet/ip/ip4_forward.c
src/vnet/ip/ip6_forward.c
src/vnet/ip/ip_frag.c
src/vnet/ip/ip_frag.h
src/vnet/mpls/mpls_output.c
test/test_mpls.py
test/vpp_interface.py

index ad94907..f2a0090 100644 (file)
@@ -26,8 +26,6 @@ enum ip4_map_next_e
 #ifdef MAP_SKIP_IP6_LOOKUP
   IP4_MAP_NEXT_IP6_REWRITE,
 #endif
-  IP4_MAP_NEXT_IP4_FRAGMENT,
-  IP4_MAP_NEXT_IP6_FRAGMENT,
   IP4_MAP_NEXT_ICMP_ERROR,
   IP4_MAP_NEXT_DROP,
   IP4_MAP_N_NEXT,
@@ -117,17 +115,26 @@ ip4_map_decrement_ttl (ip4_header_t * ip, u8 * error)
 }
 
 static u32
-ip4_map_fragment (vlib_buffer_t * b, u16 mtu, bool df, u8 * error)
+ip4_map_fragment (vlib_main_t * vm, u32 bi, u16 mtu, bool df, u32 ** buffers,
+                 u8 * error)
 {
   map_main_t *mm = &map_main;
+  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
 
   if (mm->frag_inner)
     {
-      // TODO: Fix inner fragmentation after removed inner support from ip-frag.
-      ip_frag_set_vnet_buffer (b, /*sizeof (ip6_header_t), */ mtu,
-                              IP4_FRAG_NEXT_IP6_LOOKUP,
-                              IP_FRAG_FLAG_IP6_HEADER);
-      return (IP4_MAP_NEXT_IP4_FRAGMENT);
+      /* IPv4 fragmented packets inside of IPv6 */
+      ip4_frag_do_fragment (vm, bi, mtu, sizeof (ip6_header_t), buffers);
+
+      /* Fixup */
+      u32 *i;
+      vec_foreach (i, *buffers)
+      {
+       vlib_buffer_t *p = vlib_get_buffer (vm, *i);
+       ip6_header_t *ip6 = vlib_buffer_get_current (p);
+       ip6->payload_length =
+         clib_host_to_net_u16 (p->current_length - sizeof (ip6_header_t));
+      }
     }
   else
     {
@@ -140,10 +147,11 @@ ip4_map_fragment (vlib_buffer_t * b, u16 mtu, bool df, u8 * error)
          *error = MAP_ERROR_DF_SET;
          return (IP4_MAP_NEXT_ICMP_ERROR);
        }
-      ip_frag_set_vnet_buffer (b, mtu, IP6_FRAG_NEXT_IP6_LOOKUP,
-                              IP_FRAG_FLAG_IP6_HEADER);
-      return (IP4_MAP_NEXT_IP6_FRAGMENT);
+
+      /* Create IPv6 fragments here */
+      ip6_frag_do_fragment (vm, bi, mtu, 0, buffers);
     }
+  return (IP4_MAP_NEXT_IP6_LOOKUP);
 }
 
 /*
@@ -165,189 +173,6 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
   while (n_left_from > 0)
     {
       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
-
-      /* Dual loop */
-      while (n_left_from >= 4 && n_left_to_next >= 2)
-       {
-         u32 pi0, pi1;
-         vlib_buffer_t *p0, *p1;
-         map_domain_t *d0, *d1;
-         u8 error0 = MAP_ERROR_NONE, error1 = MAP_ERROR_NONE;
-         ip4_header_t *ip40, *ip41;
-         u16 port0 = 0, port1 = 0;
-         ip6_header_t *ip6h0, *ip6h1;
-         u32 map_domain_index0 = ~0, map_domain_index1 = ~0;
-         u32 next0 = IP4_MAP_NEXT_IP6_LOOKUP, next1 =
-           IP4_MAP_NEXT_IP6_LOOKUP;
-
-         /* Prefetch next iteration. */
-         {
-           vlib_buffer_t *p2, *p3;
-
-           p2 = vlib_get_buffer (vm, from[2]);
-           p3 = vlib_get_buffer (vm, from[3]);
-
-           vlib_prefetch_buffer_header (p2, STORE);
-           vlib_prefetch_buffer_header (p3, STORE);
-           /* IPv4 + 8 = 28. possibly plus -40 */
-           CLIB_PREFETCH (p2->data - 40, 68, STORE);
-           CLIB_PREFETCH (p3->data - 40, 68, STORE);
-         }
-
-         pi0 = to_next[0] = from[0];
-         pi1 = to_next[1] = from[1];
-         from += 2;
-         n_left_from -= 2;
-         to_next += 2;
-         n_left_to_next -= 2;
-
-         p0 = vlib_get_buffer (vm, pi0);
-         p1 = vlib_get_buffer (vm, pi1);
-         ip40 = vlib_buffer_get_current (p0);
-         ip41 = vlib_buffer_get_current (p1);
-         d0 =
-           ip4_map_get_domain (&ip40->dst_address, &map_domain_index0,
-                               &error0);
-         d1 =
-           ip4_map_get_domain (&ip41->dst_address, &map_domain_index1,
-                               &error1);
-
-         /*
-          * Shared IPv4 address
-          */
-         port0 = ip4_map_port_and_security_check (d0, p0, &error0);
-         port1 = ip4_map_port_and_security_check (d1, p1, &error1);
-
-         /* Decrement IPv4 TTL */
-         ip4_map_decrement_ttl (ip40, &error0);
-         ip4_map_decrement_ttl (ip41, &error1);
-         bool df0 =
-           ip40->flags_and_fragment_offset &
-           clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT);
-         bool df1 =
-           ip41->flags_and_fragment_offset &
-           clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT);
-
-         /* MAP calc */
-         u32 da40 = clib_net_to_host_u32 (ip40->dst_address.as_u32);
-         u32 da41 = clib_net_to_host_u32 (ip41->dst_address.as_u32);
-         u16 dp40 = clib_net_to_host_u16 (port0);
-         u16 dp41 = clib_net_to_host_u16 (port1);
-         u64 dal60 = map_get_pfx (d0, da40, dp40);
-         u64 dal61 = map_get_pfx (d1, da41, dp41);
-         u64 dar60 = map_get_sfx (d0, da40, dp40);
-         u64 dar61 = map_get_sfx (d1, da41, dp41);
-         if (dal60 == 0 && dar60 == 0 && error0 == MAP_ERROR_NONE)
-           error0 = MAP_ERROR_NO_BINDING;
-         if (dal61 == 0 && dar61 == 0 && error1 == MAP_ERROR_NONE)
-           error1 = MAP_ERROR_NO_BINDING;
-
-         /* construct ipv6 header */
-         vlib_buffer_advance (p0, -sizeof (ip6_header_t));
-         vlib_buffer_advance (p1, -sizeof (ip6_header_t));
-         ip6h0 = vlib_buffer_get_current (p0);
-         ip6h1 = vlib_buffer_get_current (p1);
-         vnet_buffer (p0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
-         vnet_buffer (p1)->sw_if_index[VLIB_TX] = (u32) ~ 0;
-
-         ip6h0->ip_version_traffic_class_and_flow_label =
-           ip4_map_vtcfl (ip40, p0);
-         ip6h1->ip_version_traffic_class_and_flow_label =
-           ip4_map_vtcfl (ip41, p1);
-         ip6h0->payload_length = ip40->length;
-         ip6h1->payload_length = ip41->length;
-         ip6h0->protocol = IP_PROTOCOL_IP_IN_IP;
-         ip6h1->protocol = IP_PROTOCOL_IP_IN_IP;
-         ip6h0->hop_limit = 0x40;
-         ip6h1->hop_limit = 0x40;
-         ip6h0->src_address = d0->ip6_src;
-         ip6h1->src_address = d1->ip6_src;
-         ip6h0->dst_address.as_u64[0] = clib_host_to_net_u64 (dal60);
-         ip6h0->dst_address.as_u64[1] = clib_host_to_net_u64 (dar60);
-         ip6h1->dst_address.as_u64[0] = clib_host_to_net_u64 (dal61);
-         ip6h1->dst_address.as_u64[1] = clib_host_to_net_u64 (dar61);
-
-         /*
-          * Determine next node. Can be one of:
-          * ip6-lookup, ip6-rewrite, ip4-fragment, error-drop
-          */
-         if (PREDICT_TRUE (error0 == MAP_ERROR_NONE))
-           {
-             if (PREDICT_FALSE
-                 (d0->mtu
-                  && (clib_net_to_host_u16 (ip6h0->payload_length) +
-                      sizeof (*ip6h0) > d0->mtu)))
-               {
-                 next0 = ip4_map_fragment (p0, d0->mtu, df0, &error0);
-               }
-             else
-               {
-                 next0 =
-                   ip4_map_ip6_lookup_bypass (p0,
-                                              ip40) ?
-                   IP4_MAP_NEXT_IP6_REWRITE : next0;
-                 vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX,
-                                                  thread_index,
-                                                  map_domain_index0, 1,
-                                                  clib_net_to_host_u16
-                                                  (ip6h0->payload_length) +
-                                                  40);
-               }
-           }
-         else
-           {
-             next0 = IP4_MAP_NEXT_DROP;
-           }
-
-         /*
-          * Determine next node. Can be one of:
-          * ip6-lookup, ip6-rewrite, ip4-fragment, error-drop
-          */
-         if (PREDICT_TRUE (error1 == MAP_ERROR_NONE))
-           {
-             if (PREDICT_FALSE
-                 (d1->mtu
-                  && (clib_net_to_host_u16 (ip6h1->payload_length) +
-                      sizeof (*ip6h1) > d1->mtu)))
-               {
-                 next1 = ip4_map_fragment (p1, d1->mtu, df1, &error1);
-               }
-             else
-               {
-                 next1 =
-                   ip4_map_ip6_lookup_bypass (p1,
-                                              ip41) ?
-                   IP4_MAP_NEXT_IP6_REWRITE : next1;
-                 vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX,
-                                                  thread_index,
-                                                  map_domain_index1, 1,
-                                                  clib_net_to_host_u16
-                                                  (ip6h1->payload_length) +
-                                                  40);
-               }
-           }
-         else
-           {
-             next1 = IP4_MAP_NEXT_DROP;
-           }
-
-         if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
-           {
-             map_add_trace (vm, node, p0, map_domain_index0, port0);
-           }
-         if (PREDICT_FALSE (p1->flags & VLIB_BUFFER_IS_TRACED))
-           {
-             map_add_trace (vm, node, p1, map_domain_index1, port0);
-           }
-
-         p0->error = error_node->errors[error0];
-         p1->error = error_node->errors[error1];
-
-         vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next,
-                                          n_left_to_next, pi0, pi1, next0,
-                                          next1);
-       }
-
       while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 pi0;
@@ -359,12 +184,13 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
          ip6_header_t *ip6h0;
          u32 next0 = IP4_MAP_NEXT_IP6_LOOKUP;
          u32 map_domain_index0 = ~0;
+         u32 *buffer0 = 0;
+         bool free_original_buffer0 = false;
+         u32 *frag_from0, frag_left0;
 
          pi0 = to_next[0] = from[0];
          from += 1;
          n_left_from -= 1;
-         to_next += 1;
-         n_left_to_next -= 1;
 
          p0 = vlib_get_buffer (vm, pi0);
          ip40 = vlib_buffer_get_current (p0);
@@ -413,7 +239,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 
          /*
           * Determine next node. Can be one of:
-          * ip6-lookup, ip6-rewrite, ip4-fragment, error-drop
+          * ip6-lookup, ip6-rewrite, error-drop
           */
          if (PREDICT_TRUE (error0 == MAP_ERROR_NONE))
            {
@@ -422,7 +248,14 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
                   && (clib_net_to_host_u16 (ip6h0->payload_length) +
                       sizeof (*ip6h0) > d0->mtu)))
                {
-                 next0 = ip4_map_fragment (p0, d0->mtu, df0, &error0);
+                 next0 =
+                   ip4_map_fragment (vm, pi0, d0->mtu, df0, &buffer0,
+                                     &error0);
+
+                 if (error0 == MAP_ERROR_NONE)
+                   {
+                     free_original_buffer0 = true;
+                   }
                }
              else
                {
@@ -450,8 +283,41 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 
          p0->error = error_node->errors[error0];
        exit:
-         vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
-                                          n_left_to_next, pi0, next0);
+         /* Send fragments that were added in the frame */
+         if (free_original_buffer0)
+           {
+             vlib_buffer_free_one (vm, pi0);   /* Free original packet */
+           }
+         else
+           {
+             vec_add1 (buffer0, pi0);
+           }
+
+         frag_from0 = buffer0;
+         frag_left0 = vec_len (buffer0);
+
+         while (frag_left0 > 0)
+           {
+             while (frag_left0 > 0 && n_left_to_next > 0)
+               {
+                 u32 i0;
+                 i0 = to_next[0] = frag_from0[0];
+                 frag_from0 += 1;
+                 frag_left0 -= 1;
+                 to_next += 1;
+                 n_left_to_next -= 1;
+
+                 vlib_get_buffer (vm, i0)->error =
+                   error_node->errors[error0];
+                 vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+                                                  to_next, n_left_to_next,
+                                                  i0, next0);
+               }
+             vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+             vlib_get_next_frame (vm, node, next_index, to_next,
+                                  n_left_to_next);
+           }
+         vec_reset_length (buffer0);
        }
       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
     }
@@ -491,8 +357,6 @@ VLIB_REGISTER_NODE(ip4_map_node) = {
 #ifdef MAP_SKIP_IP6_LOOKUP
     [IP4_MAP_NEXT_IP6_REWRITE] = "ip6-load-balance",
 #endif
-    [IP4_MAP_NEXT_IP4_FRAGMENT] = "ip4-frag",
-    [IP4_MAP_NEXT_IP6_FRAGMENT] = "ip6-frag",
     [IP4_MAP_NEXT_ICMP_ERROR] = "ip4-icmp-error",
     [IP4_MAP_NEXT_DROP] = "error-drop",
   },
index 621fb06..c254efc 100644 (file)
@@ -168,7 +168,7 @@ ip4_map_t_icmp (vlib_main_t * vm,
          if (vnet_buffer (p0)->map_t.mtu < p0->current_length)
            {
              vnet_buffer (p0)->ip_frag.mtu = vnet_buffer (p0)->map_t.mtu;
-             vnet_buffer (p0)->ip_frag.next_index = IP6_FRAG_NEXT_IP6_LOOKUP;
+             vnet_buffer (p0)->ip_frag.next_index = IP_FRAG_NEXT_IP6_LOOKUP;
              next0 = IP4_MAPT_ICMP_NEXT_IP6_FRAG;
            }
        err0:
@@ -287,7 +287,7 @@ ip4_map_t_fragmented (vlib_main_t * vm,
                {
                  vnet_buffer (p0)->ip_frag.mtu = vnet_buffer (p0)->map_t.mtu;
                  vnet_buffer (p0)->ip_frag.next_index =
-                   IP6_FRAG_NEXT_IP6_LOOKUP;
+                   IP_FRAG_NEXT_IP6_LOOKUP;
                  next0 = IP4_MAPT_FRAGMENTED_NEXT_IP6_FRAG;
                }
            }
@@ -453,7 +453,7 @@ ip4_map_t_tcp_udp (vlib_main_t * vm,
                  //Send to fragmentation node if necessary
                  vnet_buffer (p0)->ip_frag.mtu = vnet_buffer (p0)->map_t.mtu;
                  vnet_buffer (p0)->ip_frag.next_index =
-                   IP6_FRAG_NEXT_IP6_LOOKUP;
+                   IP_FRAG_NEXT_IP6_LOOKUP;
                  next0 = IP4_MAPT_TCP_UDP_NEXT_IP6_FRAG;
                }
            }
index 96f81ef..47958f9 100644 (file)
@@ -314,7 +314,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
                    {
                      vnet_buffer (p0)->ip_frag.flags = 0;
                      vnet_buffer (p0)->ip_frag.next_index =
-                       IP4_FRAG_NEXT_IP4_LOOKUP;
+                       IP_FRAG_NEXT_IP4_LOOKUP;
                      vnet_buffer (p0)->ip_frag.mtu = d0->mtu;
                      next0 = IP6_MAP_NEXT_IP4_FRAGMENT;
                    }
@@ -346,7 +346,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
                    {
                      vnet_buffer (p1)->ip_frag.flags = 0;
                      vnet_buffer (p1)->ip_frag.next_index =
-                       IP4_FRAG_NEXT_IP4_LOOKUP;
+                       IP_FRAG_NEXT_IP4_LOOKUP;
                      vnet_buffer (p1)->ip_frag.mtu = d1->mtu;
                      next1 = IP6_MAP_NEXT_IP4_FRAGMENT;
                    }
@@ -497,7 +497,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
                    {
                      vnet_buffer (p0)->ip_frag.flags = 0;
                      vnet_buffer (p0)->ip_frag.next_index =
-                       IP4_FRAG_NEXT_IP4_LOOKUP;
+                       IP_FRAG_NEXT_IP4_LOOKUP;
                      vnet_buffer (p0)->ip_frag.mtu = d0->mtu;
                      next0 = IP6_MAP_NEXT_IP4_FRAGMENT;
                    }
@@ -622,7 +622,7 @@ ip6_map_post_ip4_reass (vlib_main_t * vm,
               && error0 == MAP_ERROR_NONE))
            {
              vnet_buffer (p0)->ip_frag.flags = 0;
-             vnet_buffer (p0)->ip_frag.next_index = IP4_FRAG_NEXT_IP4_LOOKUP;
+             vnet_buffer (p0)->ip_frag.next_index = IP_FRAG_NEXT_IP4_LOOKUP;
              vnet_buffer (p0)->ip_frag.mtu = d0->mtu;
              next0 = IP6_MAP_POST_IP4_REASS_NEXT_IP4_FRAGMENT;
            }
index 6e9c0d7..ef7b913 100644 (file)
@@ -169,7 +169,7 @@ ip6_map_t_icmp (vlib_main_t * vm,
            {
              // Send to fragmentation node if necessary
              vnet_buffer (p0)->ip_frag.mtu = vnet_buffer (p0)->map_t.mtu;
-             vnet_buffer (p0)->ip_frag.next_index = IP4_FRAG_NEXT_IP4_LOOKUP;
+             vnet_buffer (p0)->ip_frag.next_index = IP_FRAG_NEXT_IP4_LOOKUP;
              next0 = IP6_MAPT_ICMP_NEXT_IP4_FRAG;
            }
        err0:
@@ -288,7 +288,7 @@ ip6_map_t_fragmented (vlib_main_t * vm,
                  // Send to fragmentation node if necessary
                  vnet_buffer (p0)->ip_frag.mtu = vnet_buffer (p0)->map_t.mtu;
                  vnet_buffer (p0)->ip_frag.next_index =
-                   IP4_FRAG_NEXT_IP4_LOOKUP;
+                   IP_FRAG_NEXT_IP4_LOOKUP;
                  next0 = IP6_MAPT_FRAGMENTED_NEXT_IP4_FRAG;
                }
            }
@@ -441,7 +441,7 @@ ip6_map_t_tcp_udp (vlib_main_t * vm,
                  // Send to fragmentation node if necessary
                  vnet_buffer (p0)->ip_frag.mtu = vnet_buffer (p0)->map_t.mtu;
                  vnet_buffer (p0)->ip_frag.next_index =
-                   IP4_FRAG_NEXT_IP4_LOOKUP;
+                   IP_FRAG_NEXT_IP4_LOOKUP;
                  next0 = IP6_MAPT_TCP_UDP_NEXT_IP4_FRAG;
                }
            }
index a7e5f16..c1fe05e 100644 (file)
@@ -140,7 +140,7 @@ class TestMAP(VppTestCase):
               IP(src=self.pg0.remote_ip4, dst=self.pg0.remote_ip4) /
               UDP(sport=20000, dport=10000) /
               Raw(b'\xa5' * 100))
-        rx = self.send_and_expect(self.pg0, v4*1, self.pg0)
+        rx = self.send_and_expect(self.pg0, v4 * 4, self.pg0)
         v4_reply = v4[1]
         v4_reply.ttl -= 1
         for p in rx:
@@ -154,7 +154,7 @@ class TestMAP(VppTestCase):
               UDP(sport=20000, dport=10000) /
               Raw(b'\xa5' * 100))
 
-        self.send_and_assert_encapped_one(v4, "3000::1", map_translated_addr)
+        self.send_and_assert_encapped(v4 * 4, "3000::1", map_translated_addr)
 
         #
         # Verify reordered fragments are able to pass as well
@@ -294,6 +294,76 @@ class TestMAP(VppTestCase):
         pre_res_route.remove_vpp_config()
         self.vapi.ppcli("map params pre-resolve del ip6-nh 4001::1")
 
+    def test_map_e_inner_frag(self):
+        """ MAP-E Inner fragmentation """
+
+        #
+        # Add a route to the MAP-BR
+        #
+        map_br_pfx = "2001::"
+        map_br_pfx_len = 32
+        map_route = VppIpRoute(self,
+                               map_br_pfx,
+                               map_br_pfx_len,
+                               [VppRoutePath(self.pg1.remote_ip6,
+                                             self.pg1.sw_if_index)])
+        map_route.add_vpp_config()
+
+        #
+        # Add a domain that maps from pg0 to pg1
+        #
+        map_dst = '2001::/32'
+        map_src = '3000::1/128'
+        client_pfx = '192.168.0.0/16'
+        map_translated_addr = '2001:0:101:7000:0:c0a8:101:7'
+        tag = 'MAP-E tag.'
+        self.vapi.map_add_domain(ip4_prefix=client_pfx,
+                                 ip6_prefix=map_dst,
+                                 ip6_src=map_src,
+                                 ea_bits_len=20,
+                                 psid_offset=4,
+                                 psid_length=4,
+                                 mtu=1000,
+                                 tag=tag)
+
+        # Enable MAP on interface.
+        self.vapi.map_if_enable_disable(is_enable=1,
+                                        sw_if_index=self.pg0.sw_if_index,
+                                        is_translation=0)
+
+        # Enable inner fragmentation
+        self.vapi.map_param_set_fragmentation(inner=1)
+
+        v4 = (Ether(dst=self.pg0.local_mac, src=self.pg0.remote_mac) /
+              IP(src=self.pg0.remote_ip4, dst='192.168.1.1') /
+              UDP(sport=20000, dport=10000) /
+              Raw(b'\xa5' * 1300))
+
+        self.pg_send(self.pg0, v4*1)
+        rx = self.pg1.get_capture(2)
+
+        frags = fragment_rfc791(v4[1], 1000)
+        frags[0].id = 0
+        frags[1].id = 0
+        frags[0].ttl -= 1
+        frags[1].ttl -= 1
+        frags[0].chksum = 0
+        frags[1].chksum = 0
+
+        v6_reply1 = (IPv6(src='3000::1', dst=map_translated_addr, hlim=63) /
+                     frags[0])
+        v6_reply2 = (IPv6(src='3000::1', dst=map_translated_addr, hlim=63) /
+                     frags[1])
+        rx[0][1].fl = 0
+        rx[1][1].fl = 0
+        rx[0][1][IP].id = 0
+        rx[1][1][IP].id = 0
+        rx[0][1][IP].chksum = 0
+        rx[1][1][IP].chksum = 0
+
+        self.validate(rx[0][1], v6_reply1)
+        self.validate(rx[1][1], v6_reply2)
+
     def validate(self, rx, expected):
         self.assertEqual(rx, expected.__class__(scapy.compat.raw(expected)))
 
index 1550b31..44a6819 100644 (file)
@@ -2293,8 +2293,8 @@ typedef enum
 
 always_inline void
 ip4_mtu_check (vlib_buffer_t * b, u16 packet_len,
-              u16 adj_packet_bytes, bool df, u16 * next, u32 * error,
-              u8 is_midchain)
+              u16 adj_packet_bytes, bool df, u16 * next,
+              u8 is_midchain, u32 * error)
 {
   if (packet_len > adj_packet_bytes)
     {
@@ -2312,8 +2312,8 @@ ip4_mtu_check (vlib_buffer_t * b, u16 packet_len,
          /* IP fragmentation */
          ip_frag_set_vnet_buffer (b, adj_packet_bytes,
                                   (is_midchain ?
-                                   IP4_FRAG_NEXT_IP4_REWRITE_MIDCHAIN :
-                                   IP4_FRAG_NEXT_IP4_REWRITE), 0);
+                                   IP_FRAG_NEXT_IP_REWRITE_MIDCHAIN :
+                                   IP_FRAG_NEXT_IP_REWRITE), 0);
          *next = IP4_REWRITE_NEXT_FRAGMENT;
        }
     }
@@ -2486,12 +2486,12 @@ ip4_rewrite_inline_with_gso (vlib_main_t * vm,
                     adj0[0].rewrite_header.max_l3_packet_bytes,
                     ip0->flags_and_fragment_offset &
                     clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
-                    next + 0, &error0, is_midchain);
+                    next + 0, is_midchain, &error0);
       ip4_mtu_check (b[1], ip1_len,
                     adj1[0].rewrite_header.max_l3_packet_bytes,
                     ip1->flags_and_fragment_offset &
                     clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
-                    next + 1, &error1, is_midchain);
+                    next + 1, is_midchain, &error1);
 
       if (is_mcast)
        {
@@ -2660,7 +2660,7 @@ ip4_rewrite_inline_with_gso (vlib_main_t * vm,
                     adj0[0].rewrite_header.max_l3_packet_bytes,
                     ip0->flags_and_fragment_offset &
                     clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
-                    next + 0, &error0, is_midchain);
+                    next + 0, is_midchain, &error0);
 
       if (is_mcast)
        {
@@ -2758,7 +2758,7 @@ ip4_rewrite_inline_with_gso (vlib_main_t * vm,
                     adj0[0].rewrite_header.max_l3_packet_bytes,
                     ip0->flags_and_fragment_offset &
                     clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
-                    next + 0, &error0, is_midchain);
+                    next + 0, is_midchain, &error0);
 
       if (is_mcast)
        {
index 50de501..9656621 100644 (file)
@@ -1652,7 +1652,7 @@ typedef enum
 always_inline void
 ip6_mtu_check (vlib_buffer_t * b, u16 packet_bytes,
               u16 adj_packet_bytes, bool is_locally_generated,
-              u32 * next, u32 * error)
+              u32 * next, u8 is_midchain, u32 * error)
 {
   if (adj_packet_bytes >= 1280 && packet_bytes > adj_packet_bytes)
     {
@@ -1660,7 +1660,9 @@ ip6_mtu_check (vlib_buffer_t * b, u16 packet_bytes,
        {
          /* IP fragmentation */
          ip_frag_set_vnet_buffer (b, adj_packet_bytes,
-                                  IP6_FRAG_NEXT_IP6_REWRITE, 0);
+                                  (is_midchain ?
+                                   IP_FRAG_NEXT_IP_REWRITE_MIDCHAIN :
+                                   IP_FRAG_NEXT_IP_REWRITE), 0);
          *next = IP6_REWRITE_NEXT_FRAGMENT;
          *error = IP6_ERROR_MTU_EXCEEDED;
        }
@@ -1840,10 +1842,12 @@ ip6_rewrite_inline_with_gso (vlib_main_t * vm,
 
          ip6_mtu_check (p0, ip0_len,
                         adj0[0].rewrite_header.max_l3_packet_bytes,
-                        is_locally_originated0, &next0, &error0);
+                        is_locally_originated0, &next0, is_midchain,
+                        &error0);
          ip6_mtu_check (p1, ip1_len,
                         adj1[0].rewrite_header.max_l3_packet_bytes,
-                        is_locally_originated1, &next1, &error1);
+                        is_locally_originated1, &next1, is_midchain,
+                        &error1);
 
          /* Don't adjust the buffer for hop count issue; icmp-error node
           * wants to see the IP header */
@@ -2011,7 +2015,8 @@ ip6_rewrite_inline_with_gso (vlib_main_t * vm,
 
          ip6_mtu_check (p0, ip0_len,
                         adj0[0].rewrite_header.max_l3_packet_bytes,
-                        is_locally_originated0, &next0, &error0);
+                        is_locally_originated0, &next0, is_midchain,
+                        &error0);
 
          /* Don't adjust the buffer for hop count issue; icmp-error node
           * wants to see the IP header */
index 54efb63..9aa8777 100644 (file)
 
 #include <vnet/ip/ip.h>
 
-/*
- * Copy the mpls header if present.
- * The current is pointing to the ip header.
- * Adjust the buffer and point to the mpls headers on these fragments
- * before sending the packet back to mpls-output node.
- */
-static inline void
-copy_mpls_hdr (vlib_buffer_t * to_b, vlib_buffer_t * from_b)
-{
-  if ((vnet_buffer (from_b)->ip_frag.flags) & IP_FRAG_FLAG_MPLS_HEADER)
-    {
-      u8 mpls_hdr_length = vnet_buffer (from_b)->mpls.mpls_hdr_length;
-      u8 *org_from_mpls_packet =
-       from_b->data + (from_b->current_data - mpls_hdr_length);
-      clib_memcpy_fast ((to_b->data - mpls_hdr_length), org_from_mpls_packet,
-                       mpls_hdr_length);
-      vlib_buffer_advance (to_b, -vnet_buffer (to_b)->mpls.mpls_hdr_length);
-    }
-}
-
 typedef struct
 {
   u8 ipv6;
@@ -87,14 +67,6 @@ frag_set_sw_if_index (vlib_buffer_t * to, vlib_buffer_t * from)
       vnet_buffer2 (to)->qos = vnet_buffer2 (from)->qos;
       to->flags |= VNET_BUFFER_F_QOS_DATA_VALID;
     }
-
-  /* Copy mpls opaque data */
-  if ((vnet_buffer (from)->ip_frag.flags) & IP_FRAG_FLAG_MPLS_HEADER)
-    {
-      vnet_buffer (to)->mpls.pyld_proto = vnet_buffer (from)->mpls.pyld_proto;
-      vnet_buffer (to)->mpls.mpls_hdr_length =
-       vnet_buffer (from)->mpls.mpls_hdr_length;
-    }
 }
 
 static vlib_buffer_t *
@@ -116,20 +88,20 @@ frag_buffer_alloc (vlib_buffer_t * org_b, u32 * bi)
  * but does not generate buffer chains. I.e. a fragment is always
  * contained with in a single buffer and limited to the max buffer
  * size.
+ * from_bi: current pointer must point to IPv4 header
  */
-void
-ip4_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer,
-                     ip_frag_error_t * error)
+ip_frag_error_t
+ip4_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u16 mtu,
+                     u16 l2unfragmentablesize, u32 ** buffer)
 {
   vlib_buffer_t *from_b;
   ip4_header_t *ip4;
-  u16 mtu, len, max, rem, ip_frag_id, ip_frag_offset;
+  u16 len, max, rem, ip_frag_id, ip_frag_offset;
   u8 *org_from_packet, more;
 
   from_b = vlib_get_buffer (vm, from_bi);
-  mtu = vnet_buffer (from_b)->ip_frag.mtu;
   org_from_packet = vlib_buffer_get_current (from_b);
-  ip4 = (ip4_header_t *) vlib_buffer_get_current (from_b);
+  ip4 = vlib_buffer_get_current (from_b) + l2unfragmentablesize;
 
   rem = clib_net_to_host_u16 (ip4->length) - sizeof (ip4_header_t);
   max =
@@ -139,21 +111,18 @@ ip4_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer,
   if (rem >
       (vlib_buffer_length_in_chain (vm, from_b) - sizeof (ip4_header_t)))
     {
-      *error = IP_FRAG_ERROR_MALFORMED;
-      return;
+      return IP_FRAG_ERROR_MALFORMED;
     }
 
   if (mtu < sizeof (ip4_header_t))
     {
-      *error = IP_FRAG_ERROR_CANT_FRAGMENT_HEADER;
-      return;
+      return IP_FRAG_ERROR_CANT_FRAGMENT_HEADER;
     }
 
   if (ip4->flags_and_fragment_offset &
       clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT))
     {
-      *error = IP_FRAG_ERROR_DONT_FRAGMENT_SET;
-      return;
+      return IP_FRAG_ERROR_DONT_FRAGMENT_SET;
     }
 
   if (ip4_is_fragment (ip4))
@@ -174,7 +143,8 @@ ip4_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer,
   u8 *from_data = (void *) (ip4 + 1);
   vlib_buffer_t *org_from_b = from_b;
   u16 fo = 0;
-  u16 left_in_from_buffer = from_b->current_length - sizeof (ip4_header_t);
+  u16 left_in_from_buffer =
+    from_b->current_length - (l2unfragmentablesize + sizeof (ip4_header_t));
   u16 ptr = 0;
 
   /* Do the actual fragmentation */
@@ -190,17 +160,19 @@ ip4_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer,
        len &= ~0x7;
       if ((to_b = frag_buffer_alloc (org_from_b, &to_bi)) == 0)
        {
-         *error = IP_FRAG_ERROR_MEMORY;
-         return;
+         return IP_FRAG_ERROR_MEMORY;
        }
       vec_add1 (*buffer, to_bi);
       frag_set_sw_if_index (to_b, org_from_b);
 
       /* Copy ip4 header */
-      clib_memcpy_fast (to_b->data, org_from_packet, sizeof (ip4_header_t));
-      to_ip4 = vlib_buffer_get_current (to_b);
+      to_data = vlib_buffer_get_current (to_b);
+      clib_memcpy_fast (to_data, org_from_packet,
+                       l2unfragmentablesize + sizeof (ip4_header_t));
+      to_ip4 = (ip4_header_t *) (to_data + l2unfragmentablesize);
       to_data = (void *) (to_ip4 + 1);
       vnet_buffer (to_b)->l3_hdr_offset = to_b->current_data;
+      vlib_buffer_copy_trace_flag (vm, from_b, to_bi);
       to_b->flags |= VNET_BUFFER_F_L3_HDR_OFFSET_VALID;
 
       if (from_b->flags & VNET_BUFFER_F_L4_HDR_OFFSET_VALID)
@@ -232,8 +204,7 @@ ip4_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer,
          /* Move buffer */
          if (!(from_b->flags & VLIB_BUFFER_NEXT_PRESENT))
            {
-             *error = IP_FRAG_ERROR_MALFORMED;
-             return;
+             return IP_FRAG_ERROR_MALFORMED;
            }
          from_b = vlib_get_buffer (vm, from_b->next_buffer);
          from_data = (u8 *) vlib_buffer_get_current (from_b);
@@ -242,8 +213,9 @@ ip4_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer,
          to_ptr += bytes_to_copy;
        }
 
-      to_b->current_length = len + sizeof (ip4_header_t);
       to_b->flags |= VNET_BUFFER_F_IS_IP4;
+      to_b->current_length =
+       len + sizeof (ip4_header_t) + l2unfragmentablesize;
 
       to_ip4->fragment_id = ip_frag_id;
       to_ip4->flags_and_fragment_offset =
@@ -256,31 +228,11 @@ ip4_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer,
       /* we've just done the IP checksum .. */
       to_b->flags &= ~VNET_BUFFER_F_OFFLOAD_IP_CKSUM;
 
-      if (vnet_buffer (org_from_b)->ip_frag.flags & IP_FRAG_FLAG_IP4_HEADER)
-       {
-         /* Encapsulating ipv4 header */
-         ip4_header_t *encap_header4 =
-           (ip4_header_t *) vlib_buffer_get_current (to_b);
-         encap_header4->length = clib_host_to_net_u16 (to_b->current_length);
-         encap_header4->checksum = ip4_header_checksum (encap_header4);
-       }
-      else if (vnet_buffer (org_from_b)->
-              ip_frag.flags & IP_FRAG_FLAG_IP6_HEADER)
-       {
-         /* Encapsulating ipv6 header */
-         ip6_header_t *encap_header6 =
-           (ip6_header_t *) vlib_buffer_get_current (to_b);
-         encap_header6->payload_length =
-           clib_host_to_net_u16 (to_b->current_length -
-                                 sizeof (*encap_header6));
-       }
-
-      /* Copy mpls header if present */
-      copy_mpls_hdr (to_b, org_from_b);
-
       rem -= len;
       fo += len;
     }
+
+  return IP_FRAG_ERROR_NONE;
 }
 
 void
@@ -322,19 +274,19 @@ frag_node_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          pi0 = from[0];
          from += 1;
          n_left_from -= 1;
-         error0 = IP_FRAG_ERROR_NONE;
 
          p0 = vlib_get_buffer (vm, pi0);
+         u16 mtu = vnet_buffer (p0)->ip_frag.mtu;
          if (is_ip6)
-           ip6_frag_do_fragment (vm, pi0, &buffer, &error0);
+           error0 = ip6_frag_do_fragment (vm, pi0, mtu, 0, &buffer);
          else
-           ip4_frag_do_fragment (vm, pi0, &buffer, &error0);
+           error0 = ip4_frag_do_fragment (vm, pi0, mtu, 0, &buffer);
 
          if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
            {
              ip_frag_trace_t *tr =
                vlib_add_trace (vm, node, p0, sizeof (*tr));
-             tr->mtu = vnet_buffer (p0)->ip_frag.mtu;
+             tr->mtu = mtu;
              tr->ipv6 = is_ip6 ? 1 : 0;
              tr->n_fragments = vec_len (buffer);
              tr->next = vnet_buffer (p0)->ip_frag.next_index;
@@ -345,20 +297,13 @@ frag_node_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
              icmp4_error_set_vnet_buffer (p0, ICMP4_destination_unreachable,
                                           ICMP4_destination_unreachable_fragmentation_needed_and_dont_fragment_set,
                                           vnet_buffer (p0)->ip_frag.mtu);
-             next0 = IP4_FRAG_NEXT_ICMP_ERROR;
+             next0 = IP_FRAG_NEXT_ICMP_ERROR;
            }
          else
            {
-             if (is_ip6)
-               next0 =
-                 (error0 ==
-                  IP_FRAG_ERROR_NONE) ? vnet_buffer (p0)->
-                 ip_frag.next_index : IP6_FRAG_NEXT_DROP;
-             else
-               next0 =
-                 (error0 ==
-                  IP_FRAG_ERROR_NONE) ? vnet_buffer (p0)->
-                 ip_frag.next_index : IP4_FRAG_NEXT_DROP;
+             next0 = (error0 == IP_FRAG_ERROR_NONE ?
+                      vnet_buffer (p0)->ip_frag.next_index :
+                      IP_FRAG_NEXT_DROP);
            }
 
          if (error0 == IP_FRAG_ERROR_NONE)
@@ -431,18 +376,20 @@ ip6_frag (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 /*
  * Fragments the packet given in from_bi. Fragments are returned in the buffer vector.
  * Caller must ensure the original packet is freed.
+ * from_bi: current pointer must point to IPv6 header
  */
-void
-ip6_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer,
-                     ip_frag_error_t * error)
+ip_frag_error_t
+ip6_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u16 mtu,
+                     u16 l2unfragmentablesize, u32 ** buffer)
 {
   vlib_buffer_t *from_b;
   ip6_header_t *ip6;
-  u16 mtu, len, max, rem, ip_frag_id;
+  u16 len, max, rem, ip_frag_id;
+  u8 *org_from_packet;
 
   from_b = vlib_get_buffer (vm, from_bi);
-  mtu = vnet_buffer (from_b)->ip_frag.mtu;
-  ip6 = (ip6_header_t *) vlib_buffer_get_current (from_b);
+  org_from_packet = vlib_buffer_get_current (from_b);
+  ip6 = vlib_buffer_get_current (from_b) + l2unfragmentablesize;
 
   rem = clib_net_to_host_u16 (ip6->payload_length);
   max = (mtu - sizeof (ip6_header_t) - sizeof (ip6_frag_hdr_t)) & ~0x7;        // TODO: Is max correct??
@@ -450,21 +397,20 @@ ip6_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer,
   if (rem >
       (vlib_buffer_length_in_chain (vm, from_b) - sizeof (ip6_header_t)))
     {
-      *error = IP_FRAG_ERROR_MALFORMED;
-      return;
+      return IP_FRAG_ERROR_MALFORMED;
     }
 
   /* TODO: Look through header chain for fragmentation header */
   if (ip6->protocol == IP_PROTOCOL_IPV6_FRAGMENTATION)
     {
-      *error = IP_FRAG_ERROR_MALFORMED;
-      return;
+      return IP_FRAG_ERROR_MALFORMED;
     }
 
   u8 *from_data = (void *) (ip6 + 1);
   vlib_buffer_t *org_from_b = from_b;
   u16 fo = 0;
-  u16 left_in_from_buffer = from_b->current_length - sizeof (ip6_header_t);
+  u16 left_in_from_buffer =
+    from_b->current_length - (l2unfragmentablesize + sizeof (ip6_header_t));
   u16 ptr = 0;
 
   ip_frag_id = ++running_fragment_id;  // Fix
@@ -485,14 +431,14 @@ ip6_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer,
        len &= ~0x7;
       if ((to_b = frag_buffer_alloc (org_from_b, &to_bi)) == 0)
        {
-         *error = IP_FRAG_ERROR_MEMORY;
-         return;
+         return IP_FRAG_ERROR_MEMORY;
        }
       vec_add1 (*buffer, to_bi);
       frag_set_sw_if_index (to_b, org_from_b);
 
       /* Copy ip6 header */
-      clib_memcpy_fast (to_b->data, ip6, sizeof (ip6_header_t));
+      clib_memcpy_fast (to_b->data, org_from_packet,
+                       l2unfragmentablesize + sizeof (ip6_header_t));
       to_ip6 = vlib_buffer_get_current (to_b);
       to_frag_hdr = (ip6_frag_hdr_t *) (to_ip6 + 1);
       to_data = (void *) (to_frag_hdr + 1);
@@ -530,8 +476,7 @@ ip6_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer,
          /* Move buffer */
          if (!(from_b->flags & VLIB_BUFFER_NEXT_PRESENT))
            {
-             *error = IP_FRAG_ERROR_MALFORMED;
-             return;
+             return IP_FRAG_ERROR_MALFORMED;
            }
          from_b = vlib_get_buffer (vm, from_b->next_buffer);
          from_data = (u8 *) vlib_buffer_get_current (from_b);
@@ -551,12 +496,11 @@ ip6_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer,
       to_frag_hdr->next_hdr = ip6->protocol;
       to_frag_hdr->rsv = 0;
 
-      /* Copy mpls header if present */
-      copy_mpls_hdr (to_b, org_from_b);
-
       rem -= len;
       fo += len;
     }
+
+  return IP_FRAG_ERROR_NONE;
 }
 
 static char *ip4_frag_error_strings[] = {
@@ -576,15 +520,14 @@ VLIB_REGISTER_NODE (ip4_frag_node) = {
   .n_errors = IP_FRAG_N_ERROR,
   .error_strings = ip4_frag_error_strings,
 
-  .n_next_nodes = IP4_FRAG_N_NEXT,
+  .n_next_nodes = IP_FRAG_N_NEXT,
   .next_nodes = {
-    [IP4_FRAG_NEXT_IP4_REWRITE] = "ip4-rewrite",
-    [IP4_FRAG_NEXT_IP4_REWRITE_MIDCHAIN] = "ip4-midchain",
-    [IP4_FRAG_NEXT_IP4_LOOKUP] = "ip4-lookup",
-    [IP4_FRAG_NEXT_IP6_LOOKUP] = "ip6-lookup",
-    [IP4_FRAG_NEXT_MPLS_OUTPUT] = "mpls-output",
-    [IP4_FRAG_NEXT_ICMP_ERROR] = "ip4-icmp-error",
-    [IP4_FRAG_NEXT_DROP] = "ip4-drop"
+    [IP_FRAG_NEXT_IP_REWRITE] = "ip4-rewrite",
+    [IP_FRAG_NEXT_IP_REWRITE_MIDCHAIN] = "ip4-midchain",
+    [IP_FRAG_NEXT_IP4_LOOKUP] = "ip4-lookup",
+    [IP_FRAG_NEXT_IP6_LOOKUP] = "ip6-lookup",
+    [IP_FRAG_NEXT_ICMP_ERROR] = "ip4-icmp-error",
+    [IP_FRAG_NEXT_DROP] = "ip4-drop"
   },
 };
 /* *INDENT-ON* */
@@ -600,14 +543,14 @@ VLIB_REGISTER_NODE (ip6_frag_node) = {
   .n_errors = IP_FRAG_N_ERROR,
   .error_strings = ip4_frag_error_strings,
 
-  .n_next_nodes = IP6_FRAG_N_NEXT,
+  .n_next_nodes = IP_FRAG_N_NEXT,
   .next_nodes = {
-    [IP6_FRAG_NEXT_IP6_REWRITE] = "ip6-rewrite",
-    [IP6_FRAG_NEXT_IP6_REWRITE_MIDCHAIN] = "ip6-midchain",
-    [IP6_FRAG_NEXT_IP4_LOOKUP] = "ip4-lookup",
-    [IP6_FRAG_NEXT_IP6_LOOKUP] = "ip6-lookup",
-    [IP6_FRAG_NEXT_MPLS_OUTPUT] = "mpls-output",
-    [IP6_FRAG_NEXT_DROP] = "ip6-drop"
+    [IP_FRAG_NEXT_IP_REWRITE] = "ip6-rewrite",
+    [IP_FRAG_NEXT_IP_REWRITE_MIDCHAIN] = "ip6-midchain",
+    [IP_FRAG_NEXT_IP4_LOOKUP] = "ip4-lookup",
+    [IP_FRAG_NEXT_IP6_LOOKUP] = "ip6-lookup",
+    [IP_FRAG_NEXT_ICMP_ERROR] = "error-drop",
+    [IP_FRAG_NEXT_DROP] = "ip6-drop"
   },
 };
 /* *INDENT-ON* */
index ce4236b..86462e6 100644 (file)
@@ -39,7 +39,6 @@
 
 #define IP_FRAG_FLAG_IP4_HEADER 0x01   //Encapsulating IPv4 header
 #define IP_FRAG_FLAG_IP6_HEADER 0x02   //Encapsulating IPv6 header
-#define IP_FRAG_FLAG_MPLS_HEADER 0x04  //Encapsulating MPLS header
 
 #define IP4_FRAG_NODE_NAME "ip4-frag"
 #define IP6_FRAG_NODE_NAME "ip6-frag"
@@ -49,26 +48,14 @@ extern vlib_node_registration_t ip6_frag_node;
 
 typedef enum
 {
-  IP4_FRAG_NEXT_IP4_REWRITE,
-  IP4_FRAG_NEXT_IP4_REWRITE_MIDCHAIN,
-  IP4_FRAG_NEXT_IP4_LOOKUP,
-  IP4_FRAG_NEXT_IP6_LOOKUP,
-  IP4_FRAG_NEXT_MPLS_OUTPUT,
-  IP4_FRAG_NEXT_ICMP_ERROR,
-  IP4_FRAG_NEXT_DROP,
-  IP4_FRAG_N_NEXT
-} ip4_frag_next_t;
-
-typedef enum
-{
-  IP6_FRAG_NEXT_IP4_LOOKUP,
-  IP6_FRAG_NEXT_IP6_LOOKUP,
-  IP6_FRAG_NEXT_IP6_REWRITE,
-  IP6_FRAG_NEXT_IP6_REWRITE_MIDCHAIN,
-  IP6_FRAG_NEXT_MPLS_OUTPUT,
-  IP6_FRAG_NEXT_DROP,
-  IP6_FRAG_N_NEXT
-} ip6_frag_next_t;
+  IP_FRAG_NEXT_IP_REWRITE,          /* "ip4-rewrite" / "ip6-rewrite" */
+  IP_FRAG_NEXT_IP_REWRITE_MIDCHAIN, /* "ip4-midchain" / "ip6-midchain" */
+  IP_FRAG_NEXT_IP4_LOOKUP,          /* "ip4-lookup" in both frag nodes */
+  IP_FRAG_NEXT_IP6_LOOKUP,          /* "ip6-lookup" in both frag nodes */
+  IP_FRAG_NEXT_ICMP_ERROR,          /* "ip4-icmp-error"; "error-drop" in ip6-frag */
+  IP_FRAG_NEXT_DROP,                /* "ip4-drop" / "ip6-drop" */
+  IP_FRAG_N_NEXT
+} ip_frag_next_t;
 
 #define foreach_ip_frag_error                          \
   /* Must be first. */                                 \
@@ -91,12 +78,16 @@ typedef enum
 
 void ip_frag_set_vnet_buffer (vlib_buffer_t * b, u16 mtu,
                              u8 next_index, u8 flags);
-void
-ip4_frag_do_fragment (vlib_main_t * vm, u32 pi, u32 ** buffer,
-                     ip_frag_error_t * error);
-void
-ip6_frag_do_fragment (vlib_main_t * vm, u32 pi, u32 ** buffer,
-                     ip_frag_error_t * error);
+
+extern ip_frag_error_t ip4_frag_do_fragment (vlib_main_t * vm,
+                                            u32 from_bi,
+                                            u16 mtu,
+                                            u16 encapsize, u32 ** buffer);
+extern ip_frag_error_t ip6_frag_do_fragment (vlib_main_t * vm,
+                                            u32 from_bi,
+                                            u16 mtu,
+                                            u16 encapsize, u32 ** buffer);
+
 #endif /* ifndef IP_FRAG_H */
 
 /*
index 5ede22a..247f531 100644 (file)
@@ -34,8 +34,7 @@ typedef enum {
 
 #define foreach_mpls_output_next               \
 _(DROP, "error-drop")                           \
-_(IP4_FRAG, "ip4-frag")                         \
-_(IP6_FRAG, "ip6-frag")
+_(FRAG, "mpls-frag")
 
 typedef enum {
 #define _(s,n) MPLS_OUTPUT_NEXT_##s,
@@ -58,31 +57,6 @@ format_mpls_output_trace (u8 * s, va_list * args)
   return s;
 }
 
-/*
- * Save the mpls header length and adjust the current to ip header
- */
-static inline u32
-set_mpls_fragmentation(vlib_buffer_t * p0, ip_adjacency_t * adj0)
-{
-  u32 next0;
-
-  /* advance size of (all) mpls header to ip header before fragmenting */
-  /* save the current pointing to first mpls header. */
-  vnet_buffer (p0)->mpls.mpls_hdr_length = vnet_buffer(p0)->l3_hdr_offset - p0->current_data;
-  vlib_buffer_advance (p0, vnet_buffer (p0)->mpls.mpls_hdr_length);
-
-  /* IP fragmentation */
-  ip_frag_set_vnet_buffer (p0, adj0[0].rewrite_header.max_l3_packet_bytes,
-                           IP4_FRAG_NEXT_MPLS_OUTPUT,
-                           ((vnet_buffer (p0)->mpls.pyld_proto == DPO_PROTO_IP4) ? IP_FRAG_FLAG_IP4_HEADER:IP_FRAG_FLAG_IP6_HEADER));
-
-  /* Tell ip_frag to retain certain mpls parameters after fragmentation of mpls packet */
-  vnet_buffer (p0)->ip_frag.flags = (vnet_buffer (p0)->ip_frag.flags | IP_FRAG_FLAG_MPLS_HEADER);
-  next0 = (vnet_buffer (p0)->mpls.pyld_proto == DPO_PROTO_IP4)? MPLS_OUTPUT_NEXT_IP4_FRAG:MPLS_OUTPUT_NEXT_IP6_FRAG;
-
-  return next0;
-}
-
 static inline uword
 mpls_output_inline (vlib_main_t * vm,
                     vlib_node_runtime_t * node,
@@ -196,7 +170,7 @@ mpls_output_inline (vlib_main_t * vm,
           else
             {
              error0 = IP4_ERROR_MTU_EXCEEDED;
-             next0 = set_mpls_fragmentation (p0, adj0);
+             next0 = MPLS_OUTPUT_NEXT_FRAG;
               vlib_node_increment_counter (vm, mpls_output_node.index,
                                            MPLS_ERROR_PKTS_NEED_FRAG,
                                            1);
@@ -219,7 +193,7 @@ mpls_output_inline (vlib_main_t * vm,
           else
             {
              error1 = IP4_ERROR_MTU_EXCEEDED;
-             next1 = set_mpls_fragmentation (p1, adj1);
+             next1 = MPLS_OUTPUT_NEXT_FRAG;
               vlib_node_increment_counter (vm, mpls_output_node.index,
                                            MPLS_ERROR_PKTS_NEED_FRAG,
                                            1);
@@ -308,7 +282,7 @@ mpls_output_inline (vlib_main_t * vm,
           else
             {
              error0 = IP4_ERROR_MTU_EXCEEDED;
-             next0 = set_mpls_fragmentation (p0, adj0);
+             next0 = MPLS_OUTPUT_NEXT_FRAG;
               vlib_node_increment_counter (vm, mpls_output_node.index,
                                            MPLS_ERROR_PKTS_NEED_FRAG,
                                            1);
@@ -371,11 +345,9 @@ VLIB_REGISTER_NODE (mpls_output_node) = {
 
   .n_next_nodes = MPLS_OUTPUT_N_NEXT,
   .next_nodes = {
-#define _(s,n) [MPLS_OUTPUT_NEXT_##s] = n,
-    foreach_mpls_output_next
-#undef _
-  },
-
+        [MPLS_OUTPUT_NEXT_DROP] = "mpls-drop",
+        [MPLS_OUTPUT_NEXT_FRAG] = "mpls-frag",
+    },
   .format_trace = format_mpls_output_trace,
 };
 
@@ -390,12 +362,184 @@ VLIB_REGISTER_NODE (mpls_midchain_node) = {
   .name = "mpls-midchain",
   .vector_size = sizeof (u32),
 
-  .format_trace = format_mpls_output_trace,
+  .n_errors = MPLS_N_ERROR,
+  .error_strings = mpls_error_strings,
 
   .sibling_of = "mpls-output",
+  .format_trace = format_mpls_output_trace,
 };
 
-/**
+static char *mpls_frag_error_strings[] = { /* mpls-frag error strings */
+#define _(sym,string) string,
+  foreach_ip_frag_error /* expands to one string per ip-frag error */
+#undef _
+};
+
+typedef struct mpls_frag_trace_t_ /* per-packet trace record of mpls-frag */
+{
+    u16 pkt_size; /* vlib_buffer_length_in_chain() of the original packet */
+    u16 mtu;      /* TX adjacency rewrite_header.max_l3_packet_bytes */
+} mpls_frag_trace_t;
+
+typedef enum /* next-node slots of the mpls-frag node */
+{
+    MPLS_FRAG_NEXT_REWRITE,          /* "mpls-output" */
+    MPLS_FRAG_NEXT_REWRITE_MIDCHAIN, /* "mpls-midchain" */
+    MPLS_FRAG_NEXT_ICMP_ERROR,       /* "ip4-icmp-error" */
+    MPLS_FRAG_NEXT_DROP,             /* "mpls-drop" */
+    MPLS_FRAG_N_NEXT,                /* number of slots; must stay last */
+} mpls_frag_next_t;
+
+/* mpls-frag: IP-fragment each packet's payload to the TX adjacency's
+ * max_l3_packet_bytes, passing the MPLS stack size as the unfragmentable
+ * prefix. Returns the number of packets in the input frame. */
+static uword
+mpls_frag (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
+{
+    u32 n_left_from, next_index, * from, * to_next, n_left_to_next, *frags;
+
+    from = vlib_frame_vector_args (frame);
+    n_left_from = frame->n_vectors;
+    next_index = node->cached_next_index;
+    frags = NULL;
+
+    while (n_left_from > 0)
+    {
+        vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+        while (n_left_from > 0 && n_left_to_next > 0)
+        {
+            ip_adjacency_t * adj0;
+            vlib_buffer_t * p0;
+            mpls_frag_next_t next0;
+            u32 pi0, adj_index0;
+            ip_frag_error_t error0 = IP_FRAG_ERROR_NONE;
+            i16 encap_size;
+            u16 mtu;
+            u8 is_ip4;
+
+            pi0 = to_next[0] = from[0];
+            p0 = vlib_get_buffer (vm, pi0);
+            from += 1;
+            n_left_from -= 1;
+            is_ip4 = vnet_buffer (p0)->mpls.pyld_proto == DPO_PROTO_IP4;
+
+            adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
+            adj0 = adj_get(adj_index0);
+            mtu = adj0->rewrite_header.max_l3_packet_bytes;
+
+            /* the size of the MPLS stack */
+            encap_size = vnet_buffer(p0)->l3_hdr_offset - p0->current_data;
+
+            /* IP fragmentation */
+            if (is_ip4)
+                error0 = ip4_frag_do_fragment (vm, pi0, mtu,
+                                               encap_size, &frags);
+            else
+                error0 = ip6_frag_do_fragment (vm, pi0, mtu,
+                                               encap_size, &frags);
+
+            if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
+            {
+                mpls_frag_trace_t *tr =
+                    vlib_add_trace (vm, node, p0, sizeof (*tr));
+                tr->mtu = mtu;
+                tr->pkt_size = vlib_buffer_length_in_chain(vm, p0);
+            }
+
+            if (PREDICT_TRUE(error0 == IP_FRAG_ERROR_NONE))
+            {
+                /* Fragments carry the data now; free the original packet */
+                vlib_buffer_free_one (vm, pi0);
+                next0 = (IP_LOOKUP_NEXT_MIDCHAIN == adj0->lookup_next_index ?
+                         MPLS_FRAG_NEXT_REWRITE_MIDCHAIN :
+                         MPLS_FRAG_NEXT_REWRITE);
+            }
+            else if (is_ip4 && error0 == IP_FRAG_ERROR_DONT_FRAGMENT_SET)
+            {
+                /* Expose the IP header to ICMP and report the MTU used
+                 * above; ip_frag.mtu is never written on this path. */
+                vlib_buffer_advance (p0, encap_size);
+                icmp4_error_set_vnet_buffer (
+                    p0, ICMP4_destination_unreachable,
+                    ICMP4_destination_unreachable_fragmentation_needed_and_dont_fragment_set,
+                    mtu);
+                vec_add1 (frags, pi0); /* enqueue the original to ICMP */
+                next0 = MPLS_FRAG_NEXT_ICMP_ERROR;
+            }
+            else
+            {
+                /* Count against this node, not a next-frame slot index */
+                vlib_error_count (vm, node->node_index, error0, 1);
+                vec_add1 (frags, pi0); /* Get rid of the original buffer */
+                next0 = MPLS_FRAG_NEXT_DROP;
+            }
+
+            /* Send fragments that were added in the frame */
+            u32 *frag_from = frags, frag_left = vec_len (frags);
+
+            while (frag_left > 0)
+            {
+                while (frag_left > 0 && n_left_to_next > 0)
+                {
+                    u32 i = to_next[0] = frag_from[0];
+                    frag_from += 1;
+                    frag_left -= 1;
+                    to_next += 1;
+                    n_left_to_next -= 1;
+
+                    p0 = vlib_get_buffer (vm, i);
+                    /* error0 is an ip_frag_error_t: use this node's table */
+                    p0->error = node->errors[error0];
+
+                    vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+                                                     to_next, n_left_to_next, i,
+                                                     next0);
+                }
+                vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+                vlib_get_next_frame (vm, node, next_index, to_next,
+                                     n_left_to_next);
+            }
+            vec_reset_length (frags);
+        }
+        vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+    vec_free (frags);
+
+    return frame->n_vectors;
+}
+
+/* Trace formatter for mpls-frag: appends the recorded MTU and packet size. */
+static u8 *
+format_mpls_frag_trace (u8 * s, va_list * args)
+{
+    CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+    CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+    mpls_frag_trace_t *trace = va_arg (*args, mpls_frag_trace_t *);
+
+    return format (s, "mtu:%d pkt-size:%d", trace->mtu, trace->pkt_size);
+}
+
+VLIB_REGISTER_NODE (mpls_frag_node) = { /* registration of "mpls-frag" */
+    .function = mpls_frag,
+    .name = "mpls-frag",
+    .vector_size = sizeof (u32),
+    .format_trace = format_mpls_frag_trace,
+    .type = VLIB_NODE_TYPE_INTERNAL,
+
+    .n_errors = IP_FRAG_N_ERROR,
+    .error_strings = mpls_frag_error_strings, /* from foreach_ip_frag_error */
+
+    .n_next_nodes = MPLS_FRAG_N_NEXT,
+    .next_nodes = {
+        [MPLS_FRAG_NEXT_REWRITE] = "mpls-output", /* fragmented, non-midchain adj */
+        [MPLS_FRAG_NEXT_REWRITE_MIDCHAIN] = "mpls-midchain", /* midchain adj */
+        [MPLS_FRAG_NEXT_ICMP_ERROR] = "ip4-icmp-error", /* IPv4 with DF set */
+        [MPLS_FRAG_NEXT_DROP] = "mpls-drop" /* any other fragmentation error */
+    },
+};
+
+/*
  * @brief Next index values from the MPLS incomplete adj node
  */
 #define foreach_mpls_adj_incomplete_next               \
index ebeea5f..8ed047d 100644 (file)
@@ -154,7 +154,8 @@ class TestMPLS(VppTestCase):
             pkts.append(p)
         return pkts
 
-    def create_stream_ip4(self, src_if, dst_ip, ip_ttl=64, ip_dscp=0):
+    def create_stream_ip4(self, src_if, dst_ip, ip_ttl=64,
+                          ip_dscp=0, payload_size=None):
         self.reset_packet_infos()
         pkts = []
         for i in range(0, 257):
@@ -166,6 +167,8 @@ class TestMPLS(VppTestCase):
                  UDP(sport=1234, dport=1234) /
                  Raw(payload))
             info.data = p.copy()
+            if payload_size:
+                self.extend_packet(p, payload_size)
             pkts.append(p)
         return pkts
 
@@ -911,7 +914,7 @@ class TestMPLS(VppTestCase):
         """ MPLS Tunnel Tests - Pipe """
 
         #
-        # Create a tunnel with a single out label
+        # Create a tunnel with two out labels
         #
         mpls_tun = VppMPLSTunnelInterface(
             self,
@@ -964,6 +967,38 @@ class TestMPLS(VppTestCase):
                                           VppMplsLabel(46),
                                           VppMplsLabel(33, ttl=255)])
 
+        #
+        # change tunnel's MTU to a low value
+        #
+        mpls_tun.set_l3_mtu(1200)
+
+        # send IP into the tunnel to be fragmented
+        tx = self.create_stream_ip4(self.pg0, "10.0.0.3",
+                                    payload_size=1500)
+        rx = self.send_and_expect(self.pg0, tx, self.pg0, len(tx)*2)
+
+        fake_tx = []
+        for p in tx:
+            fake_tx.append(p)
+            fake_tx.append(p)
+        self.verify_capture_tunneled_ip4(self.pg0, rx, fake_tx,
+                                         [VppMplsLabel(44),
+                                          VppMplsLabel(46)])
+
+        # send MPLS into the tunnel to be fragmented
+        tx = self.create_stream_ip4(self.pg0, "10.0.0.4",
+                                    payload_size=1500)
+        rx = self.send_and_expect(self.pg0, tx, self.pg0, len(tx)*2)
+
+        fake_tx = []
+        for p in tx:
+            fake_tx.append(p)
+            fake_tx.append(p)
+        self.verify_capture_tunneled_ip4(self.pg0, rx, fake_tx,
+                                         [VppMplsLabel(44),
+                                          VppMplsLabel(46),
+                                          VppMplsLabel(33, ttl=255)])
+
     def test_tunnel_uniform(self):
         """ MPLS Tunnel Tests - Uniform """
 
index 431a03a..a5f6f45 100644 (file)
@@ -495,3 +495,15 @@ class VppInterface(object):
     def get_tx_stats(self):
         c = self.test.statistics.get_counter("^/if/tx$")
         return c[0][self.sw_if_index]
+
+    def set_l3_mtu(self, mtu):  # mtu goes in slot 0 of the 4-entry list
+        self.test.vapi.sw_interface_set_mtu(self.sw_if_index, [mtu, 0, 0, 0])
+
+    def set_ip4_mtu(self, mtu):  # mtu goes in slot 1 of the 4-entry list
+        self.test.vapi.sw_interface_set_mtu(self.sw_if_index, [0, mtu, 0, 0])
+
+    def set_ip6_mtu(self, mtu):  # mtu goes in slot 2 of the 4-entry list
+        self.test.vapi.sw_interface_set_mtu(self.sw_if_index, [0, 0, mtu, 0])
+
+    def set_mpls_mtu(self, mtu):  # mtu goes in slot 3 of the 4-entry list
+        self.test.vapi.sw_interface_set_mtu(self.sw_if_index, [0, 0, 0, mtu])