Merge "Avoid crash in vhost-user driver when running multithreaded"
author Damjan Marion <damarion@cisco.com>
Thu, 14 Jan 2016 18:48:26 +0000 (18:48 +0000)
committer Gerrit Code Review <gerrit@fd.io>
Thu, 14 Jan 2016 18:48:26 +0000 (18:48 +0000)
13 files changed:
.gitreview
vnet/Makefile.am
vnet/vnet/ip/icmp4.c
vnet/vnet/ip/icmp4.h [new file with mode: 0644]
vnet/vnet/ip/icmp6.c
vnet/vnet/ip/ip.h
vnet/vnet/ip/ip4_input.c
vnet/vnet/ip/ip6_input.c
vnet/vnet/ip/ip_frag.c
vnet/vnet/ip/ip_frag.h
vnet/vnet/map/ip4_map.c
vnet/vnet/map/map.c
vnet/vnet/map/map.h

index 0efefe1..1db08df 100644 (file)
@@ -1,4 +1,4 @@
 [gerrit]
-host=gerrit.projectrotterdam.info
+host=gerrit.fd.io
 port=29418
 project=vpp
index 1bde1e8..a0c20e9 100644 (file)
@@ -252,6 +252,7 @@ libvnet_la_SOURCES +=                               \
 nobase_include_HEADERS +=                      \
  vnet/ip/format.h                              \
  vnet/ip/icmp46_packet.h                       \
+ vnet/ip/icmp4.h                               \
  vnet/ip/icmp6.h                               \
  vnet/ip/igmp_packet.h                         \
  vnet/ip/ip.h                                  \
index 4ee8f15..abad5bd 100644 (file)
 #include <vnet/ip/ip.h>
 #include <vnet/pg/pg.h>
 
+/* Description strings: the string half of every foreach_icmp4_error entry, in list order. */
+static char * icmp_error_strings[] = {
+#define _(f,s) s,
+  foreach_icmp4_error
+#undef _
+};
+
 static u8 * format_ip4_icmp_type_and_code (u8 * s, va_list * args)
 {
   icmp4_type_t type = va_arg (*args, int);
@@ -96,10 +103,6 @@ static u8 * format_ip4_icmp_header (u8 * s, va_list * args)
   return s;
 }
 
-typedef struct {
-  u8 packet_data[64];
-} icmp_input_trace_t;
-
 static u8 * format_icmp_input_trace (u8 * s, va_list * va)
 {
   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
@@ -113,20 +116,6 @@ static u8 * format_icmp_input_trace (u8 * s, va_list * va)
   return s;
 }
 
-typedef enum {
-  ICMP4_ERROR_UNKNOWN_TYPE,
-  ICMP4_ERROR_ECHO_REPLIES_SENT,
-  ICMP4_ERROR_TTL_EXPIRE_RESP_SENT,
-  ICMP4_ERROR_TTL_EXPIRE_RESP_DROP,
-} icmp_error_t;
-
-static char * icmp_error_strings[] = {
-  [ICMP4_ERROR_UNKNOWN_TYPE] = "unknown type",
-  [ICMP4_ERROR_ECHO_REPLIES_SENT] = "echo replies sent",
-  [ICMP4_ERROR_TTL_EXPIRE_RESP_SENT] = "TTL time exceeded response sent",
-  [ICMP4_ERROR_TTL_EXPIRE_RESP_DROP] = "TTL time exceeded response dropped",
-};
-
 typedef enum {
   ICMP_INPUT_NEXT_ERROR,
   ICMP_INPUT_N_NEXT,
@@ -418,19 +407,42 @@ VLIB_REGISTER_NODE (ip4_icmp_echo_request_node,static) = {
 };
 
 typedef enum {
-  ICMP4_TTL_EXPIRE_NEXT_DROP,
-  ICMP4_TTL_EXPIRE_NEXT_LOOKUP,
-  ICMP4_TTL_EXPIRE_N_NEXT,
-} icmp_ttl_expire_next_t;
+  IP4_ICMP_ERROR_NEXT_DROP,
+  IP4_ICMP_ERROR_NEXT_LOOKUP,
+  IP4_ICMP_ERROR_N_NEXT,
+} ip4_icmp_error_next_t;
+/* Stash the ICMPv4 type, code and data word in b's ip.icmp metadata; ip4_icmp_error reads them back. */
+void
+icmp4_error_set_vnet_buffer (vlib_buffer_t *b, u8 type, u8 code, u32 data)
+{
+  vnet_buffer(b)->ip.icmp.type = type;
+  vnet_buffer(b)->ip.icmp.code = code;
+  vnet_buffer(b)->ip.icmp.data = data; /* host order; ip4_icmp_error applies clib_host_to_net_u32 */
+}
+
+/* Translate an ICMPv4 message type into the counter bumped for that generated message. */
+static u8
+icmp4_icmp_type_to_error (u8 type)
+{
+  if (type == ICMP4_destination_unreachable)
+    return ICMP4_ERROR_DEST_UNREACH_SENT;
+  if (type == ICMP4_time_exceeded)
+    return ICMP4_ERROR_TTL_EXPIRE_SENT;
+  if (type == ICMP4_parameter_problem)
+    return ICMP4_ERROR_PARAM_PROBLEM_SENT;
+
+  /* Every other type is accounted as a dropped error message. */
+  return ICMP4_ERROR_DROP;
+}
 
 static uword
-ip4_icmp_ttl_expire (vlib_main_t * vm,
-                     vlib_node_runtime_t * node,
-                     vlib_frame_t * frame)
+ip4_icmp_error (vlib_main_t * vm,
+               vlib_node_runtime_t * node,
+               vlib_frame_t * frame)
 {
   u32 * from, * to_next;
   uword n_left_from, n_left_to_next;
-  icmp_ttl_expire_next_t next_index;
+  ip4_icmp_error_next_t next_index;
   ip4_main_t *im = &ip4_main;
   ip_lookup_main_t * lm = &im->lookup_main;
 
@@ -442,117 +454,113 @@ ip4_icmp_ttl_expire (vlib_main_t * vm,
     vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
                                   /* stride */ 1, sizeof (icmp_input_trace_t));
 
-  while (n_left_from > 0)
-    {
-      vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
-
-      while (n_left_from > 0 && n_left_to_next > 0)
-        {
-          u32 pi0 = from[0];
-          u32 next0 = ICMP4_TTL_EXPIRE_NEXT_LOOKUP;
-          u8 error0 = ICMP4_ERROR_TTL_EXPIRE_RESP_SENT;
-          u32 len0, new_len0;
-          vlib_buffer_t * p0;
-          ip4_header_t * ip0, * out_ip0;
-          icmp46_header_t * icmp0;
-          ip_csum_t sum;
-          u32 sw_if_index0, if_add_index0; 
-
-          /* Speculatively enqueue p0 to the current next frame */
-          to_next[0] = pi0;
-          from += 1;
-          to_next += 1;
-          n_left_from -= 1;
-          n_left_to_next -= 1;
-
-          p0 = vlib_get_buffer(vm, pi0);
-          ip0 = vlib_buffer_get_current(p0);
-          len0 = vlib_buffer_length_in_chain (vm, p0);
-          sw_if_index0 = vnet_buffer(p0)->sw_if_index[VLIB_RX];
-
-          /* Cut payload to just IP header plus first 8 bytes */
-          new_len0 = (ip0->ip_version_and_header_length &0xf)*4 + 8;
-          if (len0 > new_len0)
-            {
-              p0->current_length = new_len0; /* should fit in 1st buffer */
-              if (PREDICT_FALSE(p0->total_length_not_including_first_buffer))
-                { /* clear current_length of all other buffers in chain */
-                  vlib_buffer_t *b = p0;
-                  p0->total_length_not_including_first_buffer = 0;
-                  while (b->flags & VLIB_BUFFER_NEXT_PRESENT)
-                    {
-                      b = vlib_get_buffer (vm, b->next_buffer);
-                      b->current_length = 0;
-                    }                  
-                }
-            }
+  while (n_left_from > 0) {
+    vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
 
-          /* Add IP header and ICMP header including a 4 byte unused field */
-          vlib_buffer_advance(p0, 
-                              -sizeof(ip4_header_t)-sizeof(icmp46_header_t)-4);
-          out_ip0 = vlib_buffer_get_current(p0);
-          icmp0 = (icmp46_header_t *) &out_ip0[1];
-
-          /* Fill ip header fields */
-          out_ip0->ip_version_and_header_length = 0x45;
-          out_ip0->tos = 0;
-          out_ip0->length = clib_host_to_net_u16(p0->current_length);
-          out_ip0->fragment_id = 0;
-          out_ip0->ttl = 0xff;
-          out_ip0->protocol = IP_PROTOCOL_ICMP;
-          out_ip0->dst_address = ip0->src_address;
-          if_add_index0 = 
-              lm->if_address_pool_index_by_sw_if_index[sw_if_index0];
-          if (PREDICT_TRUE(if_add_index0 != ~0)) 
-            {
-              ip_interface_address_t *if_add = 
-                  pool_elt_at_index(lm->if_address_pool, if_add_index0);
-              ip4_address_t *if_ip = 
-                  ip_interface_address_get_address(lm, if_add);
-              out_ip0->src_address = *if_ip;
-              vlib_error_count (vm, node->node_index, error0, 1);
-            } 
-          else   /* interface has no IP4 address - should not happen */
-            {
-              next0 = ICMP4_TTL_EXPIRE_NEXT_DROP;
-              error0 = ICMP4_ERROR_TTL_EXPIRE_RESP_DROP;
-            }
-          out_ip0->checksum = ip4_header_checksum(out_ip0);
-
-          /* Fill icmp header fields */
-          icmp0->type = ICMP4_time_exceeded;
-          icmp0->code = ICMP4_time_exceeded_ttl_exceeded_in_transit;
-          icmp0->checksum = 0;
-          sum = ip_incremental_checksum(
-              0, icmp0, p0->current_length - sizeof(ip4_header_t));
-          icmp0->checksum = ~ip_csum_fold(sum);
-
-          /* Update error status */
-          p0->error = node->errors[error0];
-
-          /* Verify speculative enqueue, maybe switch current next frame */
-          vlib_validate_buffer_enqueue_x1(vm, node, next_index,
-                                          to_next, n_left_to_next,
-                                          pi0, next0);
-        }
-      vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+    while (n_left_from > 0 && n_left_to_next > 0) {
+      u32 pi0 = from[0];
+      u32 next0 = IP4_ICMP_ERROR_NEXT_LOOKUP;
+      u8 error0 = ICMP4_ERROR_NONE;
+      vlib_buffer_t * p0;
+      ip4_header_t * ip0, * out_ip0;
+      icmp46_header_t * icmp0;
+      u32 sw_if_index0, if_add_index0;
+      ip_csum_t sum;
+
+      /* Speculatively enqueue p0 to the current next frame */
+      to_next[0] = pi0;
+      from += 1;
+      to_next += 1;
+      n_left_from -= 1;
+      n_left_to_next -= 1;
+
+      p0 = vlib_get_buffer(vm, pi0);
+      ip0 = vlib_buffer_get_current(p0);
+      sw_if_index0 = vnet_buffer(p0)->sw_if_index[VLIB_RX];
+
+      /*
+       * RFC1812 says to keep as much of the original packet as
+       * possible within the minimum MTU (576). We cheat "a little"
+       * here by keeping whatever fits in the first buffer, to be more
+       * efficient
+       */
+      if (PREDICT_FALSE(p0->total_length_not_including_first_buffer)) {
+       /* clear current_length of all other buffers in chain */
+       vlib_buffer_t *b = p0;
+       p0->total_length_not_including_first_buffer = 0;
+       while (b->flags & VLIB_BUFFER_NEXT_PRESENT) {
+         b = vlib_get_buffer (vm, b->next_buffer);
+         b->current_length = 0;
+       }                  
+      }
+      p0->current_length = p0->current_length > 576 ? 576 : p0->current_length;
+
+      /* Add IP header and ICMPv4 header including a 4 byte data field */
+      vlib_buffer_advance(p0, 
+                         -sizeof(ip4_header_t)-sizeof(icmp46_header_t)-4);
+      out_ip0 = vlib_buffer_get_current(p0);
+      icmp0 = (icmp46_header_t *) &out_ip0[1];
+
+      /* Fill ip header fields */
+      out_ip0->ip_version_and_header_length = 0x45;
+      out_ip0->tos = 0;
+      out_ip0->length = clib_host_to_net_u16(p0->current_length);
+      out_ip0->fragment_id = 0;
+      out_ip0->flags_and_fragment_offset = 0;
+      out_ip0->ttl = 0xff;
+      out_ip0->protocol = IP_PROTOCOL_ICMP;
+      out_ip0->dst_address = ip0->src_address;
+      if_add_index0 = 
+       lm->if_address_pool_index_by_sw_if_index[sw_if_index0];
+      if (PREDICT_TRUE(if_add_index0 != ~0)) {
+       ip_interface_address_t *if_add = 
+         pool_elt_at_index(lm->if_address_pool, if_add_index0);
+       ip4_address_t *if_ip = 
+         ip_interface_address_get_address(lm, if_add);
+       out_ip0->src_address = *if_ip;
+      } else {
+       /* interface has no IP4 address - should not happen */
+       next0 = IP4_ICMP_ERROR_NEXT_DROP;
+       error0 = ICMP4_ERROR_DROP;
+      }
+      out_ip0->checksum = ip4_header_checksum(out_ip0);
+
+      /* Fill icmp header fields */
+      icmp0->type = vnet_buffer(p0)->ip.icmp.type;
+      icmp0->code = vnet_buffer(p0)->ip.icmp.code;
+      *((u32 *)(icmp0 + 1)) = clib_host_to_net_u32(vnet_buffer(p0)->ip.icmp.data);
+      icmp0->checksum = 0;
+      sum = ip_incremental_checksum(0, icmp0, p0->current_length - sizeof(ip4_header_t));
+      icmp0->checksum = ~ip_csum_fold(sum);
+
+      /* Update error status */
+      if (error0 == ICMP4_ERROR_NONE)
+       error0 = icmp4_icmp_type_to_error(icmp0->type);
+      vlib_error_count(vm, node->node_index, error0, 1);
+
+      /* Verify speculative enqueue, maybe switch current next frame */
+      vlib_validate_buffer_enqueue_x1(vm, node, next_index,
+                                     to_next, n_left_to_next,
+                                     pi0, next0);
     }
+    vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+  }
 
   return frame->n_vectors;
 }
 
-VLIB_REGISTER_NODE (ip4_icmp_ttl_expire_node) = {
-  .function = ip4_icmp_ttl_expire,
-  .name = "ip4-icmp-ttl-expire",
+VLIB_REGISTER_NODE (ip4_icmp_error_node) = {
+  .function = ip4_icmp_error,
+  .name = "ip4-icmp-error",
   .vector_size = sizeof (u32),
 
   .n_errors = ARRAY_LEN (icmp_error_strings),
   .error_strings = icmp_error_strings,
 
-  .n_next_nodes = ICMP4_TTL_EXPIRE_N_NEXT,
+  .n_next_nodes = IP4_ICMP_ERROR_N_NEXT,
   .next_nodes = {
-    [ICMP4_TTL_EXPIRE_NEXT_DROP] = "error-drop",
-    [ICMP4_TTL_EXPIRE_NEXT_LOOKUP] = "ip4-lookup",
+    [IP4_ICMP_ERROR_NEXT_DROP] = "error-drop",
+    [IP4_ICMP_ERROR_NEXT_LOOKUP] = "ip4-lookup",
   },
 
   .format_trace = format_icmp_input_trace,
diff --git a/vnet/vnet/ip/icmp4.h b/vnet/vnet/ip/icmp4.h
new file mode 100644 (file)
index 0000000..f99bf2d
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vnet_icmp4_h
+#define included_vnet_icmp4_h
+
+/* ICMPv4 counters as _(SYMBOL, "description"); each user defines _ to pick the half it needs. */
+#define foreach_icmp4_error                                             \
+  _ (NONE, "valid packets")                                             \
+  _ (UNKNOWN_TYPE, "unknown type")                                      \
+  _ (INVALID_CODE_FOR_TYPE, "invalid code for type")                    \
+  _ (INVALID_HOP_LIMIT_FOR_TYPE, "hop_limit != 255")                    \
+  _ (LENGTH_TOO_SMALL_FOR_TYPE, "payload length too small for type")    \
+  _ (OPTIONS_WITH_ODD_LENGTH, "total option length not multiple of 8 bytes") \
+  _ (OPTION_WITH_ZERO_LENGTH, "option has zero length")                 \
+  _ (ECHO_REPLIES_SENT, "echo replies sent")                            \
+  _ (DST_LOOKUP_MISS, "icmp6 dst address lookup misses")                \
+  _ (DEST_UNREACH_SENT, "destination unreachable response sent")       \
+  _ (TTL_EXPIRE_SENT, "hop limit exceeded response sent")              \
+  _ (PARAM_PROBLEM_SENT, "parameter problem response sent")            \
+  _ (DROP, "error message dropped")
+
+typedef enum { /* one ICMP4_ERROR_<SYMBOL> per foreach_icmp4_error entry, in list order */
+#define _(f,s) ICMP4_ERROR_##f,
+  foreach_icmp4_error
+#undef _
+} icmp4_error_t;
+
+typedef struct { /* trace record; ip4_icmp_error passes its size to vlib_trace_frame_buffers_only */
+  u8 packet_data[64]; /* presumably the leading bytes of the traced packet - confirm */
+} icmp_input_trace_t;
+/* NOTE(review): icmp4.c's visible formatter is the static format_icmp_input_trace; confirm format_icmp4_input_trace is defined. */
+format_function_t format_icmp4_input_trace;
+void ip4_icmp_register_type (vlib_main_t * vm, icmp4_type_t type, u32 node_index);
+void icmp4_error_set_vnet_buffer (vlib_buffer_t *b, u8 type, u8 code, u32 data);
+
+#endif /* included_vnet_icmp4_h */
index c5eb0f6..e6022ad 100644 (file)
@@ -571,6 +571,7 @@ ip6_icmp_error (vlib_main_t * vm,
                   b->current_length = 0;
                 }                  
             }
+         p0->current_length = p0->current_length > 1280 ? 1280 : p0->current_length;
 
           /* Add IP header and ICMPv6 header including a 4 byte data field */
           vlib_buffer_advance(p0, 
@@ -581,8 +582,8 @@ ip6_icmp_error (vlib_main_t * vm,
           /* Fill ip header fields */
           out_ip0->ip_version_traffic_class_and_flow_label = 
               clib_host_to_net_u32(0x6<<28);
-         u16 plen = p0->current_length > 1280 ? 1280 : p0->current_length;
-          out_ip0->payload_length = clib_host_to_net_u16(plen - sizeof(ip6_header_t));
+
+          out_ip0->payload_length = clib_host_to_net_u16(p0->current_length - sizeof(ip6_header_t));
           out_ip0->protocol = IP_PROTOCOL_ICMP6;
           out_ip0->hop_limit = 0xff;
           out_ip0->dst_address = ip0->src_address;
index e47512a..a0b4ea6 100644 (file)
@@ -56,6 +56,7 @@
 #include <vnet/ip/ip4.h>
 #include <vnet/ip/ip4_error.h>
 #include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/icmp4.h>
 
 #include <vnet/ip/ip6.h>
 #include <vnet/ip/ip6_packet.h>
index 68edc0f..f31df0f 100644 (file)
@@ -64,7 +64,7 @@ typedef enum {
   IP4_INPUT_NEXT_PUNT,
   IP4_INPUT_NEXT_LOOKUP,
   IP4_INPUT_NEXT_LOOKUP_MULTICAST,
-  IP4_INPUT_NEXT_TTL_EXPIRE,
+  IP4_INPUT_NEXT_ICMP_ERROR,
   IP4_INPUT_N_NEXT,
 } ip4_input_next_t;
 
@@ -220,19 +220,21 @@ ip4_input_inline (vlib_main_t * vm,
 
       if (PREDICT_FALSE(error0 != IP4_ERROR_NONE))
         {
-          next0 = (error0 != IP4_ERROR_OPTIONS
-                   ? (error0 == IP4_ERROR_TIME_EXPIRED
-                      ? IP4_INPUT_NEXT_TTL_EXPIRE
-                      : IP4_INPUT_NEXT_DROP)
-                   : IP4_INPUT_NEXT_PUNT);
+         if (error0 == IP4_ERROR_TIME_EXPIRED) {
+           icmp4_error_set_vnet_buffer(p0, ICMP4_time_exceeded,
+                                       ICMP4_time_exceeded_ttl_exceeded_in_transit, 0);
+           next0 = IP4_INPUT_NEXT_ICMP_ERROR;
+         } else
+           next0 = error0 != IP4_ERROR_OPTIONS ? IP4_INPUT_NEXT_DROP : IP4_INPUT_NEXT_PUNT;
         }
       if (PREDICT_FALSE(error1 != IP4_ERROR_NONE))
         {
-          next1 = (error1 != IP4_ERROR_OPTIONS
-                   ? (error1 == IP4_ERROR_TIME_EXPIRED
-                      ? IP4_INPUT_NEXT_TTL_EXPIRE
-                      : IP4_INPUT_NEXT_DROP)
-                   : IP4_INPUT_NEXT_PUNT);
+         if (error1 == IP4_ERROR_TIME_EXPIRED) {
+           icmp4_error_set_vnet_buffer(p1, ICMP4_time_exceeded,
+                                       ICMP4_time_exceeded_ttl_exceeded_in_transit, 0);
+           next1 = IP4_INPUT_NEXT_ICMP_ERROR;
+         } else
+           next1 = error1 != IP4_ERROR_OPTIONS ? IP4_INPUT_NEXT_DROP : IP4_INPUT_NEXT_PUNT;
         }
 
          vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
@@ -307,11 +309,12 @@ ip4_input_inline (vlib_main_t * vm,
          p0->error = error_node->errors[error0];
       if (PREDICT_FALSE(error0 != IP4_ERROR_NONE))
         {
-          next0 = (error0 != IP4_ERROR_OPTIONS
-                   ? (error0 == IP4_ERROR_TIME_EXPIRED
-                      ? IP4_INPUT_NEXT_TTL_EXPIRE
-                      : IP4_INPUT_NEXT_DROP)
-                   : IP4_INPUT_NEXT_PUNT);
+         if (error0 == IP4_ERROR_TIME_EXPIRED) {
+           icmp4_error_set_vnet_buffer(p0, ICMP4_time_exceeded,
+                                       ICMP4_time_exceeded_ttl_exceeded_in_transit, 0);
+           next0 = IP4_INPUT_NEXT_ICMP_ERROR;
+         } else
+           next0 = error0 != IP4_ERROR_OPTIONS ? IP4_INPUT_NEXT_DROP : IP4_INPUT_NEXT_PUNT;
         }
 
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
@@ -361,7 +364,7 @@ VLIB_REGISTER_NODE (ip4_input_node) = {
     [IP4_INPUT_NEXT_PUNT] = "error-punt",
     [IP4_INPUT_NEXT_LOOKUP] = "ip4-lookup",
     [IP4_INPUT_NEXT_LOOKUP_MULTICAST] = "ip4-lookup-multicast",
-    [IP4_INPUT_NEXT_TTL_EXPIRE] = "ip4-icmp-ttl-expire",
+    [IP4_INPUT_NEXT_ICMP_ERROR] = "ip4-icmp-error",
   },
 
   .format_buffer = format_ip4_header,
@@ -379,7 +382,7 @@ VLIB_REGISTER_NODE (ip4_input_no_checksum_node,static) = {
     [IP4_INPUT_NEXT_PUNT] = "error-punt",
     [IP4_INPUT_NEXT_LOOKUP] = "ip4-lookup",
     [IP4_INPUT_NEXT_LOOKUP_MULTICAST] = "ip4-lookup-multicast",
-    [IP4_INPUT_NEXT_TTL_EXPIRE] = "ip4-icmp-ttl-expire",
+    [IP4_INPUT_NEXT_ICMP_ERROR] = "ip4-icmp-error",
   },
 
   .format_buffer = format_ip4_header,
index 473b2b2..f96a1cf 100644 (file)
@@ -62,7 +62,7 @@ static u8 * format_ip6_input_trace (u8 * s, va_list * va)
 typedef enum {
   IP6_INPUT_NEXT_DROP,
   IP6_INPUT_NEXT_LOOKUP,
-  IP6_INPUT_NEXT_ICMP,
+  IP6_INPUT_NEXT_ICMP_ERROR,
   IP6_INPUT_N_NEXT,
 } ip6_input_next_t;
 
@@ -189,7 +189,7 @@ ip6_input (vlib_main_t * vm,
          if (error0 == IP6_ERROR_TIME_EXPIRED) {
            icmp6_error_set_vnet_buffer(p0, ICMP6_time_exceeded,
                                          ICMP6_time_exceeded_ttl_exceeded_in_transit, 0);
-           next0 = IP6_INPUT_NEXT_ICMP;
+           next0 = IP6_INPUT_NEXT_ICMP_ERROR;
          } else {
            next0 = IP6_INPUT_NEXT_DROP;
          }
@@ -199,7 +199,7 @@ ip6_input (vlib_main_t * vm,
          if (error1 == IP6_ERROR_TIME_EXPIRED) {
            icmp6_error_set_vnet_buffer(p1, ICMP6_time_exceeded,
                                          ICMP6_time_exceeded_ttl_exceeded_in_transit, 0);
-           next1 = IP6_INPUT_NEXT_ICMP;
+           next1 = IP6_INPUT_NEXT_ICMP_ERROR;
          } else {
            next1 = IP6_INPUT_NEXT_DROP;
          }
@@ -262,7 +262,7 @@ ip6_input (vlib_main_t * vm,
          if (error0 == IP6_ERROR_TIME_EXPIRED) {
            icmp6_error_set_vnet_buffer(p0, ICMP6_time_exceeded,
                                          ICMP6_time_exceeded_ttl_exceeded_in_transit, 0);
-           next0 = IP6_INPUT_NEXT_ICMP;
+           next0 = IP6_INPUT_NEXT_ICMP_ERROR;
          } else {
            next0 = IP6_INPUT_NEXT_DROP;
          }
@@ -298,7 +298,7 @@ VLIB_REGISTER_NODE (ip6_input_node) = {
   .next_nodes = {
     [IP6_INPUT_NEXT_DROP] = "error-drop",
     [IP6_INPUT_NEXT_LOOKUP] = "ip6-lookup",
-    [IP6_INPUT_NEXT_ICMP] = "ip6-icmp-error",
+    [IP6_INPUT_NEXT_ICMP_ERROR] = "ip6-icmp-error",
   },
 
   .format_buffer = format_ip6_header,
index 2217618..3436090 100644 (file)
@@ -37,9 +37,8 @@ static u8 * format_ip_frag_trace (u8 * s, va_list * args)
   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
   ip_frag_trace_t * t = va_arg (*args, ip_frag_trace_t *);
-  s = format(s, "IPv%s offset: %u mtu: %u fragments: %u next: %s",
-             t->ipv6?"6":"4",
-             t->header_offset, t->mtu, t->n_fragments, node->next_node_names[t->next]);
+  s = format(s, "IPv%s offset: %u mtu: %u fragments: %u",
+             t->ipv6?"6":"4", t->header_offset, t->mtu, t->n_fragments);
   return s;
 }
 
@@ -146,6 +145,14 @@ ip4_frag_do_fragment(vlib_main_t *vm, u32 pi, u32 **buffer, ip_frag_error_t *err
   }
 }
 
+void
+ip_frag_set_vnet_buffer (vlib_buffer_t *b, u16 offset, u16 mtu, u8 next_index, u8 flags)
+{ /* Record the fragmentation parameters in b's ip_frag metadata. */
+  vnet_buffer(b)->ip_frag.header_offset = offset; /* ip4_frag advances b by this before raising an ICMP error */
+  vnet_buffer(b)->ip_frag.mtu = mtu; /* ip4_frag also reports it as the ICMP error's data word */
+  vnet_buffer(b)->ip_frag.next_index = next_index; /* ip4_frag's next node when fragmenting succeeds */
+  vnet_buffer(b)->ip_frag.flags = flags;
+}
 
 static uword
 ip4_frag (vlib_main_t *vm,
@@ -189,13 +196,25 @@ ip4_frag (vlib_main_t *vm,
         tr->next = vnet_buffer(p0)->ip_frag.next_index;
       }
 
-      next0 = (error0 == IP_FRAG_ERROR_NONE) ? vnet_buffer(p0)->ip_frag.next_index : IP4_FRAG_NEXT_DROP;
-      frag_sent += vec_len(buffer);
-      small_packets += (vec_len(buffer) == 1);
+      if (error0 == IP_FRAG_ERROR_DONT_FRAGMENT_SET) {
+       icmp4_error_set_vnet_buffer(p0, ICMP4_destination_unreachable,
+                                   ICMP4_destination_unreachable_fragmentation_needed_and_dont_fragment_set,
+                                   vnet_buffer(p0)->ip_frag.mtu);
+       vlib_buffer_advance(p0, vnet_buffer(p0)->ip_frag.header_offset);
+       next0 = IP4_FRAG_NEXT_ICMP_ERROR;
+      } else
+       next0 = (error0 == IP_FRAG_ERROR_NONE) ? vnet_buffer(p0)->ip_frag.next_index : IP4_FRAG_NEXT_DROP;
+
+      if (error0 == IP_FRAG_ERROR_NONE) {
+       frag_sent += vec_len(buffer);
+       small_packets += (vec_len(buffer) == 1);
+      } else
+       vlib_error_count(vm, ip4_frag_node.index, error0, 1);
 
       //Send fragments that were added in the frame
       frag_from = buffer;
       frag_left = vec_len(buffer);
+
       while (frag_left > 0) {
         while (frag_left > 0 && n_left_to_next > 0) {
           u32 i;
@@ -218,6 +237,7 @@ ip4_frag (vlib_main_t *vm,
     vlib_put_next_frame(vm, node, next_index, n_left_to_next);
   }
   vec_free(buffer);
+
   vlib_node_increment_counter(vm, ip4_frag_node.index, IP_FRAG_ERROR_FRAGMENT_SENT, frag_sent);
   vlib_node_increment_counter(vm, ip4_frag_node.index, IP_FRAG_ERROR_SMALL_PACKET, small_packets);
 
@@ -426,6 +446,7 @@ VLIB_REGISTER_NODE (ip4_frag_node) = {
   .next_nodes = {
     [IP4_FRAG_NEXT_IP4_LOOKUP] = "ip4-lookup",
     [IP4_FRAG_NEXT_IP6_LOOKUP] = "ip6-lookup",
+    [IP4_FRAG_NEXT_ICMP_ERROR] = "ip4-icmp-error",
     [IP4_FRAG_NEXT_DROP] = "error-drop"
   },
 };
index 0456690..7623865 100644 (file)
@@ -49,6 +49,7 @@ vlib_node_registration_t ip6_frag_node;
 typedef enum {
   IP4_FRAG_NEXT_IP4_LOOKUP,
   IP4_FRAG_NEXT_IP6_LOOKUP,
+  IP4_FRAG_NEXT_ICMP_ERROR,
   IP4_FRAG_NEXT_DROP,
   IP4_FRAG_N_NEXT
 } ip4_frag_next_t;
@@ -65,8 +66,8 @@ typedef enum {
  _(NONE, "packet fragmented")                          \
  _(SMALL_PACKET, "packet smaller than MTU")             \
  _(FRAGMENT_SENT, "number of sent fragments")           \
- _(CANT_FRAGMENT_HEADER, "can't fragment header'")      \
- _(DONT_FRAGMENT_SET, "can't fragment this packet'")    \
+ _(CANT_FRAGMENT_HEADER, "can't fragment header")      \
+ _(DONT_FRAGMENT_SET, "can't fragment this packet")    \
  _(MALFORMED, "malformed packet")                       \
  _(MEMORY, "could not allocate buffer")                 \
  _(UNKNOWN, "unknown error")
@@ -78,4 +79,6 @@ typedef enum {
    IP_FRAG_N_ERROR,
  } ip_frag_error_t;
 
+void ip_frag_set_vnet_buffer(vlib_buffer_t *b, u16 offset, u16 mtu, u8 next_index, u8 flags);
+
 #endif /* ifndef IP_FRAG_H */
index 343b57d..7b9b3ed 100644 (file)
@@ -27,8 +27,10 @@ enum ip4_map_next_e {
 #ifdef MAP_SKIP_IP6_LOOKUP
   IP4_MAP_NEXT_IP6_REWRITE,
 #endif
-  IP4_MAP_NEXT_FRAGMENT,
+  IP4_MAP_NEXT_IP4_FRAGMENT,
+  IP4_MAP_NEXT_IP6_FRAGMENT,
   IP4_MAP_NEXT_REASS,
+  IP4_MAP_NEXT_ICMP_ERROR,
   IP4_MAP_NEXT_DROP,
   IP4_MAP_N_NEXT,
 };
@@ -177,6 +179,27 @@ ip4_map_decrement_ttl (ip4_header_t *ip, u8 *error)
   ASSERT (ip->checksum == ip4_header_checksum(ip));
 }
 
+/* Choose the next node, and set its per-buffer parameters, for a packet over the domain MTU. */
+static u32
+ip4_map_fragment (vlib_buffer_t *b, u16 mtu, bool df, u8 *error)
+{
+  map_main_t *mm = &map_main;
+
+  if (mm->frag_inner) { /* inner mode: ip4-frag, with the IPv6 header length as header offset */
+    ip_frag_set_vnet_buffer(b, sizeof(ip6_header_t), mtu, IP4_FRAG_NEXT_IP6_LOOKUP, IP_FRAG_FLAG_IP6_HEADER);
+    return IP4_MAP_NEXT_IP4_FRAGMENT;
+  }
+  if (df && !mm->frag_ignore_df) { /* DF honoured: request an ICMP error carrying mtu as its data */
+    icmp4_error_set_vnet_buffer(b, ICMP4_destination_unreachable,
+                               ICMP4_destination_unreachable_fragmentation_needed_and_dont_fragment_set, mtu);
+    vlib_buffer_advance(b, sizeof(ip6_header_t)); /* step past the IPv6 header before the ICMP node */
+    *error = MAP_ERROR_DF_SET;
+    return IP4_MAP_NEXT_ICMP_ERROR;
+  }
+  ip_frag_set_vnet_buffer(b, 0, mtu, IP6_FRAG_NEXT_IP6_LOOKUP, IP_FRAG_FLAG_IP6_HEADER);
+  return IP4_MAP_NEXT_IP6_FRAGMENT; /* outer mode: ip6-frag */
+}
+
 /*
  * ip4_map
  */
@@ -247,6 +270,12 @@ ip4_map (vlib_main_t *vm,
       port0 = ip4_map_port_and_security_check(d0, ip40, &next0, &error0);
       port1 = ip4_map_port_and_security_check(d1, ip41, &next1, &error1);
 
+      /* Decrement IPv4 TTL */
+      ip4_map_decrement_ttl(ip40, &error0);
+      ip4_map_decrement_ttl(ip41, &error1);
+      bool df0 = ip40->flags_and_fragment_offset & clib_host_to_net_u16(IP4_HEADER_FLAG_DONT_FRAGMENT);
+      bool df1 = ip41->flags_and_fragment_offset & clib_host_to_net_u16(IP4_HEADER_FLAG_DONT_FRAGMENT);
+
       /* MAP calc */
       u32 da40 = clib_net_to_host_u32(ip40->dst_address.as_u32);
       u32 da41 = clib_net_to_host_u32(ip41->dst_address.as_u32);
@@ -288,11 +317,7 @@ ip4_map (vlib_main_t *vm,
        */
       if (PREDICT_TRUE(error0 == MAP_ERROR_NONE)) {
        if (PREDICT_FALSE(d0->mtu && (clib_net_to_host_u16(ip6h0->payload_length) + sizeof(*ip6h0) > d0->mtu))) {
-         vnet_buffer(p0)->ip_frag.header_offset = sizeof(*ip6h0);
-         vnet_buffer(p0)->ip_frag.next_index = IP4_FRAG_NEXT_IP6_LOOKUP;
-         vnet_buffer(p0)->ip_frag.mtu = d0->mtu;
-         vnet_buffer(p0)->ip_frag.flags = IP_FRAG_FLAG_IP6_HEADER;
-         next0 = IP4_MAP_NEXT_FRAGMENT;
+         next0 = ip4_map_fragment(p0, d0->mtu, df0, &error0);
        } else {
          next0 = ip4_map_ip6_lookup_bypass(p0, ip40) ? IP4_MAP_NEXT_IP6_REWRITE : next0;
          vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_TX, cpu_index, map_domain_index0, 1,
@@ -308,11 +333,7 @@ ip4_map (vlib_main_t *vm,
        */
       if (PREDICT_TRUE(error1 == MAP_ERROR_NONE)) {
        if (PREDICT_FALSE(d1->mtu && (clib_net_to_host_u16(ip6h1->payload_length) + sizeof(*ip6h1) > d1->mtu))) {
-         vnet_buffer(p1)->ip_frag.header_offset = sizeof(*ip6h1);
-         vnet_buffer(p1)->ip_frag.next_index = IP4_FRAG_NEXT_IP6_LOOKUP;
-         vnet_buffer(p1)->ip_frag.mtu = d1->mtu;
-         vnet_buffer(p1)->ip_frag.flags = IP_FRAG_FLAG_IP6_HEADER;
-         next1 = IP4_MAP_NEXT_FRAGMENT;
+         next1 = ip4_map_fragment(p1, d1->mtu, df1, &error1);
        } else {
          next1 = ip4_map_ip6_lookup_bypass(p1, ip41) ? IP4_MAP_NEXT_IP6_REWRITE : next1;
          vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_TX, cpu_index, map_domain_index1, 1,
@@ -369,6 +390,7 @@ ip4_map (vlib_main_t *vm,
 
       /* Decrement IPv4 TTL */
       ip4_map_decrement_ttl(ip40, &error0);
+      bool df0 = ip40->flags_and_fragment_offset & clib_host_to_net_u16(IP4_HEADER_FLAG_DONT_FRAGMENT);
 
       /* MAP calc */
       u32 da40 = clib_net_to_host_u32(ip40->dst_address.as_u32);
@@ -396,11 +418,7 @@ ip4_map (vlib_main_t *vm,
        */
       if (PREDICT_TRUE(error0 == MAP_ERROR_NONE)) {
        if (PREDICT_FALSE(d0->mtu && (clib_net_to_host_u16(ip6h0->payload_length) + sizeof(*ip6h0) > d0->mtu))) {
-         vnet_buffer(p0)->ip_frag.header_offset = sizeof(*ip6h0);
-         vnet_buffer(p0)->ip_frag.next_index = IP4_FRAG_NEXT_IP6_LOOKUP;
-         vnet_buffer(p0)->ip_frag.mtu = d0->mtu;
-         vnet_buffer(p0)->ip_frag.flags = IP_FRAG_FLAG_IP6_HEADER;
-         next0 = IP4_MAP_NEXT_FRAGMENT;
+         next0 = ip4_map_fragment(p0, d0->mtu, df0, &error0);
        } else {
          next0 = ip4_map_ip6_lookup_bypass(p0, ip40) ? IP4_MAP_NEXT_IP6_REWRITE : next0;
          vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_TX, cpu_index, map_domain_index0, 1,
@@ -591,8 +609,10 @@ VLIB_REGISTER_NODE(ip4_map_node) = {
 #ifdef MAP_SKIP_IP6_LOOKUP
     [IP4_MAP_NEXT_IP6_REWRITE] = "ip6-rewrite",
 #endif
-    [IP4_MAP_NEXT_FRAGMENT] = "ip4-frag",
+    [IP4_MAP_NEXT_IP4_FRAGMENT] = "ip4-frag",
+    [IP4_MAP_NEXT_IP6_FRAGMENT] = "ip6-frag",
     [IP4_MAP_NEXT_REASS] = "ip4-map-reass",
+    [IP4_MAP_NEXT_ICMP_ERROR] = "ip4-icmp-error",
     [IP4_MAP_NEXT_DROP] = "error-drop",
   },
 };
index eb5496b..a63122b 100644 (file)
@@ -652,6 +652,58 @@ map_icmp_unreachables_command_fn (vlib_main_t *vm,
   return 0;
 }
 
+/* CLI handler: "inner"/"outer" set/clear map_main.frag_inner; returns 0 or a parse error. */
+static clib_error_t *
+map_fragment_command_fn (vlib_main_t *vm,
+                        unformat_input_t *input,
+                        vlib_cli_command_t *cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  map_main_t *mm = &map_main;
+  clib_error_t *error = 0;
+
+  if (!unformat_user(input, unformat_line_input, line_input))
+    return 0; /* no line was read, so there is nothing to free */
+  while (!error && unformat_check_input(line_input) != UNFORMAT_END_OF_INPUT) {
+    if (unformat(line_input, "inner"))
+      mm->frag_inner = true;
+    else if (unformat(line_input, "outer"))
+      mm->frag_inner = false;
+    else /* report against the line being parsed; the loop then ends and the line is freed */
+      error = clib_error_return(0, "unknown input `%U'",
+                                format_unformat_error, line_input);
+  }
+  unformat_free(line_input); /* reached on both the success and the error path */
+  return error;
+}
+
+/* CLI handler: "on"/"off" set/clear map_main.frag_ignore_df; returns 0 or a parse error. */
+static clib_error_t *
+map_fragment_df_command_fn (vlib_main_t *vm,
+                           unformat_input_t *input,
+                           vlib_cli_command_t *cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  map_main_t *mm = &map_main;
+  clib_error_t *error = 0;
+
+  if (!unformat_user(input, unformat_line_input, line_input))
+    return 0; /* no line was read, so there is nothing to free */
+  while (!error && unformat_check_input(line_input) != UNFORMAT_END_OF_INPUT) {
+    if (unformat(line_input, "on"))
+      mm->frag_ignore_df = true;
+    else if (unformat(line_input, "off"))
+      mm->frag_ignore_df = false;
+    else /* report against the line being parsed; the loop then ends and the line is freed */
+      error = clib_error_return(0, "unknown input `%U'",
+                                format_unformat_error, line_input);
+  }
+  unformat_free(line_input); /* reached on both the success and the error path */
+  return error;
+}
+
 static clib_error_t *
 map_traffic_class_command_fn (vlib_main_t *vm,
                              unformat_input_t *input,
@@ -869,6 +921,8 @@ show_map_stats_command_fn (vlib_main_t *vm, unformat_input_t *input, vlib_cli_co
 
   vlib_cli_output(vm, "ICMP-relay IPv4 source address: %U\n", format_ip4_address, &mm->icmp4_src_address);
   vlib_cli_output(vm, "ICMP6 unreachables sent for unmatched packets: %s\n", mm->icmp6_enabled ? "enabled" : "disabled");
+  vlib_cli_output(vm, "Inner fragmentation: %s\n", mm->frag_inner ? "enabled" : "disabled");
+  vlib_cli_output(vm, "Fragment packets regardless of DF flag: %s\n", mm->frag_ignore_df ? "enabled" : "disabled");
 
   /*
    * Counters
@@ -1563,11 +1617,23 @@ VLIB_CLI_COMMAND(map_icmp_relay_source_address_command, static) = {
 };
 
 VLIB_CLI_COMMAND(map_icmp_unreachables_command, static) = {
-  .path = "map params icmp unreachables",
+  .path = "map params icmp6 unreachables",     /* CLI path; NOTE(review): short_help below repeats "unreachables", unlike the sibling on|off commands - confirm intended */
   .short_help = "unreachables {on|off}",
   .function = map_icmp_unreachables_command_fn,
 };
 
+VLIB_CLI_COMMAND(map_fragment_command, static) = {     /* registers the inner/outer fragmentation command */
+  .path = "map params fragment",                        /* CLI path served by this entry */
+  .short_help = "[inner|outer] [ignore-df [on|off]]",   /* also advertises ignore-df, which is registered separately under "map params fragment ignore-df" */
+  .function = map_fragment_command_fn,                  /* handler itself only parses "inner" and "outer" */
+};
+
+VLIB_CLI_COMMAND(map_fragment_df_command, static) = {  /* registers the ignore-DF toggle command */
+  .path = "map params fragment ignore-df",              /* CLI path served by this entry */
+  .short_help = "on|off",                               /* the two keywords the handler accepts */
+  .function = map_fragment_df_command_fn,               /* handler sets map_main.frag_ignore_df */
+};
+
 VLIB_CLI_COMMAND(map_security_check_frag_command, static) = {
   .path = "map params security-check fragments",
   .short_help = 
@@ -1639,6 +1705,10 @@ clib_error_t *map_init (vlib_main_t *vm)
   /* ICMP6 Type 1, Code 5 for security check failure */
   mm->icmp6_enabled = false;
 
+  /* Inner or outer fragmentation */
+  mm->frag_inner = false;
+  mm->frag_ignore_df = false;
+
   vec_validate(mm->domain_counters, MAP_N_DOMAIN_COUNTER - 1);
   mm->domain_counters[MAP_DOMAIN_COUNTER_RX].name = "rx";
   mm->domain_counters[MAP_DOMAIN_COUNTER_TX].name = "tx";
index 6d12b71..d38d7f4 100644 (file)
@@ -190,9 +190,6 @@ typedef struct {
   vlib_combined_counter_main_t *domain_counters;
   volatile u32 *counter_lock;
 
-  /* Global counters */
-  vlib_simple_counter_main_t icmp_relayed;
-
 #ifdef MAP_SKIP_IP6_LOOKUP
   /* pre-presolve */
   u32 adj6_index, adj4_index;
@@ -203,12 +200,14 @@ typedef struct {
   /* Traffic class: zero, copy (~0) or fixed value */
   u8 tc;
   bool tc_copy;
-  bool sec_check;
-  bool sec_check_frag;
-  bool icmp6_enabled;
+
+  bool sec_check;              /* Inbound security check */
+  bool sec_check_frag;         /* Inbound security check for (subsequent) fragments */
+  bool icmp6_enabled;          /* Send destination unreachable for security check failure */
 
   /* ICMPv6 -> ICMPv4 relay parameters */
   ip4_address_t icmp4_src_address;
+  vlib_simple_counter_main_t icmp_relayed;
 
   /* convenience */
   vlib_main_t *vlib_main;
@@ -217,13 +216,13 @@ typedef struct {
   /*
    * IPv4 encap and decap reassembly
    */
-  //Conf
+  /* Configuration */
   f32 ip4_reass_conf_ht_ratio; //Size of ht is 2^ceil(log2(ratio*pool_size))
   u16 ip4_reass_conf_pool_size; //Max number of allocated reass structures
   u16 ip4_reass_conf_lifetime_ms; //Time a reassembly struct is considered valid in ms
   u32 ip4_reass_conf_buffers; //Maximum number of buffers used by ip4 reassembly
 
-  //Runtime
+  /* Runtime */
   map_ip4_reass_t *ip4_reass_pool;
   u8 ip4_reass_ht_log2len; //Hash table size is 2^log2len
   u16 ip4_reass_allocated;
@@ -231,19 +230,22 @@ typedef struct {
   u16 ip4_reass_fifo_last;
   volatile u32 *ip4_reass_lock;
 
-  //Counters
+  /* Counters */
   u32 ip4_reass_buffered_counter;
 
+  bool frag_inner;             /* Inner or outer fragmentation */
+  bool frag_ignore_df;         /* Fragment (outer) packet even if DF is set */
+
   /*
    * IPv6 decap reassembly
    */
-  //Conf
+  /* Configuration */
   f32 ip6_reass_conf_ht_ratio; //Size of ht is 2^ceil(log2(ratio*pool_size))
   u16 ip6_reass_conf_pool_size; //Max number of allocated reass structures
   u16 ip6_reass_conf_lifetime_ms; //Time a reassembly struct is considered valid in ms
   u32 ip6_reass_conf_buffers; //Maximum number of buffers used by ip6 reassembly
 
-  //Runtime
+  /* Runtime */
   map_ip6_reass_t *ip6_reass_pool;
   u8 ip6_reass_ht_log2len; //Hash table size is 2^log2len
   u16 ip6_reass_allocated;
@@ -251,19 +253,18 @@ typedef struct {
   u16 ip6_reass_fifo_last;
   volatile u32 *ip6_reass_lock;
 
-  //Counters
+  /* Counters */
   u32 ip6_reass_buffered_counter;
 
 } map_main_t;
 
 /*
- * TODO: Remove SEC_CHECK / TRANSLATED_4TO6 / TRANSLATED_6TO4
+ * MAP Error counters/messages
  */
 #define foreach_map_error                              \
   /* Must be first. */                                 \
  _(NONE, "valid MAP packets")                          \
  _(BAD_PROTOCOL, "bad protocol")                       \
- _(WRONG_ICMP_TYPE, "wrong icmp type")                 \
  _(SEC_CHECK, "security check failed")                 \
  _(ENCAP_SEC_CHECK, "encap security check failed")     \
  _(DECAP_SEC_CHECK, "decap security check failed")     \
@@ -277,7 +278,7 @@ typedef struct {
  _(FRAGMENT_MALFORMED, "fragment has unexpected format")\
  _(FRAGMENT_DROPPED, "dropped cached fragment")         \
  _(MALFORMED, "malformed packet")                      \
- _(IP4_ERROR_TIME_EXPIRED, "time expired")
+ _(DF_SET, "can't fragment, DF set")
 
 typedef enum {
 #define _(sym,str) MAP_ERROR_##sym,