ip: rate-limit the sending of ICMP error messages
[vpp.git] / src / vnet / ip / icmp4.c
index 3626e96..857c3b1 100644 (file)
@@ -40,7 +40,8 @@
 #include <vlib/vlib.h>
 #include <vnet/ip/ip.h>
 #include <vnet/pg/pg.h>
-
+#include <vnet/ip/ip_sas.h>
+#include <vnet/util/throttle.h>
 
 static char *icmp_error_strings[] = {
 #define _(f,s) s,
@@ -48,6 +49,9 @@ static char *icmp_error_strings[] = {
 #undef _
 };
 
+/** ICMP throttling */
+static throttle_t icmp_throttle;
+
 static u8 *
 format_ip4_icmp_type_and_code (u8 * s, va_list * args)
 {
@@ -102,6 +106,12 @@ format_ip4_icmp_header (u8 * s, va_list * args)
              format_ip4_icmp_type_and_code, icmp->type, icmp->code,
              clib_net_to_host_u16 (icmp->checksum));
 
+  if ((ICMP4_echo_request == icmp->type || ICMP4_echo_reply == icmp->type)
+      && sizeof (icmp[0]) + sizeof (u16) < max_header_bytes)
+    {
+      s = format (s, " id %u", clib_net_to_host_u16 (*(u16 *) (icmp + 1)));
+    }
+
   return s;
 }
 
@@ -171,7 +181,7 @@ ip4_icmp_input (vlib_main_t * vm,
              vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
              p0 = vlib_get_buffer (vm, from[1]);
              ip0 = vlib_buffer_get_current (p0);
-             CLIB_PREFETCH (ip0, CLIB_CACHE_LINE_BYTES, LOAD);
+             clib_prefetch_load (ip0);
            }
 
          bi0 = to_next[0] = from[0];
@@ -188,15 +198,10 @@ ip4_icmp_input (vlib_main_t * vm,
          next0 = im->ip4_input_next_index_by_type[type0];
 
          p0->error = node->errors[ICMP4_ERROR_UNKNOWN_TYPE];
-         if (PREDICT_FALSE (next0 != next))
-           {
-             vlib_put_next_frame (vm, node, next, n_left_to_next + 1);
-             next = next0;
-             vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
-             to_next[0] = bi0;
-             to_next += 1;
-             n_left_to_next -= 1;
-           }
+
+         /* Verify speculative enqueue, maybe switch current next frame */
+         vlib_validate_buffer_enqueue_x1 (vm, node, next, to_next,
+                                          n_left_to_next, bi0, next0);
        }
 
       vlib_put_next_frame (vm, node, next, n_left_to_next);
@@ -206,7 +211,7 @@ ip4_icmp_input (vlib_main_t * vm,
 }
 
 /* *INDENT-OFF* */
-VLIB_REGISTER_NODE (ip4_icmp_input_node,static) = {
+VLIB_REGISTER_NODE (ip4_icmp_input_node) = {
   .function = ip4_icmp_input,
   .name = "ip4-icmp-input",
 
@@ -224,202 +229,6 @@ VLIB_REGISTER_NODE (ip4_icmp_input_node,static) = {
 };
 /* *INDENT-ON* */
 
-static uword
-ip4_icmp_echo_request (vlib_main_t * vm,
-                      vlib_node_runtime_t * node, vlib_frame_t * frame)
-{
-  uword n_packets = frame->n_vectors;
-  u32 *from, *to_next;
-  u32 n_left_from, n_left_to_next, next;
-  ip4_main_t *i4m = &ip4_main;
-  u16 *fragment_ids, *fid;
-  u8 host_config_ttl = i4m->host_config.ttl;
-
-  from = vlib_frame_vector_args (frame);
-  n_left_from = n_packets;
-  next = node->cached_next_index;
-
-  if (node->flags & VLIB_NODE_FLAG_TRACE)
-    vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
-                                  /* stride */ 1,
-                                  sizeof (icmp_input_trace_t));
-
-  /* Get random fragment IDs for replies. */
-  fid = fragment_ids = clib_random_buffer_get_data (&vm->random_buffer,
-                                                   n_packets *
-                                                   sizeof (fragment_ids[0]));
-
-  while (n_left_from > 0)
-    {
-      vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
-
-      while (n_left_from > 2 && n_left_to_next > 2)
-       {
-         vlib_buffer_t *p0, *p1;
-         ip4_header_t *ip0, *ip1;
-         icmp46_header_t *icmp0, *icmp1;
-         u32 bi0, src0, dst0;
-         u32 bi1, src1, dst1;
-         ip_csum_t sum0, sum1;
-
-         bi0 = to_next[0] = from[0];
-         bi1 = to_next[1] = from[1];
-
-         from += 2;
-         n_left_from -= 2;
-         to_next += 2;
-         n_left_to_next -= 2;
-
-         p0 = vlib_get_buffer (vm, bi0);
-         p1 = vlib_get_buffer (vm, bi1);
-         ip0 = vlib_buffer_get_current (p0);
-         ip1 = vlib_buffer_get_current (p1);
-         icmp0 = ip4_next_header (ip0);
-         icmp1 = ip4_next_header (ip1);
-
-         vnet_buffer (p0)->sw_if_index[VLIB_RX] =
-           vnet_main.local_interface_sw_if_index;
-         vnet_buffer (p1)->sw_if_index[VLIB_RX] =
-           vnet_main.local_interface_sw_if_index;
-
-         /* Update ICMP checksum. */
-         sum0 = icmp0->checksum;
-         sum1 = icmp1->checksum;
-
-         ASSERT (icmp0->type == ICMP4_echo_request);
-         ASSERT (icmp1->type == ICMP4_echo_request);
-         sum0 = ip_csum_update (sum0, ICMP4_echo_request, ICMP4_echo_reply,
-                                icmp46_header_t, type);
-         sum1 = ip_csum_update (sum1, ICMP4_echo_request, ICMP4_echo_reply,
-                                icmp46_header_t, type);
-         icmp0->type = ICMP4_echo_reply;
-         icmp1->type = ICMP4_echo_reply;
-
-         icmp0->checksum = ip_csum_fold (sum0);
-         icmp1->checksum = ip_csum_fold (sum1);
-
-         src0 = ip0->src_address.data_u32;
-         src1 = ip1->src_address.data_u32;
-         dst0 = ip0->dst_address.data_u32;
-         dst1 = ip1->dst_address.data_u32;
-
-         /* Swap source and destination address.
-            Does not change checksum. */
-         ip0->src_address.data_u32 = dst0;
-         ip1->src_address.data_u32 = dst1;
-         ip0->dst_address.data_u32 = src0;
-         ip1->dst_address.data_u32 = src1;
-
-         /* Update IP checksum. */
-         sum0 = ip0->checksum;
-         sum1 = ip1->checksum;
-
-         sum0 = ip_csum_update (sum0, ip0->ttl, host_config_ttl,
-                                ip4_header_t, ttl);
-         sum1 = ip_csum_update (sum1, ip1->ttl, host_config_ttl,
-                                ip4_header_t, ttl);
-         ip0->ttl = host_config_ttl;
-         ip1->ttl = host_config_ttl;
-
-         /* New fragment id. */
-         sum0 = ip_csum_update (sum0, ip0->fragment_id, fid[0],
-                                ip4_header_t, fragment_id);
-         sum1 = ip_csum_update (sum1, ip1->fragment_id, fid[1],
-                                ip4_header_t, fragment_id);
-         ip0->fragment_id = fid[0];
-         ip1->fragment_id = fid[1];
-         fid += 2;
-
-         ip0->checksum = ip_csum_fold (sum0);
-         ip1->checksum = ip_csum_fold (sum1);
-
-         ASSERT (ip0->checksum == ip4_header_checksum (ip0));
-         ASSERT (ip1->checksum == ip4_header_checksum (ip1));
-
-         p0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
-         p1->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
-       }
-
-      while (n_left_from > 0 && n_left_to_next > 0)
-       {
-         vlib_buffer_t *p0;
-         ip4_header_t *ip0;
-         icmp46_header_t *icmp0;
-         u32 bi0, src0, dst0;
-         ip_csum_t sum0;
-
-         bi0 = to_next[0] = from[0];
-
-         from += 1;
-         n_left_from -= 1;
-         to_next += 1;
-         n_left_to_next -= 1;
-
-         p0 = vlib_get_buffer (vm, bi0);
-         ip0 = vlib_buffer_get_current (p0);
-         icmp0 = ip4_next_header (ip0);
-
-         vnet_buffer (p0)->sw_if_index[VLIB_RX] =
-           vnet_main.local_interface_sw_if_index;
-
-         /* Update ICMP checksum. */
-         sum0 = icmp0->checksum;
-
-         ASSERT (icmp0->type == ICMP4_echo_request);
-         sum0 = ip_csum_update (sum0, ICMP4_echo_request, ICMP4_echo_reply,
-                                icmp46_header_t, type);
-         icmp0->type = ICMP4_echo_reply;
-         icmp0->checksum = ip_csum_fold (sum0);
-
-         src0 = ip0->src_address.data_u32;
-         dst0 = ip0->dst_address.data_u32;
-         ip0->src_address.data_u32 = dst0;
-         ip0->dst_address.data_u32 = src0;
-
-         /* Update IP checksum. */
-         sum0 = ip0->checksum;
-
-         sum0 = ip_csum_update (sum0, ip0->ttl, host_config_ttl,
-                                ip4_header_t, ttl);
-         ip0->ttl = host_config_ttl;
-
-         sum0 = ip_csum_update (sum0, ip0->fragment_id, fid[0],
-                                ip4_header_t, fragment_id);
-         ip0->fragment_id = fid[0];
-         fid += 1;
-
-         ip0->checksum = ip_csum_fold (sum0);
-
-         ASSERT (ip0->checksum == ip4_header_checksum (ip0));
-
-         p0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
-       }
-
-      vlib_put_next_frame (vm, node, next, n_left_to_next);
-    }
-
-  vlib_error_count (vm, ip4_icmp_input_node.index,
-                   ICMP4_ERROR_ECHO_REPLIES_SENT, frame->n_vectors);
-
-  return frame->n_vectors;
-}
-
-/* *INDENT-OFF* */
-VLIB_REGISTER_NODE (ip4_icmp_echo_request_node,static) = {
-  .function = ip4_icmp_echo_request,
-  .name = "ip4-icmp-echo-request",
-
-  .vector_size = sizeof (u32),
-
-  .format_trace = format_icmp_input_trace,
-
-  .n_next_nodes = 1,
-  .next_nodes = {
-    [0] = "ip4-load-balance",
-  },
-};
-/* *INDENT-ON* */
-
 typedef enum
 {
   IP4_ICMP_ERROR_NEXT_DROP,
@@ -427,14 +236,6 @@ typedef enum
   IP4_ICMP_ERROR_N_NEXT,
 } ip4_icmp_error_next_t;
 
-void
-icmp4_error_set_vnet_buffer (vlib_buffer_t * b, u8 type, u8 code, u32 data)
-{
-  vnet_buffer (b)->ip.icmp.type = type;
-  vnet_buffer (b)->ip.icmp.code = code;
-  vnet_buffer (b)->ip.icmp.data = data;
-}
-
 static u8
 icmp4_icmp_type_to_error (u8 type)
 {
@@ -458,13 +259,14 @@ ip4_icmp_error (vlib_main_t * vm,
   u32 *from, *to_next;
   uword n_left_from, n_left_to_next;
   ip4_icmp_error_next_t next_index;
-  ip4_main_t *im = &ip4_main;
-  ip_lookup_main_t *lm = &im->lookup_main;
+  u32 thread_index = vm->thread_index;
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
   next_index = node->cached_next_index;
 
+  u64 seed = throttle_seed (&icmp_throttle, thread_index, vlib_time_now (vm));
+
   if (node->flags & VLIB_NODE_FLAG_TRACE)
     vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
                                   /* stride */ 1,
@@ -476,15 +278,43 @@ ip4_icmp_error (vlib_main_t * vm,
 
       while (n_left_from > 0 && n_left_to_next > 0)
        {
-         u32 pi0 = from[0];
+         /*
+          * Duplicate first buffer and free the original chain.  Keep
+          * as much of the original packet as possible, within the
+          * minimum MTU. We cheat "a little" here by keeping whatever
+          * is available in the first buffer.
+          */
+
+         u32 pi0 = ~0;
+         u32 org_pi0 = from[0];
          u32 next0 = IP4_ICMP_ERROR_NEXT_LOOKUP;
          u8 error0 = ICMP4_ERROR_NONE;
-         vlib_buffer_t *p0;
+         vlib_buffer_t *p0, *org_p0;
          ip4_header_t *ip0, *out_ip0;
          icmp46_header_t *icmp0;
-         u32 sw_if_index0, if_add_index0;
+         u32 sw_if_index0;
          ip_csum_t sum;
 
+         org_p0 = vlib_get_buffer (vm, org_pi0);
+         ip0 = vlib_buffer_get_current (org_p0);
+
+         /* Rate limit based on the src,dst addresses in the original packet
+          */
+         u64 r0 =
+           (u64) ip0->dst_address.as_u32 << 32 | ip0->src_address.as_u32;
+
+         if (throttle_check (&icmp_throttle, thread_index, r0, seed))
+           {
+             vlib_error_count (vm, node->node_index, ICMP4_ERROR_DROP, 1);
+             from += 1;
+             n_left_from -= 1;
+             continue;
+           }
+
+         p0 = vlib_buffer_copy_no_chain (vm, org_p0, &pi0);
+         if (!p0 || pi0 == ~0) /* Out of buffers */
+           continue;
+
          /* Speculatively enqueue p0 to the current next frame */
          to_next[0] = pi0;
          from += 1;
@@ -492,27 +322,9 @@ ip4_icmp_error (vlib_main_t * vm,
          n_left_from -= 1;
          n_left_to_next -= 1;
 
-         p0 = vlib_get_buffer (vm, pi0);
-         ip0 = vlib_buffer_get_current (p0);
          sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
 
-         /*
-          * RFC1812 says to keep as much of the original packet as
-          * possible within the minimum MTU (576). We cheat "a little"
-          * here by keeping whatever fits in the first buffer, to be more
-          * efficient
-          */
-         if (PREDICT_FALSE (p0->total_length_not_including_first_buffer))
-           {
-             /* clear current_length of all other buffers in chain */
-             vlib_buffer_t *b = p0;
-             p0->total_length_not_including_first_buffer = 0;
-             while (b->flags & VLIB_BUFFER_NEXT_PRESENT)
-               {
-                 b = vlib_get_buffer (vm, b->next_buffer);
-                 b->current_length = 0;
-               }
-           }
+         vlib_buffer_copy_trace_flag (vm, p0, pi0);
 
          /* Add IP header and ICMPv4 header including a 4 byte data field */
          vlib_buffer_advance (p0,
@@ -521,7 +333,6 @@ ip4_icmp_error (vlib_main_t * vm,
 
          p0->current_length =
            p0->current_length > 576 ? 576 : p0->current_length;
-
          out_ip0 = vlib_buffer_get_current (p0);
          icmp0 = (icmp46_header_t *) & out_ip0[1];
 
@@ -534,25 +345,14 @@ ip4_icmp_error (vlib_main_t * vm,
          out_ip0->ttl = 0xff;
          out_ip0->protocol = IP_PROTOCOL_ICMP;
          out_ip0->dst_address = ip0->src_address;
-         if_add_index0 = ~0;
-         if (PREDICT_TRUE (vec_len (lm->if_address_pool_index_by_sw_if_index)
-                           > sw_if_index0))
-           if_add_index0 =
-             lm->if_address_pool_index_by_sw_if_index[sw_if_index0];
-         if (PREDICT_TRUE (if_add_index0 != ~0))
-           {
-             ip_interface_address_t *if_add =
-               pool_elt_at_index (lm->if_address_pool, if_add_index0);
-             ip4_address_t *if_ip =
-               ip_interface_address_get_address (lm, if_add);
-             out_ip0->src_address = *if_ip;
-           }
-         else
-           {
-             /* interface has no IP4 address - should not happen */
+         /* Prefer a source address from "offending interface" */
+         if (!ip4_sas_by_sw_if_index (sw_if_index0, &out_ip0->dst_address,
+                                      &out_ip0->src_address))
+           { /* interface has no IP4 address - should not happen */
              next0 = IP4_ICMP_ERROR_NEXT_DROP;
              error0 = ICMP4_ERROR_DROP;
            }
+
          out_ip0->checksum = ip4_header_checksum (out_ip0);
 
          /* Fill icmp header fields */
@@ -570,6 +370,7 @@ ip4_icmp_error (vlib_main_t * vm,
          /* Update error status */
          if (error0 == ICMP4_ERROR_NONE)
            error0 = icmp4_icmp_type_to_error (icmp0->type);
+
          vlib_error_count (vm, node->node_index, error0, 1);
 
          /* Verify speculative enqueue, maybe switch current next frame */
@@ -580,6 +381,15 @@ ip4_icmp_error (vlib_main_t * vm,
       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
     }
 
+  /*
+   * push the original buffers to error-drop, so that
+   * they can get the error counters handled, then freed
+   */
+  vlib_buffer_enqueue_to_single_next (vm, node,
+                                     vlib_frame_vector_args (frame),
+                                     IP4_ICMP_ERROR_NEXT_DROP,
+                                     frame->n_vectors);
+
   return frame->n_vectors;
 }
 
@@ -653,7 +463,13 @@ icmp4_pg_edit_function (pg_main_t * pg,
       ASSERT (p0->current_data == 0);
       ip0 = (void *) (p0->data + ip_offset);
       icmp0 = (void *) (p0->data + icmp_offset);
-      len0 = clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
+
+      /* if IP length has not been specified, then calculate the length based on buffer */
+      if (ip0->length == 0)
+       len0 = vlib_buffer_length_in_chain (vm, p0) - icmp_offset;
+      else
+       len0 = clib_net_to_host_u16 (ip0->length) - icmp_offset;
+
       icmp0->checksum =
        ~ip_csum_fold (ip_incremental_checksum (0, icmp0, len0));
     }
@@ -733,10 +549,17 @@ void
 ip4_icmp_register_type (vlib_main_t * vm, icmp4_type_t type, u32 node_index)
 {
   icmp4_main_t *im = &icmp4_main;
+  u32 old_next_index;
 
   ASSERT ((int) type < ARRAY_LEN (im->ip4_input_next_index_by_type));
+  old_next_index = im->ip4_input_next_index_by_type[type];
+
   im->ip4_input_next_index_by_type[type]
     = vlib_node_add_next (vm, ip4_icmp_input_node.index, node_index);
+
+  if (old_next_index &&
+      (old_next_index != im->ip4_input_next_index_by_type[type]))
+    clib_warning ("WARNING: changed next_by_type[%d]", (int) type);
 }
 
 static clib_error_t *
@@ -770,8 +593,10 @@ icmp4_init (vlib_main_t * vm)
               ICMP_INPUT_NEXT_ERROR,
               sizeof (cm->ip4_input_next_index_by_type));
 
-  ip4_icmp_register_type (vm, ICMP4_echo_request,
-                         ip4_icmp_echo_request_node.index);
+  vlib_thread_main_t *tm = &vlib_thread_main;
+  u32 n_vlib_mains = tm->n_vlib_mains;
+
+  throttle_init (&icmp_throttle, n_vlib_mains, 1e-3);
 
   return 0;
 }