ipsec: Fix feature ordering
[vpp.git] / src / vnet / ip / ip4_reassembly.c
index 446df33..b82bafe 100644 (file)
@@ -30,6 +30,7 @@
 #define IP4_REASS_TIMEOUT_DEFAULT_MS 100
 #define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000        // 10 seconds default
 #define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
+#define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
 #define IP4_REASS_HT_LOAD_FACTOR (0.75)
 
 #define IP4_REASS_DEBUG_BUFFERS 0
@@ -57,6 +58,7 @@
 typedef enum
 {
   IP4_REASS_RC_OK,
+  IP4_REASS_RC_TOO_MANY_FRAGMENTS,
   IP4_REASS_RC_INTERNAL_ERROR,
   IP4_REASS_RC_NO_BUF,
 } ip4_reass_rc_t;
@@ -130,10 +132,13 @@ typedef struct
   // trace operation counter
   u32 trace_op_counter;
   // next index - used by non-feature node
-  u8 next_index;
+  u32 next_index;
+  // error next index - used by custom apps (~0 if not used)
+  u32 error_next_index;
   // minimum fragment length for this reassembly - used to estimate MTU
   u16 min_fragment_length;
-
+  // number of fragments in this reassembly
+  u32 fragments_n;
 } ip4_reass_t;
 
 typedef struct
@@ -150,6 +155,9 @@ typedef struct
   u32 timeout_ms;
   f64 timeout;
   u32 expire_walk_interval_ms;
+  // maximum number of fragments in one reassembly
+  u32 max_reass_len;
+  // maximum number of reassemblies
   u32 max_reass_n;
 
   // IPv4 runtime
@@ -286,12 +294,6 @@ ip4_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
 {
   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
-  if (pool_is_free_index (vm->trace_main.trace_buffer_pool, b->trace_index))
-    {
-      // this buffer's trace is gone
-      b->flags &= ~VLIB_BUFFER_IS_TRACED;
-      return;
-    }
   ip4_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
   t->reass_id = reass->id;
   t->action = action;
@@ -326,8 +328,8 @@ ip4_reass_free (ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
 }
 
 always_inline void
-ip4_reass_on_timeout (vlib_main_t * vm, ip4_reass_main_t * rm,
-                     ip4_reass_t * reass)
+ip4_reass_drop_all (vlib_main_t * vm, vlib_node_runtime_t * node,
+                   ip4_reass_main_t * rm, ip4_reass_t * reass)
 {
   u32 range_bi = reass->first_bi;
   vlib_buffer_t *range_b;
@@ -354,14 +356,45 @@ ip4_reass_on_timeout (vlib_main_t * vm, ip4_reass_main_t * rm,
        }
       range_bi = range_vnb->ip.reass.next_range_bi;
     }
-  vlib_buffer_free (vm, to_free, vec_len (to_free));
-  vec_free (to_free);
+  /* send to error_next_index */
+  if (~0 != reass->error_next_index)
+    {
+      u32 n_left_to_next, *to_next, next_index;
+
+      next_index = reass->error_next_index;
+      u32 bi = ~0;
+
+      while (vec_len (to_free) > 0)
+       {
+         vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+         while (vec_len (to_free) > 0 && n_left_to_next > 0)
+           {
+             bi = vec_pop (to_free);
+
+             if (~0 != bi)
+               {
+                 to_next[0] = bi;
+                 to_next += 1;
+                 n_left_to_next -= 1;
+                 vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+                                                  to_next, n_left_to_next,
+                                                  bi, next_index);
+               }
+           }
+         vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+       }
+    }
+  else
+    {
+      vlib_buffer_free (vm, to_free, vec_len (to_free));
+    }
 }
 
 static ip4_reass_t *
-ip4_reass_find_or_create (vlib_main_t * vm, ip4_reass_main_t * rm,
-                         ip4_reass_per_thread_t * rt, ip4_reass_kv_t * kv,
-                         u8 * do_handoff)
+ip4_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
+                         ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
+                         ip4_reass_kv_t * kv, u8 * do_handoff)
 {
   ip4_reass_t *reass = NULL;
   f64 now = vlib_time_now (rm->vlib_main);
@@ -378,7 +411,7 @@ ip4_reass_find_or_create (vlib_main_t * vm, ip4_reass_main_t * rm,
 
       if (now > reass->last_heard + rm->timeout)
        {
-         ip4_reass_on_timeout (vm, rm, reass);
+         ip4_reass_drop_all (vm, node, rm, reass);
          ip4_reass_free (rm, rt, reass);
          reass = NULL;
        }
@@ -404,6 +437,8 @@ ip4_reass_find_or_create (vlib_main_t * vm, ip4_reass_main_t * rm,
       reass->first_bi = ~0;
       reass->last_packet_octet = ~0;
       reass->data_len = 0;
+      reass->next_index = ~0;
+      reass->error_next_index = ~0;
       ++rt->reass_n;
     }
 
@@ -426,7 +461,7 @@ always_inline ip4_reass_rc_t
 ip4_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                    ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                    ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
-                   bool is_feature)
+                   bool is_custom_app)
 {
   vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
   vlib_buffer_t *last_b = NULL;
@@ -475,7 +510,7 @@ ip4_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
-                 vlib_buffer_free_one (vm, tmp_bi);
+                 u32 to_be_freed_bi = tmp_bi;
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
@@ -483,7 +518,9 @@ ip4_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                    }
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
+                 tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
+                 vlib_buffer_free_one (vm, to_be_freed_bi);
                  continue;
                }
              else
@@ -514,23 +551,37 @@ ip4_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                    }
                }
              total_length += tmp->current_length;
+             if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
+               {
+                 tmp_bi = tmp->next_buffer;
+                 tmp = vlib_get_buffer (vm, tmp->next_buffer);
+               }
+             else
+               {
+                 break;
+               }
            }
          else
            {
-             vlib_buffer_free_one (vm, tmp_bi);
+             u32 to_be_freed_bi = tmp_bi;
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
-           }
-         if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
-           {
-             tmp_bi = tmp->next_buffer;
-             tmp = vlib_get_buffer (vm, tmp->next_buffer);
-           }
-         else
-           {
-             break;
+             if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
+               {
+                 tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
+                 tmp_bi = tmp->next_buffer;
+                 tmp->next_buffer = 0;
+                 tmp = vlib_get_buffer (vm, tmp_bi);
+                 vlib_buffer_free_one (vm, to_be_freed_bi);
+               }
+             else
+               {
+                 tmp->next_buffer = 0;
+                 vlib_buffer_free_one (vm, to_be_freed_bi);
+                 break;
+               }
            }
        }
       sub_chain_bi =
@@ -544,6 +595,7 @@ ip4_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
       return IP4_REASS_RC_INTERNAL_ERROR;
     }
   last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
+
   if (total_length < first_b->current_length)
     {
       return IP4_REASS_RC_INTERNAL_ERROR;
@@ -559,7 +611,8 @@ ip4_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
     {
       return IP4_REASS_RC_NO_BUF;
     }
-
+  // reset to reconstruct the mbuf linking
+  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
   if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
     {
       ip4_reass_add_trace (vm, node, rm, reass, reass->first_bi, FINALIZE, 0);
@@ -591,7 +644,7 @@ ip4_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
 #endif
     }
   *bi0 = reass->first_bi;
-  if (is_feature)
+  if (!is_custom_app)
     {
       *next0 = IP4_REASSEMBLY_NEXT_INPUT;
     }
@@ -672,7 +725,7 @@ ip4_reass_remove_range_from_chain (vlib_main_t * vm,
   reass->data_len -= ip4_reass_buffer_get_data_len (discard_b);
   while (1)
     {
-      vlib_buffer_free_one (vm, discard_bi);
+      u32 to_be_freed_bi = discard_bi;
       if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_reass_add_trace (vm, node, rm, reass, discard_bi, RANGE_DISCARD,
@@ -682,10 +735,14 @@ ip4_reass_remove_range_from_chain (vlib_main_t * vm,
        {
          discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
          discard_bi = discard_b->next_buffer;
+         discard_b->next_buffer = 0;
          discard_b = vlib_get_buffer (vm, discard_bi);
+         vlib_buffer_free_one (vm, to_be_freed_bi);
        }
       else
        {
+         discard_b->next_buffer = 0;
+         vlib_buffer_free_one (vm, to_be_freed_bi);
          break;
        }
     }
@@ -696,14 +753,19 @@ always_inline ip4_reass_rc_t
 ip4_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
                  ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                  ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
-                 bool is_feature)
+                 bool is_custom_app)
 {
   ip4_reass_rc_t rc = IP4_REASS_RC_OK;
   int consumed = 0;
   vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
   ip4_header_t *fip = vlib_buffer_get_current (fb);
   vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
-  reass->next_index = fvnb->ip.reass.next_index;       // store next_index before it's overwritten
+  if (is_custom_app)
+    {
+      // store (error_)next_index before it's overwritten
+      reass->next_index = fvnb->ip.reass.next_index;
+      reass->error_next_index = fvnb->ip.reass.error_next_index;
+    }
   const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
   const u32 fragment_length =
     clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
@@ -736,6 +798,7 @@ ip4_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
        }
       *bi0 = ~0;
       reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
+      reass->fragments_n = 1;
       return IP4_REASS_RC_OK;
     }
   reass->min_fragment_length = clib_min (clib_net_to_host_u16 (fip->length),
@@ -893,6 +956,7 @@ ip4_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
        }
       break;
     }
+  ++reass->fragments_n;
   if (consumed)
     {
       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
@@ -904,13 +968,17 @@ ip4_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
       reass->data_len == reass->last_packet_octet + 1)
     {
       return ip4_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
-                                is_feature);
+                                is_custom_app);
     }
   else
     {
       if (consumed)
        {
          *bi0 = ~0;
+         if (reass->fragments_n > rm->max_reass_len)
+           {
+             rc = IP4_REASS_RC_TOO_MANY_FRAGMENTS;
+           }
        }
       else
        {
@@ -922,9 +990,9 @@ ip4_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
 }
 
 always_inline uword
-ip4_reassembly_inline (vlib_main_t * vm,
-                      vlib_node_runtime_t * node,
-                      vlib_frame_t * frame, bool is_feature)
+ip4_reassembly_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+                      vlib_frame_t * frame, bool is_feature,
+                      bool is_custom_app)
 {
   u32 *from = vlib_frame_vector_args (frame);
   u32 n_left_from, n_left_to_next, *to_next, next_index;
@@ -952,7 +1020,7 @@ ip4_reassembly_inline (vlib_main_t * vm,
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a whole packet - no fragmentation
-             if (is_feature)
+             if (!is_custom_app)
                {
                  next0 = IP4_REASSEMBLY_NEXT_INPUT;
                }
@@ -986,7 +1054,8 @@ ip4_reassembly_inline (vlib_main_t * vm,
                    (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;
 
                  ip4_reass_t *reass =
-                   ip4_reass_find_or_create (vm, rm, rt, &kv, &do_handoff);
+                   ip4_reass_find_or_create (vm, node, rm, rt, &kv,
+                                             &do_handoff);
 
                  if (PREDICT_FALSE (do_handoff))
                    {
@@ -1003,16 +1072,33 @@ ip4_reassembly_inline (vlib_main_t * vm,
                    {
                      switch (ip4_reass_update
                              (vm, node, rm, rt, reass, &bi0, &next0,
-                              &error0, is_feature))
+                              &error0, is_custom_app))
                        {
                        case IP4_REASS_RC_OK:
                          /* nothing to do here */
                          break;
+                       case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
+                         vlib_node_increment_counter (vm, node->node_index,
+                                                      IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
+                                                      1);
+                         ip4_reass_drop_all (vm, node, rm, reass);
+                         ip4_reass_free (rm, rt, reass);
+                         goto next_packet;
+                         break;
                        case IP4_REASS_RC_NO_BUF:
-                         /* fallthrough */
+                         vlib_node_increment_counter (vm, node->node_index,
+                                                      IP4_ERROR_REASS_NO_BUF,
+                                                      1);
+                         ip4_reass_drop_all (vm, node, rm, reass);
+                         ip4_reass_free (rm, rt, reass);
+                         goto next_packet;
+                         break;
                        case IP4_REASS_RC_INTERNAL_ERROR:
                          /* drop everything and start with a clean slate */
-                         ip4_reass_on_timeout (vm, rm, reass);
+                         vlib_node_increment_counter (vm, node->node_index,
+                                                      IP4_ERROR_REASS_INTERNAL_ERROR,
+                                                      1);
+                         ip4_reass_drop_all (vm, node, rm, reass);
                          ip4_reass_free (rm, rt, reass);
                          goto next_packet;
                          break;
@@ -1065,7 +1151,8 @@ static char *ip4_reassembly_error_strings[] = {
 VLIB_NODE_FN (ip4_reass_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
                               vlib_frame_t * frame)
 {
-  return ip4_reassembly_inline (vm, node, frame, false /* is_feature */ );
+  return ip4_reassembly_inline (vm, node, frame, false /* is_feature */ ,
+                               false /* is_custom_app */ );
 }
 
 /* *INDENT-OFF* */
@@ -1090,7 +1177,8 @@ VLIB_NODE_FN (ip4_reass_node_feature) (vlib_main_t * vm,
                                       vlib_node_runtime_t * node,
                                       vlib_frame_t * frame)
 {
-  return ip4_reassembly_inline (vm, node, frame, true /* is_feature */ );
+  return ip4_reassembly_inline (vm, node, frame, true /* is_feature */ ,
+                               false /* is_custom_app */ );
 }
 
 /* *INDENT-OFF* */
@@ -1114,7 +1202,8 @@ VLIB_REGISTER_NODE (ip4_reass_node_feature) = {
 VNET_FEATURE_INIT (ip4_reassembly_feature, static) = {
     .arc_name = "ip4-unicast",
     .node_name = "ip4-reassembly-feature",
-    .runs_before = VNET_FEATURES ("ip4-lookup"),
+    .runs_before = VNET_FEATURES ("ip4-lookup",
+                                  "ipsec4-input-feature"),
     .runs_after = 0,
 };
 /* *INDENT-ON* */
@@ -1162,20 +1251,21 @@ ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
 
 static void
 ip4_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
-                     u32 expire_walk_interval_ms)
+                     u32 max_reassembly_length, u32 expire_walk_interval_ms)
 {
   ip4_reass_main.timeout_ms = timeout_ms;
   ip4_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
   ip4_reass_main.max_reass_n = max_reassemblies;
+  ip4_reass_main.max_reass_len = max_reassembly_length;
   ip4_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
 }
 
 vnet_api_error_t
 ip4_reass_set (u32 timeout_ms, u32 max_reassemblies,
-              u32 expire_walk_interval_ms)
+              u32 max_reassembly_length, u32 expire_walk_interval_ms)
 {
   u32 old_nbuckets = ip4_reass_get_nbuckets ();
-  ip4_reass_set_params (timeout_ms, max_reassemblies,
+  ip4_reass_set_params (timeout_ms, max_reassemblies, max_reassembly_length,
                        expire_walk_interval_ms);
   vlib_process_signal_event (ip4_reass_main.vlib_main,
                             ip4_reass_main.ip4_reass_expire_node_idx,
@@ -1202,6 +1292,7 @@ ip4_reass_set (u32 timeout_ms, u32 max_reassemblies,
          clib_bihash_free_16_8 (&ip4_reass_main.hash);
          clib_memcpy_fast (&ip4_reass_main.hash, &new_hash,
                            sizeof (ip4_reass_main.hash));
+         clib_bihash_copied (&ip4_reass_main.hash, &new_hash);
        }
     }
   return 0;
@@ -1209,10 +1300,11 @@ ip4_reass_set (u32 timeout_ms, u32 max_reassemblies,
 
 vnet_api_error_t
 ip4_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
-              u32 * expire_walk_interval_ms)
+              u32 * max_reassembly_length, u32 * expire_walk_interval_ms)
 {
   *timeout_ms = ip4_reass_main.timeout_ms;
   *max_reassemblies = ip4_reass_main.max_reass_n;
+  *max_reassembly_length = ip4_reass_main.max_reass_len;
   *expire_walk_interval_ms = ip4_reass_main.expire_walk_interval_ms;
   return 0;
 }
@@ -1242,6 +1334,7 @@ ip4_reass_init_function (vlib_main_t * vm)
 
   ip4_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
                        IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
+                       IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
                        IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);
 
   nbuckets = ip4_reass_get_nbuckets ();
@@ -1316,7 +1409,7 @@ ip4_reass_walk_expired (vlib_main_t * vm,
           vec_foreach (i, pool_indexes_to_free)
           {
             ip4_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
-            ip4_reass_on_timeout (vm, rm, reass);
+            ip4_reass_drop_all (vm, node, rm, reass);
             ip4_reass_free (rm, rt, reass);
           }
           /* *INDENT-ON* */