IPv4/6 reassembly 32/9532/10
authorKlement Sekera <ksekera@cisco.com>
Wed, 20 Sep 2017 06:26:30 +0000 (08:26 +0200)
committerFlorin Coras <florin.coras@gmail.com>
Thu, 1 Feb 2018 23:41:17 +0000 (23:41 +0000)
Change-Id: Ic5dcadd13c88b8a5e7896dab82404509c081614a
Signed-off-by: Klement Sekera <ksekera@cisco.com>
35 files changed:
src/vlib/buffer_funcs.h
src/vnet.am
src/vnet/buffer.h
src/vnet/ip/icmp46_packet.h
src/vnet/ip/icmp6.h
src/vnet/ip/ip.api
src/vnet/ip/ip4_error.h
src/vnet/ip/ip4_forward.c
src/vnet/ip/ip4_input.c
src/vnet/ip/ip4_input.h
src/vnet/ip/ip4_reassembly.c [new file with mode: 0644]
src/vnet/ip/ip4_reassembly.h [new file with mode: 0644]
src/vnet/ip/ip6_error.h
src/vnet/ip/ip6_forward.c
src/vnet/ip/ip6_input.h
src/vnet/ip/ip6_packet.h
src/vnet/ip/ip6_reassembly.c [new file with mode: 0644]
src/vnet/ip/ip6_reassembly.h [new file with mode: 0644]
src/vnet/ip/ip_api.c
src/vnet/ip/lookup.c
src/vnet/ip/lookup.h
src/vnet/ip/protocols.def
test/Makefile
test/framework.py
test/hook.py
test/patches/scapy-2.3.3/inet6.py.patch
test/sys_req/set_system_parameters.sh [new file with mode: 0755]
test/sys_req/system_parameters [new file with mode: 0644]
test/test_reassembly.py [new file with mode: 0644]
test/util.py
test/vpp_gre_interface.py
test/vpp_interface.py
test/vpp_lo_interface.py
test/vpp_papi_provider.py
test/vpp_punt_socket.py [new file with mode: 0644]

index 4831eb5..cc56db7 100644 (file)
@@ -1004,6 +1004,68 @@ vlib_validate_buffer_set_in_use (vlib_buffer_t * b, u32 expected)
 #endif
 }
 
+/** minimum data size of first buffer in a buffer chain */
+#define VLIB_BUFFER_CHAIN_MIN_FIRST_DATA_SIZE (256)
+
+/**
+ * @brief compress buffer chain in a way where the first buffer is at least
+ * VLIB_BUFFER_CHAIN_MIN_FIRST_DATA_SIZE long
+ *
+ * Data is pulled from the following buffer(s) into the tail of the first
+ * buffer. Buffers which become empty are unlinked from the chain and their
+ * indexes are appended to discard_vector; this function does not free them.
+ * Nothing is done if the first buffer is already long enough or if it is
+ * not followed by another buffer.
+ *
+ * @param[in] vm - vlib_main
+ * @param[in,out] first - first buffer in chain
+ * @param[in,out] discard_vector - vector of buffer indexes which were removed
+ * from the chain
+ */
+always_inline void
+vlib_buffer_chain_compress (vlib_main_t * vm,
+                           vlib_buffer_t * first, u32 ** discard_vector)
+{
+  if (first->current_length >= VLIB_BUFFER_CHAIN_MIN_FIRST_DATA_SIZE ||
+      !(first->flags & VLIB_BUFFER_NEXT_PRESENT))
+    {
+      /* this is already big enough or not a chain */
+      return;
+    }
+  /* probe free list to find allocated buffer size to avoid overfill */
+  u32 index;
+  vlib_buffer_free_list_t *free_list =
+    vlib_buffer_get_buffer_free_list (vm, first, &index);
+
+  /* target size is capped by the room left after current_data */
+  u32 want_first_size = clib_min (VLIB_BUFFER_CHAIN_MIN_FIRST_DATA_SIZE,
+                                 free_list->n_data_bytes -
+                                 first->current_data);
+  do
+    {
+      vlib_buffer_t *second = vlib_get_buffer (vm, first->next_buffer);
+      /* NOTE(review): unsigned subtraction - relies on current_length not
+       * exceeding want_first_size; confirm this holds for all callers */
+      u32 need = want_first_size - first->current_length;
+      u32 amount_to_copy = clib_min (need, second->current_length);
+      /* append the head of the second buffer to the tail of the first */
+      clib_memcpy (((u8 *) vlib_buffer_get_current (first)) +
+                  first->current_length,
+                  vlib_buffer_get_current (second), amount_to_copy);
+      first->current_length += amount_to_copy;
+      vlib_buffer_advance (second, amount_to_copy);
+      if (first->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID)
+       {
+         /* the moved bytes now belong to the first buffer */
+         first->total_length_not_including_first_buffer -= amount_to_copy;
+       }
+      if (!second->current_length)
+       {
+         /* second buffer is drained - unlink it and report it to caller */
+         vec_add1 (*discard_vector, first->next_buffer);
+         if (second->flags & VLIB_BUFFER_NEXT_PRESENT)
+           {
+             first->next_buffer = second->next_buffer;
+           }
+         else
+           {
+             first->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
+           }
+         second->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
+       }
+    }
+  while ((first->current_length < want_first_size) &&
+        (first->flags & VLIB_BUFFER_NEXT_PRESENT));
+}
+
 #endif /* included_vlib_buffer_funcs_h */
 
 /*
index 7eb4159..8f88474 100644 (file)
@@ -343,6 +343,7 @@ libvnet_la_SOURCES +=                               \
  vnet/ip/ip4_pg.c                              \
  vnet/ip/ip4_source_and_port_range_check.c     \
  vnet/ip/ip4_source_check.c                    \
+ vnet/ip/ip4_reassembly.c                       \
  vnet/ip/ip6_format.c                          \
  vnet/ip/ip6_forward.c                         \
  vnet/ip/ip6_punt_drop.c                       \
@@ -350,6 +351,7 @@ libvnet_la_SOURCES +=                               \
  vnet/ip/ip6_input.c                           \
  vnet/ip/ip6_neighbor.c                                \
  vnet/ip/ip6_pg.c                              \
+ vnet/ip/ip6_reassembly.c                       \
  vnet/ip/ip_api.c                              \
  vnet/ip/ip_checksum.c                         \
  vnet/ip/ip_frag.c                             \
index c9f1d6e..5d7273a 100644 (file)
@@ -177,6 +177,18 @@ typedef struct
          u8 code;
          u32 data;
        } icmp;
+
+       /* reassembly */
+       struct
+       {
+         u16 fragment_first;
+         u16 fragment_last;
+         u16 range_first;
+         u16 range_last;
+         u32 next_range_bi;
+         u16 ip6_frag_hdr_offset;
+         u16 estimated_mtu;
+       } reass;
       };
 
     } ip;
index a86cbd5..e761af8 100644 (file)
   _ (parameter_problem, 0, erroneous_header_field)                     \
   _ (parameter_problem, 1, unrecognized_next_header)                   \
   _ (parameter_problem, 2, unrecognized_option)                                \
+  _ (parameter_problem, 3, first_fragment_has_incomplete_header_chain)  \
   _ (router_renumbering, 0, command)                                   \
   _ (router_renumbering, 1, result)                                    \
   _ (node_information_request, 0, data_contains_ip6_address)           \
index 9a3487b..1cac824 100644 (file)
@@ -45,7 +45,7 @@
   _ (DEST_UNREACH_SENT, "destination unreachable response sent")       \
   _ (PACKET_TOO_BIG_SENT, "packet too big response sent")              \
   _ (TTL_EXPIRE_SENT, "hop limit exceeded response sent")              \
-  _ (PARAM_PROBLEM_SENT, "parameter Pproblem response sent")           \
+  _ (PARAM_PROBLEM_SENT, "parameter problem response sent")            \
   _ (DROP, "error message dropped")
 
 
index 0da0a61..6ed5a9d 100644 (file)
@@ -812,6 +812,34 @@ autoreply define ioam_disable
   u16 id;
 };
 
+/*
+ * Request carrying IP reassembly parameters for one address family.
+ * NOTE(review): the _ms suffixes suggest milliseconds and is_ip6 presumably
+ * selects IPv6 over IPv4 - confirm against the handler in ip_api.c.
+ */
+autoreply define ip_reassembly_set
+{
+  u32 client_index;
+  u32 context;
+  u32 timeout_ms;
+  u32 max_reassemblies;
+  u32 expire_walk_interval_ms;
+  u8 is_ip6;
+};
+
+/*
+ * Request for the reassembly parameters of one address family.
+ */
+define ip_reassembly_get
+{
+  u32 client_index;
+  u32 context;
+  u8 is_ip6;
+};
+
+/*
+ * Reply to ip_reassembly_get; carries the same parameter fields as
+ * ip_reassembly_set plus a retval.
+ */
+define ip_reassembly_get_reply
+{
+  u32 client_index;
+  u32 context;
+  i32 retval;
+  u32 timeout_ms;
+  u32 max_reassemblies;
+  u32 expire_walk_interval_ms;
+  u8 is_ip6;
+};
+
 /*
  * Local Variables:
  * eval: (c-set-style "gnu")
index 4e545ae..00b1c6d 100644 (file)
   _ (UNICAST_SOURCE_CHECK_FAILS, "ip4 unicast source check fails")     \
                                                                         \
   /* Spoofed packets in ip4-rewrite-local */                            \
-  _(SPOOFED_LOCAL_PACKETS, "ip4 spoofed local-address packet drops")    \
+  _ (SPOOFED_LOCAL_PACKETS, "ip4 spoofed local-address packet drops")   \
                                                                         \
   /* Errors singalled by ip4-inacl */                                   \
   _ (INACL_TABLE_MISS, "input ACL table-miss drops")                    \
   _ (INACL_SESSION_DENY, "input ACL session deny drops")                \
                                                                         \
   /* Erros from mfib-forward */                                         \
-  _ (RPF_FAILURE, "Multicast RPF check failed")
+  _ (RPF_FAILURE, "Multicast RPF check failed")                         \
+                                                                        \
+  /* Errors signalled by ip4-reassembly */                              \
+  _ (REASS_DUPLICATE_FRAGMENT, "duplicate/overlapping fragments")       \
+  _ (REASS_LIMIT_REACHED, "drops due to concurrent reassemblies limit") \
+  _ (REASS_TIMEOUT, "fragments dropped due to reassembly timeout")
 
 typedef enum
 {
index 3ddf6df..2a21135 100755 (executable)
@@ -1582,8 +1582,12 @@ ip4_local_inline (vlib_main_t * vm,
 
          /* Treat IP frag packets as "experimental" protocol for now
             until support of IP frag reassembly is implemented */
-         proto0 = ip4_is_fragment (ip0) ? 0xfe : ip0->protocol;
-         proto1 = ip4_is_fragment (ip1) ? 0xfe : ip1->protocol;
+         proto0 =
+           ip4_is_fragment (ip0) ? IP_PROTOCOL_VPP_FRAGMENTATION :
+           ip0->protocol;
+         proto1 =
+           ip4_is_fragment (ip1) ? IP_PROTOCOL_VPP_FRAGMENTATION :
+           ip1->protocol;
 
          if (head_of_feature_arc == 0)
            goto skip_checks;
@@ -1748,7 +1752,9 @@ ip4_local_inline (vlib_main_t * vm,
 
          /* Treat IP frag packets as "experimental" protocol for now
             until support of IP frag reassembly is implemented */
-         proto0 = ip4_is_fragment (ip0) ? 0xfe : ip0->protocol;
+         proto0 =
+           ip4_is_fragment (ip0) ? IP_PROTOCOL_VPP_FRAGMENTATION :
+           ip0->protocol;
 
          if (head_of_feature_arc == 0 || p0->flags & VNET_BUFFER_F_IS_NATED)
            goto skip_check;
@@ -1839,6 +1845,7 @@ VLIB_REGISTER_NODE (ip4_local_node) =
     [IP_LOCAL_NEXT_PUNT] = "ip4-punt",
     [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup",
     [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input",
+    [IP_LOCAL_NEXT_REASSEMBLY] = "ip4-reassembly",
   },
 };
 /* *INDENT-ON* */
index 121f40f..61c86ef 100644 (file)
@@ -289,6 +289,7 @@ VLIB_REGISTER_NODE (ip4_input_node) = {
     [IP4_INPUT_NEXT_LOOKUP] = "ip4-lookup",
     [IP4_INPUT_NEXT_LOOKUP_MULTICAST] = "ip4-mfib-forward-lookup",
     [IP4_INPUT_NEXT_ICMP_ERROR] = "ip4-icmp-error",
+    [IP4_INPUT_NEXT_REASSEMBLY] = "ip4-reassembly",
   },
 
   .format_buffer = format_ip4_header,
@@ -311,6 +312,7 @@ VLIB_REGISTER_NODE (ip4_input_no_checksum_node,static) = {
     [IP4_INPUT_NEXT_LOOKUP] = "ip4-lookup",
     [IP4_INPUT_NEXT_LOOKUP_MULTICAST] = "ip4-mfib-forward-lookup",
     [IP4_INPUT_NEXT_ICMP_ERROR] = "ip4-icmp-error",
+    [IP4_INPUT_NEXT_REASSEMBLY] = "ip4-reassembly",
   },
 
   .format_buffer = format_ip4_header,
index 75306a3..2692848 100644 (file)
@@ -52,6 +52,7 @@ typedef enum
   IP4_INPUT_NEXT_LOOKUP,
   IP4_INPUT_NEXT_LOOKUP_MULTICAST,
   IP4_INPUT_NEXT_ICMP_ERROR,
+  IP4_INPUT_NEXT_REASSEMBLY,
   IP4_INPUT_N_NEXT,
 } ip4_input_next_t;
 
diff --git a/src/vnet/ip/ip4_reassembly.c b/src/vnet/ip/ip4_reassembly.c
new file mode 100644 (file)
index 0000000..6b8d665
--- /dev/null
@@ -0,0 +1,1277 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ * @brief IPv4 Reassembly.
+ *
+ * This file contains the source code for IPv4 reassembly.
+ */
+
+#include <vppinfra/vec.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vppinfra/bihash_24_8.h>
+#include <vnet/ip/ip4_reassembly.h>
+
+#define MSEC_PER_SEC 1000
+#define IP4_REASS_TIMEOUT_DEFAULT_MS 100
+#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000        // 10 seconds default
+#define IP4_REASS_MAX_REASSEMBLIES_DEAFULT 1024
+#define IP4_REASS_HT_LOAD_FACTOR (0.75)
+
+/* set to 1 to print buffer chains handled by reassembly to stdout */
+#define IP4_REASS_DEBUG_BUFFERS 0
+#if IP4_REASS_DEBUG_BUFFERS
+/*
+ * Print "<what> buffer <bi>" followed by the index of every buffer chained
+ * behind bi. Expects a vlib_main_t pointer named vm in the calling scope.
+ */
+#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
+  do                                                 \
+    {                                                \
+      u32 _bi = (bi);                                \
+      printf (#what " buffer %u", _bi);              \
+      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
+      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
+        {                                            \
+          _bi = _b->next_buffer;                     \
+          printf ("[%u]", _bi);                      \
+          _b = vlib_get_buffer (vm, _bi);            \
+        }                                            \
+      printf ("\n");                                 \
+      fflush (stdout);                               \
+    }                                                \
+  while (0)
+#else
+#define IP4_REASS_DEBUG_BUFFER(...)
+#endif
+
+static vlib_node_registration_t ip4_reass_node;
+
+/*
+ * Key identifying one reassembly context. The fields overlay three u64
+ * words (as_u64) which are copied into the 24-byte bihash key.
+ */
+typedef struct
+{
+  union
+  {
+    struct
+    {
+      // align by making this 4 octets even though it's a 2-octet field
+      u32 xx_id;
+      ip4_address_t src;
+      ip4_address_t dst;
+      // align by making this 4 octets even though it's a 2-octet field
+      u32 frag_id;
+      // align by making this 4 octets even though it's a 1-octet field
+      u32 proto;
+      u32 unused;
+    };
+    u64 as_u64[3];
+  };
+} ip4_reass_key_t;
+
+/*
+ * Distance between the start of the range currently attributed to this
+ * buffer and the start of the fragment it arrived in. No validation is
+ * performed on the stored values.
+ */
+always_inline u32
+ip4_reass_buffer_get_data_offset_no_check (vlib_buffer_t * b)
+{
+  const u16 range_start = vnet_buffer (b)->ip.reass.range_first;
+  const u16 fragment_start = vnet_buffer (b)->ip.reass.fragment_first;
+  return range_start - fragment_start;
+}
+
+/*
+ * Checked variant of ip4_reass_buffer_get_data_offset_no_check: asserts that
+ * the range does not start before the fragment.
+ */
+always_inline u32
+ip4_reass_buffer_get_data_offset (vlib_buffer_t * b)
+{
+  vnet_buffer_opaque_t *meta = vnet_buffer (b);
+  ASSERT (meta->ip.reass.fragment_first <= meta->ip.reass.range_first);
+  return ip4_reass_buffer_get_data_offset_no_check (b);
+}
+
+/*
+ * Number of octets of this buffer's fragment which fall inside the range
+ * attributed to it: from range_first up to the smaller of range_last and
+ * fragment_last, inclusive.
+ *
+ * This is the unchecked variant, so it must not assert: it uses the
+ * unchecked offset helper as well, which keeps callers such as the trace
+ * code safe when the stored range is not (yet) consistent.
+ */
+always_inline u16
+ip4_reass_buffer_get_data_len_no_check (vlib_buffer_t * b)
+{
+  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
+  return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
+    (vnb->ip.reass.fragment_first +
+     ip4_reass_buffer_get_data_offset_no_check (b)) + 1;
+}
+
+/*
+ * Checked variant of ip4_reass_buffer_get_data_len_no_check: asserts that
+ * the range ends after the first octet of the fragment.
+ */
+always_inline u16
+ip4_reass_buffer_get_data_len (vlib_buffer_t * b)
+{
+  vnet_buffer_opaque_t *meta = vnet_buffer (b);
+  ASSERT (meta->ip.reass.fragment_first < meta->ip.reass.range_last);
+  return ip4_reass_buffer_get_data_len_no_check (b);
+}
+
+/* state of one in-progress reassembly, stored in ip4_reass_main_t.pool */
+typedef struct
+{
+  // hash table key
+  ip4_reass_key_t key;
+  // NOTE(review): only zeroed by the memset in ip4_reass_find_or_create in
+  // the code visible here - confirm whether it is set elsewhere
+  f64 first_heard;
+  // time when last packet was received
+  f64 last_heard;
+  // internal id of this reassembly
+  u32 id;
+  // buffer index of first buffer in this reassembly context
+  u32 first_bi;
+  // last octet of packet, ~0 until fragment without more_fragments arrives
+  u32 last_packet_octet;
+  // length of data collected so far
+  u32 data_len;
+  // trace operation counter
+  u32 trace_op_counter;
+} ip4_reass_t;
+
+/* configuration and runtime state of IPv4 reassembly */
+typedef struct
+{
+  // IPv4 config
+  u32 timeout_ms;
+  // compared against vlib_time_now () differences when expiring contexts;
+  // presumably timeout_ms converted to seconds - confirm at the setter
+  f64 timeout;
+  u32 expire_walk_interval_ms;
+  // upper bound on reass_n enforced when creating a new context
+  u32 max_reass_n;
+
+  // IPv4 runtime
+  // pool of reassembly contexts, indexed by the value stored in hash
+  ip4_reass_t *pool;
+  // maps ip4_reass_key_t to index in pool
+  clib_bihash_24_8_t hash;
+  // number of contexts currently allocated from pool
+  u32 reass_n;
+  // source of ip4_reass_t.id values, incremented per created context
+  u32 id_counter;
+  // count of buffers held by reassembly contexts
+  u32 buffers_n;
+
+  // convenience
+  vlib_main_t *vlib_main;
+  vnet_main_t *vnet_main;
+
+  // node index of ip4-drop node
+  u32 ip4_drop_idx;
+  u32 ip4_reass_expire_node_idx;
+
+} ip4_reass_main_t;
+
+ip4_reass_main_t ip4_reass_main;
+
+/* next node indexes used by the reassembly node */
+typedef enum
+{
+  IP4_REASSEMBLY_NEXT_INPUT,
+  IP4_REASSEMBLY_NEXT_DROP,
+  IP4_REASSEMBLY_N_NEXT,
+} ip4_reass_next_t;
+
+/* operation recorded in a trace entry, see format_ip4_reass_trace */
+typedef enum
+{
+  RANGE_NEW,                   // printed as "new"
+  RANGE_SHRINK,                        // printed as "shrink ... by <size_diff>"
+  RANGE_DISCARD,               // printed as "discard"
+  RANGE_OVERLAP,               // printed as "overlapping/ignored"
+  FINALIZE,                    // printed as "finalize reassembly"
+} ip4_reass_trace_operation_e;
+
+/* snapshot of one range, filled by ip4_reass_trace_details */
+typedef struct
+{
+  u16 range_first;
+  u16 range_last;
+  // index of the buffer the range values were read from
+  u32 range_bi;
+  i32 data_offset;
+  u32 data_len;
+  // first_bi of the reassembly at the time of tracing
+  u32 first_bi;
+} ip4_reass_range_trace_t;
+
+/* trace record written by ip4_reass_add_trace */
+typedef struct
+{
+  ip4_reass_trace_operation_e action;
+  // index of the reassembly context in ip4_reass_main_t.pool
+  u32 pool_index;
+  u32 reass_id;
+  ip4_reass_range_trace_t trace_range;
+  // only printed for RANGE_SHRINK
+  u32 size_diff;
+  // value of the per-reassembly trace_op_counter when the record was made
+  u32 op_id;
+  u32 fragment_first;
+  u32 fragment_last;
+  u32 total_data_len;
+} ip4_reass_trace_t;
+
+/*
+ * Copy the range bookkeeping of buffer bi into a trace record. The unchecked
+ * accessors are used for the derived offset/length values.
+ */
+void
+ip4_reass_trace_details (vlib_main_t * vm, u32 bi,
+                        ip4_reass_range_trace_t * trace)
+{
+  vlib_buffer_t *buf = vlib_get_buffer (vm, bi);
+
+  trace->range_bi = bi;
+  trace->range_first = vnet_buffer (buf)->ip.reass.range_first;
+  trace->range_last = vnet_buffer (buf)->ip.reass.range_last;
+  trace->data_offset = ip4_reass_buffer_get_data_offset_no_check (buf);
+  trace->data_len = ip4_reass_buffer_get_data_len_no_check (buf);
+}
+
+/* format function for a single ip4_reass_range_trace_t (passed by pointer) */
+u8 *
+format_ip4_reass_range_trace (u8 * s, va_list * args)
+{
+  ip4_reass_range_trace_t *range =
+    va_arg (*args, ip4_reass_range_trace_t *);
+  return format (s, "range: [%u, %u], off %d, len %u, bi %u",
+                range->range_first, range->range_last, range->data_offset,
+                range->data_len, range->range_bi);
+}
+
+/*
+ * Trace format function: prints a summary line followed by one line
+ * describing the recorded operation. Unknown actions add no second line.
+ */
+u8 *
+format_ip4_reass_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  ip4_reass_trace_t *rec = va_arg (*args, ip4_reass_trace_t *);
+  ip4_reass_range_trace_t *range = &rec->trace_range;
+  u32 indent;
+
+  s = format (s, "reass id: %u, op id: %u ", rec->reass_id, rec->op_id);
+  /* indent is sampled after the prefix and reused for the second line */
+  indent = format_get_indent (s);
+  s = format (s, "first bi: %u, data len: %u, ip/fragment[%u, %u]",
+             range->first_bi, rec->total_data_len, rec->fragment_first,
+             rec->fragment_last);
+
+  switch (rec->action)
+    {
+    case RANGE_NEW:
+      s = format (s, "\n%Unew %U", format_white_space, indent,
+                 format_ip4_reass_range_trace, range);
+      break;
+    case RANGE_SHRINK:
+      s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
+                 format_ip4_reass_range_trace, range, rec->size_diff);
+      break;
+    case RANGE_OVERLAP:
+      s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
+                 format_ip4_reass_range_trace, range);
+      break;
+    case RANGE_DISCARD:
+      s = format (s, "\n%Udiscard %U", format_white_space, indent,
+                 format_ip4_reass_range_trace, range);
+      break;
+    case FINALIZE:
+      s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
+      break;
+    }
+  return s;
+}
+
+void
+ip4_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
+                    ip4_reass_main_t * rm, ip4_reass_t * reass, u32 bi,
+                    ip4_reass_trace_operation_e action, u32 size_diff)
+{
+  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
+  ip4_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
+  t->pool_index = reass - rm->pool;
+  t->reass_id = reass->id;
+  t->action = action;
+  ip4_reass_trace_details (vm, bi, &t->trace_range);
+  t->size_diff = size_diff;
+  t->op_id = reass->trace_op_counter;
+  ++reass->trace_op_counter;
+  t->fragment_first = vnb->ip.reass.fragment_first;
+  t->fragment_last = vnb->ip.reass.fragment_last;
+  t->trace_range.first_bi = reass->first_bi;
+  t->total_data_len = reass->data_len;
+#if 0
+  static u8 *s = NULL;
+  s = format (s, "%U", format_ip4_reass_trace, NULL, NULL, t);
+  printf ("%.*s\n", vec_len (s), s);
+  fflush (stdout);
+  vec_reset_length (s);
+#endif
+}
+
+/*
+ * Release a reassembly context: drop its hash entry, return it to the pool
+ * and decrement the count of active reassemblies. Buffers referenced by the
+ * context are not touched here.
+ */
+void
+ip4_reass_free (ip4_reass_main_t * rm, ip4_reass_t * reass)
+{
+  clib_bihash_kv_24_8_t entry;
+  int word;
+
+  for (word = 0; word < 3; word++)
+    {
+      entry.key[word] = reass->key.as_u64[word];
+    }
+  /* last argument 0 - the counterpart of the 1 used when inserting */
+  clib_bihash_add_del_24_8 (&rm->hash, &entry, 0);
+  pool_put (rm->pool, reass);
+  rm->reass_n -= 1;
+}
+
+/*
+ * Collect every buffer held by a timed-out reassembly into vec_drop_timeout.
+ * Walks all ranges and, within each range, the whole buffer chain; chain
+ * links are cleared on the way so each buffer ends up standalone.
+ */
+static void
+ip4_reass_on_timeout (vlib_main_t * vm, ip4_reass_main_t * rm,
+                     ip4_reass_t * reass, u32 ** vec_drop_timeout)
+{
+  u32 head_bi = reass->first_bi;
+
+  while (~0 != head_bi)
+    {
+      vlib_buffer_t *head_b = vlib_get_buffer (vm, head_bi);
+      vnet_buffer_opaque_t *head_meta = vnet_buffer (head_b);
+      u32 bi = head_bi;
+
+      /* head_bi is known to be valid here, so test at the bottom */
+      do
+       {
+         vlib_buffer_t *b;
+         vec_add1 (*vec_drop_timeout, bi);
+         b = vlib_get_buffer (vm, bi);
+         if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT))
+           {
+             bi = ~0;
+           }
+         else
+           {
+             bi = b->next_buffer;
+             b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
+           }
+       }
+      while (~0 != bi);
+
+      head_bi = head_meta->ip.reass.next_range_bi;
+    }
+}
+
+/*
+ * Look up the reassembly context for key k, creating one if needed.
+ *
+ * An existing context which has not been heard from for longer than the
+ * configured timeout is expired first: its buffers are appended to
+ * vec_drop_timeout, it is freed, and a fresh context is created instead.
+ *
+ * Returns NULL when the limit of concurrent reassemblies is reached or when
+ * the new context cannot be added to the hash table.
+ */
+ip4_reass_t *
+ip4_reass_find_or_create (vlib_main_t * vm, ip4_reass_main_t * rm,
+                         ip4_reass_key_t * k, u32 ** vec_drop_timeout)
+{
+  const f64 now = vlib_time_now (rm->vlib_main);
+  clib_bihash_kv_24_8_t lookup, found;
+  ip4_reass_t *ctx;
+  int word;
+
+  for (word = 0; word < 3; word++)
+    {
+      lookup.key[word] = k->as_u64[word];
+    }
+
+  if (0 == clib_bihash_search_24_8 (&rm->hash, &lookup, &found))
+    {
+      ctx = pool_elt_at_index (rm->pool, found.value);
+      if (!(now > ctx->last_heard + rm->timeout))
+       {
+         /* still alive - refresh and reuse */
+         ctx->last_heard = now;
+         return ctx;
+       }
+      /* stale - hand its buffers to the caller and start over */
+      ip4_reass_on_timeout (vm, rm, ctx, vec_drop_timeout);
+      ip4_reass_free (rm, ctx);
+    }
+
+  if (rm->reass_n >= rm->max_reass_n)
+    {
+      return NULL;
+    }
+
+  pool_get (rm->pool, ctx);
+  memset (ctx, 0, sizeof (*ctx));
+  ctx->id = rm->id_counter++;
+  ctx->first_bi = ~0;
+  ctx->last_packet_octet = ~0;
+  ctx->data_len = 0;
+  ++rm->reass_n;
+
+  for (word = 0; word < 3; word++)
+    {
+      ctx->key.as_u64[word] = lookup.key[word] = k->as_u64[word];
+    }
+  lookup.value = ctx - rm->pool;
+  ctx->last_heard = now;
+
+  if (clib_bihash_add_del_24_8 (&rm->hash, &lookup, 1))
+    {
+      ip4_reass_free (rm, ctx);
+      return NULL;
+    }
+
+  return ctx;
+}
+
+/*
+ * Turn the list of ranges collected in `reass` into a single buffer chain
+ * and hand it back to the caller.
+ *
+ * For every range, the leading octets not belonging to the range (for all
+ * but the first range this includes the IPv4 header) and the trailing
+ * octets beyond the range are trimmed; what remains is linked behind the
+ * previously kept buffer. Buffers consumed entirely by front trimming are
+ * appended to vec_drop_compress, buffers left over once a range's data is
+ * complete are appended to vec_drop_overlap. The IPv4 header of the first
+ * buffer is rewritten (no fragmentation fields, new length and checksum)
+ * and the reassembly context is freed.
+ *
+ * On return *bi0 is the head of the reassembled chain, *next0 is next_input
+ * and *error0 is IP4_ERROR_NONE.
+ */
+void
+ip4_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
+                   ip4_reass_main_t * rm, ip4_reass_t * reass, u32 * bi0,
+                   u32 * next0, vlib_error_t * error0, u32 next_input,
+                   u32 ** vec_drop_compress, u32 ** vec_drop_overlap)
+{
+  ASSERT (~0 != reass->first_bi);
+  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
+  vlib_buffer_t *last_b = NULL;
+  u32 sub_chain_bi = reass->first_bi;
+  u32 total_length = 0;
+  u32 buf_cnt = 0;
+  u32 dropped_cnt = 0;
+  /* outer loop: one iteration per range (sub-chain) */
+  do
+    {
+      u32 tmp_bi = sub_chain_bi;
+      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
+      ip4_header_t *ip = vlib_buffer_get_current (tmp);
+      u32 data_len = ip4_reass_buffer_get_data_len (tmp);
+      /* octets to remove in front of / behind the range's data */
+      u32 trim_front =
+       ip4_header_bytes (ip) + ip4_reass_buffer_get_data_offset (tmp);
+      u32 trim_end =
+       vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
+      if (tmp_bi == reass->first_bi)
+       {
+         /* first buffer - keep ip4 header */
+         ASSERT (0 == ip4_reass_buffer_get_data_offset (tmp));
+         trim_front = 0;
+         trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
+           ip4_header_bytes (ip);
+       }
+      u32 keep_data =
+       vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
+      /* inner loop: walk the buffers of this range's chain */
+      while (1)
+       {
+         ++buf_cnt;
+         if (trim_front)
+           {
+             if (trim_front > tmp->current_length)
+               {
+                 /* drop whole buffer */
+                 vec_add1 (*vec_drop_compress, tmp_bi);
+                 ++dropped_cnt;
+                 trim_front -= tmp->current_length;
+                 ASSERT (tmp->flags & VLIB_BUFFER_NEXT_PRESENT);
+                 tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
+                 tmp_bi = tmp->next_buffer;
+                 tmp = vlib_get_buffer (vm, tmp_bi);
+                 continue;
+               }
+             else
+               {
+                 vlib_buffer_advance (tmp, trim_front);
+                 trim_front = 0;
+               }
+           }
+         if (keep_data)
+           {
+             /* link this buffer behind the last kept one */
+             if (last_b)
+               {
+                 last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
+                 last_b->next_buffer = tmp_bi;
+               }
+             last_b = tmp;
+             if (keep_data <= tmp->current_length)
+               {
+                 /* range ends in this buffer - cut off the tail */
+                 tmp->current_length = keep_data;
+                 keep_data = 0;
+               }
+             else
+               {
+                 keep_data -= tmp->current_length;
+                 ASSERT (tmp->flags & VLIB_BUFFER_NEXT_PRESENT);
+               }
+             total_length += tmp->current_length;
+           }
+         else
+           {
+             /* range is complete - remaining buffers carry no needed data */
+             vec_add1 (*vec_drop_overlap, tmp_bi);
+             ASSERT (reass->first_bi != tmp_bi);
+             ++dropped_cnt;
+           }
+         if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
+           {
+             tmp_bi = tmp->next_buffer;
+             tmp = vlib_get_buffer (vm, tmp->next_buffer);
+           }
+         else
+           {
+             break;
+           }
+       }
+      sub_chain_bi =
+       vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
+       reass.next_range_bi;
+    }
+  while (~0 != sub_chain_bi);
+  /* NOTE(review): last_b stays NULL if no buffer was kept above, which
+   * would make this a NULL dereference - confirm callers rule that out */
+  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
+  /* NOTE(review): only kept buffers are subtracted from buffers_n here,
+   * although dropped ones were counted on insert too - confirm intended */
+  ASSERT (rm->buffers_n >= (buf_cnt - dropped_cnt));
+  rm->buffers_n -= buf_cnt - dropped_cnt;
+  ASSERT (total_length >= first_b->current_length);
+  total_length -= first_b->current_length;
+  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+  first_b->total_length_not_including_first_buffer = total_length;
+  /* rewrite header: not a fragment any more, new length and checksum */
+  ip4_header_t *ip = vlib_buffer_get_current (first_b);
+  ip->flags_and_fragment_offset = 0;
+  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
+  ip->checksum = ip4_header_checksum (ip);
+  vlib_buffer_chain_compress (vm, first_b, vec_drop_compress);
+  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
+    {
+      ip4_reass_add_trace (vm, node, rm, reass, reass->first_bi, FINALIZE, 0);
+#if 0
+      // following code does a hexdump of packet fragments to stdout ...
+      do
+       {
+         u32 bi = reass->first_bi;
+         u8 *s = NULL;
+         while (~0 != bi)
+           {
+             vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+             s = format (s, "%u: %U\n", bi, format_hexdump,
+                         vlib_buffer_get_current (b), b->current_length);
+             if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
+               {
+                 bi = b->next_buffer;
+               }
+             else
+               {
+                 break;
+               }
+           }
+         printf ("%.*s\n", vec_len (s), s);
+         fflush (stdout);
+         vec_free (s);
+       }
+      while (0);
+#endif
+    }
+  *bi0 = reass->first_bi;
+  *next0 = next_input;
+  *error0 = IP4_ERROR_NONE;
+  ip4_reass_free (rm, reass);
+  reass = NULL;
+}
+
+/*
+ * Count the buffers in the chain starting at b (number of buffers, not
+ * octets). Returns 0 for a NULL b.
+ */
+static u32
+ip4_reass_get_buffer_chain_length (vlib_main_t * vm, vlib_buffer_t * b)
+{
+  u32 n_buffers = 0;
+
+  for (; b; b = vlib_get_buffer (vm, b->next_buffer))
+    {
+      ++n_buffers;
+      if (!PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT))
+       {
+         break;
+       }
+    }
+  return n_buffers;
+}
+
+/*
+ * Link the range headed by buffer new_next_bi into the reassembly's list of
+ * ranges, right after prev_range_bi, or at the head of the list when
+ * prev_range_bi is ~0. Updates the collected data length and the count of
+ * buffers held by reassembly.
+ */
+static void
+ip4_reass_insert_range_in_chain (vlib_main_t * vm,
+                                ip4_reass_main_t * rm,
+                                ip4_reass_t * reass,
+                                u32 prev_range_bi, u32 new_next_bi)
+{
+  vlib_buffer_t *inserted_b = vlib_get_buffer (vm, new_next_bi);
+  vnet_buffer_opaque_t *inserted_meta = vnet_buffer (inserted_b);
+
+  if (~0 == prev_range_bi)
+    {
+      /* new head - whatever was first (if anything) now follows it */
+      if (~0 != reass->first_bi)
+       {
+         inserted_meta->ip.reass.next_range_bi = reass->first_bi;
+       }
+      reass->first_bi = new_next_bi;
+    }
+  else
+    {
+      vnet_buffer_opaque_t *before_meta =
+       vnet_buffer (vlib_get_buffer (vm, prev_range_bi));
+      inserted_meta->ip.reass.next_range_bi =
+       before_meta->ip.reass.next_range_bi;
+      before_meta->ip.reass.next_range_bi = new_next_bi;
+    }
+
+  reass->data_len += ip4_reass_buffer_get_data_len (inserted_b);
+  rm->buffers_n += ip4_reass_get_buffer_chain_length (vm, inserted_b);
+}
+
+/*
+ * Unlink the range headed by discard_bi from the reassembly's list of
+ * ranges (prev_range_bi is its predecessor, or ~0 if it is the head) and
+ * append every buffer of its chain to vec_drop_overlap. Traced buffers get
+ * a RANGE_DISCARD trace record.
+ */
+static void
+ip4_reass_remove_range_from_chain (vlib_main_t * vm,
+                                  vlib_node_runtime_t * node,
+                                  ip4_reass_main_t * rm,
+                                  u32 ** vec_drop_overlap,
+                                  ip4_reass_t * reass, u32 prev_range_bi,
+                                  u32 discard_bi)
+{
+  vlib_buffer_t *victim_b = vlib_get_buffer (vm, discard_bi);
+  vnet_buffer_opaque_t *victim_meta = vnet_buffer (victim_b);
+
+  if (~0 == prev_range_bi)
+    {
+      reass->first_bi = victim_meta->ip.reass.next_range_bi;
+    }
+  else
+    {
+      vnet_buffer_opaque_t *before_meta =
+       vnet_buffer (vlib_get_buffer (vm, prev_range_bi));
+      ASSERT (before_meta->ip.reass.next_range_bi == discard_bi);
+      before_meta->ip.reass.next_range_bi =
+       victim_meta->ip.reass.next_range_bi;
+    }
+  reass->data_len -= ip4_reass_buffer_get_data_len (victim_b);
+
+  /* hand over the whole chain, breaking the links as we go */
+  for (;;)
+    {
+      vec_add1 (*vec_drop_overlap, discard_bi);
+      if (PREDICT_FALSE (victim_b->flags & VLIB_BUFFER_IS_TRACED))
+       {
+         ip4_reass_add_trace (vm, node, rm, reass, discard_bi, RANGE_DISCARD,
+                              0);
+       }
+      if (!(victim_b->flags & VLIB_BUFFER_NEXT_PRESENT))
+       {
+         break;
+       }
+      victim_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
+      discard_bi = victim_b->next_buffer;
+      victim_b = vlib_get_buffer (vm, discard_bi);
+    }
+}
+
+void
+ip4_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
+                 ip4_reass_main_t * rm, ip4_reass_t * reass, u32 * bi0,
+                 u32 * next0, vlib_error_t * error0,
+                 u32 ** vec_drop_overlap, u32 ** vec_drop_compress,
+                 u32 next_input, u32 next_drop)
+{
+  int consumed = 0;
+  vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
+  ip4_header_t *fip = vlib_buffer_get_current (fb);
+  ASSERT (fb->current_length >= sizeof (*fip));
+  vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
+  u32 fragment_first = fvnb->ip.reass.fragment_first =
+    ip4_get_fragment_offset_bytes (fip);
+  u32 fragment_length =
+    clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
+  u32 fragment_last = fvnb->ip.reass.fragment_last =
+    fragment_first + fragment_length - 1;
+  int more_fragments = ip4_get_fragment_more (fip);
+  u32 candidate_range_bi = reass->first_bi;
+  u32 prev_range_bi = ~0;
+  fvnb->ip.reass.range_first = fragment_first;
+  fvnb->ip.reass.range_last = fragment_last;
+  fvnb->ip.reass.next_range_bi = ~0;
+  if (!more_fragments)
+    {
+      reass->last_packet_octet = fragment_last;
+    }
+  if (~0 == reass->first_bi)
+    {
+      // starting a new reassembly
+      ip4_reass_insert_range_in_chain (vm, rm, reass, prev_range_bi, *bi0);
+      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
+       {
+         ip4_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
+       }
+      *bi0 = ~0;
+      fvnb->ip.reass.estimated_mtu = clib_net_to_host_u16 (fip->length);
+      return;
+    }
+  fvnb->ip.reass.estimated_mtu = clib_min (clib_net_to_host_u16 (fip->length),
+                                          fvnb->ip.reass.estimated_mtu);
+  while (~0 != candidate_range_bi)
+    {
+      vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
+      vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
+      if (fragment_first > candidate_vnb->ip.reass.range_last)
+       {
+         // this fragments starts after candidate range
+         prev_range_bi = candidate_range_bi;
+         candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
+         if (candidate_vnb->ip.reass.range_last < fragment_last &&
+             ~0 == candidate_range_bi)
+           {
+             // special case - this fragment falls beyond all known ranges
+             ip4_reass_insert_range_in_chain (vm, rm, reass, prev_range_bi,
+                                              *bi0);
+             consumed = 1;
+             break;
+           }
+         continue;
+       }
+      if (fragment_last < candidate_vnb->ip.reass.range_first)
+       {
+         // this fragment ends before candidate range without any overlap
+         ip4_reass_insert_range_in_chain (vm, rm, reass, prev_range_bi,
+                                          *bi0);
+         consumed = 1;
+       }
+      else
+       {
+         if (fragment_first >= candidate_vnb->ip.reass.range_first &&
+             fragment_last <= candidate_vnb->ip.reass.range_last)
+           {
+             // this fragment is a (sub)part of existing range, ignore it
+             if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
+               {
+                 ip4_reass_add_trace (vm, node, rm, reass, *bi0,
+                                      RANGE_OVERLAP, 0);
+               }
+             break;
+           }
+         int discard_candidate = 0;
+         if (fragment_first < candidate_vnb->ip.reass.range_first)
+           {
+             u32 overlap =
+               fragment_last - candidate_vnb->ip.reass.range_first + 1;
+             if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
+               {
+                 candidate_vnb->ip.reass.range_first += overlap;
+                 ASSERT (reass->data_len >= overlap);
+                 reass->data_len -= overlap;
+                 if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
+                   {
+                     ip4_reass_add_trace (vm, node, rm, reass,
+                                          candidate_range_bi, RANGE_SHRINK,
+                                          overlap);
+                   }
+                 ip4_reass_insert_range_in_chain (vm, rm, reass,
+                                                  prev_range_bi, *bi0);
+                 consumed = 1;
+               }
+             else
+               {
+                 discard_candidate = 1;
+               }
+           }
+         else if (fragment_last > candidate_vnb->ip.reass.range_last)
+           {
+             u32 overlap =
+               candidate_vnb->ip.reass.range_last - fragment_first + 1;
+             if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
+               {
+                 fvnb->ip.reass.range_first += overlap;
+                 if (~0 != candidate_vnb->ip.reass.next_range_bi)
+                   {
+                     prev_range_bi = candidate_range_bi;
+                     candidate_range_bi =
+                       candidate_vnb->ip.reass.next_range_bi;
+                     continue;
+                   }
+                 else
+                   {
+                     // special case - last range discarded
+                     ip4_reass_insert_range_in_chain (vm, rm, reass,
+                                                      candidate_range_bi,
+                                                      *bi0);
+                     consumed = 1;
+                   }
+               }
+             else
+               {
+                 discard_candidate = 1;
+               }
+           }
+         else
+           {
+             discard_candidate = 1;
+           }
+         if (discard_candidate)
+           {
+             u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
+             // discard candidate range, probe next range
+             ip4_reass_remove_range_from_chain (vm, node, rm,
+                                                vec_drop_overlap, reass,
+                                                prev_range_bi,
+                                                candidate_range_bi);
+             if (~0 != next_range_bi)
+               {
+                 candidate_range_bi = next_range_bi;
+                 continue;
+               }
+             else
+               {
+                 // special case - last range discarded
+                 ip4_reass_insert_range_in_chain (vm, rm, reass,
+                                                  prev_range_bi, *bi0);
+                 consumed = 1;
+               }
+           }
+       }
+      break;
+    }
+  if (consumed)
+    {
+      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
+       {
+         ip4_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
+       }
+    }
+  if (~0 != reass->last_packet_octet &&
+      reass->data_len == reass->last_packet_octet + 1)
+    {
+      ip4_reass_finalize (vm, node, rm, reass, bi0, next0, error0, next_input,
+                         vec_drop_compress, vec_drop_overlap);
+    }
+  else
+    {
+      if (consumed)
+       {
+         *bi0 = ~0;
+       }
+      else
+       {
+         *next0 = next_drop;
+         *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
+       }
+    }
+}
+
+/**
+ * @brief ip4-reassembly node function
+ *
+ * For every buffer in the input frame a reassembly context is looked up (or
+ * created) using a key built from source address, destination address, RX
+ * sw_if_index, fragment id and protocol. The buffer is then handed to
+ * ip4_reass_update. If that call leaves a buffer index in bi0, the buffer is
+ * enqueued to the next node selected in next0; if bi0 is ~0 nothing is
+ * enqueued for this input buffer.
+ *
+ * Buffers collected on the three drop vectors (timeout, overlap, compression)
+ * are enqueued to IP4_REASSEMBLY_NEXT_DROP and rm->buffers_n is decremented
+ * for each of them.
+ *
+ * @param vm - vlib_main
+ * @param node - node runtime
+ * @param frame - frame holding the buffer indexes to process
+ *
+ * @return number of vectors in the input frame
+ */
+always_inline uword
+ip4_reassembly (vlib_main_t * vm, vlib_node_runtime_t * node,
+               vlib_frame_t * frame)
+{
+  u32 *from = vlib_frame_vector_args (frame);
+  u32 n_left_from, n_left_to_next, *to_next, next_index;
+  ip4_reass_main_t *rm = &ip4_reass_main;
+
+  n_left_from = frame->n_vectors;
+  next_index = node->cached_next_index;
+  // the vectors are static, so their contents persist between invocations
+  static u32 *vec_drop_timeout = NULL; // indexes of buffers which timed out
+  static u32 *vec_drop_overlap = NULL; // indexes of buffers which were discarded due to overlap
+  static u32 *vec_drop_compress = NULL;        // indexes of buffers discarded due to buffer compression
+  // loop until the input frame AND all three drop vectors are drained;
+  // vec_drop_compress must be part of the condition, otherwise buffers
+  // discarded while handling the last packets of the frame would stay
+  // queued until this node is scheduled again
+  while (n_left_from > 0 || vec_len (vec_drop_timeout) > 0 ||
+        vec_len (vec_drop_overlap) > 0 || vec_len (vec_drop_compress) > 0)
+    {
+      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+      // drop buffers of reassemblies which timed out
+      while (vec_len (vec_drop_timeout) > 0 && n_left_to_next > 0)
+       {
+         u32 bi = vec_pop (vec_drop_timeout);
+         vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+         b->error = node->errors[IP4_ERROR_REASS_TIMEOUT];
+         to_next[0] = bi;
+         to_next += 1;
+         n_left_to_next -= 1;
+         vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+                                          n_left_to_next, bi,
+                                          IP4_REASSEMBLY_NEXT_DROP);
+         IP4_REASS_DEBUG_BUFFER (bi, enqueue_drop_timeout);
+         ASSERT (rm->buffers_n > 0);
+         --rm->buffers_n;
+       }
+
+      // drop buffers which were discarded because of overlapping data
+      while (vec_len (vec_drop_overlap) > 0 && n_left_to_next > 0)
+       {
+         u32 bi = vec_pop (vec_drop_overlap);
+         vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+         b->error = node->errors[IP4_ERROR_REASS_DUPLICATE_FRAGMENT];
+         to_next[0] = bi;
+         to_next += 1;
+         n_left_to_next -= 1;
+         vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+                                          n_left_to_next, bi,
+                                          IP4_REASSEMBLY_NEXT_DROP);
+         IP4_REASS_DEBUG_BUFFER (bi, enqueue_drop_duplicate_fragment);
+         ASSERT (rm->buffers_n > 0);
+         --rm->buffers_n;
+       }
+
+      // buffers removed by chain compression are not an error condition,
+      // they are sent to the drop node with IP4_ERROR_NONE
+      while (vec_len (vec_drop_compress) > 0 && n_left_to_next > 0)
+       {
+         u32 bi = vec_pop (vec_drop_compress);
+         vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+         b->error = node->errors[IP4_ERROR_NONE];
+         to_next[0] = bi;
+         to_next += 1;
+         n_left_to_next -= 1;
+         vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+                                          n_left_to_next, bi,
+                                          IP4_REASSEMBLY_NEXT_DROP);
+         IP4_REASS_DEBUG_BUFFER (bi, enqueue_drop_compress);
+         ASSERT (rm->buffers_n > 0);
+         --rm->buffers_n;
+       }
+
+      while (n_left_from > 0 && n_left_to_next > 0)
+       {
+         u32 bi0;
+         vlib_buffer_t *b0;
+         u32 next0;
+
+         bi0 = from[0];
+         b0 = vlib_get_buffer (vm, bi0);
+
+         // build the lookup key from the IPv4 header and RX interface
+         ip4_header_t *ip0 = vlib_buffer_get_current (b0);
+         ip4_reass_key_t k;
+         k.src.as_u32 = ip0->src_address.as_u32;
+         k.dst.as_u32 = ip0->dst_address.as_u32;
+         k.xx_id = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+         k.frag_id = ip0->fragment_id;
+         k.proto = ip0->protocol;
+         k.unused = 0;
+         ip4_reass_t *reass =
+           ip4_reass_find_or_create (vm, rm, &k, &vec_drop_timeout);
+
+         u32 error0 = IP4_ERROR_NONE;
+         if (reass)
+           {
+             // may replace bi0 (~0 means nothing to enqueue) and sets
+             // next0/error0 for whatever is left in bi0
+             ip4_reass_update (vm, node, rm, reass, &bi0, &next0, &error0,
+                               &vec_drop_overlap, &vec_drop_compress,
+                               IP4_REASSEMBLY_NEXT_INPUT,
+                               IP4_REASSEMBLY_NEXT_DROP);
+           }
+         else
+           {
+             // no reassembly context available for this fragment
+             next0 = IP4_REASSEMBLY_NEXT_DROP;
+             error0 = IP4_ERROR_REASS_LIMIT_REACHED;
+           }
+
+         b0->error = node->errors[error0];
+
+         if (bi0 != ~0)
+           {
+             to_next[0] = bi0;
+             to_next += 1;
+             n_left_to_next -= 1;
+             vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+                                              n_left_to_next, bi0, next0);
+             IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
+           }
+
+         from += 1;
+         n_left_from -= 1;
+       }
+
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+
+  return frame->n_vectors;
+}
+
+/* error strings of the reassembly nodes - one string per entry of the
+ * foreach_ip4_error list, in list order */
+static char *ip4_reassembly_error_strings[] = {
+#define _(sym, string) string,
+  foreach_ip4_error
+#undef _
+};
+
+/* *INDENT-OFF* */
+/* registration of the "ip4-reassembly" graph node; buffers leave it either
+ * towards "ip4-input" or towards "ip4-drop" */
+VLIB_REGISTER_NODE (ip4_reass_node, static) = {
+    .function = ip4_reassembly,
+    .name = "ip4-reassembly",
+    .vector_size = sizeof (u32),
+    .format_trace = format_ip4_reass_trace,
+    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
+    .error_strings = ip4_reassembly_error_strings,
+    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
+    .next_nodes =
+        {
+                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
+                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
+        },
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (ip4_reass_node, ip4_reassembly)
+/**
+ * @brief compute the number of hash buckets for the current configuration
+ *
+ * @return max_reass_n / IP4_REASS_HT_LOAD_FACTOR rounded up to the nearest
+ * power of two, at most 2^31
+ */
+     static u32 ip4_reass_get_nbuckets ()
+{
+  ip4_reass_main_t *rm = &ip4_reass_main;
+  u32 nbuckets;
+  u8 i;
+
+  nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
+
+  // find the smallest power of two which is >= nbuckets; the shift is done
+  // on an unsigned constant so that the comparison with the u32 value is
+  // unsigned and 1 << 31 (reached when the loop runs to completion) does not
+  // overflow a signed int
+  for (i = 0; i < 31; i++)
+    if ((1u << i) >= nbuckets)
+      break;
+  nbuckets = 1u << i;
+
+  return nbuckets;
+}
+
+/* event types understood by the ip4-reassembly-expire-walk process */
+typedef enum
+{
+  /* signalled by ip4_reass_set after the configuration was stored */
+  IP4_EVENT_CONFIG_CHANGED = 1,
+} ip4_reass_event_t;
+
+/* context passed to ip4_rehash_cb while copying entries into a new hash */
+typedef struct
+{
+  /* set to 1 by ip4_rehash_cb when adding an entry to new_hash failed */
+  int failure;
+  /* hash table which receives the copied entries */
+  clib_bihash_24_8_t *new_hash;
+} ip4_rehash_cb_ctx;
+
+/**
+ * @brief hash walk callback - add the visited key/value pair to the new hash
+ *
+ * @param kv - key/value pair being visited
+ * @param _ctx - pointer to ip4_rehash_cb_ctx; its failure member is set to 1
+ * when the add operation returns non-zero
+ */
+void
+ip4_rehash_cb (clib_bihash_kv_24_8_t * kv, void *_ctx)
+{
+  ip4_rehash_cb_ctx *rehash = (ip4_rehash_cb_ctx *) _ctx;
+  const int rv = clib_bihash_add_del_24_8 (rehash->new_hash, kv, 1);
+
+  if (0 != rv)
+    rehash->failure = 1;
+}
+
+/**
+ * @brief store a new IPv4 reassembly configuration
+ *
+ * The values are written to ip4_reass_main, the expire-walk process is
+ * signalled with IP4_EVENT_CONFIG_CHANGED and, if the bucket count computed
+ * from the new configuration differs from the previous one, all entries are
+ * copied into a newly initialized hash which then replaces the old one.
+ *
+ * @param timeout_ms - reassembly timeout in milliseconds
+ * @param max_reassemblies - maximum number of reassemblies
+ * @param expire_walk_interval_ms - interval of the expire walk in milliseconds
+ *
+ * @return 0 on success, -1 if copying an entry to the new hash failed (the
+ * configuration values have already been stored at that point)
+ */
+vnet_api_error_t
+ip4_reass_set (u32 timeout_ms, u32 max_reassemblies,
+              u32 expire_walk_interval_ms)
+{
+  ip4_reass_main_t *rm = &ip4_reass_main;
+  // bucket count of the configuration which is about to be replaced
+  const u32 nbuckets_before = ip4_reass_get_nbuckets ();
+
+  rm->timeout_ms = timeout_ms;
+  rm->timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
+  rm->max_reass_n = max_reassemblies;
+  rm->expire_walk_interval_ms = expire_walk_interval_ms;
+  vlib_process_signal_event (rm->vlib_main, rm->ip4_reass_expire_node_idx,
+                            IP4_EVENT_CONFIG_CHANGED, 0);
+
+  const u32 nbuckets_after = ip4_reass_get_nbuckets ();
+  if (0 == rm->max_reass_n || nbuckets_after <= 1 ||
+      nbuckets_after == nbuckets_before)
+    {
+      // nothing to resize
+      return 0;
+    }
+
+  clib_bihash_24_8_t resized;
+  ip4_rehash_cb_ctx walk_ctx;
+  memset (&resized, 0, sizeof (resized));
+  walk_ctx.failure = 0;
+  walk_ctx.new_hash = &resized;
+  clib_bihash_init_24_8 (&resized, "ip4-reass", nbuckets_after,
+                        nbuckets_after * 1024);
+  clib_bihash_foreach_key_value_pair_24_8 (&rm->hash, ip4_rehash_cb,
+                                          &walk_ctx);
+  if (walk_ctx.failure)
+    {
+      // keep the old hash, throw away the partially filled new one
+      clib_bihash_free_24_8 (&resized);
+      return -1;
+    }
+
+  clib_bihash_free_24_8 (&rm->hash);
+  clib_memcpy (&rm->hash, &resized, sizeof (rm->hash));
+  return 0;
+}
+
+/**
+ * @brief read the current IPv4 reassembly configuration
+ *
+ * @param[out] timeout_ms - reassembly timeout in milliseconds
+ * @param[out] max_reassemblies - maximum number of reassemblies
+ * @param[out] expire_walk_interval_ms - expire walk interval in milliseconds
+ *
+ * @return always 0
+ */
+vnet_api_error_t
+ip4_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
+              u32 * expire_walk_interval_ms)
+{
+  ip4_reass_main_t *rm = &ip4_reass_main;
+
+  *timeout_ms = rm->timeout_ms;
+  *max_reassemblies = rm->max_reass_n;
+  *expire_walk_interval_ms = rm->expire_walk_interval_ms;
+
+  return 0;
+}
+
+/**
+ * @brief init function of the IPv4 reassembly code
+ *
+ * Stores the default configuration, pre-allocates the reassembly pool,
+ * initializes the hash table and looks up the node indexes of "ip4-drop" and
+ * "ip4-reassembly-expire-walk".
+ *
+ * The defaults are written directly instead of going through ip4_reass_set:
+ * that function signals rm->ip4_reass_expire_node_idx and may walk, free and
+ * replace rm->hash, neither of which is set up yet at this point. No
+ * IP4_EVENT_CONFIG_CHANGED event is signalled here; the expire-walk process
+ * reads expire_walk_interval_ms every time it starts waiting.
+ *
+ * @param vm - vlib_main
+ *
+ * @return always 0 (no error)
+ */
+clib_error_t *
+ip4_reass_init_function (vlib_main_t * vm)
+{
+  ip4_reass_main_t *rm = &ip4_reass_main;
+  clib_error_t *error = 0;
+  u32 nbuckets;
+  vlib_node_t *node;
+
+  rm->vlib_main = vm;
+  rm->vnet_main = vnet_get_main ();
+
+  // default configuration - must be in place before the pool and the hash
+  // are sized, both depend on max_reass_n
+  rm->timeout_ms = IP4_REASS_TIMEOUT_DEFAULT_MS;
+  rm->timeout = (f64) rm->timeout_ms / (f64) MSEC_PER_SEC;
+  rm->max_reass_n = IP4_REASS_MAX_REASSEMBLIES_DEAFULT;
+  rm->expire_walk_interval_ms = IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS;
+
+  rm->reass_n = 0;
+  pool_alloc (rm->pool, rm->max_reass_n);
+
+  nbuckets = ip4_reass_get_nbuckets ();
+  clib_bihash_init_24_8 (&rm->hash, "ip4-reass", nbuckets, nbuckets * 1024);
+
+  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
+  ASSERT (node);
+  rm->ip4_drop_idx = node->index;
+  node = vlib_get_node_by_name (vm, (u8 *) "ip4-reassembly-expire-walk");
+  ASSERT (node);
+  rm->ip4_reass_expire_node_idx = node->index;
+  return error;
+}
+
+VLIB_INIT_FUNCTION (ip4_reass_init_function);
+
+/**
+ * @brief process node which periodically expires stale reassemblies
+ *
+ * The process sleeps until an event arrives or expire_walk_interval_ms
+ * elapses. On every wake-up it collects the pool indexes of all reassembly
+ * contexts whose last_heard is older than the configured timeout, calls
+ * ip4_reass_on_timeout and ip4_reass_free for each of them and sends the
+ * buffers gathered in vec_drop_timeout to the ip4-drop node with the
+ * IP4_ERROR_REASS_TIMEOUT error.
+ *
+ * @param vm - vlib_main
+ * @param node - node runtime
+ * @param f - unused by the loop (shadowed by the local drop frame)
+ *
+ * @return 0 (not reached - the loop never terminates)
+ */
+static uword
+ip4_reass_walk_expired (vlib_main_t * vm,
+                       vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+  ip4_reass_main_t *rm = &ip4_reass_main;
+  uword event_type, *event_data = 0;
+
+  while (true)
+    {
+      /* the interval is re-read on every iteration, so a changed
+       * configuration takes effect with the next wait */
+      vlib_process_wait_for_event_or_clock (vm,
+                                           (f64) rm->expire_walk_interval_ms
+                                           / (f64) MSEC_PER_SEC);
+      event_type = vlib_process_get_events (vm, &event_data);
+
+      /* every wake-up reason leads to a walk, the switch only reports
+       * unexpected event types */
+      switch (event_type)
+       {
+       case ~0:                /* no events => timeout */
+         /* nothing to do here */
+         break;
+       case IP4_EVENT_CONFIG_CHANGED:
+         break;
+       default:
+         clib_warning ("BUG: event type 0x%wx", event_type);
+         break;
+       }
+      f64 now = vlib_time_now (vm);
+
+      ip4_reass_t *reass;
+      u32 *vec_drop_timeout = NULL;
+      int *pool_indexes_to_free = NULL;
+
+      /* first pass - only remember which contexts expired; presumably done
+       * in two passes so that the pool is not modified while iterating */
+      int index;
+      /* *INDENT-OFF* */
+      pool_foreach_index (index, rm->pool, ({
+                            reass = pool_elt_at_index (rm->pool, index);
+                            if (now > reass->last_heard + rm->timeout)
+                              {
+                                vec_add1 (pool_indexes_to_free, index);
+                              }
+                          }));
+      /* *INDENT-ON* */
+      /* second pass - handle the timeout of each expired context and
+       * release it */
+      int *i;
+      /* *INDENT-OFF* */
+      vec_foreach (i, pool_indexes_to_free)
+      {
+        ip4_reass_t *reass = pool_elt_at_index (rm->pool, i[0]);
+        ip4_reass_on_timeout (vm, rm, reass, &vec_drop_timeout);
+        ip4_reass_free (rm, reass);
+      }
+      /* *INDENT-ON* */
+
+      /* hand the collected buffers to ip4-drop, one frame at a time */
+      while (vec_len (vec_drop_timeout) > 0)
+       {
+         vlib_frame_t *f = vlib_get_frame_to_node (vm, rm->ip4_drop_idx);
+         u32 *to_next = vlib_frame_vector_args (f);
+         u32 n_left_to_next = VLIB_FRAME_SIZE - f->n_vectors;
+         u32 n_trace = 0;
+         while (vec_len (vec_drop_timeout) > 0 && n_left_to_next > 0)
+           {
+             u32 bi = vec_pop (vec_drop_timeout);
+             vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+             if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
+               {
+                 if (pool_is_free_index (vm->trace_main.trace_buffer_pool,
+                                         b->trace_index))
+                   {
+                     /* the trace is gone, don't trace this buffer anymore */
+                     b->flags &= ~VLIB_BUFFER_IS_TRACED;
+                   }
+                 else
+                   {
+                     ++n_trace;
+                   }
+               }
+             b->error = node->errors[IP4_ERROR_REASS_TIMEOUT];
+             to_next[0] = bi;
+             ++f->n_vectors;
+             to_next += 1;
+             n_left_to_next -= 1;
+             IP4_REASS_DEBUG_BUFFER (bi, enqueue_drop_timeout_walk);
+             ASSERT (rm->buffers_n > 0);
+             --rm->buffers_n;
+           }
+         /* mark the frame as traced if at least one buffer in it still has
+          * a valid trace */
+         if (PREDICT_FALSE (n_trace > 0))
+           {
+             f->flags |= VLIB_FRAME_TRACE;
+           }
+         vlib_put_frame_to_node (vm, rm->ip4_drop_idx, f);
+       }
+
+      vec_free (pool_indexes_to_free);
+      vec_free (vec_drop_timeout);
+      /* keep the event vector allocated, just empty it for the next round */
+      if (event_data)
+       {
+         _vec_len (event_data) = 0;
+       }
+    }
+
+  return 0;
+}
+
+static vlib_node_registration_t ip4_reass_expire_node;
+
+/* *INDENT-OFF* */
+/* registration of the "ip4-reassembly-expire-walk" process node; it shares
+ * the trace formatter and the error strings with the ip4-reassembly node */
+VLIB_REGISTER_NODE (ip4_reass_expire_node, static) = {
+    .function = ip4_reass_walk_expired,
+    .type = VLIB_NODE_TYPE_PROCESS,
+    .name = "ip4-reassembly-expire-walk",
+    .format_trace = format_ip4_reass_trace,
+    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
+    .error_strings = ip4_reassembly_error_strings,
+
+};
+/* *INDENT-ON* */
+
+/**
+ * @brief format function for a reassembly key
+ *
+ * Expects one ip4_reass_key_t pointer in args; frag_id is converted from
+ * network to host byte order before printing.
+ */
+static u8 *
+format_ip4_reass_key (u8 * s, va_list * args)
+{
+  ip4_reass_key_t *k = va_arg (*args, ip4_reass_key_t *);
+
+  return format (s, "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
+                k->xx_id, format_ip4_address, &k->src, format_ip4_address,
+                &k->dst, clib_net_to_host_u16 (k->frag_id), k->proto);
+}
+
+/**
+ * @brief format function for a reassembly context
+ *
+ * Expects a vlib_main_t pointer and an ip4_reass_t pointer in args. Prints a
+ * summary line of the context followed by one numbered line per buffer,
+ * starting at first_bi and following next_buffer links for as long as
+ * VLIB_BUFFER_NEXT_PRESENT is set.
+ */
+static u8 *
+format_ip4_reass (u8 * s, va_list * args)
+{
+  vlib_main_t *vm = va_arg (*args, vlib_main_t *);
+  ip4_reass_t *reass = va_arg (*args, ip4_reass_t *);
+
+  s = format (s, "ID: %u, key: %U\n  first_bi: %u, data_len: %u, "
+             "last_packet_octet: %u, trace_op_counter: %u\n",
+             reass->id, format_ip4_reass_key, &reass->key, reass->first_bi,
+             reass->data_len, reass->last_packet_octet,
+             reass->trace_op_counter);
+  u32 bi = reass->first_bi;
+  u32 counter = 0;
+  while (~0 != bi)
+    {
+      vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+      vnet_buffer_opaque_t *vnb = vnet_buffer (b);
+      // the _no_check accessors are used so that printing a context never
+      // trips an assertion
+      s = format (s, "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
+                 "fragment[%u, %u]\n",
+                 counter, vnb->ip.reass.range_first,
+                 vnb->ip.reass.range_last, bi,
+                 ip4_reass_buffer_get_data_offset_no_check (b),
+                 ip4_reass_buffer_get_data_len_no_check (b),
+                 vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
+      // number the printed lines - without this every line reads #000
+      ++counter;
+      if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
+       {
+         bi = b->next_buffer;
+       }
+      else
+       {
+         bi = ~0;
+       }
+    }
+  return s;
+}
+
+/**
+ * @brief CLI handler of "show ip4-reassembly [details]"
+ *
+ * Prints the number of current reassemblies, the configured maximum and the
+ * number of buffers in use. With the "details" keyword every reassembly
+ * context in the pool is printed as well.
+ *
+ * @return always 0 (no error)
+ */
+static clib_error_t *
+show_ip4_reass (vlib_main_t * vm, unformat_input_t * input,
+               CLIB_UNUSED (vlib_cli_command_t * lmd))
+{
+  ip4_reass_main_t *rm = &ip4_reass_main;
+
+  vlib_cli_output (vm, "---------------------");
+  vlib_cli_output (vm, "IP4 reassembly status");
+  vlib_cli_output (vm, "---------------------");
+  if (unformat (input, "details"))
+    {
+      ip4_reass_t *reass;
+      /* *INDENT-OFF* */
+      pool_foreach (reass, rm->pool, {
+        vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
+      });
+      /* *INDENT-ON* */
+    }
+  vlib_cli_output (vm, "---------------------");
+  // every value printed with %lu is cast explicitly so that the argument
+  // width matches the conversion
+  vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
+                  (long unsigned) rm->reass_n);
+  vlib_cli_output (vm,
+                  "Maximum configured concurrent IP4 reassemblies: %lu\n",
+                  (long unsigned) rm->max_reass_n);
+  vlib_cli_output (vm, "Buffers in use: %lu\n",
+                  (long unsigned) rm->buffers_n);
+  return 0;
+}
+
+/* *INDENT-OFF* */
+/* CLI registration: "show ip4-reassembly [details]" is handled by
+ * show_ip4_reass */
+VLIB_CLI_COMMAND (show_ip4_reassembly_cmd, static) = {
+    .path = "show ip4-reassembly",
+    .short_help = "show ip4-reassembly [details]",
+    .function = show_ip4_reass,
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/ip/ip4_reassembly.h b/src/vnet/ip/ip4_reassembly.h
new file mode 100644 (file)
index 0000000..a48c07e
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ * @brief IPv4 Reassembly.
+ *
+ * This file contains the source code for IPv4 reassembly.
+ */
+
+#ifndef __included_ip4_reassembly_h__
+#define __included_ip4_reassembly_h__
+
+#include <vnet/api_errno.h>
+#include <vnet/vnet.h>
+
+/**
+ * @brief set ip4 reassembly configuration
+ *
+ * @param timeout_ms - reassembly timeout in milliseconds
+ * @param max_reassemblies - maximum number of reassemblies
+ * @param expire_walk_interval_ms - interval of the expire walk in milliseconds
+ *
+ * @return 0 on success, non-zero on failure
+ */
+vnet_api_error_t ip4_reass_set (u32 timeout_ms, u32 max_reassemblies,
+                               u32 expire_walk_interval_ms);
+
+/**
+ * @brief get ip4 reassembly configuration
+ *
+ * @param[out] timeout_ms - reassembly timeout in milliseconds
+ * @param[out] max_reassemblies - maximum number of reassemblies
+ * @param[out] expire_walk_interval_ms - interval of the expire walk in
+ * milliseconds
+ *
+ * @return 0 on success
+ */
+vnet_api_error_t ip4_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
+                               u32 * expire_walk_interval_ms);
+
+#endif /* __included_ip4_reassembly_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
index 7bb4b77..19f2062 100644 (file)
                                                                         \
  /* Erros singalled by ip6-inacl */                                     \
   _ (INACL_TABLE_MISS, "input ACL table-miss drops")                    \
-  _ (INACL_SESSION_DENY, "input ACL session deny drops")
+  _ (INACL_SESSION_DENY, "input ACL session deny drops")                \
+                                                                        \
+  /* Errors signalled by ip6-reassembly */                              \
+  _ (REASS_MISSING_UPPER, "missing-upper layer drops")                  \
+  _ (REASS_DUPLICATE_FRAGMENT, "duplicate fragments")                   \
+  _ (REASS_OVERLAPPING_FRAGMENT, "overlapping fragments")               \
+  _ (REASS_LIMIT_REACHED, "drops due to concurrent reassemblies limit") \
+  _ (REASS_TIMEOUT, "fragments dropped due to reassembly timeout")
 
 typedef enum
 {
index c1c9ec0..5b06ca7 100644 (file)
@@ -1578,6 +1578,7 @@ ip6_local_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          next0 = lm->local_next_by_ip_protocol[ip0->protocol];
          next0 =
            error0 != IP6_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0;
+
          p0->error = error_node->errors[error0];
 
          if (head_of_feature_arc)
@@ -1619,6 +1620,7 @@ VLIB_REGISTER_NODE (ip6_local_node, static) =
     [IP_LOCAL_NEXT_PUNT] = "ip6-punt",
     [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip6-udp-lookup",
     [IP_LOCAL_NEXT_ICMP] = "ip6-icmp-input",
+    [IP_LOCAL_NEXT_REASSEMBLY] = "ip6-reassembly",
   },
 };
 /* *INDENT-ON* */
index 4c0d784..fe993ca 100644 (file)
@@ -41,6 +41,7 @@
 #define included_ip6_input_h
 
 #include <vnet/ip/ip.h>
+#include <vnet/ip/icmp6.h>
 
 extern char *ip6_error_strings[];
 
index c0c745e..6b72437 100644 (file)
@@ -519,6 +519,9 @@ typedef CLIB_PACKED (struct {
 #define ip6_frag_hdr_offset(hdr) \
   (clib_net_to_host_u16((hdr)->fragment_offset_and_more) >> 3)
 
+#define ip6_frag_hdr_offset_bytes(hdr) \
+  (8 * ip6_frag_hdr_offset(hdr))
+
 #define ip6_frag_hdr_more(hdr) \
   (clib_net_to_host_u16((hdr)->fragment_offset_and_more) & 0x1)
 
diff --git a/src/vnet/ip/ip6_reassembly.c b/src/vnet/ip/ip6_reassembly.c
new file mode 100644 (file)
index 0000000..d448032
--- /dev/null
@@ -0,0 +1,1347 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ * @brief IPv6 Reassembly.
+ *
+ * This file contains the source code for IPv6 reassembly.
+ */
+
+#include <vppinfra/vec.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vppinfra/bihash_48_8.h>
+#include <vnet/ip/ip6_reassembly.h>
+
+#define MSEC_PER_SEC 1000
+#define IP6_REASS_TIMEOUT_DEFAULT_MS 100
+#define IP6_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000        // 10 seconds default
+#define IP6_REASS_MAX_REASSEMBLIES_DEAFULT 1024
+#define IP6_REASS_HT_LOAD_FACTOR (0.75)
+
+static vlib_node_registration_t ip6_reass_node;
+
+/**
+ * hash table key of a reassembly context; every member is padded to 4 octets
+ * so that the whole key can be accessed as six 64-bit words
+ */
+typedef struct
+{
+  union
+  {
+    struct
+    {
+      ip6_address_t src;
+      ip6_address_t dst;
+      // stored as 4 octets to keep the key aligned
+      // NOTE(review): the original comment called this a 2 octet field -
+      // confirm the width of the value which is stored here
+      u32 xx_id;
+      // stored as 4 octets to keep the key aligned
+      // NOTE(review): the original comment called this a 2 octet field, the
+      // IPv6 fragment header identification is presumably 4 octets - confirm
+      u32 frag_id;
+      // align by making this 4 octets even though it's a 1 octet field
+      u32 proto;
+      u32 unused;
+    };
+    // the same key viewed as 64-bit words, used to fill the bihash key
+    u64 as_u64[6];
+  };
+} ip6_reass_key_t;
+
+/**
+ * @brief offset of the range start relative to the fragment start
+ * (range_first - fragment_first), computed without any sanity check
+ */
+always_inline u32
+ip6_reass_buffer_get_data_offset_no_check (vlib_buffer_t * b)
+{
+  vnet_buffer_opaque_t *meta = vnet_buffer (b);
+  u32 offset = meta->ip.reass.range_first - meta->ip.reass.fragment_first;
+  return offset;
+}
+
+/**
+ * @brief offset of the range start relative to the fragment start; asserts
+ * that the range does not start before the fragment
+ */
+always_inline u32
+ip6_reass_buffer_get_data_offset (vlib_buffer_t * b)
+{
+  vnet_buffer_opaque_t *meta = vnet_buffer (b);
+  ASSERT (meta->ip.reass.range_first >= meta->ip.reass.fragment_first);
+  u32 offset = ip6_reass_buffer_get_data_offset_no_check (b);
+  return offset;
+}
+
+/**
+ * @brief number of octets of this buffer's fragment which belong to its
+ * range, computed without any sanity check
+ *
+ * The data ends at the smaller of range_last and fragment_last and starts at
+ * fragment_first plus the data offset. The offset is taken from the
+ * _no_check accessor so that this function really performs no assertion and
+ * stays safe to call from trace and format code.
+ */
+always_inline u16
+ip6_reass_buffer_get_data_len_no_check (vlib_buffer_t * b)
+{
+  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
+  return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
+    (vnb->ip.reass.fragment_first +
+     ip6_reass_buffer_get_data_offset_no_check (b)) + 1;
+}
+
+/**
+ * @brief number of octets of this buffer's fragment which belong to its
+ * range; asserts that the range ends after the start of the fragment
+ */
+always_inline u16
+ip6_reass_buffer_get_data_len (vlib_buffer_t * b)
+{
+  vnet_buffer_opaque_t *meta = vnet_buffer (b);
+  ASSERT (meta->ip.reass.range_last > meta->ip.reass.fragment_first);
+  u16 len = ip6_reass_buffer_get_data_len_no_check (b);
+  return len;
+}
+
+/** state of a single reassembly (one element of the reassembly pool) */
+typedef struct
+{
+  // hash table key
+  ip6_reass_key_t key;
+  // time when first packet was received
+  f64 first_heard;
+  // time when last packet was received
+  f64 last_heard;
+  // internal id of this reassembly
+  u32 id;
+  // buffer index of first buffer in this reassembly context
+  u32 first_bi;
+  // last octet of packet, ~0 until fragment without more_fragments arrives
+  u32 last_packet_octet;
+  // length of data collected so far
+  u32 data_len;
+  // trace operation counter, incremented with every trace record added
+  u32 trace_op_counter;
+} ip6_reass_t;
+
+/** global state of the IPv6 reassembly code */
+typedef struct
+{
+  // IPv6 config
+  // reassembly timeout in milliseconds
+  u32 timeout_ms;
+  // reassembly timeout as f64 - presumably in seconds, TODO confirm
+  f64 timeout;
+  // interval of the expire walk in milliseconds
+  u32 expire_walk_interval_ms;
+  // maximum number of reassemblies
+  u32 max_reass_n;
+
+  // IPv6 runtime
+  // pool of reassembly contexts
+  ip6_reass_t *pool;
+  // hash table keyed by ip6_reass_key_t
+  clib_bihash_48_8_t hash;
+  // number of reassembly contexts currently in use
+  u32 reass_n;
+  // presumably the source of ip6_reass_t.id values - TODO confirm
+  u32 id_counter;
+  // presumably the number of buffers currently held - TODO confirm
+  u32 buffers_n;
+
+  // convenience
+  vlib_main_t *vlib_main;
+  vnet_main_t *vnet_main;
+
+  // node index of ip6-drop node
+  u32 ip6_drop_idx;
+  // node index used for ICMP errors - presumably ip6-icmp-error, TODO confirm
+  u32 ip6_icmp_error_idx;
+  // node index of the expire walk process node
+  u32 ip6_reass_expire_node_idx;
+
+} ip6_reass_main_t;
+
+ip6_reass_main_t ip6_reass_main;
+
+/** next node slots - presumably of the ip6 reassembly node, TODO confirm */
+typedef enum
+{
+  IP6_REASSEMBLY_NEXT_INPUT,
+  IP6_REASSEMBLY_NEXT_DROP,
+  IP6_REASSEMBLY_NEXT_ICMP_ERROR,
+  // number of next node slots, must stay last
+  IP6_REASSEMBLY_N_NEXT,
+} ip6_reass_next_t;
+
+/** operation recorded in a trace entry, see format_ip6_reass_trace */
+typedef enum
+{
+  // a new range was added
+  RANGE_NEW,
+  // fragment overlaps an existing range
+  RANGE_OVERLAP,
+  // icmp error - reassembly time exceeded
+  ICMP_ERROR_RT_EXCEEDED,
+  // icmp error - fragment length > 65535
+  ICMP_ERROR_FL_TOO_BIG,
+  // icmp error - fragment length is not a multiple of 8
+  ICMP_ERROR_FL_NOT_MULT_8,
+  // reassembly was finalized
+  FINALIZE,
+} ip6_reass_trace_operation_e;
+
+/** snapshot of one range, filled by ip6_reass_trace_details */
+typedef struct
+{
+  // first and last octet of the range
+  u16 range_first;
+  u16 range_last;
+  // buffer index of the buffer describing the range
+  u32 range_bi;
+  // offset of the range start relative to the fragment start
+  i32 data_offset;
+  // number of data octets in the range
+  u32 data_len;
+  // first buffer index of the reassembly at the time of tracing
+  u32 first_bi;
+} ip6_reass_range_trace_t;
+
+/** trace record, filled by ip6_reass_add_trace */
+typedef struct
+{
+  // operation which is being traced
+  ip6_reass_trace_operation_e action;
+  // index of the reassembly context in the pool
+  u32 pool_index;
+  // id of the reassembly context
+  u32 reass_id;
+  // details of the traced range
+  ip6_reass_range_trace_t trace_range;
+  // size difference passed in by the caller of ip6_reass_add_trace
+  u32 size_diff;
+  // value of the context's trace operation counter for this record
+  u32 op_id;
+  // first and last octet of the traced fragment
+  u32 fragment_first;
+  u32 fragment_last;
+  // data length of the reassembly at the time of tracing
+  u32 total_data_len;
+} ip6_reass_trace_t;
+
+/**
+ * @brief fill range details of a trace record from the buffer's metadata
+ *
+ * @param vm - vlib_main
+ * @param bi - index of the buffer describing the range
+ * @param[out] trace - range trace to fill (first_bi is not touched)
+ */
+static void
+ip6_reass_trace_details (vlib_main_t * vm, u32 bi,
+                        ip6_reass_range_trace_t * trace)
+{
+  vlib_buffer_t *range_b = vlib_get_buffer (vm, bi);
+  vnet_buffer_opaque_t *meta = vnet_buffer (range_b);
+
+  trace->range_bi = bi;
+  trace->range_first = meta->ip.reass.range_first;
+  trace->range_last = meta->ip.reass.range_last;
+  trace->data_offset = ip6_reass_buffer_get_data_offset_no_check (range_b);
+  trace->data_len = ip6_reass_buffer_get_data_len_no_check (range_b);
+}
+
+/**
+ * @brief format function for ip6_reass_range_trace_t
+ *
+ * Expects one ip6_reass_range_trace_t pointer in args.
+ */
+u8 *
+format_ip6_reass_range_trace (u8 * s, va_list * args)
+{
+  ip6_reass_range_trace_t *rt = va_arg (*args, ip6_reass_range_trace_t *);
+
+  return format (s, "range: [%u, %u], off %d, len %u, bi %u",
+                rt->range_first, rt->range_last, rt->data_offset,
+                rt->data_len, rt->range_bi);
+}
+
+/**
+ * @brief format function for ip6_reass_trace_t
+ *
+ * Expects vlib_main_t pointer, vlib_node_t pointer (both unused) and an
+ * ip6_reass_trace_t pointer in args. Prints a common header followed by one
+ * action specific line, indented to the column where the header details
+ * start.
+ */
+u8 *
+format_ip6_reass_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  ip6_reass_trace_t *rec = va_arg (*args, ip6_reass_trace_t *);
+
+  s = format (s, "reass id: %u, op id: %u ", rec->reass_id, rec->op_id);
+  // remember the column so that the second line lines up with the details
+  u32 pad = format_get_indent (s);
+  s = format (s, "first bi: %u, data len: %u, ip/fragment[%u, %u]",
+             rec->trace_range.first_bi, rec->total_data_len,
+             rec->fragment_first, rec->fragment_last);
+
+  // cases are listed in the order of ip6_reass_trace_operation_e
+  switch (rec->action)
+    {
+    case RANGE_NEW:
+      s = format (s, "\n%Unew %U", format_white_space, pad,
+                 format_ip6_reass_range_trace, &rec->trace_range);
+      break;
+    case RANGE_OVERLAP:
+      s = format (s, "\n%Uoverlap %U", format_white_space, pad,
+                 format_ip6_reass_range_trace, &rec->trace_range);
+      break;
+    case ICMP_ERROR_RT_EXCEEDED:
+      s = format (s, "\n%Uicmp-error - reassembly time exceeded",
+                 format_white_space, pad);
+      break;
+    case ICMP_ERROR_FL_TOO_BIG:
+      s = format (s, "\n%Uicmp-error - frag_len > 65535 %U",
+                 format_white_space, pad, format_ip6_reass_range_trace,
+                 &rec->trace_range);
+      break;
+    case ICMP_ERROR_FL_NOT_MULT_8:
+      s = format (s, "\n%Uicmp-error - frag_len mod 8 != 0 %U",
+                 format_white_space, pad, format_ip6_reass_range_trace,
+                 &rec->trace_range);
+      break;
+    case FINALIZE:
+      s = format (s, "\n%Ufinalize reassembly", format_white_space, pad);
+      break;
+    }
+  return s;
+}
+
+static void
+ip6_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
+                    ip6_reass_main_t * rm, ip6_reass_t * reass,
+                    u32 bi, ip6_reass_trace_operation_e action,
+                    u32 size_diff)
+{
+  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
+  ip6_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
+  t->pool_index = reass - rm->pool;
+  t->reass_id = reass->id;
+  t->action = action;
+  ip6_reass_trace_details (vm, bi, &t->trace_range);
+  t->size_diff = size_diff;
+  t->op_id = reass->trace_op_counter;
+  ++reass->trace_op_counter;
+  t->fragment_first = vnb->ip.reass.fragment_first;
+  t->fragment_last = vnb->ip.reass.fragment_last;
+  t->trace_range.first_bi = reass->first_bi;
+  t->total_data_len = reass->data_len;
+#if 0
+  static u8 *s = NULL;
+  s = format (s, "%U", format_ip6_reass_trace, NULL, NULL, t);
+  printf ("%.*s\n", vec_len (s), s);
+  fflush (stdout);
+  vec_reset_length (s);
+#endif
+}
+
+/**
+ * @brief remove a reassembly context from the hash table and return it to
+ * the pool
+ *
+ * Buffers still referenced by the context are not touched here.
+ *
+ * @param[in,out] rm - reassembly main (hash, pool and reass_n are updated)
+ * @param[in] reass - reassembly context to release
+ */
+static void
+ip6_reass_free (ip6_reass_main_t * rm, ip6_reass_t * reass)
+{
+  clib_bihash_kv_48_8_t lookup;
+  int word;
+
+  /* the 48 byte key is copied as six 64-bit words */
+  for (word = 0; word < 6; word++)
+    {
+      lookup.key[word] = reass->key.as_u64[word];
+    }
+  clib_bihash_add_del_48_8 (&rm->hash, &lookup, 0 /* delete */ );
+  pool_put (rm->pool, reass);
+  rm->reass_n -= 1;
+}
+
+/**
+ * @brief collect every buffer held by a reassembly into a drop vector
+ *
+ * Walks the list of ranges (linked via ip.reass.next_range_bi in the head
+ * buffer of each range) and, for each range, its buffer chain. Every buffer
+ * index is appended to the vector and the chain links are cut by clearing
+ * VLIB_BUFFER_NEXT_PRESENT.
+ *
+ * @param[in] vm - vlib_main
+ * @param[in] rm - reassembly main (unused here)
+ * @param[in] reass - reassembly whose buffers are collected
+ * @param[in,out] vec_drop_bi - vector receiving the buffer indexes
+ */
+static void
+ip6_reass_drop_all (vlib_main_t * vm, ip6_reass_main_t * rm,
+                   ip6_reass_t * reass, u32 ** vec_drop_bi)
+{
+  u32 head_bi = reass->first_bi;
+
+  while (~0 != head_bi)
+    {
+      vnet_buffer_opaque_t *head_vnb =
+       vnet_buffer (vlib_get_buffer (vm, head_bi));
+      u32 cur_bi = head_bi;
+
+      while (~0 != cur_bi)
+       {
+         vlib_buffer_t *cur = vlib_get_buffer (vm, cur_bi);
+         vec_add1 (*vec_drop_bi, cur_bi);
+         if (!(cur->flags & VLIB_BUFFER_NEXT_PRESENT))
+           {
+             break;
+           }
+         /* unlink, so that each buffer is dropped on its own */
+         cur->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
+         cur_bi = cur->next_buffer;
+       }
+      head_bi = head_vnb->ip.reass.next_range_bi;
+    }
+}
+
+/**
+ * @brief handle a reassembly whose timeout has elapsed
+ *
+ * If the first stored range starts at fragment offset zero, its head buffer
+ * is detached from the reassembly, set up as an ICMPv6 time exceeded
+ * (fragment reassembly time exceeded) error and returned via icmp_bi. All
+ * buffers still held by the reassembly are then appended to vec_timeout.
+ * The reassembly context itself is not freed here.
+ *
+ * @param[in] vm - vlib_main
+ * @param[in] node - node runtime, used for tracing
+ * @param[in] rm - reassembly main
+ * @param[in,out] reass - timed-out reassembly; first_bi may be advanced
+ * @param[out] icmp_bi - set to the buffer to steer towards the icmp error
+ * node; left untouched when no such buffer exists
+ * @param[in,out] vec_timeout - vector receiving the indexes of dropped buffers
+ */
+static void
+ip6_reass_on_timeout (vlib_main_t * vm, vlib_node_runtime_t * node,
+                     ip6_reass_main_t * rm, ip6_reass_t * reass,
+                     u32 * icmp_bi, u32 ** vec_timeout)
+{
+  if (~0 == reass->first_bi)
+    {
+      /* nothing stored in this reassembly */
+      return;
+    }
+  vlib_buffer_t *b = vlib_get_buffer (vm, reass->first_bi);
+  if (0 == vnet_buffer (b)->ip.reass.fragment_first)
+    {
+      *icmp_bi = reass->first_bi;
+      if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
+       {
+         ip6_reass_add_trace (vm, node, rm, reass, reass->first_bi,
+                              ICMP_ERROR_RT_EXCEEDED, 0);
+       }
+      // fragment with offset zero received - send icmp message back
+      if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
+       {
+         // separate first buffer from chain and steer it towards icmp node
+         // the rest of the chain becomes the head of this range, so it has
+         // to inherit the link to the next range - only head buffers carry
+         // a valid next_range_bi, and ip6_reass_drop_all follows that link
+         vlib_buffer_t *rest_b = vlib_get_buffer (vm, b->next_buffer);
+         vnet_buffer (rest_b)->ip.reass.next_range_bi =
+           vnet_buffer (b)->ip.reass.next_range_bi;
+         b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
+         reass->first_bi = b->next_buffer;
+       }
+      else
+       {
+         reass->first_bi = vnet_buffer (b)->ip.reass.next_range_bi;
+       }
+      icmp6_error_set_vnet_buffer (b, ICMP6_time_exceeded,
+                                  ICMP6_time_exceeded_fragment_reassembly_time_exceeded,
+                                  0);
+    }
+  ip6_reass_drop_all (vm, rm, reass, vec_timeout);
+}
+
+/**
+ * @brief look up the reassembly context for a key, creating one if needed
+ *
+ * An existing context which has not been heard from for longer than the
+ * configured timeout is expired (see ip6_reass_on_timeout), freed and
+ * replaced by a fresh one.
+ *
+ * @param[in] vm - vlib_main
+ * @param[in] node - node runtime, used for tracing
+ * @param[in,out] rm - reassembly main
+ * @param[in] k - reassembly key
+ * @param[out] icmp_bi - passed through to ip6_reass_on_timeout
+ * @param[in,out] vec_timeout - passed through to ip6_reass_on_timeout
+ *
+ * @return reassembly context, or NULL if the limit of concurrent
+ * reassemblies is reached or the hash table insertion fails
+ */
+static ip6_reass_t *
+ip6_reass_find_or_create (vlib_main_t * vm,
+                         vlib_node_runtime_t * node,
+                         ip6_reass_main_t * rm,
+                         ip6_reass_key_t * k, u32 * icmp_bi,
+                         u32 ** vec_timeout)
+{
+  const f64 now = vlib_time_now (rm->vlib_main);
+  clib_bihash_kv_48_8_t kv, found;
+  ip6_reass_t *reass;
+  int word;
+
+  for (word = 0; word < 6; word++)
+    {
+      kv.key[word] = k->as_u64[word];
+    }
+
+  if (0 == clib_bihash_search_48_8 (&rm->hash, &kv, &found))
+    {
+      reass = pool_elt_at_index (rm->pool, found.value);
+      if (now > reass->last_heard + rm->timeout)
+       {
+         /* stale context - expire it and fall through to create a new one */
+         ip6_reass_on_timeout (vm, node, rm, reass, icmp_bi, vec_timeout);
+         ip6_reass_free (rm, reass);
+       }
+      else
+       {
+         reass->last_heard = now;
+         return reass;
+       }
+    }
+
+  if (rm->reass_n >= rm->max_reass_n)
+    {
+      /* no room for another reassembly */
+      return NULL;
+    }
+
+  pool_get (rm->pool, reass);
+  memset (reass, 0, sizeof (*reass));
+  reass->id = rm->id_counter++;
+  reass->first_bi = ~0;
+  reass->last_packet_octet = ~0;
+  reass->data_len = 0;
+  reass->last_heard = now;
+  rm->reass_n += 1;
+
+  for (word = 0; word < 6; word++)
+    {
+      reass->key.as_u64[word] = k->as_u64[word];
+    }
+  kv.value = reass - rm->pool;
+
+  if (clib_bihash_add_del_48_8 (&rm->hash, &kv, 1 /* add */ ))
+    {
+      ip6_reass_free (rm, reass);
+      return NULL;
+    }
+
+  return reass;
+}
+
+void
+ip6_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
+                   ip6_reass_main_t * rm, ip6_reass_t * reass, u32 * bi0,
+                   u32 * next0, vlib_error_t * error0, u32 next_input,
+                   u32 ** vec_drop_compress)
+{
+  ASSERT (~0 != reass->first_bi);
+  *bi0 = reass->first_bi;
+  *next0 = next_input;
+  *error0 = IP6_ERROR_NONE;
+  ip6_frag_hdr_t *frag_hdr;
+  vlib_buffer_t *last_b = NULL;
+  u32 sub_chain_bi = reass->first_bi;
+  u32 total_length = 0;
+  u32 buf_cnt = 0;
+  u32 dropped_cnt = 0;
+  do
+    {
+      u32 tmp_bi = sub_chain_bi;
+      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
+      u32 data_len = ip6_reass_buffer_get_data_len (tmp);
+      u32 trim_front = vnet_buffer (tmp)->ip.reass.ip6_frag_hdr_offset +
+       sizeof (*frag_hdr) + ip6_reass_buffer_get_data_offset (tmp);
+      u32 trim_end =
+       vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
+      if (tmp_bi == reass->first_bi)
+       {
+         /* first buffer - keep ip6 header */
+         ASSERT (0 == ip6_reass_buffer_get_data_offset (tmp));
+         trim_front = 0;
+         trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
+           (vnet_buffer (tmp)->ip.reass.ip6_frag_hdr_offset +
+            sizeof (*frag_hdr));
+       }
+      u32 keep_data =
+       vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
+      while (1)
+       {
+         ++buf_cnt;
+         if (trim_front)
+           {
+             if (trim_front > tmp->current_length)
+               {
+                 /* drop whole buffer */
+                 vec_add1 (*vec_drop_compress, tmp_bi);
+                 ++dropped_cnt;
+                 trim_front -= tmp->current_length;
+                 ASSERT (tmp->flags & VLIB_BUFFER_NEXT_PRESENT);
+                 tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
+                 tmp_bi = tmp->next_buffer;
+                 tmp = vlib_get_buffer (vm, tmp_bi);
+                 continue;
+               }
+             else
+               {
+                 vlib_buffer_advance (tmp, trim_front);
+                 trim_front = 0;
+               }
+           }
+         if (keep_data)
+           {
+             if (last_b)
+               {
+                 last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
+                 last_b->next_buffer = tmp_bi;
+               }
+             last_b = tmp;
+             if (keep_data <= tmp->current_length)
+               {
+                 tmp->current_length = keep_data;
+                 keep_data = 0;
+               }
+             else
+               {
+                 keep_data -= tmp->current_length;
+                 ASSERT (tmp->flags & VLIB_BUFFER_NEXT_PRESENT);
+               }
+             total_length += tmp->current_length;
+           }
+         else
+           {
+             vec_add1 (*vec_drop_compress, tmp_bi);
+             ASSERT (reass->first_bi != tmp_bi);
+             ++dropped_cnt;
+           }
+         if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
+           {
+             tmp_bi = tmp->next_buffer;
+             tmp = vlib_get_buffer (vm, tmp->next_buffer);
+           }
+         else
+           {
+             break;
+           }
+       }
+      sub_chain_bi =
+       vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
+       reass.next_range_bi;
+    }
+  while (~0 != sub_chain_bi);
+  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
+  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
+  ASSERT (total_length >= first_b->current_length);
+  total_length -= first_b->current_length;
+  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+  first_b->total_length_not_including_first_buffer = total_length;
+  // drop fragment header
+  vnet_buffer_opaque_t *first_b_vnb = vnet_buffer (first_b);
+  ip6_header_t *ip = vlib_buffer_get_current (first_b);
+  u16 ip6_frag_hdr_offset = first_b_vnb->ip.reass.ip6_frag_hdr_offset;
+  ip6_ext_header_t *prev_hdr;
+  ip6_ext_header_find_t (ip, prev_hdr, frag_hdr,
+                        IP_PROTOCOL_IPV6_FRAGMENTATION);
+  if (prev_hdr)
+    {
+      prev_hdr->next_hdr = frag_hdr->next_hdr;
+    }
+  else
+    {
+      ip->protocol = frag_hdr->next_hdr;
+    }
+  ASSERT ((u8 *) frag_hdr - (u8 *) ip == ip6_frag_hdr_offset);
+  memmove (frag_hdr, (u8 *) frag_hdr + sizeof (*frag_hdr),
+          first_b->current_length - ip6_frag_hdr_offset -
+          sizeof (ip6_frag_hdr_t));
+  first_b->current_length -= sizeof (*frag_hdr);
+  ip->payload_length =
+    clib_host_to_net_u16 (total_length + first_b->current_length -
+                         sizeof (*ip));
+  ip6_reass_free (rm, reass);
+  vlib_buffer_chain_compress (vm, first_b, vec_drop_compress);
+  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
+    {
+      ip6_reass_add_trace (vm, node, rm, reass, reass->first_bi, FINALIZE, 0);
+#if 0
+      // following code does a hexdump of packet fragments to stdout ...
+      do
+       {
+         u32 bi = reass->first_bi;
+         u8 *s = NULL;
+         while (~0 != bi)
+           {
+             vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+             s = format (s, "%u: %U\n", bi, format_hexdump,
+                         vlib_buffer_get_current (b), b->current_length);
+             if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
+               {
+                 bi = b->next_buffer;
+               }
+             else
+               {
+                 break;
+               }
+           }
+         printf ("%.*s\n", vec_len (s), s);
+         fflush (stdout);
+         vec_free (s);
+       }
+      while (0);
+#endif
+    }
+  reass = NULL;
+}
+
+/**
+ * @brief count the buffers in a buffer chain
+ *
+ * @param[in] vm - vlib_main
+ * @param[in] b - first buffer of the chain, may be NULL
+ *
+ * @return number of buffers reachable from b via next_buffer, 0 for NULL
+ */
+static u32
+ip6_reass_get_buffer_chain_length (vlib_main_t * vm, vlib_buffer_t * b)
+{
+  u32 n_buffers = 0;
+
+  for (; b; b = vlib_get_buffer (vm, b->next_buffer))
+    {
+      n_buffers += 1;
+      if (!PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT))
+       {
+         /* last buffer of the chain */
+         break;
+       }
+    }
+  return n_buffers;
+}
+
+/**
+ * @brief link a new range into the list of ranges of a reassembly
+ *
+ * @param[in] vm - vlib_main
+ * @param[in,out] rm - reassembly main; buffers_n grows by the number of
+ * buffers in the inserted chain
+ * @param[in,out] reass - reassembly; data_len grows by the data length of
+ * the new range and first_bi is updated when the range becomes the head
+ * @param[in] prev_range_bi - head buffer of the range after which to insert,
+ * or ~0 to insert at the front
+ * @param[in] new_next_bi - head buffer of the range being inserted
+ */
+static void
+ip6_reass_insert_range_in_chain (vlib_main_t * vm,
+                                ip6_reass_main_t * rm,
+                                ip6_reass_t * reass,
+                                u32 prev_range_bi, u32 new_next_bi)
+{
+  vlib_buffer_t *range_b = vlib_get_buffer (vm, new_next_bi);
+  vnet_buffer_opaque_t *range_vnb = vnet_buffer (range_b);
+
+  if (~0 == prev_range_bi)
+    {
+      /* the new range becomes the head of the list */
+      if (~0 != reass->first_bi)
+       {
+         range_vnb->ip.reass.next_range_bi = reass->first_bi;
+       }
+      reass->first_bi = new_next_bi;
+    }
+  else
+    {
+      /* splice the new range in right behind the previous one */
+      vnet_buffer_opaque_t *before_vnb =
+       vnet_buffer (vlib_get_buffer (vm, prev_range_bi));
+      range_vnb->ip.reass.next_range_bi = before_vnb->ip.reass.next_range_bi;
+      before_vnb->ip.reass.next_range_bi = new_next_bi;
+    }
+
+  reass->data_len += ip6_reass_buffer_get_data_len (range_b);
+  rm->buffers_n += ip6_reass_get_buffer_chain_length (vm, range_b);
+}
+
+/**
+ * @brief process one received fragment against a reassembly context
+ *
+ * The fragment is validated (length multiple of 8 unless it is the last
+ * fragment, resulting packet not longer than 65535 octets), then inserted
+ * into the ordered list of ranges. Exact duplicates are dropped, overlapping
+ * fragments cause the whole reassembly to be dropped. Once the stored data
+ * covers everything up to the last octet, the packet is finalized.
+ *
+ * @param[in] vm - vlib_main
+ * @param[in] node - node runtime, used for tracing
+ * @param[in,out] rm - reassembly main
+ * @param[in,out] reass - reassembly context; freed on overlap or completion
+ * @param[in,out] bi0 - index of the fragment buffer; set to ~0 when the
+ * buffer was stored, or to the first buffer of the reassembled packet
+ * @param[out] next0 - next index for *bi0 when it is not ~0 (not written
+ * when the buffer is stored)
+ * @param[out] error0 - error code for *bi0 (not written on the icmp error
+ * paths or when the buffer is stored)
+ * @param[in] frag_hdr - fragment header inside the buffer *bi0
+ * @param[in,out] vec_drop_overlap - receives buffers dropped due to overlap
+ * @param[in,out] vec_drop_compress - receives buffers dropped by finalize
+ * @param[in] next_input - next index for a reassembled packet
+ * @param[in] next_drop - next index for dropped fragments
+ * @param[in] next_icmp_error - next index for fragments triggering icmp error
+ */
+void
+ip6_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
+                 ip6_reass_main_t * rm, ip6_reass_t * reass, u32 * bi0,
+                 u32 * next0, vlib_error_t * error0,
+                 ip6_frag_hdr_t * frag_hdr, u32 ** vec_drop_overlap,
+                 u32 ** vec_drop_compress, u32 next_input, u32 next_drop,
+                 u32 next_icmp_error)
+{
+  int consumed = 0;
+  vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
+  vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
+  fvnb->ip.reass.ip6_frag_hdr_offset =
+    (u8 *) frag_hdr - (u8 *) vlib_buffer_get_current (fb);
+  ip6_header_t *fip = vlib_buffer_get_current (fb);
+  ASSERT (fb->current_length > sizeof (*fip));
+  ASSERT (fvnb->ip.reass.ip6_frag_hdr_offset > 0 &&
+         fvnb->ip.reass.ip6_frag_hdr_offset < fb->current_length);
+  u32 fragment_first = fvnb->ip.reass.fragment_first =
+    ip6_frag_hdr_offset_bytes (frag_hdr);
+  // fragment payload is everything following the fragment header
+  u32 fragment_length =
+    vlib_buffer_length_in_chain (vm, fb) -
+    (fvnb->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr));
+  u32 fragment_last = fvnb->ip.reass.fragment_last =
+    fragment_first + fragment_length - 1;
+  int more_fragments = ip6_frag_hdr_more (frag_hdr);
+  u32 candidate_range_bi = reass->first_bi;
+  u32 prev_range_bi = ~0;
+  fvnb->ip.reass.range_first = fragment_first;
+  fvnb->ip.reass.range_last = fragment_last;
+  fvnb->ip.reass.next_range_bi = ~0;
+  if (more_fragments && 0 != fragment_length % 8)
+    {
+      // only the last fragment may have a length which is not multiple of 8
+      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
+       {
+         ip6_reass_add_trace (vm, node, rm, reass, *bi0,
+                              ICMP_ERROR_FL_NOT_MULT_8, 0);
+       }
+      *next0 = next_icmp_error;
+      icmp6_error_set_vnet_buffer (fb, ICMP6_parameter_problem,
+                                  ICMP6_parameter_problem_erroneous_header_field,
+                                  (u8 *) & fip->payload_length - (u8 *) fip);
+      return;
+    }
+  if (fragment_first + fragment_length > 65535)
+    {
+      // reassembled packet would exceed the maximum payload length
+      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
+       {
+         ip6_reass_add_trace (vm, node, rm, reass, *bi0,
+                              ICMP_ERROR_FL_TOO_BIG, 0);
+       }
+      *next0 = next_icmp_error;
+      ip6_header_t *ip0 = vlib_buffer_get_current (fb);
+      icmp6_error_set_vnet_buffer (fb, ICMP6_parameter_problem,
+                                  ICMP6_parameter_problem_erroneous_header_field,
+                                  (u8 *) & frag_hdr->fragment_offset_and_more
+                                  - (u8 *) ip0);
+      return;
+    }
+  if (!more_fragments)
+    {
+      reass->last_packet_octet = fragment_last;
+    }
+  if (~0 == reass->first_bi)
+    {
+      // starting a new reassembly
+      ip6_reass_insert_range_in_chain (vm, rm, reass, prev_range_bi, *bi0);
+      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
+       {
+         ip6_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
+       }
+      *bi0 = ~0;
+      return;
+    }
+  fvnb->ip.reass.estimated_mtu =
+    clib_min (clib_net_to_host_u16 (fip->payload_length),
+             fvnb->ip.reass.estimated_mtu);
+  while (~0 != candidate_range_bi)
+    {
+      vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
+      vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
+      if (fragment_first > candidate_vnb->ip.reass.range_last)
+       {
+         // this fragments starts after candidate range
+         prev_range_bi = candidate_range_bi;
+         candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
+         if (candidate_vnb->ip.reass.range_last < fragment_last &&
+             ~0 == candidate_range_bi)
+           {
+             // special case - this fragment falls beyond all known ranges
+             ip6_reass_insert_range_in_chain (vm, rm, reass, prev_range_bi,
+                                              *bi0);
+             consumed = 1;
+             break;
+           }
+         continue;
+       }
+      if (fragment_last < candidate_vnb->ip.reass.range_first)
+       {
+         // this fragment ends before candidate range without any overlap
+         ip6_reass_insert_range_in_chain (vm, rm, reass, prev_range_bi,
+                                          *bi0);
+         consumed = 1;
+       }
+      else if (fragment_first == candidate_vnb->ip.reass.range_first &&
+              fragment_last == candidate_vnb->ip.reass.range_last)
+       {
+         // duplicate fragment - ignore
+       }
+      else
+       {
+         // overlapping fragment - not allowed by RFC 8200
+         // trace first, while the reassembly context is still valid
+         if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
+           {
+             ip6_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_OVERLAP,
+                                  0);
+           }
+         ip6_reass_drop_all (vm, rm, reass, vec_drop_overlap);
+         ip6_reass_free (rm, reass);
+         *next0 = next_drop;
+         *error0 = IP6_ERROR_REASS_OVERLAPPING_FRAGMENT;
+         // the context is gone - it must not be examined for completion
+         // and the overlap error must not be replaced by the duplicate one
+         return;
+       }
+      break;
+    }
+  if (consumed)
+    {
+      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
+       {
+         ip6_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
+       }
+    }
+  if (~0 != reass->last_packet_octet &&
+      reass->data_len == reass->last_packet_octet + 1)
+    {
+      // last fragment seen and no holes left - packet is complete
+      ip6_reass_finalize (vm, node, rm, reass, bi0, next0, error0, next_input,
+                         vec_drop_compress);
+    }
+  else
+    {
+      if (consumed)
+       {
+         // buffer is now owned by the reassembly
+         *bi0 = ~0;
+       }
+      else
+       {
+         *next0 = next_drop;
+         *error0 = IP6_ERROR_REASS_DUPLICATE_FRAGMENT;
+       }
+    }
+}
+
+/**
+ * @brief ip6-reassembly node function
+ *
+ * For every received fragment the matching reassembly context is found or
+ * created and updated. Buffers which were dropped as a side effect (timed
+ * out reassemblies, overlapping fragments, buffers emptied while compressing
+ * a finished packet) are collected in static vectors and sent to the drop
+ * next node at the start of each pass through the outer loop.
+ *
+ * @param[in] vm - vlib_main
+ * @param[in] node - node runtime
+ * @param[in] frame - frame of buffer indexes to process
+ *
+ * @return number of vectors in the input frame
+ */
+always_inline uword
+ip6_reassembly (vlib_main_t * vm, vlib_node_runtime_t * node,
+               vlib_frame_t * frame)
+{
+  u32 *from = vlib_frame_vector_args (frame);
+  u32 n_left_from, n_left_to_next, *to_next, next_index;
+  ip6_reass_main_t *rm = &ip6_reass_main;
+
+  n_left_from = frame->n_vectors;
+  next_index = node->cached_next_index;
+  static u32 *vec_timeout = NULL;      // indexes of buffers which timed out
+  static u32 *vec_drop_overlap = NULL; // indexes of buffers dropped due to overlap
+  static u32 *vec_drop_compress = NULL;        // indexes of buffers dropped due to buffer compression
+  // keep looping until every pending drop vector is drained, otherwise
+  // buffers would linger in the static vectors until the next packet arrives
+  while (n_left_from > 0 || vec_len (vec_timeout) > 0 ||
+        vec_len (vec_drop_overlap) > 0 || vec_len (vec_drop_compress) > 0)
+    {
+      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+      while (vec_len (vec_timeout) > 0 && n_left_to_next > 0)
+       {
+         u32 bi = vec_pop (vec_timeout);
+         vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+         b->error = node->errors[IP6_ERROR_REASS_TIMEOUT];
+         to_next[0] = bi;
+         to_next += 1;
+         n_left_to_next -= 1;
+         vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+                                          n_left_to_next, bi,
+                                          IP6_REASSEMBLY_NEXT_DROP);
+         ASSERT (rm->buffers_n > 0);
+         --rm->buffers_n;
+       }
+
+      while (vec_len (vec_drop_overlap) > 0 && n_left_to_next > 0)
+       {
+         u32 bi = vec_pop (vec_drop_overlap);
+         vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+         b->error = node->errors[IP6_ERROR_REASS_OVERLAPPING_FRAGMENT];
+         to_next[0] = bi;
+         to_next += 1;
+         n_left_to_next -= 1;
+         vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+                                          n_left_to_next, bi,
+                                          IP6_REASSEMBLY_NEXT_DROP);
+         ASSERT (rm->buffers_n > 0);
+         --rm->buffers_n;
+       }
+
+      while (vec_len (vec_drop_compress) > 0 && n_left_to_next > 0)
+       {
+         u32 bi = vec_pop (vec_drop_compress);
+         vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+         b->error = node->errors[IP6_ERROR_NONE];
+         to_next[0] = bi;
+         to_next += 1;
+         n_left_to_next -= 1;
+         vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+                                          n_left_to_next, bi,
+                                          IP6_REASSEMBLY_NEXT_DROP);
+         ASSERT (rm->buffers_n > 0);
+         --rm->buffers_n;
+       }
+
+      while (n_left_from > 0 && n_left_to_next > 0)
+       {
+         u32 bi0;
+         vlib_buffer_t *b0;
+         u32 next0;            //, error0;
+
+         bi0 = from[0];
+         b0 = vlib_get_buffer (vm, bi0);
+
+         ip6_header_t *ip0 = vlib_buffer_get_current (b0);
+         ip6_frag_hdr_t *frag_hdr;
+         ip6_ext_header_t *prev_hdr;
+         ip6_ext_header_find_t (ip0, prev_hdr, frag_hdr,
+                                IP_PROTOCOL_IPV6_FRAGMENTATION);
+         if (0 == ip6_frag_hdr_offset (frag_hdr))
+           {
+             // first fragment - verify upper-layer is present
+             ip6_ext_header_t *tmp = (ip6_ext_header_t *) frag_hdr;
+             while (ip6_ext_hdr (tmp->next_hdr))
+               {
+                 tmp = ip6_ext_next_header (tmp);
+               }
+             if (IP_PROTOCOL_IP6_NONXT == tmp->next_hdr)
+               {
+                 icmp6_error_set_vnet_buffer (b0, ICMP6_parameter_problem,
+                                              ICMP6_parameter_problem_first_fragment_has_incomplete_header_chain,
+                                              0);
+                 b0->error = node->errors[IP6_ERROR_REASS_MISSING_UPPER];
+
+                 to_next[0] = bi0;
+                 to_next += 1;
+                 n_left_to_next -= 1;
+                 vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+                                                  to_next, n_left_to_next,
+                                                  bi0,
+                                                  IP6_REASSEMBLY_NEXT_ICMP_ERROR);
+                 goto next;
+               }
+           }
+         vnet_buffer (b0)->ip.reass.ip6_frag_hdr_offset =
+           (u8 *) frag_hdr - (u8 *) ip0;
+
+         ip6_reass_key_t k;
+         k.src.as_u64[0] = ip0->src_address.as_u64[0];
+         k.src.as_u64[1] = ip0->src_address.as_u64[1];
+         k.dst.as_u64[0] = ip0->dst_address.as_u64[0];
+         k.dst.as_u64[1] = ip0->dst_address.as_u64[1];
+         k.xx_id = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+         k.frag_id = frag_hdr->identification;
+         k.proto = ip0->protocol;
+         k.unused = 0;
+         u32 icmp_bi = ~0;
+         ip6_reass_t *reass =
+           ip6_reass_find_or_create (vm, node, rm, &k, &icmp_bi,
+                                     &vec_timeout);
+
+         u32 error0 = IP6_ERROR_NONE;
+         if (reass)
+           {
+             ip6_reass_update (vm, node, rm, reass, &bi0, &next0, &error0,
+                               frag_hdr, &vec_drop_overlap,
+                               &vec_drop_compress, IP6_REASSEMBLY_NEXT_INPUT,
+                               IP6_REASSEMBLY_NEXT_DROP,
+                               IP6_REASSEMBLY_NEXT_ICMP_ERROR);
+           }
+         else
+           {
+             next0 = IP6_REASSEMBLY_NEXT_DROP;
+             error0 = IP6_ERROR_REASS_LIMIT_REACHED;
+           }
+
+         b0->error = node->errors[error0];
+
+         // ~0 means the buffer was stored in the reassembly context
+         if (~0 != bi0)
+           {
+             to_next[0] = bi0;
+             to_next += 1;
+             n_left_to_next -= 1;
+             vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+                                              n_left_to_next, bi0, next0);
+           }
+
+         if (~0 != icmp_bi)
+           {
+             // a second buffer is enqueued for the same input packet - the
+             // loop condition only guaranteed room for one, so make sure
+             // the frame is not written past its end
+             if (PREDICT_FALSE (0 == n_left_to_next))
+               {
+                 vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+                 vlib_get_next_frame (vm, node, next_index, to_next,
+                                      n_left_to_next);
+               }
+             next0 = IP6_REASSEMBLY_NEXT_ICMP_ERROR;
+             to_next[0] = icmp_bi;
+             to_next += 1;
+             n_left_to_next -= 1;
+             vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+                                              n_left_to_next, icmp_bi,
+                                              next0);
+           }
+       next:
+         from += 1;
+         n_left_from -= 1;
+       }
+
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+
+  return frame->n_vectors;
+}
+
+/* error strings of the reassembly nodes - one entry for each (sym, string)
+ * pair listed by foreach_ip6_error, in the order in which they are listed */
+static char *ip6_reassembly_error_strings[] = {
+#define _(sym, string) string,
+  foreach_ip6_error
+#undef _
+};
+
+/* registration of the "ip6-reassembly" node: buffers leave it towards
+ * ip6-input, ip6-drop or ip6-icmp-error */
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (ip6_reass_node, static) = {
+    .function = ip6_reassembly,
+    .name = "ip6-reassembly",
+    .vector_size = sizeof (u32),
+    .format_trace = format_ip6_reass_trace,
+    .n_errors = ARRAY_LEN (ip6_reassembly_error_strings),
+    .error_strings = ip6_reassembly_error_strings,
+    .n_next_nodes = IP6_REASSEMBLY_N_NEXT,
+    .next_nodes =
+        {
+                [IP6_REASSEMBLY_NEXT_INPUT] = "ip6-input",
+                [IP6_REASSEMBLY_NEXT_DROP] = "ip6-drop",
+                [IP6_REASSEMBLY_NEXT_ICMP_ERROR] = "ip6-icmp-error",
+        },
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (ip6_reass_node, ip6_reassembly)
+/**
+ * @brief compute the number of hash buckets for the current configuration
+ *
+ * @return smallest power of two which is not smaller than
+ * max_reass_n / IP6_REASS_HT_LOAD_FACTOR, capped at 2^31
+ */
+     static u32 ip6_reass_get_nbuckets (void)
+{
+  ip6_reass_main_t *rm = &ip6_reass_main;
+  u32 nbuckets;
+  u8 i;
+
+  nbuckets = (u32) (rm->max_reass_n / IP6_REASS_HT_LOAD_FACTOR);
+
+  /* shift an unsigned one - when no power of two up to 2^30 is big enough
+   * the loop ends with i == 31, and shifting a signed int by 31 overflows */
+  for (i = 0; i < 31; i++)
+    if (((u32) 1 << i) >= nbuckets)
+      break;
+  nbuckets = (u32) 1 << i;
+
+  return nbuckets;
+}
+
+/* events signalled to the "ip6-reassembly-expire-walk" process */
+typedef enum
+{
+  /* sent by ip6_reass_set after the settings were updated */
+  IP6_EVENT_CONFIG_CHANGED = 1,
+} ip6_reass_event_t;
+
+/* context handed to ip6_rehash_cb while copying entries into a new hash */
+typedef struct
+{
+  /* set to 1 by the callback when adding an entry to new_hash fails */
+  int failure;
+  /* hash table receiving the copied entries */
+  clib_bihash_48_8_t *new_hash;
+} ip6_rehash_cb_ctx;
+
+/**
+ * @brief bihash walk callback - copy one key/value pair into the new hash
+ *
+ * @param[in] kv - key/value pair being visited
+ * @param[in,out] _ctx - ip6_rehash_cb_ctx; failure is set when the pair
+ * could not be added to the new hash
+ */
+void
+ip6_rehash_cb (clib_bihash_kv_48_8_t * kv, void *_ctx)
+{
+  ip6_rehash_cb_ctx *rehash = _ctx;
+  int rv = clib_bihash_add_del_48_8 (rehash->new_hash, kv, 1 /* add */ );
+
+  if (0 != rv)
+    {
+      rehash->failure = 1;
+    }
+}
+
+/**
+ * @brief update the reassembly settings
+ *
+ * The new values are stored, the expire walk process is signalled and, if
+ * the number of hash buckets derived from the new limit differs from the
+ * previous one, the hash table is rebuilt with the new size.
+ *
+ * @param[in] timeout_ms - reassembly timeout in milliseconds
+ * @param[in] max_reassemblies - maximum number of concurrent reassemblies
+ * @param[in] expire_walk_interval_ms - interval of the expire walk
+ *
+ * @return 0 on success, -1 if the entries could not be moved into the
+ * resized hash table (the new settings stay stored in that case)
+ */
+vnet_api_error_t
+ip6_reass_set (u32 timeout_ms, u32 max_reassemblies,
+              u32 expire_walk_interval_ms)
+{
+  ip6_reass_main_t *rm = &ip6_reass_main;
+  const u32 nbuckets_before = ip6_reass_get_nbuckets ();
+
+  rm->timeout_ms = timeout_ms;
+  rm->timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
+  rm->max_reass_n = max_reassemblies;
+  rm->expire_walk_interval_ms = expire_walk_interval_ms;
+  vlib_process_signal_event (rm->vlib_main, rm->ip6_reass_expire_node_idx,
+                            IP6_EVENT_CONFIG_CHANGED, 0);
+
+  const u32 nbuckets_after = ip6_reass_get_nbuckets ();
+  if (0 == rm->max_reass_n || nbuckets_after <= 1 ||
+      nbuckets_after == nbuckets_before)
+    {
+      /* hash table keeps its size */
+      return 0;
+    }
+
+  clib_bihash_48_8_t rebuilt;
+  ip6_rehash_cb_ctx rehash;
+  memset (&rebuilt, 0, sizeof (rebuilt));
+  rehash.failure = 0;
+  rehash.new_hash = &rebuilt;
+  clib_bihash_init_48_8 (&rebuilt, "ip6-reass", nbuckets_after,
+                        nbuckets_after * 1024);
+  clib_bihash_foreach_key_value_pair_48_8 (&rm->hash, ip6_rehash_cb,
+                                          &rehash);
+  if (rehash.failure)
+    {
+      /* keep using the old table */
+      clib_bihash_free_48_8 (&rebuilt);
+      return -1;
+    }
+
+  clib_bihash_free_48_8 (&rm->hash);
+  clib_memcpy (&rm->hash, &rebuilt, sizeof (rm->hash));
+  return 0;
+}
+
+/**
+ * @brief read back the current reassembly settings
+ *
+ * @param[out] timeout_ms - reassembly timeout in milliseconds
+ * @param[out] max_reassemblies - maximum number of concurrent reassemblies
+ * @param[out] expire_walk_interval_ms - interval of the expire walk
+ *
+ * @return always 0
+ */
+vnet_api_error_t
+ip6_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
+              u32 * expire_walk_interval_ms)
+{
+  const ip6_reass_main_t *rm = &ip6_reass_main;
+
+  *timeout_ms = rm->timeout_ms;
+  *max_reassemblies = rm->max_reass_n;
+  *expire_walk_interval_ms = rm->expire_walk_interval_ms;
+  return 0;
+}
+
+/**
+ * @brief init function of the ip6 reassembly feature
+ *
+ * Stores the default settings, allocates the pool and the hash table,
+ * resolves the indexes of the nodes used by the expire walk and registers
+ * the reassembly node as handler of the ipv6 fragmentation protocol.
+ *
+ * @param[in] vm - vlib_main
+ *
+ * @return NULL on success, otherwise the error returned by ip_main_init
+ */
+clib_error_t *
+ip6_reass_init_function (vlib_main_t * vm)
+{
+  ip6_reass_main_t *rm = &ip6_reass_main;
+  clib_error_t *error = 0;
+  u32 nbuckets;
+
+  rm->vlib_main = vm;
+  rm->vnet_main = vnet_get_main ();
+
+  /* the defaults are stored directly instead of calling ip6_reass_set -
+   * at this point the hash table is not initialised yet, so the rehash
+   * done by ip6_reass_set would operate on an uninitialised table, and
+   * ip6_reass_expire_node_idx is not resolved yet, so the event would be
+   * signalled to an unrelated node index */
+  rm->timeout_ms = IP6_REASS_TIMEOUT_DEFAULT_MS;
+  rm->timeout = (f64) rm->timeout_ms / (f64) MSEC_PER_SEC;
+  rm->max_reass_n = IP6_REASS_MAX_REASSEMBLIES_DEAFULT;
+  rm->expire_walk_interval_ms = IP6_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS;
+
+  /* the pool is sized only after max_reass_n holds its default value */
+  rm->reass_n = 0;
+  pool_alloc (rm->pool, rm->max_reass_n);
+
+  nbuckets = ip6_reass_get_nbuckets ();
+  clib_bihash_init_48_8 (&rm->hash, "ip6-reass", nbuckets, nbuckets * 1024);
+
+  vlib_node_t *node = vlib_get_node_by_name (vm, (u8 *) "ip6-drop");
+  ASSERT (node);
+  rm->ip6_drop_idx = node->index;
+  node = vlib_get_node_by_name (vm, (u8 *) "ip6-reassembly-expire-walk");
+  ASSERT (node);
+  rm->ip6_reass_expire_node_idx = node->index;
+  node = vlib_get_node_by_name (vm, (u8 *) "ip6-icmp-error");
+  ASSERT (node);
+  rm->ip6_icmp_error_idx = node->index;
+
+  if ((error = vlib_call_init_function (vm, ip_main_init)))
+    return error;
+  ip6_register_protocol (IP_PROTOCOL_IPV6_FRAGMENTATION,
+                        ip6_reass_node.index);
+  return error;
+}
+
+VLIB_INIT_FUNCTION (ip6_reass_init_function);
+
+/**
+ * @brief send timed-out buffers to a node, one frame at a time
+ *
+ * Every buffer gets the reassembly timeout error assigned. Buffers whose
+ * trace is already gone stop being traced, frames carrying at least one
+ * traced buffer are flagged with VLIB_FRAME_TRACE.
+ *
+ * @param[in] vm - vlib_main
+ * @param[in] node - node runtime providing the error counters
+ * @param[in,out] rm - reassembly main; buffers_n shrinks with every buffer
+ * @param[in] to_node_index - index of the node receiving the buffers
+ * @param[in,out] bis - vector of buffer indexes, empty on return
+ */
+static void
+ip6_reass_expire_send_to_node (vlib_main_t * vm, vlib_node_runtime_t * node,
+                              ip6_reass_main_t * rm, u32 to_node_index,
+                              u32 * bis)
+{
+  while (vec_len (bis) > 0)
+    {
+      vlib_frame_t *frame = vlib_get_frame_to_node (vm, to_node_index);
+      u32 *to_next = vlib_frame_vector_args (frame);
+      u32 n_left_to_next = VLIB_FRAME_SIZE - frame->n_vectors;
+      u32 n_traced = 0;
+
+      while (vec_len (bis) > 0 && n_left_to_next > 0)
+       {
+         u32 bi = vec_pop (bis);
+         vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+
+         if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
+           {
+             if (pool_is_free_index (vm->trace_main.trace_buffer_pool,
+                                     b->trace_index))
+               {
+                 /* the trace is gone, don't trace this buffer anymore */
+                 b->flags &= ~VLIB_BUFFER_IS_TRACED;
+               }
+             else
+               {
+                 n_traced += 1;
+               }
+           }
+         b->error = node->errors[IP6_ERROR_REASS_TIMEOUT];
+         *to_next++ = bi;
+         frame->n_vectors += 1;
+         n_left_to_next -= 1;
+         ASSERT (rm->buffers_n > 0);
+         rm->buffers_n -= 1;
+       }
+      if (PREDICT_FALSE (n_traced > 0))
+       {
+         frame->flags |= VLIB_FRAME_TRACE;
+       }
+      vlib_put_frame_to_node (vm, to_node_index, frame);
+    }
+}
+
+/**
+ * @brief process node periodically expiring timed-out reassemblies
+ *
+ * Wakes up on an event or after the configured walk interval, collects all
+ * reassemblies which were not heard from within the timeout, expires and
+ * frees them and sends their buffers to the drop node, respectively to the
+ * icmp error node.
+ *
+ * @param[in] vm - vlib_main
+ * @param[in] node - node runtime
+ * @param[in] f - unused
+ *
+ * @return 0 (the loop never terminates)
+ */
+static uword
+ip6_reass_walk_expired (vlib_main_t * vm,
+                       vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+  ip6_reass_main_t *rm = &ip6_reass_main;
+  uword event_type, *event_data = 0;
+
+  while (true)
+    {
+      f64 wait_s = (f64) rm->expire_walk_interval_ms / (f64) MSEC_PER_SEC;
+      vlib_process_wait_for_event_or_clock (vm, wait_s);
+      event_type = vlib_process_get_events (vm, &event_data);
+
+      /* ~0 means no event (clock expired); a config change needs no
+       * special handling either - anything else is unexpected */
+      if (~0 != event_type && IP6_EVENT_CONFIG_CHANGED != event_type)
+       {
+         clib_warning ("BUG: event type 0x%wx", event_type);
+       }
+
+      f64 now = vlib_time_now (vm);
+      u32 *expired_bis = NULL;
+      u32 *icmp_bis = NULL;
+      int *expired_pool_indexes = NULL;
+      int pool_index;
+
+      /* first pass - only note what expired, the pool is modified later */
+      /* *INDENT-OFF* */
+      pool_foreach_index (pool_index, rm->pool, ({
+                            ip6_reass_t *candidate =
+                              pool_elt_at_index (rm->pool, pool_index);
+                            if (now > candidate->last_heard + rm->timeout)
+                              {
+                                vec_add1 (expired_pool_indexes, pool_index);
+                              }
+                          }));
+      /* *INDENT-ON* */
+
+      /* second pass - expire and free the noted reassemblies */
+      uword n;
+      for (n = 0; n < vec_len (expired_pool_indexes); n++)
+       {
+         ip6_reass_t *expired =
+           pool_elt_at_index (rm->pool, expired_pool_indexes[n]);
+         u32 icmp_bi = ~0;
+         ip6_reass_on_timeout (vm, node, rm, expired, &icmp_bi,
+                               &expired_bis);
+         if (~0 != icmp_bi)
+           {
+             vec_add1 (icmp_bis, icmp_bi);
+           }
+         ip6_reass_free (rm, expired);
+       }
+
+      ip6_reass_expire_send_to_node (vm, node, rm, rm->ip6_drop_idx,
+                                    expired_bis);
+      ip6_reass_expire_send_to_node (vm, node, rm, rm->ip6_icmp_error_idx,
+                                    icmp_bis);
+
+      vec_free (expired_pool_indexes);
+      vec_free (expired_bis);
+      vec_free (icmp_bis);
+      if (event_data)
+       {
+         _vec_len (event_data) = 0;
+       }
+    }
+
+  return 0;
+}
+
+static vlib_node_registration_t ip6_reass_expire_node;
+
+/* Registers ip6_reass_walk_expired as a process-type node. */
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (ip6_reass_expire_node, static) = {
+    .function = ip6_reass_walk_expired,
+    .format_trace = format_ip6_reass_trace,
+    .type = VLIB_NODE_TYPE_PROCESS,
+    .name = "ip6-reassembly-expire-walk",
+
+    /* node->errors[] (e.g. IP6_ERROR_REASS_TIMEOUT) is backed by these */
+    .n_errors = ARRAY_LEN (ip6_reassembly_error_strings),
+    .error_strings = ip6_reassembly_error_strings,
+
+};
+/* *INDENT-ON* */
+
+/*
+ * format function for a reassembly key; the single va_arg is an
+ * ip6_reass_key_t pointer.
+ *
+ * NOTE(review): frag_id is byte-swapped as a 16-bit value here, while the
+ * IPv6 fragment header carries a 32-bit identification - confirm the width
+ * of key->frag_id against the ip6_reass_key_t definition.
+ */
+static u8 *
+format_ip6_reass_key (u8 * s, va_list * args)
+{
+  ip6_reass_key_t *k = va_arg (*args, ip6_reass_key_t *);
+  return format (s, "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
+             k->xx_id, format_ip6_address, &k->src, format_ip6_address,
+             &k->dst, clib_net_to_host_u16 (k->frag_id), k->proto);
+}
+
+/**
+ * @brief format function dumping one reassembly context
+ *
+ * va_args: vlib_main_t *vm, ip6_reass_t *reass. Emits a summary of the
+ * context followed by one numbered line per buffer reached from
+ * reass->first_bi.
+ *
+ * NOTE(review): the walk follows the vlib buffer chain (next_buffer);
+ * confirm that this is the linkage intended for listing the ranges.
+ */
+static u8 *
+format_ip6_reass (u8 * s, va_list * args)
+{
+  vlib_main_t *vm = va_arg (*args, vlib_main_t *);
+  ip6_reass_t *reass = va_arg (*args, ip6_reass_t *);
+
+  s = format (s, "ID: %u, key: %U\n  first_bi: %u, data_len: %u, "
+             "last_packet_octet: %u, trace_op_counter: %u\n",
+             reass->id, format_ip6_reass_key, &reass->key, reass->first_bi,
+             reass->data_len, reass->last_packet_octet,
+             reass->trace_op_counter);
+  u32 bi = reass->first_bi;
+  u32 counter = 0;
+  while (~0 != bi)
+    {
+      vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+      vnet_buffer_opaque_t *vnb = vnet_buffer (b);
+      s = format (s, "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
+                 "fragment[%u, %u]\n",
+                 counter, vnb->ip.reass.range_first,
+                 vnb->ip.reass.range_last, bi,
+                 ip6_reass_buffer_get_data_offset_no_check (b),
+                 ip6_reass_buffer_get_data_len_no_check (b),
+                 vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
+      /* number the lines; without this every line was labelled #000 */
+      ++counter;
+      if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
+       {
+         bi = b->next_buffer;
+       }
+      else
+       {
+         bi = ~0;
+       }
+    }
+  return s;
+}
+
+/**
+ * @brief CLI handler for "show ip6-reassembly [details]"
+ *
+ * Prints the reassembly counters; when the input contains "details", every
+ * reassembly context in the pool is dumped first.
+ *
+ * @param[in] vm - vlib_main
+ * @param[in] input - CLI input following the command path
+ *
+ * @return always 0
+ */
+static clib_error_t *
+show_ip6_reass (vlib_main_t * vm, unformat_input_t * input,
+               CLIB_UNUSED (vlib_cli_command_t * lmd))
+{
+  ip6_reass_main_t *rm = &ip6_reass_main;
+
+  vlib_cli_output (vm, "---------------------");
+  vlib_cli_output (vm, "IP6 reassembly status");
+  vlib_cli_output (vm, "---------------------");
+  if (unformat (input, "details"))
+    {
+      ip6_reass_t *reass;
+      /* *INDENT-OFF* */
+      pool_foreach (reass, rm->pool, {
+        vlib_cli_output (vm, "%U", format_ip6_reass, vm, reass);
+      });
+      /* *INDENT-ON* */
+    }
+  vlib_cli_output (vm, "---------------------");
+  /* cast so that the argument matches %lu, as for the counters below */
+  vlib_cli_output (vm, "Current IP6 reassemblies count: %lu\n",
+                  (long unsigned) rm->reass_n);
+  vlib_cli_output (vm,
+                  "Maximum configured concurrent IP6 reassemblies: %lu\n",
+                  (long unsigned) rm->max_reass_n);
+  vlib_cli_output (vm, "Buffers in use: %lu\n",
+                  (long unsigned) rm->buffers_n);
+  return 0;
+}
+
+/* CLI registration: "show ip6-reassembly" is served by show_ip6_reass */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_ip6_reassembly_cmd, static) = {
+    .path = "show ip6-reassembly",
+    .short_help = "show ip6-reassembly [details]",
+    .function = show_ip6_reass,
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/ip/ip6_reassembly.h b/src/vnet/ip/ip6_reassembly.h
new file mode 100644 (file)
index 0000000..0de4e04
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ * @brief IPv6 Reassembly.
+ *
+ * This file declares the configuration API for IPv6 reassembly.
+ */
+
+#ifndef __included_ip6_reassembly_h__
+#define __included_ip6_reassembly_h__
+
+#include <vnet/api_errno.h>
+#include <vnet/vnet.h>
+
+/**
+ * @brief set ip6 reassembly configuration
+ *
+ * NOTE(review): parameter meanings below are inferred from the names; the
+ * definition is not part of this header - confirm against
+ * ip6_reassembly.c.
+ *
+ * @param[in] timeout_ms - reassembly timeout, presumably in milliseconds
+ * @param[in] max_reassemblies - presumably the limit of concurrent
+ * reassemblies
+ * @param[in] expire_walk_interval_ms - presumably the period of the expire
+ * walk, in milliseconds
+ *
+ * @return vnet_api_error_t status
+ */
+vnet_api_error_t ip6_reass_set (u32 timeout_ms, u32 max_reassemblies,
+                               u32 expire_walk_interval_ms);
+
+/**
+ * @brief get ip6 reassembly configuration
+ *
+ * The three values are returned through the pointer arguments, which
+ * mirror the parameters of ip6_reass_set.
+ *
+ * @return vnet_api_error_t status
+ */
+vnet_api_error_t ip6_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
+                               u32 * expire_walk_interval_ms);
+
+#endif /* __included_ip6_reassembly_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
index c0889eb..60fa2fa 100644 (file)
@@ -42,6 +42,8 @@
 #include <vnet/fib/ip4_fib.h>
 #include <vnet/fib/ip6_fib.h>
 #include <vnet/ip/ip6_hop_by_hop.h>
+#include <vnet/ip/ip4_reassembly.h>
+#include <vnet/ip/ip6_reassembly.h>
 
 #include <vnet/vnet_msg_enum.h>
 
@@ -97,7 +99,9 @@ _(IOAM_DISABLE, ioam_disable)                                           \
 _(IP_SOURCE_AND_PORT_RANGE_CHECK_ADD_DEL,                               \
   ip_source_and_port_range_check_add_del)                               \
 _(IP_SOURCE_AND_PORT_RANGE_CHECK_INTERFACE_ADD_DEL,                     \
-  ip_source_and_port_range_check_interface_add_del)
+  ip_source_and_port_range_check_interface_add_del)                     \
+_(IP_REASSEMBLY_SET, ip_reassembly_set)                                 \
+_(IP_REASSEMBLY_GET, ip_reassembly_get)
 
 extern void stats_dslock_with_hint (int hint, int tag);
 extern void stats_dsunlock (void);
@@ -2786,6 +2790,61 @@ vl_api_set_arp_neighbor_limit_t_handler (vl_api_set_arp_neighbor_limit_t * mp)
   REPLY_MACRO (VL_API_SET_ARP_NEIGHBOR_LIMIT_REPLY);
 }
 
+/*
+ * API handler for ip_reassembly_set: forwards the requested configuration
+ * to the ip6 or ip4 reassembly code, selected by mp->is_ip6, and replies
+ * with the returned value.
+ */
+void
+vl_api_ip_reassembly_set_t_handler (vl_api_ip_reassembly_set_t * mp)
+{
+  vl_api_ip_reassembly_set_reply_t *rmp;
+  int rv;
+  /* the message carries these fields in network byte order */
+  const u32 timeout_ms = clib_net_to_host_u32 (mp->timeout_ms);
+  const u32 max_reassemblies = clib_net_to_host_u32 (mp->max_reassemblies);
+  const u32 expire_walk_interval_ms =
+    clib_net_to_host_u32 (mp->expire_walk_interval_ms);
+
+  rv = mp->is_ip6 ?
+    ip6_reass_set (timeout_ms, max_reassemblies, expire_walk_interval_ms) :
+    ip4_reass_set (timeout_ms, max_reassemblies, expire_walk_interval_ms);
+
+  /* REPLY_MACRO picks up mp, rmp and rv by name */
+  REPLY_MACRO (VL_API_IP_REASSEMBLY_SET_REPLY);
+}
+
+/*
+ * API handler for ip_reassembly_get: reads the ip6 or ip4 reassembly
+ * configuration, selected by mp->is_ip6, and sends it back to the client.
+ */
+void
+vl_api_ip_reassembly_get_t_handler (vl_api_ip_reassembly_get_t * mp)
+{
+  unix_shared_memory_queue_t *q =
+    vl_api_client_index_to_input_queue (mp->client_index);
+  u32 timeout_ms = 0;
+  u32 max_reassemblies = 0;
+  u32 expire_walk_interval_ms = 0;
+
+  /* no input queue for this client index, so no reply can be sent */
+  if (0 == q)
+    return;
+
+  vl_api_ip_reassembly_get_reply_t *rmp = vl_msg_api_alloc (sizeof (*rmp));
+  memset (rmp, 0, sizeof (*rmp));
+  rmp->_vl_msg_id = ntohs (VL_API_IP_REASSEMBLY_GET_REPLY);
+  rmp->context = mp->context;
+  rmp->retval = 0;
+  rmp->is_ip6 = mp->is_ip6 ? 1 : 0;
+  if (rmp->is_ip6)
+    {
+      ip6_reass_get (&timeout_ms, &max_reassemblies,
+                    &expire_walk_interval_ms);
+    }
+  else
+    {
+      ip4_reass_get (&timeout_ms, &max_reassemblies,
+                    &expire_walk_interval_ms);
+    }
+  /* reply fields are converted to network byte order */
+  rmp->timeout_ms = clib_host_to_net_u32 (timeout_ms);
+  rmp->max_reassemblies = clib_host_to_net_u32 (max_reassemblies);
+  rmp->expire_walk_interval_ms =
+    clib_host_to_net_u32 (expire_walk_interval_ms);
+  vl_msg_api_send_shmem (q, (u8 *) & rmp);
+}
+
 #define vl_msg_name_crc_list
 #include <vnet/ip/ip.api.h>
 #undef vl_msg_name_crc_list
index 67e46bd..f2880bf 100644 (file)
@@ -230,6 +230,8 @@ ip_lookup_init (ip_lookup_main_t * lm, u32 is_ip6)
       }
 
     lm->local_next_by_ip_protocol[IP_PROTOCOL_UDP] = IP_LOCAL_NEXT_UDP_LOOKUP;
+    lm->local_next_by_ip_protocol[IP_PROTOCOL_VPP_FRAGMENTATION] =
+      IP_LOCAL_NEXT_REASSEMBLY;
     lm->local_next_by_ip_protocol[is_ip6 ? IP_PROTOCOL_ICMP6 :
                                  IP_PROTOCOL_ICMP] = IP_LOCAL_NEXT_ICMP;
     lm->builtin_protocol_by_ip_protocol[IP_PROTOCOL_UDP] =
index a89546f..9fe7a1b 100644 (file)
@@ -111,6 +111,7 @@ typedef enum
   IP_LOCAL_NEXT_PUNT,
   IP_LOCAL_NEXT_UDP_LOOKUP,
   IP_LOCAL_NEXT_ICMP,
+  IP_LOCAL_NEXT_REASSEMBLY,
   IP_LOCAL_N_NEXT,
 } ip_local_next_t;
 
index 77fab31..4ec0bbc 100644 (file)
@@ -158,5 +158,10 @@ ip_protocol (134, RSVP_E2E_IGNORE)
 ip_protocol (135, MOBILITY)
 ip_protocol (136, UDP_LITE)
 ip_protocol (137, MPLS_IN_IP)
+/*
+ * VPP's way of dealing with fragments is to mark them with protocol 0xfe, so
+ * that they are picked up by the corresponding nodes based on this protocol
+ */
+ip_protocol (0xfe, VPP_FRAGMENTATION)
 ip_protocol (255, RESERVED)
 
index f3b7f5c..155325a 100644 (file)
@@ -88,7 +88,7 @@ $(PAPI_INSTALL_DONE): $(PIP_PATCH_DONE)
        @touch $@
 
 define retest-func
-       @env VPP_TEST_FAILED_DIR=$(VPP_TEST_FAILED_DIR) scripts/setsid_wrapper.sh $(FORCE_FOREGROUND) $(PYTHON_VENV_PATH)/bin/activate python run_tests.py -d $(TEST_DIR) $(UNITTEST_EXTRA_OPTS) || env VPP_TEST_FAILED_DIR=$(VPP_TEST_FAILED_DIR) COMPRESS_FAILED_TEST_LOGS=$(COMPRESS_FAILED_TEST_LOGS) scripts/compress_failed.sh
+@env VPP_TEST_FAILED_DIR=$(VPP_TEST_FAILED_DIR) scripts/setsid_wrapper.sh $(FORCE_FOREGROUND) $(PYTHON_VENV_PATH)/bin/activate python run_tests.py -d $(TEST_DIR) $(UNITTEST_EXTRA_OPTS) || env VPP_TEST_FAILED_DIR=$(VPP_TEST_FAILED_DIR) COMPRESS_FAILED_TEST_LOGS=$(COMPRESS_FAILED_TEST_LOGS) scripts/compress_failed.sh
 endef
 
 .PHONY: sanity
@@ -103,6 +103,7 @@ endif
 
 sanity: verify-no-running-vpp
        @sys_req/dev_shm_size.sh
+       @sys_req/set_system_parameters.sh sys_req/system_parameters
        @bash -c "$(SANITY_IMPORT_VPP_PAPI_CMD) ||\
                (echo \"*******************************************************************\" &&\
                 echo \"* Sanity check failed, cannot import vpp_papi\" &&\
index 2e3f978..fc6f550 100644 (file)
@@ -24,6 +24,7 @@ from vpp_lo_interface import VppLoInterface
 from vpp_papi_provider import VppPapiProvider
 from log import *
 from vpp_object import VppObjectRegistry
+from vpp_punt_socket import vpp_uds_socket_name
 if os.name == 'posix' and sys.version_info[0] < 3:
     # using subprocess32 is recommended by python official documentation
     # @ https://docs.python.org/2/library/subprocess.html
@@ -255,7 +256,8 @@ class VppTestCase(unittest.TestCase):
                            coredump_size, "}", "api-trace", "{", "on", "}",
                            "api-segment", "{", "prefix", cls.shm_prefix, "}",
                            "plugins", "{", "plugin", "dpdk_plugin.so", "{",
-                           "disable", "}", "}"]
+                           "disable", "}", "}",
+                           "punt", "{", "socket", cls.punt_socket_path, "}"]
         if plugin_path is not None:
             cls.vpp_cmdline.extend(["plugin_path", plugin_path])
         cls.logger.info("vpp_cmdline: %s" % cls.vpp_cmdline)
@@ -317,7 +319,7 @@ class VppTestCase(unittest.TestCase):
         Remove shared memory files, start vpp and connect the vpp-api
         """
         gc.collect()  # run garbage collection first
-        random.seed()
+        random.seed(1)
         cls.logger = getLogger(cls.__name__)
         cls.tempdir = tempfile.mkdtemp(
             prefix='vpp-unittest-%s-' % cls.__name__)
@@ -328,6 +330,7 @@ class VppTestCase(unittest.TestCase):
         cls.file_handler.setLevel(DEBUG)
         cls.logger.addHandler(cls.file_handler)
         cls.shm_prefix = cls.tempdir.split("/")[-1]
+        cls.punt_socket_path = '%s/%s' % (cls.tempdir, vpp_uds_socket_name)
         os.chdir(cls.tempdir)
         cls.logger.info("Temporary dir is %s, shm prefix is %s",
                         cls.tempdir, cls.shm_prefix)
@@ -499,13 +502,16 @@ class VppTestCase(unittest.TestCase):
         type(self).test_instance = self
 
     @classmethod
-    def pg_enable_capture(cls, interfaces):
+    def pg_enable_capture(cls, interfaces=None):
         """
         Enable capture on packet-generator interfaces
 
-        :param interfaces: iterable interface indexes
+        :param interfaces: iterable interface indexes (if None,
+                           use self.pg_interfaces)
 
         """
+        if interfaces is None:
+            interfaces = cls.pg_interfaces
         for i in interfaces:
             i.enable_capture()
 
@@ -573,19 +579,21 @@ class VppTestCase(unittest.TestCase):
         return result
 
     @staticmethod
-    def extend_packet(packet, size):
+    def extend_packet(packet, size, padding=' '):
         """
-        Extend packet to given size by padding with spaces
+        Extend packet to given size by padding with spaces or custom padding
         NOTE: Currently works only when Raw layer is present.
 
         :param packet: packet
         :param size: target size
+        :param padding: padding used to extend the payload
 
         """
         packet_len = len(packet) + 4
         extend = size - packet_len
         if extend > 0:
-            packet[Raw].load += ' ' * extend
+            num = (extend / len(padding)) + 1
+            packet[Raw].load += (padding * num)[:extend]
 
     @classmethod
     def reset_packet_infos(cls):
index 77a2fc5..0ef1377 100644 (file)
@@ -2,7 +2,7 @@ import signal
 import os
 import traceback
 from log import RED, single_line_delim, double_line_delim
-from debug import spawn_gdb, gdb_path
+from debug import spawn_gdb
 
 
 class Hook(object):
@@ -62,14 +62,10 @@ class PollHook(Hook):
 
     def on_crash(self, core_path):
         if self.testcase.debug_core:
-            if not spawn_gdb(self.testcase.vpp_bin, core_path, self.logger):
-                self.logger.error(
-                    "Debugger '%s' does not exist or is not an executable.." %
-                    gdb_path)
-            else:
-                return
-        self.logger.critical("Core file present, debug with: gdb %s %s" %
-                             (self.testcase.vpp_bin, core_path))
+            spawn_gdb(self.testcase.vpp_bin, core_path, self.logger)
+        else:
+            self.logger.critical("Core file present, debug with: gdb %s %s" %
+                                 (self.testcase.vpp_bin, core_path))
 
     def poll_vpp(self):
         """
index cf3217e..706e197 100644 (file)
@@ -1,7 +1,8 @@
 diff --git a/scapy/layers/inet6.py b/scapy/layers/inet6.py
---- a/scapy/layers/inet6.py    2017-06-01 14:04:18.160881034 +0200
-+++ b/scapy/layers/inet6.py    2017-06-02 09:08:40.133800208 +0200
-@@ -369,6 +369,8 @@
+index 03b80ec..06ef27f 100644
+--- a/scapy/layers/inet6.py
++++ b/scapy/layers/inet6.py
+@@ -369,6 +369,8 @@ class _IPv6GuessPayload:
              return Raw
          elif self.nh == 135 and len(p) > 3: # Mobile IPv6
              return _mip6_mhtype2cls.get(ord(p[2]), MIP6MH_Generic)
@@ -9,11 +10,11 @@ diff --git a/scapy/layers/inet6.py b/scapy/layers/inet6.py
 +            return IPv6ExtHdrSegmentRouting
          else:
              return get_cls(ipv6nhcls.get(self.nh,"Raw"), "Raw")
-
-@@ -430,6 +432,14 @@
+@@ -430,6 +432,14 @@ class IPv6(_IPv6GuessPayload, Packet, IPTools):
                      sd = strxor(sd, a)
                  sd = inet_ntop(socket.AF_INET6, sd)
-
 +        if self.nh == 43 and isinstance(self.payload, IPv6ExtHdrSegmentRouting):
 +            # With segment routing header (rh == 4), the destination is
 +            # the first address of the IPv6 addresses list
@@ -23,10 +24,10 @@ diff --git a/scapy/layers/inet6.py b/scapy/layers/inet6.py
 +                sd = self.dst
 +
          if self.nh == 44 and isinstance(self.payload, IPv6ExtHdrFragment):
-             nh = self.payload.nh
-
-@@ -489,6 +499,8 @@
-             return self.payload.answers(other.payload.payload)
+             nh = self.payload.nh 
+@@ -489,6 +499,8 @@ class IPv6(_IPv6GuessPayload, Packet, IPTools):
+             return self.payload.answers(other.payload.payload) 
          elif other.nh == 43 and isinstance(other.payload, IPv6ExtHdrRouting):
              return self.payload.answers(other.payload.payload) # Buggy if self.payload is a IPv6ExtHdrRouting
 +        elif other.nh == 43 and isinstance(other.payload, IPv6ExtHdrSegmentRouting):
@@ -34,10 +35,10 @@ diff --git a/scapy/layers/inet6.py b/scapy/layers/inet6.py
          elif other.nh == 60 and isinstance(other.payload, IPv6ExtHdrDestOpt):
              return self.payload.payload.answers(other.payload.payload)
          elif self.nh == 60 and isinstance(self.payload, IPv6ExtHdrDestOpt): # BU in reply to BRR, for instance
-@@ -919,6 +931,148 @@
+@@ -919,6 +931,148 @@ class IPv6ExtHdrRouting(_IPv6ExtHdr):
              pkt = pkt[:3]+struct.pack("B", len(self.addresses))+pkt[4:]
          return _IPv6ExtHdr.post_build(self, pkt, pay)
-
 +######################### Segment Routing Header ############################
 +
 +# This implementation is based on draft 06, available at:
diff --git a/test/sys_req/set_system_parameters.sh b/test/sys_req/set_system_parameters.sh
new file mode 100755 (executable)
index 0000000..533a02e
--- /dev/null
@@ -0,0 +1,57 @@
+#!/bin/bash
+#
+# Check the system parameters listed in a requirements file and fix up the
+# ones which do not satisfy their requirement.
+#
+# Each line of the file which is neither a comment nor blank has the form
+#   <path> <operator> <comparison-value> <set-value>
+# The content of <path> is compared with <comparison-value> using
+# `test <operator>'. If the test fails, <set-value> is written to <path>,
+# first directly and, if that had no effect, once more via `sudo -n tee'.
+#
+# Set V=2 in the environment to also report satisfied requirements.
+# Exits with 1 on bad arguments, on a malformed line or when a value
+# cannot be set.
+
+file="$1"
+
+usage(){
+       echo "Usage: $0 <requirements file>"
+}
+
+if [ -z "$file" ]
+then
+       echo "Invalid parameters specified."
+       usage
+       exit 1
+fi
+
+if [ ! -f "$file" ]
+then
+       echo "File '$file' does not exist."
+       usage
+       exit 1
+fi
+
+# The loop reads from a process substitution instead of a pipe, so it runs
+# in the main shell and `exit 1' below terminates the script itself.
+# _rest swallows any fields beyond the fourth.
+while read -r value_file operator value set_value _rest
+do
+       if [[ -z "$value_file" || -z "$operator" || -z "$value" || -z "$set_value" ]]
+       then
+               echo "Syntax error in requirements file."
+               exit 1
+       fi
+       current_value=$(cat "$value_file")
+       if test "$current_value" "$operator" "$value"
+       then
+               if test "$V" = "2"
+               then
+                       echo "Requirement '$value_file $operator $value' satisfied."
+               fi
+       else
+               echo "Requirement '$value_file $operator $value' not satisfied."
+               echo "Writing '$set_value' to '$value_file'."
+               echo "$set_value" | tee "$value_file" > /dev/null
+               if ! test "$(cat "$value_file")" = "$set_value"
+               then
+                       echo "Repeating the write using sudo..."
+                       echo "$set_value" | sudo -n tee "$value_file" > /dev/null
+                       if ! test "$(cat "$value_file")" = "$set_value"
+                       then
+                               echo "Couldn't set the required value. Is that value allowed? Is sudo working?"
+                               exit 1
+                       fi
+               fi
+               echo "Succesfully wrote '$set_value' to '$value_file'."
+       fi
+done < <(grep -v -e '^#' -e '^ *$' "$file")
diff --git a/test/sys_req/system_parameters b/test/sys_req/system_parameters
new file mode 100644 (file)
index 0000000..6373774
--- /dev/null
@@ -0,0 +1,18 @@
+# test framework system requirements
+# format of this file is
+# <path> <operator> <comparison-value> <set-value>
+#
+# path - path to value e.g. in /proc which needs to be checked
+# operator - test operator (e.g. -gt)
+# comparison-value - value, against which the value read from <path> is compared
+# set-value - value, to which the path is set if the test fails
+#
+# the comparison is done using `test' command
+
+
+# test_reassembly.py
+# needed by test_reassembly which uses udp punt via unix domain sockets
+# to ensure that all data which vpp might produce in a burst fits into
+# the socket send buffer
+/proc/sys/net/core/wmem_max -ge 4636252 4636252
+/proc/sys/net/core/wmem_default -ge 4636252 4636252
diff --git a/test/test_reassembly.py b/test/test_reassembly.py
new file mode 100644 (file)
index 0000000..a2d77d4
--- /dev/null
@@ -0,0 +1,966 @@
+#!/usr/bin/env python
+import unittest
+from random import shuffle
+
+from framework import VppTestCase, VppTestRunner
+
+from scapy.packet import Raw
+from scapy.layers.l2 import Ether, GRE
+from scapy.layers.inet import IP, UDP
+from util import ppp, fragment_rfc791, fragment_rfc8200
+from vpp_punt_socket import VppUDSPuntSocket
+from scapy.layers.inet6 import IPv6, IPv6ExtHdrFragment, ICMPv6ParamProblem,\
+    ICMPv6TimeExceeded
+from vpp_gre_interface import VppGreInterface, VppGre6Interface
+from vpp_ip_route import VppIpRoute, VppRoutePath, DpoProto
+
+test_packet_count = 257
+
+
+class TestIPv4Reassembly(VppTestCase):
+    """ IPv4 Reassembly """
+
+    @classmethod
+    def setUpClass(cls):
+        """ Set up pg0 with IPv4, open the punt socket and pre-build the
+        packet stream together with its fragment lists """
+        super(TestIPv4Reassembly, cls).setUpClass()
+
+        cls.create_pg_interfaces([0])
+        cls.pg_if = cls.pg0
+
+        # setup all interfaces
+        for i in cls.pg_interfaces:
+            i.admin_up()
+            i.config_ip4()
+            i.resolve_arp()
+
+        # the test packets are UDP packets destined to this port; the tests
+        # read their results from the punt socket
+        cls.punt_port = 9999
+        cls.punt_socket = VppUDSPuntSocket(cls, cls.punt_port)
+
+        # packet sizes
+        cls.packet_sizes = [64, 512, 1518, 9018]
+        cls.padding = " abcdefghijklmn"
+        cls.create_stream(cls.packet_sizes)
+        cls.create_fragments()
+
+    def setUp(self):
+        """ Test setup - force timeout on existing reassemblies """
+        super(TestIPv4Reassembly, self).setUp()
+        # zero timeout with a 10ms expire walk, then wait: this is what
+        # forces leftovers of previous tests to time out
+        self.vapi.ip_reassembly_set(timeout_ms=0, max_reassemblies=1000,
+                                    expire_walk_interval_ms=10)
+        self.sleep(.25)
+        # switch to a long timeout and a slow expire walk for the test itself
+        self.vapi.ip_reassembly_set(timeout_ms=1000000, max_reassemblies=1000,
+                                    expire_walk_interval_ms=10000)
+
+    def tearDown(self):
+        """ Test teardown - log the output of the reassembly show command """
+        super(TestIPv4Reassembly, self).tearDown()
+        self.logger.debug(self.vapi.ppcli("show ip4-reassembly details"))
+
+    @classmethod
+    def create_stream(cls, packet_sizes, packet_count=test_packet_count):
+        """Create input packet stream for defined interface.
+
+        The packets are not returned; each one is stored in the ``data``
+        attribute of its packet info object.
+
+        :param list packet_sizes: Required packet sizes.
+        :param int packet_count: Number of packets to create.
+        """
+        size_count = len(packet_sizes)
+        for n in range(packet_count):
+            info = cls.create_packet_info(cls.pg_if, cls.pg_if)
+            payload = cls.info_to_payload(info)
+            ether = Ether(dst=cls.pg_if.local_mac, src=cls.pg_if.remote_mac)
+            ip = IP(id=info.index, src=cls.pg_if.remote_ip4,
+                    dst=cls.pg_if.local_ip4)
+            udp = UDP(sport=1234, dport=cls.punt_port)
+            pkt = ether / ip / udp / Raw(payload)
+            # each size is used for two consecutive packets
+            cls.extend_packet(pkt, packet_sizes[(n // 2) % size_count],
+                              cls.padding)
+            info.data = pkt
+
+    @classmethod
+    def create_fragments(cls):
+        """Fragment every packet of the stream in three different ways.
+
+        Fills cls.pkt_infos with one tuple per packet - (index, fragments
+        made with size 400, fragments made with size 300, fragments made by
+        re-fragmenting the 400 ones with size 200) - and stores the
+        flattened lists in cls.fragments_400, cls.fragments_300 and
+        cls.fragments_200.
+        """
+        infos = cls._packet_infos
+        cls.pkt_infos = []
+        for index, info in infos.iteritems():
+            pkt = info.data
+            frags_400 = fragment_rfc791(pkt, 400)
+            frags_300 = fragment_rfc791(pkt, 300)
+            frags_200 = []
+            for frag in frags_400:
+                frags_200.extend(fragment_rfc791(frag, 200))
+            cls.pkt_infos.append((index, frags_400, frags_300, frags_200))
+        cls.fragments_400 = [f for entry in cls.pkt_infos for f in entry[1]]
+        cls.fragments_300 = [f for entry in cls.pkt_infos for f in entry[2]]
+        cls.fragments_200 = [f for entry in cls.pkt_infos for f in entry[3]]
+        counts = (len(infos), len(cls.fragments_400),
+                  len(cls.fragments_300), len(cls.fragments_200))
+        cls.logger.debug("Fragmented %s packets into %s 400-byte fragments, "
+                         "%s 300-byte fragments and %s 200-byte fragments" %
+                         counts)
+
+    def verify_capture(self, capture, dropped_packet_indexes=[]):
+        """Verify captured packet stream.
+
+        Fails if a packet is received twice, if a packet listed in
+        dropped_packet_indexes is received, or if a packet which is not
+        listed there is missing from the capture.
+
+        :param list capture: Captured packet stream; each entry is indexed
+                             by 'sw_if_index', 'punt_action' and 'packet'.
+        :param dropped_packet_indexes: Indexes of the packets which are
+                                       expected to be dropped.
+        """
+        info = None
+        seen = set()
+        for packet in capture:
+            try:
+                sw_if_index = packet['sw_if_index']
+                punt_action = packet['punt_action']
+                # from here on `packet' is the parsed scapy packet
+                packet = Ether(packet['packet'])
+                self.logger.debug(ppp("Got packet from %s, action %s" %
+                                      (sw_if_index, punt_action), packet))
+                ip = packet[IP]
+                udp = packet[UDP]
+                payload_info = self.payload_to_info(str(packet[Raw]))
+                packet_index = payload_info.index
+                self.assertTrue(
+                    packet_index not in dropped_packet_indexes,
+                    ppp("Packet received, but should be dropped:", packet))
+                if packet_index in seen:
+                    raise Exception(ppp("Duplicate packet received", packet))
+                seen.add(packet_index)
+                self.assertEqual(payload_info.dst, self.pg_if.sw_if_index)
+                info = self._packet_infos[packet_index]
+                self.assertTrue(info is not None)
+                self.assertEqual(packet_index, info.index)
+                # compare against the packet stored by create_stream
+                saved_packet = info.data
+                self.assertEqual(ip.src, saved_packet[IP].src)
+                self.assertEqual(ip.dst, saved_packet[IP].dst)
+                self.assertEqual(udp.payload, saved_packet[UDP].payload)
+            except:
+                # log the offending packet, then let the failure propagate
+                self.logger.error(ppp("Unexpected or invalid packet:", packet))
+                raise
+        # every packet must have been either received or expected to drop
+        for index in self._packet_infos:
+            self.assertTrue(index in seen or index in dropped_packet_indexes,
+                            "Packet with packet_index %d not received" % index)
+
+    def test_reassembly(self):
+        """ basic reassembly """
+
+        # the whole exchange is run twice to verify correctness on repeat
+        for _ in range(2):
+            self.pg_enable_capture()
+            self.pg_if.add_stream(self.fragments_200)
+            self.pg_start()
+
+            expected_count = len(self.pkt_infos)
+            packets = self.punt_socket.wait_for_packets(expected_count)
+            self.verify_capture(packets)
+            self.pg_if.assert_nothing_captured()
+
+    def test_reversed(self):
+        """ reverse order reassembly """
+
+        fragments = list(self.fragments_200)
+        fragments.reverse()
+
+        # one packet is expected per entry of pkt_infos, the same count the
+        # sibling tests wait for
+        expected_count = len(self.pkt_infos)
+
+        # the whole exchange is run twice to verify correctness on repeat
+        for _ in range(2):
+            self.pg_enable_capture()
+            self.pg_if.add_stream(fragments)
+            self.pg_start()
+
+            packets = self.punt_socket.wait_for_packets(expected_count)
+            self.verify_capture(packets)
+            self.pg_if.assert_nothing_captured()
+
+    def test_random(self):
+        """ random order reassembly """
+
+        fragments = list(self.fragments_200)
+        shuffle(fragments)
+
+        # one packet is expected per entry of pkt_infos, the same count the
+        # sibling tests wait for
+        expected_count = len(self.pkt_infos)
+
+        # the same shuffled stream is sent twice to verify correctness on
+        # repeat
+        for _ in range(2):
+            self.pg_enable_capture()
+            self.pg_if.add_stream(fragments)
+            self.pg_start()
+
+            packets = self.punt_socket.wait_for_packets(expected_count)
+            self.verify_capture(packets)
+            self.pg_if.assert_nothing_captured()
+
+    def test_duplicates(self):
+        """ duplicate fragments """
+
+        # every fragment is sent twice in a row; a packet which consists of
+        # a single fragment is sent once
+        fragments = []
+        for (_, frags_400, _, _) in self.pkt_infos:
+            copies = min(2, len(frags_400))
+            for frag in frags_400:
+                fragments.extend([frag] * copies)
+
+        self.pg_enable_capture()
+        self.pg_if.add_stream(fragments)
+        self.pg_start()
+
+        expected_count = len(self.pkt_infos)
+        packets = self.punt_socket.wait_for_packets(expected_count)
+        self.verify_capture(packets)
+        self.pg_if.assert_nothing_captured()
+
+    def test_overlap1(self):
+        """ overlapping fragments case #1 """
+
+        # interleave the fragments made with size 200 with the ones made
+        # with size 300, the 200 one going first in each pair
+        # NOTE(review): extend() is called with a single packet here, which
+        # relies on the packet object being iterable - confirm
+        fragments = []
+        for _, _, frags_300, frags_200 in self.pkt_infos:
+            if len(frags_300) == 1:
+                fragments.extend(frags_300)
+                continue
+            for frag_200, frag_300 in zip(frags_200, frags_300):
+                fragments.extend(frag_200)
+                fragments.extend(frag_300)
+
+        # run it all twice to verify correctness
+        for _ in range(2):
+            self.pg_enable_capture()
+            self.pg_if.add_stream(fragments)
+            self.pg_start()
+
+            expected_count = len(self.pkt_infos)
+            packets = self.punt_socket.wait_for_packets(expected_count)
+            self.verify_capture(packets)
+            self.pg_if.assert_nothing_captured()
+
+    def test_overlap2(self):
+        """ overlapping fragments case #2 """
+
+        # same as case #1, but the fragment made with size 300 goes first in
+        # each pair and the very last pair contributes only that fragment
+        fragments = []
+        for _, _, frags_300, frags_200 in self.pkt_infos:
+            if len(frags_300) == 1:
+                fragments.extend(frags_300)
+                continue
+            # care must be taken here so that there are no fragments
+            # received by vpp after reassembly is finished, otherwise
+            # new reassemblies will be started and packet generator will
+            # freak out when it detects unfreed buffers
+            zipped = zip(frags_300, frags_200)
+            for frag_300, frag_200 in zipped[:-1]:
+                fragments.extend(frag_300)
+                fragments.extend(frag_200)
+            fragments.append(zipped[-1][0])
+
+        # run it all twice to verify correctness
+        for _ in range(2):
+            self.pg_enable_capture()
+            self.pg_if.add_stream(fragments)
+            self.pg_start()
+
+            expected_count = len(self.pkt_infos)
+            packets = self.punt_socket.wait_for_packets(expected_count)
+            self.verify_capture(packets)
+            self.pg_if.assert_nothing_captured()
+
+    def test_timeout_inline(self):
+        """ timeout (inline) """
+
+        # every packet which was split into more than one 400-byte fragment
+        # is expected to be dropped
+        dropped_packet_indexes = set()
+        for index, frags_400, _, _ in self.pkt_infos:
+            if len(frags_400) > 1:
+                dropped_packet_indexes.add(index)
+        expected_count = len(self.pkt_infos) - len(dropped_packet_indexes)
+
+        self.vapi.ip_reassembly_set(timeout_ms=0, max_reassemblies=1000,
+                                    expire_walk_interval_ms=10000)
+
+        self.pg_enable_capture()
+        self.pg_if.add_stream(self.fragments_400)
+        self.pg_start()
+
+        packets = self.punt_socket.wait_for_packets(expected_count)
+        self.verify_capture(packets, dropped_packet_indexes)
+        self.pg_if.assert_nothing_captured()
+
+    def test_timeout_cleanup(self):
+        """ timeout (cleanup) """
+
+        # whole packets + fragmented packets sans last fragment
+        # (the slice end is -1 for fragmented packets and None, i.e. the
+        # whole list, for packets consisting of a single fragment)
+        fragments = [
+            x for (_, frags_400, _, _) in self.pkt_infos
+            for x in frags_400[:-1 if len(frags_400) > 1 else None]
+        ]
+
+        # last fragments for fragmented packets
+        fragments2 = [frags_400[-1]
+                      for (_, frags_400, _, _) in self.pkt_infos
+                      if len(frags_400) > 1]
+
+        # every fragmented packet is expected to be dropped
+        dropped_packet_indexes = set(
+            index for (index, frags_400, _, _) in self.pkt_infos
+            if len(frags_400) > 1)
+
+        # 100ms timeout, expire walk every 50ms - both well below the
+        # 250ms pause made before the last fragments are sent
+        self.vapi.ip_reassembly_set(timeout_ms=100, max_reassemblies=1000,
+                                    expire_walk_interval_ms=50)
+
+        self.pg_enable_capture()
+        self.pg_if.add_stream(fragments)
+        self.pg_start()
+
+        self.sleep(.25, "wait before sending rest of fragments")
+
+        self.pg_if.add_stream(fragments2)
+        self.pg_start()
+        self.sleep(.25, "wait for vpp to process packets")
+
+        # only the unfragmented packets are expected on the punt socket
+        packets = self.punt_socket.wait_for_packets(
+            len(self.pkt_infos) - len(dropped_packet_indexes))
+        self.verify_capture(packets, dropped_packet_indexes)
+        self.pg_if.assert_nothing_captured()
+
+    def test_disabled(self):
+        """ reassembly disabled """
+
+        # with max_reassemblies=0 every packet which was split into more
+        # than one 400-byte fragment is expected to be dropped
+        dropped_packet_indexes = set()
+        for index, frags_400, _, _ in self.pkt_infos:
+            if len(frags_400) > 1:
+                dropped_packet_indexes.add(index)
+        expected_count = len(self.pkt_infos) - len(dropped_packet_indexes)
+
+        self.vapi.ip_reassembly_set(timeout_ms=1000, max_reassemblies=0,
+                                    expire_walk_interval_ms=10000)
+
+        self.pg_enable_capture()
+        self.pg_if.add_stream(self.fragments_400)
+        self.pg_start()
+
+        packets = self.punt_socket.wait_for_packets(expected_count)
+        self.verify_capture(packets, dropped_packet_indexes)
+        self.pg_if.assert_nothing_captured()
+
+
+class TestIPv6Reassembly(VppTestCase):
+    """ IPv6 Reassembly """
+
+    @classmethod
+    def setUpClass(cls):
+        super(TestIPv6Reassembly, cls).setUpClass()
+
+        cls.create_pg_interfaces([0])
+        cls.pg_if = cls.pg0
+
+        # setup all interfaces
+        for i in cls.pg_interfaces:
+            i.admin_up()
+            i.config_ip6()
+            i.resolve_ndp()
+
+        # reassembled packets are collected from a punt socket
+        cls.punt_port = 9999
+        cls.punt_socket = VppUDSPuntSocket(cls, cls.punt_port, is_ip4=0)
+
+        # packet sizes
+        cls.packet_sizes = [64, 512, 1518, 9018]
+        cls.padding = " abcdefghijklmn"
+        cls.create_stream(cls.packet_sizes)
+        cls.create_fragments()
+
+    def setUp(self):
+        """ Test setup - force timeout on existing reassemblies """
+        super(TestIPv6Reassembly, self).setUp()
+        # zero timeout + short expire walk interval, then wait a while
+        self.vapi.ip_reassembly_set(timeout_ms=0, max_reassemblies=1000,
+                                    expire_walk_interval_ms=10, is_ip6=1)
+        self.sleep(.25)
+        # long timeout for the test itself
+        self.vapi.ip_reassembly_set(timeout_ms=1000000, max_reassemblies=1000,
+                                    expire_walk_interval_ms=10000, is_ip6=1)
+
+    def tearDown(self):
+        super(TestIPv6Reassembly, self).tearDown()
+        self.logger.debug(self.vapi.ppcli("show ip6-reassembly details"))
+
+    @classmethod
+    def create_stream(cls, packet_sizes, packet_count=test_packet_count):
+        """Create input packet stream for defined interface.
+
+        The packets are not returned, each one is stored in the `data`
+        attribute of its packet info.
+
+        :param list packet_sizes: Required packet sizes.
+        :param int packet_count: Number of packets to create.
+        """
+        for i in range(0, packet_count):
+            info = cls.create_packet_info(cls.pg_if, cls.pg_if)
+            payload = cls.info_to_payload(info)
+            p = (Ether(dst=cls.pg_if.local_mac, src=cls.pg_if.remote_mac) /
+                 IPv6(src=cls.pg_if.remote_ip6,
+                      dst=cls.pg_if.local_ip6) /
+                 UDP(sport=1234, dport=cls.punt_port) /
+                 Raw(payload))
+            # each size is used for two consecutive packets
+            size = packet_sizes[(i // 2) % len(packet_sizes)]
+            cls.extend_packet(p, size, cls.padding)
+            info.data = p
+
+    @classmethod
+    def create_fragments(cls):
+        """Fragment every created packet at 400 and at 300 bytes.
+
+        Fills cls.pkt_infos with (index, fragments_400, fragments_300)
+        tuples and cls.fragments_400 / cls.fragments_300 with the flattened
+        lists of fragments.
+        """
+        infos = cls._packet_infos
+        cls.pkt_infos = []
+        for index, info in infos.iteritems():
+            p = info.data
+            # self.logger.debug(ppp("Packet:", p.__class__(str(p))))
+            fragments_400 = fragment_rfc8200(p, info.index, 400)
+            fragments_300 = fragment_rfc8200(p, info.index, 300)
+            cls.pkt_infos.append((index, fragments_400, fragments_300))
+        cls.fragments_400 = [
+            x for _, frags, _ in cls.pkt_infos for x in frags]
+        cls.fragments_300 = [
+            x for _, _, frags in cls.pkt_infos for x in frags]
+        cls.logger.debug("Fragmented %s packets into %s 400-byte fragments, "
+                         "and %s 300-byte fragments" %
+                         (len(infos), len(cls.fragments_400),
+                             len(cls.fragments_300)))
+
+    def verify_capture(self, capture, dropped_packet_indexes=()):
+        """Verify captured packet stream.
+
+        :param list capture: Captured packet stream - items providing
+            'sw_if_index', 'punt_action' and 'packet' keys.
+        :param dropped_packet_indexes: Indexes of packets which are expected
+            to be missing from the capture.
+        """
+        info = None
+        seen = set()
+        for packet in capture:
+            try:
+                sw_if_index = packet['sw_if_index']
+                punt_action = packet['punt_action']
+                packet = Ether(packet['packet'])
+                self.logger.debug(ppp("Got packet from %s, action %s" %
+                                      (sw_if_index, punt_action), packet))
+                ip = packet[IPv6]
+                udp = packet[UDP]
+                payload_info = self.payload_to_info(str(packet[Raw]))
+                packet_index = payload_info.index
+                self.assertTrue(
+                    packet_index not in dropped_packet_indexes,
+                    ppp("Packet received, but should be dropped:", packet))
+                if packet_index in seen:
+                    raise Exception(ppp("Duplicate packet received", packet))
+                seen.add(packet_index)
+                self.assertEqual(payload_info.dst, self.pg_if.sw_if_index)
+                info = self._packet_infos[packet_index]
+                self.assertTrue(info is not None)
+                self.assertEqual(packet_index, info.index)
+                saved_packet = info.data
+                self.assertEqual(ip.src, saved_packet[IPv6].src)
+                self.assertEqual(ip.dst, saved_packet[IPv6].dst)
+                self.assertEqual(udp.payload, saved_packet[UDP].payload)
+            except:
+                # log the offending packet, then let the failure propagate
+                self.logger.error(ppp("Unexpected or invalid packet:", packet))
+                raise
+        # every packet must be either received or expected to be dropped
+        for index in self._packet_infos:
+            self.assertTrue(index in seen or index in dropped_packet_indexes,
+                            "Packet with packet_index %d not received" % index)
+
+    def test_reassembly(self):
+        """ basic reassembly """
+
+        self.pg_enable_capture()
+        self.pg_if.add_stream(self.fragments_400)
+        self.pg_start()
+
+        packets = self.punt_socket.wait_for_packets(len(self.pkt_infos))
+        self.verify_capture(packets)
+        self.pg_if.assert_nothing_captured()
+
+        # run it all again to verify correctness
+        self.pg_enable_capture()
+        self.pg_if.add_stream(self.fragments_400)
+        self.pg_start()
+
+        packets = self.punt_socket.wait_for_packets(len(self.pkt_infos))
+        self.verify_capture(packets)
+        self.pg_if.assert_nothing_captured()
+
+    def test_reversed(self):
+        """ reverse order reassembly """
+
+        fragments = list(self.fragments_400)
+        fragments.reverse()
+
+        self.pg_enable_capture()
+        self.pg_if.add_stream(fragments)
+        self.pg_start()
+
+        packets = self.punt_socket.wait_for_packets(len(self.pkt_infos))
+        self.verify_capture(packets)
+        self.pg_if.assert_nothing_captured()
+
+        # run it all again to verify correctness
+        self.pg_enable_capture()
+        self.pg_if.add_stream(fragments)
+        self.pg_start()
+
+        packets = self.punt_socket.wait_for_packets(len(self.pkt_infos))
+        self.verify_capture(packets)
+        self.pg_if.assert_nothing_captured()
+
+    def test_random(self):
+        """ random order reassembly """
+
+        fragments = list(self.fragments_400)
+        shuffle(fragments)
+
+        self.pg_enable_capture()
+        self.pg_if.add_stream(fragments)
+        self.pg_start()
+
+        packets = self.punt_socket.wait_for_packets(len(self.pkt_infos))
+        self.verify_capture(packets)
+        self.pg_if.assert_nothing_captured()
+
+        # run it all again to verify correctness
+        self.pg_enable_capture()
+        self.pg_if.add_stream(fragments)
+        self.pg_start()
+
+        packets = self.punt_socket.wait_for_packets(len(self.pkt_infos))
+        self.verify_capture(packets)
+        self.pg_if.assert_nothing_captured()
+
+    def test_duplicates(self):
+        """ duplicate fragments """
+
+        # fragments of fragmented packets are sent twice, packets
+        # consisting of a single fragment only once
+        fragments = [
+            x for (_, frags, _) in self.pkt_infos
+            for x in frags
+            for _ in range(0, min(2, len(frags)))
+        ]
+
+        self.pg_enable_capture()
+        self.pg_if.add_stream(fragments)
+        self.pg_start()
+
+        packets = self.punt_socket.wait_for_packets(len(self.pkt_infos))
+        self.verify_capture(packets)
+        self.pg_if.assert_nothing_captured()
+
+    def test_overlap1(self):
+        """ overlapping fragments case #1 """
+
+        # interleave 300-byte and 400-byte fragments of the same packet
+        fragments = []
+        for _, frags_400, frags_300 in self.pkt_infos:
+            if len(frags_300) == 1:
+                fragments.extend(frags_400)
+            else:
+                for i, j in zip(frags_300, frags_400):
+                    fragments.extend(i)
+                    fragments.extend(j)
+
+        # every packet with more than one 300-byte fragment is expected
+        # to be dropped
+        dropped_packet_indexes = set(
+            index for (index, _, frags) in self.pkt_infos if len(frags) > 1
+        )
+
+        self.pg_enable_capture()
+        self.pg_if.add_stream(fragments)
+        self.pg_start()
+
+        self.sleep(.1, "wait for vpp to process packets")
+        packets = self.punt_socket.wait_for_packets(
+            len(self.pkt_infos) - len(dropped_packet_indexes))
+        self.verify_capture(packets, dropped_packet_indexes)
+        self.pg_if.assert_nothing_captured()
+
+    def test_overlap2(self):
+        """ overlapping fragments case #2 """
+
+        fragments = []
+        for _, frags_400, frags_300 in self.pkt_infos:
+            if len(frags_400) == 1:
+                fragments.extend(frags_400)
+            else:
+                # care must be taken here so that there are no fragments
+                # received by vpp after reassembly is finished, otherwise
+                # new reassemblies will be started and packet generator will
+                # freak out when it detects unfreed buffers
+                zipped = zip(frags_400, frags_300)
+                for i, j in zipped[:-1]:
+                    fragments.extend(i)
+                    fragments.extend(j)
+                fragments.append(zipped[-1][0])
+
+        # every packet with more than one 300-byte fragment is expected
+        # to be dropped
+        dropped_packet_indexes = set(
+            index for (index, _, frags) in self.pkt_infos if len(frags) > 1
+        )
+
+        self.pg_enable_capture()
+        self.pg_if.add_stream(fragments)
+        self.pg_start()
+
+        self.sleep(.1, "wait for vpp to process packets")
+        packets = self.punt_socket.wait_for_packets(
+            len(self.pkt_infos) - len(dropped_packet_indexes))
+        self.verify_capture(packets, dropped_packet_indexes)
+        self.pg_if.assert_nothing_captured()
+
+    def test_timeout_inline(self):
+        """ timeout (inline) """
+
+        dropped_packet_indexes = set(
+            index for (index, frags, _) in self.pkt_infos if len(frags) > 1
+        )
+
+        self.vapi.ip_reassembly_set(timeout_ms=0, max_reassemblies=1000,
+                                    expire_walk_interval_ms=10000, is_ip6=1)
+
+        self.pg_enable_capture()
+        self.pg_if.add_stream(self.fragments_400)
+        self.pg_start()
+
+        packets = self.punt_socket.wait_for_packets(
+            len(self.pkt_infos) - len(dropped_packet_indexes))
+        self.verify_capture(packets, dropped_packet_indexes)
+        # one ICMPv6 time exceeded message is expected per dropped packet,
+        # removing the id from the set ensures each is reported only once
+        pkts = self.pg_if.get_capture(
+            expected_count=len(dropped_packet_indexes))
+        for icmp in pkts:
+            self.assertIn(ICMPv6TimeExceeded, icmp)
+            self.assertIn(IPv6ExtHdrFragment, icmp)
+            self.assertIn(icmp[IPv6ExtHdrFragment].id, dropped_packet_indexes)
+            dropped_packet_indexes.remove(icmp[IPv6ExtHdrFragment].id)
+
+    def test_timeout_cleanup(self):
+        """ timeout (cleanup) """
+
+        # whole packets + fragmented packets sans last fragment
+        fragments = [
+            x for (_, frags_400, _) in self.pkt_infos
+            for x in frags_400[:-1 if len(frags_400) > 1 else None]
+        ]
+
+        # last fragments for fragmented packets
+        fragments2 = [frags_400[-1]
+                      for (_, frags_400, _) in self.pkt_infos
+                      if len(frags_400) > 1]
+
+        dropped_packet_indexes = set(
+            index for (index, frags_400, _) in self.pkt_infos
+            if len(frags_400) > 1)
+
+        # NOTE(review): this call configures IPv4 reassembly (no is_ip6)
+        # in an IPv6 test - confirm whether it is intentional
+        self.vapi.ip_reassembly_set(timeout_ms=100, max_reassemblies=1000,
+                                    expire_walk_interval_ms=50)
+
+        self.vapi.ip_reassembly_set(timeout_ms=100, max_reassemblies=1000,
+                                    expire_walk_interval_ms=50, is_ip6=1)
+
+        self.pg_enable_capture()
+        self.pg_if.add_stream(fragments)
+        self.pg_start()
+
+        self.sleep(.25, "wait before sending rest of fragments")
+
+        self.pg_if.add_stream(fragments2)
+        self.pg_start()
+        self.sleep(.25, "wait for vpp to process packets")
+
+        packets = self.punt_socket.wait_for_packets(
+            len(self.pkt_infos) - len(dropped_packet_indexes))
+        self.verify_capture(packets, dropped_packet_indexes)
+        # one ICMPv6 time exceeded message is expected per dropped packet,
+        # removing the id from the set ensures each is reported only once
+        pkts = self.pg_if.get_capture(
+            expected_count=len(dropped_packet_indexes))
+        for icmp in pkts:
+            self.assertIn(ICMPv6TimeExceeded, icmp)
+            self.assertIn(IPv6ExtHdrFragment, icmp)
+            self.assertIn(icmp[IPv6ExtHdrFragment].id, dropped_packet_indexes)
+            dropped_packet_indexes.remove(icmp[IPv6ExtHdrFragment].id)
+
+    def test_disabled(self):
+        """ reassembly disabled """
+
+        dropped_packet_indexes = set(
+            index for (index, frags_400, _) in self.pkt_infos
+            if len(frags_400) > 1)
+
+        self.vapi.ip_reassembly_set(timeout_ms=1000, max_reassemblies=0,
+                                    expire_walk_interval_ms=10000, is_ip6=1)
+
+        self.pg_enable_capture()
+        self.pg_if.add_stream(self.fragments_400)
+        self.pg_start()
+
+        packets = self.punt_socket.wait_for_packets(
+            len(self.pkt_infos) - len(dropped_packet_indexes))
+        self.verify_capture(packets, dropped_packet_indexes)
+        self.pg_if.assert_nothing_captured()
+
+    def test_missing_upper(self):
+        """ missing upper layer """
+        p = (Ether(dst=self.pg_if.local_mac, src=self.pg_if.remote_mac) /
+             IPv6(src=self.pg_if.remote_ip6,
+                  dst=self.pg_if.local_ip6) /
+             UDP(sport=1234, dport=self.punt_port) /
+             Raw())
+        self.extend_packet(p, 1000, self.padding)
+        fragments = fragment_rfc8200(p, 1, 500)
+        # turn the second fragment into a first fragment (offset 0) with
+        # next header 59 (presumably "no next header" - confirm)
+        bad_fragment = p.__class__(str(fragments[1]))
+        bad_fragment[IPv6ExtHdrFragment].nh = 59
+        bad_fragment[IPv6ExtHdrFragment].offset = 0
+        self.pg_enable_capture()
+        self.pg_if.add_stream([bad_fragment])
+        self.pg_start()
+        pkts = self.pg_if.get_capture(expected_count=1)
+        icmp = pkts[0]
+        self.assertIn(ICMPv6ParamProblem, icmp)
+        self.assert_equal(icmp[ICMPv6ParamProblem].code, 3, "ICMP code")
+
+    def test_invalid_frag_size(self):
+        """ fragment size not a multiple of 8 """
+        p = (Ether(dst=self.pg_if.local_mac, src=self.pg_if.remote_mac) /
+             IPv6(src=self.pg_if.remote_ip6,
+                  dst=self.pg_if.local_ip6) /
+             UDP(sport=1234, dport=self.punt_port) /
+             Raw())
+        self.extend_packet(p, 1000, self.padding)
+        fragments = fragment_rfc8200(p, 1, 500)
+        # grow the first fragment by 5 bytes
+        bad_fragment = fragments[0]
+        self.extend_packet(bad_fragment, len(bad_fragment) + 5)
+        self.pg_enable_capture()
+        self.pg_if.add_stream([bad_fragment])
+        self.pg_start()
+        pkts = self.pg_if.get_capture(expected_count=1)
+        icmp = pkts[0]
+        self.assertIn(ICMPv6ParamProblem, icmp)
+        self.assert_equal(icmp[ICMPv6ParamProblem].code, 0, "ICMP code")
+
+    def test_invalid_packet_size(self):
+        """ total packet size > 65535 """
+        p = (Ether(dst=self.pg_if.local_mac, src=self.pg_if.remote_mac) /
+             IPv6(src=self.pg_if.remote_ip6,
+                  dst=self.pg_if.local_ip6) /
+             UDP(sport=1234, dport=self.punt_port) /
+             Raw())
+        self.extend_packet(p, 1000, self.padding)
+        fragments = fragment_rfc8200(p, 1, 500)
+        bad_fragment = fragments[1]
+        # NOTE(review): confirm how scapy encodes 65500 into the fragment
+        # offset field of the fragment header
+        bad_fragment[IPv6ExtHdrFragment].offset = 65500
+        self.pg_enable_capture()
+        self.pg_if.add_stream([bad_fragment])
+        self.pg_start()
+        pkts = self.pg_if.get_capture(expected_count=1)
+        icmp = pkts[0]
+        self.assertIn(ICMPv6ParamProblem, icmp)
+        self.assert_equal(icmp[ICMPv6ParamProblem].code, 0, "ICMP code")
+
+
+class TestFIFReassembly(VppTestCase):
+    """ Fragments in fragments reassembly """
+
+    @classmethod
+    def setUpClass(cls):
+        super(TestFIFReassembly, cls).setUpClass()
+
+        cls.create_pg_interfaces([0])
+        cls.pg_if = cls.pg0
+        cls.pg_if.admin_up()
+        cls.pg_if.config_ip4()
+        cls.pg_if.resolve_arp()
+        cls.pg_if.config_ip6()
+        cls.pg_if.resolve_ndp()
+
+        # one punt socket per address family, both on the same UDP port
+        cls.punt_port = 9999
+        cls.punt4_socket = VppUDSPuntSocket(cls, cls.punt_port)
+        cls.punt6_socket = VppUDSPuntSocket(cls, cls.punt_port, is_ip4=0)
+        cls.packet_sizes = [64, 512, 1518, 9018]
+        cls.padding = " abcdefghijklmn"
+
+    def setUp(self):
+        """ Test setup - force timeout on existing reassemblies """
+        super(TestFIFReassembly, self).setUp()
+        # zero timeout + short expire walk interval, then wait a while
+        self.vapi.ip_reassembly_set(timeout_ms=0, max_reassemblies=1000,
+                                    expire_walk_interval_ms=10)
+        self.vapi.ip_reassembly_set(timeout_ms=0, max_reassemblies=1000,
+                                    expire_walk_interval_ms=10, is_ip6=1)
+        self.sleep(.25)
+        # long timeout for the test itself
+        self.vapi.ip_reassembly_set(timeout_ms=1000000, max_reassemblies=1000,
+                                    expire_walk_interval_ms=10000)
+        self.vapi.ip_reassembly_set(timeout_ms=1000000, max_reassemblies=1000,
+                                    expire_walk_interval_ms=10000, is_ip6=1)
+
+    def tearDown(self):
+        self.logger.debug(self.vapi.ppcli("show ip4-reassembly details"))
+        self.logger.debug(self.vapi.ppcli("show ip6-reassembly details"))
+        super(TestFIFReassembly, self).tearDown()
+
+    def verify_capture(self, capture, ip_class, dropped_packet_indexes=()):
+        """Verify captured packet stream.
+
+        :param list capture: Captured packet stream - items providing
+            'sw_if_index', 'punt_action' and 'packet' keys.
+        :param ip_class: scapy layer class (IP or IPv6) used to look up the
+            addresses in both captured and saved packets.
+        :param dropped_packet_indexes: Indexes of packets which are expected
+            to be missing from the capture.
+        """
+        info = None
+        seen = set()
+        for packet in capture:
+            try:
+                sw_if_index = packet['sw_if_index']
+                punt_action = packet['punt_action']
+                packet = Ether(packet['packet'])
+                self.logger.debug(ppp("Got packet from %s, action %s" %
+                                      (sw_if_index, punt_action), packet))
+                ip = packet[ip_class]
+                udp = packet[UDP]
+                payload_info = self.payload_to_info(str(packet[Raw]))
+                packet_index = payload_info.index
+                self.assertTrue(
+                    packet_index not in dropped_packet_indexes,
+                    ppp("Packet received, but should be dropped:", packet))
+                if packet_index in seen:
+                    raise Exception(ppp("Duplicate packet received", packet))
+                seen.add(packet_index)
+                self.assertEqual(payload_info.dst, self.pg_if.sw_if_index)
+                info = self._packet_infos[packet_index]
+                self.assertTrue(info is not None)
+                self.assertEqual(packet_index, info.index)
+                saved_packet = info.data
+                self.assertEqual(ip.src, saved_packet[ip_class].src)
+                self.assertEqual(ip.dst, saved_packet[ip_class].dst)
+                self.assertEqual(udp.payload, saved_packet[UDP].payload)
+            except:
+                # log the offending packet, then let the failure propagate
+                self.logger.error(ppp("Unexpected or invalid packet:", packet))
+                raise
+        # every packet must be either received or expected to be dropped
+        for index in self._packet_infos:
+            self.assertTrue(index in seen or index in dropped_packet_indexes,
+                            "Packet with packet_index %d not received" % index)
+
+    def test_fif4(self):
+        """ Fragments in fragments (4o4) """
+
+        # TODO this should be ideally in setUpClass, but then we hit a bug
+        # with VppIpRoute incorrectly reporting it's present when it's not
+        # so we need to manually remove the vpp config, thus we cannot have
+        # it shared for multiple test cases
+        self.tun_ip4 = "1.1.1.2"
+
+        self.gre4 = VppGreInterface(self, self.pg0.local_ip4, self.tun_ip4)
+        self.gre4.add_vpp_config()
+        self.gre4.admin_up()
+        self.gre4.config_ip4()
+
+        self.route4 = VppIpRoute(self, self.tun_ip4, 32,
+                                 [VppRoutePath(self.pg0.remote_ip4,
+                                               self.pg0.sw_if_index)])
+        self.route4.add_vpp_config()
+
+        self.reset_packet_infos()
+        for i in range(test_packet_count):
+            info = self.create_packet_info(self.pg0, self.pg0)
+            payload = self.info_to_payload(info)
+            p = (IP(id=i, src=self.pg0.remote_ip4, dst=self.pg0.local_ip4) /
+                 UDP(sport=1234, dport=self.punt_port) /
+                 Raw(payload))
+            # each size is used for two consecutive packets
+            size = self.packet_sizes[(i // 2) % len(self.packet_sizes)]
+            self.extend_packet(p, size, self.padding)
+            info.data = p
+
+        # inner packets fragmented at 400 bytes ...
+        fragments = [x for _, p in self._packet_infos.iteritems()
+                     for x in fragment_rfc791(p.data, 400)]
+
+        # ... each fragment wrapped into GRE coming from the tunnel peer ...
+        encapped_fragments = \
+            [Ether(dst=self.pg0.local_mac, src=self.pg0.remote_mac) /
+             IP(src=self.tun_ip4, dst=self.pg0.local_ip4) /
+                GRE() /
+                p
+                for p in fragments]
+
+        # ... and the encapsulated packets fragmented again at 200 bytes
+        fragmented_encapped_fragments = \
+            [x for p in encapped_fragments
+             for x in fragment_rfc791(p, 200)]
+
+        self.pg0.add_stream(fragmented_encapped_fragments)
+
+        self.pg_enable_capture(self.pg_interfaces)
+        self.pg_start()
+
+        self.pg0.assert_nothing_captured()
+        packets = self.punt4_socket.wait_for_packets(len(self._packet_infos))
+        self.verify_capture(packets, IP)
+
+        # TODO remove gre vpp config by hand until VppIpRoute gets fixed
+        # so that its query_vpp_config() works as it should
+        self.gre4.remove_vpp_config()
+
+    def test_fif6(self):
+        """ Fragments in fragments (6o6) """
+        # TODO this should be ideally in setUpClass, but then we hit a bug
+        # with VppIpRoute incorrectly reporting it's present when it's not
+        # so we need to manually remove the vpp config, thus we cannot have
+        # it shared for multiple test cases
+        self.tun_ip6 = "1002::1"
+
+        self.gre6 = VppGre6Interface(self, self.pg0.local_ip6, self.tun_ip6)
+        self.gre6.add_vpp_config()
+        self.gre6.admin_up()
+        self.gre6.config_ip6()
+
+        self.route6 = VppIpRoute(self, self.tun_ip6, 128,
+                                 [VppRoutePath(self.pg0.remote_ip6,
+                                               self.pg0.sw_if_index,
+                                               proto=DpoProto.DPO_PROTO_IP6)],
+                                 is_ip6=1)
+        self.route6.add_vpp_config()
+
+        self.reset_packet_infos()
+        for i in range(test_packet_count):
+            info = self.create_packet_info(self.pg0, self.pg0)
+            payload = self.info_to_payload(info)
+            p = (IPv6(src=self.pg0.remote_ip6, dst=self.pg0.local_ip6) /
+                 UDP(sport=1234, dport=self.punt_port) /
+                 Raw(payload))
+            # each size is used for two consecutive packets
+            size = self.packet_sizes[(i // 2) % len(self.packet_sizes)]
+            self.extend_packet(p, size, self.padding)
+            info.data = p
+
+        # inner packets fragmented at 400 bytes ...
+        fragments = [x for _, i in self._packet_infos.iteritems()
+                     for x in fragment_rfc8200(
+                         i.data, i.index, 400)]
+
+        # ... each fragment wrapped into GRE coming from the tunnel peer ...
+        encapped_fragments = \
+            [Ether(dst=self.pg0.local_mac, src=self.pg0.remote_mac) /
+             IPv6(src=self.tun_ip6, dst=self.pg0.local_ip6) /
+                GRE() /
+                p
+                for p in fragments]
+
+        # ... and the encapsulated packets which carry a fragment header
+        # fragmented again at 200 bytes; the outer identification is derived
+        # from the inner one plus an offset of 2 * number of packets
+        fragmented_encapped_fragments = \
+            [x for p in encapped_fragments for x in (
+                fragment_rfc8200(
+                    p,
+                    2 * len(self._packet_infos) + p[IPv6ExtHdrFragment].id,
+                    200)
+                if IPv6ExtHdrFragment in p else [p]
+            )
+            ]
+
+        self.pg0.add_stream(fragmented_encapped_fragments)
+
+        self.pg_enable_capture(self.pg_interfaces)
+        self.pg_start()
+
+        self.pg0.assert_nothing_captured()
+        packets = self.punt6_socket.wait_for_packets(len(self._packet_infos))
+        self.verify_capture(packets, IPv6)
+
+        # TODO remove gre vpp config by hand until VppIpRoute gets fixed
+        # so that its query_vpp_config() works as it should
+        self.gre6.remove_vpp_config()
+
+
+# when executed as a script, run the tests using VppTestRunner
+if __name__ == '__main__':
+    unittest.main(testRunner=VppTestRunner)
index 3e0267a..512bf9e 100644 (file)
@@ -8,10 +8,10 @@ from scapy.layers.inet6 import in6_mactoifaceid
 
 from scapy.layers.l2 import Ether
 from scapy.packet import Raw
-from scapy.layers.inet import IP, UDP, TCP
-from scapy.layers.inet6 import IPv6, ICMPv6Unknown, ICMPv6EchoRequest
-from scapy.packet import Packet
-from socket import inet_pton, AF_INET, AF_INET6
+from scapy.layers.inet import IP
+from scapy.layers.inet6 import IPv6, IPv6ExtHdrFragment, IPv6ExtHdrRouting,\
+    IPv6ExtHdrHopByHop
+from socket import AF_INET6
 
 
 def ppp(headline, packet):
@@ -37,8 +37,7 @@ def ppc(headline, capture, limit=10):
     tail = ""
     if limit < len(capture):
         tail = "\nPrint limit reached, %s out of %s packets printed" % (
-            len(capture), limit)
-        limit = len(capture)
+            limit, len(capture))
     body = "".join([ppp("Packet #%s:" % count, p)
                     for count, p in zip(range(0, limit), capture)])
     return "%s\n%s%s" % (headline, body, tail)
@@ -174,6 +173,7 @@ class ForeignAddressFactory(object):
 
 class L4_Conn():
     """ L4 'connection' tied to two VPP interfaces """
+
     def __init__(self, testcase, if1, if2, af, l4proto, port1, port2):
         self.testcase = testcase
         self.ifs = [None, None]
@@ -189,7 +189,7 @@ class L4_Conn():
     def pkt(self, side, l4args={}, payload="x"):
         is_ip6 = 1 if self.address_family == AF_INET6 else 0
         s0 = side
-        s1 = 1-side
+        s1 = 1 - side
         src_if = self.ifs[s0]
         dst_if = self.ifs[s1]
         layer_3 = [IP(src=src_if.remote_ip4, dst=dst_if.remote_ip4),
@@ -208,7 +208,7 @@ class L4_Conn():
             l4args['flags'] = flags
         self.ifs[side].add_stream(self.pkt(side,
                                            l4args=l4args, payload=payload))
-        self.ifs[1-side].enable_capture()
+        self.ifs[1 - side].enable_capture()
         self.testcase.pg_start()
 
     def recv(self, side):
@@ -217,15 +217,190 @@ class L4_Conn():
 
     def send_through(self, side, flags=None, payload=""):
         self.send(side, flags, payload)
-        p = self.recv(1-side)
+        p = self.recv(1 - side)
         return p
 
     def send_pingpong(self, side, flags1=None, flags2=None):
         p1 = self.send_through(side, flags1)
-        p2 = self.send_through(1-side, flags2)
+        p2 = self.send_through(1 - side, flags2)
         return [p1, p2]
 
 
 class L4_CONN_SIDE:
     L4_CONN_SIDE_ZERO = 0
     L4_CONN_SIDE_ONE = 1
+
+
+class LoggerWrapper(object):  # proxy which tolerates a missing logger
+    def __init__(self, logger=None):
+        self._logger = logger  # wrapped logger, falsy means "do not log"
+
+    def debug(self, *args, **kwargs):
+        if self._logger:  # drop the message when no logger was supplied
+            self._logger.debug(*args, **kwargs)
+
+    def error(self, *args, **kwargs):
+        if self._logger:  # drop the message when no logger was supplied
+            self._logger.error(*args, **kwargs)
+
+
+def fragment_rfc791(packet, fragsize, _logger=None):
+    """
+    Fragment an IPv4 packet per RFC 791
+    :param packet: packet to fragment
+    :param fragsize: size at which to fragment
+    :param _logger: optional logger used for debug output
+    :note: IP options are not supported
+    :raises Exception: on IP options or if no payload fits into fragsize
+    :returns: list of fragments
+    """
+    logger = LoggerWrapper(_logger)
+    logger.debug(ppp("Fragmenting packet:", packet))
+    packet = packet.__class__(str(packet))  # recalculate all values
+    if len(packet[IP].options) > 0:
+        raise Exception("Not implemented")
+    if len(packet) <= fragsize:
+        return [packet]
+
+    pre_ip_len = len(packet) - len(packet[IP])
+    ip_header_len = packet[IP].ihl * 4
+    hex_packet = str(packet)
+    hex_headers = hex_packet[:(pre_ip_len + ip_header_len)]
+    hex_payload = hex_packet[(pre_ip_len + ip_header_len):]
+
+    otl = len(packet[IP])
+    # number of 8-byte fragment blocks which fit into one fragment
+    nfb = (fragsize - pre_ip_len - ip_header_len) // 8
+    if nfb <= 0:
+        # no progress possible, recursion below would never terminate
+        raise Exception("Cannot fragment this packet - MTU too small")
+    fo = packet[IP].frag
+
+    first = packet.__class__(hex_headers + hex_payload[:nfb * 8])
+    first[IP].flags = "MF"
+    first[IP].frag = fo
+    first[IP].len = ip_header_len + nfb * 8
+    del first[IP].chksum
+
+    rest = packet.__class__(hex_headers + hex_payload[nfb * 8:])
+    rest[IP].len = otl - nfb * 8
+    rest[IP].frag = fo + nfb
+    del rest[IP].chksum
+
+    return [first] + fragment_rfc791(rest, fragsize, _logger)
+
+
+def fragment_rfc8200(packet, identification, fragsize, _logger=None):
+    """
+    Fragment an IPv6 packet per RFC 8200
+    :param packet: packet to fragment
+    :param identification: value for the id field of the fragment headers
+    :param fragsize: size at which to fragment
+    :param _logger: optional logger used for debug output
+    :note: headers up to the routing header (or hop-by-hop header if there
+           is no routing header) are repeated in each fragment
+    :raises Exception: if already fragmented, if there is no upper layer
+                       header or if fragsize is too small
+    :returns: list of fragments
+    """
+    logger = LoggerWrapper(_logger)
+    packet = packet.__class__(str(packet))  # recalculate all values
+    if len(packet) <= fragsize:
+        return [packet]
+    logger.debug(ppp("Fragmenting packet:", packet))
+    pkts = []
+    counter = 0
+    routing_hdr = None
+    hop_by_hop_hdr = None
+    upper_layer = None
+    seen_ipv6 = False
+    ipv6_nr = -1
+    l = packet.getlayer(counter)
+    while l is not None:
+        if l.__class__ is IPv6:
+            if seen_ipv6:
+                # ignore 2nd IPv6 header and everything below..
+                break
+            ipv6_nr = counter
+            seen_ipv6 = True
+        elif l.__class__ is IPv6ExtHdrFragment:
+            raise Exception("Already fragmented")
+        elif l.__class__ is IPv6ExtHdrRouting:
+            routing_hdr = counter
+        elif l.__class__ is IPv6ExtHdrHopByHop:
+            hop_by_hop_hdr = counter
+        elif seen_ipv6 and not upper_layer and \
+                not l.__class__.__name__.startswith('IPv6ExtHdr'):
+            upper_layer = counter
+        counter = counter + 1
+        l = packet.getlayer(counter)
+
+    logger.debug(
+        "Layers seen: IPv6(#%s), Routing(#%s), HopByHop(#%s), upper(#%s)" %
+        (ipv6_nr, routing_hdr, hop_by_hop_hdr, upper_layer))
+
+    if upper_layer is None:
+        raise Exception("Upper layer header not found in IPv6 packet")
+
+    last_per_fragment_hdr = ipv6_nr
+    if routing_hdr is None:
+        if hop_by_hop_hdr is not None:
+            last_per_fragment_hdr = hop_by_hop_hdr
+    else:
+        last_per_fragment_hdr = routing_hdr
+    logger.debug("Last per-fragment hdr is #%s" % (last_per_fragment_hdr))
+
+    per_fragment_headers = packet.copy()
+    per_fragment_headers[last_per_fragment_hdr].remove_payload()
+    logger.debug(ppp("Per-fragment headers:", per_fragment_headers))
+
+    ext_and_upper_layer = packet.getlayer(last_per_fragment_hdr)[1]
+    hex_payload = str(ext_and_upper_layer)
+    logger.debug("Payload length is %s" % len(hex_payload))
+    logger.debug(ppp("Ext and upper layer:", ext_and_upper_layer))
+
+    fragment_ext_hdr = IPv6ExtHdrFragment()
+    logger.debug(ppp("Fragment header:", fragment_ext_hdr))
+
+    if len(per_fragment_headers) + len(fragment_ext_hdr) +\
+            len(ext_and_upper_layer) - len(ext_and_upper_layer.payload)\
+            > fragsize:
+        raise Exception("Cannot fragment this packet - MTU too small "
+                        "(%s, %s, %s, %s, %s)" % (
+                            len(per_fragment_headers), len(fragment_ext_hdr),
+                            len(ext_and_upper_layer),
+                            len(ext_and_upper_layer.payload), fragsize))
+
+    # the fragment header takes over the next header value of the last
+    # per-fragment header, whose own value is reset to get recalculated
+    orig_nh = packet[last_per_fragment_hdr].nh
+    del per_fragment_headers[IPv6].plen
+    del per_fragment_headers[IPv6].nh
+    del per_fragment_headers[last_per_fragment_hdr].nh
+    headers = per_fragment_headers / fragment_ext_hdr
+    del headers[IPv6ExtHdrFragment].nh
+
+    # all fragments but the last one carry a multiple of 8 bytes
+    l_nfb = (fragsize - len(headers)) // 8
+    if l_nfb <= 0:
+        # offset would never advance and the loop below would not end
+        raise Exception("Cannot fragment this packet - MTU too small "
+                        "(%s, %s)" % (len(headers), fragsize))
+
+    # do-while style loop - the first fragment is always created, even if
+    # there is no payload to put into it
+    offset = 0
+    while offset == 0 or len(hex_payload) > offset:
+        p = headers / Raw(hex_payload[offset:offset + l_nfb * 8])
+        p[IPv6ExtHdrFragment].nh = orig_nh
+        p[IPv6ExtHdrFragment].id = identification
+        p[IPv6ExtHdrFragment].offset = offset // 8
+        p[IPv6ExtHdrFragment].m = 1
+        p = p.__class__(str(p))
+        logger.debug(ppp("Fragment %s:" % len(pkts), p))
+        pkts.append(p)
+        offset = offset + l_nfb * 8
+
+    pkts[-1][IPv6ExtHdrFragment].m = 0  # reset more-flags in last fragment
+
+    return pkts
index acfd348..998f6ea 100644 (file)
@@ -26,14 +26,21 @@ class VppGreInterface(VppInterface):
                                               is_teb=self.t_is_teb)
         self._sw_if_index = r.sw_if_index
         self.generate_remote_hosts()
+        self._test.registry.register(self, self._test.logger)
 
     def remove_vpp_config(self):
         s = socket.inet_pton(socket.AF_INET, self.t_src)
         d = socket.inet_pton(socket.AF_INET, self.t_dst)
         self.unconfig()
-        r = self.test.vapi.gre_tunnel_add_del(s, d,
-                                              outer_fib_id=self.t_outer_fib,
-                                              is_add=0)
+        self.test.vapi.gre_tunnel_add_del(s, d,
+                                          outer_fib_id=self.t_outer_fib,
+                                          is_add=0)
+
+    def __str__(self):  # printable form is the same as object_id()
+        return self.object_id()
+
+    def object_id(self):  # identifier of the form "gre-<sw_if_index>"
+        return "gre-%d" % self._sw_if_index
 
 
 class VppGre6Interface(VppInterface):
@@ -60,12 +67,19 @@ class VppGre6Interface(VppInterface):
                                               is_ip6=1)
         self._sw_if_index = r.sw_if_index
         self.generate_remote_hosts()
+        self._test.registry.register(self, self._test.logger)
 
     def remove_vpp_config(self):
         s = socket.inet_pton(socket.AF_INET6, self.t_src)
         d = socket.inet_pton(socket.AF_INET6, self.t_dst)
         self.unconfig()
-        r = self.test.vapi.gre_tunnel_add_del(s, d,
-                                              outer_fib_id=self.t_outer_fib,
-                                              is_add=0,
-                                              is_ip6=1)
+        self.test.vapi.gre_tunnel_add_del(s, d,
+                                          outer_fib_id=self.t_outer_fib,
+                                          is_add=0,
+                                          is_ip6=1)
+
+    def __str__(self):  # printable form is the same as object_id()
+        return self.object_id()
+
+    def object_id(self):  # identifier of the form "gre-<sw_if_index>"
+        return "gre-%d" % self._sw_if_index
index b8505ce..8891dde 100644 (file)
@@ -372,3 +372,15 @@ class VppInterface(object):
         self.test.vapi.proxy_arp_intfc_enable_disable(
             self.sw_if_index,
             enable)
+
+    def query_vpp_config(self):  # True if VPP's interface dump lists us
+        return self.is_interface_config_in_dump(
+            self.test.vapi.sw_interface_dump())
+
+    def is_interface_config_in_dump(self, dump):
+        """ Check if the dump contains an entry which matches both name
+        and sw_if_index of this interface """
+        return any(
+            entry.interface_name.rstrip(' \t\r\n\0') == self.name and
+            entry.sw_if_index == self.sw_if_index
+            for entry in dump)
index 963123f..ae4111b 100644 (file)
@@ -19,17 +19,5 @@ class VppLoInterface(VppInterface, VppObject):
     def remove_vpp_config(self):
         self.test.vapi.delete_loopback(self.sw_if_index)
 
-    def query_vpp_config(self):
-        dump = self.test.vapi.sw_interface_dump()
-        return self.is_interface_config_in_dump(dump)
-
-    def is_interface_config_in_dump(self, dump):
-        for i in dump:
-            if i.interface_name.rstrip(' \t\r\n\0') == self.name and \
-               i.sw_if_index == self.sw_if_index:
-                return True
-        else:
-            return False
-
     def object_id(self):
         return "loopback-%d" % self._sw_if_index
index 0269736..087a14b 100644 (file)
@@ -3102,3 +3102,26 @@ class VppPapiProvider(object):
              'ip6_fib_id': ip6_fib_id,
              'namespace_id': namespace_id,
              'namespace_id_len': len(namespace_id)})
+
+    def punt_socket_register(self, l4_port, pathname, header_version=1,
+                             is_ip4=1, l4_protocol=0x11):
+        """ Register pathname as punt socket for given L4 protocol/port """
+        args = {'is_ip4': is_ip4,
+                'l4_protocol': l4_protocol,
+                'l4_port': l4_port,
+                'pathname': pathname,
+                'header_version': header_version}
+        return self.api(self.papi.punt_socket_register, args)
+
+    def ip_reassembly_set(self, timeout_ms, max_reassemblies,
+                          expire_walk_interval_ms, is_ip6=0):
+        """ Configure IP reassembly via the ip_reassembly_set API """
+        args = {'is_ip6': is_ip6,
+                'timeout_ms': timeout_ms,
+                'expire_walk_interval_ms': expire_walk_interval_ms,
+                'max_reassemblies': max_reassemblies}
+        return self.api(self.papi.ip_reassembly_set, args)
+
+    def ip_reassembly_get(self, is_ip6=0):
+        """ Get IP reassembly parameters via the ip_reassembly_get API """
+        return self.api(self.papi.ip_reassembly_get, {'is_ip6': is_ip6})
diff --git a/test/vpp_punt_socket.py b/test/vpp_punt_socket.py
new file mode 100644 (file)
index 0000000..5004b23
--- /dev/null
@@ -0,0 +1,75 @@
+from socket import socket, AF_UNIX, SOCK_DGRAM
+from select import select
+from time import time
+from struct import unpack, calcsize
+from util import ppc
+from scapy.layers.l2 import Ether
+
+client_uds_socket_name = "client-uds-socket"
+vpp_uds_socket_name = "vpp-uds-socket"
+
+VPP_PUNT_HEADER_FMT = '=Ii'
+VPP_PUNT_HEADER_SIZE = calcsize(VPP_PUNT_HEADER_FMT)
+
+
+class VppPuntAction:  # NOTE(review): presumably punt_action header values
+    PUNT_L2 = 0
+    PUNT_IP4_ROUTED = 1
+    PUNT_IP6_ROUTED = 2
+
+
+class VppUDSPuntSocket(object):
+    """ Test-side endpoint of the VPP punt socket (unix datagram) """
+
+    def __init__(self, testcase, port, is_ip4=1, l4_protocol=0x11):
+        """ Register port for punting and connect to VPP punt socket """
+        client_path = '%s/%s-%s-%s' % (testcase.tempdir,
+                                       client_uds_socket_name,
+                                       "4" if is_ip4 else "6", port)
+        testcase.vapi.punt_socket_register(
+            port, client_path, is_ip4=is_ip4, l4_protocol=l4_protocol)
+        self.testcase = testcase
+        self.uds = socket(AF_UNIX, SOCK_DGRAM)
+        self.uds.bind(client_path)
+        self.uds.connect(testcase.punt_socket_path)
+
+    def _receive(self, timeout, limit=None):
+        """ Read datagrams until timeout expires or limit is reached """
+        datagrams = []
+        now = time()
+        deadline = now + timeout
+        while (limit is None or len(datagrams) < limit) and now < deadline:
+            r, w, e = select([self.uds], [], [self.uds], deadline - now)
+            if self.uds in r:
+                datagrams.append(self.uds.recv(1024 * 1024))
+            if self.uds in e:
+                raise Exception("select() indicates error on UDS socket")
+            now = time()
+        return datagrams
+
+    def wait_for_packets(self, count, timeout=1):
+        """ Return count punted packets, raise if they do not arrive """
+        packets = []
+        for x in self._receive(timeout, limit=count):
+            sw_if_index, punt_action = unpack(
+                VPP_PUNT_HEADER_FMT, x[:VPP_PUNT_HEADER_SIZE])
+            packets.append({'sw_if_index': sw_if_index,
+                            'punt_action': punt_action,
+                            'packet': x[VPP_PUNT_HEADER_SIZE:]})
+
+        if len(packets) != count:
+            raise Exception("Unexpected packet count received, got %s packets,"
+                            " expected %s packets" % (len(packets), count))
+        self.testcase.logger.debug(
+            "Got %s packets via punt socket" % len(packets))
+        return packets
+
+    def assert_nothing_captured(self, timeout=.25):
+        """ Raise if any packet is punted to us within timeout """
+        packets = [Ether(x[VPP_PUNT_HEADER_SIZE:])
+                   for x in self._receive(timeout)]
+        if len(packets) > 0:
+            self.testcase.logger.error(
+                ppc("Unexpected packets captured:", packets))
+            raise Exception("Unexpected packet count received, got %s packets,"
+                            " expected no packets" % len(packets))