ip: add extended shallow reassembly 39/40839/8
author Klement Sekera <[email protected]>
Fri, 19 Apr 2024 07:15:01 +0000 (09:15 +0200)
committer Klement Sekera <[email protected]>
Mon, 7 Oct 2024 14:39:42 +0000 (16:39 +0200)
This patch adds some fixes and improvements:

Fixes a bug where save_rewrite_length gets overwritten on reassembly
handoff.

Fixes a bug where duplicate fragments could cause a reassembly context
to be lost: the thread losing the race would remove the bihash entry
created by the winning thread.

Improves tracing by adding more events.

Adds extended shallow reassembly. This is a toggleable option which,
when enabled, makes reassembly wait for both the first and the last
fragment so that it can calculate the total IP payload length. It also
stores a local copy of the first fragment, along with the data needed
to retrieve it, in vnet_buffer2. This allows downstream features to
access the full L3/L4 headers when dealing with fragments.
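
For illustration, a downstream feature could consume this data roughly
as sketched below. This sketch is not part of the patch: the helper
name is invented, it assumes visibility into the reassembly types, the
id comparison is truncated to 32 bits to match the buffer metadata,
and locking is omitted for brevity.

/*
 * Hypothetical consumer sketch - NOT part of this change. Uses the
 * vnet_buffer2 fields added below to fetch the cached copy of the
 * first fragment.
 */
static vlib_buffer_t *
example_get_first_fragment (vlib_main_t *vm, vlib_buffer_t *b)
{
  u32 thread_index = vnet_buffer2 (b)->ip.reass.thread_index;
  u32 pool_index = vnet_buffer2 (b)->ip.reass.pool_index;

  /* fields hold ~0 unless extended reassembly populated them */
  if (~0 == pool_index)
    return 0;

  ip4_sv_reass_per_thread_t *rt =
    &ip4_sv_reass_main.per_thread_data[thread_index];
  if (pool_is_free_index (rt->pool, pool_index))
    return 0;

  ip4_sv_reass_t *reass = pool_elt_at_index (rt->pool, pool_index);
  /* guard against the context having been freed and reused */
  if ((u32) reass->id != vnet_buffer2 (b)->ip.reass.id ||
      ~0 == reass->first_fragment_clone_bi)
    return 0;

  /* clone of the first fragment - full L3/L4 headers are visible */
  return vlib_get_buffer (vm, reass->first_fragment_clone_bi);
}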

Type: fix
Change-Id: I81695070533410c5815291dbc65ea71c87e3ae05
Signed-off-by: Klement Sekera <[email protected]>
src/plugins/map/ip6_map_t.c
src/vnet/buffer.h
src/vnet/ip/ip6_to_ip4.h
src/vnet/ip/reass/ip4_sv_reass.c
src/vnet/ip/reass/ip4_sv_reass.h
src/vnet/ip/reass/ip6_sv_reass.c
src/vnet/ip/reass/ip6_sv_reass.h

src/plugins/map/ip6_map_t.c
index 51853d6..f8d894a 100644
@@ -151,9 +151,8 @@ ip6_map_t_icmp (vlib_main_t * vm,
                               vnet_buffer (p0)->map_t.map_domain_index);
          ctx0.d = d0;
          ctx0.sender_port = 0;
-         if (!ip6_get_port
-             (vm, p0, ip60, p0->current_length, NULL, &ctx0.sender_port,
-              NULL, NULL, NULL, NULL))
+         if (!ip6_get_port (vm, p0, ip60, p0->current_length, NULL,
+                            &ctx0.sender_port, NULL, NULL, NULL, NULL, NULL))
            {
              // In case of 1:1 mapping, we don't care about the port
              if (!(d0->ea_bits_len == 0 && d0->rules))
src/vnet/buffer.h
index e60b8ff..247af56 100644
@@ -241,7 +241,8 @@ typedef struct
                u8 ip_proto;    /* protocol in ip header */
                u8 icmp_type_or_tcp_flags;
                u8 is_non_first_fragment : 1;
-               u8 l4_layer_truncated : 7;
+               u8 l4_hdr_truncated : 1;
+               u8 unused : 6;
                u32 tcp_seq_number;
              };
              /* full reassembly output variables */
@@ -492,7 +493,22 @@ typedef struct
     };
   } nat;
 
-  u32 unused[8];
+  struct
+  {
+    /*
+     * Shallow virtual reassembly output values.
+     * Only populated if extended reassembly enabled via
+     * ipX_sv_reass_enable_disable_extended().
+     */
+    struct
+    {
+      u32 thread_index;
+      u32 pool_index;
+      u32 id;
+    } reass;
+  } ip;
+
+  u32 unused[5];
 } vnet_buffer_opaque2_t;
 
 #define vnet_buffer2(b) ((vnet_buffer_opaque2_t *) (b)->opaque2)
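
The new ip.reass block consumes three of the eight previously unused
opaque2 words and acts as a stable handle (owning thread, pool slot,
generation id) to the reassembly context. Features must opt in before
these fields are populated; a minimal sketch follows, assuming the
entry point named in the reassembly header below takes a bool - its
exact signature is not shown in this diff.

/* sketch only - the bool argument is an assumption */
static void
example_feature_enable (void)
{
  /* bumps extended_refcount; the sv-reassembly feature nodes then
   * take the extended path and fill in vnet_buffer2 (b)->ip.reass */
  ip4_sv_reass_enable_disable_extended (true /* is_enable */);
}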
src/vnet/ip/ip6_to_ip4.h
index 29d5718..ebabcd0 100644
@@ -96,10 +96,10 @@ ip6_parse (vlib_main_t *vm, vlib_buffer_t *b, ip6_header_t *ip6, u32 buff_len,
  * @returns 1 on success, 0 otherwise.
  */
 always_inline u16
-ip6_get_port (vlib_main_t * vm, vlib_buffer_t * b, ip6_header_t * ip6,
-             u16 buffer_len, u8 * ip_protocol, u16 * src_port,
-             u16 * dst_port, u8 * icmp_type_or_tcp_flags,
-             u32 * tcp_ack_number, u32 * tcp_seq_number)
+ip6_get_port (vlib_main_t *vm, vlib_buffer_t *b, ip6_header_t *ip6,
+             u16 buffer_len, u8 *ip_protocol, u16 *src_port, u16 *dst_port,
+             u8 *icmp_type_or_tcp_flags, u32 *tcp_ack_number,
+             u32 *tcp_seq_number, void **l4_hdr)
 {
   u8 l4_protocol;
   u16 l4_offset;
@@ -120,8 +120,19 @@ ip6_get_port (vlib_main_t * vm, vlib_buffer_t * b, ip6_header_t * ip6,
       *ip_protocol = l4_protocol;
     }
   l4 = u8_ptr_add (ip6, l4_offset);
+  if (l4_hdr)
+    *l4_hdr = l4;
   if (l4_protocol == IP_PROTOCOL_TCP || l4_protocol == IP_PROTOCOL_UDP)
     {
+      if ((IP_PROTOCOL_UDP == l4_protocol &&
+          u8_ptr_add (l4, sizeof (udp_header_t)) >
+            u8_ptr_add (vlib_buffer_get_current (b), b->current_length)) ||
+         (IP_PROTOCOL_TCP == l4_protocol &&
+          u8_ptr_add (l4, sizeof (tcp_header_t)) >
+            u8_ptr_add (vlib_buffer_get_current (b), b->current_length)))
+       {
+         return 0;
+       }
       if (src_port)
        *src_port = ((udp_header_t *) (l4))->src_port;
       if (dst_port)
@@ -135,6 +146,11 @@ ip6_get_port (vlib_main_t * vm, vlib_buffer_t * b, ip6_header_t * ip6,
     }
   else if (l4_protocol == IP_PROTOCOL_ICMP6)
     {
+      if (u8_ptr_add (l4, sizeof (icmp46_header_t)) >
+         u8_ptr_add (vlib_buffer_get_current (b), b->current_length))
+       {
+         return 0;
+       }
       icmp46_header_t *icmp = (icmp46_header_t *) (l4);
       if (icmp_type_or_tcp_flags)
        *icmp_type_or_tcp_flags = ((icmp46_header_t *) (l4))->type;
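
With these checks, ip6_get_port() now returns 0 instead of reading an
L4 header that would extend past the buffer's current_length, and the
new trailing out-parameter optionally returns a pointer to the L4
header. A call sketch (variable names invented, signature as above):

u16 src_port = 0, dst_port = 0;
void *l4_hdr = 0;
if (!ip6_get_port (vm, b, ip6, b->current_length, NULL, &src_port,
                   &dst_port, NULL, NULL, NULL, &l4_hdr))
  {
    /* L4 header truncated or unsupported protocol - ports not valid */
  }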
src/vnet/ip/reass/ip4_sv_reass.c
index ad8f178..50b4b22 100644
 #include <vppinfra/bihash_16_8.h>
 #include <vnet/ip/reass/ip4_sv_reass.h>
 
-#define MSEC_PER_SEC 1000
+#define MSEC_PER_SEC                   1000
 #define IP4_SV_REASS_TIMEOUT_DEFAULT_MS 100
-#define IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000     // 10 seconds default
-#define IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT 1024
+#define IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS                          \
+  10000 // 10 seconds default
+#define IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT     1024
 #define IP4_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
-#define IP4_SV_REASS_HT_LOAD_FACTOR (0.75)
+#define IP4_SV_REASS_HT_LOAD_FACTOR               (0.75)
 
 typedef enum
 {
@@ -94,17 +95,23 @@ typedef struct
   // buffer indexes of buffers in this reassembly in chronological order -
   // including overlaps and duplicate fragments
   u32 *cached_buffers;
-  // set to true when this reassembly is completed
-  bool is_complete;
-  // ip protocol
+
+  bool first_fragment_seen;
+  bool last_fragment_seen;
+
+  // vnet_buffer data
   u8 ip_proto;
   u8 icmp_type_or_tcp_flags;
   u32 tcp_ack_number;
   u32 tcp_seq_number;
-  // l4 src port
   u16 l4_src_port;
-  // l4 dst port
   u16 l4_dst_port;
+
+  // vnet_buffer2 data
+  u32 total_ip_payload_length;
+  u32 first_fragment_total_ip_header_length;
+  u32 first_fragment_clone_bi;
+
   u32 next_index;
   // lru indexes
   u32 lru_prev;
@@ -114,13 +121,11 @@ typedef struct
 typedef struct
 {
   ip4_sv_reass_t *pool;
-  u32 reass_n;
   u32 id_counter;
   clib_spinlock_t lock;
   // lru indexes
   u32 lru_first;
   u32 lru_last;
-
 } ip4_sv_reass_per_thread_t;
 
 typedef struct
@@ -143,8 +148,6 @@ typedef struct
   vlib_main_t *vlib_main;
   vnet_main_t *vnet_main;
 
-  // node index of ip4-drop node
-  u32 ip4_drop_idx;
   u32 ip4_sv_reass_expire_node_idx;
 
   /** Worker handoff */
@@ -159,6 +162,8 @@ typedef struct
   // reference count for enabling/disabling feature - per interface
   u32 *output_feature_use_refcount_per_intf;
 
+  // extended reassembly refcount - see ip4_sv_reass_enable_disable_extended()
+  u32 extended_refcount;
 } ip4_sv_reass_main_t;
 
 extern ip4_sv_reass_main_t ip4_sv_reass_main;
@@ -178,9 +183,15 @@ typedef enum
 typedef enum
 {
   REASS_FRAGMENT_CACHE,
-  REASS_FINISH,
+  REASS_FIRST_FRAG,
+  REASS_LAST_FRAG,
   REASS_FRAGMENT_FORWARD,
   REASS_PASSTHROUGH,
+  REASS_HANDOFF,
+  REASS_KEY,
+  REASS_FREE_TIMEOUT,
+  REASS_FREE_LRU,
+  REASS_FREE_ERROR,
 } ip4_sv_reass_trace_operation_e;
 
 typedef struct
@@ -191,19 +202,23 @@ typedef struct
   u8 ip_proto;
   u16 l4_src_port;
   u16 l4_dst_port;
-  int l4_layer_truncated;
+  int l4_hdr_truncated;
+  u32 handoff_thread_index;
+  clib_bihash_kv_16_8_t kv;
 } ip4_sv_reass_trace_t;
 
 extern vlib_node_registration_t ip4_sv_reass_node;
 extern vlib_node_registration_t ip4_sv_reass_node_feature;
 
 static u8 *
-format_ip4_sv_reass_trace (u8 * s, va_list * args)
+format_ip4_sv_reass_trace (u8 *s, va_list *args)
 {
   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
   ip4_sv_reass_trace_t *t = va_arg (*args, ip4_sv_reass_trace_t *);
-  if (REASS_PASSTHROUGH != t->action)
+  if (REASS_PASSTHROUGH != t->action && REASS_HANDOFF != t->action &&
+      REASS_KEY != t->action && REASS_FREE_TIMEOUT != t->action &&
+      REASS_FREE_LRU != t->action && REASS_FREE_ERROR != t->action)
     {
       s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
     }
@@ -212,25 +227,42 @@ format_ip4_sv_reass_trace (u8 * s, va_list * args)
     case REASS_FRAGMENT_CACHE:
       s = format (s, "[cached]");
       break;
-    case REASS_FINISH:
+    case REASS_FIRST_FRAG:
       s =
-       format (s, "[finish, ip proto=%u, src_port=%u, dst_port=%u]",
+       format (s, "[first-frag-seen, ip proto=%u, src_port=%u, dst_port=%u]",
                t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
                clib_net_to_host_u16 (t->l4_dst_port));
       break;
+    case REASS_LAST_FRAG:
+      s = format (s, "[last-frag-seen]");
+      break;
+    case REASS_HANDOFF:
+      s = format (s, "[handoff, thread index: %u]", t->handoff_thread_index);
+      break;
+    case REASS_KEY:
+      s = format (s, "[lookup, key: %U]", format_bihash_kvp_16_8, &t->kv);
+      break;
+    case REASS_FREE_LRU:
+      s = format (s, "[free, LRU pressure]");
+      break;
+    case REASS_FREE_TIMEOUT:
+      s = format (s, "[free, timed out]");
+      break;
+    case REASS_FREE_ERROR:
+      s = format (s, "[free, error occurred]");
+      break;
     case REASS_FRAGMENT_FORWARD:
-      s =
-       format (s, "[forward, ip proto=%u, src_port=%u, dst_port=%u]",
-               t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
-               clib_net_to_host_u16 (t->l4_dst_port));
+      s = format (s, "[forward, ip proto=%u, src_port=%u, dst_port=%u]",
+                 t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
+                 clib_net_to_host_u16 (t->l4_dst_port));
       break;
     case REASS_PASSTHROUGH:
       s = format (s, "[not-fragmented]");
       break;
     }
-  if (t->l4_layer_truncated)
+  if (t->l4_hdr_truncated)
     {
-      s = format (s, " [l4-layer-truncated]");
+      s = format (s, " [l4-hdr-truncated]");
     }
   return s;
 }
@@ -239,12 +271,12 @@ static void
 ip4_sv_reass_add_trace (vlib_main_t *vm, vlib_node_runtime_t *node,
                        ip4_sv_reass_t *reass, u32 bi,
                        ip4_sv_reass_trace_operation_e action, u32 ip_proto,
-                       u16 l4_src_port, u16 l4_dst_port,
-                       int l4_layer_truncated)
+                       u16 l4_src_port, u16 l4_dst_port, int l4_hdr_truncated,
+                       u32 handoff_thread_index)
 {
   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
-  if (pool_is_free_index
-      (vm->trace_main.trace_buffer_pool, vlib_buffer_get_trace_index (b)))
+  if (pool_is_free_index (vm->trace_main.trace_buffer_pool,
+                         vlib_buffer_get_trace_index (b)))
     {
       // this buffer's trace is gone
       b->flags &= ~VLIB_BUFFER_IS_TRACED;
@@ -261,7 +293,8 @@ ip4_sv_reass_add_trace (vlib_main_t *vm, vlib_node_runtime_t *node,
   t->ip_proto = ip_proto;
   t->l4_src_port = l4_src_port;
   t->l4_dst_port = l4_dst_port;
-  t->l4_layer_truncated = l4_layer_truncated;
+  t->l4_hdr_truncated = l4_hdr_truncated;
+  t->handoff_thread_index = handoff_thread_index;
 #if 0
   static u8 *s = NULL;
   s = format (s, "%U", format_ip4_sv_reass_trace, NULL, NULL, t);
@@ -271,29 +304,56 @@ ip4_sv_reass_add_trace (vlib_main_t *vm, vlib_node_runtime_t *node,
 #endif
 }
 
+static void
+ip4_sv_reass_trace_timeout (vlib_main_t *vm, vlib_node_runtime_t *node,
+                           ip4_sv_reass_t *reass, u32 bi)
+{
+  return ip4_sv_reass_add_trace (vm, node, reass, bi, REASS_FREE_TIMEOUT, ~0,
+                                ~0, ~0, 0, ~0);
+}
+
+static void
+ip4_sv_reass_trace_lru_free (vlib_main_t *vm, vlib_node_runtime_t *node,
+                            ip4_sv_reass_t *reass, u32 bi)
+{
+  return ip4_sv_reass_add_trace (vm, node, reass, bi, REASS_FREE_LRU, ~0, ~0,
+                                ~0, 0, ~0);
+}
+
+static void
+ip4_sv_reass_trace_error_free (vlib_main_t *vm, vlib_node_runtime_t *node,
+                              ip4_sv_reass_t *reass, u32 bi)
+{
+  return ip4_sv_reass_add_trace (vm, node, reass, bi, REASS_FREE_ERROR, ~0, ~0,
+                                ~0, 0, ~0);
+}
 
 always_inline void
-ip4_sv_reass_free (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
-                  ip4_sv_reass_per_thread_t * rt, ip4_sv_reass_t * reass)
+ip4_sv_reass_free (vlib_main_t *vm, ip4_sv_reass_main_t *rm,
+                  ip4_sv_reass_per_thread_t *rt, ip4_sv_reass_t *reass,
+                  bool del_bihash)
 {
-  clib_bihash_kv_16_8_t kv;
-  kv.key[0] = reass->key.as_u64[0];
-  kv.key[1] = reass->key.as_u64[1];
-  clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
+  if (del_bihash)
+    {
+      clib_bihash_kv_16_8_t kv;
+      kv.key[0] = reass->key.as_u64[0];
+      kv.key[1] = reass->key.as_u64[1];
+      clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
+    }
   vlib_buffer_free (vm, reass->cached_buffers,
                    vec_len (reass->cached_buffers));
   vec_free (reass->cached_buffers);
   reass->cached_buffers = NULL;
+  if (~0 != reass->first_fragment_clone_bi)
+    vlib_buffer_free_one (vm, reass->first_fragment_clone_bi);
   if (~0 != reass->lru_prev)
     {
-      ip4_sv_reass_t *lru_prev =
-       pool_elt_at_index (rt->pool, reass->lru_prev);
+      ip4_sv_reass_t *lru_prev = pool_elt_at_index (rt->pool, reass->lru_prev);
       lru_prev->lru_next = reass->lru_next;
     }
   if (~0 != reass->lru_next)
     {
-      ip4_sv_reass_t *lru_next =
-       pool_elt_at_index (rt->pool, reass->lru_next);
+      ip4_sv_reass_t *lru_next = pool_elt_at_index (rt->pool, reass->lru_next);
       lru_next->lru_prev = reass->lru_prev;
     }
   if (rt->lru_first == reass - rt->pool)
@@ -305,20 +365,13 @@ ip4_sv_reass_free (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
       rt->lru_last = reass->lru_prev;
     }
   pool_put (rt->pool, reass);
-  --rt->reass_n;
-}
-
-always_inline void
-ip4_sv_reass_init (ip4_sv_reass_t * reass)
-{
-  reass->cached_buffers = NULL;
-  reass->is_complete = false;
 }
 
 always_inline ip4_sv_reass_t *
-ip4_sv_reass_find_or_create (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
-                            ip4_sv_reass_per_thread_t * rt,
-                            ip4_sv_reass_kv_t * kv, u8 * do_handoff)
+ip4_sv_reass_find_or_create (vlib_main_t *vm, vlib_node_runtime_t *node,
+                            u32 bi, ip4_sv_reass_main_t *rm,
+                            ip4_sv_reass_per_thread_t *rt,
+                            ip4_sv_reass_kv_t *kv, u8 *do_handoff)
 {
   ip4_sv_reass_t *reass = NULL;
   f64 now = vlib_time_now (vm);
@@ -336,7 +389,8 @@ again:
 
       if (now > reass->last_heard + rm->timeout)
        {
-         ip4_sv_reass_free (vm, rm, rt, reass);
+         ip4_sv_reass_trace_timeout (vm, node, reass, bi);
+         ip4_sv_reass_free (vm, rm, rt, reass, true);
          reass = NULL;
        }
     }
@@ -347,18 +401,17 @@ again:
       return reass;
     }
 
-  if (rt->reass_n >= rm->max_reass_n && rm->max_reass_n)
+  if (pool_elts (rt->pool) >= rm->max_reass_n && rm->max_reass_n)
     {
       reass = pool_elt_at_index (rt->pool, rt->lru_first);
-      ip4_sv_reass_free (vm, rm, rt, reass);
+      ip4_sv_reass_trace_lru_free (vm, node, reass, bi);
+      ip4_sv_reass_free (vm, rm, rt, reass, true);
     }
 
-  pool_get (rt->pool, reass);
-  clib_memset (reass, 0, sizeof (*reass));
+  pool_get_zero (rt->pool, reass);
+  reass->first_fragment_clone_bi = ~0;
   reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
   ++rt->id_counter;
-  ip4_sv_reass_init (reass);
-  ++rt->reass_n;
   reass->lru_prev = reass->lru_next = ~0;
 
   if (~0 != rt->lru_last)
@@ -382,7 +435,7 @@ again:
   int rv = clib_bihash_add_del_16_8 (&rm->hash, &kv->kv, 2);
   if (rv)
     {
-      ip4_sv_reass_free (vm, rm, rt, reass);
+      ip4_sv_reass_free (vm, rm, rt, reass, false);
       reass = NULL;
       // if other worker created a context already work with the other copy
       if (-2 == rv)
@@ -392,10 +445,23 @@ again:
   return reass;
 }
 
+always_inline bool
+ip4_sv_reass_is_complete (ip4_sv_reass_t *reass, bool extended)
+{
+  /*
+   * Both first and last fragments have to be seen for extended reassembly to
+   * be complete. Otherwise first fragment is enough.
+   */
+  if (extended)
+    return reass->first_fragment_seen && reass->last_fragment_seen;
+
+  return reass->first_fragment_seen;
+}
+
 always_inline ip4_sv_reass_rc_t
 ip4_sv_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
                     ip4_sv_reass_main_t *rm, ip4_header_t *ip0,
-                    ip4_sv_reass_t *reass, u32 bi0)
+                    ip4_sv_reass_t *reass, u32 bi0, bool extended)
 {
   vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
   ip4_sv_reass_rc_t rc = IP4_SV_REASS_RC_OK;
@@ -409,33 +475,59 @@ ip4_sv_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
        return IP4_SV_REASS_RC_UNSUPP_IP_PROTO;
       if (IP_PROTOCOL_TCP == reass->ip_proto)
        {
-         reass->icmp_type_or_tcp_flags = ((tcp_header_t *) (ip0 + 1))->flags;
-         reass->tcp_ack_number = ((tcp_header_t *) (ip0 + 1))->ack_number;
-         reass->tcp_seq_number = ((tcp_header_t *) (ip0 + 1))->seq_number;
+         tcp_header_t *th = ip4_next_header (ip0);
+         reass->icmp_type_or_tcp_flags = th->flags;
+         reass->tcp_ack_number = th->ack_number;
+         reass->tcp_seq_number = th->seq_number;
        }
       else if (IP_PROTOCOL_ICMP == reass->ip_proto)
        {
          reass->icmp_type_or_tcp_flags =
-           ((icmp46_header_t *) (ip0 + 1))->type;
+           ((icmp46_header_t *) (ip4_next_header (ip0)))->type;
+       }
+      reass->first_fragment_seen = true;
+      if (extended)
+       {
+         reass->first_fragment_total_ip_header_length =
+           ip4_header_bytes (ip0);
+         vlib_buffer_t *clone = vlib_buffer_copy_no_chain (
+           vm, b0, &reass->first_fragment_clone_bi);
+         if (!clone)
+           reass->first_fragment_clone_bi = ~0;
        }
-      reass->is_complete = true;
       vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
       if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (
-           vm, node, reass, bi0, REASS_FINISH, reass->ip_proto,
+           vm, node, reass, bi0, REASS_FIRST_FRAG, reass->ip_proto,
            reass->l4_src_port, reass->l4_dst_port,
-           vnet_buffer (b0)->ip.reass.l4_layer_truncated);
+           vnet_buffer (b0)->ip.reass.l4_hdr_truncated, ~0);
+       }
+    }
+  if (!ip4_get_fragment_more (ip0))
+    {
+      const u32 fragment_length =
+       clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
+      reass->last_fragment_seen = true;
+      reass->total_ip_payload_length = fragment_first + fragment_length;
+      vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
+      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+       {
+         ip4_sv_reass_add_trace (
+           vm, node, reass, bi0, REASS_LAST_FRAG, ~0, ~0, ~0,
+           vnet_buffer (b0)->ip.reass.l4_hdr_truncated, ~0);
        }
     }
+
   vec_add1 (reass->cached_buffers, bi0);
-  if (!reass->is_complete)
+
+  if (!ip4_sv_reass_is_complete (reass, extended))
     {
       if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (
            vm, node, reass, bi0, REASS_FRAGMENT_CACHE, ~0, ~0, ~0,
-           vnet_buffer (b0)->ip.reass.l4_layer_truncated);
+           vnet_buffer (b0)->ip.reass.l4_hdr_truncated, ~0);
        }
       if (vec_len (reass->cached_buffers) > rm->max_reass_len)
        {
@@ -446,30 +538,63 @@ ip4_sv_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
 }
 
 always_inline int
-l4_layer_truncated (ip4_header_t *ip)
+l4_hdr_truncated (ip4_header_t *ip)
 {
-  static const int l4_layer_length[256] = {
-    [IP_PROTOCOL_TCP] = sizeof (tcp_header_t),
-    [IP_PROTOCOL_UDP] = sizeof (udp_header_t),
-    [IP_PROTOCOL_ICMP] = sizeof (icmp46_header_t),
-  };
+  if (IP_PROTOCOL_UDP == ip->protocol)
+    return ((u8 *) ip + ip4_header_bytes (ip) + sizeof (udp_header_t) >
+           (u8 *) ip + clib_net_to_host_u16 (ip->length));
+  if (IP_PROTOCOL_ICMP == ip->protocol)
+    return ((u8 *) ip + ip4_header_bytes (ip) + sizeof (icmp46_header_t) >
+           (u8 *) ip + clib_net_to_host_u16 (ip->length));
 
-  return ((u8 *) ip + ip4_header_bytes (ip) + l4_layer_length[ip->protocol] >
+  if (IP_PROTOCOL_TCP != ip->protocol)
+    return false;
+
+  tcp_header_t *th = ip4_next_header (ip);
+  const u32 tcp_opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t);
+
+  return ((u8 *) ip + ip4_header_bytes (ip) + sizeof (tcp_header_t) +
+           tcp_opts_len >
          (u8 *) ip + clib_net_to_host_u16 (ip->length));
 }
 
+always_inline void
+ip4_sv_reass_reset_vnet_buffer2 (vlib_buffer_t *b)
+{
+  vnet_buffer2 (b)->ip.reass.pool_index = ~0;
+  vnet_buffer2 (b)->ip.reass.thread_index = ~0;
+  vnet_buffer2 (b)->ip.reass.id = ~0;
+}
+
+always_inline void
+ip4_sv_reass_set_vnet_buffer2_from_reass (vlib_main_t *vm, vlib_buffer_t *b,
+                                         ip4_sv_reass_t *reass)
+{
+  vnet_buffer2 (b)->ip.reass.thread_index = vm->thread_index;
+  vnet_buffer2 (b)->ip.reass.id = reass->id;
+  vnet_buffer2 (b)->ip.reass.pool_index =
+    reass - ip4_sv_reass_main.per_thread_data[vm->thread_index].pool;
+}
+
+struct ip4_sv_reass_args
+{
+  bool is_feature;
+  bool is_output_feature;
+  bool is_custom;
+  bool with_custom_context;
+  bool extended;
+};
+
 always_inline uword
 ip4_sv_reass_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
-                    vlib_frame_t *frame, bool is_feature,
-                    bool is_output_feature, bool is_custom,
-                    bool with_custom_context)
+                    vlib_frame_t *frame, struct ip4_sv_reass_args a)
 {
   u32 *from = vlib_frame_vector_args (frame);
   u32 n_left_from, n_left_to_next, *to_next, *to_next_aux, next_index;
   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
   ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
   u32 *context;
-  if (with_custom_context)
+  if (a.with_custom_context)
     context = vlib_frame_aux_args (frame);
 
   clib_spinlock_lock (&rt->lock);
@@ -509,16 +634,16 @@ ip4_sv_reass_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
 
       ip4_header_t *ip0 = (ip4_header_t *) u8_ptr_add (
        vlib_buffer_get_current (b0),
-       (ptrdiff_t) (is_output_feature ? 1 : 0) *
+       (ptrdiff_t) (a.is_output_feature ? 1 : 0) *
          vnet_buffer (b0)->ip.save_rewrite_length);
       ip4_header_t *ip1 = (ip4_header_t *) u8_ptr_add (
        vlib_buffer_get_current (b1),
-       (ptrdiff_t) (is_output_feature ? 1 : 0) *
+       (ptrdiff_t) (a.is_output_feature ? 1 : 0) *
          vnet_buffer (b1)->ip.save_rewrite_length);
 
-      if (PREDICT_FALSE
-         (ip4_get_fragment_more (ip0) || ip4_get_fragment_offset (ip0))
-         || (ip4_get_fragment_more (ip1) || ip4_get_fragment_offset (ip1)))
+      if (PREDICT_FALSE (ip4_get_fragment_more (ip0) ||
+                        ip4_get_fragment_offset (ip0)) ||
+         (ip4_get_fragment_more (ip1) || ip4_get_fragment_offset (ip1)))
        {
          // fragment found, go slow path
          b -= 2;
@@ -529,39 +654,41 @@ ip4_sv_reass_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
            }
          goto slow_path;
        }
-      if (is_feature)
+      if (a.is_feature)
        {
          vnet_feature_next (&next0, b0);
        }
       else
        {
-         next0 = is_custom ? vnet_buffer (b0)->ip.reass.next_index :
-           IP4_SV_REASSEMBLY_NEXT_INPUT;
+         next0 = a.is_custom ? vnet_buffer (b0)->ip.reass.next_index :
+                               IP4_SV_REASSEMBLY_NEXT_INPUT;
        }
       vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
       vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
-      if (l4_layer_truncated (ip0))
+
+      if (a.extended)
+       ip4_sv_reass_reset_vnet_buffer2 (b0);
+
+      if (l4_hdr_truncated (ip0))
        {
-         vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
-         vnet_buffer (b0)->ip.reass.l4_src_port = 0;
-         vnet_buffer (b0)->ip.reass.l4_dst_port = 0;
+         vnet_buffer (b0)->ip.reass.l4_hdr_truncated = 1;
        }
       else
        {
-         vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
+         vnet_buffer (b0)->ip.reass.l4_hdr_truncated = 0;
          if (IP_PROTOCOL_TCP == ip0->protocol)
            {
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
-               ((tcp_header_t *) (ip0 + 1))->flags;
+               ((tcp_header_t *) (ip4_next_header (ip0)))->flags;
              vnet_buffer (b0)->ip.reass.tcp_ack_number =
-               ((tcp_header_t *) (ip0 + 1))->ack_number;
+               ((tcp_header_t *) (ip4_next_header (ip0)))->ack_number;
              vnet_buffer (b0)->ip.reass.tcp_seq_number =
-               ((tcp_header_t *) (ip0 + 1))->seq_number;
+               ((tcp_header_t *) (ip4_next_header (ip0)))->seq_number;
            }
          else if (IP_PROTOCOL_ICMP == ip0->protocol)
            {
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
-               ((icmp46_header_t *) (ip0 + 1))->type;
+               ((icmp46_header_t *) (ip4_next_header (ip0)))->type;
            }
          vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
          vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
@@ -573,41 +700,43 @@ ip4_sv_reass_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
            vnet_buffer (b0)->ip.reass.ip_proto,
            vnet_buffer (b0)->ip.reass.l4_src_port,
            vnet_buffer (b0)->ip.reass.l4_dst_port,
-           vnet_buffer (b0)->ip.reass.l4_layer_truncated);
+           vnet_buffer (b0)->ip.reass.l4_hdr_truncated, ~0);
        }
-      if (is_feature)
+      if (a.is_feature)
        {
          vnet_feature_next (&next1, b1);
        }
       else
        {
-         next1 = is_custom ? vnet_buffer (b1)->ip.reass.next_index :
-           IP4_SV_REASSEMBLY_NEXT_INPUT;
+         next1 = a.is_custom ? vnet_buffer (b1)->ip.reass.next_index :
+                               IP4_SV_REASSEMBLY_NEXT_INPUT;
        }
       vnet_buffer (b1)->ip.reass.is_non_first_fragment = 0;
       vnet_buffer (b1)->ip.reass.ip_proto = ip1->protocol;
-      if (l4_layer_truncated (ip1))
+
+      if (a.extended)
+       ip4_sv_reass_reset_vnet_buffer2 (b1);
+
+      if (l4_hdr_truncated (ip1))
        {
-         vnet_buffer (b1)->ip.reass.l4_layer_truncated = 1;
-         vnet_buffer (b1)->ip.reass.l4_src_port = 0;
-         vnet_buffer (b1)->ip.reass.l4_dst_port = 0;
+         vnet_buffer (b1)->ip.reass.l4_hdr_truncated = 1;
        }
       else
        {
-         vnet_buffer (b1)->ip.reass.l4_layer_truncated = 0;
+         vnet_buffer (b1)->ip.reass.l4_hdr_truncated = 0;
          if (IP_PROTOCOL_TCP == ip1->protocol)
            {
              vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
-               ((tcp_header_t *) (ip1 + 1))->flags;
+               ((tcp_header_t *) (ip4_next_header (ip1)))->flags;
              vnet_buffer (b1)->ip.reass.tcp_ack_number =
-               ((tcp_header_t *) (ip1 + 1))->ack_number;
+               ((tcp_header_t *) (ip4_next_header (ip1)))->ack_number;
              vnet_buffer (b1)->ip.reass.tcp_seq_number =
-               ((tcp_header_t *) (ip1 + 1))->seq_number;
+               ((tcp_header_t *) (ip4_next_header (ip1)))->seq_number;
            }
          else if (IP_PROTOCOL_ICMP == ip1->protocol)
            {
              vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
-               ((icmp46_header_t *) (ip1 + 1))->type;
+               ((icmp46_header_t *) (ip4_next_header (ip1)))->type;
            }
          vnet_buffer (b1)->ip.reass.l4_src_port = ip4_get_port (ip1, 1);
          vnet_buffer (b1)->ip.reass.l4_dst_port = ip4_get_port (ip1, 0);
@@ -619,14 +748,14 @@ ip4_sv_reass_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
            vnet_buffer (b1)->ip.reass.ip_proto,
            vnet_buffer (b1)->ip.reass.l4_src_port,
            vnet_buffer (b1)->ip.reass.l4_dst_port,
-           vnet_buffer (b1)->ip.reass.l4_layer_truncated);
+           vnet_buffer (b1)->ip.reass.l4_hdr_truncated, ~0);
        }
 
       n_left_from -= 2;
       next[0] = next0;
       next[1] = next1;
       next += 2;
-      if (with_custom_context)
+      if (a.with_custom_context)
        context += 2;
     }
 
@@ -639,10 +768,10 @@ ip4_sv_reass_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
 
       ip4_header_t *ip0 = (ip4_header_t *) u8_ptr_add (
        vlib_buffer_get_current (b0),
-       (ptrdiff_t) (is_output_feature ? 1 : 0) *
+       (ptrdiff_t) (a.is_output_feature ? 1 : 0) *
          vnet_buffer (b0)->ip.save_rewrite_length);
-      if (PREDICT_FALSE
-         (ip4_get_fragment_more (ip0) || ip4_get_fragment_offset (ip0)))
+      if (PREDICT_FALSE (ip4_get_fragment_more (ip0) ||
+                        ip4_get_fragment_offset (ip0)))
        {
          // fragment found, go slow path
          b -= 1;
@@ -653,38 +782,41 @@ ip4_sv_reass_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
            }
          goto slow_path;
        }
-      if (is_feature)
+      if (a.is_feature)
        {
          vnet_feature_next (&next0, b0);
        }
       else
        {
-         next0 =
-           is_custom ? vnet_buffer (b0)->ip.
-           reass.next_index : IP4_SV_REASSEMBLY_NEXT_INPUT;
+         next0 = a.is_custom ? vnet_buffer (b0)->ip.reass.next_index :
+                               IP4_SV_REASSEMBLY_NEXT_INPUT;
        }
       vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
       vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
-      if (l4_layer_truncated (ip0))
+
+      if (a.extended)
+       ip4_sv_reass_reset_vnet_buffer2 (b0);
+
+      if (l4_hdr_truncated (ip0))
        {
-         vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
+         vnet_buffer (b0)->ip.reass.l4_hdr_truncated = 1;
        }
       else
        {
-         vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
+         vnet_buffer (b0)->ip.reass.l4_hdr_truncated = 0;
          if (IP_PROTOCOL_TCP == ip0->protocol)
            {
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
-               ((tcp_header_t *) (ip0 + 1))->flags;
+               ((tcp_header_t *) (ip4_next_header (ip0)))->flags;
              vnet_buffer (b0)->ip.reass.tcp_ack_number =
-               ((tcp_header_t *) (ip0 + 1))->ack_number;
+               ((tcp_header_t *) (ip4_next_header (ip0)))->ack_number;
              vnet_buffer (b0)->ip.reass.tcp_seq_number =
-               ((tcp_header_t *) (ip0 + 1))->seq_number;
+               ((tcp_header_t *) (ip4_next_header (ip0)))->seq_number;
            }
          else if (IP_PROTOCOL_ICMP == ip0->protocol)
            {
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
-               ((icmp46_header_t *) (ip0 + 1))->type;
+               ((icmp46_header_t *) (ip4_next_header (ip0)))->type;
            }
          vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
          vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
@@ -696,13 +828,13 @@ ip4_sv_reass_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
            vnet_buffer (b0)->ip.reass.ip_proto,
            vnet_buffer (b0)->ip.reass.l4_src_port,
            vnet_buffer (b0)->ip.reass.l4_dst_port,
-           vnet_buffer (b0)->ip.reass.l4_layer_truncated);
+           vnet_buffer (b0)->ip.reass.l4_hdr_truncated, ~0);
        }
 
       n_left_from -= 1;
       next[0] = next0;
       next += 1;
-      if (with_custom_context)
+      if (a.with_custom_context)
        context += 1;
     }
 
@@ -717,7 +849,7 @@ slow_path:
 
   while (n_left_from > 0)
     {
-      if (with_custom_context)
+      if (a.with_custom_context)
        vlib_get_next_frame_with_aux_safe (vm, node, next_index, to_next,
                                           to_next_aux, n_left_to_next);
       else
@@ -736,12 +868,12 @@ slow_path:
 
          ip4_header_t *ip0 = (ip4_header_t *) u8_ptr_add (
            vlib_buffer_get_current (b0),
-           (ptrdiff_t) (is_output_feature ? 1 : 0) *
+           (ptrdiff_t) (a.is_output_feature ? 1 : 0) *
              vnet_buffer (b0)->ip.save_rewrite_length);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a regular packet - no fragmentation
-             if (is_custom)
+             if (a.is_custom)
                {
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
@@ -751,28 +883,28 @@ slow_path:
                }
              vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
              vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
-             if (l4_layer_truncated (ip0))
+             if (l4_hdr_truncated (ip0))
                {
-                 vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
+                 vnet_buffer (b0)->ip.reass.l4_hdr_truncated = 1;
                  vnet_buffer (b0)->ip.reass.l4_src_port = 0;
                  vnet_buffer (b0)->ip.reass.l4_dst_port = 0;
                }
              else
                {
-                 vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
+                 vnet_buffer (b0)->ip.reass.l4_hdr_truncated = 0;
                  if (IP_PROTOCOL_TCP == ip0->protocol)
                    {
                      vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
-                       ((tcp_header_t *) (ip0 + 1))->flags;
+                       ((tcp_header_t *) (ip4_next_header (ip0)))->flags;
                      vnet_buffer (b0)->ip.reass.tcp_ack_number =
-                       ((tcp_header_t *) (ip0 + 1))->ack_number;
+                       ((tcp_header_t *) (ip4_next_header (ip0)))->ack_number;
                      vnet_buffer (b0)->ip.reass.tcp_seq_number =
-                       ((tcp_header_t *) (ip0 + 1))->seq_number;
+                       ((tcp_header_t *) (ip4_next_header (ip0)))->seq_number;
                    }
                  else if (IP_PROTOCOL_ICMP == ip0->protocol)
                    {
                      vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
-                       ((icmp46_header_t *) (ip0 + 1))->type;
+                       ((icmp46_header_t *) (ip4_next_header (ip0)))->type;
                    }
                  vnet_buffer (b0)->ip.reass.l4_src_port =
                    ip4_get_port (ip0, 1);
@@ -786,7 +918,7 @@ slow_path:
                    vnet_buffer (b0)->ip.reass.ip_proto,
                    vnet_buffer (b0)->ip.reass.l4_src_port,
                    vnet_buffer (b0)->ip.reass.l4_dst_port,
-                   vnet_buffer (b0)->ip.reass.l4_layer_truncated);
+                   vnet_buffer (b0)->ip.reass.l4_hdr_truncated, ~0);
                }
              goto packet_enqueue;
            }
@@ -794,7 +926,11 @@ slow_path:
          const u32 fragment_length =
            clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
          const u32 fragment_last = fragment_first + fragment_length - 1;
-         if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0)))     // 8 is minimum frag length per RFC 791
+         if (fragment_first > fragment_last ||
+             fragment_first + fragment_length > UINT16_MAX - 20 ||
+             (fragment_length < 8 &&
+              ip4_get_fragment_more (
+                ip0))) // 8 is minimum frag length per RFC 791
            {
              next0 = IP4_SV_REASSEMBLY_NEXT_DROP;
              error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
@@ -804,7 +940,7 @@ slow_path:
          ip4_sv_reass_kv_t kv;
          u8 do_handoff = 0;
 
-         if (with_custom_context)
+         if (a.with_custom_context)
            kv.k.as_u64[0] = (u64) *context | (u64) ip0->src_address.as_u32
                                                << 32;
          else
@@ -816,15 +952,29 @@ slow_path:
                           (u64) ip0->fragment_id << 32 |
                           (u64) ip0->protocol << 48;
 
-         ip4_sv_reass_t *reass =
-           ip4_sv_reass_find_or_create (vm, rm, rt, &kv, &do_handoff);
+         if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+           {
+             ip4_sv_reass_trace_t *t =
+               vlib_add_trace (vm, node, b0, sizeof (t[0]));
+             t->action = REASS_KEY;
+             STATIC_ASSERT_SIZEOF (t->kv, sizeof (kv));
+             clib_memcpy (&t->kv, &kv, sizeof (kv));
+           }
+
+         ip4_sv_reass_t *reass = ip4_sv_reass_find_or_create (
+           vm, node, bi0, rm, rt, &kv, &do_handoff);
 
          if (PREDICT_FALSE (do_handoff))
            {
+             if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+               {
+                 ip4_sv_reass_add_trace (vm, node, reass, bi0, REASS_HANDOFF,
+                                         ~0, ~0, ~0, 0, kv.v.thread_index);
+               }
              next0 = IP4_SV_REASSEMBLY_NEXT_HANDOFF;
              vnet_buffer (b0)->ip.reass.owner_thread_index =
                kv.v.thread_index;
-             if (with_custom_context)
+             if (a.with_custom_context)
                forward_context = 1;
              goto packet_enqueue;
            }
@@ -837,9 +987,9 @@ slow_path:
              goto packet_enqueue;
            }
 
-         if (reass->is_complete)
+         if (ip4_sv_reass_is_complete (reass, a.extended))
            {
-             if (is_custom)
+             if (a.is_custom)
                {
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
@@ -848,7 +998,7 @@ slow_path:
                  next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
                }
              vnet_buffer (b0)->ip.reass.is_non_first_fragment =
-               ! !fragment_first;
+               !!fragment_first;
              vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                reass->icmp_type_or_tcp_flags;
@@ -858,18 +1008,20 @@ slow_path:
                reass->tcp_seq_number;
              vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
              vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
+             if (a.extended)
+               ip4_sv_reass_set_vnet_buffer2_from_reass (vm, b0, reass);
              if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                {
                  ip4_sv_reass_add_trace (
                    vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
                    reass->ip_proto, reass->l4_src_port, reass->l4_dst_port,
-                   vnet_buffer (b0)->ip.reass.l4_layer_truncated);
+                   vnet_buffer (b0)->ip.reass.l4_hdr_truncated, ~0);
                }
              goto packet_enqueue;
            }
 
          ip4_sv_reass_rc_t rc =
-           ip4_sv_reass_update (vm, node, rm, ip0, reass, bi0);
+           ip4_sv_reass_update (vm, node, rm, ip0, reass, bi0, a.extended);
          u32 counter = ~0;
          switch (rc)
            {
@@ -886,61 +1038,64 @@ slow_path:
          if (~0 != counter)
            {
              vlib_node_increment_counter (vm, node->node_index, counter, 1);
-             ip4_sv_reass_free (vm, rm, rt, reass);
+             ip4_sv_reass_trace_error_free (vm, node, reass, bi0);
+             ip4_sv_reass_free (vm, rm, rt, reass, true);
              goto next_packet;
            }
-         if (reass->is_complete)
+         if (ip4_sv_reass_is_complete (reass, a.extended))
            {
              u32 idx;
              vec_foreach_index (idx, reass->cached_buffers)
-             {
-               u32 bi0 = vec_elt (reass->cached_buffers, idx);
-               vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
-               ip0 = (ip4_header_t *) u8_ptr_add (
-                 vlib_buffer_get_current (b0),
-                 (ptrdiff_t) (is_output_feature ? 1 : 0) *
-                   vnet_buffer (b0)->ip.save_rewrite_length);
-               u32 next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
-               if (is_feature)
-                 {
-                   vnet_feature_next (&next0, b0);
-                 }
-               if (is_custom)
-                 {
-                   next0 = vnet_buffer (b0)->ip.reass.next_index;
-                 }
-               if (0 == n_left_to_next)
-                 {
-                   vlib_put_next_frame (vm, node, next_index,
-                                        n_left_to_next);
-                   vlib_get_next_frame (vm, node, next_index, to_next,
-                                        n_left_to_next);
-                 }
-               to_next[0] = bi0;
-               to_next += 1;
-               n_left_to_next -= 1;
-               vnet_buffer (b0)->ip.reass.is_non_first_fragment =
-                 ! !ip4_get_fragment_offset (ip0);
-               vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
-               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
-                 reass->icmp_type_or_tcp_flags;
-               vnet_buffer (b0)->ip.reass.tcp_ack_number =
-                 reass->tcp_ack_number;
-               vnet_buffer (b0)->ip.reass.tcp_seq_number =
-                 reass->tcp_seq_number;
-               vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
-               vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
-               if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
-                 {
-                   ip4_sv_reass_add_trace (
-                     vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
-                     reass->ip_proto, reass->l4_src_port, reass->l4_dst_port,
-                     vnet_buffer (b0)->ip.reass.l4_layer_truncated);
-                 }
-               vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
-                                                to_next, n_left_to_next, bi0,
-                                                next0);
-             }
+               {
+                 u32 bi0 = vec_elt (reass->cached_buffers, idx);
+                 vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
+                 ip0 = (ip4_header_t *) u8_ptr_add (
+                   vlib_buffer_get_current (b0),
+                   (ptrdiff_t) (a.is_output_feature ? 1 : 0) *
+                     vnet_buffer (b0)->ip.save_rewrite_length);
+                 u32 next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
+                 if (a.is_feature)
+                   {
+                     vnet_feature_next (&next0, b0);
+                   }
+                 if (a.is_custom)
+                   {
+                     next0 = vnet_buffer (b0)->ip.reass.next_index;
+                   }
+                 if (0 == n_left_to_next)
+                   {
+                     vlib_put_next_frame (vm, node, next_index,
+                                          n_left_to_next);
+                     vlib_get_next_frame (vm, node, next_index, to_next,
+                                          n_left_to_next);
+                   }
+                 to_next[0] = bi0;
+                 to_next += 1;
+                 n_left_to_next -= 1;
+                 vnet_buffer (b0)->ip.reass.is_non_first_fragment =
+                   !!ip4_get_fragment_offset (ip0);
+                 vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
+                 vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
+                   reass->icmp_type_or_tcp_flags;
+                 vnet_buffer (b0)->ip.reass.tcp_ack_number =
+                   reass->tcp_ack_number;
+                 vnet_buffer (b0)->ip.reass.tcp_seq_number =
+                   reass->tcp_seq_number;
+                 vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
+                 vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
+                 if (a.extended)
+                   ip4_sv_reass_set_vnet_buffer2_from_reass (vm, b0, reass);
+                 if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+                   {
+                     ip4_sv_reass_add_trace (
+                       vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
+                       reass->ip_proto, reass->l4_src_port,
+                       reass->l4_dst_port,
+                       vnet_buffer (b0)->ip.reass.l4_hdr_truncated, ~0);
+                   }
+                 vlib_validate_buffer_enqueue_x1 (
+                   vm, node, next_index, to_next, n_left_to_next, bi0, next0);
+               }
              vec_set_len (reass->cached_buffers,
                           0); // buffers are owned by frame now
            }
@@ -950,12 +1105,13 @@ slow_path:
          to_next[0] = bi0;
          to_next += 1;
          n_left_to_next -= 1;
-         if (is_feature && IP4_ERROR_NONE == error0)
+         if (a.is_feature && IP4_ERROR_NONE == error0 &&
+             IP4_SV_REASSEMBLY_NEXT_HANDOFF != next0)
            {
              b0 = vlib_get_buffer (vm, bi0);
              vnet_feature_next (&next0, b0);
            }
-         if (with_custom_context && forward_context)
+         if (a.with_custom_context && forward_context)
            {
              if (to_next_aux)
                {
@@ -973,7 +1129,7 @@ slow_path:
        next_packet:
          from += 1;
          n_left_from -= 1;
-         if (with_custom_context)
+         if (a.with_custom_context)
            context += 1;
        }
 
@@ -985,13 +1141,20 @@ done:
   return frame->n_vectors;
 }
 
-VLIB_NODE_FN (ip4_sv_reass_node) (vlib_main_t * vm,
-                                 vlib_node_runtime_t * node,
-                                 vlib_frame_t * frame)
+VLIB_NODE_FN (ip4_sv_reass_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 {
-  return ip4_sv_reass_inline (
-    vm, node, frame, false /* is_feature */, false /* is_output_feature */,
-    false /* is_custom */, false /* with_custom_context */);
+  /*
+   * Extended reassembly is not supported for non-feature nodes.
+   */
+  return ip4_sv_reass_inline (vm, node, frame,
+                             (struct ip4_sv_reass_args){
+                               .is_feature = false,
+                               .is_output_feature = false,
+                               .is_custom = false,
+                               .with_custom_context = false,
+                               .extended = false,
+                             });
 }
 
 VLIB_REGISTER_NODE (ip4_sv_reass_node) = {
@@ -1010,13 +1173,27 @@ VLIB_REGISTER_NODE (ip4_sv_reass_node) = {
         },
 };
 
-VLIB_NODE_FN (ip4_sv_reass_node_feature) (vlib_main_t * vm,
-                                         vlib_node_runtime_t * node,
-                                         vlib_frame_t * frame)
+VLIB_NODE_FN (ip4_sv_reass_node_feature)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 {
-  return ip4_sv_reass_inline (
-    vm, node, frame, true /* is_feature */, false /* is_output_feature */,
-    false /* is_custom */, false /* with_custom_context */);
+  if (ip4_sv_reass_main.extended_refcount > 0)
+    return ip4_sv_reass_inline (vm, node, frame,
+                               (struct ip4_sv_reass_args){
+                                 .is_feature = true,
+                                 .is_output_feature = false,
+                                 .is_custom = false,
+                                 .with_custom_context = false,
+                                 .extended = true,
+                               });
+
+  return ip4_sv_reass_inline (vm, node, frame,
+                             (struct ip4_sv_reass_args){
+                               .is_feature = true,
+                               .is_output_feature = false,
+                               .is_custom = false,
+                               .with_custom_context = false,
+                               .extended = false,
+                             });
 }
 
 VLIB_REGISTER_NODE (ip4_sv_reass_node_feature) = {
@@ -1035,19 +1212,33 @@ VLIB_REGISTER_NODE (ip4_sv_reass_node_feature) = {
 };
 
 VNET_FEATURE_INIT (ip4_sv_reass_feature) = {
-    .arc_name = "ip4-unicast",
-    .node_name = "ip4-sv-reassembly-feature",
-    .runs_before = VNET_FEATURES ("ip4-lookup"),
-    .runs_after = 0,
+  .arc_name = "ip4-unicast",
+  .node_name = "ip4-sv-reassembly-feature",
+  .runs_before = VNET_FEATURES ("ip4-lookup"),
+  .runs_after = 0,
 };
 
-VLIB_NODE_FN (ip4_sv_reass_node_output_feature) (vlib_main_t * vm,
-                                                vlib_node_runtime_t * node,
-                                                vlib_frame_t * frame)
+VLIB_NODE_FN (ip4_sv_reass_node_output_feature)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 {
-  return ip4_sv_reass_inline (
-    vm, node, frame, true /* is_feature */, true /* is_output_feature */,
-    false /* is_custom */, false /* with_custom_context */);
+  if (ip4_sv_reass_main.extended_refcount > 0)
+    return ip4_sv_reass_inline (vm, node, frame,
+                               (struct ip4_sv_reass_args){
+                                 .is_feature = true,
+                                 .is_output_feature = true,
+                                 .is_custom = false,
+                                 .with_custom_context = false,
+                                 .extended = true,
+                               });
+
+  return ip4_sv_reass_inline (vm, node, frame,
+                             (struct ip4_sv_reass_args){
+                               .is_feature = true,
+                               .is_output_feature = true,
+                               .is_custom = false,
+                               .with_custom_context = false,
+                               .extended = false,
+                             });
 }
 
 VLIB_REGISTER_NODE (ip4_sv_reass_node_output_feature) = {
@@ -1066,10 +1257,10 @@ VLIB_REGISTER_NODE (ip4_sv_reass_node_output_feature) = {
 };
 
 VNET_FEATURE_INIT (ip4_sv_reass_output_feature) = {
-    .arc_name = "ip4-output",
-    .node_name = "ip4-sv-reassembly-output-feature",
-    .runs_before = 0,
-    .runs_after = 0,
+  .arc_name = "ip4-output",
+  .node_name = "ip4-sv-reassembly-output-feature",
+  .runs_before = 0,
+  .runs_after = 0,
 };
 
 VLIB_REGISTER_NODE (ip4_sv_reass_custom_node) = {
@@ -1088,13 +1279,20 @@ VLIB_REGISTER_NODE (ip4_sv_reass_custom_node) = {
         },
 };
 
-VLIB_NODE_FN (ip4_sv_reass_custom_node) (vlib_main_t * vm,
-                                        vlib_node_runtime_t * node,
-                                        vlib_frame_t * frame)
+VLIB_NODE_FN (ip4_sv_reass_custom_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 {
-  return ip4_sv_reass_inline (
-    vm, node, frame, false /* is_feature */, false /* is_output_feature */,
-    true /* is_custom */, false /* with_custom_context */);
+  /*
+   * Extended reassembly is not supported for non-feature nodes.
+   */
+  return ip4_sv_reass_inline (vm, node, frame,
+                             (struct ip4_sv_reass_args){
+                               .is_feature = false,
+                               .is_output_feature = false,
+                               .is_custom = true,
+                               .with_custom_context = false,
+                               .extended = false,
+                             });
 }
 
 VLIB_REGISTER_NODE (ip4_sv_reass_custom_context_node) = {
@@ -1117,9 +1315,17 @@ VLIB_REGISTER_NODE (ip4_sv_reass_custom_context_node) = {
 VLIB_NODE_FN (ip4_sv_reass_custom_context_node)
 (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 {
-  return ip4_sv_reass_inline (
-    vm, node, frame, false /* is_feature */, false /* is_output_feature */,
-    true /* is_custom */, true /* with_custom_context */);
+  /*
+   * Extended reassembly is not supported for non-feature nodes.
+   */
+  return ip4_sv_reass_inline (vm, node, frame,
+                             (struct ip4_sv_reass_args){
+                               .is_feature = false,
+                               .is_output_feature = false,
+                               .is_custom = true,
+                               .with_custom_context = true,
+                               .extended = false,
+                             });
 }
 
 #ifndef CLIB_MARCH_VARIANT
@@ -1154,7 +1360,7 @@ typedef struct
 
 #ifndef CLIB_MARCH_VARIANT
 static int
-ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
+ip4_rehash_cb (clib_bihash_kv_16_8_t *kv, void *_ctx)
 {
   ip4_rehash_cb_ctx *ctx = _ctx;
   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
@@ -1181,8 +1387,8 @@ ip4_sv_reass_set (u32 timeout_ms, u32 max_reassemblies,
                  u32 max_reassembly_length, u32 expire_walk_interval_ms)
 {
   u32 old_nbuckets = ip4_sv_reass_get_nbuckets ();
-  ip4_sv_reass_set_params (timeout_ms, max_reassemblies,
-                          max_reassembly_length, expire_walk_interval_ms);
+  ip4_sv_reass_set_params (timeout_ms, max_reassemblies, max_reassembly_length,
+                          expire_walk_interval_ms);
   vlib_process_signal_event (ip4_sv_reass_main.vlib_main,
                             ip4_sv_reass_main.ip4_sv_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
@@ -1215,8 +1421,8 @@ ip4_sv_reass_set (u32 timeout_ms, u32 max_reassemblies,
 }
 
 vnet_api_error_t
-ip4_sv_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
-                 u32 * max_reassembly_length, u32 * expire_walk_interval_ms)
+ip4_sv_reass_get (u32 *timeout_ms, u32 *max_reassemblies,
+                 u32 *max_reassembly_length, u32 *expire_walk_interval_ms)
 {
   *timeout_ms = ip4_sv_reass_main.timeout_ms;
   *max_reassemblies = ip4_sv_reass_main.max_reass_n;
@@ -1226,7 +1432,7 @@ ip4_sv_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
 }
 
 static clib_error_t *
-ip4_sv_reass_init_function (vlib_main_t * vm)
+ip4_sv_reass_init_function (vlib_main_t *vm)
 {
   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
   clib_error_t *error = 0;
@@ -1239,11 +1445,11 @@ ip4_sv_reass_init_function (vlib_main_t * vm)
   vec_validate (rm->per_thread_data, vlib_num_workers ());
   ip4_sv_reass_per_thread_t *rt;
   vec_foreach (rt, rm->per_thread_data)
-  {
-    clib_spinlock_init (&rt->lock);
-    pool_alloc (rt->pool, rm->max_reass_n);
-    rt->lru_first = rt->lru_last = ~0;
-  }
+    {
+      clib_spinlock_init (&rt->lock);
+      pool_alloc (rt->pool, rm->max_reass_n);
+      rt->lru_first = rt->lru_last = ~0;
+    }
 
   node = vlib_get_node_by_name (vm, (u8 *) "ip4-sv-reassembly-expire-walk");
   ASSERT (node);
@@ -1258,10 +1464,6 @@ ip4_sv_reass_init_function (vlib_main_t * vm)
   clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets,
                         (uword) nbuckets * 1024);
 
-  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
-  ASSERT (node);
-  rm->ip4_drop_idx = node->index;
-
   rm->fq_index = vlib_frame_queue_main_init (ip4_sv_reass_node.index, 0);
   rm->fq_feature_index =
     vlib_frame_queue_main_init (ip4_sv_reass_node_feature.index, 0);
@@ -1289,10 +1491,8 @@ ip4_sv_reass_walk_expired (vlib_main_t *vm,
 
   while (true)
     {
-      vlib_process_wait_for_event_or_clock (vm,
-                                           (f64)
-                                           rm->expire_walk_interval_ms /
-                                           (f64) MSEC_PER_SEC);
+      vlib_process_wait_for_event_or_clock (
+       vm, (f64) rm->expire_walk_interval_ms / (f64) MSEC_PER_SEC);
       event_type = vlib_process_get_events (vm, &event_data);
 
       switch (event_type)
@@ -1321,19 +1521,20 @@ ip4_sv_reass_walk_expired (vlib_main_t *vm,
          clib_spinlock_lock (&rt->lock);
 
          vec_reset_length (pool_indexes_to_free);
-          pool_foreach_index (index, rt->pool)  {
-                                reass = pool_elt_at_index (rt->pool, index);
-                                if (now > reass->last_heard + rm->timeout)
-                                  {
-                                    vec_add1 (pool_indexes_to_free, index);
-                                  }
-                              }
+         pool_foreach_index (index, rt->pool)
+           {
+             reass = pool_elt_at_index (rt->pool, index);
+             if (now > reass->last_heard + rm->timeout)
+               {
+                 vec_add1 (pool_indexes_to_free, index);
+               }
+           }
          int *i;
-          vec_foreach (i, pool_indexes_to_free)
-          {
-            ip4_sv_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
-            ip4_sv_reass_free (vm, rm, rt, reass);
-          }
+         vec_foreach (i, pool_indexes_to_free)
+           {
+             ip4_sv_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
+             ip4_sv_reass_free (vm, rm, rt, reass, true);
+           }
 
          clib_spinlock_unlock (&rt->lock);
        }
@@ -1358,7 +1559,7 @@ VLIB_REGISTER_NODE (ip4_sv_reass_expire_node) = {
 };
 
 static u8 *
-format_ip4_sv_reass_key (u8 * s, va_list * args)
+format_ip4_sv_reass_key (u8 *s, va_list *args)
 {
   ip4_sv_reass_key_t *key = va_arg (*args, ip4_sv_reass_key_t *);
   s =
@@ -1369,37 +1570,35 @@ format_ip4_sv_reass_key (u8 * s, va_list * args)
 }
 
 static u8 *
-format_ip4_sv_reass (u8 * s, va_list * args)
+format_ip4_sv_reass (u8 *s, va_list *args)
 {
   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
   ip4_sv_reass_t *reass = va_arg (*args, ip4_sv_reass_t *);
 
-  s = format (s, "ID: %lu, key: %U trace_op_counter: %u\n",
-             reass->id, format_ip4_sv_reass_key, &reass->key,
-             reass->trace_op_counter);
+  s = format (s, "ID: %lu, key: %U trace_op_counter: %u\n", reass->id,
+             format_ip4_sv_reass_key, &reass->key, reass->trace_op_counter);
 
   vlib_buffer_t *b;
   u32 *bip;
   u32 counter = 0;
   vec_foreach (bip, reass->cached_buffers)
-  {
-    u32 bi = *bip;
-    do
-      {
-       b = vlib_get_buffer (vm, bi);
-       s = format (s, "  #%03u: bi: %u, ", counter, bi);
-       ++counter;
-       bi = b->next_buffer;
-      }
-    while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
-  }
+    {
+      u32 bi = *bip;
+      do
+       {
+         b = vlib_get_buffer (vm, bi);
+         s = format (s, "  #%03u: bi: %u, ", counter, bi);
+         ++counter;
+         bi = b->next_buffer;
+       }
+      while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
+    }
   return s;
 }
 
 static clib_error_t *
-show_ip4_reass (vlib_main_t * vm,
-               unformat_input_t * input,
-               CLIB_UNUSED (vlib_cli_command_t * lmd))
+show_ip4_reass (vlib_main_t *vm, unformat_input_t *input,
+               CLIB_UNUSED (vlib_cli_command_t *lmd))
 {
   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
 
@@ -1422,82 +1621,81 @@ show_ip4_reass (vlib_main_t * vm,
       clib_spinlock_lock (&rt->lock);
       if (details)
        {
-          pool_foreach (reass, rt->pool) {
-            vlib_cli_output (vm, "%U", format_ip4_sv_reass, vm, reass);
-          }
+         pool_foreach (reass, rt->pool)
+           {
+             vlib_cli_output (vm, "%U", format_ip4_sv_reass, vm, reass);
+           }
        }
-      sum_reass_n += rt->reass_n;
+      sum_reass_n += pool_elts (rt->pool);
       clib_spinlock_unlock (&rt->lock);
     }
   vlib_cli_output (vm, "---------------------");
   vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
   vlib_cli_output (vm,
-                  "Maximum configured concurrent shallow virtual IP4 reassemblies per worker-thread: %lu\n",
+                  "Maximum configured concurrent shallow virtual IP4 "
+                  "reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
   vlib_cli_output (vm,
                   "Maximum configured amount of fragments per shallow "
                   "virtual IP4 reassembly: %lu\n",
                   (long unsigned) rm->max_reass_len);
+  vlib_cli_output (
+    vm, "Maximum configured shallow virtual IP4 reassembly timeout: %lums\n",
+    (long unsigned) rm->timeout_ms);
   vlib_cli_output (vm,
-                  "Maximum configured shallow virtual IP4 reassembly timeout: %lums\n",
-                  (long unsigned) rm->timeout_ms);
-  vlib_cli_output (vm,
-                  "Maximum configured shallow virtual IP4 reassembly expire walk interval: %lums\n",
+                  "Maximum configured shallow virtual IP4 reassembly expire "
+                  "walk interval: %lums\n",
                   (long unsigned) rm->expire_walk_interval_ms);
+
   return 0;
 }
 
 VLIB_CLI_COMMAND (show_ip4_sv_reass_cmd, static) = {
-    .path = "show ip4-sv-reassembly",
-    .short_help = "show ip4-sv-reassembly [details]",
-    .function = show_ip4_reass,
+  .path = "show ip4-sv-reassembly",
+  .short_help = "show ip4-sv-reassembly [details]",
+  .function = show_ip4_reass,
 };
 
 #ifndef CLIB_MARCH_VARIANT
 vnet_api_error_t
 ip4_sv_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
 {
-  return ip4_sv_reass_enable_disable_with_refcnt (sw_if_index,
-                                                 enable_disable);
+  return ip4_sv_reass_enable_disable_with_refcnt (sw_if_index, enable_disable);
 }
 #endif /* CLIB_MARCH_VARIANT */
 
-
-#define foreach_ip4_sv_reass_handoff_error                       \
-_(CONGESTION_DROP, "congestion drop")
-
+#define foreach_ip4_sv_reass_handoff_error                                    \
+  _ (CONGESTION_DROP, "congestion drop")
 
 typedef enum
 {
-#define _(sym,str) IP4_SV_REASSEMBLY_HANDOFF_ERROR_##sym,
+#define _(sym, str) IP4_SV_REASSEMBLY_HANDOFF_ERROR_##sym,
   foreach_ip4_sv_reass_handoff_error
 #undef _
     IP4_SV_REASSEMBLY_HANDOFF_N_ERROR,
 } ip4_sv_reass_handoff_error_t;
 
 static char *ip4_sv_reass_handoff_error_strings[] = {
-#define _(sym,string) string,
+#define _(sym, string) string,
   foreach_ip4_sv_reass_handoff_error
 #undef _
 };
 
 typedef struct
 {
-  u32 next_worker_index;
+  u32 thread_index;
 } ip4_sv_reass_handoff_trace_t;
 
 static u8 *
-format_ip4_sv_reass_handoff_trace (u8 * s, va_list * args)
+format_ip4_sv_reass_handoff_trace (u8 *s, va_list *args)
 {
   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
   ip4_sv_reass_handoff_trace_t *t =
     va_arg (*args, ip4_sv_reass_handoff_trace_t *);
 
-  s =
-    format (s, "ip4-sv-reassembly-handoff: next-worker %d",
-           t->next_worker_index);
+  s = format (s, "to thread-index: %u", t->thread_index);
 
   return s;
 }
@@ -1539,13 +1737,12 @@ ip4_sv_reass_handoff_node_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
     {
       ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;
 
-      if (PREDICT_FALSE
-         ((node->flags & VLIB_NODE_FLAG_TRACE)
-          && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
+      if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) &&
+                        (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip4_sv_reass_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
-         t->next_worker_index = ti[0];
+         t->thread_index = ti[0];
        }
 
       n_left_from -= 1;
@@ -1560,15 +1757,14 @@ ip4_sv_reass_handoff_node_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
       vm, node, fq_index, from, thread_indices, frame->n_vectors, 1);
 
   if (n_enq < frame->n_vectors)
-    vlib_node_increment_counter (vm, node->node_index,
-                                IP4_SV_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
-                                frame->n_vectors - n_enq);
+    vlib_node_increment_counter (
+      vm, node->node_index, IP4_SV_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
+      frame->n_vectors - n_enq);
   return frame->n_vectors;
 }
 
-VLIB_NODE_FN (ip4_sv_reass_handoff_node) (vlib_main_t * vm,
-                                         vlib_node_runtime_t * node,
-                                         vlib_frame_t * frame)
+VLIB_NODE_FN (ip4_sv_reass_handoff_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 {
   return ip4_sv_reass_handoff_node_inline (
     vm, node, frame,
@@ -1616,10 +1812,8 @@ VLIB_REGISTER_NODE (ip4_sv_reass_custom_context_handoff_node) = {
   },
 };
 
-VLIB_NODE_FN (ip4_sv_reass_feature_handoff_node) (vlib_main_t * vm,
-                                                   vlib_node_runtime_t *
-                                                   node,
-                                                   vlib_frame_t * frame)
+VLIB_NODE_FN (ip4_sv_reass_feature_handoff_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 {
   return ip4_sv_reass_handoff_node_inline (
     vm, node, frame,
@@ -1676,10 +1870,10 @@ ip4_sv_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
     {
       if (!rm->feature_use_refcount_per_intf[sw_if_index])
        {
-         ++rm->feature_use_refcount_per_intf[sw_if_index];
-         return vnet_feature_enable_disable ("ip4-unicast",
-                                             "ip4-sv-reassembly-feature",
-                                             sw_if_index, 1, 0, 0);
+         int rv = vnet_feature_enable_disable (
+           "ip4-unicast", "ip4-sv-reassembly-feature", sw_if_index, 1, 0, 0);
+         if (0 != rv)
+           return rv;
        }
       ++rm->feature_use_refcount_per_intf[sw_if_index];
     }
@@ -1688,9 +1882,10 @@ ip4_sv_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
       if (rm->feature_use_refcount_per_intf[sw_if_index])
        --rm->feature_use_refcount_per_intf[sw_if_index];
       if (!rm->feature_use_refcount_per_intf[sw_if_index])
-       return vnet_feature_enable_disable ("ip4-unicast",
-                                           "ip4-sv-reassembly-feature",
-                                           sw_if_index, 0, 0, 0);
+       {
+         return vnet_feature_enable_disable (
+           "ip4-unicast", "ip4-sv-reassembly-feature", sw_if_index, 0, 0, 0);
+       }
     }
   return 0;
 }
@@ -1710,8 +1905,7 @@ ip4_sv_reass_custom_context_register_next_node (uword node_index)
 }
 
 int
-ip4_sv_reass_output_enable_disable_with_refcnt (u32 sw_if_index,
-                                               int is_enable)
+ip4_sv_reass_output_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
 {
   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
   vec_validate (rm->output_feature_use_refcount_per_intf, sw_if_index);
@@ -1719,10 +1913,11 @@ ip4_sv_reass_output_enable_disable_with_refcnt (u32 sw_if_index,
     {
       if (!rm->output_feature_use_refcount_per_intf[sw_if_index])
        {
-         ++rm->output_feature_use_refcount_per_intf[sw_if_index];
-         return vnet_feature_enable_disable ("ip4-output",
-                                             "ip4-sv-reassembly-output-feature",
-                                             sw_if_index, 1, 0, 0);
+         int rv = vnet_feature_enable_disable (
+           "ip4-output", "ip4-sv-reassembly-output-feature", sw_if_index, 1,
+           0, 0);
+         if (0 != rv)
+           return rv;
        }
       ++rm->output_feature_use_refcount_per_intf[sw_if_index];
     }
@@ -1731,12 +1926,66 @@ ip4_sv_reass_output_enable_disable_with_refcnt (u32 sw_if_index,
       if (rm->output_feature_use_refcount_per_intf[sw_if_index])
        --rm->output_feature_use_refcount_per_intf[sw_if_index];
       if (!rm->output_feature_use_refcount_per_intf[sw_if_index])
-       return vnet_feature_enable_disable ("ip4-output",
-                                           "ip4-sv-reassembly-output-feature",
-                                           sw_if_index, 0, 0, 0);
+       {
+         return vnet_feature_enable_disable (
+           "ip4-output", "ip4-sv-reassembly-output-feature", sw_if_index, 0,
+           0, 0);
+       }
     }
   return 0;
 }
+
+void
+ip4_sv_reass_enable_disable_extended (bool is_enable)
+{
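+  /* global refcount, not per-interface: every enable must be balanced by
+   * exactly one disable - there is no underflow check here */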
+  if (is_enable)
+    ++ip4_sv_reass_main.extended_refcount;
+  else
+    --ip4_sv_reass_main.extended_refcount;
+}
+
+int
+ip4_sv_reass_extended_lock (vlib_buffer_t *b,
+                           struct ip4_sv_lock_unlock_args *a)
+{
+  ip4_sv_reass_per_thread_t *per_thread =
+    &ip4_sv_reass_main
+       .per_thread_data[vnet_buffer2 (b)->ip.reass.thread_index];
+
+  if (!vec_is_member (ip4_sv_reass_main.per_thread_data, per_thread))
+    return -1;
+
+  clib_spinlock_lock (&per_thread->lock);
+  if (pool_is_free_index (per_thread->pool,
+                         vnet_buffer2 (b)->ip.reass.pool_index))
+    goto fail;
+
+  ip4_sv_reass_t *reass = pool_elt_at_index (
+    per_thread->pool, vnet_buffer2 (b)->ip.reass.pool_index);
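+  /* the id is a generation check: it catches a pool slot that was freed
+   * and reused for a different reassembly after this buffer was stamped */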
+  if (vnet_buffer2 (b)->ip.reass.id == reass->id)
+    {
+      *a->total_ip_payload_length = reass->total_ip_payload_length;
+
+      *a->first_fragment_buffer_index = reass->first_fragment_clone_bi;
+      *a->first_fragment_total_ip_header_length =
+       reass->first_fragment_total_ip_header_length;
+      return 0;
+    }
+
+fail:
+  clib_spinlock_unlock (&per_thread->lock);
+  return -1;
+}
+
+void
+ip4_sv_reass_extended_unlock (vlib_buffer_t *b)
+{
+  ip4_sv_reass_per_thread_t *per_thread =
+    &ip4_sv_reass_main
+       .per_thread_data[vnet_buffer2 (b)->ip.reass.thread_index];
+  clib_spinlock_unlock (&per_thread->lock);
+}
+
 #endif
 
 /*
index 3a684eb..a1e5659 100644 (file)
@@ -23,6 +23,7 @@
 #ifndef __included_ip4_sv_reass_h__
 #define __included_ip4_sv_reass_h__
 
+#include <stdbool.h>
 #include <vnet/api_errno.h>
 #include <vnet/vnet.h>
 
@@ -48,6 +49,33 @@ int ip4_sv_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable);
 int ip4_sv_reass_output_enable_disable_with_refcnt (u32 sw_if_index,
                                                    int is_enable);
 
+/*
+ * Enable or disable extended reassembly.
+ *
+ * Extended reassembly means that fragments are cached until both the first
+ * and the last fragment have been seen. Furthermore, the first fragment
+ * buffer is cloned and stored in the reassembly context for later retrieval.
+ */
+void ip4_sv_reass_enable_disable_extended (bool is_enable);
+
+struct ip4_sv_lock_unlock_args
+{
+  u32 *total_ip_payload_length;
+  u32 *first_fragment_buffer_index;
+  u32 *first_fragment_total_ip_header_length;
+};
+
+/*
+ * Take the per-thread lock and fetch information from the reassembly
+ * context. Uses the vnet_buffer2 data filled in by extended reassembly.
+ *
+ * Returns 0 on success, -1 otherwise.
+ */
+int ip4_sv_reass_extended_lock (vlib_buffer_t *b,
+                               struct ip4_sv_lock_unlock_args *a);
+
+void ip4_sv_reass_extended_unlock (vlib_buffer_t *b);
+
 uword ip4_sv_reass_custom_register_next_node (uword node_index);
 uword ip4_sv_reass_custom_context_register_next_node (uword node_index);
 
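For illustration only (not part of the patch): a minimal sketch of how a
downstream feature could consume the extended API above. The function name
my_feature_inspect_fragment and its surrounding dispatch logic are
hypothetical; only the ip4_sv_reass_* calls, the argument struct and the
~0 sentinel semantics come from this change.

#include <vlib/vlib.h>
#include <vnet/ip/reass/ip4_sv_reass.h>

static int
my_feature_inspect_fragment (vlib_main_t *vm, vlib_buffer_t *b)
{
  u32 total_ip_payload_length, first_bi, first_ip_hdr_len;
  struct ip4_sv_lock_unlock_args args = {
    .total_ip_payload_length = &total_ip_payload_length,
    .first_fragment_buffer_index = &first_bi,
    .first_fragment_total_ip_header_length = &first_ip_hdr_len,
  };

  /* fails if the reassembly context expired or its slot was reused;
   * the lock is NOT held on failure, so do not unlock in that case */
  if (0 != ip4_sv_reass_extended_lock (b, &args))
    return -1;

  if (~0 != first_bi)
    {
      /* clone of the first fragment - full L3/L4 headers can be parsed
       * here while the per-thread lock is held */
      vlib_buffer_t *first = vlib_get_buffer (vm, first_bi);
      (void) first;
    }
  (void) total_ip_payload_length;
  (void) first_ip_hdr_len;

  ip4_sv_reass_extended_unlock (b);
  return 0;
}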
index 7d60e6a..69b27c5 100644 (file)
 #include <vnet/ip/reass/ip6_sv_reass.h>
 #include <vnet/ip/ip6_inlines.h>
 
-#define MSEC_PER_SEC 1000
+#define MSEC_PER_SEC                   1000
 #define IP6_SV_REASS_TIMEOUT_DEFAULT_MS 100
-#define IP6_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000     // 10 seconds default
-#define IP6_SV_REASS_MAX_REASSEMBLIES_DEFAULT 1024
+#define IP6_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS                          \
+  10000 // 10 seconds default
+#define IP6_SV_REASS_MAX_REASSEMBLIES_DEFAULT     1024
 #define IP6_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
-#define IP6_SV_REASS_HT_LOAD_FACTOR (0.75)
+#define IP6_SV_REASS_HT_LOAD_FACTOR               (0.75)
 
 typedef enum
 {
@@ -94,17 +95,23 @@ typedef struct
   // buffer indexes of buffers in this reassembly in chronological order -
   // including overlaps and duplicate fragments
   u32 *cached_buffers;
-  // set to true when this reassembly is completed
-  bool is_complete;
-  // ip protocol
+
+  bool first_fragment_seen;
+  bool last_fragment_seen;
+
+  // vnet_buffer data
   u8 ip_proto;
   u8 icmp_type_or_tcp_flags;
   u32 tcp_ack_number;
   u32 tcp_seq_number;
-  // l4 src port
   u16 l4_src_port;
-  // l4 dst port
   u16 l4_dst_port;
+
+  // vnet_buffer2 data
+  u32 total_ip_payload_length;
+  u32 first_fragment_total_ip_header_length;
+  u32 first_fragment_clone_bi;
+
   // lru indexes
   u32 lru_prev;
   u32 lru_next;
@@ -142,9 +149,6 @@ typedef struct
   vlib_main_t *vlib_main;
   vnet_main_t *vnet_main;
 
-  // node index of ip6-drop node
-  u32 ip6_drop_idx;
-  u32 ip6_icmp_error_idx;
   u32 ip6_sv_reass_expire_node_idx;
 
   /** Worker handoff */
@@ -157,6 +161,9 @@ typedef struct
   u32 *feature_use_refcount_per_intf;
   // reference count for enabling/disabling output feature - per interface
   u32 *output_feature_use_refcount_per_intf;
+
+  // extended reassembly refcount - see ip6_sv_reass_enable_disable_extended()
+  u32 extended_refcount;
 } ip6_sv_reass_main_t;
 
 extern ip6_sv_reass_main_t ip6_sv_reass_main;
@@ -177,7 +184,8 @@ typedef enum
 typedef enum
 {
   REASS_FRAGMENT_CACHE,
-  REASS_FINISH,
+  REASS_FIRST_FRAG,
+  REASS_LAST_FRAG,
   REASS_FRAGMENT_FORWARD,
   REASS_PASSTHROUGH,
 } ip6_sv_reass_trace_operation_e;
@@ -193,7 +201,7 @@ typedef struct
 } ip6_sv_reass_trace_t;
 
 static u8 *
-format_ip6_sv_reass_trace (u8 * s, va_list * args)
+format_ip6_sv_reass_trace (u8 *s, va_list *args)
 {
   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
@@ -207,17 +215,19 @@ format_ip6_sv_reass_trace (u8 * s, va_list * args)
     case REASS_FRAGMENT_CACHE:
       s = format (s, "[cached]");
       break;
-    case REASS_FINISH:
+    case REASS_FIRST_FRAG:
       s =
-       format (s, "[finish, ip proto=%u, src_port=%u, dst_port=%u]",
+       format (s, "[first-frag-seen, ip proto=%u, src_port=%u, dst_port=%u]",
                t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
                clib_net_to_host_u16 (t->l4_dst_port));
       break;
+    case REASS_LAST_FRAG:
+      s = format (s, "[last-frag-seen]");
+      break;
     case REASS_FRAGMENT_FORWARD:
-      s =
-       format (s, "[forward, ip proto=%u, src_port=%u, dst_port=%u]",
-               t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
-               clib_net_to_host_u16 (t->l4_dst_port));
+      s = format (s, "[forward, ip proto=%u, src_port=%u, dst_port=%u]",
+                 t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
+                 clib_net_to_host_u16 (t->l4_dst_port));
       break;
     case REASS_PASSTHROUGH:
       s = format (s, "[not fragmented or atomic fragment]");
@@ -227,14 +237,14 @@ format_ip6_sv_reass_trace (u8 * s, va_list * args)
 }
 
 static void
-ip6_sv_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
-                       ip6_sv_reass_t * reass, u32 bi,
-                       ip6_sv_reass_trace_operation_e action,
-                       u32 ip_proto, u16 l4_src_port, u16 l4_dst_port)
+ip6_sv_reass_add_trace (vlib_main_t *vm, vlib_node_runtime_t *node,
+                       ip6_sv_reass_t *reass, u32 bi,
+                       ip6_sv_reass_trace_operation_e action, u32 ip_proto,
+                       u16 l4_src_port, u16 l4_dst_port)
 {
   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
-  if (pool_is_free_index
-      (vm->trace_main.trace_buffer_pool, vlib_buffer_get_trace_index (b)))
+  if (pool_is_free_index (vm->trace_main.trace_buffer_pool,
+                         vlib_buffer_get_trace_index (b)))
     {
       // this buffer's trace is gone
       b->flags &= ~VLIB_BUFFER_IS_TRACED;
@@ -261,31 +271,35 @@ ip6_sv_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
 }
 
 always_inline void
-ip6_sv_reass_free (vlib_main_t * vm, ip6_sv_reass_main_t * rm,
-                  ip6_sv_reass_per_thread_t * rt, ip6_sv_reass_t * reass)
+ip6_sv_reass_free (vlib_main_t *vm, ip6_sv_reass_main_t *rm,
+                  ip6_sv_reass_per_thread_t *rt, ip6_sv_reass_t *reass,
+                  bool del_bihash)
 {
-  clib_bihash_kv_48_8_t kv;
-  kv.key[0] = reass->key.as_u64[0];
-  kv.key[1] = reass->key.as_u64[1];
-  kv.key[2] = reass->key.as_u64[2];
-  kv.key[3] = reass->key.as_u64[3];
-  kv.key[4] = reass->key.as_u64[4];
-  kv.key[5] = reass->key.as_u64[5];
-  clib_bihash_add_del_48_8 (&rm->hash, &kv, 0);
+  if (del_bihash)
+    {
+      clib_bihash_kv_48_8_t kv;
+      kv.key[0] = reass->key.as_u64[0];
+      kv.key[1] = reass->key.as_u64[1];
+      kv.key[2] = reass->key.as_u64[2];
+      kv.key[3] = reass->key.as_u64[3];
+      kv.key[4] = reass->key.as_u64[4];
+      kv.key[5] = reass->key.as_u64[5];
+      clib_bihash_add_del_48_8 (&rm->hash, &kv, 0);
+    }
   vlib_buffer_free (vm, reass->cached_buffers,
                    vec_len (reass->cached_buffers));
   vec_free (reass->cached_buffers);
   reass->cached_buffers = NULL;
+  if (~0 != reass->first_fragment_clone_bi)
+    vlib_buffer_free_one (vm, reass->first_fragment_clone_bi);
   if (~0 != reass->lru_prev)
     {
-      ip6_sv_reass_t *lru_prev =
-       pool_elt_at_index (rt->pool, reass->lru_prev);
+      ip6_sv_reass_t *lru_prev = pool_elt_at_index (rt->pool, reass->lru_prev);
       lru_prev->lru_next = reass->lru_next;
     }
   if (~0 != reass->lru_next)
     {
-      ip6_sv_reass_t *lru_next =
-       pool_elt_at_index (rt->pool, reass->lru_next);
+      ip6_sv_reass_t *lru_next = pool_elt_at_index (rt->pool, reass->lru_next);
       lru_next->lru_prev = reass->lru_prev;
     }
   if (rt->lru_first == reass - rt->pool)
@@ -300,13 +314,6 @@ ip6_sv_reass_free (vlib_main_t * vm, ip6_sv_reass_main_t * rm,
   --rt->reass_n;
 }
 
-always_inline void
-ip6_sv_reass_init (ip6_sv_reass_t * reass)
-{
-  reass->cached_buffers = NULL;
-  reass->is_complete = false;
-}
-
 always_inline ip6_sv_reass_t *
 ip6_sv_reass_find_or_create (vlib_main_t *vm, ip6_sv_reass_main_t *rm,
                             ip6_sv_reass_per_thread_t *rt,
@@ -328,7 +335,7 @@ again:
 
       if (now > reass->last_heard + rm->timeout)
        {
-         ip6_sv_reass_free (vm, rm, rt, reass);
+         ip6_sv_reass_free (vm, rm, rt, reass, true);
          reass = NULL;
        }
     }
@@ -339,19 +346,17 @@ again:
       return reass;
     }
 
-  if (rt->reass_n >= rm->max_reass_n)
+  if (rt->reass_n >= rm->max_reass_n && rm->max_reass_n)
     {
       reass = pool_elt_at_index (rt->pool, rt->lru_first);
-      ip6_sv_reass_free (vm, rm, rt, reass);
+      ip6_sv_reass_free (vm, rm, rt, reass, true);
     }
 
-  pool_get (rt->pool, reass);
-  clib_memset (reass, 0, sizeof (*reass));
+  pool_get_zero (rt->pool, reass);
+  reass->first_fragment_clone_bi = ~0;
   reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
   ++rt->id_counter;
-  ip6_sv_reass_init (reass);
   ++rt->reass_n;
-
   reass->lru_prev = reass->lru_next = ~0;
 
   if (~0 != rt->lru_last)
@@ -379,7 +384,7 @@ again:
   int rv = clib_bihash_add_del_48_8 (&rm->hash, &kv->kv, 2);
   if (rv)
     {
-      ip6_sv_reass_free (vm, rm, rt, reass);
+      ip6_sv_reass_free (vm, rm, rt, reass, false);
       reass = NULL;
       // if other worker created a context already work with the other copy
       if (-2 == rv)
@@ -389,10 +394,23 @@ again:
   return reass;
 }
 
+always_inline bool
+ip6_sv_reass_is_complete (ip6_sv_reass_t *reass, bool extended)
+{
+  /*
+   * For extended reassembly, both the first and the last fragment have to
+   * be seen before the context is complete. Otherwise the first fragment
+   * alone is enough.
+   */
+  if (extended)
+    return reass->first_fragment_seen && reass->last_fragment_seen;
+
+  return reass->first_fragment_seen;
+}
+
 always_inline ip6_sv_reass_rc_t
 ip6_sv_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
                     ip6_sv_reass_main_t *rm, ip6_sv_reass_t *reass, u32 bi0,
-                    ip6_frag_hdr_t *frag_hdr)
+                    ip6_frag_hdr_t *frag_hdr, bool extended)
 {
   vlib_buffer_t *fb = vlib_get_buffer (vm, bi0);
   vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
@@ -420,26 +438,51 @@ ip6_sv_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
   fvnb->ip.reass.range_first = fragment_first;
   fvnb->ip.reass.range_last = fragment_last;
   fvnb->ip.reass.next_range_bi = ~0;
+  void *l4_hdr = NULL;
   if (0 == fragment_first)
     {
-      if (!ip6_get_port
-         (vm, fb, fip, fb->current_length, &reass->ip_proto,
-          &reass->l4_src_port, &reass->l4_dst_port,
-          &reass->icmp_type_or_tcp_flags, &reass->tcp_ack_number,
-          &reass->tcp_seq_number))
+      if (!ip6_get_port (vm, fb, fip, fb->current_length, &reass->ip_proto,
+                        &reass->l4_src_port, &reass->l4_dst_port,
+                        &reass->icmp_type_or_tcp_flags,
+                        &reass->tcp_ack_number, &reass->tcp_seq_number,
+                        &l4_hdr))
        return IP6_SV_REASS_RC_UNSUPP_IP_PROTO;
 
-      reass->is_complete = true;
+      reass->first_fragment_seen = true;
+      if (extended)
+       {
+         reass->first_fragment_total_ip_header_length =
+           (u8 *) l4_hdr - (u8 *) fip;
+         vlib_buffer_t *clone = vlib_buffer_copy_no_chain (
+           vm, fb, &reass->first_fragment_clone_bi);
+         if (!clone)
+           reass->first_fragment_clone_bi = ~0;
+       }
+
       vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
       if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
-         ip6_sv_reass_add_trace (vm, node, reass, bi0, REASS_FINISH,
+         ip6_sv_reass_add_trace (vm, node, reass, bi0, REASS_FIRST_FRAG,
                                  reass->ip_proto, reass->l4_src_port,
                                  reass->l4_dst_port);
        }
     }
+
+  if (!ip6_frag_hdr_more (frag_hdr))
+    {
+      reass->last_fragment_seen = true;
+      reass->total_ip_payload_length = fragment_last - 1;
+      vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
+      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+       {
+         ip6_sv_reass_add_trace (vm, node, reass, bi0, REASS_LAST_FRAG, ~0,
+                                 ~0, ~0);
+       }
+    }
+
   vec_add1 (reass->cached_buffers, bi0);
-  if (!reass->is_complete)
+
+  if (!ip6_sv_reass_is_complete (reass, extended))
     {
       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
        {
@@ -472,9 +515,8 @@ ip6_sv_reass_verify_upper_layer_present (vlib_buffer_t *b,
 }
 
 always_inline bool
-ip6_sv_reass_verify_fragment_multiple_8 (vlib_main_t * vm,
-                                        vlib_buffer_t * b,
-                                        ip6_frag_hdr_t * frag_hdr)
+ip6_sv_reass_verify_fragment_multiple_8 (vlib_main_t *vm, vlib_buffer_t *b,
+                                        ip6_frag_hdr_t *frag_hdr)
 {
   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
   ip6_header_t *ip = vlib_buffer_get_current (b);
@@ -484,18 +526,18 @@ ip6_sv_reass_verify_fragment_multiple_8 (vlib_main_t * vm,
     (vnb->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr));
   if (more_fragments && 0 != fragment_length % 8)
     {
-      icmp6_error_set_vnet_buffer (b, ICMP6_parameter_problem,
-                                  ICMP6_parameter_problem_erroneous_header_field,
-                                  (u8 *) & ip->payload_length - (u8 *) ip);
+      icmp6_error_set_vnet_buffer (
+       b, ICMP6_parameter_problem,
+       ICMP6_parameter_problem_erroneous_header_field,
+       (u8 *) &ip->payload_length - (u8 *) ip);
       return false;
     }
   return true;
 }
 
 always_inline bool
-ip6_sv_reass_verify_packet_size_lt_64k (vlib_main_t * vm,
-                                       vlib_buffer_t * b,
-                                       ip6_frag_hdr_t * frag_hdr)
+ip6_sv_reass_verify_packet_size_lt_64k (vlib_main_t *vm, vlib_buffer_t *b,
+                                       ip6_frag_hdr_t *frag_hdr)
 {
   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
   u32 fragment_first = ip6_frag_hdr_offset_bytes (frag_hdr);
@@ -505,21 +547,40 @@ ip6_sv_reass_verify_packet_size_lt_64k (vlib_main_t * vm,
   if (fragment_first + fragment_length > 65535)
     {
       ip6_header_t *ip0 = vlib_buffer_get_current (b);
-      icmp6_error_set_vnet_buffer (b, ICMP6_parameter_problem,
-                                  ICMP6_parameter_problem_erroneous_header_field,
-                                  (u8 *) & frag_hdr->fragment_offset_and_more
-                                  - (u8 *) ip0);
+      icmp6_error_set_vnet_buffer (
+       b, ICMP6_parameter_problem,
+       ICMP6_parameter_problem_erroneous_header_field,
+       (u8 *) &frag_hdr->fragment_offset_and_more - (u8 *) ip0);
       return false;
     }
   return true;
 }
 
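+/* ~0 in all three fields marks a buffer carrying no extended reassembly
+ * context */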
+always_inline void
+ip6_sv_reass_reset_vnet_buffer2 (vlib_buffer_t *b)
+{
+  vnet_buffer2 (b)->ip.reass.pool_index = ~0;
+  vnet_buffer2 (b)->ip.reass.thread_index = ~0;
+  vnet_buffer2 (b)->ip.reass.id = ~0;
+}
+
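+/* stamp the buffer with the coordinates of its reassembly context (thread,
+ * pool slot, generation id) so features can later find it via the
+ * extended-lock API */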
+always_inline void
+ip6_sv_reass_set_vnet_buffer2_from_reass (vlib_main_t *vm, vlib_buffer_t *b,
+                                         ip6_sv_reass_t *reass)
+{
+  vnet_buffer2 (b)->ip.reass.thread_index = vm->thread_index;
+  vnet_buffer2 (b)->ip.reass.id = reass->id;
+  vnet_buffer2 (b)->ip.reass.pool_index =
+    reass - ip6_sv_reass_main.per_thread_data[vm->thread_index].pool;
+}
+
 struct ip6_sv_reass_args
 {
   bool is_feature;
   bool is_output_feature;
   bool custom_next;
   bool custom_context;
+  bool extended;
 };
 
 always_inline uword
@@ -581,21 +642,26 @@ ip6_sv_reassembly_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
              hdr_chain.eh[res].protocol != IP_PROTOCOL_IPV6_FRAGMENTATION ||
              is_atomic_fragment)
            {
-             // this is a regular unfragmented packet or an atomic fragment
-             if (!ip6_get_port
-                 (vm, b0, ip0, b0->current_length,
-                  &(vnet_buffer (b0)->ip.reass.ip_proto),
-                  &(vnet_buffer (b0)->ip.reass.l4_src_port),
-                  &(vnet_buffer (b0)->ip.reass.l4_dst_port),
-                  &(vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags),
-                  &(vnet_buffer (b0)->ip.reass.tcp_ack_number),
-                  &(vnet_buffer (b0)->ip.reass.tcp_seq_number)))
+             void *l4_hdr;
+             // this is a regular unfragmented packet or an atomic
+             // fragment
+             if (!ip6_get_port (
+                   vm, b0, ip0, b0->current_length,
+                   &(vnet_buffer (b0)->ip.reass.ip_proto),
+                   &(vnet_buffer (b0)->ip.reass.l4_src_port),
+                   &(vnet_buffer (b0)->ip.reass.l4_dst_port),
+                   &(vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags),
+                   &(vnet_buffer (b0)->ip.reass.tcp_ack_number),
+                   &(vnet_buffer (b0)->ip.reass.tcp_seq_number), &l4_hdr))
                {
                  error0 = IP6_ERROR_REASS_UNSUPP_IP_PROTO;
                  b0->error = node->errors[error0];
                  next0 = IP6_SV_REASSEMBLY_NEXT_DROP;
                  goto packet_enqueue;
                }
+             if (a.extended)
+               ip6_sv_reass_reset_vnet_buffer2 (b0);
+             vnet_buffer (b0)->ip.reass.l4_hdr_truncated = 0;
              vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
              next0 = a.custom_next ? vnet_buffer (b0)->ip.reass.next_index :
                                      IP6_SV_REASSEMBLY_NEXT_INPUT;
@@ -672,10 +738,11 @@ ip6_sv_reassembly_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
              goto packet_enqueue;
            }
 
-         if (reass->is_complete)
+         if (ip6_sv_reass_is_complete (reass, a.extended))
            {
+             vnet_buffer (b0)->ip.reass.l4_hdr_truncated = 0;
              vnet_buffer (b0)->ip.reass.is_non_first_fragment =
-               ! !ip6_frag_hdr_offset (frag_hdr);
+               !!ip6_frag_hdr_offset (frag_hdr);
              vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                reass->icmp_type_or_tcp_flags;
@@ -685,6 +752,10 @@ ip6_sv_reassembly_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
                reass->tcp_seq_number;
              vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
              vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
+
+             if (a.extended)
+               ip6_sv_reass_set_vnet_buffer2_from_reass (vm, b0, reass);
+
              next0 = a.custom_next ? vnet_buffer (b0)->ip.reass.next_index :
                                      IP6_SV_REASSEMBLY_NEXT_INPUT;
              if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
@@ -697,7 +768,8 @@ ip6_sv_reassembly_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
            }
 
          u32 counter = ~0;
-         switch (ip6_sv_reass_update (vm, node, rm, reass, bi0, frag_hdr))
+         switch (ip6_sv_reass_update (vm, node, rm, reass, bi0, frag_hdr,
+                                      a.extended))
            {
            case IP6_SV_REASS_RC_OK:
              /* nothing to do here */
@@ -718,55 +790,57 @@ ip6_sv_reassembly_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
          if (~0 != counter)
            {
              vlib_node_increment_counter (vm, node->node_index, counter, 1);
-             ip6_sv_reass_free (vm, rm, rt, reass);
+             ip6_sv_reass_free (vm, rm, rt, reass, true);
              goto next_packet;
            }
 
-         if (reass->is_complete)
+         if (ip6_sv_reass_is_complete (reass, a.extended))
            {
              u32 idx;
              vec_foreach_index (idx, reass->cached_buffers)
-             {
-               u32 bi0 = vec_elt (reass->cached_buffers, idx);
-               if (0 == n_left_to_next)
-                 {
-                   vlib_put_next_frame (vm, node, next_index,
-                                        n_left_to_next);
-                   vlib_get_next_frame (vm, node, next_index, to_next,
-                                        n_left_to_next);
-                 }
-               to_next[0] = bi0;
-               to_next += 1;
-               n_left_to_next -= 1;
-               b0 = vlib_get_buffer (vm, bi0);
-               if (a.is_feature || a.is_output_feature)
-                 {
-                   vnet_feature_next (&next0, b0);
-                 }
-               frag_hdr =
-                 vlib_buffer_get_current (b0) +
-                 vnet_buffer (b0)->ip.reass.ip6_frag_hdr_offset;
-               vnet_buffer (b0)->ip.reass.is_non_first_fragment =
-                 ! !ip6_frag_hdr_offset (frag_hdr);
-               vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
-               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
-                 reass->icmp_type_or_tcp_flags;
-               vnet_buffer (b0)->ip.reass.tcp_ack_number =
-                 reass->tcp_ack_number;
-               vnet_buffer (b0)->ip.reass.tcp_seq_number =
-                 reass->tcp_seq_number;
-               vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
-               vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
-               if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
-                 {
-                   ip6_sv_reass_add_trace (
-                     vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
-                     reass->ip_proto, reass->l4_src_port, reass->l4_dst_port);
-                 }
-               vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
-                                                to_next, n_left_to_next, bi0,
-                                                next0);
-             }
+               {
+                 u32 bi0 = vec_elt (reass->cached_buffers, idx);
+                 if (0 == n_left_to_next)
+                   {
+                     vlib_put_next_frame (vm, node, next_index,
+                                          n_left_to_next);
+                     vlib_get_next_frame (vm, node, next_index, to_next,
+                                          n_left_to_next);
+                   }
+                 to_next[0] = bi0;
+                 to_next += 1;
+                 n_left_to_next -= 1;
+                 b0 = vlib_get_buffer (vm, bi0);
+                 if (a.is_feature || a.is_output_feature)
+                   {
+                     vnet_feature_next (&next0, b0);
+                   }
+                 frag_hdr = vlib_buffer_get_current (b0) +
+                            vnet_buffer (b0)->ip.reass.ip6_frag_hdr_offset;
+                 vnet_buffer (b0)->ip.reass.l4_hdr_truncated = 0;
+                 vnet_buffer (b0)->ip.reass.is_non_first_fragment =
+                   !!ip6_frag_hdr_offset (frag_hdr);
+                 vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
+                 vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
+                   reass->icmp_type_or_tcp_flags;
+                 vnet_buffer (b0)->ip.reass.tcp_ack_number =
+                   reass->tcp_ack_number;
+                 vnet_buffer (b0)->ip.reass.tcp_seq_number =
+                   reass->tcp_seq_number;
+                 vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
+                 vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
+                 if (a.extended)
+                   ip6_sv_reass_set_vnet_buffer2_from_reass (vm, b0, reass);
+                 if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+                   {
+                     ip6_sv_reass_add_trace (
+                       vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
+                       reass->ip_proto, reass->l4_src_port,
+                       reass->l4_dst_port);
+                   }
+                 vlib_validate_buffer_enqueue_x1 (
+                   vm, node, next_index, to_next, n_left_to_next, bi0, next0);
+               }
              vec_set_len (reass->cached_buffers,
                           0); // buffers are owned by frame now
            }
@@ -815,12 +889,16 @@ ip6_sv_reassembly_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
 VLIB_NODE_FN (ip6_sv_reass_node)
 (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 {
+  /*
+   * Extended reassembly is not supported for non-feature nodes.
+   */
   return ip6_sv_reassembly_inline (vm, node, frame,
                                   (struct ip6_sv_reass_args){
                                     .is_feature = false,
                                     .is_output_feature = false,
                                     .custom_context = false,
                                     .custom_next = false,
+                                    .extended = false,
                                   });
 }
 
@@ -840,16 +918,25 @@ VLIB_REGISTER_NODE (ip6_sv_reass_node) = {
         },
 };
 
-VLIB_NODE_FN (ip6_sv_reass_node_feature) (vlib_main_t * vm,
-                                         vlib_node_runtime_t * node,
-                                         vlib_frame_t * frame)
+VLIB_NODE_FN (ip6_sv_reass_node_feature)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 {
+  if (ip6_sv_reass_main.extended_refcount > 0)
+    return ip6_sv_reassembly_inline (vm, node, frame,
+                                    (struct ip6_sv_reass_args){
+                                      .is_feature = true,
+                                      .is_output_feature = false,
+                                      .custom_context = false,
+                                      .custom_next = false,
+                                      .extended = true,
+                                    });
   return ip6_sv_reassembly_inline (vm, node, frame,
                                   (struct ip6_sv_reass_args){
                                     .is_feature = true,
                                     .is_output_feature = false,
                                     .custom_context = false,
                                     .custom_next = false,
+                                    .extended = false,
                                   });
 }
 
@@ -870,21 +957,31 @@ VLIB_REGISTER_NODE (ip6_sv_reass_node_feature) = {
 };
 
 VNET_FEATURE_INIT (ip6_sv_reassembly_feature) = {
-    .arc_name = "ip6-unicast",
-    .node_name = "ip6-sv-reassembly-feature",
-    .runs_before = VNET_FEATURES ("ip6-lookup"),
-    .runs_after = 0,
+  .arc_name = "ip6-unicast",
+  .node_name = "ip6-sv-reassembly-feature",
+  .runs_before = VNET_FEATURES ("ip6-lookup"),
+  .runs_after = 0,
 };
 
 VLIB_NODE_FN (ip6_sv_reass_node_output_feature)
 (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 {
+  if (ip6_sv_reass_main.extended_refcount > 0)
+    return ip6_sv_reassembly_inline (vm, node, frame,
+                                    (struct ip6_sv_reass_args){
+                                      .is_feature = false,
+                                      .is_output_feature = true,
+                                      .custom_context = false,
+                                      .custom_next = false,
+                                      .extended = true,
+                                    });
   return ip6_sv_reassembly_inline (vm, node, frame,
                                   (struct ip6_sv_reass_args){
                                     .is_feature = false,
                                     .is_output_feature = true,
                                     .custom_context = false,
                                     .custom_next = false,
+                                    .extended = false,
                                   });
 }
 
@@ -913,12 +1010,16 @@ VNET_FEATURE_INIT (ip6_sv_reassembly_output_feature) = {
 VLIB_NODE_FN (ip6_sv_reass_custom_context_node)
 (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 {
+  /*
+   * Extended reassembly is not supported for non-feature nodes.
+   */
   return ip6_sv_reassembly_inline (vm, node, frame,
                                   (struct ip6_sv_reass_args){
                                     .is_feature = false,
                                     .is_output_feature = false,
                                     .custom_context = true,
                                     .custom_next = true,
+                                    .extended = false,
                                   });
 }
 
@@ -971,7 +1072,7 @@ typedef struct
 } ip6_rehash_cb_ctx;
 
 static int
-ip6_rehash_cb (clib_bihash_kv_48_8_t * kv, void *_ctx)
+ip6_rehash_cb (clib_bihash_kv_48_8_t *kv, void *_ctx)
 {
   ip6_rehash_cb_ctx *ctx = _ctx;
   if (clib_bihash_add_del_48_8 (ctx->new_hash, kv, 1))
@@ -998,8 +1099,8 @@ ip6_sv_reass_set (u32 timeout_ms, u32 max_reassemblies,
                  u32 max_reassembly_length, u32 expire_walk_interval_ms)
 {
   u32 old_nbuckets = ip6_sv_reass_get_nbuckets ();
-  ip6_sv_reass_set_params (timeout_ms, max_reassemblies,
-                          max_reassembly_length, expire_walk_interval_ms);
+  ip6_sv_reass_set_params (timeout_ms, max_reassemblies, max_reassembly_length,
+                          expire_walk_interval_ms);
   vlib_process_signal_event (ip6_sv_reass_main.vlib_main,
                             ip6_sv_reass_main.ip6_sv_reass_expire_node_idx,
                             IP6_EVENT_CONFIG_CHANGED, 0);
@@ -1032,8 +1133,8 @@ ip6_sv_reass_set (u32 timeout_ms, u32 max_reassemblies,
 }
 
 vnet_api_error_t
-ip6_sv_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
-                 u32 * max_reassembly_length, u32 * expire_walk_interval_ms)
+ip6_sv_reass_get (u32 *timeout_ms, u32 *max_reassemblies,
+                 u32 *max_reassembly_length, u32 *expire_walk_interval_ms)
 {
   *timeout_ms = ip6_sv_reass_main.timeout_ms;
   *max_reassemblies = ip6_sv_reass_main.max_reass_n;
@@ -1043,7 +1144,7 @@ ip6_sv_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
 }
 
 static clib_error_t *
-ip6_sv_reass_init_function (vlib_main_t * vm)
+ip6_sv_reass_init_function (vlib_main_t *vm)
 {
   ip6_sv_reass_main_t *rm = &ip6_sv_reass_main;
   clib_error_t *error = 0;
@@ -1056,11 +1157,11 @@ ip6_sv_reass_init_function (vlib_main_t * vm)
   vec_validate (rm->per_thread_data, vlib_num_workers ());
   ip6_sv_reass_per_thread_t *rt;
   vec_foreach (rt, rm->per_thread_data)
-  {
-    clib_spinlock_init (&rt->lock);
-    pool_alloc (rt->pool, rm->max_reass_n);
-    rt->lru_first = rt->lru_last = ~0;
-  }
+    {
+      clib_spinlock_init (&rt->lock);
+      pool_alloc (rt->pool, rm->max_reass_n);
+      rt->lru_first = rt->lru_last = ~0;
+    }
 
   node = vlib_get_node_by_name (vm, (u8 *) "ip6-sv-reassembly-expire-walk");
   ASSERT (node);
@@ -1075,13 +1176,6 @@ ip6_sv_reass_init_function (vlib_main_t * vm)
   clib_bihash_init_48_8 (&rm->hash, "ip6-sv-reass", nbuckets,
                         (uword) nbuckets * 1024);
 
-  node = vlib_get_node_by_name (vm, (u8 *) "ip6-drop");
-  ASSERT (node);
-  rm->ip6_drop_idx = node->index;
-  node = vlib_get_node_by_name (vm, (u8 *) "ip6-icmp-error");
-  ASSERT (node);
-  rm->ip6_icmp_error_idx = node->index;
-
   if ((error = vlib_call_init_function (vm, ip_main_init)))
     return error;
 
@@ -1111,9 +1205,8 @@ ip6_sv_reass_walk_expired (vlib_main_t *vm,
 
   while (true)
     {
-      vlib_process_wait_for_event_or_clock (vm,
-                                           (f64) rm->expire_walk_interval_ms
-                                           / (f64) MSEC_PER_SEC);
+      vlib_process_wait_for_event_or_clock (
+       vm, (f64) rm->expire_walk_interval_ms / (f64) MSEC_PER_SEC);
       event_type = vlib_process_get_events (vm, &event_data);
 
       switch (event_type)
@@ -1142,19 +1235,20 @@ ip6_sv_reass_walk_expired (vlib_main_t *vm,
          clib_spinlock_lock (&rt->lock);
 
          vec_reset_length (pool_indexes_to_free);
-          pool_foreach_index (index, rt->pool)  {
-                                reass = pool_elt_at_index (rt->pool, index);
-                                if (now > reass->last_heard + rm->timeout)
-                                  {
-                                    vec_add1 (pool_indexes_to_free, index);
-                                  }
-                              }
+         pool_foreach_index (index, rt->pool)
+           {
+             reass = pool_elt_at_index (rt->pool, index);
+             if (now > reass->last_heard + rm->timeout)
+               {
+                 vec_add1 (pool_indexes_to_free, index);
+               }
+           }
          int *i;
-          vec_foreach (i, pool_indexes_to_free)
-          {
-            ip6_sv_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
-            ip6_sv_reass_free (vm, rm, rt, reass);
-          }
+         vec_foreach (i, pool_indexes_to_free)
+           {
+             ip6_sv_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
+             ip6_sv_reass_free (vm, rm, rt, reass, true);
+           }
 
          clib_spinlock_unlock (&rt->lock);
        }
@@ -1180,7 +1274,7 @@ VLIB_REGISTER_NODE (ip6_sv_reass_expire_node) = {
 };
 
 static u8 *
-format_ip6_sv_reass_key (u8 * s, va_list * args)
+format_ip6_sv_reass_key (u8 *s, va_list *args)
 {
   ip6_sv_reass_key_t *key = va_arg (*args, ip6_sv_reass_key_t *);
   s =
@@ -1191,35 +1285,34 @@ format_ip6_sv_reass_key (u8 * s, va_list * args)
 }
 
 static u8 *
-format_ip6_sv_reass (u8 * s, va_list * args)
+format_ip6_sv_reass (u8 *s, va_list *args)
 {
   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
   ip6_sv_reass_t *reass = va_arg (*args, ip6_sv_reass_t *);
 
-  s = format (s, "ID: %lu, key: %U, trace_op_counter: %u\n",
-             reass->id, format_ip6_sv_reass_key, &reass->key,
-             reass->trace_op_counter);
+  s = format (s, "ID: %lu, key: %U, trace_op_counter: %u\n", reass->id,
+             format_ip6_sv_reass_key, &reass->key, reass->trace_op_counter);
   vlib_buffer_t *b;
   u32 *bip;
   u32 counter = 0;
   vec_foreach (bip, reass->cached_buffers)
-  {
-    u32 bi = *bip;
-    do
-      {
-       b = vlib_get_buffer (vm, bi);
-       s = format (s, "  #%03u: bi: %u\n", counter, bi);
-       ++counter;
-       bi = b->next_buffer;
-      }
-    while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
-  }
+    {
+      u32 bi = *bip;
+      do
+       {
+         b = vlib_get_buffer (vm, bi);
+         s = format (s, "  #%03u: bi: %u\n", counter, bi);
+         ++counter;
+         bi = b->next_buffer;
+       }
+      while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
+    }
   return s;
 }
 
 static clib_error_t *
-show_ip6_sv_reass (vlib_main_t * vm, unformat_input_t * input,
-                  CLIB_UNUSED (vlib_cli_command_t * lmd))
+show_ip6_sv_reass (vlib_main_t *vm, unformat_input_t *input,
+                  CLIB_UNUSED (vlib_cli_command_t *lmd))
 {
   ip6_sv_reass_main_t *rm = &ip6_sv_reass_main;
 
@@ -1243,9 +1336,10 @@ show_ip6_sv_reass (vlib_main_t * vm, unformat_input_t * input,
       clib_spinlock_lock (&rt->lock);
       if (details)
        {
-          pool_foreach (reass, rt->pool) {
-            vlib_cli_output (vm, "%U", format_ip6_sv_reass, vm, reass);
-          }
+         pool_foreach (reass, rt->pool)
+           {
+             vlib_cli_output (vm, "%U", format_ip6_sv_reass, vm, reass);
+           }
        }
       sum_reass_n += rt->reass_n;
       clib_spinlock_unlock (&rt->lock);
@@ -1254,72 +1348,69 @@ show_ip6_sv_reass (vlib_main_t * vm, unformat_input_t * input,
   vlib_cli_output (vm, "Current IP6 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
   vlib_cli_output (vm,
-                  "Maximum configured concurrent shallow virtual IP6 reassemblies per worker-thread: %lu\n",
+                  "Maximum configured concurrent shallow virtual IP6 "
+                  "reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
   vlib_cli_output (vm,
                   "Maximum configured amount of fragments per shallow "
                   "virtual IP6 reassembly: %lu\n",
                   (long unsigned) rm->max_reass_len);
+  vlib_cli_output (
+    vm, "Maximum configured shallow virtual IP6 reassembly timeout: %lums\n",
+    (long unsigned) rm->timeout_ms);
   vlib_cli_output (vm,
-                  "Maximum configured shallow virtual IP6 reassembly timeout: %lums\n",
-                  (long unsigned) rm->timeout_ms);
-  vlib_cli_output (vm,
-                  "Maximum configured shallow virtual IP6 reassembly expire walk interval: %lums\n",
+                  "Maximum configured shallow virtual IP6 reassembly expire "
+                  "walk interval: %lums\n",
                   (long unsigned) rm->expire_walk_interval_ms);
-  vlib_cli_output (vm, "Buffers in use: %lu\n",
-                  (long unsigned) sum_buffers_n);
+  vlib_cli_output (vm, "Buffers in use: %lu\n", (long unsigned) sum_buffers_n);
   return 0;
 }
 
 VLIB_CLI_COMMAND (show_ip6_sv_reassembly_cmd, static) = {
-    .path = "show ip6-sv-reassembly",
-    .short_help = "show ip6-sv-reassembly [details]",
-    .function = show_ip6_sv_reass,
+  .path = "show ip6-sv-reassembly",
+  .short_help = "show ip6-sv-reassembly [details]",
+  .function = show_ip6_sv_reass,
 };
 
 #ifndef CLIB_MARCH_VARIANT
 vnet_api_error_t
 ip6_sv_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
 {
-  return ip6_sv_reass_enable_disable_with_refcnt (sw_if_index,
-                                                 enable_disable);
+  return ip6_sv_reass_enable_disable_with_refcnt (sw_if_index, enable_disable);
 }
 #endif /* CLIB_MARCH_VARIANT */
 
-#define foreach_ip6_sv_reassembly_handoff_error                       \
-_(CONGESTION_DROP, "congestion drop")
-
+#define foreach_ip6_sv_reassembly_handoff_error                               \
+  _ (CONGESTION_DROP, "congestion drop")
 
 typedef enum
 {
-#define _(sym,str) IP6_SV_REASSEMBLY_HANDOFF_ERROR_##sym,
+#define _(sym, str) IP6_SV_REASSEMBLY_HANDOFF_ERROR_##sym,
   foreach_ip6_sv_reassembly_handoff_error
 #undef _
     IP6_SV_REASSEMBLY_HANDOFF_N_ERROR,
 } ip6_sv_reassembly_handoff_error_t;
 
 static char *ip6_sv_reassembly_handoff_error_strings[] = {
-#define _(sym,string) string,
+#define _(sym, string) string,
   foreach_ip6_sv_reassembly_handoff_error
 #undef _
 };
 
 typedef struct
 {
-  u32 next_worker_index;
+  u32 thread_index;
 } ip6_sv_reassembly_handoff_trace_t;
 
 static u8 *
-format_ip6_sv_reassembly_handoff_trace (u8 * s, va_list * args)
+format_ip6_sv_reassembly_handoff_trace (u8 *s, va_list *args)
 {
   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
   ip6_sv_reassembly_handoff_trace_t *t =
     va_arg (*args, ip6_sv_reassembly_handoff_trace_t *);
 
-  s =
-    format (s, "ip6-sv-reassembly-handoff: next-worker %d",
-           t->next_worker_index);
+  s = format (s, "to thread-index: %u", t->thread_index);
 
   return s;
 }
@@ -1360,13 +1451,12 @@ ip6_sv_reassembly_handoff_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
     {
       ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;
 
-      if (PREDICT_FALSE
-         ((node->flags & VLIB_NODE_FLAG_TRACE)
-          && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
+      if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) &&
+                        (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip6_sv_reassembly_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
-         t->next_worker_index = ti[0];
+         t->thread_index = ti[0];
        }
 
       n_left_from -= 1;
@@ -1381,15 +1471,14 @@ ip6_sv_reassembly_handoff_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
       vm, node, fq_index, from, thread_indices, frame->n_vectors, 1);
 
   if (n_enq < frame->n_vectors)
-    vlib_node_increment_counter (vm, node->node_index,
-                                IP6_SV_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
-                                frame->n_vectors - n_enq);
+    vlib_node_increment_counter (
+      vm, node->node_index, IP6_SV_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
+      frame->n_vectors - n_enq);
   return frame->n_vectors;
 }
 
-VLIB_NODE_FN (ip6_sv_reassembly_handoff_node) (vlib_main_t * vm,
-                                              vlib_node_runtime_t * node,
-                                              vlib_frame_t * frame)
+VLIB_NODE_FN (ip6_sv_reassembly_handoff_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 {
   return ip6_sv_reassembly_handoff_inline (
     vm, node, frame,
@@ -1412,8 +1501,8 @@ VLIB_REGISTER_NODE (ip6_sv_reassembly_handoff_node) = {
   },
 };
 
-VLIB_NODE_FN (ip6_sv_reassembly_feature_handoff_node) (vlib_main_t * vm,
-                               vlib_node_runtime_t * node, vlib_frame_t * frame)
+VLIB_NODE_FN (ip6_sv_reassembly_feature_handoff_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 {
   return ip6_sv_reassembly_handoff_inline (
     vm, node, frame,
@@ -1495,10 +1584,10 @@ ip6_sv_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
     {
       if (!rm->feature_use_refcount_per_intf[sw_if_index])
        {
-         ++rm->feature_use_refcount_per_intf[sw_if_index];
-         return vnet_feature_enable_disable ("ip6-unicast",
-                                             "ip6-sv-reassembly-feature",
-                                             sw_if_index, 1, 0, 0);
+         int rv = vnet_feature_enable_disable (
+           "ip6-unicast", "ip6-sv-reassembly-feature", sw_if_index, 1, 0, 0);
+         if (0 != rv)
+           return rv;
        }
       ++rm->feature_use_refcount_per_intf[sw_if_index];
     }
@@ -1506,9 +1595,8 @@ ip6_sv_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
     {
       --rm->feature_use_refcount_per_intf[sw_if_index];
       if (!rm->feature_use_refcount_per_intf[sw_if_index])
-       return vnet_feature_enable_disable ("ip6-unicast",
-                                           "ip6-sv-reassembly-feature",
-                                           sw_if_index, 0, 0, 0);
+       return vnet_feature_enable_disable (
+         "ip6-unicast", "ip6-sv-reassembly-feature", sw_if_index, 0, 0, 0);
     }
   return 0;
 }
@@ -1522,10 +1610,11 @@ ip6_sv_reass_output_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
     {
       if (!rm->output_feature_use_refcount_per_intf[sw_if_index])
        {
-         ++rm->output_feature_use_refcount_per_intf[sw_if_index];
-         return vnet_feature_enable_disable (
+         int rv = vnet_feature_enable_disable (
            "ip6-output", "ip6-sv-reassembly-output-feature", sw_if_index, 1,
            0, 0);
+         if (0 != rv)
+           return rv;
        }
       ++rm->output_feature_use_refcount_per_intf[sw_if_index];
     }
@@ -1547,6 +1636,57 @@ ip6_sv_reass_custom_context_register_next_node (uword node_index)
     vlib_get_main (), ip6_sv_reassembly_custom_context_handoff_node.index,
     node_index);
 }
+
+void
+ip6_sv_reass_enable_disable_extended (bool is_enable)
+{
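+  /* global refcount, not per-interface: every enable must be balanced by
+   * exactly one disable - there is no underflow check here */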
+  if (is_enable)
+    ++ip6_sv_reass_main.extended_refcount;
+  else
+    --ip6_sv_reass_main.extended_refcount;
+}
+
+int
+ip6_sv_reass_extended_lock (vlib_buffer_t *b,
+                           struct ip6_sv_lock_unlock_args *a)
+{
+  ip6_sv_reass_per_thread_t *per_thread =
+    &ip6_sv_reass_main
+       .per_thread_data[vnet_buffer2 (b)->ip.reass.thread_index];
+
+  if (!vec_is_member (ip6_sv_reass_main.per_thread_data, per_thread))
+    return -1;
+
+  clib_spinlock_lock (&per_thread->lock);
+  if (pool_is_free_index (per_thread->pool,
+                         vnet_buffer2 (b)->ip.reass.pool_index))
+    goto fail;
+
+  ip6_sv_reass_t *reass = pool_elt_at_index (
+    per_thread->pool, vnet_buffer2 (b)->ip.reass.pool_index);
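+  /* the id is a generation check: it catches a pool slot that was freed
+   * and reused for a different reassembly after this buffer was stamped */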
+  if (vnet_buffer2 (b)->ip.reass.id == reass->id)
+    {
+      *a->total_ip_payload_length = reass->total_ip_payload_length;
+
+      *a->first_fragment_buffer_index = reass->first_fragment_clone_bi;
+      *a->first_fragment_total_ip_header_length =
+       reass->first_fragment_total_ip_header_length;
+      return 0;
+    }
+
+fail:
+  clib_spinlock_unlock (&per_thread->lock);
+  return -1;
+}
+
+void
+ip6_sv_reass_extended_unlock (vlib_buffer_t *b)
+{
+  ip6_sv_reass_per_thread_t *per_thread =
+    &ip6_sv_reass_main
+       .per_thread_data[vnet_buffer2 (b)->ip.reass.thread_index];
+  clib_spinlock_unlock (&per_thread->lock);
+}
 #endif
 
 /*
index b236e53..9220581 100644 (file)
@@ -23,6 +23,7 @@
 #ifndef __included_ip6_sv_reass_h__
 #define __included_ip6_sv_reass_h__
 
+#include <stdbool.h>
 #include <vnet/api_errno.h>
 #include <vnet/vnet.h>
 
@@ -46,6 +47,33 @@ vnet_api_error_t
 ip6_sv_reass_output_enable_disable_with_refcnt (u32 sw_if_index,
                                                int is_enable);
 
+/*
+ * Enable or disable extended reassembly.
+ *
+ * Extended reassembly means that fragments are cached until both the first
+ * and the last fragment have been seen. Furthermore, the first fragment
+ * buffer is cloned and stored in the reassembly context for later retrieval.
+ */
+void ip6_sv_reass_enable_disable_extended (bool is_enable);
+
+struct ip6_sv_lock_unlock_args
+{
+  u32 *total_ip_payload_length;
+  u32 *first_fragment_buffer_index;
+  u32 *first_fragment_total_ip_header_length;
+};
+
+/*
+ * Take the per-thread lock and fetch information from the reassembly
+ * context. Uses the vnet_buffer2 data filled in by extended reassembly.
+ *
+ * Returns 0 on success, -1 otherwise.
+ */
+int ip6_sv_reass_extended_lock (vlib_buffer_t *b,
+                               struct ip6_sv_lock_unlock_args *a);
+
+void ip6_sv_reass_extended_unlock (vlib_buffer_t *b);
+
 int ip6_sv_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable);
 uword ip6_sv_reass_custom_context_register_next_node (uword node_index);
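
For illustration only (not part of the patch): a sketch of how a feature
might pair the per-interface refcount with the global extended-mode
refcount. my_feature_enable_disable is hypothetical; the ip6_sv_reass_*
calls are the API declared above.

#include <vnet/ip/reass/ip6_sv_reass.h>

static int
my_feature_enable_disable (u32 sw_if_index, int is_enable)
{
  int rv = ip6_sv_reass_enable_disable_with_refcnt (sw_if_index, is_enable);
  if (0 != rv)
    return rv;
  /* refcounted: each successful enable must be balanced by one disable */
  ip6_sv_reass_enable_disable_extended (is_enable != 0);
  return 0;
}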