pg: add GSO support
[vpp.git] / src / vnet / pg / input.c
index 597ae06..151624f 100644 (file)
  *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
+  /*
+   * To be honest, the packet generator needs an extreme
+   * makeover. Two key assumptions which drove the current implementation
+   * are no longer true. First, that buffer managers implement a
+   * post-TX recycle list. Second, that packet generator performance
+   * is first-order important.
+   */
+
 #include <vlib/vlib.h>
 #include <vnet/pg/pg.h>
 #include <vnet/vnet.h>
+#include <vnet/ethernet/ethernet.h>
 #include <vnet/feature/feature.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/ip6_packet.h>
+#include <vnet/udp/udp_packet.h>
 #include <vnet/devices/devices.h>
 
 static int
@@ -65,7 +77,7 @@ validate_buffer_data2 (vlib_buffer_t * b, pg_stream_t * s,
   if (i >= n_bytes)
     return 1;
 
-  clib_warning ("buffer %U", format_vlib_buffer, b);
+  clib_warning ("buffer %U", format_vnet_buffer, b);
   clib_warning ("differ at index %d", i);
   clib_warning ("is     %U", format_hex_bytes, bd, n_bytes);
   clib_warning ("mask   %U", format_hex_bytes, pm, n_bytes);
@@ -1053,48 +1065,6 @@ pg_set_next_buffer_pointers (pg_main_t * pg,
     }
 }
 
-static_always_inline void
-init_replay_buffers_inline (vlib_main_t * vm,
-                           pg_stream_t * s,
-                           u32 * buffers,
-                           u32 n_buffers, u32 data_offset, u32 n_data)
-{
-  u32 n_left, *b, i, l;
-
-  n_left = n_buffers;
-  b = buffers;
-  i = s->current_replay_packet_index;
-  l = vec_len (s->replay_packet_templates);
-
-  while (n_left >= 1)
-    {
-      u32 bi0, n0;
-      vlib_buffer_t *b0;
-      u8 *d0;
-
-      bi0 = b[0];
-      b += 1;
-      n_left -= 1;
-
-      b0 = vlib_get_buffer (vm, bi0);
-
-      vnet_buffer (b0)->sw_if_index[VLIB_RX] = s->sw_if_index[VLIB_RX];
-      /* was s->sw_if_index[VLIB_TX]; */
-      vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
-
-      d0 = vec_elt (s->replay_packet_templates, i);
-
-      n0 = n_data;
-      if (data_offset + n_data >= vec_len (d0))
-       n0 = vec_len (d0) > data_offset ? vec_len (d0) - data_offset : 0;
-
-      b0->current_length = n0;
-
-      clib_memcpy (b0->data, d0 + data_offset, n0);
-      i = i + 1 == l ? 0 : i + 1;
-    }
-}
-
 static_always_inline void
 init_buffers_inline (vlib_main_t * vm,
                     pg_stream_t * s,
@@ -1104,9 +1074,7 @@ init_buffers_inline (vlib_main_t * vm,
   u32 n_left, *b;
   u8 *data, *mask;
 
-  if (vec_len (s->replay_packet_templates) > 0)
-    return init_replay_buffers_inline (vm, s, buffers, n_buffers, data_offset,
-                                      n_data);
+  ASSERT (s->replay_packet_templates == 0);
 
   data = s->fixed_packet_data + data_offset;
   mask = s->fixed_packet_data_mask + data_offset;
@@ -1143,12 +1111,12 @@ init_buffers_inline (vlib_main_t * vm,
        vnet_buffer (b1)->sw_if_index[VLIB_RX] = s->sw_if_index[VLIB_RX];
 
       vnet_buffer (b0)->sw_if_index[VLIB_TX] =
-       vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+       vnet_buffer (b1)->sw_if_index[VLIB_TX] = s->sw_if_index[VLIB_TX];
 
       if (set_data)
        {
-         clib_memcpy (b0->data, data, n_data);
-         clib_memcpy (b1->data, data, n_data);
+         clib_memcpy_fast (b0->data, data, n_data);
+         clib_memcpy_fast (b1->data, data, n_data);
        }
       else
        {
@@ -1168,35 +1136,15 @@ init_buffers_inline (vlib_main_t * vm,
 
       b0 = vlib_get_buffer (vm, bi0);
       vnet_buffer (b0)->sw_if_index[VLIB_RX] = s->sw_if_index[VLIB_RX];
-      /* s->sw_if_index[VLIB_TX]; */
-      vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+      vnet_buffer (b0)->sw_if_index[VLIB_TX] = s->sw_if_index[VLIB_TX];
 
       if (set_data)
-       clib_memcpy (b0->data, data, n_data);
+       clib_memcpy_fast (b0->data, data, n_data);
       else
        ASSERT (validate_buffer_data2 (b0, s, data_offset, n_data));
     }
 }
 
-static void
-pg_buffer_init (vlib_main_t * vm,
-               vlib_buffer_free_list_t * fl, u32 * buffers, u32 n_buffers)
-{
-  pg_main_t *pg = &pg_main;
-  pg_stream_t *s;
-  uword bi, si;
-
-  si = fl->buffer_init_function_opaque & pow2_mask (24);
-  bi = fl->buffer_init_function_opaque >> 24;
-
-  s = pool_elt_at_index (pg->streams, si);
-
-  init_buffers_inline (vm, s, buffers, n_buffers,
-                      /* data_offset */ bi * s->buffer_bytes,
-                      /* n_data */ s->buffer_bytes,
-                      /* set_data */ 1);
-}
-
 static u32
 pg_stream_fill_helper (pg_main_t * pg,
                       pg_stream_t * s,
@@ -1204,30 +1152,12 @@ pg_stream_fill_helper (pg_main_t * pg,
                       u32 * buffers, u32 * next_buffers, u32 n_alloc)
 {
   vlib_main_t *vm = vlib_get_main ();
-  vlib_buffer_free_list_t *f;
   uword is_start_of_packet = bi == s->buffer_indices;
   u32 n_allocated;
 
-  f = vlib_buffer_get_free_list (vm, bi->free_list_index);
-
-  /*
-   * Historically, the pg maintained its own free lists and
-   * device drivers tx paths would return pkts.
-   */
-  if (vm->buffer_main->extern_buffer_mgmt == 0 &&
-      !(s->flags & PG_STREAM_FLAGS_DISABLE_BUFFER_RECYCLE))
-    f->buffer_init_function = pg_buffer_init;
-  f->buffer_init_function_opaque =
-    (s - pg->streams) | ((bi - s->buffer_indices) << 24);
-
-  if (is_start_of_packet)
-    vnet_buffer (&f->buffer_init_template)->sw_if_index[VLIB_RX]
-      = vnet_main.local_interface_sw_if_index;
+  ASSERT (vec_len (s->replay_packet_templates) == 0);
 
-  n_allocated = vlib_buffer_alloc_from_free_list (vm,
-                                                 buffers,
-                                                 n_alloc,
-                                                 bi->free_list_index);
+  n_allocated = vlib_buffer_alloc (vm, buffers, n_alloc);
   if (n_allocated == 0)
     return 0;
 
@@ -1238,53 +1168,146 @@ pg_stream_fill_helper (pg_main_t * pg,
   n_alloc = n_allocated;
 
   /* Reinitialize buffers */
-  if (vm->buffer_main->extern_buffer_mgmt == 0 || CLIB_DEBUG > 0
-      || (s->flags & PG_STREAM_FLAGS_DISABLE_BUFFER_RECYCLE))
-    init_buffers_inline
-      (vm, s,
-       buffers,
-       n_alloc, (bi - s->buffer_indices) * s->buffer_bytes /* data offset */ ,
-       s->buffer_bytes,
-       /* set_data */
-       vm->buffer_main->extern_buffer_mgmt != 0
-       || (s->flags & PG_STREAM_FLAGS_DISABLE_BUFFER_RECYCLE) != 0);
+  init_buffers_inline
+    (vm, s,
+     buffers,
+     n_alloc, (bi - s->buffer_indices) * s->buffer_bytes /* data offset */ ,
+     s->buffer_bytes,
+     /* set_data */ 1);
 
   if (next_buffers)
     pg_set_next_buffer_pointers (pg, s, buffers, next_buffers, n_alloc);
 
   if (is_start_of_packet)
     {
-      if (vec_len (s->replay_packet_templates) > 0)
+      pg_generate_set_lengths (pg, s, buffers, n_alloc);
+      if (vec_len (s->buffer_indices) > 1)
+       pg_generate_fix_multi_buffer_lengths (pg, s, buffers, n_alloc);
+
+      pg_generate_edit (pg, s, buffers, n_alloc);
+    }
+
+  return n_alloc;
+}
+
+static u32
+pg_stream_fill_replay (pg_main_t * pg, pg_stream_t * s, u32 n_alloc)
+{
+  pg_buffer_index_t *bi;
+  u32 n_left, i, l;
+  u32 buffer_alloc_request = 0;
+  u32 buffer_alloc_result;
+  u32 current_buffer_index;
+  u32 *buffers;
+  vlib_main_t *vm = vlib_get_main ();
+  vnet_main_t *vnm = vnet_get_main ();
+  u32 buf_sz = vlib_buffer_get_default_data_size (vm);
+  vnet_interface_main_t *im = &vnm->interface_main;
+  vnet_sw_interface_t *si;
+  /* Replay path: build n_alloc packets from the replay templates and queue their head buffers; returns n_alloc, or 0 on allocation failure. */
+  buffers = pg->replay_buffers_by_thread[vm->thread_index];  /* per-thread scratch vector, stored back before returning */
+  vec_reset_length (buffers);
+  bi = s->buffer_indices;  /* only the first buffer_indices entry is used in this function */
+
+  n_left = n_alloc;
+  i = s->current_replay_packet_index;  /* template cursor; the stream field itself is advanced only at the end */
+  l = vec_len (s->replay_packet_templates);
+
+  /* Figure out how many buffers we need: ceil (template length / buf_sz) per packet */
+  while (n_left > 0)
+    {
+      u8 *d0;
+
+      d0 = vec_elt (s->replay_packet_templates, i);
+      buffer_alloc_request += (vec_len (d0) + (buf_sz - 1)) / buf_sz;
+
+      i = ((i + 1) == l) ? 0 : i + 1;  /* wrap around the template list */
+      n_left--;
+    }
+
+  ASSERT (buffer_alloc_request > 0);  /* NOTE(review): fires if n_alloc is 0 or every visited template is empty */
+  vec_validate (buffers, buffer_alloc_request - 1);
+
+  /* Allocate that many buffers */
+  buffer_alloc_result = vlib_buffer_alloc (vm, buffers, buffer_alloc_request);
+  if (buffer_alloc_result < buffer_alloc_request)
+    {
+      clib_warning ("alloc failure, got %d not %d", buffer_alloc_result,
+                   buffer_alloc_request);
+      vlib_buffer_free_no_next (vm, buffers, buffer_alloc_result);  /* all-or-nothing: hand back the partial allocation */
+      pg->replay_buffers_by_thread[vm->thread_index] = buffers;  /* keep the vector, which vec_validate may have grown */
+      return 0;
+    }
+
+  /* Now go generate the buffers, and add them to the FIFO */
+  n_left = n_alloc;
+
+  current_buffer_index = 0;  /* next unused slot in buffers[] */
+  i = s->current_replay_packet_index;  /* second pass starts from the same template as the sizing pass */
+  l = vec_len (s->replay_packet_templates);
+  while (n_left > 0)
+    {
+      u8 *d0;
+      int not_last;
+      u32 data_offset;
+      u32 bytes_to_copy, bytes_this_chunk;
+      vlib_buffer_t *b;
+
+      d0 = vec_elt (s->replay_packet_templates, i);
+      data_offset = 0;
+      bytes_to_copy = vec_len (d0);
+
+      /* Add head chunk to pg fifo */
+      clib_fifo_add1 (bi->buffer_fifo, buffers[current_buffer_index]);
+
+      /* Copy the data, one buffer-sized chunk per allocated buffer */
+      while (bytes_to_copy)  /* NOTE(review): skipped for an empty template, so the head index queued above is reused by the next packet */
        {
-         vnet_main_t *vnm = vnet_get_main ();
-         vnet_interface_main_t *im = &vnm->interface_main;
-         vnet_sw_interface_t *si =
-           vnet_get_sw_interface (vnm, s->sw_if_index[VLIB_RX]);
-         u32 l = 0;
-         u32 i;
-         for (i = 0; i < n_alloc; i++)
-           l += vlib_buffer_index_length_in_chain (vm, buffers[i]);
-         vlib_increment_combined_counter (im->combined_sw_if_counters
-                                          + VNET_INTERFACE_COUNTER_RX,
-                                          vlib_get_thread_index (),
-                                          si->sw_if_index, n_alloc, l);
-         s->current_replay_packet_index += n_alloc;
-         s->current_replay_packet_index %=
-           vec_len (s->replay_packet_templates);
+         bytes_this_chunk = clib_min (bytes_to_copy, buf_sz);
+         ASSERT (current_buffer_index < vec_len (buffers));
+         b = vlib_get_buffer (vm, buffers[current_buffer_index]);
+         clib_memcpy_fast (b->data, d0 + data_offset, bytes_this_chunk);
+         vnet_buffer (b)->sw_if_index[VLIB_RX] = s->sw_if_index[VLIB_RX];
+         vnet_buffer (b)->sw_if_index[VLIB_TX] = s->sw_if_index[VLIB_TX];
+         b->flags = 0;  /* reset flags and chain link before deciding whether to chain */
+         b->next_buffer = 0;
+         b->current_data = 0;
+         b->current_length = bytes_this_chunk;
+
+         not_last = bytes_this_chunk < bytes_to_copy;  /* template bytes remain after this chunk */
+         if (not_last)
+           {
+             ASSERT (current_buffer_index < (vec_len (buffers) - 1));
+             b->flags |= VLIB_BUFFER_NEXT_PRESENT;
+             b->next_buffer = buffers[current_buffer_index + 1];  /* chain to the next allocated buffer */
+           }
+         bytes_to_copy -= bytes_this_chunk;
+         data_offset += bytes_this_chunk;
+         current_buffer_index++;
        }
-      else
-       {
-         pg_generate_set_lengths (pg, s, buffers, n_alloc);
-         if (vec_len (s->buffer_indices) > 1)
-           pg_generate_fix_multi_buffer_lengths (pg, s, buffers, n_alloc);
 
-         pg_generate_edit (pg, s, buffers, n_alloc);
-       }
+      i = ((i + 1) == l) ? 0 : i + 1;  /* wrap around the template list */
+      n_left--;
     }
 
+  /* Update the interface counters */
+  si = vnet_get_sw_interface (vnm, s->sw_if_index[VLIB_RX]);
+  l = 0;  /* l is reused from here on as the byte total */
+  for (i = 0; i < n_alloc; i++)  /* NOTE(review): buffers[] also holds chain tail segments, so */
+    l += vlib_buffer_index_length_in_chain (vm, buffers[i]);  /* buffers[i] is a packet head only if every packet fits one buffer - confirm */
+  vlib_increment_combined_counter (im->combined_sw_if_counters
+                                  + VNET_INTERFACE_COUNTER_RX,
+                                  vlib_get_thread_index (),
+                                  si->sw_if_index, n_alloc, l);
+
+  s->current_replay_packet_index += n_alloc;  /* advance the stream cursor, modulo the template count */
+  s->current_replay_packet_index %= vec_len (s->replay_packet_templates);
+
+  pg->replay_buffers_by_thread[vm->thread_index] = buffers;  /* store back the scratch vector pointer */
   return n_alloc;
 }
 
+
 static u32
 pg_stream_fill (pg_main_t * pg, pg_stream_t * s, u32 n_buffers)
 {
@@ -1311,6 +1334,12 @@ pg_stream_fill (pg_main_t * pg, pg_stream_t * s, u32 n_buffers)
        n_alloc = 0;
     }
 
+  /*
+   * Handle pcap replay directly
+   */
+  if (s->replay_packet_templates)
+    return pg_stream_fill_replay (pg, s, n_alloc);
+
   /* All buffer fifos should have the same size. */
   if (CLIB_DEBUG > 0)
     {
@@ -1388,7 +1417,7 @@ format_pg_input_trace (u8 * s, va_list * va)
   pg_main_t *pg = &pg_main;
   pg_stream_t *stream;
   vlib_node_t *n;
-  uword indent = format_get_indent (s);
+  u32 indent = format_get_indent (s);
 
   stream = 0;
   if (!pool_is_free_index (pg->streams, t->stream_index))
@@ -1400,10 +1429,10 @@ format_pg_input_trace (u8 * s, va_list * va)
     s = format (s, "stream %d", t->stream_index);
 
   s = format (s, ", %d bytes", t->packet_length);
-  s = format (s, ", %d sw_if_index", t->sw_if_index);
+  s = format (s, ", sw_if_index %d", t->sw_if_index);
 
   s = format (s, "\n%U%U",
-             format_white_space, indent, format_vlib_buffer, &t->buffer);
+             format_white_space, indent, format_vnet_buffer, &t->buffer);
 
   s = format (s, "\n%U", format_white_space, indent);
 
@@ -1423,16 +1452,14 @@ format_pg_input_trace (u8 * s, va_list * va)
 
 static void
 pg_input_trace (pg_main_t * pg,
-               vlib_node_runtime_t * node,
-               pg_stream_t * s, u32 * buffers, u32 n_buffers)
+               vlib_node_runtime_t * node, u32 stream_index, u32 next_index,
+               u32 * buffers, u32 n_buffers)
 {
   vlib_main_t *vm = vlib_get_main ();
-  u32 *b, n_left, stream_index, next_index;
+  u32 *b, n_left;
 
   n_left = n_buffers;
   b = buffers;
-  stream_index = s - pg->streams;
-  next_index = s->next_index;
 
   while (n_left >= 2)
     {
@@ -1463,13 +1490,15 @@ pg_input_trace (pg_main_t * pg,
       t0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX];
       t1->sw_if_index = vnet_buffer (b1)->sw_if_index[VLIB_RX];
 
-      clib_memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
-      clib_memcpy (&t1->buffer, b1, sizeof (b1[0]) - sizeof (b1->pre_data));
+      clib_memcpy_fast (&t0->buffer, b0,
+                       sizeof (b0[0]) - sizeof (b0->pre_data));
+      clib_memcpy_fast (&t1->buffer, b1,
+                       sizeof (b1[0]) - sizeof (b1->pre_data));
 
-      clib_memcpy (t0->buffer.pre_data, b0->data,
-                  sizeof (t0->buffer.pre_data));
-      clib_memcpy (t1->buffer.pre_data, b1->data,
-                  sizeof (t1->buffer.pre_data));
+      clib_memcpy_fast (t0->buffer.pre_data, b0->data,
+                       sizeof (t0->buffer.pre_data));
+      clib_memcpy_fast (t1->buffer.pre_data, b1->data,
+                       sizeof (t1->buffer.pre_data));
     }
 
   while (n_left >= 1)
@@ -1490,9 +1519,74 @@ pg_input_trace (pg_main_t * pg,
       t0->stream_index = stream_index;
       t0->packet_length = vlib_buffer_length_in_chain (vm, b0);
       t0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX];
-      clib_memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
-      clib_memcpy (t0->buffer.pre_data, b0->data,
-                  sizeof (t0->buffer.pre_data));
+      clib_memcpy_fast (&t0->buffer, b0,
+                       sizeof (b0[0]) - sizeof (b0->pre_data));
+      clib_memcpy_fast (t0->buffer.pre_data, b0->data,
+                       sizeof (t0->buffer.pre_data));
+    }
+}
+
+static_always_inline void
+fill_gso_buffer_flags (vlib_main_t * vm, u32 * buffers, u32 n_buffers,
+                      u32 packet_data_size)
+{
+  /* Per buffer: set header offsets, checksum-offload flags and TCP GSO metadata. */
+  for (int i = 0; i < n_buffers; i++)  /* NOTE(review): int i is compared with u32 n_buffers */
+    {
+      vlib_buffer_t *b0 = vlib_get_buffer (vm, buffers[i]);
+      u8 l4_proto = 0;  /* stays 0 unless the IPv4 or IPv6 branch below runs */
+      u8 l4_hdr_sz = 0;
+
+      ethernet_header_t *eh = (ethernet_header_t *) b0->data;  /* frame is read from b0->data, not from current_data */
+      u16 ethertype = clib_net_to_host_u16 (eh->type);
+      u16 l2hdr_sz = sizeof (ethernet_header_t);  /* fixed size: VLAN tags are not accounted for */
+
+      vnet_buffer (b0)->l2_hdr_offset = 0;
+      vnet_buffer (b0)->l3_hdr_offset = l2hdr_sz;  /* written even when the ethertype matches neither branch */
+      if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP4))
+       {
+         ip4_header_t *ip4 = (ip4_header_t *) (b0->data + l2hdr_sz);
+         vnet_buffer (b0)->l4_hdr_offset = l2hdr_sz + ip4_header_bytes (ip4);
+         l4_proto = ip4->protocol;
+         b0->flags |=
+           (VNET_BUFFER_F_IS_IP4 | VNET_BUFFER_F_L2_HDR_OFFSET_VALID
+            | VNET_BUFFER_F_L3_HDR_OFFSET_VALID |
+            VNET_BUFFER_F_L4_HDR_OFFSET_VALID);
+         b0->flags |= VNET_BUFFER_F_OFFLOAD_IP_CKSUM;
+       }
+      else if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP6))
+       {
+         ip6_header_t *ip6 = (ip6_header_t *) (b0->data + l2hdr_sz);
+         /* FIXME IPv6 EH traversal: L4 is taken to start right after the fixed IPv6 header */
+         vnet_buffer (b0)->l4_hdr_offset = l2hdr_sz + sizeof (ip6_header_t);
+         l4_proto = ip6->protocol;
+         b0->flags |=
+           (VNET_BUFFER_F_IS_IP6 | VNET_BUFFER_F_L2_HDR_OFFSET_VALID
+            | VNET_BUFFER_F_L3_HDR_OFFSET_VALID |
+            VNET_BUFFER_F_L4_HDR_OFFSET_VALID);
+         b0->flags |= VNET_BUFFER_F_OFFLOAD_IP_CKSUM;  /* NOTE(review): IPv6 has no header checksum - confirm this flag is intended */
+       }
+      if (l4_proto == IP_PROTOCOL_TCP)
+       {
+         b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
+         tcp_header_t *tcp = (tcp_header_t *) (b0->data +
+                                               vnet_buffer
+                                               (b0)->l4_hdr_offset);
+         l4_hdr_sz = tcp_header_bytes (tcp);
+         tcp->checksum = 0;  /* cleared here; presumably recomputed by the offload path - confirm */
+         vnet_buffer2 (b0)->gso_l4_hdr_sz = l4_hdr_sz;
+         vnet_buffer2 (b0)->gso_size = packet_data_size;  /* the caller-supplied size is stored as the GSO size */
+         b0->flags |= VNET_BUFFER_F_GSO;  /* only the TCP path marks the buffer as GSO */
+       }
+      else if (l4_proto == IP_PROTOCOL_UDP)
+       {
+         b0->flags |= VNET_BUFFER_F_OFFLOAD_UDP_CKSUM;
+         udp_header_t *udp = (udp_header_t *) (b0->data +
+                                               vnet_buffer
+                                               (b0)->l4_hdr_offset);
+         vnet_buffer2 (b0)->gso_l4_hdr_sz = sizeof (*udp);  /* header size recorded, but VNET_BUFFER_F_GSO is not set for UDP */
+         udp->checksum = 0;
+       }
     }
 }
 
@@ -1511,6 +1605,7 @@ pg_generate_packets (vlib_node_runtime_t * node,
   u8 feature_arc_index = fm->device_input_feature_arc_index;
   cm = &fm->feature_config_mains[feature_arc_index];
   u32 current_config_index = ~(u32) 0;
+  pg_interface_t *pi = pool_elt_at_index (pg->interfaces, s->pg_if_index);
   int i;
 
   bi0 = s->buffer_indices;
@@ -1532,7 +1627,23 @@ pg_generate_packets (vlib_node_runtime_t * node,
     {
       u32 *head, *start, *end;
 
-      vlib_get_next_frame (vm, node, next_index, to_next, n_left);
+      if (PREDICT_TRUE (next_index == VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT))
+       {
+         vlib_next_frame_t *nf;
+         vlib_frame_t *f;
+         ethernet_input_frame_t *ef;
+         vlib_get_new_next_frame (vm, node, next_index, to_next, n_left);
+         nf = vlib_node_runtime_get_next_frame (vm, node, next_index);
+         f = vlib_get_frame (vm, nf->frame);
+         f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX;
+
+         ef = vlib_frame_scalar_args (f);
+         ef->sw_if_index = pi->sw_if_index;
+         ef->hw_if_index = pi->hw_if_index;
+         vlib_frame_no_append (f);
+       }
+      else
+       vlib_get_next_frame (vm, node, next_index, to_next, n_left);
 
       n_this_frame = n_packets_to_generate;
       if (n_this_frame > n_left)
@@ -1543,39 +1654,58 @@ pg_generate_packets (vlib_node_runtime_t * node,
       head = clib_fifo_head (bi0->buffer_fifo);
 
       if (head + n_this_frame <= end)
-       vlib_copy_buffers (to_next, head, n_this_frame);
+       vlib_buffer_copy_indices (to_next, head, n_this_frame);
       else
        {
          u32 n = end - head;
-         vlib_copy_buffers (to_next + 0, head, n);
-         vlib_copy_buffers (to_next + n, start, n_this_frame - n);
+         vlib_buffer_copy_indices (to_next + 0, head, n);
+         vlib_buffer_copy_indices (to_next + n, start, n_this_frame - n);
        }
 
-      vec_foreach (bi, s->buffer_indices)
-       clib_fifo_advance_head (bi->buffer_fifo, n_this_frame);
+      if (s->replay_packet_templates == 0)
+       {
+         vec_foreach (bi, s->buffer_indices)
+           clib_fifo_advance_head (bi->buffer_fifo, n_this_frame);
+       }
+      else
+       {
+         clib_fifo_advance_head (bi0->buffer_fifo, n_this_frame);
+       }
 
       if (current_config_index != ~(u32) 0)
        for (i = 0; i < n_this_frame; i++)
          {
            vlib_buffer_t *b;
            b = vlib_get_buffer (vm, to_next[i]);
-           vnet_buffer (b)->device_input_feat.saved_next_index =
-             s->next_index;
-           vnet_buffer (b)->device_input_feat.buffer_advance = 0;
            b->current_config_index = current_config_index;
-           b->feature_arc_index = feature_arc_index;
+           vnet_buffer (b)->feature_arc_index = feature_arc_index;
          }
 
+      if (pi->gso_enabled)
+       fill_gso_buffer_flags (vm, to_next, n_this_frame, pi->gso_size);
+
       n_trace = vlib_get_trace_count (vm, node);
       if (n_trace > 0)
        {
          u32 n = clib_min (n_trace, n_this_frame);
-         pg_input_trace (pg, node, s, to_next, n);
+         pg_input_trace (pg, node, s - pg->streams, next_index, to_next, n);
          vlib_set_trace_count (vm, node, n_trace - n);
        }
       n_packets_to_generate -= n_this_frame;
       n_packets_generated += n_this_frame;
       n_left -= n_this_frame;
+      if (CLIB_DEBUG > 0)
+       {
+         int i;
+         vlib_buffer_t *b;
+
+         for (i = 0; i < n_this_frame; i++)
+           {
+             b = vlib_get_buffer (vm, to_next[i]);
+             ASSERT ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0 ||
+                     b->current_length >= VLIB_BUFFER_MIN_CHAIN_SEG_SIZE);
+           }
+       }
       vlib_put_next_frame (vm, node, next_index, n_left);
     }