vlib buffer access methods.
*/
+typedef void (vlib_buffer_enqueue_to_next_fn_t) (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ u32 *buffers, u16 *nexts,
+ uword count);
+typedef void (vlib_buffer_enqueue_to_single_next_fn_t) (
+ vlib_main_t *vm, vlib_node_runtime_t *node, u32 *buffers, u16 next_index,
+ u32 count);
+
+typedef u32 (vlib_buffer_enqueue_to_thread_fn_t) (
+ vlib_main_t *vm, u32 frame_queue_index, u32 *buffer_indices,
+ u16 *thread_indices, u32 n_packets, int drop_on_congestion);
+
+typedef u32 (vlib_frame_queue_dequeue_fn_t) (vlib_main_t *vm,
+ vlib_frame_queue_main_t *fqm);
+
+typedef struct
+{
+ vlib_buffer_enqueue_to_next_fn_t *buffer_enqueue_to_next_fn;
+ vlib_buffer_enqueue_to_single_next_fn_t *buffer_enqueue_to_single_next_fn;
+ vlib_buffer_enqueue_to_thread_fn_t *buffer_enqueue_to_thread_fn;
+ vlib_frame_queue_dequeue_fn_t *frame_queue_dequeue_fn;
+} vlib_buffer_func_main_t;
+
+extern vlib_buffer_func_main_t vlib_buffer_func_main;
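+
+/* These pointers are expected to be filled in at startup (e.g. with
+   CPU-specific multiarch variants); callers dispatch through the global
+   table. A minimal, illustrative call site using the names declared above:
+
+     vlib_buffer_func_main.buffer_enqueue_to_next_fn (vm, node, buffers,
+                                                      nexts, n_left);
+*/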
+
always_inline void
vlib_buffer_validate (vlib_main_t * vm, vlib_buffer_t * b)
{
static_always_inline void
vlib_buffer_copy_indices (u32 * dst, u32 * src, u32 n_indices)
{
-#if defined(CLIB_HAVE_VEC512)
- while (n_indices >= 16)
+ clib_memcpy_u32 (dst, src, n_indices);
+}
+
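+/* Copy n_buffers indices out of a circular 'ring' starting at 'start';
+   when the region wraps past ring_size, the copy is split in two. */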
+always_inline void
+vlib_buffer_copy_indices_from_ring (u32 * dst, u32 * ring, u32 start,
+ u32 ring_size, u32 n_buffers)
+{
+ ASSERT (n_buffers <= ring_size);
+
+ if (PREDICT_TRUE (start + n_buffers <= ring_size))
{
- u32x16_store_unaligned (u32x16_load_unaligned (src), dst);
- dst += 16;
- src += 16;
- n_indices -= 16;
+ vlib_buffer_copy_indices (dst, ring + start, n_buffers);
}
-#endif
-
-#if defined(CLIB_HAVE_VEC256)
- while (n_indices >= 8)
+ else
{
- u32x8_store_unaligned (u32x8_load_unaligned (src), dst);
- dst += 8;
- src += 8;
- n_indices -= 8;
+ u32 n = ring_size - start;
+ vlib_buffer_copy_indices (dst, ring + start, n);
+ vlib_buffer_copy_indices (dst + n, ring, n_buffers - n);
}
-#endif
+}
-#if defined(CLIB_HAVE_VEC128)
- while (n_indices >= 4)
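+
+/* Mirror of the above: copy n_buffers indices from a linear array into a
+   circular 'ring' starting at 'start', splitting the copy on wrap-around. */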
+always_inline void
+vlib_buffer_copy_indices_to_ring (u32 * ring, u32 * src, u32 start,
+ u32 ring_size, u32 n_buffers)
+{
+ ASSERT (n_buffers <= ring_size);
+
+ if (PREDICT_TRUE (start + n_buffers <= ring_size))
{
- u32x4_store_unaligned (u32x4_load_unaligned (src), dst);
- dst += 4;
- src += 4;
- n_indices -= 4;
+ vlib_buffer_copy_indices (ring + start, src, n_buffers);
}
-#endif
-
- while (n_indices)
+ else
{
- dst[0] = src[0];
- dst += 1;
- src += 1;
- n_indices -= 1;
+ u32 n = ring_size - start;
+ vlib_buffer_copy_indices (ring + start, src, n);
+ vlib_buffer_copy_indices (ring, src + n, n_buffers - n);
}
}
i32 offset)
{
uword buffer_mem_start = vm->buffer_main->buffer_mem_start;
-#ifdef CLIB_HAVE_VEC256
- u64x4 off = u64x4_splat (buffer_mem_start + offset);
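+ /* a buffer index is turned into a vlib_buffer_t pointer as
+    buffer_mem_start + offset + (index << CLIB_LOG2_CACHE_LINE_BYTES);
+    the vector loops below perform this conversion several indices at a time */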
+#ifdef CLIB_HAVE_VEC512
+ u64x8 of8 = u64x8_splat (buffer_mem_start + offset);
+ u64x4 off = u64x8_extract_lo (of8);
 /* if count is not const, the compiler will not unroll the while loop,
    so we manually keep several operations in flight per iteration */
+ while (count >= 32)
+ {
+ u64x8 b0 = u64x8_from_u32x8 (u32x8_load_unaligned (bi));
+ u64x8 b1 = u64x8_from_u32x8 (u32x8_load_unaligned (bi + 8));
+ u64x8 b2 = u64x8_from_u32x8 (u32x8_load_unaligned (bi + 16));
+ u64x8 b3 = u64x8_from_u32x8 (u32x8_load_unaligned (bi + 24));
+ /* shift and add to get vlib_buffer_t pointer */
+ u64x8_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b);
+ u64x8_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b + 8);
+ u64x8_store_unaligned ((b2 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b + 16);
+ u64x8_store_unaligned ((b3 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b + 24);
+ b += 32;
+ bi += 32;
+ count -= 32;
+ }
while (count >= 8)
{
- u64x4 b0 = u32x4_extend_to_u64x4 (u32x4_load_unaligned (bi));
- u64x4 b1 = u32x4_extend_to_u64x4 (u32x4_load_unaligned (bi + 4));
+ u64x8 b0 = u64x8_from_u32x8 (u32x8_load_unaligned (bi));
/* shift and add to get vlib_buffer_t pointer */
- u64x4_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b);
- u64x4_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 4);
+ u64x8_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b);
b += 8;
bi += 8;
count -= 8;
}
+#elif defined CLIB_HAVE_VEC256
+ u64x4 off = u64x4_splat (buffer_mem_start + offset);
+ /* if count is not const, the compiler will not unroll the while loop,
+    so we manually keep several operations in flight per iteration */
+ while (count >= 32)
+ {
+ u64x4 b0 = u64x4_from_u32x4 (u32x4_load_unaligned (bi));
+ u64x4 b1 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 4));
+ u64x4 b2 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 8));
+ u64x4 b3 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 12));
+ u64x4 b4 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 16));
+ u64x4 b5 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 20));
+ u64x4 b6 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 24));
+ u64x4 b7 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 28));
+ /* shift and add to get vlib_buffer_t pointer */
+ u64x4_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b);
+ u64x4_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 4);
+ u64x4_store_unaligned ((b2 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 8);
+ u64x4_store_unaligned ((b3 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 12);
+ u64x4_store_unaligned ((b4 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 16);
+ u64x4_store_unaligned ((b5 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 20);
+ u64x4_store_unaligned ((b6 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 24);
+ u64x4_store_unaligned ((b7 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 28);
+ b += 32;
+ bi += 32;
+ count -= 32;
+ }
#endif
while (count >= 4)
{
#ifdef CLIB_HAVE_VEC256
- u64x4 b0 = u32x4_extend_to_u64x4 (u32x4_load_unaligned (bi));
+ u64x4 b0 = u64x4_from_u32x4 (u32x4_load_unaligned (bi));
/* shift and add to get vlib_buffer_t pointer */
u64x4_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b);
#elif defined (CLIB_HAVE_VEC128)
u64x2 off = u64x2_splat (buffer_mem_start + offset);
u32x4 bi4 = u32x4_load_unaligned (bi);
- u64x2 b0 = u32x4_extend_to_u64x2 ((u32x4) bi4);
+ u64x2 b0 = u64x2_from_u32x4 ((u32x4) bi4);
#if defined (__aarch64__)
- u64x2 b1 = u32x4_extend_to_u64x2_high ((u32x4) bi4);
+ u64x2 b1 = u64x2_from_u32x4_high ((u32x4) bi4);
#else
bi4 = u32x4_shuffle (bi4, 2, 3, 0, 1);
- u64x2 b1 = u32x4_extend_to_u64x2 ((u32x4) bi4);
+ u64x2 b1 = u64x2_from_u32x4 ((u32x4) bi4);
#endif
u64x2_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b);
u64x2_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 2);
u8 *vlib_validate_buffer (vlib_main_t * vm, u32 buffer_index,
uword follow_chain);
+u8 *vlib_validate_buffers (vlib_main_t * vm,
+ u32 * buffers,
+ uword next_buffer_stride,
+ uword n_buffers,
+ vlib_buffer_known_state_t known_state,
+ uword follow_buffer_next);
+
static_always_inline vlib_buffer_pool_t *
vlib_get_buffer_pool (vlib_main_t * vm, u8 buffer_pool_index)
{
return vec_elt_at_index (bm->buffer_pools, buffer_pool_index);
}
-static_always_inline uword
+static_always_inline __clib_warn_unused_result uword
vlib_buffer_pool_get (vlib_main_t * vm, u8 buffer_pool_index, u32 * buffers,
u32 n_buffers)
{
ASSERT (bp->buffers);
clib_spinlock_lock (&bp->lock);
- len = vec_len (bp->buffers);
+ len = bp->n_avail;
if (PREDICT_TRUE (n_buffers < len))
{
len -= n_buffers;
vlib_buffer_copy_indices (buffers, bp->buffers + len, n_buffers);
- _vec_len (bp->buffers) = len;
+ bp->n_avail = len;
clib_spinlock_unlock (&bp->lock);
return n_buffers;
}
else
{
vlib_buffer_copy_indices (buffers, bp->buffers, len);
- _vec_len (bp->buffers) = 0;
+ bp->n_avail = 0;
clib_spinlock_unlock (&bp->lock);
return len;
}
less than the number requested or zero
*/
-always_inline u32
+always_inline __clib_warn_unused_result u32
vlib_buffer_alloc_from_pool (vlib_main_t * vm, u32 * buffers, u32 n_buffers,
u8 buffer_pool_index)
{
vlib_buffer_pool_thread_t *bpt;
u32 *src, *dst, len, n_left;
+ /* If buffer allocation fault injection is configured */
+ if (VLIB_BUFFER_ALLOC_FAULT_INJECTOR > 0)
+ {
+ u32 vlib_buffer_alloc_may_fail (vlib_main_t *, u32);
+
+ /* See how many buffers we're willing to allocate */
+ n_buffers = vlib_buffer_alloc_may_fail (vm, n_buffers);
+ if (n_buffers == 0)
+ return (n_buffers);
+ }
+
bp = vec_elt_at_index (bm->buffer_pools, buffer_pool_index);
bpt = vec_elt_at_index (bp->threads, vm->thread_index);
dst = buffers;
n_left = n_buffers;
- len = vec_len (bpt->cached_buffers);
+ len = bpt->n_cached;
/* per-thread cache contains enough buffers */
if (len >= n_buffers)
{
src = bpt->cached_buffers + len - n_buffers;
vlib_buffer_copy_indices (dst, src, n_buffers);
- _vec_len (bpt->cached_buffers) -= n_buffers;
+ bpt->n_cached -= n_buffers;
+
+ if (CLIB_DEBUG > 0)
+ vlib_buffer_validate_alloc_free (vm, buffers, n_buffers,
+ VLIB_BUFFER_KNOWN_FREE);
+ return n_buffers;
+ }
+
+ /* alloc bigger than cache - take buffers directly from main pool */
+ if (n_buffers >= VLIB_BUFFER_POOL_PER_THREAD_CACHE_SZ)
+ {
+ n_buffers = vlib_buffer_pool_get (vm, buffer_pool_index, buffers,
+ n_buffers);
if (CLIB_DEBUG > 0)
vlib_buffer_validate_alloc_free (vm, buffers, n_buffers,
if (len)
{
vlib_buffer_copy_indices (dst, bpt->cached_buffers, len);
- _vec_len (bpt->cached_buffers) = 0;
+ bpt->n_cached = 0;
dst += len;
n_left -= len;
}
len = round_pow2 (n_left, 32);
- vec_validate_aligned (bpt->cached_buffers, len - 1, CLIB_CACHE_LINE_BYTES);
len = vlib_buffer_pool_get (vm, buffer_pool_index, bpt->cached_buffers,
len);
- _vec_len (bpt->cached_buffers) = len;
+ bpt->n_cached = len;
if (len)
{
u32 n_copy = clib_min (len, n_left);
src = bpt->cached_buffers + len - n_copy;
vlib_buffer_copy_indices (dst, src, n_copy);
- _vec_len (bpt->cached_buffers) -= n_copy;
+ bpt->n_cached -= n_copy;
n_left -= n_copy;
}
@return - (u32) number of buffers actually allocated, may be
less than the number requested or zero
*/
-always_inline u32
+always_inline __clib_warn_unused_result u32
vlib_buffer_alloc_on_numa (vlib_main_t * vm, u32 * buffers, u32 n_buffers,
u32 numa_node)
{
less than the number requested or zero
*/
-always_inline u32
+always_inline __clib_warn_unused_result u32
vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers)
{
return vlib_buffer_alloc_on_numa (vm, buffers, n_buffers, vm->numa_node);
@return - (u32) number of buffers actually allocated, may be
less than the number requested or zero
*/
-always_inline u32
+always_inline __clib_warn_unused_result u32
vlib_buffer_alloc_to_ring (vlib_main_t * vm, u32 * ring, u32 start,
u32 ring_size, u32 n_buffers)
{
@return - (u32) number of buffers actually allocated, may be
less than the number requested or zero
*/
-always_inline u32
+always_inline __clib_warn_unused_result u32
vlib_buffer_alloc_to_ring_from_pool (vlib_main_t * vm, u32 * ring, u32 start,
u32 ring_size, u32 n_buffers,
u8 buffer_pool_index)
u32 * buffers, u32 n_buffers)
{
vlib_buffer_pool_t *bp = vlib_get_buffer_pool (vm, buffer_pool_index);
- vlib_buffer_pool_thread_t *bpt =
- vec_elt_at_index (bp->threads, vm->thread_index);
+ vlib_buffer_pool_thread_t *bpt = vec_elt_at_index (bp->threads,
+ vm->thread_index);
+ u32 n_cached, n_empty;
if (CLIB_DEBUG > 0)
vlib_buffer_validate_alloc_free (vm, buffers, n_buffers,
VLIB_BUFFER_KNOWN_ALLOCATED);
- vec_add_aligned (bpt->cached_buffers, buffers, n_buffers,
- CLIB_CACHE_LINE_BYTES);
-
- if (vec_len (bpt->cached_buffers) > 4 * VLIB_FRAME_SIZE)
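+ /* stash as many of the freed indices as fit in the per-thread cache;
+    any remainder is returned to the global pool under the spinlock below */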
+ n_cached = bpt->n_cached;
+ n_empty = VLIB_BUFFER_POOL_PER_THREAD_CACHE_SZ - n_cached;
+ if (n_buffers <= n_empty)
{
- clib_spinlock_lock (&bp->lock);
- /* keep last stored buffers, as they are more likely hot in the cache */
- vec_add_aligned (bp->buffers, bpt->cached_buffers, VLIB_FRAME_SIZE,
- CLIB_CACHE_LINE_BYTES);
- vec_delete (bpt->cached_buffers, VLIB_FRAME_SIZE, 0);
- bpt->n_alloc -= VLIB_FRAME_SIZE;
- clib_spinlock_unlock (&bp->lock);
+ vlib_buffer_copy_indices (bpt->cached_buffers + n_cached,
+ buffers, n_buffers);
+ bpt->n_cached = n_cached + n_buffers;
+ return;
}
+
+ vlib_buffer_copy_indices (bpt->cached_buffers + n_cached,
+ buffers + n_buffers - n_empty, n_empty);
+ bpt->n_cached = VLIB_BUFFER_POOL_PER_THREAD_CACHE_SZ;
+
+ clib_spinlock_lock (&bp->lock);
+ vlib_buffer_copy_indices (bp->buffers + bp->n_avail, buffers,
+ n_buffers - n_empty);
+ bp->n_avail += n_buffers - n_empty;
+ clib_spinlock_unlock (&bp->lock);
}
static_always_inline void
vlib_buffer_t bt = { };
#if defined(CLIB_HAVE_VEC128)
vlib_buffer_t bpi_mask = {.buffer_pool_index = ~0 };
- vlib_buffer_t bpi_vec = {.buffer_pool_index = ~0 };
+ vlib_buffer_t bpi_vec = {};
vlib_buffer_t flags_refs_mask = {
.flags = VLIB_BUFFER_NEXT_PRESENT,
.ref_count = ~1
};
#endif
+ if (PREDICT_FALSE (n_buffers == 0))
+ return;
+
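+ /* seed the free template and pool index from the first buffer; buffers
+    belonging to a different pool are expected to be caught by the
+    per-batch checks further down */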
+ vlib_buffer_t *b = vlib_get_buffer (vm, buffers[0]);
+ buffer_pool_index = b->buffer_pool_index;
+ bp = vlib_get_buffer_pool (vm, buffer_pool_index);
+ vlib_buffer_copy_template (&bt, &bp->buffer_template);
+#if defined(CLIB_HAVE_VEC128)
+ bpi_vec.buffer_pool_index = buffer_pool_index;
+#endif
+
while (n_buffers)
{
vlib_buffer_t *b[8];
u32 bi, sum = 0, flags, next;
- if (n_buffers < 12)
+ if (n_buffers < 4)
goto one_by_one;
vlib_get_buffers (vm, buffers, b, 4);
- vlib_get_buffers (vm, buffers + 8, b + 4, 4);
- vlib_prefetch_buffer_header (b[4], LOAD);
- vlib_prefetch_buffer_header (b[5], LOAD);
- vlib_prefetch_buffer_header (b[6], LOAD);
- vlib_prefetch_buffer_header (b[7], LOAD);
+ if (n_buffers >= 12)
+ {
+ vlib_get_buffers (vm, buffers + 8, b + 4, 4);
+ vlib_prefetch_buffer_header (b[4], LOAD);
+ vlib_prefetch_buffer_header (b[5], LOAD);
+ vlib_prefetch_buffer_header (b[6], LOAD);
+ vlib_prefetch_buffer_header (b[7], LOAD);
+ }
#if defined(CLIB_HAVE_VEC128)
u8x16 p0, p1, p2, p3, r;
int vlib_buffer_add_data (vlib_main_t * vm, u32 * buffer_index, void *data,
u32 n_data_bytes);
+/* Define vlib_buffer and vnet_buffer flag bits preserved for copy/clone */
+#define VLIB_BUFFER_COPY_CLONE_FLAGS_MASK \
+ (VLIB_BUFFER_NEXT_PRESENT | VLIB_BUFFER_TOTAL_LENGTH_VALID | \
+ VLIB_BUFFER_IS_TRACED | ~VLIB_BUFFER_FLAGS_ALL)
+
/* duplicate all buffers in chain */
always_inline vlib_buffer_t *
vlib_buffer_copy (vlib_main_t * vm, vlib_buffer_t * b)
{
vlib_buffer_t *s, *d, *fd;
uword n_alloc, n_buffers = 1;
- u32 flag_mask = VLIB_BUFFER_NEXT_PRESENT | VLIB_BUFFER_TOTAL_LENGTH_VALID;
+ u32 flag_mask = VLIB_BUFFER_COPY_CLONE_FLAGS_MASK;
int i;
s = b;
d->current_data = s->current_data;
d->current_length = s->current_length;
d->flags = s->flags & flag_mask;
+ d->trace_handle = s->trace_handle;
d->total_length_not_including_first_buffer =
s->total_length_not_including_first_buffer;
clib_memcpy_fast (d->opaque, s->opaque, sizeof (s->opaque));
d->total_length_not_including_first_buffer +=
s->total_length_not_including_first_buffer;
}
- d->flags = s->flags | VLIB_BUFFER_NEXT_PRESENT;
- d->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
+ d->flags = (s->flags & VLIB_BUFFER_COPY_CLONE_FLAGS_MASK) |
+ VLIB_BUFFER_NEXT_PRESENT;
+ d->trace_handle = s->trace_handle;
clib_memcpy_fast (d->opaque, s->opaque, sizeof (s->opaque));
clib_memcpy_fast (d->opaque2, s->opaque2, sizeof (s->opaque2));
clib_memcpy_fast (vlib_buffer_get_current (d),
d->next_buffer = src_buffer;
}
vlib_buffer_advance (s, head_end_offset);
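+ /* n_buffers may be zero if clone allocation failed; in that case leave
+    the existing ref_count untouched */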
- s->ref_count = n_buffers;
+ s->ref_count = n_buffers ? n_buffers : s->ref_count;
while (s->flags & VLIB_BUFFER_NEXT_PRESENT)
{
s = vlib_get_buffer (vm, s->next_buffer);
- s->ref_count = n_buffers;
+ s->ref_count = n_buffers ? n_buffers : s->ref_count;
}
return n_buffers;
void vlib_buffer_chain_validate (vlib_main_t * vm, vlib_buffer_t * first);
format_function_t format_vlib_buffer, format_vlib_buffer_and_data,
- format_vlib_buffer_contents;
+ format_vlib_buffer_contents, format_vlib_buffer_no_chain;
typedef struct
{