+/*
+ * Hand n_buffers buffer indices back to buffer pool buffer_pool_index.
+ *
+ * Indices are stored in the calling thread's entry of pool->threads
+ * (selected by vm->thread_index) while it has room, i.e. up to
+ * VLIB_BUFFER_POOL_PER_THREAD_CACHE_SZ entries.  When the batch does not
+ * fit, the trailing indices of `buffers` top the per-thread cache up to
+ * full and the leading ones are appended to pool->buffers while holding
+ * pool->lock.
+ *
+ * NOTE(review): no capacity check on pool->buffers is visible here;
+ * presumably it is sized to hold every buffer of the pool - confirm.
+ */
+static_always_inline void
+vlib_buffer_pool_put (vlib_main_t * vm, u8 buffer_pool_index,
+                      u32 * buffers, u32 n_buffers)
+{
+  vlib_buffer_pool_t *pool = vlib_get_buffer_pool (vm, buffer_pool_index);
+  vlib_buffer_pool_thread_t *cache =
+    vec_elt_at_index (pool->threads, vm->thread_index);
+  u32 n_in_cache, n_room, n_spill;
+
+  /* debug images only: validate the indices against the ALLOCATED state */
+  if (CLIB_DEBUG > 0)
+    vlib_buffer_validate_alloc_free (vm, buffers, n_buffers,
+                                     VLIB_BUFFER_KNOWN_ALLOCATED);
+
+  n_in_cache = cache->n_cached;
+  n_room = VLIB_BUFFER_POOL_PER_THREAD_CACHE_SZ - n_in_cache;
+
+  if (n_buffers > n_room)
+    {
+      /* number of leading indices that do not fit in the per-thread cache */
+      n_spill = n_buffers - n_room;
+
+      /* the last n_room indices fill the per-thread cache completely */
+      vlib_buffer_copy_indices (cache->cached_buffers + n_in_cache,
+                                buffers + n_spill, n_room);
+      cache->n_cached = VLIB_BUFFER_POOL_PER_THREAD_CACHE_SZ;
+
+      /* the first n_spill indices go to the pool-wide array, under lock */
+      clib_spinlock_lock (&pool->lock);
+      vlib_buffer_copy_indices (pool->buffers + pool->n_avail, buffers,
+                                n_spill);
+      pool->n_avail += n_spill;
+      clib_spinlock_unlock (&pool->lock);
+      return;
+    }
+
+  /* everything fits: append to the per-thread cache, no locking needed */
+  vlib_buffer_copy_indices (cache->cached_buffers + n_in_cache, buffers,
+                            n_buffers);
+  cache->n_cached = n_in_cache + n_buffers;
+}
+
+/*
+ * Release n_buffers buffers given by index in `buffers`.
+ *
+ * Buffers whose release completes here have their header overwritten with
+ * the buffer template of their pool (bp->buffer_template) and their index
+ * collected in a local queue, which is handed to vlib_buffer_pool_put ()
+ * whenever it fills up, whenever the buffer pool changes, and once more at
+ * the end.
+ *
+ * Two paths are used:
+ *  - a fast path handling 4 buffers per iteration, taken only when at
+ *    least 12 buffers remain and the 4 headers pass the combined
+ *    pool-index / flags / ref_count check below;
+ *  - a one-by-one path which atomically decrements ref_count and, when
+ *    maybe_next is non-zero, also walks the next_buffer chain of buffers
+ *    flagged VLIB_BUFFER_NEXT_PRESENT.
+ */
+static_always_inline void
+vlib_buffer_free_inline (vlib_main_t * vm, u32 * buffers, u32 n_buffers,
+ int maybe_next)
+{
+ const int queue_size = 128;
+ vlib_buffer_pool_t *bp = 0;
+ /* NOTE(review): starts as ~0 so the first buffer is expected to take the
+ * one-by-one path and load bp / bt; presumably ~0 is never a valid pool
+ * index - confirm. */
+ u8 buffer_pool_index = ~0;
+ /* 4 spare slots: the fast path appends 4 indices before it checks whether
+ * the queue has reached queue_size */
+ u32 n_queue = 0, queue[queue_size + 4];
+ /* template copied into released buffer headers; loaded from the pool in
+ * the one-by-one path whenever the pool index changes */
+ vlib_buffer_t bt = { };
+#if defined(CLIB_HAVE_VEC128)
+ /* selects the buffer_pool_index field of a buffer header */
+ vlib_buffer_t bpi_mask = {.buffer_pool_index = ~0 };
+ /* holds the current pool index for the vector comparison */
+ vlib_buffer_t bpi_vec = {.buffer_pool_index = ~0 };
+ /* selects the NEXT_PRESENT flag and every ref_count bit except bit 0 */
+ vlib_buffer_t flags_refs_mask = {
+ .flags = VLIB_BUFFER_NEXT_PRESENT,
+ .ref_count = ~1
+ };
+#endif
+
+ while (n_buffers)
+ {
+ vlib_buffer_t *b[8];
+ u32 bi, sum = 0, flags, next;
+
+ /* the fast path reads buffers[0..3] and buffers[8..11] */
+ if (n_buffers < 12)
+ goto one_by_one;
+
+ /* b[0..3]: buffers handled in this iteration;
+ * b[4..7]: buffers[8..11], only used for the header prefetch below */
+ vlib_get_buffers (vm, buffers, b, 4);
+ vlib_get_buffers (vm, buffers + 8, b + 4, 4);
+
+ vlib_prefetch_buffer_header (b[4], LOAD);
+ vlib_prefetch_buffer_header (b[5], LOAD);
+ vlib_prefetch_buffer_header (b[6], LOAD);
+ vlib_prefetch_buffer_header (b[7], LOAD);
+
+#if defined(CLIB_HAVE_VEC128)
+ /* assumes flags, ref_count and buffer_pool_index all sit within the
+ * 16 bytes loaded from the start of each header, as implied by the
+ * as_u8x16[0] masks - TODO confirm against vlib_buffer_t layout */
+ u8x16 p0, p1, p2, p3, r;
+ p0 = u8x16_load_unaligned (b[0]);
+ p1 = u8x16_load_unaligned (b[1]);
+ p2 = u8x16_load_unaligned (b[2]);
+ p3 = u8x16_load_unaligned (b[3]);
+
+ /* r is non-zero in the pool-index bytes if any of the 4 buffers
+ * belongs to a pool other than the current one ... */
+ r = p0 ^ bpi_vec.as_u8x16[0];
+ r |= p1 ^ bpi_vec.as_u8x16[0];
+ r |= p2 ^ bpi_vec.as_u8x16[0];
+ r |= p3 ^ bpi_vec.as_u8x16[0];
+ r &= bpi_mask.as_u8x16[0];
+ /* ... or if any has NEXT_PRESENT set or a ref_count bit other than
+ * bit 0 set */
+ r |= (p0 | p1 | p2 | p3) & flags_refs_mask.as_u8x16[0];
+
+ sum = !u8x16_is_all_zero (r);
+#else
+ /* scalar variant: NEXT_PRESENT on any buffer, ref_count other than 1,
+ * or a pool index mismatch leaves sum non-zero.
+ * NOTE(review): the (ref_count - 1) terms are added, so a count of 0
+ * on one buffer and 2 on another would cancel out; presumably
+ * ref_count is never 0 here - confirm. */
+ sum |= b[0]->flags;
+ sum |= b[1]->flags;
+ sum |= b[2]->flags;
+ sum |= b[3]->flags;
+ sum &= VLIB_BUFFER_NEXT_PRESENT;
+ sum += b[0]->ref_count - 1;
+ sum += b[1]->ref_count - 1;
+ sum += b[2]->ref_count - 1;
+ sum += b[3]->ref_count - 1;
+ sum |= b[0]->buffer_pool_index ^ buffer_pool_index;
+ sum |= b[1]->buffer_pool_index ^ buffer_pool_index;
+ sum |= b[2]->buffer_pool_index ^ buffer_pool_index;
+ sum |= b[3]->buffer_pool_index ^ buffer_pool_index;
+#endif
+
+ /* any irregular buffer among the 4: fall back to handling buffers[0]
+ * alone; the remaining ones are re-examined on later iterations */
+ if (sum)
+ goto one_by_one;
+
+ /* fast path: queue all 4 indices and reset their headers from the
+ * template; ref_count is not decremented on this path */
+ vlib_buffer_copy_indices (queue + n_queue, buffers, 4);
+ vlib_buffer_copy_template (b[0], &bt);
+ vlib_buffer_copy_template (b[1], &bt);
+ vlib_buffer_copy_template (b[2], &bt);
+ vlib_buffer_copy_template (b[3], &bt);
+ n_queue += 4;
+
+ vlib_buffer_validate (vm, b[0]);
+ vlib_buffer_validate (vm, b[1]);
+ vlib_buffer_validate (vm, b[2]);
+ vlib_buffer_validate (vm, b[3]);
+
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[1]);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[2]);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[3]);
+
+ /* n_queue may exceed queue_size by up to 3 here, hence >= */
+ if (n_queue >= queue_size)
+ {
+ vlib_buffer_pool_put (vm, buffer_pool_index, queue, n_queue);
+ n_queue = 0;
+ }
+ buffers += 4;
+ n_buffers -= 4;
+ continue;
+
+ /* slow path: release buffers[0] and, optionally, its chain */
+ one_by_one:
+ bi = buffers[0];
+
+ next_in_chain:
+ b[0] = vlib_get_buffer (vm, bi);
+ /* read flags and next_buffer now: the header may be overwritten by
+ * the template copy further down */
+ flags = b[0]->flags;
+ next = b[0]->next_buffer;
+
+ if (PREDICT_FALSE (buffer_pool_index != b[0]->buffer_pool_index))
+ {
+
+ /* queued indices belong to the previous pool: flush them before
+ * switching */
+ if (n_queue)
+ {
+ vlib_buffer_pool_put (vm, buffer_pool_index, queue, n_queue);
+ n_queue = 0;
+ }
+
+ /* switch to the new pool and reload the header template from it */
+ buffer_pool_index = b[0]->buffer_pool_index;
+#if defined(CLIB_HAVE_VEC128)
+ bpi_vec.buffer_pool_index = buffer_pool_index;
+#endif
+ bp = vlib_get_buffer_pool (vm, buffer_pool_index);
+ vlib_buffer_copy_template (&bt, &bp->buffer_template);
+ }
+
+ vlib_buffer_validate (vm, b[0]);
+
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]);
+
+ /* only the call that drops ref_count to 0 resets the header and
+ * queues the index for return to the pool */
+ if (clib_atomic_sub_fetch (&b[0]->ref_count, 1) == 0)
+ {
+ vlib_buffer_copy_template (b[0], &bt);
+ queue[n_queue++] = bi;
+ }
+
+ /* n_queue grows by at most 1 per pass here, so == is sufficient */
+ if (n_queue == queue_size)
+ {
+ vlib_buffer_pool_put (vm, buffer_pool_index, queue, queue_size);
+ n_queue = 0;
+ }
+
+ /* follow the chain using the values captured before the decrement */
+ if (maybe_next && (flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ bi = next;
+ goto next_in_chain;
+ }
+
+ buffers++;
+ n_buffers--;
+ }
+
+ /* hand back whatever is still queued */
+ if (n_queue)
+ vlib_buffer_pool_put (vm, buffer_pool_index, queue, n_queue);
+}
+
+