+/** \brief Allocate buffers into a ring from a specific buffer pool
+
+    @param vm - (vlib_main_t *) vlib main data structure pointer
+    @param ring - (u32 *) buffer index ring
+    @param start - (u32) first slot in the ring
+    @param ring_size - (u32) ring size
+    @param n_buffers - (u32) number of buffers requested
+    @param buffer_pool_index - (u8) index of the buffer pool to allocate from
+    @return - (u32) number of buffers actually allocated, may be
+    less than the number requested or zero
+*/
+always_inline u32
+vlib_buffer_alloc_to_ring_from_pool (vlib_main_t * vm, u32 * ring, u32 start,
+ u32 ring_size, u32 n_buffers,
+ u8 buffer_pool_index)
+{
+ u32 n_alloc;
+
+ ASSERT (n_buffers <= ring_size);
+
+ if (PREDICT_TRUE (start + n_buffers <= ring_size))
+ return vlib_buffer_alloc_from_pool (vm, ring + start, n_buffers,
+ buffer_pool_index);
+
+ n_alloc = vlib_buffer_alloc_from_pool (vm, ring + start, ring_size - start,
+ buffer_pool_index);
+
+ if (PREDICT_TRUE (n_alloc == ring_size - start))
+ n_alloc += vlib_buffer_alloc_from_pool (vm, ring, n_buffers - n_alloc,
+ buffer_pool_index);
+
+ return n_alloc;
+}
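+
+/* Usage sketch (illustrative only; the 'rxq' fields and 'n_refill' below are
+   hypothetical names, not part of this API):
+
+     u32 n_alloc;
+     n_alloc = vlib_buffer_alloc_to_ring_from_pool (vm, rxq->bufs, rxq->tail,
+                                                    rxq->size, n_refill,
+                                                    rxq->buffer_pool_index);
+     rxq->tail = (rxq->tail + n_alloc) & (rxq->size - 1);
+
+   The allocation wraps around the end of the ring and may return fewer
+   buffers than requested, so the tail must only advance by n_alloc.  The
+   masked tail update assumes a power-of-two ring size; use a modulo
+   otherwise. */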
+
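+/* Return freed buffer indices to the calling thread's cache for the given
+   buffer pool.  Once the cache grows beyond 4 frames, the oldest
+   VLIB_FRAME_SIZE entries are moved back to the global pool under the pool
+   spinlock. */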
+static_always_inline void
+vlib_buffer_pool_put (vlib_main_t * vm, u8 buffer_pool_index,
+ u32 * buffers, u32 n_buffers)
+{
+ vlib_buffer_pool_t *bp = vlib_get_buffer_pool (vm, buffer_pool_index);
+ vlib_buffer_pool_thread_t *bpt =
+ vec_elt_at_index (bp->threads, vm->thread_index);
+
+ if (CLIB_DEBUG > 0)
+ vlib_buffer_validate_alloc_free (vm, buffers, n_buffers,
+ VLIB_BUFFER_KNOWN_ALLOCATED);
+
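+  /* stash the freed buffers in this thread's cache for the pool */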
+ vec_add_aligned (bpt->cached_buffers, buffers, n_buffers,
+ CLIB_CACHE_LINE_BYTES);
+
+ if (vec_len (bpt->cached_buffers) > 4 * VLIB_FRAME_SIZE)
+ {
+ clib_spinlock_lock (&bp->lock);
+      /* return the oldest VLIB_FRAME_SIZE entries to the global pool and
+         keep the most recently stored buffers, as they are more likely to
+         still be hot in the CPU cache */
+ vec_add_aligned (bp->buffers, bpt->cached_buffers, VLIB_FRAME_SIZE,
+ CLIB_CACHE_LINE_BYTES);
+ vec_delete (bpt->cached_buffers, VLIB_FRAME_SIZE, 0);
+ bpt->n_alloc -= VLIB_FRAME_SIZE;
+ clib_spinlock_unlock (&bp->lock);
+ }
+}
+
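+/* Free a vector of buffer indices.  Buffers are staged in a small per-pool
+   queue and handed back with vlib_buffer_pool_put () once the queue fills
+   up.  Groups of 4 buffers take a fast path when they are unchained,
+   unshared (ref_count == 1) and belong to the current pool; anything else
+   drops to the one-by-one path, which walks buffer chains when maybe_next
+   is set and recycles a buffer only once its reference count reaches
+   zero. */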
+static_always_inline void
+vlib_buffer_free_inline (vlib_main_t * vm, u32 * buffers, u32 n_buffers,
+ int maybe_next)
+{
+ const int queue_size = 128;
+ vlib_buffer_pool_t *bp = 0;
+ u8 buffer_pool_index = ~0;
+ u32 n_queue = 0, queue[queue_size + 4];
+ vlib_buffer_t bt = { };
+#if defined(CLIB_HAVE_VEC128) && !__aarch64__
+ vlib_buffer_t bpi_mask = {.buffer_pool_index = ~0 };
+ vlib_buffer_t bpi_vec = {.buffer_pool_index = ~0 };
+  vlib_buffer_t flags_refs_mask = {
+    .flags = VLIB_BUFFER_NEXT_PRESENT,
+    /* ~1 so a ref_count of exactly 1 (sole owner) passes the fast-path check */
+    .ref_count = ~1
+  };
+#endif
+
+ while (n_buffers)
+ {
+ vlib_buffer_t *b[8];
+ u32 bi, sum = 0, flags, next;
+
+ if (n_buffers < 12)
+ goto one_by_one;
+
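+      /* translate the first 4 indices into headers; b[4]-b[7] point 8 slots
+         ahead so their headers can be prefetched for a later iteration */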
+ vlib_get_buffers (vm, buffers, b, 4);
+ vlib_get_buffers (vm, buffers + 8, b + 4, 4);
+
+ vlib_prefetch_buffer_header (b[4], LOAD);
+ vlib_prefetch_buffer_header (b[5], LOAD);
+ vlib_prefetch_buffer_header (b[6], LOAD);
+ vlib_prefetch_buffer_header (b[7], LOAD);
+
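+      /* fast-path check: all 4 buffers must belong to the current pool,
+         have ref_count == 1 and must not carry VLIB_BUFFER_NEXT_PRESENT */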
+#if defined(CLIB_HAVE_VEC128) && !__aarch64__
+ u8x16 p0, p1, p2, p3, r;
+ p0 = u8x16_load_unaligned (b[0]);
+ p1 = u8x16_load_unaligned (b[1]);
+ p2 = u8x16_load_unaligned (b[2]);
+ p3 = u8x16_load_unaligned (b[3]);
+
+ r = p0 ^ bpi_vec.as_u8x16[0];
+ r |= p1 ^ bpi_vec.as_u8x16[0];
+ r |= p2 ^ bpi_vec.as_u8x16[0];
+ r |= p3 ^ bpi_vec.as_u8x16[0];
+ r &= bpi_mask.as_u8x16[0];
+ r |= (p0 | p1 | p2 | p3) & flags_refs_mask.as_u8x16[0];
+
+ sum = !u8x16_is_all_zero (r);
+#else
+ sum |= b[0]->flags;
+ sum |= b[1]->flags;
+ sum |= b[2]->flags;
+ sum |= b[3]->flags;
+ sum &= VLIB_BUFFER_NEXT_PRESENT;
+ sum += b[0]->ref_count - 1;
+ sum += b[1]->ref_count - 1;
+ sum += b[2]->ref_count - 1;
+ sum += b[3]->ref_count - 1;
+ sum |= b[0]->buffer_pool_index ^ buffer_pool_index;
+ sum |= b[1]->buffer_pool_index ^ buffer_pool_index;
+ sum |= b[2]->buffer_pool_index ^ buffer_pool_index;
+ sum |= b[3]->buffer_pool_index ^ buffer_pool_index;
+#endif
+
+ if (sum)
+ goto one_by_one;
+
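+      /* all 4 buffers are free-able: queue their indices and reset their
+         metadata from the pool's buffer template */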
+ vlib_buffer_copy_indices (queue + n_queue, buffers, 4);
+ vlib_buffer_copy_template (b[0], &bt);
+ vlib_buffer_copy_template (b[1], &bt);
+ vlib_buffer_copy_template (b[2], &bt);
+ vlib_buffer_copy_template (b[3], &bt);
+ n_queue += 4;
+
+ vlib_buffer_validate (vm, b[0]);
+ vlib_buffer_validate (vm, b[1]);
+ vlib_buffer_validate (vm, b[2]);
+ vlib_buffer_validate (vm, b[3]);
+
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[1]);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[2]);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[3]);
+
+ if (n_queue >= queue_size)
+ {
+ vlib_buffer_pool_put (vm, buffer_pool_index, queue, n_queue);
+ n_queue = 0;
+ }
+ buffers += 4;
+ n_buffers -= 4;
+ continue;
+
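+      /* slow path: handle one buffer (and, if chained, its successors)
+         at a time */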
+ one_by_one:
+ bi = buffers[0];
+
+ next_in_chain:
+ b[0] = vlib_get_buffer (vm, bi);
+ flags = b[0]->flags;
+ next = b[0]->next_buffer;
+
+ if (PREDICT_FALSE (buffer_pool_index != b[0]->buffer_pool_index))
+ {
+
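+	  /* the buffer belongs to a different pool: flush what is queued for
+	     the previous pool, then switch template and pool pointers */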
+ if (n_queue)
+ {
+ vlib_buffer_pool_put (vm, buffer_pool_index, queue, n_queue);
+ n_queue = 0;
+ }
+
+ buffer_pool_index = b[0]->buffer_pool_index;
+#if defined(CLIB_HAVE_VEC128) && !__aarch64__
+ bpi_vec.buffer_pool_index = buffer_pool_index;
+#endif
+ bp = vlib_get_buffer_pool (vm, buffer_pool_index);
+ vlib_buffer_copy_template (&bt, &bp->buffer_template);
+ }
+
+ vlib_buffer_validate (vm, b[0]);
+
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]);
+
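+      /* drop one reference; recycle the buffer only when the last reference
+         goes away */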
+ if (clib_atomic_sub_fetch (&b[0]->ref_count, 1) == 0)
+ {
+ vlib_buffer_copy_template (b[0], &bt);
+ queue[n_queue++] = bi;
+ }
+
+ if (n_queue == queue_size)
+ {
+ vlib_buffer_pool_put (vm, buffer_pool_index, queue, queue_size);
+ n_queue = 0;
+ }
+
+      /* follow the buffer chain only when the caller allows it */
+      if (maybe_next && (flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ bi = next;
+ goto next_in_chain;
+ }
+
+ buffers++;
+ n_buffers--;
+ }
+
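+  /* flush whatever is still queued for the last pool */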
+ if (n_queue)
+ vlib_buffer_pool_put (vm, buffer_pool_index, queue, n_queue);
+}
+
+