* Copyright(c) 2021 Cisco Systems, Inc.
*/
+#include <vppinfra/clib.h>
#include <vlib/vlib.h>
+#include <vppinfra/vector_funcs.h>
-void __clib_section (".vlib_buffer_enqueue_to_next_fn") CLIB_MULTIARCH_FN (
- vlib_buffer_enqueue_to_next_fn) (vlib_main_t *vm, vlib_node_runtime_t *node,
- u32 *buffers, u16 *nexts, uword count)
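+/* Per-call scratch state: a bitmap of elements already extracted and a
+ * cursor used to skip over fully-used bitmap words. */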
+typedef struct
{
- u32 *to_next, n_left_to_next, max;
- u16 next_index;
-
- next_index = nexts[0];
- vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
- max = clib_min (n_left_to_next, count);
+ uword used_elts[VLIB_FRAME_SIZE / 64];
+ u32 uword_offset;
+} extract_data_t;
- while (count)
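+/* Scan a block of up to 64 elements; copy those whose next-node index
+ * matches 'index' into dst, mark them in the used-elements bitmap and
+ * return the advanced dst pointer. */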
+static_always_inline u32 *
+extract_unused_elts_x64 (u32 *elts, u16 *indices, u16 index, int n_left,
+ u64 *bmp, u32 *dst)
+{
+ u64 mask = 0;
+#if defined(CLIB_HAVE_VEC128)
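+  /* one mask bit per element whose next index equals 'index' */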
+ mask = clib_compare_u16_x64 (index, indices);
+ if (n_left == 64)
{
- u32 n_enqueued;
- if ((nexts[0] != next_index) || n_left_to_next == 0)
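+	  /* fast path: all 64 elements go to the same next node */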
+ if (mask == ~0ULL)
{
- vlib_put_next_frame (vm, node, next_index, n_left_to_next);
- next_index = nexts[0];
- vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
- max = clib_min (n_left_to_next, count);
+ clib_memcpy_u32 (dst, elts, 64);
+ *bmp = ~0ULL;
+ return dst + 64;
}
-#if defined(CLIB_HAVE_VEC512)
- u16x32 next32 = CLIB_MEM_OVERFLOW_LOAD (u16x32_load_unaligned, nexts);
- next32 = (next32 == u16x32_splat (next32[0]));
- u64 bitmap = u16x32_msb_mask (next32);
- n_enqueued = count_trailing_zeros (~bitmap);
+ }
+ else
+ mask &= pow2_mask (n_left);
+
+ *bmp |= mask;
+
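+  /* pack matching elements with vector compress-store where available;
+   * each iteration consumes 16 (512-bit) or 8 (256-bit) mask bits */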
+#if defined(CLIB_HAVE_VEC512_COMPRESS)
+ u32x16u *ev = (u32x16u *) elts;
+ for (int i = 0; i < 4; i++)
+ {
+ int cnt = _popcnt32 ((u16) mask);
+ u32x16_compress_store (ev[i], mask, dst);
+ dst += cnt;
+ mask >>= 16;
+ }
+
+#elif defined(CLIB_HAVE_VEC256_COMPRESS)
+ u32x8u *ev = (u32x8u *) elts;
+ for (int i = 0; i < 8; i++)
+ {
+ int cnt = _popcnt32 ((u8) mask);
+ u32x8_compress_store (ev[i], mask, dst);
+ dst += cnt;
+ mask >>= 8;
+ }
#elif defined(CLIB_HAVE_VEC256)
- u16x16 next16 = CLIB_MEM_OVERFLOW_LOAD (u16x16_load_unaligned, nexts);
- next16 = (next16 == u16x16_splat (next16[0]));
- u64 bitmap = u8x32_msb_mask ((u8x32) next16);
- n_enqueued = count_trailing_zeros (~bitmap) / 2;
-#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK)
- u16x8 next8 = CLIB_MEM_OVERFLOW_LOAD (u16x8_load_unaligned, nexts);
- next8 = (next8 == u16x8_splat (next8[0]));
- u64 bitmap = u8x16_msb_mask ((u8x16) next8);
- n_enqueued = count_trailing_zeros (~bitmap) / 2;
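+  /* scalar extraction; clear_lowest_set_bit typically compiles to a
+   * single blsr instruction on CPUs which also provide AVX2 */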
+ while (mask)
+ {
+ u16 bit = count_trailing_zeros (mask);
+ mask = clear_lowest_set_bit (mask);
+ dst++[0] = elts[bit];
+ }
#else
- u16 x = 0;
- if (count + 3 < max)
+ while (mask)
+ {
+ u16 bit = count_trailing_zeros (mask);
+ mask ^= 1ULL << bit;
+ dst++[0] = elts[bit];
+ }
+#endif
+#else
+ for (int i = 0; i < n_left; i++)
+ {
+ if (indices[i] == index)
{
- x |= next_index ^ nexts[1];
- x |= next_index ^ nexts[2];
- x |= next_index ^ nexts[3];
- n_enqueued = (x == 0) ? 4 : 1;
+ dst++[0] = elts[i];
+ mask |= 1ULL << i;
}
- else
- n_enqueued = 1;
+ }
+ *bmp |= mask;
#endif
+ return dst;
+}
- if (PREDICT_FALSE (n_enqueued > max))
- n_enqueued = max;
+static_always_inline u32
+extract_unused_elts_by_index (extract_data_t *d, u32 *elts, u16 *indices,
+ u16 index, int n_left, u32 *dst)
+{
+ u32 *dst0 = dst;
+ u64 *bmp = d->used_elts;
+ while (n_left >= 64)
+ {
+ dst = extract_unused_elts_x64 (elts, indices, index, 64, bmp, dst);
-#ifdef CLIB_HAVE_VEC512
- if (n_enqueued >= 32)
- {
- vlib_buffer_copy_indices (to_next, buffers, 32);
- nexts += 32;
- to_next += 32;
- buffers += 32;
- n_left_to_next -= 32;
- count -= 32;
- max -= 32;
- continue;
- }
-#endif
+ /* next */
+ indices += 64;
+ elts += 64;
+ bmp++;
+ n_left -= 64;
+ }
-#ifdef CLIB_HAVE_VEC256
- if (n_enqueued >= 16)
- {
- vlib_buffer_copy_indices (to_next, buffers, 16);
- nexts += 16;
- to_next += 16;
- buffers += 16;
- n_left_to_next -= 16;
- count -= 16;
- max -= 16;
- continue;
- }
-#endif
+ if (n_left)
+ dst = extract_unused_elts_x64 (elts, indices, index, n_left, bmp, dst);
-#ifdef CLIB_HAVE_VEC128
- if (n_enqueued >= 8)
- {
- vlib_buffer_copy_indices (to_next, buffers, 8);
- nexts += 8;
- to_next += 8;
- buffers += 8;
- n_left_to_next -= 8;
- count -= 8;
- max -= 8;
- continue;
- }
-#endif
+ return dst - dst0;
+}
+
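+/* Find the index of the first element which has not been extracted yet,
+ * i.e. the first clear bit in the used-elements bitmap. */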
+static_always_inline u32
+find_first_unused_elt (extract_data_t *d)
+{
+ u64 *ue = d->used_elts + d->uword_offset;
+
+  while (PREDICT_FALSE (ue[0] == ~0ULL))
+ {
+ ue++;
+ d->uword_offset++;
+ }
+
+ return d->uword_offset * 64 + count_trailing_zeros (~ue[0]);
+}
+
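+/* Extract all buffers going to next_index into the node's next frame,
+ * spilling into a second frame if the first one fills up. Returns the
+ * number of buffers still left to enqueue. */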
+static_always_inline u32
+enqueue_one (vlib_main_t *vm, vlib_node_runtime_t *node, extract_data_t *d,
+ u16 next_index, u32 *buffers, u16 *nexts, u32 n_buffers,
+ u32 n_left, u32 *tmp)
+{
+ vlib_frame_t *f;
+ u32 n_extracted, n_free;
+ u32 *to;
+
+ f = vlib_get_next_frame_internal (vm, node, next_index, 0);
+
+ n_free = VLIB_FRAME_SIZE - f->n_vectors;
+
+  /* if the frame contains enough space for the worst case, we can avoid
+   * using tmp */
+ if (n_free >= n_left)
+ to = (u32 *) vlib_frame_vector_args (f) + f->n_vectors;
+ else
+ to = tmp;
+
+ n_extracted = extract_unused_elts_by_index (d, buffers, nexts, next_index,
+ n_buffers, to);
+
+ if (to != tmp)
+ {
+ /* indices already written to frame, just close it */
+ vlib_put_next_frame (vm, node, next_index, n_free - n_extracted);
+ }
+ else if (n_free >= n_extracted)
+ {
+ /* enough space in the existing frame */
+ to = (u32 *) vlib_frame_vector_args (f) + f->n_vectors;
+ vlib_buffer_copy_indices (to, tmp, n_extracted);
+ vlib_put_next_frame (vm, node, next_index, n_free - n_extracted);
+ }
+ else
+ {
+ /* full frame */
+ to = (u32 *) vlib_frame_vector_args (f) + f->n_vectors;
+ vlib_buffer_copy_indices (to, tmp, n_free);
+ vlib_put_next_frame (vm, node, next_index, 0);
+
+ /* second frame */
+ u32 n_2nd_frame = n_extracted - n_free;
+ f = vlib_get_next_frame_internal (vm, node, next_index, 1);
+ to = vlib_frame_vector_args (f);
+ vlib_buffer_copy_indices (to, tmp + n_free, n_2nd_frame);
+ vlib_put_next_frame (vm, node, next_index,
+ VLIB_FRAME_SIZE - n_2nd_frame);
+ }
+
+ return n_left - n_extracted;
+}
+
+void __clib_section (".vlib_buffer_enqueue_to_next_fn")
+CLIB_MULTIARCH_FN (vlib_buffer_enqueue_to_next_fn)
+(vlib_main_t *vm, vlib_node_runtime_t *node, u32 *buffers, u16 *nexts,
+ uword count)
+{
+ u32 tmp[VLIB_FRAME_SIZE];
+ u32 n_left;
+ u16 next_index;
- if (n_enqueued >= 4)
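+  /* process one full frame's worth of buffers at a time: take the next
+   * index of the first not-yet-extracted element and enqueue everything
+   * going to it, until all elements are used */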
+ while (count >= VLIB_FRAME_SIZE)
+ {
+ extract_data_t d = {};
+ n_left = VLIB_FRAME_SIZE;
+
+ next_index = nexts[0];
+ n_left = enqueue_one (vm, node, &d, next_index, buffers, nexts,
+ VLIB_FRAME_SIZE, n_left, tmp);
+
+ while (n_left)
{
- vlib_buffer_copy_indices (to_next, buffers, 4);
- nexts += 4;
- to_next += 4;
- buffers += 4;
- n_left_to_next -= 4;
- count -= 4;
- max -= 4;
- continue;
+ next_index = nexts[find_first_unused_elt (&d)];
+ n_left = enqueue_one (vm, node, &d, next_index, buffers, nexts,
+ VLIB_FRAME_SIZE, n_left, tmp);
}
- /* copy */
- to_next[0] = buffers[0];
+ buffers += VLIB_FRAME_SIZE;
+ nexts += VLIB_FRAME_SIZE;
+ count -= VLIB_FRAME_SIZE;
+ }
- /* next */
- nexts += 1;
- to_next += 1;
- buffers += 1;
- n_left_to_next -= 1;
- count -= 1;
- max -= 1;
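+  /* handle the tail of fewer than VLIB_FRAME_SIZE buffers */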
+ if (count)
+ {
+ extract_data_t d = {};
+ next_index = nexts[0];
+ n_left = count;
+
+ n_left = enqueue_one (vm, node, &d, next_index, buffers, nexts, count,
+ n_left, tmp);
+
+ while (n_left)
+ {
+ next_index = nexts[find_first_unused_elt (&d)];
+ n_left = enqueue_one (vm, node, &d, next_index, buffers, nexts,
+ count, n_left, tmp);
+ }
}
- vlib_put_next_frame (vm, node, next_index, n_left_to_next);
}
+
CLIB_MARCH_FN_REGISTRATION (vlib_buffer_enqueue_to_next_fn);
void __clib_section (".vlib_buffer_enqueue_to_single_next_fn")
- CLIB_MULTIARCH_FN (vlib_buffer_enqueue_to_single_next_fn) (
- vlib_main_t *vm, vlib_node_runtime_t *node, u32 *buffers, u16 next_index,
- u32 count)
+CLIB_MULTIARCH_FN (vlib_buffer_enqueue_to_single_next_fn)
+(vlib_main_t *vm, vlib_node_runtime_t *node, u32 *buffers, u16 next_index,
+ u32 count)
{
u32 *to_next, n_left_to_next, n_enq;
CLIB_MARCH_FN_REGISTRATION (vlib_buffer_enqueue_to_single_next_fn);
u32 __clib_section (".vlib_buffer_enqueue_to_thread_fn")
- CLIB_MULTIARCH_FN (vlib_buffer_enqueue_to_thread_fn) (
- vlib_main_t *vm, u32 frame_queue_index, u32 *buffer_indices,
- u16 *thread_indices, u32 n_packets, int drop_on_congestion)
+CLIB_MULTIARCH_FN (vlib_buffer_enqueue_to_thread_fn)
+(vlib_main_t *vm, u32 frame_queue_index, u32 *buffer_indices,
+ u16 *thread_indices, u32 n_packets, int drop_on_congestion)
{
vlib_thread_main_t *tm = vlib_get_thread_main ();
vlib_frame_queue_main_t *fqm;
CLIB_MARCH_FN_REGISTRATION (vlib_buffer_enqueue_to_thread_fn);
+/*
+ * Check the frame queue to see if any frames are available.
+ * If so, pull the packets off the frames and hand them to
+ * the handoff node.
+ */
+u32 __clib_section (".vlib_frame_queue_dequeue_fn")
+CLIB_MULTIARCH_FN (vlib_frame_queue_dequeue_fn)
+(vlib_main_t *vm, vlib_frame_queue_main_t *fqm)
+{
+ u32 thread_id = vm->thread_index;
+ vlib_frame_queue_t *fq = fqm->vlib_frame_queues[thread_id];
+ vlib_frame_queue_elt_t *elt;
+ u32 *from, *to;
+ vlib_frame_t *f;
+ int msg_type;
+ int processed = 0;
+ u32 vectors = 0;
+
+ ASSERT (fq);
+ ASSERT (vm == vlib_global_main.vlib_mains[thread_id]);
+
+ if (PREDICT_FALSE (fqm->node_index == ~0))
+ return 0;
+ /*
+ * Gather trace data for frame queues
+ */
+ if (PREDICT_FALSE (fq->trace))
+ {
+ frame_queue_trace_t *fqt;
+ frame_queue_nelt_counter_t *fqh;
+ u32 elix;
+
+ fqt = &fqm->frame_queue_traces[thread_id];
+
+ fqt->nelts = fq->nelts;
+ fqt->head = fq->head;
+ fqt->head_hint = fq->head_hint;
+ fqt->tail = fq->tail;
+ fqt->threshold = fq->vector_threshold;
+ fqt->n_in_use = fqt->tail - fqt->head;
+ if (fqt->n_in_use >= fqt->nelts)
+ {
+	  /* if beyond max then use max */
+ fqt->n_in_use = fqt->nelts - 1;
+ }
+
+ /* Record the number of elements in use in the histogram */
+ fqh = &fqm->frame_queue_histogram[thread_id];
+ fqh->count[fqt->n_in_use]++;
+
+ /* Record a snapshot of the elements in use */
+ for (elix = 0; elix < fqt->nelts; elix++)
+ {
+ elt = fq->elts + ((fq->head + 1 + elix) & (fq->nelts - 1));
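+	  /* the always-true condition snapshots every slot, valid or not */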
+ if (1 || elt->valid)
+ {
+ fqt->n_vectors[elix] = elt->n_vectors;
+ }
+ }
+ fqt->written = 1;
+ }
+
+ while (1)
+ {
+ vlib_buffer_t *b;
+ if (fq->head == fq->tail)
+ {
+ fq->head_hint = fq->head;
+ return processed;
+ }
+
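+      /* ring arithmetic relies on nelts being a power of two */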
+ elt = fq->elts + ((fq->head + 1) & (fq->nelts - 1));
+
+ if (!elt->valid)
+ {
+ fq->head_hint = fq->head;
+ return processed;
+ }
+
+ from = elt->buffer_index;
+ msg_type = elt->msg_type;
+
+ ASSERT (msg_type == VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME);
+ ASSERT (elt->n_vectors <= VLIB_FRAME_SIZE);
+
+ f = vlib_get_frame_to_node (vm, fqm->node_index);
+
+ /* If the first vector is traced, set the frame trace flag */
+ b = vlib_get_buffer (vm, from[0]);
+ if (b->flags & VLIB_BUFFER_IS_TRACED)
+ f->frame_flags |= VLIB_NODE_FLAG_TRACE;
+
+ to = vlib_frame_vector_args (f);
+
+ vlib_buffer_copy_indices (to, from, elt->n_vectors);
+
+ vectors += elt->n_vectors;
+ f->n_vectors = elt->n_vectors;
+ vlib_put_frame_to_node (vm, fqm->node_index, f);
+
+ elt->valid = 0;
+ elt->n_vectors = 0;
+ elt->msg_type = 0xfefefefe;
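+      /* make sure the slot is fully reset before head++ hands it back
+       * to the enqueue side */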
+ CLIB_MEMORY_BARRIER ();
+ fq->head++;
+ processed++;
+
+ /*
+ * Limit the number of packets pushed into the graph
+ */
+ if (vectors >= fq->vector_threshold)
+ {
+ fq->head_hint = fq->head;
+ return processed;
+ }
+ }
+ ASSERT (0);
+ return processed;
+}
+CLIB_MARCH_FN_REGISTRATION (vlib_frame_queue_dequeue_fn);
+
#ifndef CLIB_MARCH_VARIANT
vlib_buffer_func_main_t vlib_buffer_func_main;
CLIB_MARCH_FN_POINTER (vlib_buffer_enqueue_to_single_next_fn);
bfm->buffer_enqueue_to_thread_fn =
CLIB_MARCH_FN_POINTER (vlib_buffer_enqueue_to_thread_fn);
+ bfm->frame_queue_dequeue_fn =
+ CLIB_MARCH_FN_POINTER (vlib_frame_queue_dequeue_fn);
return 0;
}