From: Damjan Marion Date: Sat, 2 Jun 2018 18:42:07 +0000 (+0200) Subject: dpdk: improve buffer alloc performance X-Git-Tag: v18.07-rc1~236 X-Git-Url: https://gerrit.fd.io/r/gitweb?a=commitdiff_plain;h=8855386411af888e47c60645daa1fe6081fa56e1;p=vpp.git dpdk: improve buffer alloc performance This is ~50% improvement in buffer alloc performance. For a 256 buffer allocation, it was ~10 clocks/buffer, now is < 5 clocks. Change-Id: I97590e240a79a42bcab5eb26587fc2d11e6eb163 Signed-off-by: Damjan Marion --- diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index 3b3aaf2379f..78d5becad78 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -88,6 +88,7 @@ STATIC_ASSERT (VLIB_BUFFER_PRE_DATA_SIZE == RTE_PKTMBUF_HEADROOM, typedef struct { CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + struct rte_mbuf **mbuf_alloc_list; struct rte_mbuf ***mbuf_pending_free_list; /* cached last pool */ @@ -199,106 +200,91 @@ CLIB_MULTIARCH_FN (dpdk_buffer_fill_free_list) (vlib_main_t * vm, uword min_free_buffers) { dpdk_main_t *dm = &dpdk_main; - vlib_buffer_t *b0, *b1, *b2, *b3; - int n, i; - u32 bi0, bi1, bi2, bi3; + dpdk_buffer_main_t *dbm = &dpdk_buffer_main; + struct rte_mbuf **mb; + uword n_left, first; + word n_alloc; unsigned socket_id = rte_socket_id (); + u32 thread_index = vlib_get_thread_index (); + dpdk_buffer_per_thread_data *d = vec_elt_at_index (dbm->ptd, thread_index); struct rte_mempool *rmp = dm->pktmbuf_pools[socket_id]; dpdk_mempool_private_t *privp = rte_mempool_get_priv (rmp); - struct rte_mbuf *mb0, *mb1, *mb2, *mb3; vlib_buffer_t bt; + u32 *bi; /* Too early? */ if (PREDICT_FALSE (rmp == 0)) return 0; /* Already have enough free buffers on free list? */ - n = min_free_buffers - vec_len (fl->buffers); - if (n <= 0) + n_alloc = min_free_buffers - vec_len (fl->buffers); + if (n_alloc <= 0) return min_free_buffers; /* Always allocate round number of buffers. 
*/ - n = round_pow2 (n, CLIB_CACHE_LINE_BYTES / sizeof (u32)); + n_alloc = round_pow2 (n_alloc, CLIB_CACHE_LINE_BYTES / sizeof (u32)); /* Always allocate new buffers in reasonably large sized chunks. */ - n = clib_max (n, fl->min_n_buffers_each_alloc); + n_alloc = clib_max (n_alloc, fl->min_n_buffers_each_alloc); - vec_validate_aligned (vm->mbuf_alloc_list, n - 1, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (d->mbuf_alloc_list, n_alloc - 1, + CLIB_CACHE_LINE_BYTES); - if (rte_mempool_get_bulk (rmp, vm->mbuf_alloc_list, n) < 0) + if (rte_mempool_get_bulk (rmp, (void *) d->mbuf_alloc_list, n_alloc) < 0) return 0; memset (&bt, 0, sizeof (vlib_buffer_t)); vlib_buffer_init_for_free_list (&bt, fl); bt.buffer_pool_index = privp->buffer_pool_index; - _vec_len (vm->mbuf_alloc_list) = n; - - i = 0; - int f = vec_len (fl->buffers); - vec_resize_aligned (fl->buffers, n, CLIB_CACHE_LINE_BYTES); - - while (i < (n - 7)) - { - vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf - (vm->mbuf_alloc_list[i + 4]), STORE); - vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf - (vm->mbuf_alloc_list[i + 5]), STORE); - vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf - (vm->mbuf_alloc_list[i + 6]), STORE); - vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf - (vm->mbuf_alloc_list[i + 7]), STORE); - - mb0 = vm->mbuf_alloc_list[i]; - mb1 = vm->mbuf_alloc_list[i + 1]; - mb2 = vm->mbuf_alloc_list[i + 2]; - mb3 = vm->mbuf_alloc_list[i + 3]; - - b0 = vlib_buffer_from_rte_mbuf (mb0); - b1 = vlib_buffer_from_rte_mbuf (mb1); - b2 = vlib_buffer_from_rte_mbuf (mb2); - b3 = vlib_buffer_from_rte_mbuf (mb3); - - bi0 = vlib_get_buffer_index (vm, b0); - bi1 = vlib_get_buffer_index (vm, b1); - bi2 = vlib_get_buffer_index (vm, b2); - bi3 = vlib_get_buffer_index (vm, b3); - - fl->buffers[f++] = bi0; - fl->buffers[f++] = bi1; - fl->buffers[f++] = bi2; - fl->buffers[f++] = bi3; - - clib_memcpy64_x4 (b0, b1, b2, b3, &bt); - - if (fl->buffer_init_function) - { - fl->buffer_init_function (vm, 
fl, &bi0, 1); - fl->buffer_init_function (vm, fl, &bi1, 1); - fl->buffer_init_function (vm, fl, &bi2, 1); - fl->buffer_init_function (vm, fl, &bi3, 1); - } - i += 4; - } + _vec_len (d->mbuf_alloc_list) = n_alloc; - while (i < n) - { - mb0 = vm->mbuf_alloc_list[i]; + first = vec_len (fl->buffers); + vec_resize_aligned (fl->buffers, n_alloc, CLIB_CACHE_LINE_BYTES); - b0 = vlib_buffer_from_rte_mbuf (mb0); - bi0 = vlib_get_buffer_index (vm, b0); + n_left = n_alloc; + mb = d->mbuf_alloc_list; + bi = fl->buffers + first; - fl->buffers[f++] = bi0; - clib_memcpy (b0, &bt, sizeof (vlib_buffer_t)); + ASSERT (n_left % 8 == 0); - if (fl->buffer_init_function) - fl->buffer_init_function (vm, fl, &bi0, 1); - i++; + while (n_left >= 8) + { + if (PREDICT_FALSE (n_left < 24)) + goto no_prefetch; + + vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[16]), STORE); + vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[17]), STORE); + vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[18]), STORE); + vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[19]), STORE); + vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[20]), STORE); + vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[21]), STORE); + vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[22]), STORE); + vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[23]), STORE); + + no_prefetch: + vlib_get_buffer_indices_with_offset (vm, (void **) mb, bi, 8, + sizeof (struct rte_mbuf)); + clib_memcpy64_x4 (vlib_buffer_from_rte_mbuf (mb[0]), + vlib_buffer_from_rte_mbuf (mb[1]), + vlib_buffer_from_rte_mbuf (mb[2]), + vlib_buffer_from_rte_mbuf (mb[3]), &bt); + clib_memcpy64_x4 (vlib_buffer_from_rte_mbuf (mb[4]), + vlib_buffer_from_rte_mbuf (mb[5]), + vlib_buffer_from_rte_mbuf (mb[6]), + vlib_buffer_from_rte_mbuf (mb[7]), &bt); + + n_left -= 8; + mb += 8; + bi += 8; } - fl->n_alloc += n; + if (fl->buffer_init_function) + fl->buffer_init_function (vm, fl, fl->buffers + first, 
n_alloc); + + fl->n_alloc += n_alloc; - return n; + return n_alloc; } static_always_inline void diff --git a/src/vlib/main.h b/src/vlib/main.h index 57b1efb7513..16e4120067d 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -175,8 +175,6 @@ typedef struct vlib_main_t /* to compare with node runtime */ u32 thread_index; - void **mbuf_alloc_list; - /* List of init functions to call, setup by constructors */ _vlib_init_function_list_elt_t *init_function_registrations; _vlib_init_function_list_elt_t *worker_init_function_registrations; diff --git a/src/vlib/threads.c b/src/vlib/threads.c index edf5a0e0711..bbe94c7f272 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -827,7 +827,6 @@ start_workers (vlib_main_t * vm) vm_clone->thread_index = worker_thread_index; vm_clone->heap_base = w->thread_mheap; - vm_clone->mbuf_alloc_list = 0; vm_clone->init_functions_called = hash_create (0, /* value bytes */ 0); vm_clone->pending_rpc_requests = 0;