From 68b4da67deb2e8ca224bb5abaeb9dbc7ae8e378c Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Sun, 30 Sep 2018 18:26:20 +0200 Subject: [PATCH] Numa-aware, growable physical memory allocator (pmalloc) Change-Id: Ic4c46bc733afae8bf0d8146623ed15633928de30 Signed-off-by: Damjan Marion --- src/plugins/avf/avf.h | 4 +- src/plugins/avf/device.c | 103 ++++---- src/plugins/avf/input.c | 12 +- src/plugins/avf/output.c | 4 +- src/plugins/dpdk/buffer.c | 34 ++- src/plugins/dpdk/device/dpdk.h | 3 +- src/plugins/dpdk/ipsec/ipsec.c | 21 +- src/plugins/ixge/ixge.c | 35 +-- src/plugins/ixge/ixge.h | 2 - src/plugins/memif/memif.c | 10 +- src/plugins/vmxnet3/vmxnet3.c | 73 ++---- src/plugins/vmxnet3/vmxnet3.h | 6 +- src/vlib/CMakeLists.txt | 2 +- src/vlib/buffer.c | 75 ++---- src/vlib/buffer.h | 16 +- src/vlib/buffer_funcs.h | 5 +- src/vlib/linux/pci.c | 34 +++ src/vlib/linux/physmem.c | 307 ---------------------- src/vlib/linux/vfio.c | 68 +++-- src/vlib/linux/vfio.h | 4 +- src/vlib/main.c | 2 +- src/vlib/main.h | 19 +- src/vlib/pci/pci.h | 5 + src/vlib/physmem.c | 150 +++++++++++ src/vlib/physmem.h | 32 +-- src/vlib/physmem_funcs.h | 113 +++------ src/vlib/unix/main.c | 1 - src/vlib/unix/unix.h | 2 - src/vppinfra/CMakeLists.txt | 3 + src/vppinfra/linux/syscall.h | 6 + src/vppinfra/pmalloc.c | 562 +++++++++++++++++++++++++++++++++++++++++ src/vppinfra/pmalloc.h | 131 ++++++++++ src/vppinfra/test_pmalloc.c | 169 +++++++++++++ 33 files changed, 1300 insertions(+), 713 deletions(-) delete mode 100755 src/vlib/linux/physmem.c create mode 100755 src/vlib/physmem.c create mode 100644 src/vppinfra/pmalloc.c create mode 100644 src/vppinfra/pmalloc.h create mode 100644 src/vppinfra/test_pmalloc.c diff --git a/src/plugins/avf/avf.h b/src/plugins/avf/avf.h index 187e5c2cd69..4fa19b11b73 100644 --- a/src/plugins/avf/avf.h +++ b/src/plugins/avf/avf.h @@ -26,7 +26,7 @@ _(0, INITIALIZED, "initialized") \ _(1, ERROR, "error") \ _(2, ADMIN_UP, "admin-up") \ - _(3, IOVA, "iova") \ + _(3, VA_DMA, "vaddr-dma") \ _(4, LINK_UP, "link-up") \ _(5, SHARED_TXQ_LOCK, "shared-txq-lock") \ _(6, ELOG, "elog") @@ -188,8 +188,6 @@ typedef struct avf_device_t *devices; avf_per_thread_data_t *per_thread_data; - vlib_physmem_region_index_t physmem_region; - int physmem_region_alloc; vlib_log_class_t log_class; diff --git a/src/plugins/avf/device.c b/src/plugins/avf/device.c index 914c32cab4b..713953fe4c6 100644 --- a/src/plugins/avf/device.c +++ b/src/plugins/avf/device.c @@ -215,18 +215,23 @@ avf_cmd_rx_ctl_reg_write (vlib_main_t * vm, avf_device_t * ad, u32 reg, clib_error_t * avf_rxq_init (vlib_main_t * vm, avf_device_t * ad, u16 qid, u16 rxq_size) { - avf_main_t *am = &avf_main; + clib_error_t *err; avf_rxq_t *rxq; - clib_error_t *error = 0; u32 n_alloc, i; vec_validate_aligned (ad->rxqs, qid, CLIB_CACHE_LINE_BYTES); rxq = vec_elt_at_index (ad->rxqs, qid); rxq->size = rxq_size; rxq->next = 0; - rxq->descs = vlib_physmem_alloc_aligned (vm, am->physmem_region, &error, - rxq->size * sizeof (avf_rx_desc_t), + rxq->descs = vlib_physmem_alloc_aligned (vm, rxq->size * + sizeof (avf_rx_desc_t), 2 * CLIB_CACHE_LINE_BYTES); + if (rxq->descs == 0) + return vlib_physmem_last_error (vm); + + if ((err = vlib_pci_map_dma (vm, ad->pci_dev_handle, (void *) rxq->descs))) + return err; + clib_memset ((void *) rxq->descs, 0, rxq->size * sizeof (avf_rx_desc_t)); vec_validate_aligned (rxq->bufs, rxq->size, CLIB_CACHE_LINE_BYTES); rxq->qrx_tail = ad->bar0 + AVF_QRX_TAIL (qid); @@ -241,7 +246,7 @@ avf_rxq_init (vlib_main_t * vm, avf_device_t * ad, u16 qid, 
u16 rxq_size) for (i = 0; i < n_alloc; i++) { vlib_buffer_t *b = vlib_get_buffer (vm, rxq->bufs[i]); - if (ad->flags & AVF_DEVICE_F_IOVA) + if (ad->flags & AVF_DEVICE_F_VA_DMA) d->qword[0] = vlib_buffer_get_va (b); else d->qword[0] = vlib_buffer_get_pa (vm, b); @@ -255,9 +260,8 @@ avf_rxq_init (vlib_main_t * vm, avf_device_t * ad, u16 qid, u16 rxq_size) clib_error_t * avf_txq_init (vlib_main_t * vm, avf_device_t * ad, u16 qid, u16 txq_size) { - avf_main_t *am = &avf_main; + clib_error_t *err; avf_txq_t *txq; - clib_error_t *error = 0; if (qid >= ad->num_queue_pairs) { @@ -273,9 +277,15 @@ avf_txq_init (vlib_main_t * vm, avf_device_t * ad, u16 qid, u16 txq_size) txq = vec_elt_at_index (ad->txqs, qid); txq->size = txq_size; txq->next = 0; - txq->descs = vlib_physmem_alloc_aligned (vm, am->physmem_region, &error, - txq->size * sizeof (avf_tx_desc_t), + txq->descs = vlib_physmem_alloc_aligned (vm, txq->size * + sizeof (avf_tx_desc_t), 2 * CLIB_CACHE_LINE_BYTES); + if (txq->descs == 0) + return vlib_physmem_last_error (vm); + + if ((err = vlib_pci_map_dma (vm, ad->pci_dev_handle, (void *) txq->descs))) + return err; + vec_validate_aligned (txq->bufs, txq->size, CLIB_CACHE_LINE_BYTES); txq->qtx_tail = ad->bar0 + AVF_QTX_TAIL (qid); @@ -305,10 +315,8 @@ avf_arq_slot_init (avf_device_t * ad, u16 slot) static inline uword avf_dma_addr (vlib_main_t * vm, avf_device_t * ad, void *p) { - avf_main_t *am = &avf_main; - return (ad->flags & AVF_DEVICE_F_IOVA) ? - pointer_to_uword (p) : - vlib_physmem_virtual_to_physical (vm, am->physmem_region, p); + return (ad->flags & AVF_DEVICE_F_VA_DMA) ? + pointer_to_uword (p) : vlib_physmem_get_pa (vm, p); } static void @@ -1126,16 +1134,16 @@ avf_delete_if (vlib_main_t * vm, avf_device_t * ad) vlib_pci_device_close (vm, ad->pci_dev_handle); - vlib_physmem_free (vm, am->physmem_region, ad->atq); - vlib_physmem_free (vm, am->physmem_region, ad->arq); - vlib_physmem_free (vm, am->physmem_region, ad->atq_bufs); - vlib_physmem_free (vm, am->physmem_region, ad->arq_bufs); + vlib_physmem_free (vm, ad->atq); + vlib_physmem_free (vm, ad->arq); + vlib_physmem_free (vm, ad->atq_bufs); + vlib_physmem_free (vm, ad->arq_bufs); /* *INDENT-OFF* */ vec_foreach_index (i, ad->rxqs) { avf_rxq_t *rxq = vec_elt_at_index (ad->rxqs, i); - vlib_physmem_free (vm, am->physmem_region, (void *) rxq->descs); + vlib_physmem_free (vm, (void *) rxq->descs); if (rxq->n_enqueued) vlib_buffer_free_from_ring (vm, rxq->bufs, rxq->next, rxq->size, rxq->n_enqueued); @@ -1148,7 +1156,7 @@ avf_delete_if (vlib_main_t * vm, avf_device_t * ad) vec_foreach_index (i, ad->txqs) { avf_txq_t *txq = vec_elt_at_index (ad->txqs, i); - vlib_physmem_free (vm, am->physmem_region, (void *) txq->descs); + vlib_physmem_free (vm, (void *) txq->descs); if (txq->n_enqueued) { u16 first = (txq->next - txq->n_enqueued) & (txq->size -1); @@ -1226,44 +1234,51 @@ avf_create_if (vlib_main_t * vm, avf_create_if_args_t * args) if ((error = vlib_pci_enable_msix_irq (vm, h, 0, 2))) goto error; - if (am->physmem_region_alloc == 0) + if (!(ad->atq = vlib_physmem_alloc (vm, sizeof (avf_aq_desc_t) * + AVF_MBOX_LEN))) { - u32 flags = VLIB_PHYSMEM_F_INIT_MHEAP | VLIB_PHYSMEM_F_HUGETLB; - error = vlib_physmem_region_alloc (vm, "avf descriptors", 4 << 20, 0, - flags, &am->physmem_region); - if (error) - goto error; - am->physmem_region_alloc = 1; + error = vlib_physmem_last_error (vm); + goto error; } - ad->atq = vlib_physmem_alloc_aligned (vm, am->physmem_region, &error, - sizeof (avf_aq_desc_t) * AVF_MBOX_LEN, - 64); - if (error) + + if 
((error = vlib_pci_map_dma (vm, h, ad->atq))) goto error; - ad->arq = vlib_physmem_alloc_aligned (vm, am->physmem_region, &error, - sizeof (avf_aq_desc_t) * AVF_MBOX_LEN, - 64); - if (error) + if (!(ad->arq = vlib_physmem_alloc (vm, sizeof (avf_aq_desc_t) * + AVF_MBOX_LEN))) + { + error = vlib_physmem_last_error (vm); + goto error; + } + + if ((error = vlib_pci_map_dma (vm, h, ad->arq))) goto error; - ad->atq_bufs = vlib_physmem_alloc_aligned (vm, am->physmem_region, &error, - AVF_MBOX_BUF_SZ * AVF_MBOX_LEN, - 64); - if (error) + if (!(ad->atq_bufs = vlib_physmem_alloc (vm, AVF_MBOX_BUF_SZ * + AVF_MBOX_LEN))) + { + error = vlib_physmem_last_error (vm); + goto error; + } + + if ((error = vlib_pci_map_dma (vm, h, ad->atq_bufs))) goto error; - ad->arq_bufs = vlib_physmem_alloc_aligned (vm, am->physmem_region, &error, - AVF_MBOX_BUF_SZ * AVF_MBOX_LEN, - 64); - if (error) + if (!(ad->arq_bufs = vlib_physmem_alloc (vm, AVF_MBOX_BUF_SZ * + AVF_MBOX_LEN))) + { + error = vlib_physmem_last_error (vm); + goto error; + } + + if ((error = vlib_pci_map_dma (vm, h, ad->arq_bufs))) goto error; if ((error = vlib_pci_intr_enable (vm, h))) goto error; - /* FIXME detect */ - ad->flags |= AVF_DEVICE_F_IOVA; + if (vlib_pci_supports_virtual_addr_dma (vm, h)) + ad->flags |= AVF_DEVICE_F_VA_DMA; if ((error = avf_device_init (vm, am, ad, args))) goto error; diff --git a/src/plugins/avf/input.c b/src/plugins/avf/input.c index efe3b0e0251..6d39ed646b5 100644 --- a/src/plugins/avf/input.c +++ b/src/plugins/avf/input.c @@ -48,7 +48,7 @@ static __clib_unused char *avf_input_error_strings[] = { #define AVF_INPUT_REFILL_TRESHOLD 32 static_always_inline void avf_rxq_refill (vlib_main_t * vm, vlib_node_runtime_t * node, avf_rxq_t * rxq, - int use_iova) + int use_va_dma) { u16 n_refill, mask, n_alloc, slot; u32 s0, s1, s2, s3; @@ -103,7 +103,7 @@ avf_rxq_refill (vlib_main_t * vm, vlib_node_runtime_t * node, avf_rxq_t * rxq, b[2] = vlib_get_buffer (vm, rxq->bufs[s2]); b[3] = vlib_get_buffer (vm, rxq->bufs[s3]); - if (use_iova) + if (use_va_dma) { d[0]->qword[0] = vlib_buffer_get_va (b[0]); d[1]->qword[0] = vlib_buffer_get_va (b[1]); @@ -132,7 +132,7 @@ avf_rxq_refill (vlib_main_t * vm, vlib_node_runtime_t * node, avf_rxq_t * rxq, s0 = slot; d[0] = ((avf_rx_desc_t *) rxq->descs) + s0; b[0] = vlib_get_buffer (vm, rxq->bufs[s0]); - if (use_iova) + if (use_va_dma) d[0]->qword[0] = vlib_buffer_get_va (b[0]); else d[0]->qword[0] = vlib_buffer_get_pa (vm, b[0]); @@ -433,10 +433,10 @@ avf_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto done; /* refill rx ring */ - if (ad->flags & AVF_DEVICE_F_IOVA) - avf_rxq_refill (vm, node, rxq, 1 /* use_iova */ ); + if (ad->flags & AVF_DEVICE_F_VA_DMA) + avf_rxq_refill (vm, node, rxq, 1 /* use_va_dma */ ); else - avf_rxq_refill (vm, node, rxq, 0 /* use_iova */ ); + avf_rxq_refill (vm, node, rxq, 0 /* use_va_dma */ ); vlib_get_buffers (vm, buffer_indices, bufs, n_rxv); n_rx_packets = n_rxv; diff --git a/src/plugins/avf/output.c b/src/plugins/avf/output.c index c2c4b01d022..72892d16dff 100644 --- a/src/plugins/avf/output.c +++ b/src/plugins/avf/output.c @@ -118,7 +118,7 @@ retry: b2 = vlib_get_buffer (vm, bi2); b3 = vlib_get_buffer (vm, bi3); - if (ad->flags & AVF_DEVICE_F_IOVA) + if (ad->flags & AVF_DEVICE_F_VA_DMA) { d0->qword[0] = vlib_buffer_get_current_va (b0); d1->qword[0] = vlib_buffer_get_current_va (b1); @@ -151,7 +151,7 @@ retry: txq->bufs[next] = bi0; b0 = vlib_get_buffer (vm, bi0); - if (ad->flags & AVF_DEVICE_F_IOVA) + if (ad->flags & AVF_DEVICE_F_VA_DMA) 
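+      /* With VA DMA the IOMMU translates buffer virtual addresses, so the
+         descriptor can carry the VA directly; otherwise the physical
+         address is used below. */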
d0->qword[0] = vlib_buffer_get_current_va (b0); else d0->qword[0] = vlib_buffer_get_current_pa (vm, b0); diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index f7ed932e35f..770e76d398e 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -393,12 +393,12 @@ dpdk_packet_template_init (vlib_main_t * vm, clib_error_t * dpdk_pool_create (vlib_main_t * vm, u8 * pool_name, u32 elt_size, u32 num_elts, u32 pool_priv_size, u16 cache_size, u8 numa, - struct rte_mempool **_mp, vlib_physmem_region_index_t * pri) + struct rte_mempool **_mp, u32 * map_index) { struct rte_mempool *mp; enum rte_iova_mode iova_mode; - vlib_physmem_region_t *pr; dpdk_mempool_private_t priv; + vlib_physmem_map_t *pm; clib_error_t *error = 0; size_t min_chunk_size, align; int map_dma = 1; @@ -406,7 +406,6 @@ dpdk_pool_create (vlib_main_t * vm, u8 * pool_name, u32 elt_size, i32 ret; uword i; - mp = rte_mempool_create_empty ((char *) pool_name, num_elts, elt_size, 512, pool_priv_size, numa, 0); if (!mp) @@ -417,16 +416,13 @@ dpdk_pool_create (vlib_main_t * vm, u8 * pool_name, u32 elt_size, size = rte_mempool_op_calc_mem_size_default (mp, num_elts, 21, &min_chunk_size, &align); - error = vlib_physmem_region_alloc (vm, (char *) pool_name, size, numa, - VLIB_PHYSMEM_F_HUGETLB | - VLIB_PHYSMEM_F_SHARED, pri); - if (error) + if ((error = vlib_physmem_shared_map_create (vm, (char *) pool_name, size, + numa, map_index))) { rte_mempool_free (mp); return error; } - - pr = vlib_physmem_get_region (vm, pri[0]); + pm = vlib_physmem_get_map (vm, *map_index); /* Call the mempool priv initializer */ priv.mbp_priv.mbuf_data_room_size = VLIB_BUFFER_PRE_DATA_SIZE + @@ -438,12 +434,12 @@ dpdk_pool_create (vlib_main_t * vm, u8 * pool_name, u32 elt_size, map_dma = 0; iova_mode = rte_eal_iova_mode (); - for (i = 0; i < pr->n_pages; i++) + for (i = 0; i < pm->n_pages; i++) { - size_t page_sz = 1ull << pr->log2_page_size; - char *va = ((char *) pr->mem) + i * page_sz; + size_t page_sz = 1ULL << pm->log2_page_size; + char *va = ((char *) pm->base) + i * page_sz; uword pa = iova_mode == RTE_IOVA_VA ? 
- pointer_to_uword (va) : pr->page_table[i]; + pointer_to_uword (va) : pm->page_table[i]; ret = rte_mempool_populate_iova (mp, va, pa, page_sz, 0, 0); if (ret < 0) { @@ -467,10 +463,10 @@ dpdk_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, { dpdk_main_t *dm = &dpdk_main; struct rte_mempool *rmp; - vlib_physmem_region_index_t pri; clib_error_t *error = 0; u8 *pool_name; u32 elt_size, i; + u32 map_index; vec_validate_aligned (dm->pktmbuf_pools, socket_id, CLIB_CACHE_LINE_BYTES); @@ -484,10 +480,9 @@ dpdk_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, VLIB_BUFFER_HDR_SIZE /* priv size */ + VLIB_BUFFER_PRE_DATA_SIZE + VLIB_BUFFER_DATA_SIZE; /*data room size */ - error = - dpdk_pool_create (vm, pool_name, elt_size, num_mbufs, - sizeof (dpdk_mempool_private_t), 512, socket_id, - &rmp, &pri); + error = dpdk_pool_create (vm, pool_name, elt_size, num_mbufs, + sizeof (dpdk_mempool_private_t), 512, socket_id, + &rmp, &map_index); vec_free (pool_name); @@ -497,7 +492,8 @@ dpdk_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, rte_mempool_obj_iter (rmp, rte_pktmbuf_init, 0); dpdk_mempool_private_t *privp = rte_mempool_get_priv (rmp); - privp->buffer_pool_index = vlib_buffer_pool_create (vm, pri, 0); + privp->buffer_pool_index = + vlib_buffer_register_physmem_map (vm, map_index); dm->pktmbuf_pools[socket_id] = rmp; diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index c4f908ea408..46d53f1d895 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -527,8 +527,7 @@ clib_error_t *unformat_hqos (unformat_input_t * input, clib_error_t *dpdk_pool_create (vlib_main_t * vm, u8 * pool_name, u32 elt_size, u32 num_elts, u32 pool_priv_size, u16 cache_size, u8 numa, - struct rte_mempool **_mp, - vlib_physmem_region_index_t * pri); + struct rte_mempool **_mp, u32 * map_index); clib_error_t *dpdk_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, unsigned socket_id); diff --git a/src/plugins/dpdk/ipsec/ipsec.c b/src/plugins/dpdk/ipsec/ipsec.c index ded8912e943..bcc4b62ad76 100644 --- a/src/plugins/dpdk/ipsec/ipsec.c +++ b/src/plugins/dpdk/ipsec/ipsec.c @@ -827,7 +827,7 @@ crypto_create_crypto_op_pool (vlib_main_t * vm, u8 numa) struct rte_crypto_op_pool_private *priv; struct rte_mempool *mp; clib_error_t *error = NULL; - vlib_physmem_region_index_t pri; + u32 map_index; data = vec_elt_at_index (dcm->data, numa); @@ -837,9 +837,8 @@ crypto_create_crypto_op_pool (vlib_main_t * vm, u8 numa) pool_name = format (0, "crypto_pool_numa%u%c", numa, 0); - error = - dpdk_pool_create (vm, pool_name, crypto_op_len (), conf->num_mbufs, - pool_priv_size, 512, numa, &mp, &pri); + error = dpdk_pool_create (vm, pool_name, crypto_op_len (), conf->num_mbufs, + pool_priv_size, 512, numa, &mp, &map_index); vec_free (pool_name); @@ -867,8 +866,8 @@ crypto_create_session_h_pool (vlib_main_t * vm, u8 numa) u8 *pool_name; struct rte_mempool *mp; clib_error_t *error = NULL; - vlib_physmem_region_index_t pri; u32 elt_size; + u32 map_index; data = vec_elt_at_index (dcm->data, numa); @@ -880,9 +879,8 @@ crypto_create_session_h_pool (vlib_main_t * vm, u8 numa) elt_size = rte_cryptodev_sym_get_header_session_size (); - error = - dpdk_pool_create (vm, pool_name, elt_size, DPDK_CRYPTO_NB_SESS_OBJS, - 0, 512, numa, &mp, &pri); + error = dpdk_pool_create (vm, pool_name, elt_size, DPDK_CRYPTO_NB_SESS_OBJS, + 0, 512, numa, &mp, &map_index); vec_free (pool_name); @@ -902,9 +900,9 @@ crypto_create_session_drv_pool (vlib_main_t * vm, crypto_dev_t * dev) u8 
*pool_name; struct rte_mempool *mp; clib_error_t *error = NULL; - vlib_physmem_region_index_t pri; u32 elt_size; u8 numa = dev->numa; + u32 map_index; data = vec_elt_at_index (dcm->data, numa); @@ -920,9 +918,8 @@ crypto_create_session_drv_pool (vlib_main_t * vm, crypto_dev_t * dev) elt_size = rte_cryptodev_sym_get_private_session_size (dev->id); - error = - dpdk_pool_create (vm, pool_name, elt_size, DPDK_CRYPTO_NB_SESS_OBJS, - 0, 512, numa, &mp, &pri); + error = dpdk_pool_create (vm, pool_name, elt_size, DPDK_CRYPTO_NB_SESS_OBJS, + 0, 512, numa, &mp, &map_index); vec_free (pool_name); diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c index 3c4a5b4bbfc..5dc22bed3a5 100644 --- a/src/plugins/ixge/ixge.c +++ b/src/plugins/ixge/ixge.c @@ -2488,13 +2488,11 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) round_pow2 (xm->n_descriptors[rt], xm->n_descriptors_per_cache_line); dq->head_index = dq->tail_index = 0; - dq->descriptors = - vlib_physmem_alloc_aligned (vm, xm->physmem_region, &error, - dq->n_descriptors * - sizeof (dq->descriptors[0]), - 128 /* per chip spec */ ); - if (error) - return error; + dq->descriptors = vlib_physmem_alloc_aligned (vm, dq->n_descriptors * + sizeof (dq->descriptors[0]), + 128 /* per chip spec */ ); + if (!dq->descriptors) + return vlib_physmem_last_error (vm); clib_memset (dq->descriptors, 0, dq->n_descriptors * sizeof (dq->descriptors[0])); @@ -2518,10 +2516,10 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) { u32 i; - dq->tx.head_index_write_back = vlib_physmem_alloc (vm, - xm->physmem_region, - &error, - CLIB_CACHE_LINE_BYTES); + dq->tx.head_index_write_back = + vlib_physmem_alloc (vm, CLIB_CACHE_LINE_BYTES); + if (!dq->tx.head_index_write_back) + return vlib_physmem_last_error (vm); for (i = 0; i < dq->n_descriptors; i++) dq->descriptors[i].tx = xm->tx_descriptor_template; @@ -2533,9 +2531,7 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) ixge_dma_regs_t *dr = get_dma_regs (xd, rt, queue_index); u64 a; - a = - vlib_physmem_virtual_to_physical (vm, xm->physmem_region, - dq->descriptors); + a = vlib_physmem_get_pa (vm, dq->descriptors); dr->descriptor_address[0] = a & 0xFFFFFFFF; dr->descriptor_address[1] = a >> (u64) 32; dr->n_descriptor_bytes = dq->n_descriptors * sizeof (dq->descriptors[0]); @@ -2560,8 +2556,7 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) /* Make sure its initialized before hardware can get to it. 
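The device DMA-writes the completed TX head index into this cache line, so it must hold the current head index before the ring is enabled.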
*/ dq->tx.head_index_write_back[0] = dq->head_index; - a = vlib_physmem_virtual_to_physical (vm, xm->physmem_region, - dq->tx.head_index_write_back); + a = vlib_physmem_get_pa (vm, dq->tx.head_index_write_back); dr->tx.head_index_write_back_address[0] = /* enable bit */ 1 | a; dr->tx.head_index_write_back_address[1] = (u64) a >> (u64) 32; } @@ -2848,14 +2843,6 @@ ixge_pci_init (vlib_main_t * vm, vlib_pci_dev_handle_t h) vlib_pci_addr_t *addr = vlib_pci_get_addr (vm, h); vlib_pci_device_info_t *d = vlib_pci_get_device_info (vm, addr, 0); - /* Allocate physmem region for DMA buffers */ - if (xm->physmem_region_allocated == 0) - { - error = vlib_physmem_region_alloc (vm, "ixge decriptors", 2 << 20, 0, - VLIB_PHYSMEM_F_INIT_MHEAP, - &xm->physmem_region); - xm->physmem_region_allocated = 1; - } if (error) return error; diff --git a/src/plugins/ixge/ixge.h b/src/plugins/ixge/ixge.h index c766397525b..f80d9c0e7cf 100644 --- a/src/plugins/ixge/ixge.h +++ b/src/plugins/ixge/ixge.h @@ -1265,8 +1265,6 @@ typedef struct f64 time_last_stats_update; - vlib_physmem_region_index_t physmem_region; - int physmem_region_allocated; } ixge_main_t; extern ixge_main_t ixge_main; diff --git a/src/plugins/memif/memif.c b/src/plugins/memif/memif.c index 763572c2f0e..f976f16dec8 100644 --- a/src/plugins/memif/memif.c +++ b/src/plugins/memif/memif.c @@ -340,12 +340,12 @@ memif_init_regions_and_queues (memif_if_t * mif) /* *INDENT-OFF* */ vec_foreach (bp, buffer_main.buffer_pools) { - vlib_physmem_region_t *pr; - pr = vlib_physmem_get_region (vm, bp->physmem_region); + vlib_physmem_map_t *pm; + pm = vlib_physmem_get_map (vm, bp->physmem_map_index); vec_add2_aligned (mif->regions, r, 1, CLIB_CACHE_LINE_BYTES); - r->fd = pr->fd; - r->region_size = pr->size; - r->shm = pr->mem; + r->fd = pm->fd; + r->region_size = pm->n_pages << pm->log2_page_size; + r->shm = pm->base; r->is_external = 1; } /* *INDENT-ON* */ diff --git a/src/plugins/vmxnet3/vmxnet3.c b/src/plugins/vmxnet3/vmxnet3.c index e34b3e69a27..f60a8fe2215 100644 --- a/src/plugins/vmxnet3/vmxnet3.c +++ b/src/plugins/vmxnet3/vmxnet3.c @@ -139,19 +139,16 @@ vmxnet3_write_mac (vmxnet3_device_t * vd) static clib_error_t * vmxnet3_provision_driver_shared (vlib_main_t * vm, vmxnet3_device_t * vd) { - vmxnet3_main_t *vmxm = &vmxnet3_main; vmxnet3_shared *shared; vmxnet3_queues *q; u64 shared_dma; - clib_error_t *error; u16 qid = 0, rid; vmxnet3_rxq_t *rxq = vec_elt_at_index (vd->rxqs, qid); vmxnet3_txq_t *txq = vec_elt_at_index (vd->txqs, qid); - vd->dma = vlib_physmem_alloc_aligned (vm, vmxm->physmem_region, &error, - sizeof (*vd->dma), 512); - if (error) - return error; + vd->dma = vlib_physmem_alloc_aligned (vm, sizeof (*vd->dma), 512); + if (vd->dma == 0) + return vlib_physmem_last_error (vm); clib_memset (vd->dma, 0, sizeof (*vd->dma)); @@ -222,9 +219,7 @@ vmxnet3_disable_interrupt (vmxnet3_device_t * vd) static clib_error_t * vmxnet3_rxq_init (vlib_main_t * vm, vmxnet3_device_t * vd, u16 qid, u16 qsz) { - vmxnet3_main_t *vmxm = &vmxnet3_main; vmxnet3_rxq_t *rxq; - clib_error_t *error; u16 rid; vec_validate_aligned (vd->rxqs, qid, CLIB_CACHE_LINE_BYTES); @@ -233,19 +228,19 @@ vmxnet3_rxq_init (vlib_main_t * vm, vmxnet3_device_t * vd, u16 qid, u16 qsz) rxq->size = qsz; for (rid = 0; rid < VMXNET3_RX_RING_SIZE; rid++) { - rxq->rx_desc[rid] = - vlib_physmem_alloc_aligned (vm, vmxm->physmem_region, - &error, qsz * sizeof (*rxq->rx_desc[rid]), - 512); - if (error) - return error; + rxq->rx_desc[rid] = vlib_physmem_alloc_aligned + (vm, qsz * sizeof (*rxq->rx_desc[rid]), 
512); + + if (rxq->rx_desc[rid] == 0) + return vlib_physmem_last_error (vm); + clib_memset (rxq->rx_desc[rid], 0, qsz * sizeof (*rxq->rx_desc[rid])); } - rxq->rx_comp = vlib_physmem_alloc_aligned (vm, vmxm->physmem_region, &error, - qsz * sizeof (*rxq->rx_comp), + rxq->rx_comp = vlib_physmem_alloc_aligned (vm, qsz * sizeof (*rxq->rx_comp), 512); - if (error) - return error; + if (rxq->rx_comp == 0) + return vlib_physmem_last_error (vm); + clib_memset (rxq->rx_comp, 0, qsz * sizeof (*rxq->rx_comp)); for (rid = 0; rid < VMXNET3_RX_RING_SIZE; rid++) { @@ -264,9 +259,7 @@ vmxnet3_rxq_init (vlib_main_t * vm, vmxnet3_device_t * vd, u16 qid, u16 qsz) static clib_error_t * vmxnet3_txq_init (vlib_main_t * vm, vmxnet3_device_t * vd, u16 qid, u16 qsz) { - vmxnet3_main_t *vmxm = &vmxnet3_main; vmxnet3_txq_t *txq; - clib_error_t *error; if (qid >= vd->num_tx_queues) { @@ -282,17 +275,17 @@ vmxnet3_txq_init (vlib_main_t * vm, vmxnet3_device_t * vd, u16 qid, u16 qsz) txq = vec_elt_at_index (vd->txqs, qid); clib_memset (txq, 0, sizeof (*txq)); txq->size = qsz; - txq->tx_desc = vlib_physmem_alloc_aligned (vm, vmxm->physmem_region, &error, - qsz * sizeof (*txq->tx_desc), + txq->tx_desc = vlib_physmem_alloc_aligned (vm, qsz * sizeof (*txq->tx_desc), 512); - if (error) - return error; - clib_memset (txq->tx_desc, 0, qsz * sizeof (*txq->tx_desc)); - txq->tx_comp = vlib_physmem_alloc_aligned (vm, vmxm->physmem_region, &error, - qsz * sizeof (*txq->tx_comp), + if (txq->tx_desc == 0) + return vlib_physmem_last_error (vm); + + clib_memset (txq->tx_desc, 0, qsz * sizeof (*txq->tx_desc)); + txq->tx_comp = vlib_physmem_alloc_aligned (vm, qsz * sizeof (*txq->tx_comp), 512); - if (error) - return error; + if (txq->tx_comp == 0) + return vlib_physmem_last_error (vm); + clib_memset (txq->tx_comp, 0, qsz * sizeof (*txq->tx_comp)); vec_validate_aligned (txq->tx_ring.bufs, txq->size, CLIB_CACHE_LINE_BYTES); txq->tx_ring.gen = VMXNET3_TXF_GEN; @@ -307,7 +300,6 @@ vmxnet3_device_init (vlib_main_t * vm, vmxnet3_device_t * vd, { clib_error_t *error = 0; u32 ret, i; - vmxnet3_main_t *vmxm = &vmxnet3_main; vlib_thread_main_t *tm = vlib_get_thread_main (); vd->num_tx_queues = 1; @@ -372,17 +364,6 @@ vmxnet3_device_init (vlib_main_t * vm, vmxnet3_device_t * vd, ret = vmxnet3_reg_read (vd, 1, VMXNET3_REG_MACH); clib_memcpy (vd->mac_addr + 4, &ret, 2); - if (vmxm->physmem_region_alloc == 0) - { - u32 flags = VLIB_PHYSMEM_F_INIT_MHEAP | VLIB_PHYSMEM_F_HUGETLB; - error = - vlib_physmem_region_alloc (vm, "vmxnet3 descriptors", 4 << 20, 0, - flags, &vmxm->physmem_region); - if (error) - return error; - vmxm->physmem_region_alloc = 1; - } - error = vmxnet3_rxq_init (vm, vd, 0, args->rxq_size); if (error) return error; @@ -629,9 +610,9 @@ vmxnet3_delete_if (vlib_main_t * vm, vmxnet3_device_t * vd) vlib_buffer_free_from_ring (vm, ring->bufs, desc_idx, rxq->size, ring->fill); vec_free (ring->bufs); - vlib_physmem_free (vm, vmxm->physmem_region, rxq->rx_desc[rid]); + vlib_physmem_free (vm, rxq->rx_desc[rid]); } - vlib_physmem_free (vm, vmxm->physmem_region, rxq->rx_comp); + vlib_physmem_free (vm, rxq->rx_comp); } /* *INDENT-ON* */ vec_free (vd->rxqs); @@ -654,13 +635,13 @@ vmxnet3_delete_if (vlib_main_t * vm, vmxnet3_device_t * vd) } clib_spinlock_free (&txq->lock); vec_free (txq->tx_ring.bufs); - vlib_physmem_free (vm, vmxm->physmem_region, txq->tx_desc); - vlib_physmem_free (vm, vmxm->physmem_region, txq->tx_comp); + vlib_physmem_free (vm, txq->tx_desc); + vlib_physmem_free (vm, txq->tx_comp); } /* *INDENT-ON* */ vec_free (vd->txqs); -
vlib_physmem_free (vm, vmxm->physmem_region, vd->dma); + vlib_physmem_free (vm, vd->dma); clib_error_free (vd->error); clib_memset (vd, 0, sizeof (*vd)); diff --git a/src/plugins/vmxnet3/vmxnet3.h b/src/plugins/vmxnet3/vmxnet3.h index 391ddc17113..befbe368d67 100644 --- a/src/plugins/vmxnet3/vmxnet3.h +++ b/src/plugins/vmxnet3/vmxnet3.h @@ -493,8 +493,6 @@ typedef struct typedef struct { vmxnet3_device_t *devices; - vlib_physmem_region_index_t physmem_region; - u32 physmem_region_alloc; u16 msg_id_base; } vmxnet3_main_t; @@ -546,10 +544,8 @@ vmxnet3_reg_read (vmxnet3_device_t * vd, u8 bar, u32 addr) static_always_inline uword vmxnet3_dma_addr (vlib_main_t * vm, vmxnet3_device_t * vd, void *p) { - vmxnet3_main_t *vmxm = &vmxnet3_main; - return (vd->flags & VMXNET3_DEVICE_F_IOVA) ? pointer_to_uword (p) : - vlib_physmem_virtual_to_physical (vm, vmxm->physmem_region, p); + vlib_physmem_get_pa (vm, p); } static_always_inline void diff --git a/src/vlib/CMakeLists.txt b/src/vlib/CMakeLists.txt index 72c73f3c2d8..2a6cbd54da2 100644 --- a/src/vlib/CMakeLists.txt +++ b/src/vlib/CMakeLists.txt @@ -39,7 +39,6 @@ add_vpp_library(vlib i2c.c init.c linux/pci.c - linux/physmem.c linux/vfio.c log.c main.c @@ -47,6 +46,7 @@ add_vpp_library(vlib node_cli.c node_format.c pci/pci.c + physmem.c threads.c threads_cli.c trace.c diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index ee76290e25a..711b83514f4 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -470,26 +470,10 @@ vlib_buffer_delete_free_list_internal (vlib_main_t * vm, } static_always_inline void * -vlib_buffer_pool_get_buffer (vlib_buffer_pool_t * bp) +vlib_buffer_pool_get_buffer (vlib_main_t * vm, vlib_buffer_pool_t * bp) { - uword slot, page, addr; - - if (PREDICT_FALSE (bp->n_elts == bp->n_used)) - { - clib_spinlock_unlock (&bp->lock); - return 0; - } - slot = bp->next_clear; - bp->bitmap = clib_bitmap_set (bp->bitmap, slot, 1); - bp->next_clear = clib_bitmap_next_clear (bp->bitmap, slot + 1); - bp->n_used++; - - page = slot / bp->buffers_per_page; - slot -= page * bp->buffers_per_page; - - addr = bp->start + (page << bp->log2_page_size) + slot * bp->buffer_size; - - return uword_to_pointer (addr, void *); + return vlib_physmem_alloc_from_map (vm, bp->physmem_map_index, + bp->buffer_size, CLIB_CACHE_LINE_BYTES); } /* Make sure free list has at least given number of free buffers. 
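With the pmalloc backing introduced here, vlib_buffer_pool_get_buffer can carve additional buffers out of the pool's physmem map on demand, so the fill is no longer limited to a fixed preallocated region.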
*/ @@ -533,7 +517,7 @@ vlib_buffer_fill_free_list_internal (vlib_main_t * vm, clib_spinlock_lock (&bp->lock); while (n_alloc < n) { - if ((b = vlib_buffer_pool_get_buffer (bp)) == 0) + if ((b = vlib_buffer_pool_get_buffer (vm, bp)) == 0) goto done; n_alloc += 1; @@ -866,14 +850,13 @@ vlib_buffer_chain_append_data_with_alloc (vlib_main_t * vm, } u8 -vlib_buffer_pool_create (vlib_main_t * vm, vlib_physmem_region_index_t pri, - u16 buffer_size) +vlib_buffer_register_physmem_map (vlib_main_t * vm, u32 physmem_map_index) { vlib_buffer_main_t *bm = &buffer_main; - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, pri); vlib_buffer_pool_t *p; - uword start = pointer_to_uword (pr->mem); - uword size = pr->size; + vlib_physmem_map_t *m = vlib_physmem_get_map (vm, physmem_map_index); + uword start = pointer_to_uword (m->base); + uword size = m->n_pages << m->log2_page_size; if (bm->buffer_mem_size == 0) { @@ -903,18 +886,8 @@ vlib_buffer_pool_create (vlib_main_t * vm, vlib_physmem_region_index_t pri, vec_add2 (bm->buffer_pools, p, 1); p->start = start; p->size = size; - p->physmem_region = pri; - - if (buffer_size == 0) - goto done; + p->physmem_map_index = physmem_map_index; - p->log2_page_size = pr->log2_page_size; - p->buffer_size = buffer_size; - p->buffers_per_page = (1ull << pr->log2_page_size) / p->buffer_size; - p->n_elts = p->buffers_per_page * pr->n_pages; - p->n_used = 0; - clib_spinlock_init (&p->lock); -done: ASSERT (p - bm->buffer_pools < 256); return p - bm->buffer_pools; } @@ -983,8 +956,9 @@ clib_error_t * vlib_buffer_main_init (struct vlib_main_t * vm) { vlib_buffer_main_t *bm = &buffer_main; - vlib_physmem_region_index_t pri; clib_error_t *error; + u32 physmem_map_index; + u8 pool_index; if (vlib_buffer_callbacks) { @@ -1003,25 +977,18 @@ vlib_buffer_main_init (struct vlib_main_t * vm) &vlib_buffer_delete_free_list_internal; clib_spinlock_init (&bm->buffer_known_hash_lockp); - /* allocate default region */ - error = vlib_physmem_region_alloc (vm, "buffers", - vlib_buffer_physmem_sz, 0, - VLIB_PHYSMEM_F_SHARED | - VLIB_PHYSMEM_F_HUGETLB, &pri); + if ((error = vlib_physmem_shared_map_create (vm, "buffers", + vlib_buffer_physmem_sz, 1, + &physmem_map_index))) + return error; - if (error == 0) - goto done; + pool_index = vlib_buffer_register_physmem_map (vm, physmem_map_index); + vlib_buffer_pool_t *bp = vlib_buffer_pool_get (pool_index); + clib_spinlock_init (&bp->lock); + bp->buffer_size = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES + + sizeof (vlib_buffer_t); - clib_error_free (error); - - error = vlib_physmem_region_alloc (vm, "buffers", - vlib_buffer_physmem_sz, 0, - VLIB_PHYSMEM_F_SHARED, &pri); -done: - if (error == 0) - vlib_buffer_pool_create (vm, pri, sizeof (vlib_buffer_t) + - VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES); - return error; + return 0; } static clib_error_t * diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 0d24779e52b..f9750ecf3e1 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -412,16 +412,9 @@ typedef struct uword start; uword size; uword log2_page_size; - vlib_physmem_region_index_t physmem_region; - + u32 physmem_map_index; + u32 buffer_size; u32 *buffers; - - u16 buffer_size; - uword buffers_per_page; - uword n_elts; - uword n_used; - uword next_clear; - uword *bitmap; clib_spinlock_t lock; } vlib_buffer_pool_t; @@ -466,9 +459,8 @@ vlib_buffer_pool_get (u8 buffer_pool_index) return vec_elt_at_index (bm->buffer_pools, buffer_pool_index); } -u8 vlib_buffer_pool_create (struct vlib_main_t * vm, - vlib_physmem_region_index_t region, - u16 
buffer_size); +u8 vlib_buffer_register_physmem_map (struct vlib_main_t * vm, + u32 physmem_map_index); clib_error_t *vlib_buffer_main_init (struct vlib_main_t *vm); diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 1110c206e52..6106b7984a2 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -317,10 +317,7 @@ vlib_buffer_contents (vlib_main_t * vm, u32 buffer_index, u8 * contents) always_inline uword vlib_buffer_get_pa (vlib_main_t * vm, vlib_buffer_t * b) { - vlib_buffer_main_t *bm = &buffer_main; - vlib_buffer_pool_t *pool = vec_elt_at_index (bm->buffer_pools, - b->buffer_pool_index); - return vlib_physmem_virtual_to_physical (vm, pool->physmem_region, b->data); + return vlib_physmem_get_pa (vm, b->data); } always_inline uword diff --git a/src/vlib/linux/pci.c b/src/vlib/linux/pci.c index b55fb5042f7..0e2241b0e58 100644 --- a/src/vlib/linux/pci.c +++ b/src/vlib/linux/pci.c @@ -951,6 +951,21 @@ add_device_vfio (vlib_main_t * vm, linux_pci_device_t * p, linux_pci_vfio_unmask_intx (vm, p); } + if (p->supports_va_dma) + { + vlib_buffer_pool_t *bp; + /* *INDENT-OFF* */ + vec_foreach (bp, buffer_main.buffer_pools) + { + u32 i; + vlib_physmem_map_t *pm; + pm = vlib_physmem_get_map (vm, bp->physmem_map_index); + for (i = 0; i < pm->n_pages; i++) + vfio_map_physmem_page (vm, pm->base + (i << pm->log2_page_size)); + } + /* *INDENT-ON* */ + } + if (r && r->init_function) err = r->init_function (lpm->vlib_main, p->handle); @@ -1091,6 +1106,25 @@ vlib_pci_map_region_fixed (vlib_main_t * vm, vlib_pci_dev_handle_t h, return (vlib_pci_map_region_int (vm, h, resource, addr, result)); } +clib_error_t * +vlib_pci_map_dma (vlib_main_t * vm, vlib_pci_dev_handle_t h, void *ptr) +{ + linux_pci_device_t *p = linux_pci_get_device (h); + + if (!p->supports_va_dma) + return 0; + + return vfio_map_physmem_page (vm, ptr); +} + +int +vlib_pci_supports_virtual_addr_dma (vlib_main_t * vm, vlib_pci_dev_handle_t h) +{ + linux_pci_device_t *p = linux_pci_get_device (h); + + return p->supports_va_dma != 0; +} + clib_error_t * vlib_pci_device_open (vlib_main_t * vm, vlib_pci_addr_t * addr, pci_device_id_t ids[], vlib_pci_dev_handle_t * handle) diff --git a/src/vlib/linux/physmem.c b/src/vlib/linux/physmem.c deleted file mode 100755 index 90b0f8cab3d..00000000000 --- a/src/vlib/linux/physmem.c +++ /dev/null @@ -1,307 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -/* - * physmem.c: Unix physical memory - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -static void * -unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, - uword n_bytes, uword alignment) -{ - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); - uword lo_offset, hi_offset; - uword *to_free = 0; - - if (pr->heap == 0) - return 0; - - /* IO memory is always at least cache aligned. */ - alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); - - while (1) - { -#if USE_DLMALLOC == 0 - - mheap_get_aligned (pr->heap, n_bytes, - /* align */ alignment, - /* align offset */ 0, - &lo_offset); -#else - lo_offset = (uword) mspace_get_aligned (pr->heap, n_bytes, - alignment, ~0ULL /* offset */ ); - if (lo_offset == 0) - lo_offset = ~0ULL; -#endif - - /* Allocation failed? */ - if (lo_offset == ~0) - break; - - /* Make sure allocation does not span DMA physical chunk boundary. */ - hi_offset = lo_offset + n_bytes - 1; - - if (((pointer_to_uword (pr->heap) + lo_offset) >> pr->log2_page_size) == - ((pointer_to_uword (pr->heap) + hi_offset) >> pr->log2_page_size)) - break; - - /* Allocation would span chunk boundary, queue it to be freed as soon as - we find suitable chunk. */ - vec_add1 (to_free, lo_offset); - } - - if (to_free != 0) - { - uword i; - for (i = 0; i < vec_len (to_free); i++) - { -#if USE_DLMALLOC == 0 - mheap_put (pr->heap, to_free[i]); -#else - mspace_put_no_offset (pr->heap, (void *) to_free[i]); -#endif - } - vec_free (to_free); - } - -#if USE_DLMALLOC == 0 - return lo_offset != ~0 ? (void *) (pr->heap + lo_offset) : 0; -#else - return lo_offset != ~0 ? (void *) lo_offset : 0; -#endif -} - -static void -unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x) -{ - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); - /* Return object to region's heap. 
*/ -#if USE_DLMALLOC == 0 - mheap_put (pr->heap, x - pr->heap); -#else - mspace_put_no_offset (pr->heap, x); -#endif -} - -static clib_error_t * -unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, - u8 numa_node, u32 flags, - vlib_physmem_region_index_t * idx) -{ - vlib_physmem_main_t *vpm = &physmem_main; - vlib_physmem_region_t *pr; - clib_error_t *error = 0; - clib_mem_vm_alloc_t alloc = { 0 }; - int i; - - pool_get (vpm->regions, pr); - - if ((pr - vpm->regions) >= 256) - { - error = clib_error_return (0, "maximum number of regions reached"); - goto error; - } - - alloc.name = name; - alloc.size = size; - alloc.numa_node = numa_node; - - alloc.flags = (flags & VLIB_PHYSMEM_F_SHARED) ? - CLIB_MEM_VM_F_SHARED : CLIB_MEM_VM_F_LOCKED; - - if ((flags & VLIB_PHYSMEM_F_HUGETLB)) - { - alloc.flags |= CLIB_MEM_VM_F_HUGETLB; - alloc.flags |= CLIB_MEM_VM_F_HUGETLB_PREALLOC; - alloc.flags |= CLIB_MEM_VM_F_NUMA_FORCE; - } - else - { - alloc.flags |= CLIB_MEM_VM_F_NUMA_PREFER; - } - - error = clib_mem_vm_ext_alloc (&alloc); - if (error) - goto error; - - pr->index = pr - vpm->regions; - pr->flags = flags; - pr->fd = alloc.fd; - pr->mem = alloc.addr; - pr->log2_page_size = alloc.log2_page_size; - pr->n_pages = alloc.n_pages; - pr->size = (u64) pr->n_pages << (u64) pr->log2_page_size; - pr->page_mask = (1ull << pr->log2_page_size) - 1; - pr->numa_node = numa_node; - pr->name = format (0, "%s%c", name, 0); - - for (i = 0; i < pr->n_pages; i++) - { - void *ptr = pr->mem + ((u64) i << pr->log2_page_size); - int node; - if ((move_pages (0, 1, &ptr, 0, &node, 0) == 0) && (numa_node != node)) - { - clib_warning ("physmem page for region \'%s\' allocated on the" - " wrong numa node (requested %u actual %u)", - pr->name, pr->numa_node, node, i); - break; - } - } - - pr->page_table = clib_mem_vm_get_paddr (pr->mem, pr->log2_page_size, - pr->n_pages); - - linux_vfio_dma_map_regions (vm); - - if (flags & VLIB_PHYSMEM_F_INIT_MHEAP) - { -#if USE_DLMALLOC == 0 - pr->heap = mheap_alloc_with_flags (pr->mem, pr->size, - /* Don't want mheap mmap/munmap with IO memory. */ - MHEAP_FLAG_DISABLE_VM | - MHEAP_FLAG_THREAD_SAFE); -#else - pr->heap = create_mspace_with_base (pr->mem, pr->size, 1 /* locked */ ); - mspace_disable_expand (pr->heap); -#endif - } - - *idx = pr->index; - - goto done; - -error: - clib_memset (pr, 0, sizeof (*pr)); - pool_put (vpm->regions, pr); - -done: - return error; -} - -static void -unix_physmem_region_free (vlib_main_t * vm, vlib_physmem_region_index_t idx) -{ - vlib_physmem_main_t *vpm = &physmem_main; - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); - - if (pr->fd > 0) - close (pr->fd); - munmap (pr->mem, pr->size); - vec_free (pr->name); - pool_put (vpm->regions, pr); -} - -clib_error_t * -unix_physmem_init (vlib_main_t * vm) -{ - vlib_physmem_main_t *vpm = &physmem_main; - clib_error_t *error = 0; - u64 *pt = 0; - - /* Avoid multiple calls. 
*/ - if (vm->os_physmem_alloc_aligned) - return error; - - /* check if pagemap is accessible */ - pt = clib_mem_vm_get_paddr (&pt, min_log2 (sysconf (_SC_PAGESIZE)), 1); - if (pt[0]) - vpm->flags |= VLIB_PHYSMEM_MAIN_F_HAVE_PAGEMAP; - vec_free (pt); - - if ((error = linux_vfio_init (vm))) - return error; - - vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; - vm->os_physmem_free = unix_physmem_free; - vm->os_physmem_region_alloc = unix_physmem_region_alloc; - vm->os_physmem_region_free = unix_physmem_region_free; - - return error; -} - -static clib_error_t * -show_physmem (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - vlib_physmem_main_t *vpm = &physmem_main; - vlib_physmem_region_t *pr; - - /* *INDENT-OFF* */ - pool_foreach (pr, vpm->regions, ( - { - vlib_cli_output (vm, "index %u name '%s' page-size %uKB num-pages %d " - "numa-node %u fd %d\n", - pr->index, pr->name, (1 << (pr->log2_page_size -10)), - pr->n_pages, pr->numa_node, pr->fd); - if (pr->heap) - vlib_cli_output (vm, " %U", format_mheap, pr->heap, /* verbose */ 1); - else - vlib_cli_output (vm, " no heap\n"); - })); - /* *INDENT-ON* */ - return 0; -} - -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (show_physmem_command, static) = { - .path = "show physmem", - .short_help = "Show physical memory allocation", - .function = show_physmem, -}; -/* *INDENT-ON* */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/linux/vfio.c b/src/vlib/linux/vfio.c index e72f10388ca..d300a683dd7 100644 --- a/src/vlib/linux/vfio.c +++ b/src/vlib/linux/vfio.c @@ -34,52 +34,46 @@ linux_vfio_main_t vfio_main; -static int -vfio_map_regions (vlib_main_t * vm, int fd) +clib_error_t * +vfio_map_physmem_page (vlib_main_t * vm, void *addr) { - vlib_physmem_main_t *vpm = &physmem_main; + vlib_physmem_main_t *vpm = &vm->physmem_main; linux_vfio_main_t *lvm = &vfio_main; - vlib_physmem_region_t *pr; struct vfio_iommu_type1_dma_map dm = { 0 }; - int i; + uword log2_page_size = vpm->pmalloc_main->log2_page_sz; + uword physmem_start = pointer_to_uword (vpm->pmalloc_main->base); + + if (lvm->container_fd == -1) + return clib_error_return (0, "No container fd"); + + u32 page_index = vlib_physmem_get_page_index (vm, addr); + + if (clib_bitmap_get (lvm->physmem_pages_mapped, page_index)) + { + vlib_log_debug (lvm->log_default, "map DMA va:%p page:%u already " + "mapped", addr, page_index); + return 0; + } dm.argsz = sizeof (struct vfio_iommu_type1_dma_map); dm.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + dm.vaddr = physmem_start + (page_index << log2_page_size); + dm.size = 1ULL << log2_page_size; + dm.iova = dm.vaddr; + vlib_log_debug (lvm->log_default, "map DMA page:%u va:0x%lx iova:%lx " + "size:0x%lx", page_index, dm.vaddr, dm.iova, dm.size); - /* *INDENT-OFF* */ - pool_foreach (pr, vpm->regions, + if (ioctl (lvm->container_fd, VFIO_IOMMU_MAP_DMA, &dm) == -1) { - vec_foreach_index (i, pr->page_table) - { - int rv; - dm.vaddr = pointer_to_uword (pr->mem) + ((u64)i << pr->log2_page_size); - dm.size = 1ull << pr->log2_page_size; - dm.iova = dm.vaddr; - vlib_log_debug (lvm->log_default, "map DMA va:0x%lx iova:%lx " - "size:0x%lx", dm.vaddr, dm.iova, dm.size); - - if ((rv = ioctl (fd, VFIO_IOMMU_MAP_DMA, &dm)) && - errno != EINVAL) - { - vlib_log_err (lvm->log_default, "map DMA va:0x%lx iova:%lx " - "size:0x%lx failed, error %s (errno %d)", - dm.vaddr, dm.iova, dm.size, strerror (errno), - errno); - return rv; - } - } - }); - /* 
*INDENT-ON* */ - return 0; -} - -void -linux_vfio_dma_map_regions (vlib_main_t * vm) -{ - linux_vfio_main_t *lvm = &vfio_main; + vlib_log_err (lvm->log_default, "map DMA page:%u va:0x%lx iova:%lx " + "size:0x%lx failed, error %s (errno %d)", page_index, + dm.vaddr, dm.iova, dm.size, strerror (errno), errno); + return clib_error_return_unix (0, "physmem DMA map failed"); + } - if (lvm->container_fd != -1) - vfio_map_regions (vm, lvm->container_fd); + lvm->physmem_pages_mapped = clib_bitmap_set (lvm->physmem_pages_mapped, + page_index, 1); + return 0; } static linux_pci_vfio_iommu_group_t * diff --git a/src/vlib/linux/vfio.h b/src/vlib/linux/vfio.h index aae8e3c6ee7..c1d815664c9 100644 --- a/src/vlib/linux/vfio.h +++ b/src/vlib/linux/vfio.h @@ -36,6 +36,8 @@ typedef struct /* iommu group pool index by group id hash */ uword *iommu_pool_index_by_group; + clib_bitmap_t *physmem_pages_mapped; + /* logging */ vlib_log_class_t log_default; } linux_vfio_main_t; @@ -43,7 +45,7 @@ typedef struct extern linux_vfio_main_t vfio_main; clib_error_t *linux_vfio_init (vlib_main_t * vm); -void linux_vfio_dma_map_regions (vlib_main_t * vm); +clib_error_t *vfio_map_physmem_page (vlib_main_t * vm, void *addr); clib_error_t *linux_vfio_group_get_device_fd (vlib_pci_addr_t * addr, int *fd, int *is_noiommu); diff --git a/src/vlib/main.c b/src/vlib/main.c index 14d89141ff8..a6ad4032dae 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -1756,7 +1756,7 @@ vlib_main (vlib_main_t * volatile vm, unformat_input_t * input) if (!vm->name) vm->name = "VLIB"; - if ((error = unix_physmem_init (vm))) + if ((error = vlib_physmem_init (vm))) { clib_error_report (error); goto done; diff --git a/src/vlib/main.h b/src/vlib/main.h index ddc14df5360..7c34fb6528d 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -118,23 +118,8 @@ typedef struct vlib_main_t /* Pool of buffer free lists. */ vlib_buffer_free_list_t *buffer_free_list_pool; - /* Allocate/free buffer memory for DMA transfers, descriptor rings, etc. - buffer memory is guaranteed to be cache-aligned. */ - - clib_error_t *(*os_physmem_region_alloc) (struct vlib_main_t * vm, - char *name, u32 size, - u8 numa_node, u32 flags, - vlib_physmem_region_index_t * - idx); - - void (*os_physmem_region_free) (struct vlib_main_t * vm, - vlib_physmem_region_index_t idx); - - void *(*os_physmem_alloc_aligned) (struct vlib_main_t * vm, - vlib_physmem_region_index_t idx, - uword n_bytes, uword alignment); - void (*os_physmem_free) (struct vlib_main_t * vm, - vlib_physmem_region_index_t idx, void *x); + /* physical memory main structure. */ + vlib_physmem_main_t physmem_main; /* Node graph main structure. */ vlib_node_main_t node_main; diff --git a/src/vlib/pci/pci.h b/src/vlib/pci/pci.h index 3d5cd405c89..e0eacf42948 100644 --- a/src/vlib/pci/pci.h +++ b/src/vlib/pci/pci.h @@ -293,6 +293,11 @@ clib_error_t *vlib_pci_enable_msix_irq (vlib_main_t * vm, clib_error_t *vlib_pci_disable_msix_irq (vlib_main_t * vm, vlib_pci_dev_handle_t h, u16 start, u16 count); +clib_error_t *vlib_pci_map_dma (vlib_main_t * vm, vlib_pci_dev_handle_t h, + void *ptr); + +int vlib_pci_supports_virtual_addr_dma (vlib_main_t * vm, + vlib_pci_dev_handle_t h); unformat_function_t unformat_vlib_pci_addr; format_function_t format_vlib_pci_addr; diff --git a/src/vlib/physmem.c b/src/vlib/physmem.c new file mode 100755 index 00000000000..e2d88922f56 --- /dev/null +++ b/src/vlib/physmem.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2018 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +clib_error_t * +vlib_physmem_shared_map_create (vlib_main_t * vm, char *name, uword size, + u32 numa_node, u32 * map_index) +{ + clib_pmalloc_main_t *pm = vm->physmem_main.pmalloc_main; + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_map_t *map; + clib_pmalloc_arena_t *a; + clib_error_t *error = 0; + void *va; + int i; + + va = clib_pmalloc_create_shared_arena (pm, name, size, numa_node); + + if (va == 0) + return clib_error_return (0, "%U", format_clib_error, + clib_pmalloc_last_error (pm)); + + a = clib_pmalloc_get_arena (pm, va); + + pool_get (vpm->maps, map); + *map_index = map->index = map - vpm->maps; + map->base = va; + map->fd = a->fd; + map->n_pages = a->n_pages; + map->log2_page_size = a->log2_page_sz; + + for (i = 0; i < a->n_pages; i++) + { + uword pa = clib_pmalloc_get_pa (pm, (u8 *) va + (i << a->log2_page_sz)); + + /* maybe iova */ + if (pa == 0) + pa = pointer_to_uword (va); + + vec_add1 (map->page_table, pa); + } + + return error; +} + +vlib_physmem_map_t * +vlib_physmem_get_map (vlib_main_t * vm, u32 index) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + return pool_elt_at_index (vpm->maps, index); +} + +clib_error_t * +vlib_physmem_init (vlib_main_t * vm) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + clib_error_t *error = 0; + u64 *pt = 0; + void *p; + + /* check if pagemap is accessible */ + pt = clib_mem_vm_get_paddr (&pt, min_log2 (sysconf (_SC_PAGESIZE)), 1); + if (pt[0]) + vpm->flags |= VLIB_PHYSMEM_MAIN_F_HAVE_PAGEMAP; + vec_free (pt); + + if ((error = linux_vfio_init (vm))) + return error; + + p = clib_mem_alloc_aligned (sizeof (clib_pmalloc_main_t), + CLIB_CACHE_LINE_BYTES); + clib_memset (p, 0, sizeof (clib_pmalloc_main_t)); + vpm->pmalloc_main = (clib_pmalloc_main_t *) p; + clib_pmalloc_init (vpm->pmalloc_main, 0); + + return error; +} + +static clib_error_t * +show_physmem (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + unformat_input_t _line_input, *line_input = &_line_input; + u32 verbose = 0; + + if (unformat_user (input, unformat_line_input, line_input)) + { + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "verbose")) + verbose = 1; + else if (unformat (line_input, "v")) + verbose = 1; + else if (unformat (line_input, "detail")) + verbose = 2; + else if (unformat (line_input, "d")) + verbose = 2; + else + break; + } + unformat_free (line_input); + } + + vlib_cli_output (vm, " %U", format_pmalloc, vpm->pmalloc_main, verbose); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_physmem_command, static) = { + .path = "show physmem", + .short_help = "Show physical memory allocation", + .function = show_physmem, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local
Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/physmem.h b/src/vlib/physmem.h index 2f54938ff10..3e73a1b03f0 100644 --- a/src/vlib/physmem.h +++ b/src/vlib/physmem.h @@ -40,41 +40,27 @@ #ifndef included_vlib_physmem_h #define included_vlib_physmem_h -typedef u8 vlib_physmem_region_index_t; +#include typedef struct { - vlib_physmem_region_index_t index; - void *mem; - uword size; + int index; int fd; - u8 log2_page_size; - u16 n_pages; - u32 page_mask; - - void *heap; - u32 flags; -#define VLIB_PHYSMEM_F_INIT_MHEAP (1 << 0) -#define VLIB_PHYSMEM_F_HUGETLB (1 << 1) -#define VLIB_PHYSMEM_F_SHARED (1 << 2) - - u8 numa_node; - u64 *page_table; - u8 *name; -} vlib_physmem_region_t; - - + void *base; + u32 n_pages; + uword *page_table; + u32 log2_page_size; +} vlib_physmem_map_t; typedef struct { u32 flags; #define VLIB_PHYSMEM_MAIN_F_HAVE_PAGEMAP (1 << 0) #define VLIB_PHYSMEM_MAIN_F_HAVE_IOMMU (1 << 1) - vlib_physmem_region_t *regions; + vlib_physmem_map_t *maps; + clib_pmalloc_main_t *pmalloc_main; } vlib_physmem_main_t; -extern vlib_physmem_main_t physmem_main; - #endif /* included_vlib_physmem_h */ /* diff --git a/src/vlib/physmem_funcs.h b/src/vlib/physmem_funcs.h index bff66aa5726..0082f85c70d 100644 --- a/src/vlib/physmem_funcs.h +++ b/src/vlib/physmem_funcs.h @@ -40,115 +40,62 @@ #ifndef included_vlib_physmem_funcs_h #define included_vlib_physmem_funcs_h -always_inline vlib_physmem_region_t * -vlib_physmem_get_region (vlib_main_t * vm, u8 index) -{ - vlib_physmem_main_t *vpm = &physmem_main; - return pool_elt_at_index (vpm->regions, index); -} - -always_inline u64 -vlib_physmem_offset_to_physical (vlib_main_t * vm, - vlib_physmem_region_index_t idx, uword o) -{ - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); - uword page_index = o >> pr->log2_page_size; - ASSERT (o < pr->size); - ASSERT (pr->page_table[page_index] != 0); - return (vec_elt (pr->page_table, page_index) + (o & pr->page_mask)); -} - -always_inline int -vlib_physmem_is_virtual (vlib_main_t * vm, vlib_physmem_region_index_t idx, - uword p) -{ - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); - return p >= pointer_to_uword (pr->mem) - && p < (pointer_to_uword (pr->mem) + pr->size); -} - -always_inline uword -vlib_physmem_offset_of (vlib_main_t * vm, vlib_physmem_region_index_t idx, - void *p) -{ - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); - uword a = pointer_to_uword (p); - uword o; +clib_error_t *vlib_physmem_init (vlib_main_t * vm); +clib_error_t *vlib_physmem_shared_map_create (vlib_main_t * vm, char *name, + uword size, u32 numa_node, + u32 * map_index); - ASSERT (vlib_physmem_is_virtual (vm, idx, a)); - o = a - pointer_to_uword (pr->mem); - - /* Offset must fit in 32 bits. */ - ASSERT ((uword) o == a - pointer_to_uword (pr->mem)); - - return o; -} +vlib_physmem_map_t *vlib_physmem_get_map (vlib_main_t * vm, u32 index); always_inline void * -vlib_physmem_at_offset (vlib_main_t * vm, vlib_physmem_region_index_t idx, - uword offset) +vlib_physmem_alloc_aligned (vlib_main_t * vm, uword n_bytes, uword alignment) { - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); - ASSERT (offset < pr->size); - return uword_to_pointer (pointer_to_uword (pr->mem) + offset, void *); + clib_pmalloc_main_t *pm = vm->physmem_main.pmalloc_main; + return clib_pmalloc_alloc_aligned (pm, n_bytes, alignment); } +/* By default allocate I/O memory with cache line alignment. 
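+   Drivers that need stricter placement pass an explicit alignment to
+   vlib_physmem_alloc_aligned instead, as the ixge and vmxnet3 changes
+   above do for their descriptor rings. A minimal sketch of the intended
+   allocate-then-map sequence, assuming a PCI device handle h and an
+   illustrative ring_sz:
+
+     void *descs = vlib_physmem_alloc_aligned (vm, ring_sz, 512);
+     if (descs == 0)
+       return vlib_physmem_last_error (vm);
+     if ((err = vlib_pci_map_dma (vm, h, descs)))
+       return err;
+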
*/ always_inline void * -vlib_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, - clib_error_t ** error, - uword n_bytes, uword alignment) +vlib_physmem_alloc (vlib_main_t * vm, uword n_bytes) { - void *r = vm->os_physmem_alloc_aligned (vm, idx, n_bytes, alignment); - if (!r) - *error = - clib_error_return (0, "failed to allocate %wd bytes of I/O memory", - n_bytes); - else - *error = 0; - return r; + return vlib_physmem_alloc_aligned (vm, n_bytes, CLIB_CACHE_LINE_BYTES); } -/* By default allocate I/O memory with cache line alignment. */ always_inline void * -vlib_physmem_alloc (vlib_main_t * vm, vlib_physmem_region_index_t idx, - clib_error_t ** error, uword n_bytes) +vlib_physmem_alloc_from_map (vlib_main_t * vm, u32 physmem_map_index, + uword n_bytes, uword alignment) { - return vlib_physmem_alloc_aligned (vm, idx, error, n_bytes, - CLIB_CACHE_LINE_BYTES); + clib_pmalloc_main_t *pm = vm->physmem_main.pmalloc_main; + vlib_physmem_map_t *map = vlib_physmem_get_map (vm, physmem_map_index); + return clib_pmalloc_alloc_from_arena (pm, map->base, n_bytes, + alignment); } always_inline void -vlib_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, - void *mem) +vlib_physmem_free (vlib_main_t * vm, void *p) { - if (mem) - vm->os_physmem_free (vm, idx, mem); + if (p) + clib_pmalloc_free (vm->physmem_main.pmalloc_main, p); } always_inline u64 -vlib_physmem_virtual_to_physical (vlib_main_t * vm, - vlib_physmem_region_index_t idx, void *mem) +vlib_physmem_get_page_index (vlib_main_t * vm, void *mem) { - vlib_physmem_main_t *vpm = &physmem_main; - vlib_physmem_region_t *pr = pool_elt_at_index (vpm->regions, idx); - uword o = mem - pr->mem; - return vlib_physmem_offset_to_physical (vm, idx, o); + clib_pmalloc_main_t *pm = vm->physmem_main.pmalloc_main; + return clib_pmalloc_get_page_index (pm, mem); } - -always_inline clib_error_t * -vlib_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, - u8 numa_node, u32 flags, - vlib_physmem_region_index_t * idx) +always_inline u64 +vlib_physmem_get_pa (vlib_main_t * vm, void *mem) { - return vm->os_physmem_region_alloc (vm, name, size, numa_node, flags, idx); + clib_pmalloc_main_t *pm = vm->physmem_main.pmalloc_main; + return clib_pmalloc_get_pa (pm, mem); } -always_inline void -vlib_physmem_region_free (struct vlib_main_t *vm, - vlib_physmem_region_index_t idx) +always_inline clib_error_t * +vlib_physmem_last_error (struct vlib_main_t * vm) { - vm->os_physmem_region_free (vm, idx); + return clib_error_return (0, "unknown error"); } #endif /* included_vlib_physmem_funcs_h */ diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index 45b12cd5b77..08f0506fbc2 100755 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -61,7 +61,6 @@ char *vlib_default_runtime_dir = "vlib"; unix_main_t unix_main; clib_file_main_t file_main; -vlib_physmem_main_t physmem_main; static clib_error_t * unix_main_init (vlib_main_t * vm) diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h index 7856e5b7df7..e71b0bac6a5 100644 --- a/src/vlib/unix/unix.h +++ b/src/vlib/unix/unix.h @@ -126,8 +126,6 @@ unix_save_error (unix_main_t * um, clib_error_t * error) /* Main function for Unix VLIB. */ int vlib_unix_main (int argc, char *argv[]); -clib_error_t *unix_physmem_init (vlib_main_t * vm); - /* Set prompt for CLI. 
diff --git a/src/vppinfra/CMakeLists.txt b/src/vppinfra/CMakeLists.txt
index 26368493382..6e1a4fd861a 100644
--- a/src/vppinfra/CMakeLists.txt
+++ b/src/vppinfra/CMakeLists.txt
@@ -56,6 +56,7 @@ set(VPPINFRA_SRCS
   macros.c
   maplog.c
   mhash.c
+  pmalloc.c
   pool.c
   ptclosure.c
   random.c
@@ -135,6 +136,7 @@ set(VPPINFRA_HEADERS
   mheap.h
   os.h
   pipeline.h
+  pmalloc.h
   pool.h
   pmc.h
   ptclosure.h
@@ -227,6 +229,7 @@ if(VPP_BUILD_VPPINFRA_TESTS)
     longjmp
     macros
     maplog
+    pmalloc
     pool_iterate
     ptclosure
     random
diff --git a/src/vppinfra/linux/syscall.h b/src/vppinfra/linux/syscall.h
index 4511b85ac1d..1ae029d5802 100644
--- a/src/vppinfra/linux/syscall.h
+++ b/src/vppinfra/linux/syscall.h
@@ -19,6 +19,12 @@
 #include <unistd.h>
 #include <sys/syscall.h>
 
+static inline int
+getcpu (unsigned *cpu, unsigned *node, void *tcache)
+{
+  return syscall (__NR_getcpu, cpu, node, tcache);
+}
+
 static inline long
 set_mempolicy (int mode, const unsigned long *nodemask, unsigned long maxnode)
 {
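
getcpu () is what makes CLIB_PMALLOC_NUMA_LOCAL work: at allocation time the
allocator asks the kernel which node the calling thread is running on. A
minimal sketch of that resolution (the fallback-to-0 policy shown is
illustrative; pmalloc itself treats a failed getcpu () as an error):

    unsigned cpu, node;
    u32 numa_node;

    if (getcpu (&cpu, &node, 0) == 0)
      numa_node = node;   /* allocate on the caller's own node */
    else
      numa_node = 0;      /* e.g. very old kernel: assume non-NUMA */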
diff --git a/src/vppinfra/pmalloc.c b/src/vppinfra/pmalloc.c
new file mode 100644
index 00000000000..d354cce8a7d
--- /dev/null
+++ b/src/vppinfra/pmalloc.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <linux/mempolicy.h>
+#include <linux/memfd.h>
+
+#include <vppinfra/format.h>
+#include <vppinfra/linux/syscall.h>
+#include <vppinfra/linux/sysfs.h>
+#include <vppinfra/mem.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/pmalloc.h>
+
+#if __SIZEOF_POINTER__ >= 8
+#define DEFAULT_RESERVED_MB 16384
+#else
+#define DEFAULT_RESERVED_MB 256
+#endif
+
+static inline clib_pmalloc_chunk_t *
+get_chunk (clib_pmalloc_page_t * pp, u32 index)
+{
+  return pool_elt_at_index (pp->chunks, index);
+}
+
+static inline int
+pmalloc_validate_numa_node (u32 * numa_node)
+{
+  if (*numa_node == CLIB_PMALLOC_NUMA_LOCAL)
+    {
+      u32 cpu;
+      if (getcpu (&cpu, numa_node, 0) != 0)
+	return 1;
+    }
+  return 0;
+}
+
+int
+clib_pmalloc_init (clib_pmalloc_main_t * pm, uword size)
+{
+  struct stat st;
+  uword off, pagesize;
+  int fd;
+
+  ASSERT (pm->error == 0);
+
+  pm->log2_page_sz = 21;
+  pm->error = clib_mem_create_hugetlb_fd ("detect_hugepage_size", &fd);
+
+  if (pm->error)
+    return -1;
+
+  if (fd != -1)
+    {
+      if (fstat (fd, &st) == 0)
+	pm->log2_page_sz = min_log2 (st.st_blksize);
+      close (fd);
+    }
+
+  pagesize = 1ULL << pm->log2_page_sz;
+
+  size = size ? size : ((u64) DEFAULT_RESERVED_MB) << 20;
+  size = round_pow2 (size, pagesize);
+
+  pm->max_pages = size >> pm->log2_page_sz;
+
+  /* reserve VA space for future growth */
+  pm->base = mmap (0, size + pagesize, PROT_NONE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+  if (pm->base == MAP_FAILED)
+    {
+      pm->error = clib_error_return_unix (0, "failed to reserve %u pages",
+					  pm->max_pages);
+      return -1;
+    }
+
+  off = round_pow2 (pointer_to_uword (pm->base), pagesize) -
+    pointer_to_uword (pm->base);
+
+  /* trim start and end of reservation to be page aligned */
+  if (off)
+    {
+      munmap (pm->base, off);
+      pm->base += off;
+    }
+
+  munmap (pm->base + (pm->max_pages * pagesize), pagesize - off);
+  return 0;
+}
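
The over-reservation above deserves a worked example. With 2 MB pages,
mmap () is asked for size + pagesize bytes of PROT_NONE space precisely so
that a 2 MB-aligned window of max_pages pages can be trimmed out of it
(addresses illustrative):

    base = 0x7f2a40100000                 /* from mmap, not 2 MB aligned */
    off  = round_pow2 (base, 0x200000) - base = 0x100000
    munmap (base, off)                    /* drop unaligned head */
    base += off                           /* base now 2 MB aligned */
    munmap (base + max_pages * 0x200000, pagesize - off)  /* drop tail */

Later growth maps real hugepages with MAP_FIXED inside this window, so the
page index of any address is a simple shift of (va - base).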
+
+static inline void *
+alloc_chunk_from_page (clib_pmalloc_main_t * pm, clib_pmalloc_page_t * pp,
+		       u32 n_blocks, u32 block_align, u32 numa_node)
+{
+  clib_pmalloc_chunk_t *c;
+  void *va;
+  u32 off;
+  u32 alloc_chunk_index;
+
+  if (pp->chunks == 0)
+    {
+      pool_get (pp->chunks, c);
+      pp->n_free_chunks = 1;
+      pp->first_chunk_index = c - pp->chunks;
+      c->prev = c->next = ~0;
+      c->size = pp->n_free_blocks;
+    }
+
+  alloc_chunk_index = pp->first_chunk_index;
+
+next_chunk:
+  c = pool_elt_at_index (pp->chunks, alloc_chunk_index);
+  off = (block_align - (c->start & (block_align - 1))) & (block_align - 1);
+
+  if (c->used || n_blocks + off > c->size)
+    {
+      if (c->next == ~0)
+	return 0;
+      alloc_chunk_index = c->next;
+      goto next_chunk;
+    }
+
+  /* if alignment is needed create new empty chunk */
+  if (off)
+    {
+      u32 offset_chunk_index;
+      clib_pmalloc_chunk_t *co;
+      pool_get (pp->chunks, c);
+      pp->n_free_chunks++;
+      offset_chunk_index = alloc_chunk_index;
+      alloc_chunk_index = c - pp->chunks;
+
+      co = pool_elt_at_index (pp->chunks, offset_chunk_index);
+      c->size = co->size - off;
+      c->next = co->next;
+      c->start = co->start + off;
+      c->prev = offset_chunk_index;
+      co->size = off;
+      co->next = alloc_chunk_index;
+    }
+
+  c->used = 1;
+  if (c->size > n_blocks)
+    {
+      u32 tail_chunk_index;
+      clib_pmalloc_chunk_t *ct;
+      pool_get (pp->chunks, ct);
+      pp->n_free_chunks++;
+      tail_chunk_index = ct - pp->chunks;
+      c = pool_elt_at_index (pp->chunks, alloc_chunk_index);
+      ct->size = c->size - n_blocks;
+      ct->next = c->next;
+      ct->prev = alloc_chunk_index;
+      ct->start = c->start + n_blocks;
+
+      c->size = n_blocks;
+      c->next = tail_chunk_index;
+      if (ct->next != ~0)
+	pool_elt_at_index (pp->chunks, ct->next)->prev = tail_chunk_index;
+    }
+  else if (c->next != ~0)
+    pool_elt_at_index (pp->chunks, c->next)->prev = alloc_chunk_index;
+
+  c = get_chunk (pp, alloc_chunk_index);
+  va = pm->base + ((pp - pm->pages) << pm->log2_page_sz) +
+    (c->start << PMALLOC_LOG2_BLOCK_SZ);
+  hash_set (pm->chunk_index_by_va, pointer_to_uword (va), alloc_chunk_index);
+  pp->n_free_blocks -= n_blocks;
+  pp->n_free_chunks--;
+  return va;
+}
+
+static inline clib_pmalloc_page_t *
+pmalloc_map_pages (clib_pmalloc_main_t * pm, clib_pmalloc_arena_t * a,
+		   u32 numa_node, u32 n_pages)
+{
+  clib_pmalloc_page_t *pp = 0;
+  u64 seek, pa, sys_page_size;
+  int pagemap_fd, status, rv, i, mmap_flags;
+  void *va;
+  int old_mpol = -1;
+  long unsigned int mask[16] = { 0 };
+  long unsigned int old_mask[16] = { 0 };
+
+  clib_error_free (pm->error);
+
+  if (pm->max_pages <= vec_len (pm->pages))
+    {
+      pm->error = clib_error_return (0, "maximum number of pages reached");
+      return 0;
+    }
+
+  pm->error = clib_sysfs_prealloc_hugepages (numa_node, pm->log2_page_sz,
+					     n_pages);
+
+  if (pm->error)
+    return 0;
+
+  rv = get_mempolicy (&old_mpol, old_mask, sizeof (old_mask) * 8 + 1, 0, 0);
+  /* failure to get mempolicy means we can only proceed with numa 0 maps */
+  if (rv == -1 && numa_node != 0)
+    {
+      pm->error = clib_error_return_unix (0, "failed to get mempolicy");
+      return 0;
+    }
+
+  mask[0] = 1 << numa_node;
+  rv = set_mempolicy (MPOL_BIND, mask, sizeof (mask) * 8 + 1);
+  if (rv == -1 && numa_node != 0)
+    {
+      pm->error = clib_error_return_unix (0, "failed to set mempolicy for "
+					  "numa node %u", numa_node);
+      return 0;
+    }
+
+  mmap_flags = MAP_FIXED | MAP_HUGETLB | MAP_LOCKED | MAP_ANONYMOUS;
+  if (a->flags & CLIB_PMALLOC_ARENA_F_SHARED_MEM)
+    {
+      mmap_flags |= MAP_SHARED;
+      pm->error = clib_mem_create_hugetlb_fd ((char *) a->name, &a->fd);
+      if (a->fd == -1)
+	goto error;
+    }
+  else
+    {
+      mmap_flags |= MAP_PRIVATE;
+      a->fd = -1;
+    }
+
+  va = pm->base + (vec_len (pm->pages) << pm->log2_page_sz);
+  if (mmap (va, n_pages << pm->log2_page_sz, PROT_READ | PROT_WRITE,
+	    mmap_flags, a->fd, 0) == MAP_FAILED)
+    {
+      pm->error = clib_error_return_unix (0, "failed to mmap %u pages at %p "
+					  "fd %d numa %d flags 0x%x", n_pages,
+					  va, a->fd, numa_node, mmap_flags);
+      goto error;
+    }
+
+  rv = set_mempolicy (old_mpol, old_mask, sizeof (old_mask) * 8 + 1);
+  if (rv == -1 && numa_node != 0)
+    {
+      pm->error = clib_error_return_unix (0, "failed to restore mempolicy");
+      goto error;
+    }
+
+  /* we tolerate move_pages failure only if request is for numa node 0
+     to support non-numa kernels */
+  rv = move_pages (0, 1, &va, 0, &status, 0);
+  if ((rv == 0 && status != numa_node) || (rv != 0 && numa_node != 0))
+    {
+      pm->error = rv == -1 ?
+	clib_error_return_unix (0, "page allocated on wrong node, numa node "
+				"%u status %d", numa_node, status) :
+	clib_error_return (0, "page allocated on wrong node, numa node "
+			   "%u status %d", numa_node, status);
+
+      /* unmap & reserve */
+      munmap (va, n_pages << pm->log2_page_sz);
+      mmap (va, n_pages << pm->log2_page_sz, PROT_NONE,
+	    MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+      goto error;
+    }
+
+  memset (va, 0, n_pages << pm->log2_page_sz);
+  sys_page_size = sysconf (_SC_PAGESIZE);
+  pagemap_fd = open ((char *) "/proc/self/pagemap", O_RDONLY);
+
+  for (i = 0; i < n_pages; i++)
+    {
+      uword page_va = pointer_to_uword ((u8 *) va + (i << pm->log2_page_sz));
+      vec_add2 (pm->pages, pp, 1);
+      pp->n_free_blocks = 1 << (pm->log2_page_sz - PMALLOC_LOG2_BLOCK_SZ);
+      pp->index = pp - pm->pages;
+      pp->arena_index = a->index;
+
+      vec_add1 (a->page_indices, pp->index);
+      a->n_pages++;
+
+      seek = (page_va / sys_page_size) * sizeof (pa);
+      if (pagemap_fd != -1 &&
+	  lseek (pagemap_fd, seek, SEEK_SET) == seek &&
+	  read (pagemap_fd, &pa, sizeof (pa)) == (sizeof (pa)) &&
+	  pa & (1ULL << 63) /* page present bit */ )
+	{
+	  pp->pa = (pa & pow2_mask (55)) * sys_page_size;
+	}
+      vec_add1_aligned (pm->va_pa_diffs, pp->pa ?
page_va - pp->pa : 0, + CLIB_CACHE_LINE_BYTES); + } + + if (pagemap_fd != -1) + close (pagemap_fd); + + /* return pointer to 1st page */ + return pp - (n_pages - 1); + +error: + if (a->fd != -1) + close (a->fd); + return 0; +} + +void * +clib_pmalloc_create_shared_arena (clib_pmalloc_main_t * pm, char *name, + uword size, u32 numa_node) +{ + clib_pmalloc_arena_t *a; + clib_pmalloc_page_t *pp; + u32 n_pages = round_pow2 (size, 1 << pm->log2_page_sz) >> pm->log2_page_sz; + + if (n_pages + vec_len (pm->pages) > pm->max_pages) + return 0; + + if (pmalloc_validate_numa_node (&numa_node)) + return 0; + + pool_get (pm->arenas, a); + a->index = a - pm->arenas; + a->name = format (0, "%s%c", name, 0); + a->numa_node = numa_node; + a->flags = CLIB_PMALLOC_ARENA_F_SHARED_MEM; + a->log2_page_sz = pm->log2_page_sz; + + if ((pp = pmalloc_map_pages (pm, a, numa_node, n_pages)) == 0) + { + vec_free (a->name); + memset (a, 0, sizeof (*a)); + pool_put (pm->arenas, a); + return 0; + } + + return pm->base + (pp->index << pm->log2_page_sz); +} + +static inline void * +clib_pmalloc_alloc_inline (clib_pmalloc_main_t * pm, clib_pmalloc_arena_t * a, + uword size, uword align, u32 numa_node) +{ + clib_pmalloc_page_t *pp; + u32 n_blocks, block_align, *page_index; + + ASSERT (is_pow2 (align)); + + if (pmalloc_validate_numa_node (&numa_node)) + return 0; + + if (a == 0) + { + vec_validate_init_empty (pm->default_arena_for_numa_node, + numa_node, ~0); + if (pm->default_arena_for_numa_node[numa_node] == ~0) + { + pool_get (pm->arenas, a); + pm->default_arena_for_numa_node[numa_node] = a - pm->arenas; + a->name = format (0, "default-numa-%u%c", numa_node, 0); + a->numa_node = numa_node; + } + else + a = pool_elt_at_index (pm->arenas, + pm->default_arena_for_numa_node[numa_node]); + } + + n_blocks = round_pow2 (size, PMALLOC_BLOCK_SZ) / PMALLOC_BLOCK_SZ; + block_align = align >> PMALLOC_LOG2_BLOCK_SZ; + + vec_foreach (page_index, a->page_indices) + { + pp = vec_elt_at_index (pm->pages, *page_index); + void *rv = alloc_chunk_from_page (pm, pp, n_blocks, block_align, + numa_node); + + if (rv) + return rv; + } + + if ((a->flags & CLIB_PMALLOC_ARENA_F_SHARED_MEM) == 0 && + (pp = pmalloc_map_pages (pm, a, numa_node, 1))) + return alloc_chunk_from_page (pm, pp, n_blocks, block_align, numa_node); + + return 0; +} + +void * +clib_pmalloc_alloc_aligned_on_numa (clib_pmalloc_main_t * pm, uword size, + uword align, u32 numa_node) +{ + return clib_pmalloc_alloc_inline (pm, 0, size, align, numa_node); +} + +void * +clib_pmalloc_alloc_aligned (clib_pmalloc_main_t * pm, uword size, uword align) +{ + return clib_pmalloc_alloc_inline (pm, 0, size, align, + CLIB_PMALLOC_NUMA_LOCAL); +} + +void * +clib_pmalloc_alloc_from_arena (clib_pmalloc_main_t * pm, void *arena_va, + uword size, uword align) +{ + clib_pmalloc_arena_t *a = clib_pmalloc_get_arena (pm, arena_va); + return clib_pmalloc_alloc_inline (pm, a, size, align, 0); +} + +void +clib_pmalloc_free (clib_pmalloc_main_t * pm, void *va) +{ + clib_pmalloc_page_t *pp; + clib_pmalloc_chunk_t *c; + uword *p; + u32 chunk_index, page_index; + + p = hash_get (pm->chunk_index_by_va, pointer_to_uword (va)); + + if (p == 0) + os_panic (); + + chunk_index = p[0]; + page_index = clib_pmalloc_get_page_index (pm, va); + hash_unset (pm->chunk_index_by_va, pointer_to_uword (va)); + + pp = vec_elt_at_index (pm->pages, page_index); + c = pool_elt_at_index (pp->chunks, chunk_index); + c->used = 0; + pp->n_free_blocks += c->size; + pp->n_free_chunks++; + + /* merge with next if free */ + if (c->next != ~0 && 
get_chunk (pp, c->next)->used == 0) + { + clib_pmalloc_chunk_t *next = get_chunk (pp, c->next); + c->size += next->size; + c->next = next->next; + if (next->next != ~0) + get_chunk (pp, next->next)->prev = chunk_index; + memset (next, 0, sizeof (*next)); + pool_put (pp->chunks, next); + pp->n_free_chunks--; + } + + /* merge with prev if free */ + if (c->prev != ~0 && get_chunk (pp, c->prev)->used == 0) + { + clib_pmalloc_chunk_t *prev = get_chunk (pp, c->prev); + prev->size += c->size; + prev->next = c->next; + if (c->next != ~0) + get_chunk (pp, c->next)->prev = c->prev; + memset (c, 0, sizeof (*c)); + pool_put (pp->chunks, c); + pp->n_free_chunks--; + } +} + +static u8 * +format_pmalloc_page (u8 * s, va_list * va) +{ + clib_pmalloc_page_t *pp = va_arg (*va, clib_pmalloc_page_t *); + int verbose = va_arg (*va, int); + u32 indent = format_get_indent (s); + + s = format (s, "page %u: phys-addr %p ", pp->index, pp->pa); + + if (pp->chunks == 0) + return s; + + s = format (s, "free %u chunks %u free-chunks %d ", + (pp->n_free_blocks) << PMALLOC_LOG2_BLOCK_SZ, + pool_elts (pp->chunks), pp->n_free_chunks); + + if (verbose >= 2) + { + clib_pmalloc_chunk_t *c; + c = pool_elt_at_index (pp->chunks, pp->first_chunk_index); + s = format (s, "\n%U%12s%12s%8s%8s%8s%8s", + format_white_space, indent + 2, + "chunk offset", "size", "used", "index", "prev", "next"); + while (1) + { + s = format (s, "\n%U%12u%12u%8s%8d%8d%8d", + format_white_space, indent + 2, + c->start << PMALLOC_LOG2_BLOCK_SZ, + c->size << PMALLOC_LOG2_BLOCK_SZ, + c->used ? "yes" : "no", + c - pp->chunks, c->prev, c->next); + if (c->next == ~0) + break; + c = pool_elt_at_index (pp->chunks, c->next); + } + } + return s; +} + +u8 * +format_pmalloc (u8 * s, va_list * va) +{ + clib_pmalloc_main_t *pm = va_arg (*va, clib_pmalloc_main_t *); + int verbose = va_arg (*va, int); + u32 indent = format_get_indent (s); + + clib_pmalloc_page_t *pp; + clib_pmalloc_arena_t *a; + + s = format (s, "used-pages %u reserved-pages %u pagesize %uKB", + vec_len (pm->pages), pm->max_pages, + 1 << (pm->log2_page_sz - 10)); + + if (verbose >= 2) + s = format (s, " va-start %p", pm->base); + + if (pm->error) + s = format (s, "\n%Ulast-error: %U", format_white_space, indent + 2, + format_clib_error, pm->error); + + + /* *INDENT-OFF* */ + pool_foreach (a, pm->arenas, + { + u32 *page_index; + s = format (s, "\n%Uarena '%s' pages %u numa-node %u", + format_white_space, indent + 2, + a->name, vec_len (a->page_indices), a->numa_node); + if (a->fd != -1) + s = format (s, " shared fd %d", a->fd); + if (verbose >= 1) + vec_foreach (page_index, a->page_indices) + { + pp = vec_elt_at_index (pm->pages, *page_index); + s = format (s, "\n%U%U", format_white_space, indent + 4, + format_pmalloc_page, pp, verbose); + } + }); + /* *INDENT-ON* */ + + return s; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/pmalloc.h b/src/vppinfra/pmalloc.h new file mode 100644 index 00000000000..4d9906ab6e7 --- /dev/null +++ b/src/vppinfra/pmalloc.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_palloc_h +#define included_palloc_h +#include +#include + +#define PMALLOC_LOG2_BLOCK_SZ CLIB_LOG2_CACHE_LINE_BYTES +#define PMALLOC_BLOCK_SZ (1 << 6) + +#define CLIB_PMALLOC_NUMA_LOCAL 0xffffffff + +typedef struct +{ + u32 start, prev, next; + u32 size:31; + u32 used:1; +} clib_pmalloc_chunk_t; + +STATIC_ASSERT_SIZEOF (clib_pmalloc_chunk_t, 16); + +typedef struct +{ + u32 index; + u32 arena_index; + uword pa; + clib_pmalloc_chunk_t *chunks; + u32 first_chunk_index; + u32 n_free_chunks; + u32 n_free_blocks; +} clib_pmalloc_page_t; + +typedef struct +{ + u32 index; + u32 flags; +#define CLIB_PMALLOC_ARENA_F_SHARED_MEM (1 << 0) + int fd; + u32 numa_node; + u32 first_page_index; + u32 log2_page_sz; + u32 n_pages; + u8 *name; + u32 *page_indices; +} clib_pmalloc_arena_t; + +typedef struct +{ + u8 *base; + uword log2_page_sz; + uword *va_pa_diffs; + u32 max_pages; + clib_pmalloc_page_t *pages; + uword *chunk_index_by_va; + clib_pmalloc_arena_t *arenas; + u32 *default_arena_for_numa_node; + + clib_error_t *error; +} clib_pmalloc_main_t; + + +int clib_pmalloc_init (clib_pmalloc_main_t * pm, uword size); +void *clib_pmalloc_alloc_aligned_on_numa (clib_pmalloc_main_t * pm, + uword size, uword align, + u32 numa_node); +void *clib_pmalloc_alloc_aligned (clib_pmalloc_main_t * pm, uword size, + uword align); +void clib_pmalloc_free (clib_pmalloc_main_t * pm, void *va); + +void *clib_pmalloc_create_shared_arena (clib_pmalloc_main_t * pm, char *name, + uword size, u32 numa_node); + +void *clib_pmalloc_alloc_from_arena (clib_pmalloc_main_t * pm, void *arena_va, + uword size, uword align); + +format_function_t format_pmalloc; + +always_inline clib_error_t * +clib_pmalloc_last_error (clib_pmalloc_main_t * pm) +{ + return pm->error; +} + +always_inline u32 +clib_pmalloc_get_page_index (clib_pmalloc_main_t * pm, void *va) +{ + uword index = (pointer_to_uword (va) - pointer_to_uword (pm->base)) >> + pm->log2_page_sz; + + ASSERT (index < vec_len (pm->pages)); + + return index; +} + +always_inline clib_pmalloc_arena_t * +clib_pmalloc_get_arena (clib_pmalloc_main_t * pm, void *va) +{ + u32 index = clib_pmalloc_get_page_index (pm, va); + return pm->arenas + pm->pages[index].arena_index; +} + +always_inline uword +clib_pmalloc_get_pa (clib_pmalloc_main_t * pm, void *va) +{ + u32 index = clib_pmalloc_get_page_index (pm, va); + return pointer_to_uword (va) - pm->va_pa_diffs[index]; +} + + +#endif /* included_palloc_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/test_pmalloc.c b/src/vppinfra/test_pmalloc.c new file mode 100644 index 00000000000..c9ae01fc936 --- /dev/null +++ b/src/vppinfra/test_pmalloc.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vppinfra/format.h>
+#include <vppinfra/mem.h>
+#include <vppinfra/pmalloc.h>
+
+typedef struct
+{
+  uword baseva;
+  uword size;
+  uword *vas;
+  u32 nitems;
+  u32 item_size;
+  u32 align;
+  int max_numa;
+  u32 arena_pages;
+  u32 arena_numa;
+  u32 arena_items;
+  int verbose;
+  clib_pmalloc_main_t pmalloc_main;
+} test_main_t;
+
+test_main_t test_main;
+
+clib_error_t *
+test_palloc (test_main_t * tm)
+{
+  clib_pmalloc_main_t *pm = &tm->pmalloc_main;
+  void *arena;
+  int i;
+  uword *va;
+
+  if (clib_pmalloc_init (pm, 0) != 0)
+    return clib_error_return (0, "pmalloc init failure");
+
+  fformat (stdout, "Allocate %d items...\n", tm->nitems);
+
+  for (i = 0; i < tm->nitems; i++)
+    {
+      u32 size = tm->item_size ? tm->item_size : 64 + 64 * (i % 8);
+      u32 align = tm->align ? tm->align : 64 << (i % 5);
+      u32 numa = i % (tm->max_numa + 1);
+      va = clib_pmalloc_alloc_aligned_on_numa (pm, size, align, numa);
+
+      if (va == 0)
+	clib_error ("Failed to alloc %u byte chunk with align %u on numa %u,"
+		    "\nerror: %U", size, align, numa, format_clib_error,
+		    clib_pmalloc_last_error (pm));
+
+      if ((pointer_to_uword (va) & (align - 1)) != 0)
+	clib_error ("Alignment error: %p not aligned with %u", va, align);
+
+      vec_add1 (tm->vas, pointer_to_uword (va));
+    }
+  fformat (stdout, "%U\n", format_pmalloc, pm, tm->verbose);
+
+  /* alloc from arena */
+  if (tm->arena_items)
+    {
+      fformat (stdout, "Allocate %d items from arena ...\n", tm->arena_items);
+      arena = clib_pmalloc_create_shared_arena (pm, "test arena",
+						tm->arena_pages << 21,
+						tm->arena_numa);
+      if (arena == 0)
+	clib_error ("Failed to alloc shared arena: %U", format_clib_error,
+		    clib_pmalloc_last_error (pm));
+
+      for (i = 0; i < tm->arena_items; i++)
+	{
+	  u32 size = tm->item_size ? tm->item_size : 64 + 64 * (i % 8);
+	  u32 align = tm->align ?
tm->align : 64 << (i % 5); + va = clib_pmalloc_alloc_from_arena (pm, arena, size, align); + vec_add1 (tm->vas, pointer_to_uword (va)); + } + fformat (stdout, "\n%U\n", format_pmalloc, pm, tm->verbose); + } + + + fformat (stdout, "Freeing %d items ...\n", vec_len (tm->vas)); + for (i = 0; i < vec_len (tm->vas); i++) + clib_pmalloc_free (pm, (void *) tm->vas[i]); + + fformat (stdout, "\n%U\n", format_pmalloc, pm, tm->verbose); + return 0; +} + +clib_error_t * +test_palloc_main (unformat_input_t * i) +{ + test_main_t *tm = &test_main; + clib_error_t *error; + + tm->nitems = 5; + tm->arena_pages = 2; + tm->arena_numa = CLIB_PMALLOC_NUMA_LOCAL; + + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "nitems %u", &tm->nitems)) + ; + else if (unformat (i, "max-numa %u", &tm->max_numa)) + ; + else if (unformat (i, "item-size %u", &tm->item_size)) + ; + else if (unformat (i, "align %u", &tm->align)) + ; + else if (unformat (i, "verbose %d", &tm->verbose)) + ; + else if (unformat (i, "arena-pages %u", &tm->arena_pages)) + ; + else if (unformat (i, "arena-numa %u", &tm->arena_numa)) + ; + else if (unformat (i, "arena-items %u", &tm->arena_items)) + ; + else if (unformat (i, "verbose")) + tm->verbose = 1; + else + return clib_error_return (0, "unknown input '%U'", + format_unformat_error, i); + } + + error = test_palloc (tm); + + return error; +} + +#ifdef CLIB_UNIX +int +main (int argc, char *argv[]) +{ + unformat_input_t i; + int rv = 0; + clib_error_t *error; + + clib_mem_init (0, 3ULL << 30); + + unformat_init_command_line (&i, argv); + error = test_palloc_main (&i); + if (error) + { + clib_error_report (error); + rv = 1; + } + unformat_free (&i); + + return rv; +} +#endif /* CLIB_UNIX */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ -- 2.16.6
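
The standalone test is driven entirely by unformat arguments; a typical
invocation (binary name per the vppinfra test build enabled with
VPP_BUILD_VPPINFRA_TESTS) might be:

    test_pmalloc nitems 100 item-size 256 align 128 max-numa 1 \
                 arena-pages 2 arena-items 10 verbose 1

Each knob maps onto one unformat branch in test_palloc_main (); anything left
unset falls back to the defaults at the top of that function (5 items, 2
arena pages, NUMA-local arena placement).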