X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=lib%2Flibrte_eal%2Fcommon%2Feal_common_memory.c;h=fbfb1b0550a013d38d7305950abc123d390ef436;hb=2db818aa86a7649259a548711a5ce1e1edbe11ef;hp=852f3bb93bab8b6a0b091e57e08f4d36fd7140e6;hpb=ca33590b6af032bff57d9cc70455660466a654b2;p=deb_dpdk.git diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c index 852f3bb9..fbfb1b05 100644 --- a/lib/librte_eal/common/eal_common_memory.c +++ b/lib/librte_eal/common/eal_common_memory.c @@ -2,82 +2,385 @@ * Copyright(c) 2010-2014 Intel Corporation */ +#include #include #include #include #include +#include #include #include #include #include +#include #include #include #include +#include #include +#include "eal_memalloc.h" #include "eal_private.h" #include "eal_internal_cfg.h" /* - * Return a pointer to a read-only table of struct rte_physmem_desc - * elements, containing the layout of all addressable physical - * memory. The last element of the table contains a NULL address. + * Try to mmap *size bytes in /dev/zero. If it is successful, return the + * pointer to the mmap'd area and keep *size unmodified. Else, retry + * with a smaller zone: decrease *size by hugepage_sz until it reaches + * 0. In this case, return NULL. Note: this function returns an address + * which is a multiple of hugepage size. */ -const struct rte_memseg * -rte_eal_get_physmem_layout(void) + +#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" + +static void *next_baseaddr; +static uint64_t system_page_sz; + +void * +eal_get_virtual_area(void *requested_addr, size_t *size, + size_t page_sz, int flags, int mmap_flags) +{ + bool addr_is_hint, allow_shrink, unmap, no_align; + uint64_t map_sz; + void *mapped_addr, *aligned_addr; + + if (system_page_sz == 0) + system_page_sz = sysconf(_SC_PAGESIZE); + + mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS; + + RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size); + + addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0; + allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0; + unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0; + + if (next_baseaddr == NULL && internal_config.base_virtaddr != 0 && + rte_eal_process_type() == RTE_PROC_PRIMARY) + next_baseaddr = (void *) internal_config.base_virtaddr; + + if (requested_addr == NULL && next_baseaddr != NULL) { + requested_addr = next_baseaddr; + requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz); + addr_is_hint = true; + } + + /* we don't need alignment of resulting pointer in the following cases: + * + * 1. page size is equal to system size + * 2. we have a requested address, and it is page-aligned, and we will + * be discarding the address if we get a different one. + * + * for all other cases, alignment is potentially necessary. + */ + no_align = (requested_addr != NULL && + requested_addr == RTE_PTR_ALIGN(requested_addr, page_sz) && + !addr_is_hint) || + page_sz == system_page_sz; + + do { + map_sz = no_align ? *size : *size + page_sz; + if (map_sz > SIZE_MAX) { + RTE_LOG(ERR, EAL, "Map size too big\n"); + rte_errno = E2BIG; + return NULL; + } + + mapped_addr = mmap(requested_addr, (size_t)map_sz, PROT_READ, + mmap_flags, -1, 0); + if (mapped_addr == MAP_FAILED && allow_shrink) + *size -= page_sz; + } while (allow_shrink && mapped_addr == MAP_FAILED && *size > 0); + + /* align resulting address - if map failed, we will ignore the value + * anyway, so no need to add additional checks. + */ + aligned_addr = no_align ? 
mapped_addr : + RTE_PTR_ALIGN(mapped_addr, page_sz); + + if (*size == 0) { + RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n", + strerror(errno)); + rte_errno = errno; + return NULL; + } else if (mapped_addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n", + strerror(errno)); + /* pass errno up the call chain */ + rte_errno = errno; + return NULL; + } else if (requested_addr != NULL && !addr_is_hint && + aligned_addr != requested_addr) { + RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n", + requested_addr, aligned_addr); + munmap(mapped_addr, map_sz); + rte_errno = EADDRNOTAVAIL; + return NULL; + } else if (requested_addr != NULL && addr_is_hint && + aligned_addr != requested_addr) { + RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n", + requested_addr, aligned_addr); + RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory into secondary processes\n"); + } else if (next_baseaddr != NULL) { + next_baseaddr = RTE_PTR_ADD(aligned_addr, *size); + } + + RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n", + aligned_addr, *size); + + if (unmap) { + munmap(mapped_addr, map_sz); + } else if (!no_align) { + void *map_end, *aligned_end; + size_t before_len, after_len; + + /* when we reserve space with alignment, we add alignment to + * mapping size. On 32-bit, if 1GB alignment was requested, this + * would waste 1GB of address space, which is a luxury we cannot + * afford. so, if alignment was performed, check if any unneeded + * address space can be unmapped back. + */ + + map_end = RTE_PTR_ADD(mapped_addr, (size_t)map_sz); + aligned_end = RTE_PTR_ADD(aligned_addr, *size); + + /* unmap space before aligned mmap address */ + before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr); + if (before_len > 0) + munmap(mapped_addr, before_len); + + /* unmap space after aligned end mmap address */ + after_len = RTE_PTR_DIFF(map_end, aligned_end); + if (after_len > 0) + munmap(aligned_end, after_len); + } + + return aligned_addr; +} + +static struct rte_memseg * +virt2memseg(const void *addr, const struct rte_memseg_list *msl) +{ + const struct rte_fbarray *arr; + void *start, *end; + int ms_idx; + + if (msl == NULL) + return NULL; + + /* a memseg list was specified, check if it's the right one */ + start = msl->base_va; + end = RTE_PTR_ADD(start, (size_t)msl->page_sz * msl->memseg_arr.len); + + if (addr < start || addr >= end) + return NULL; + + /* now, calculate index */ + arr = &msl->memseg_arr; + ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz; + return rte_fbarray_get(arr, ms_idx); +} + +static struct rte_memseg_list * +virt2memseg_list(const void *addr) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + int msl_idx; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + void *start, *end; + msl = &mcfg->memsegs[msl_idx]; + + start = msl->base_va; + end = RTE_PTR_ADD(start, + (size_t)msl->page_sz * msl->memseg_arr.len); + if (addr >= start && addr < end) + break; + } + /* if we didn't find our memseg list */ + if (msl_idx == RTE_MAX_MEMSEG_LISTS) + return NULL; + return msl; +} + +__rte_experimental struct rte_memseg_list * +rte_mem_virt2memseg_list(const void *addr) +{ + return virt2memseg_list(addr); +} + +struct virtiova { + rte_iova_t iova; + void *virt; +}; +static int +find_virt(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + struct virtiova *vi 
= arg; + if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) { + size_t offset = vi->iova - ms->iova; + vi->virt = RTE_PTR_ADD(ms->addr, offset); + /* stop the walk */ + return 1; + } + return 0; +} +static int +find_virt_legacy(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, size_t len, void *arg) { - return rte_eal_get_configuration()->mem_config->memseg; + struct virtiova *vi = arg; + if (vi->iova >= ms->iova && vi->iova < (ms->iova + len)) { + size_t offset = vi->iova - ms->iova; + vi->virt = RTE_PTR_ADD(ms->addr, offset); + /* stop the walk */ + return 1; + } + return 0; } +__rte_experimental void * +rte_mem_iova2virt(rte_iova_t iova) +{ + struct virtiova vi; + + memset(&vi, 0, sizeof(vi)); + + vi.iova = iova; + /* for legacy mem, we can get away with scanning VA-contiguous segments, + * as we know they are PA-contiguous as well + */ + if (internal_config.legacy_mem) + rte_memseg_contig_walk(find_virt_legacy, &vi); + else + rte_memseg_walk(find_virt, &vi); + + return vi.virt; +} + +__rte_experimental struct rte_memseg * +rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl) +{ + return virt2memseg(addr, msl != NULL ? msl : + rte_mem_virt2memseg_list(addr)); +} + +static int +physmem_size(const struct rte_memseg_list *msl, void *arg) +{ + uint64_t *total_len = arg; + + *total_len += msl->memseg_arr.count * msl->page_sz; + + return 0; +} /* get the total size of memory */ uint64_t rte_eal_get_physmem_size(void) { - const struct rte_mem_config *mcfg; - unsigned i = 0; uint64_t total_len = 0; - /* get pointer to global configuration */ - mcfg = rte_eal_get_configuration()->mem_config; + rte_memseg_list_walk(physmem_size, &total_len); - for (i = 0; i < RTE_MAX_MEMSEG; i++) { - if (mcfg->memseg[i].addr == NULL) - break; + return total_len; +} - total_len += mcfg->memseg[i].len; - } +static int +dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int msl_idx, ms_idx; + FILE *f = arg; - return total_len; + msl_idx = msl - mcfg->memsegs; + if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS) + return -1; + + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + if (ms_idx < 0) + return -1; + + fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, " + "virt:%p, socket_id:%"PRId32", " + "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", " + "nrank:%"PRIx32"\n", + msl_idx, ms_idx, + ms->iova, + ms->len, + ms->addr, + ms->socket_id, + ms->hugepage_sz, + ms->nchannel, + ms->nrank); + + return 0; } -/* Dump the physical memory layout on console */ -void -rte_dump_physmem_layout(FILE *f) +/* + * Defining here because declared in rte_memory.h, but the actual implementation + * is in eal_common_memalloc.c, like all other memalloc internals. 
+ */ +int __rte_experimental +rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb, + void *arg) { - const struct rte_mem_config *mcfg; - unsigned i = 0; + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_event_callback_register(name, clb, arg); +} - /* get pointer to global configuration */ - mcfg = rte_eal_get_configuration()->mem_config; +int __rte_experimental +rte_mem_event_callback_unregister(const char *name, void *arg) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_event_callback_unregister(name, arg); +} - for (i = 0; i < RTE_MAX_MEMSEG; i++) { - if (mcfg->memseg[i].addr == NULL) - break; +int __rte_experimental +rte_mem_alloc_validator_register(const char *name, + rte_mem_alloc_validator_t clb, int socket_id, size_t limit) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_alloc_validator_register(name, clb, socket_id, + limit); +} - fprintf(f, "Segment %u: IOVA:0x%"PRIx64", len:%zu, " - "virt:%p, socket_id:%"PRId32", " - "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", " - "nrank:%"PRIx32"\n", i, - mcfg->memseg[i].iova, - mcfg->memseg[i].len, - mcfg->memseg[i].addr, - mcfg->memseg[i].socket_id, - mcfg->memseg[i].hugepage_sz, - mcfg->memseg[i].nchannel, - mcfg->memseg[i].nrank); +int __rte_experimental +rte_mem_alloc_validator_unregister(const char *name, int socket_id) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n"); + rte_errno = ENOTSUP; + return -1; } + return eal_memalloc_mem_alloc_validator_unregister(name, socket_id); +} + +/* Dump the physical memory layout on console */ +void +rte_dump_physmem_layout(FILE *f) +{ + rte_memseg_walk(dump_memseg, f); } /* return the number of memory channels */ @@ -117,20 +420,165 @@ rte_mem_lock_page(const void *virt) return mlock((void *)aligned, page_size); } +int __rte_experimental +rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ms_idx, ret = 0; + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + const struct rte_memseg *ms; + struct rte_fbarray *arr; + + if (msl->memseg_arr.count == 0) + continue; + + arr = &msl->memseg_arr; + + ms_idx = rte_fbarray_find_next_used(arr, 0); + while (ms_idx >= 0) { + int n_segs; + size_t len; + + ms = rte_fbarray_get(arr, ms_idx); + + /* find how many more segments there are, starting with + * this one. 
+ */ + n_segs = rte_fbarray_find_contig_used(arr, ms_idx); + len = n_segs * msl->page_sz; + + ret = func(msl, ms, len, arg); + if (ret) + return ret; + ms_idx = rte_fbarray_find_next_used(arr, + ms_idx + n_segs); + } + } + return 0; +} + +int __rte_experimental +rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int ret = 0; + + /* do not allow allocations/frees/init while we iterate */ + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + ret = rte_memseg_contig_walk_thread_unsafe(func, arg); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int __rte_experimental +rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ms_idx, ret = 0; + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + const struct rte_memseg *ms; + struct rte_fbarray *arr; + + if (msl->memseg_arr.count == 0) + continue; + + arr = &msl->memseg_arr; + + ms_idx = rte_fbarray_find_next_used(arr, 0); + while (ms_idx >= 0) { + ms = rte_fbarray_get(arr, ms_idx); + ret = func(msl, ms, arg); + if (ret) + return ret; + ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1); + } + } + return 0; +} + +int __rte_experimental +rte_memseg_walk(rte_memseg_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int ret = 0; + + /* do not allow allocations/frees/init while we iterate */ + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + ret = rte_memseg_walk_thread_unsafe(func, arg); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int __rte_experimental +rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ret = 0; + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + + if (msl->base_va == NULL) + continue; + + ret = func(msl, arg); + if (ret) + return ret; + } + return 0; +} + +int __rte_experimental +rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int ret = 0; + + /* do not allow allocations/frees/init while we iterate */ + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + ret = rte_memseg_list_walk_thread_unsafe(func, arg); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + /* init memory subsystem */ int rte_eal_memory_init(void) { + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int retval; RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n"); - const int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ? + if (!mcfg) + return -1; + + /* lock mem hotplug here, to prevent races while we init */ + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + + if (rte_eal_memseg_init() < 0) + goto fail; + + if (eal_memalloc_init() < 0) + goto fail; + + retval = rte_eal_process_type() == RTE_PROC_PRIMARY ? rte_eal_hugepage_init() : rte_eal_hugepage_attach(); if (retval < 0) - return -1; + goto fail; if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0) - return -1; + goto fail; return 0; +fail: + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return -1; }
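Not part of the patch above: a standalone sketch of the reserve-then-trim technique that eal_get_virtual_area() applies when it must align the mapping -- over-map by one alignment's worth of address space, pick the aligned address inside the mapping, then munmap() the unused head and tail. Plain POSIX mmap(), no EAL types; align_up() is a hypothetical helper, and PROT_NONE is used here because the area is only being reserved.

#include <stdint.h>
#include <stddef.h>
#include <sys/mman.h>

/* hypothetical helper: round addr up to a power-of-two alignment */
static void *
align_up(void *addr, size_t align)
{
	return (void *)(((uintptr_t)addr + align - 1) & ~((uintptr_t)align - 1));
}

/* reserve size bytes of address space, aligned to align (a power of two) */
static void *
reserve_aligned(size_t size, size_t align)
{
	size_t map_sz = size + align;	/* over-map so an aligned block must fit */
	void *base, *aligned, *map_end, *aligned_end;

	base = mmap(NULL, map_sz, PROT_NONE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (base == MAP_FAILED)
		return NULL;

	aligned = align_up(base, align);
	map_end = (char *)base + map_sz;
	aligned_end = (char *)aligned + size;

	/* give back the unused head and tail, as the EAL does above to avoid
	 * wasting address space on 32-bit
	 */
	if (aligned != base)
		munmap(base, (size_t)((char *)aligned - (char *)base));
	if (aligned_end != map_end)
		munmap(aligned_end, (size_t)((char *)map_end - (char *)aligned_end));

	return aligned;
}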
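A minimal usage sketch (assumed application code, not from this file) for the address-translation helpers added above: rte_mem_virt2memseg_list(), rte_mem_virt2memseg() and rte_mem_iova2virt(). It uses rte_malloc()/rte_malloc_virt2iova() from rte_malloc.h to obtain a hugepage-backed buffer and assumes rte_eal_init() has already run. At this point these memory APIs were still __rte_experimental, so applications had to build with -DALLOW_EXPERIMENTAL_API; the same caveat applies to the sketches below.

#include <inttypes.h>
#include <stdio.h>

#include <rte_malloc.h>
#include <rte_memory.h>

static void
show_translation(void)
{
	void *buf = rte_malloc(NULL, 4096, 0);	/* hugepage-backed buffer */
	const struct rte_memseg_list *msl;
	const struct rte_memseg *ms;
	rte_iova_t iova;

	if (buf == NULL)
		return;

	/* VA -> memseg list -> memseg, the path implemented above */
	msl = rte_mem_virt2memseg_list(buf);
	ms = rte_mem_virt2memseg(buf, msl);	/* msl == NULL also works */
	if (ms != NULL)
		printf("%p lives in a %zu-byte segment, IOVA 0x%" PRIx64 "\n",
				buf, ms->len, ms->iova);

	/* IOVA -> VA round trip through rte_mem_iova2virt() */
	iova = rte_malloc_virt2iova(buf);
	if (iova != RTE_BAD_IOVA)
		printf("IOVA 0x%" PRIx64 " maps back to VA %p\n",
				iova, rte_mem_iova2virt(iova));

	rte_free(buf);
}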
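A sketch of registering a memory event callback through the wrapper added above, assuming the 18.08 prototype in rte_memory.h in which both the callback and rte_mem_event_callback_register() take a void * argument; the callback name "example-cb" is arbitrary. As the implementation shows, registration fails with rte_errno set to ENOTSUP when legacy memory mode is in use.

#include <stdio.h>

#include <rte_errno.h>
#include <rte_memory.h>

/* invoked by the allocator when hugepage memory is mapped in or unmapped;
 * must not allocate or free DPDK memory from within the callback
 */
static void
mem_event_cb(enum rte_mem_event event_type, const void *addr, size_t len,
		void *arg __rte_unused)
{
	printf("mem event: %s of %zu bytes at %p\n",
			event_type == RTE_MEM_EVENT_ALLOC ? "alloc" : "free",
			len, addr);
}

static int
setup_mem_event_cb(void)
{
	/* the name is only used to identify the callback when unregistering */
	if (rte_mem_event_callback_register("example-cb", mem_event_cb, NULL) < 0) {
		/* e.g. rte_errno == ENOTSUP under --legacy-mem */
		printf("cannot register callback: %s\n", rte_strerror(rte_errno));
		return -1;
	}
	return 0;
}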
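A sketch of walking all memory segments with the callback-based API introduced here (rte_memseg_walk(); rte_memseg_contig_walk() is analogous but also reports the length of each VA-contiguous run), which replaces iterating over the static table formerly returned by rte_eal_get_physmem_layout(). The walk takes the memory hotplug read lock internally, so the callback must not allocate or free hugepage memory.

#include <inttypes.h>
#include <stdio.h>

#include <rte_eal.h>
#include <rte_memory.h>

/* called once per memseg; returning non-zero stops the walk early */
static int
count_seg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
		void *arg)
{
	uint64_t *total = arg;

	printf("seg %p: len %zu, socket %" PRId32 ", page sz 0x%" PRIx64 "\n",
			ms->addr, ms->len, ms->socket_id, msl->page_sz);
	*total += ms->len;
	return 0;
}

int
main(int argc, char **argv)
{
	uint64_t total = 0;

	if (rte_eal_init(argc, argv) < 0)
		return -1;

	if (rte_memseg_walk(count_seg, &total) < 0)
		return -1;

	printf("total hugepage memory: %" PRIu64 " bytes\n", total);
	return 0;
}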