New upstream version 18.08
[deb_dpdk.git] / lib/librte_eal/bsdapp/eal/eal_memory.c
index bdfb882..16d2bc7 100644
@@ -6,10 +6,13 @@
 #include <sys/types.h>
 #include <sys/sysctl.h>
 #include <inttypes.h>
+#include <errno.h>
+#include <string.h>
 #include <fcntl.h>
 
 #include <rte_eal.h>
 #include <rte_eal_memconfig.h>
+#include <rte_errno.h>
 #include <rte_log.h>
 #include <rte_string_fns.h>
 #include "eal_private.h"
@@ -41,129 +44,253 @@ rte_eal_hugepage_init(void)
        struct rte_mem_config *mcfg;
        uint64_t total_mem = 0;
        void *addr;
-       unsigned i, j, seg_idx = 0;
+       unsigned int i, j, seg_idx = 0;
 
        /* get pointer to global configuration */
        mcfg = rte_eal_get_configuration()->mem_config;
 
        /* for debug purposes, hugetlbfs can be disabled */
        if (internal_config.no_hugetlbfs) {
-               addr = malloc(internal_config.memory);
-               mcfg->memseg[0].iova = (rte_iova_t)(uintptr_t)addr;
-               mcfg->memseg[0].addr = addr;
-               mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
-               mcfg->memseg[0].len = internal_config.memory;
-               mcfg->memseg[0].socket_id = 0;
+               struct rte_memseg_list *msl;
+               struct rte_fbarray *arr;
+               struct rte_memseg *ms;
+               uint64_t page_sz;
+               int n_segs, cur_seg;
+
+               /* create a memseg list */
+               msl = &mcfg->memsegs[0];
+
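+               /* one 4K page per memseg; e.g. "-m 128" (128MB) yields
+                * 32768 segments
+                */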
+               page_sz = RTE_PGSIZE_4K;
+               n_segs = internal_config.memory / page_sz;
+
+               if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs,
+                               sizeof(struct rte_memseg))) {
+                       RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
+                       return -1;
+               }
+
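+               /* reserve the entire requested amount up front with a single
+                * anonymous mapping; the per-page memsegs below all point
+                * into this area
+                */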
+               addr = mmap(NULL, internal_config.memory,
+                               PROT_READ | PROT_WRITE,
+                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+               if (addr == MAP_FAILED) {
+                       RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
+                                       strerror(errno));
+                       return -1;
+               }
+               msl->base_va = addr;
+               msl->page_sz = page_sz;
+               msl->socket_id = 0;
+
+               /* populate memsegs. each memseg is 1 page long */
+               for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
+                       arr = &msl->memseg_arr;
+
+                       ms = rte_fbarray_get(arr, cur_seg);
+                       if (rte_eal_iova_mode() == RTE_IOVA_VA)
+                               ms->iova = (uintptr_t)addr;
+                       else
+                               ms->iova = RTE_BAD_IOVA;
+                       ms->addr = addr;
+                       ms->hugepage_sz = page_sz;
+                       ms->len = page_sz;
+                       ms->socket_id = 0;
+
+                       rte_fbarray_set_used(arr, cur_seg);
+
+                       addr = RTE_PTR_ADD(addr, page_sz);
+               }
                return 0;
        }
 
        /* map all hugepages and sort them */
        for (i = 0; i < internal_config.num_hugepage_sizes; i ++){
                struct hugepage_info *hpi;
+               rte_iova_t prev_end = 0;
+               int prev_ms_idx = -1;
+               uint64_t page_sz, mem_needed;
+               unsigned int n_pages, max_pages;
 
                hpi = &internal_config.hugepage_info[i];
-               for (j = 0; j < hpi->num_pages[0]; j++) {
+               page_sz = hpi->hugepage_sz;
+               max_pages = hpi->num_pages[0];
+               mem_needed = RTE_ALIGN_CEIL(internal_config.memory - total_mem,
+                               page_sz);
+
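+               /* e.g. with 64MB still to reserve and 2MB pages, try up to
+                * 32 pages, capped at the number of contigmem buffers
+                */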
+               n_pages = RTE_MIN(mem_needed / page_sz, max_pages);
+
+               for (j = 0; j < n_pages; j++) {
+                       struct rte_memseg_list *msl;
+                       struct rte_fbarray *arr;
                        struct rte_memseg *seg;
+                       int msl_idx, ms_idx;
                        rte_iova_t physaddr;
                        int error;
                        size_t sysctl_size = sizeof(physaddr);
                        char physaddr_str[64];
+                       bool is_adjacent;
 
-                       addr = mmap(NULL, hpi->hugepage_sz, PROT_READ|PROT_WRITE,
-                                   MAP_SHARED, hpi->lock_descriptor,
-                                   j * EAL_PAGE_SIZE);
-                       if (addr == MAP_FAILED) {
-                               RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n",
-                                               j, hpi->hugedir);
-                               return -1;
-                       }
-
-                       snprintf(physaddr_str, sizeof(physaddr_str), "hw.contigmem"
-                                       ".physaddr.%d", j);
-                       error = sysctlbyname(physaddr_str, &physaddr, &sysctl_size,
-                                       NULL, 0);
+                       /* first, check if this segment is IOVA-adjacent to
+                        * the previous one.
+                        */
+                       snprintf(physaddr_str, sizeof(physaddr_str),
+                                       "hw.contigmem.physaddr.%d", j);
+                       error = sysctlbyname(physaddr_str, &physaddr,
+                                       &sysctl_size, NULL, 0);
                        if (error < 0) {
                                RTE_LOG(ERR, EAL, "Failed to get physical addr for buffer %u "
                                                "from %s\n", j, hpi->hugedir);
                                return -1;
                        }
 
-                       seg = &mcfg->memseg[seg_idx++];
+                       is_adjacent = prev_end != 0 && physaddr == prev_end;
+                       prev_end = physaddr + hpi->hugepage_sz;
+
+                       for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
+                                       msl_idx++) {
+                               bool empty, need_hole;
+                               msl = &mcfg->memsegs[msl_idx];
+                               arr = &msl->memseg_arr;
+
+                               if (msl->page_sz != page_sz)
+                                       continue;
+
+                               empty = arr->count == 0;
+
+                               /* we need a hole if this isn't an empty memseg
+                                * list, and if the previous segment was not
+                                * adjacent to the current one.
+                                */
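+                               /* e.g. 2MB pages at IOVA 0x200000 and 0x400000
+                                * are adjacent, while 0x200000 and 0x800000
+                                * are not, so the latter pair needs a one-slot
+                                * hole between them
+                                */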
+                               need_hole = !empty && !is_adjacent;
+
+                               /* we need 1 segment, plus a hole if not adjacent */
+                               ms_idx = rte_fbarray_find_next_n_free(arr,
+                                               0, 1 + (need_hole ? 1 : 0));
+
+                               /* memseg list is full? */
+                               if (ms_idx < 0)
+                                       continue;
+
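+                               /* if a hole is needed and the free run starts
+                                * right after the previous segment, skip one
+                                * slot so it stays free and forms the hole
+                                */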
+                               if (need_hole && prev_ms_idx == ms_idx - 1)
+                                       ms_idx++;
+                               prev_ms_idx = ms_idx;
+
+                               break;
+                       }
+                       if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
+                               RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n",
+                                       RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE),
+                                       RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE));
+                               return -1;
+                       }
+                       arr = &msl->memseg_arr;
+                       seg = rte_fbarray_get(arr, ms_idx);
+
+                       addr = RTE_PTR_ADD(msl->base_va,
+                                       (size_t)msl->page_sz * ms_idx);
+
+                       /* this address was already reserved as part of the
+                        * memseg list's VA space, so using MAP_FIXED here
+                        * is safe.
+                        */
+                       addr = mmap(addr, page_sz, PROT_READ|PROT_WRITE,
+                                       MAP_SHARED | MAP_FIXED,
+                                       hpi->lock_descriptor,
+                                       j * EAL_PAGE_SIZE);
+                       if (addr == MAP_FAILED) {
+                               RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n",
+                                               j, hpi->hugedir);
+                               return -1;
+                       }
+
                        seg->addr = addr;
                        seg->iova = physaddr;
-                       seg->hugepage_sz = hpi->hugepage_sz;
-                       seg->len = hpi->hugepage_sz;
+                       seg->hugepage_sz = page_sz;
+                       seg->len = page_sz;
                        seg->nchannel = mcfg->nchannel;
                        seg->nrank = mcfg->nrank;
                        seg->socket_id = 0;
 
+                       rte_fbarray_set_used(arr, ms_idx);
+
                        RTE_LOG(INFO, EAL, "Mapped memory segment %u @ %p: physaddr:0x%"
                                        PRIx64", len %zu\n",
-                                       seg_idx, addr, physaddr, hpi->hugepage_sz);
-                       if (total_mem >= internal_config.memory ||
-                                       seg_idx >= RTE_MAX_MEMSEG)
-                               break;
+                                       seg_idx++, addr, physaddr, page_sz);
+
+                       total_mem += seg->len;
                }
+               if (total_mem >= internal_config.memory)
+                       break;
+       }
+       if (total_mem < internal_config.memory) {
+               RTE_LOG(ERR, EAL, "Couldn't reserve requested memory, "
+                               "requested: %" PRIu64 "M "
+                               "available: %" PRIu64 "M\n",
+                               internal_config.memory >> 20, total_mem >> 20);
+               return -1;
        }
        return 0;
 }
 
+struct attach_walk_args {
+       int fd_hugepage;
+       int seg_idx;
+};
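+/* rte_memseg_walk() callback: remap each segment from the contigmem fd at
+ * the address recorded by the primary process
+ */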
+static int
+attach_segment(const struct rte_memseg_list *msl __rte_unused,
+               const struct rte_memseg *ms, void *arg)
+{
+       struct attach_walk_args *wa = arg;
+       void *addr;
+
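+       /* contigmem exposes buffer N at file offset N * EAL_PAGE_SIZE, so
+        * seg_idx doubles as the mmap offset index
+        */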
+       addr = mmap(ms->addr, ms->len, PROT_READ | PROT_WRITE,
+                       MAP_SHARED | MAP_FIXED, wa->fd_hugepage,
+                       wa->seg_idx * EAL_PAGE_SIZE);
+       if (addr == MAP_FAILED || addr != ms->addr)
+               return -1;
+       wa->seg_idx++;
+
+       return 0;
+}
+
 int
 rte_eal_hugepage_attach(void)
 {
        const struct hugepage_info *hpi;
-       int fd_hugepage_info, fd_hugepage = -1;
-       unsigned i = 0;
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       int fd_hugepage = -1;
+       unsigned int i;
 
-       /* Obtain a file descriptor for hugepage_info */
-       fd_hugepage_info = open(eal_hugepage_info_path(), O_RDONLY);
-       if (fd_hugepage_info < 0) {
-               RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path());
-               return -1;
-       }
+       hpi = &internal_config.hugepage_info[0];
 
-       /* Map the shared hugepage_info into the process address spaces */
-       hpi = mmap(NULL, sizeof(struct hugepage_info), PROT_READ, MAP_PRIVATE,
-                       fd_hugepage_info, 0);
-       if (hpi == MAP_FAILED) {
-               RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path());
-               goto error;
-       }
-
-       /* Obtain a file descriptor for contiguous memory */
-       fd_hugepage = open(hpi->hugedir, O_RDWR);
-       if (fd_hugepage < 0) {
-               RTE_LOG(ERR, EAL, "Could not open %s\n", hpi->hugedir);
-               goto error;
-       }
+       for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+               const struct hugepage_info *cur_hpi = &hpi[i];
+               struct attach_walk_args wa;
 
-       /* Map the contiguous memory into each memory segment */
-       for (i = 0; i < hpi->num_pages[0]; i++) {
+               memset(&wa, 0, sizeof(wa));
 
-               void *addr;
-               struct rte_memseg *seg = &mcfg->memseg[i];
+               /* Obtain a file descriptor for contiguous memory */
+               fd_hugepage = open(cur_hpi->hugedir, O_RDWR);
+               if (fd_hugepage < 0) {
+                       RTE_LOG(ERR, EAL, "Could not open %s\n",
+                                       cur_hpi->hugedir);
+                       goto error;
+               }
+               wa.fd_hugepage = fd_hugepage;
+               wa.seg_idx = 0;
 
-               addr = mmap(seg->addr, hpi->hugepage_sz, PROT_READ|PROT_WRITE,
-                           MAP_SHARED|MAP_FIXED, fd_hugepage,
-                           i * EAL_PAGE_SIZE);
-               if (addr == MAP_FAILED || addr != seg->addr) {
+               /* Map the contiguous memory into each memory segment */
+               if (rte_memseg_walk(attach_segment, &wa) < 0) {
                        RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n",
-                               i, hpi->hugedir);
+                               wa.seg_idx, cur_hpi->hugedir);
                        goto error;
                }
 
+               close(fd_hugepage);
+               fd_hugepage = -1;
        }
 
        /* hugepage_info is no longer required */
-       munmap((void *)(uintptr_t)hpi, sizeof(struct hugepage_info));
-       close(fd_hugepage_info);
-       close(fd_hugepage);
        return 0;
 
 error:
-       if (fd_hugepage_info >= 0)
-               close(fd_hugepage_info);
        if (fd_hugepage >= 0)
                close(fd_hugepage);
        return -1;
@@ -174,3 +301,217 @@ rte_eal_using_phys_addrs(void)
 {
        return 0;
 }
+
+static uint64_t
+get_mem_amount(uint64_t page_sz, uint64_t max_mem)
+{
+       uint64_t area_sz, max_pages;
+
+       /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */
+       max_pages = RTE_MAX_MEMSEG_PER_LIST;
+       max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem);
+
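+       /* e.g. with a default build config (8192 segments, 32768MB per list)
+        * and 2MB pages, the list is capped at 16GB
+        */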
+       area_sz = RTE_MIN(page_sz * max_pages, max_mem);
+
+       /* make sure the list isn't smaller than the page size */
+       area_sz = RTE_MAX(area_sz, page_sz);
+
+       return RTE_ALIGN(area_sz, page_sz);
+}
+
+#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
+static int
+alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
+               int n_segs, int socket_id, int type_msl_idx)
+{
+       char name[RTE_FBARRAY_NAME_LEN];
+
+       snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
+                type_msl_idx);
+       if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
+                       sizeof(struct rte_memseg))) {
+               RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
+                       rte_strerror(rte_errno));
+               return -1;
+       }
+
+       msl->page_sz = page_sz;
+       msl->socket_id = socket_id;
+       msl->base_va = NULL;
+
+       RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
+                       (size_t)page_sz >> 10, socket_id);
+
+       return 0;
+}
+
+static int
+alloc_va_space(struct rte_memseg_list *msl)
+{
+       uint64_t page_sz;
+       size_t mem_sz;
+       void *addr;
+       int flags = 0;
+
+#ifdef RTE_ARCH_PPC_64
+       flags |= MAP_HUGETLB;
+#endif
+
+       page_sz = msl->page_sz;
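+       /* reserve VA space for every slot in the list, including slots that
+        * may be left free as IOVA holes
+        */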
+       mem_sz = page_sz * msl->memseg_arr.len;
+
+       addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags);
+       if (addr == NULL) {
+               if (rte_errno == EADDRNOTAVAIL)
+                       RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
+                               (unsigned long long)mem_sz, msl->base_va);
+               else
+                       RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
+               return -1;
+       }
+       msl->base_va = addr;
+
+       return 0;
+}
+
+static int
+memseg_primary_init(void)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       int hpi_idx, msl_idx = 0;
+       struct rte_memseg_list *msl;
+       uint64_t max_mem, total_mem;
+
+       /* no-huge does not need this at all */
+       if (internal_config.no_hugetlbfs)
+               return 0;
+
+       /* FreeBSD has an issue where a core dump will dump the entire memory
+        * contents, including anonymous zero-page memory. Therefore, while we
+        * limit the total amount of memory to RTE_MAX_MEM_MB, we also further
+        * limit it to whatever is actually available to us through the
+        * contigmem driver (plus spacing blocks).
+        *
+        * So, at each stage, we check how much memory we are preallocating
+        * and adjust all the values accordingly.
+        */
+
+       max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
+       total_mem = 0;
+
+       /* create memseg lists */
+       for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
+                       hpi_idx++) {
+               uint64_t max_type_mem, total_type_mem = 0;
+               uint64_t avail_mem;
+               int type_msl_idx, max_segs, avail_segs, total_segs = 0;
+               struct hugepage_info *hpi;
+               uint64_t hugepage_sz;
+
+               hpi = &internal_config.hugepage_info[hpi_idx];
+               hugepage_sz = hpi->hugepage_sz;
+
+               /* no NUMA support on FreeBSD */
+
+               /* check if we've already exceeded total memory amount */
+               if (total_mem >= max_mem)
+                       break;
+
+               /* first, calculate theoretical limits according to config */
+               max_type_mem = RTE_MIN(max_mem - total_mem,
+                       (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20);
+               max_segs = RTE_MAX_MEMSEG_PER_TYPE;
+
+               /* now, limit all of that to whatever will actually be
+                * available to us, because without dynamic allocation support
+                * all of that extra memory would sit there unused and slow
+                * down core dumps in case of a crash.
+                *
+                * we need (N*2)-1 segments because we cannot guarantee that
+                * each segment will be IOVA-contiguous with the previous one,
+                * so we allocate more and leave gaps between segments that
+                * are not contiguous.
+                */
+               avail_segs = (hpi->num_pages[0] * 2) - 1;
+               avail_mem = avail_segs * hugepage_sz;
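+               /* e.g. 4 contigmem buffers allow up to 7 segments:
+                * 4 pages plus 3 potential holes between them
+                */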
+
+               max_type_mem = RTE_MIN(avail_mem, max_type_mem);
+               max_segs = RTE_MIN(avail_segs, max_segs);
+
+               type_msl_idx = 0;
+               while (total_type_mem < max_type_mem &&
+                               total_segs < max_segs) {
+                       uint64_t cur_max_mem, cur_mem;
+                       unsigned int n_segs;
+
+                       if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+                               RTE_LOG(ERR, EAL,
+                                       "No more space in memseg lists, please increase %s\n",
+                                       RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+                               return -1;
+                       }
+
+                       msl = &mcfg->memsegs[msl_idx++];
+
+                       cur_max_mem = max_type_mem - total_type_mem;
+
+                       cur_mem = get_mem_amount(hugepage_sz,
+                                       cur_max_mem);
+                       n_segs = cur_mem / hugepage_sz;
+
+                       if (alloc_memseg_list(msl, hugepage_sz, n_segs,
+                                       0, type_msl_idx))
+                               return -1;
+
+                       total_segs += msl->memseg_arr.len;
+                       total_type_mem = total_segs * hugepage_sz;
+                       type_msl_idx++;
+
+                       if (alloc_va_space(msl)) {
+                               RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
+                               return -1;
+                       }
+               }
+               total_mem += total_type_mem;
+       }
+       return 0;
+}
+
+static int
+memseg_secondary_init(void)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       int msl_idx = 0;
+       struct rte_memseg_list *msl;
+
+       for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+
+               msl = &mcfg->memsegs[msl_idx];
+
+               /* skip empty memseg lists */
+               if (msl->memseg_arr.len == 0)
+                       continue;
+
+               if (rte_fbarray_attach(&msl->memseg_arr)) {
+                       RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n");
+                       return -1;
+               }
+
+               /* preallocate VA space */
+               if (alloc_va_space(msl)) {
+                       RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n");
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+int
+rte_eal_memseg_init(void)
+{
+       return rte_eal_process_type() == RTE_PROC_PRIMARY ?
+                       memseg_primary_init() :
+                       memseg_secondary_init();
+}