#include <rte_log.h>
#include <rte_memory.h>
-#include <rte_memzone.h>
#include <rte_launch.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
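/* each entry in /proc/self/pagemap is a 64-bit (8-byte) record */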
#define PFN_MASK_SIZE 8
-#ifdef RTE_LIBRTE_XEN_DOM0
-int rte_xen_dom0_supported(void)
-{
- return internal_config.xen_dom0_support;
-}
-#endif
-
/**
* @file
* Huge page mapping under Linux
static uint64_t baseaddr_offset;
+#ifdef RTE_ARCH_64
+/*
+ * The Linux kernel uses a very high address as the starting address for
+ * serving mmap calls. If addressing limitations exist and the IOVA mode
+ * is VA, this starting address is likely too high for those devices.
+ * However, it is possible to use a lower address in the process virtual
+ * address space, as with 64 bits there is plenty of room.
+ *
+ * Currently known limitations are 39 or 40 bits. Setting the starting
+ * address at 4GB implies there are 508GB or 1020GB left for mapping the
+ * available hugepages. This is likely enough for most systems, although
+ * a device with addressing limitations should call rte_dev_check_dma_mask
+ * to ensure all memory is within the supported range.
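+ * (With a 39-bit limit: 2^39 = 512GB, minus the 4GB base = 508GB; with
+ * a 40-bit limit: 2^40 = 1024GB, minus 4GB = 1020GB.)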
+ */
+static uint64_t baseaddr = 0x100000000;
+#endif
+
static bool phys_addrs_available = true;
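/* sysctl entry exposing the kernel ASLR setting (0 means disabled) */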
#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
static void
test_phys_addrs_available(void)
{
- uint64_t tmp;
+ uint64_t tmp = 0;
phys_addr_t physaddr;
- /* For dom0, phys addresses can always be available */
- if (rte_xen_dom0_supported())
- return;
-
if (!rte_eal_has_hugepages()) {
RTE_LOG(ERR, EAL,
"Started without hugepages support, physical addresses not available\n");
physaddr = rte_mem_virt2phy(&tmp);
if (physaddr == RTE_BAD_PHYS_ADDR) {
- RTE_LOG(ERR, EAL,
- "Cannot obtain physical addresses: %s. "
- "Only vfio will function.\n",
- strerror(errno));
+ if (rte_eal_iova_mode() == RTE_IOVA_PA)
+ RTE_LOG(ERR, EAL,
+ "Cannot obtain physical addresses: %s. "
+ "Only vfio will function.\n",
+ strerror(errno));
phys_addrs_available = false;
}
}
int page_size;
off_t offset;
- /* when using dom0, /proc/self/pagemap always returns 0, check in
- * dpdk memory by browsing the memsegs */
- if (rte_xen_dom0_supported()) {
- struct rte_mem_config *mcfg;
- struct rte_memseg *memseg;
- unsigned i;
-
- mcfg = rte_eal_get_configuration()->mem_config;
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- memseg = &mcfg->memseg[i];
- if (memseg->addr == NULL)
- break;
- if (virtaddr > memseg->addr &&
- virtaddr < RTE_PTR_ADD(memseg->addr,
- memseg->len)) {
- return memseg->phys_addr +
- RTE_PTR_DIFF(virtaddr, memseg->addr);
- }
- }
-
- return RTE_BAD_PHYS_ADDR;
- }
-
/* Cannot parse /proc/self/pagemap, no need to log errors everywhere */
if (!phys_addrs_available)
- return RTE_BAD_PHYS_ADDR;
+ return RTE_BAD_IOVA;
/* standard page size */
page_size = getpagesize();
if (fd < 0) {
RTE_LOG(ERR, EAL, "%s(): cannot open /proc/self/pagemap: %s\n",
__func__, strerror(errno));
- return RTE_BAD_PHYS_ADDR;
+ return RTE_BAD_IOVA;
}
virt_pfn = (unsigned long)virtaddr / page_size;
RTE_LOG(ERR, EAL, "%s(): seek error in /proc/self/pagemap: %s\n",
__func__, strerror(errno));
close(fd);
- return RTE_BAD_PHYS_ADDR;
+ return RTE_BAD_IOVA;
}
retval = read(fd, &page, PFN_MASK_SIZE);
if (retval < 0) {
RTE_LOG(ERR, EAL, "%s(): cannot read /proc/self/pagemap: %s\n",
__func__, strerror(errno));
- return RTE_BAD_PHYS_ADDR;
+ return RTE_BAD_IOVA;
} else if (retval != PFN_MASK_SIZE) {
RTE_LOG(ERR, EAL, "%s(): read %d bytes from /proc/self/pagemap "
"but expected %d:\n",
__func__, retval, PFN_MASK_SIZE);
- return RTE_BAD_PHYS_ADDR;
+ return RTE_BAD_IOVA;
}
/*
 * the pfn (page frame number) is stored in bits 0-54 (see
 * pagemap.txt in the Linux Documentation)
 */
if ((page & 0x7fffffffffffffULL) == 0)
- return RTE_BAD_PHYS_ADDR;
+ return RTE_BAD_IOVA;
physaddr = ((page & 0x7fffffffffffffULL) * page_size)
+ ((unsigned long)virtaddr % page_size);
return physaddr;
}
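+/*
+ * Return the IO address a device would use to reach this virtual address.
+ * In IOVA-as-VA mode that is the virtual address itself; otherwise fall
+ * back to the physical address resolved through rte_mem_virt2phy().
+ */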
+rte_iova_t
+rte_mem_virt2iova(const void *virtaddr)
+{
+ if (rte_eal_iova_mode() == RTE_IOVA_VA)
+ return (uintptr_t)virtaddr;
+ return rte_mem_virt2phy(virtaddr);
+}
+
/*
* For each hugepage in hugepg_tbl, fill the physaddr value. We find
* it by browsing the /proc/self/pagemap special file.
}
}
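+/*
+ * Compute the address hint passed to mmap(): the user-supplied
+ * --base-virtaddr plus the running offset when set, otherwise the 4GB
+ * default baseaddr on 64-bit builds, and no hint at all on 32-bit ones.
+ */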
+static void *
+get_addr_hint(void)
+{
+ if (internal_config.base_virtaddr != 0) {
+ return (void *) (uintptr_t)
+ (internal_config.base_virtaddr +
+ baseaddr_offset);
+ } else {
+#ifdef RTE_ARCH_64
+ return (void *) (uintptr_t) (baseaddr +
+ baseaddr_offset);
+#else
+ return NULL;
+#endif
+ }
+}
+
/*
* Try to mmap *size bytes in /dev/zero. If it is successful, return the
* pointer to the mmap'd area and keep *size unmodified. Else, retry
static void *
get_virtual_area(size_t *size, size_t hugepage_sz)
{
- void *addr;
+ void *addr, *addr_hint;
int fd;
long aligned_addr;
- if (internal_config.base_virtaddr != 0) {
- addr = (void*) (uintptr_t) (internal_config.base_virtaddr +
- baseaddr_offset);
- }
- else addr = NULL;
-
RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
fd = open("/dev/zero", O_RDONLY);
return NULL;
}
do {
- addr = mmap(addr,
+ addr_hint = get_addr_hint();
+
+ addr = mmap(addr_hint,
(*size) + hugepage_sz, PROT_READ,
#ifdef RTE_ARCH_PPC_64
MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
MAP_PRIVATE,
#endif
fd, 0);
- if (addr == MAP_FAILED)
+ if (addr == MAP_FAILED) {
+ /* map failed. Let's try with less memory */
*size -= hugepage_sz;
+ } else if (addr_hint && addr != addr_hint) {
+ /* hint was not used. Try with another offset */
+ munmap(addr, (*size) + hugepage_sz);
+ addr = MAP_FAILED;
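+ /* move the hint up by 4GB and retry with the same size */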
+ baseaddr_offset += 0x100000000;
+ }
} while (addr == MAP_FAILED && *size > 0);
if (addr == MAP_FAILED) {
* hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
* virtual address is stored in hugepg_tbl[i].orig_va, else it is stored
* in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
- * map continguous physical blocks in contiguous virtual blocks.
+ * map contiguous physical blocks in contiguous virtual blocks.
*/
static unsigned
map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
int node_id = -1;
int essential_prev = 0;
int oldpolicy;
- struct bitmask *oldmask = numa_allocate_nodemask();
+ struct bitmask *oldmask = NULL;
bool have_numa = true;
unsigned long maxnode = 0;
if (orig && have_numa) {
RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
+ oldmask = numa_allocate_nodemask();
if (get_mempolicy(&oldpolicy, oldmask->maskp,
oldmask->size + 1, 0, 0) < 0) {
RTE_LOG(ERR, EAL,
}
#endif
+#ifdef RTE_ARCH_64
+ /*
+ * Hugepages are first mmapped individually and then re-mmapped into
+ * another region so that physically contiguous pages end up at
+ * contiguous virtual addresses. Here we set vma_addr for the first
+ * hugepage to a virtual address that will not collide with that
+ * second mapping later on. The following hugepages use increments
+ * of this initial address.
+ *
+ * The final virtual address will be based on baseaddr, which is
+ * 0x100000000 (4GB). The hint used here starts at 0x200000000,
+ * leaving another 4GB just in case, plus the total size of the
+ * available hugepage memory.
+ */
+ vma_addr = (char *)0x200000000 + (hpi->hugepage_sz * hpi->num_pages[0]);
+#endif
for (i = 0; i < hpi->num_pages[0]; i++) {
uint64_t hugepage_sz = hpi->hugepage_sz;
hugepg_tbl[i].orig_va = virtaddr;
}
else {
+ /* rewrite physical addresses in IOVA as VA mode */
+ if (rte_eal_iova_mode() == RTE_IOVA_VA)
+ hugepg_tbl[i].physaddr = (uintptr_t)virtaddr;
hugepg_tbl[i].final_va = virtaddr;
}
numa_set_localalloc();
}
}
- numa_free_cpumask(oldmask);
+ if (oldmask != NULL)
+ numa_free_cpumask(oldmask);
#endif
return i;
}
}
retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
close(fd);
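+ /* return NULL rather than MAP_FAILED so callers can just test for NULL */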
+ if (retval == MAP_FAILED)
+ return NULL;
return retval;
}
strerror(errno));
return -1;
}
- mcfg->memseg[0].phys_addr = RTE_BAD_PHYS_ADDR;
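+ /* without hugepages there is a single memseg backed by normal pages;
+ * in VA mode its IOVA is the mapping address itself, otherwise no
+ * valid IOVA is known */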
+ if (rte_eal_iova_mode() == RTE_IOVA_VA)
+ mcfg->memseg[0].iova = (uintptr_t)addr;
+ else
+ mcfg->memseg[0].iova = RTE_BAD_IOVA;
mcfg->memseg[0].addr = addr;
mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
mcfg->memseg[0].len = internal_config.memory;
return 0;
}
-/* check if app runs on Xen Dom0 */
- if (internal_config.xen_dom0_support) {
-#ifdef RTE_LIBRTE_XEN_DOM0
- /* use dom0_mm kernel driver to init memory */
- if (rte_xen_dom0_memory_init() < 0)
- return -1;
- else
- return 0;
-#endif
- }
-
/* calculate total number of hugepages available. at this point we haven't
* yet started sorting them so they all are on socket 0 */
for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
continue;
}
- if (phys_addrs_available) {
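+ /* in IOVA-as-VA mode physaddr is filled from the final virtual
+ * address in map_all_hugepages(), so the pagemap walk is skipped */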
+ if (phys_addrs_available &&
+ rte_eal_iova_mode() != RTE_IOVA_VA) {
/* find physical addresses for each hugepage */
if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
RTE_LOG(DEBUG, EAL, "Failed to find phys addr "
if (j == RTE_MAX_MEMSEG)
break;
- mcfg->memseg[j].phys_addr = hugepage[i].physaddr;
+ mcfg->memseg[j].iova = hugepage[i].physaddr;
mcfg->memseg[j].addr = hugepage[i].final_va;
mcfg->memseg[j].len = hugepage[i].size;
mcfg->memseg[j].socket_id = hugepage[i].socket_id;
#ifdef RTE_ARCH_PPC_64
/* Use the phy and virt address of the last page as segment
* address for IBM Power architecture */
- mcfg->memseg[j].phys_addr = hugepage[i].physaddr;
+ mcfg->memseg[j].iova = hugepage[i].physaddr;
mcfg->memseg[j].addr = hugepage[i].final_va;
#endif
mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
test_phys_addrs_available();
- if (internal_config.xen_dom0_support) {
-#ifdef RTE_LIBRTE_XEN_DOM0
- if (rte_xen_dom0_memory_attach() < 0) {
- RTE_LOG(ERR, EAL, "Failed to attach memory segments of primary "
- "process\n");
- return -1;
- }
- return 0;
-#endif
- }
-
fd_zero = open("/dev/zero", O_RDONLY);
if (fd_zero < 0) {
RTE_LOG(ERR, EAL, "Could not open /dev/zero\n");
return -1;
}
-bool
+int
rte_eal_using_phys_addrs(void)
{
return phys_addrs_available;