#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
-#include <linux/mempolicy.h>
-#include <linux/memfd.h>
+#include <unistd.h>
+#include <sched.h>
#include <vppinfra/format.h>
-#include <vppinfra/linux/syscall.h>
#include <vppinfra/linux/sysfs.h>
#include <vppinfra/mem.h>
#include <vppinfra/hash.h>
#include <vppinfra/pmalloc.h>
+#include <vppinfra/cpu.h>
#if __SIZEOF_POINTER__ >= 8
#define DEFAULT_RESERVED_MB 16384
return round_pow2 (size, 1ULL << log2_page_sz) >> log2_page_sz;
}
-static inline int
-pmalloc_validate_numa_node (u32 * numa_node)
-{
- if (*numa_node == CLIB_PMALLOC_NUMA_LOCAL)
- {
- u32 cpu;
- if (getcpu (&cpu, numa_node, 0) != 0)
- return 1;
- }
- return 0;
-}
-
-int
+__clib_export int /* returns 0 on success; -1 with pm->error set when the VA reservation fails */
clib_pmalloc_init (clib_pmalloc_main_t * pm, uword base_addr, uword size)
{
- uword off, pagesize;
+ uword base, pagesize; /* base: result of clib_mem_vm_reserve (); pagesize: default hugepage size */
u64 *pt = 0;
- int mmap_flags;
ASSERT (pm->error == 0);
pagesize = clib_mem_get_default_hugepage_size ();
pm->def_log2_page_sz = min_log2 (pagesize);
- pm->sys_log2_page_sz = min_log2 (sysconf (_SC_PAGESIZE));
pm->lookup_log2_page_sz = pm->def_log2_page_sz;
/* check if pagemap is accessible */
- pt = clib_mem_vm_get_paddr (&pt, pm->sys_log2_page_sz, 1);
+ pt = clib_mem_vm_get_paddr (&pt, CLIB_MEM_PAGE_SZ_DEFAULT, 1); /* probe uses the address of the local 'pt' itself, count 1 */
if (pt == 0 || pt[0] == 0)
pm->flags |= CLIB_PMALLOC_F_NO_PAGEMAP;
pm->max_pages = size >> pm->def_log2_page_sz;
- /* reserve VA space for future growth */
- mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+ base = clib_mem_vm_reserve (base_addr, size, pm->def_log2_page_sz); /* reserve VA space for future growth; takes over from the mmap/munmap trimming on the '-' lines */
- if (base_addr)
- mmap_flags |= MAP_FIXED;
-
- pm->base = mmap (uword_to_pointer (base_addr, void *), size + pagesize,
- PROT_NONE, mmap_flags, -1, 0);
-
- if (pm->base == MAP_FAILED)
+ if (base == ~0) /* ~0 is handled here as the reservation-failure value */
{
- pm->error = clib_error_return_unix (0, "failed to reserve %u pages");
+ pm->error = clib_error_return (0, "failed to reserve %u pages",
+ pm->max_pages); /* supplies the argument the old "%u" format was missing */
return -1;
}
- off = round_pow2 (pointer_to_uword (pm->base), pagesize) -
- pointer_to_uword (pm->base);
-
- /* trim start and end of reservation to be page aligned */
- if (off)
- {
- munmap (pm->base, off);
- pm->base += off;
- }
-
- munmap (pm->base + ((uword) pm->max_pages * pagesize), pagesize - off);
+ pm->base = uword_to_pointer (base, void *); /* keep the reserved base address in pm */
return 0;
}
{
va = pointer_to_uword (pm->base) + (p << pm->lookup_log2_page_sz);
pa = 0;
- seek = (va >> pm->sys_log2_page_sz) * sizeof (pa);
+ seek = (va >> clib_mem_get_log2_page_size ()) * sizeof (pa);
if (fd != -1 && lseek (fd, seek, SEEK_SET) == seek &&
read (fd, &pa, sizeof (pa)) == (sizeof (pa)) &&
pa & (1ULL << 63) /* page present bit */ )
{
- pa = (pa & pow2_mask (55)) << pm->sys_log2_page_sz;
+ pa = (pa & pow2_mask (55)) << clib_mem_get_log2_page_size ();
}
pm->lookup_table[p] = va - pa;
p++;
pmalloc_map_pages (clib_pmalloc_main_t * pm, clib_pmalloc_arena_t * a,
u32 numa_node, u32 n_pages)
{
+ clib_mem_page_stats_t stats = {};
clib_pmalloc_page_t *pp = 0;
- int status, rv, i, mmap_flags;
- void *va;
- int old_mpol = -1;
- long unsigned int mask[16] = { 0 };
- long unsigned int old_mask[16] = { 0 };
+ int rv, i, mmap_flags;
+ void *va = MAP_FAILED;
uword size = (uword) n_pages << pm->def_log2_page_sz;
clib_error_free (pm->error);
return 0;
}
- if (a->log2_subpage_sz != pm->sys_log2_page_sz)
+ if (a->log2_subpage_sz != clib_mem_get_log2_page_size ())
{
pm->error = clib_sysfs_prealloc_hugepages (numa_node,
a->log2_subpage_sz, n_pages);
return 0;
}
- rv = get_mempolicy (&old_mpol, old_mask, sizeof (old_mask) * 8 + 1, 0, 0);
- /* failure to get mempolicy means we can only proceed with numa 0 maps */
- if (rv == -1 && numa_node != 0)
- {
- pm->error = clib_error_return_unix (0, "failed to get mempolicy");
- return 0;
- }
-
- mask[0] = 1 << numa_node;
- rv = set_mempolicy (MPOL_BIND, mask, sizeof (mask) * 8 + 1);
- if (rv == -1 && numa_node != 0)
+ rv = clib_mem_set_numa_affinity (numa_node, /* force */ 1);
+ if (rv == CLIB_MEM_ERROR && numa_node != 0)
{
pm->error = clib_error_return_unix (0, "failed to set mempolicy for "
"numa node %u", numa_node);
mmap_flags = MAP_FIXED;
- if ((pm->flags & CLIB_PMALLOC_F_NO_PAGEMAP) == 0)
- mmap_flags |= MAP_LOCKED;
-
if (a->flags & CLIB_PMALLOC_ARENA_F_SHARED_MEM)
{
mmap_flags |= MAP_SHARED;
- if (a->log2_subpage_sz != pm->sys_log2_page_sz)
- pm->error = clib_mem_create_hugetlb_fd ((char *) a->name, &a->fd);
- else
- pm->error = clib_mem_create_fd ((char *) a->name, &a->fd);
+ a->fd = clib_mem_vm_create_fd (a->log2_subpage_sz, "%s", a->name);
if (a->fd == -1)
goto error;
if ((ftruncate (a->fd, size)) == -1)
}
else
{
- if (a->log2_subpage_sz != pm->sys_log2_page_sz)
+ if (a->log2_subpage_sz != clib_mem_get_log2_page_size ())
mmap_flags |= MAP_HUGETLB;
mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;
pm->error = clib_error_return_unix (0, "failed to mmap %u pages at %p "
"fd %d numa %d flags 0x%x", n_pages,
va, a->fd, numa_node, mmap_flags);
+ va = MAP_FAILED;
+ goto error;
+ }
+
+ if (a->log2_subpage_sz != clib_mem_get_log2_page_size () &&
+ mlock (va, size) != 0)
+ {
+ pm->error = clib_error_return_unix (0, "Unable to lock pages");
goto error;
}
clib_memset (va, 0, size);
- rv = set_mempolicy (old_mpol, old_mask, sizeof (old_mask) * 8 + 1);
- if (rv == -1 && numa_node != 0)
+ rv = clib_mem_set_default_numa_affinity ();
+ if (rv == CLIB_MEM_ERROR && numa_node != 0)
{
pm->error = clib_error_return_unix (0, "failed to restore mempolicy");
goto error;
/* we tolerate move_pages failure only if request os for numa node 0
to support non-numa kernels */
- rv = move_pages (0, 1, &va, 0, &status, 0);
- if ((rv == 0 && status != numa_node) || (rv != 0 && numa_node != 0))
+ clib_mem_get_page_stats (va, CLIB_MEM_PAGE_SZ_DEFAULT, 1, &stats);
+
+ if (stats.per_numa[numa_node] != 1 &&
+ !(numa_node == 0 && stats.unknown == 1))
{
- pm->error = rv == -1 ?
- clib_error_return_unix (0, "page allocated on wrong node, numa node "
- "%u status %d", numa_node, status) :
- clib_error_return (0, "page allocated on wrong node, numa node "
- "%u status %d", numa_node, status);
+ /* stays ~0 when no per-numa counter reports the page */
+ u16 allocated_at = ~0;
+
+ /* clib_error_return () only builds the error object; it must be stored
+ * in pm->error, otherwise it is leaked and the caller gets a 0 return
+ * with no error attached (every other failure path here sets pm->error) */
+ if (stats.unknown)
+ pm->error = clib_error_return (0,
+ "unable to get information about numa allocation");
+
+ for (u16 i = 0; i < CLIB_MAX_NUMAS; i++)
+ if (stats.per_numa[i] == 1)
+ allocated_at = i;
+
+ /* chain onto the error recorded above (if any) so neither message is lost */
+ pm->error = clib_error_return (pm->error,
+ "page allocated on the wrong numa node (%u), "
+ "expected %u",
+ allocated_at, numa_node);
- /* unmap & reesrve */
- munmap (va, size);
- mmap (va, size, PROT_NONE, MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
- -1, 0);
goto error;
}
return pp - (n_pages - 1);
error:
+ if (va != MAP_FAILED)
+ {
+ /* unmap & reserve */
+ munmap (va, size);
+ mmap (va, size, PROT_NONE, MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
+ -1, 0);
+ }
if (a->fd != -1)
close (a->fd);
return 0;
}
-void *
+__clib_export void *
clib_pmalloc_create_shared_arena (clib_pmalloc_main_t * pm, char *name,
uword size, u32 log2_page_sz, u32 numa_node)
{
if (log2_page_sz == 0)
log2_page_sz = pm->def_log2_page_sz;
else if (log2_page_sz != pm->def_log2_page_sz &&
- log2_page_sz != pm->sys_log2_page_sz)
+ log2_page_sz != clib_mem_get_log2_page_size ())
{
pm->error = clib_error_create ("unsupported page size (%uKB)",
1 << (log2_page_sz - 10));
if (n_pages + vec_len (pm->pages) > pm->max_pages)
return 0;
- if (pmalloc_validate_numa_node (&numa_node))
- return 0;
+ if (numa_node == CLIB_PMALLOC_NUMA_LOCAL)
+ numa_node = clib_get_current_numa_node ();
pool_get (pm->arenas, a);
a->index = a - pm->arenas;
ASSERT (is_pow2 (align));
- if (pmalloc_validate_numa_node (&numa_node))
- return 0;
+ if (numa_node == CLIB_PMALLOC_NUMA_LOCAL)
+ numa_node = clib_get_current_numa_node ();
if (a == 0)
{
return 0;
}
-void *
+__clib_export void *
clib_pmalloc_alloc_aligned_on_numa (clib_pmalloc_main_t * pm, uword size,
uword align, u32 numa_node)
{
return 1;
}
-void
+__clib_export void
clib_pmalloc_free (clib_pmalloc_main_t * pm, void *va)
{
clib_pmalloc_page_t *pp;
}
}
-static u8 *
-format_log2_page_size (u8 * s, va_list * va)
-{
- u32 log2_page_sz = va_arg (*va, u32);
-
- if (log2_page_sz >= 30)
- return format (s, "%uGB", 1 << (log2_page_sz - 30));
-
- if (log2_page_sz >= 20)
- return format (s, "%uMB", 1 << (log2_page_sz - 20));
-
- if (log2_page_sz >= 10)
- return format (s, "%uKB", 1 << (log2_page_sz - 10));
-
- return format (s, "%uB", 1 << log2_page_sz);
-}
-
-
static u8 *
format_pmalloc_page (u8 * s, va_list * va)
{
return s;
}
-u8 *
+__clib_export u8 *
format_pmalloc (u8 * s, va_list * va)
{
clib_pmalloc_main_t *pm = va_arg (*va, clib_pmalloc_main_t *);
/* *INDENT-OFF* */
- pool_foreach (a, pm->arenas,
+ pool_foreach (a, pm->arenas)
{
u32 *page_index;
s = format (s, "\n%Uarena '%s' pages %u subpage-size %U numa-node %u",
s = format (s, "\n%U%U", format_white_space, indent + 4,
format_pmalloc_page, pp, verbose);
}
- });
+ }
/* *INDENT-ON* */
return s;
}
-u8 *
+__clib_export u8 *
format_pmalloc_map (u8 * s, va_list * va)
{
clib_pmalloc_main_t *pm = va_arg (*va, clib_pmalloc_main_t *);