#include <vppinfra/clib.h>
#include <vppinfra/mem.h>
+#include <vppinfra/lock.h>
#include <vppinfra/time.h>
#include <vppinfra/format.h>
#include <vppinfra/clib_error.h>
-#include <vppinfra/linux/syscall.h>
#include <vppinfra/linux/sysfs.h>
#ifndef F_LINUX_SPECIFIC_BASE
#define MAP_FIXED_NOREPLACE 0x100000
#endif
-uword
-clib_mem_get_default_hugepage_size (void)
+/*
+ * Spin until clib_mem_main.map_lock has been taken by this caller.
+ * CLIB_PAUSE () is issued between attempts while the lock is held elsewhere.
+ * Must be paired with map_unlock ().
+ */
+static void
+map_lock (void)
{
- unformat_input_t input;
- static u32 size = 0;
- int fd;
-
- if (size)
- goto done;
-
- /*
- * If the kernel doesn't support hugepages, /proc/meminfo won't
- * say anything about it. Use the regular page size as a default.
- */
- size = clib_mem_get_page_size () / 1024;
-
- if ((fd = open ("/proc/meminfo", 0)) == -1)
- return 0;
-
- unformat_init_clib_file (&input, fd);
+ while (clib_atomic_test_and_set (&clib_mem_main.map_lock))
+ CLIB_PAUSE ();
+}
- while (unformat_check_input (&input) != UNFORMAT_END_OF_INPUT)
- {
- if (unformat (&input, "Hugepagesize:%_%u kB", &size))
- ;
- else
- unformat_skip_line (&input);
- }
- unformat_free (&input);
- close (fd);
-done:
- return 1024ULL * size;
+/*
+ * Release clib_mem_main.map_lock previously taken with map_lock ().
+ */
+static void
+map_unlock (void)
+{
+ clib_atomic_release (&clib_mem_main.map_lock);
}
static clib_mem_page_sz_t
mm->log2_page_sz = min_log2 (page_size);
/* default system hugepage size */
- if ((fd = memfd_create ("test", MFD_HUGETLB)) != -1)
+ if ((fd = syscall (__NR_memfd_create, "test", MFD_HUGETLB)) != -1)
{
mm->log2_default_hugepage_sz = clib_mem_get_fd_log2_page_size (fd);
close (fd);
else /* likely kernel older than 4.14 */
mm->log2_default_hugepage_sz = legacy_get_log2_default_hugepage_size ();
+ mm->log2_sys_default_hugepage_sz = mm->log2_default_hugepage_sz;
+
/* numa nodes */
va = mmap (0, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE |
MAP_ANONYMOUS, -1, 0);
for (int i = 0; i < CLIB_MAX_NUMAS; i++)
{
int status;
- if (move_pages (0, 1, &va, &i, &status, 0) == 0)
+ if (syscall (__NR_move_pages, 0, 1, &va, &i, &status, 0) == 0)
mm->numa_node_bitmap |= 1ULL << i;
}
munmap (va, page_size);
}
-u64
+__clib_export u64
clib_mem_get_fd_page_size (int fd)
{
struct stat st = { 0 };
return st.st_blksize;
}
-clib_mem_page_sz_t
+/*
+ * Return the log2 of the page size that clib_mem_get_fd_page_size ()
+ * reports for fd, or CLIB_MEM_PAGE_SZ_UNKNOWN when that size is 0.
+ */
+__clib_export clib_mem_page_sz_t
clib_mem_get_fd_log2_page_size (int fd)
{
uword page_size = clib_mem_get_fd_page_size (fd);
return page_size ? min_log2 (page_size) : CLIB_MEM_PAGE_SZ_UNKNOWN;
}
-void
+__clib_export void
clib_mem_vm_randomize_va (uword * requested_va,
clib_mem_page_sz_t log2_page_size)
{
(clib_cpu_time_now () & bit_mask) * (1ull << log2_page_size);
}
-clib_error_t *
-clib_mem_vm_ext_alloc (clib_mem_vm_alloc_t * a)
-{
- clib_mem_main_t *mm = &clib_mem_main;
- int fd = -1;
- clib_error_t *err = 0;
- void *addr = 0;
- u8 *filename = 0;
- int mmap_flags = 0;
- int log2_page_size;
- int n_pages;
- int old_mpol = -1;
- long unsigned int old_mask[16] = { 0 };
-
- /* save old numa mem policy if needed */
- if (a->flags & (CLIB_MEM_VM_F_NUMA_PREFER | CLIB_MEM_VM_F_NUMA_FORCE))
- {
- int rv;
- rv = get_mempolicy (&old_mpol, old_mask, sizeof (old_mask) * 8 + 1,
- 0, 0);
-
- if (rv == -1)
- {
- if (a->numa_node != 0 && (a->flags & CLIB_MEM_VM_F_NUMA_FORCE) != 0)
- {
- err = clib_error_return_unix (0, "get_mempolicy");
- goto error;
- }
- else
- old_mpol = -1;
- }
- }
-
- if (a->flags & CLIB_MEM_VM_F_LOCKED)
- mmap_flags |= MAP_LOCKED;
-
- /* if we are creating shared segment, we need file descriptor */
- if (a->flags & CLIB_MEM_VM_F_SHARED)
- {
- mmap_flags |= MAP_SHARED;
- /* if hugepages are needed we need to create mount point */
- if (a->flags & CLIB_MEM_VM_F_HUGETLB)
- {
- log2_page_size = CLIB_MEM_PAGE_SZ_DEFAULT_HUGE;
- mmap_flags |= MAP_LOCKED;
- }
- else
- log2_page_size = CLIB_MEM_PAGE_SZ_DEFAULT;
-
- if ((fd = clib_mem_vm_create_fd (log2_page_size, "%s", a->name)) == -1)
- {
- err = clib_error_return (0, "%U", format_clib_error, mm->error);
- goto error;
- }
-
- log2_page_size = clib_mem_get_fd_log2_page_size (fd);
- if (log2_page_size == 0)
- {
- err = clib_error_return_unix (0, "cannot determine page size");
- goto error;
- }
-
- if (a->requested_va)
- {
- clib_mem_vm_randomize_va (&a->requested_va, log2_page_size);
- mmap_flags |= MAP_FIXED;
- }
- }
- else /* not CLIB_MEM_VM_F_SHARED */
- {
- mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;
- if (a->flags & CLIB_MEM_VM_F_HUGETLB)
- {
- mmap_flags |= MAP_HUGETLB;
- log2_page_size = 21;
- }
- else
- {
- log2_page_size = min_log2 (sysconf (_SC_PAGESIZE));
- }
- }
-
- n_pages = ((a->size - 1) >> log2_page_size) + 1;
-
- if (a->flags & CLIB_MEM_VM_F_HUGETLB_PREALLOC)
- {
- err = clib_sysfs_prealloc_hugepages (a->numa_node, log2_page_size,
- n_pages);
- if (err)
- goto error;
-
- }
-
- if (fd != -1)
- if ((ftruncate (fd, (u64) n_pages * (1 << log2_page_size))) == -1)
- {
- err = clib_error_return_unix (0, "ftruncate");
- goto error;
- }
-
- if (old_mpol != -1)
- {
- int rv;
- long unsigned int mask[16] = { 0 };
- mask[0] = 1 << a->numa_node;
- rv = set_mempolicy (MPOL_BIND, mask, sizeof (mask) * 8 + 1);
- if (rv == -1 && a->numa_node != 0 &&
- (a->flags & CLIB_MEM_VM_F_NUMA_FORCE) != 0)
- {
- err = clib_error_return_unix (0, "set_mempolicy");
- goto error;
- }
- }
-
- addr = mmap (uword_to_pointer (a->requested_va, void *), a->size,
- (PROT_READ | PROT_WRITE), mmap_flags, fd, 0);
- if (addr == MAP_FAILED)
- {
- err = clib_error_return_unix (0, "mmap");
- goto error;
- }
-
- /* re-apply old numa memory policy */
- if (old_mpol != -1 &&
- set_mempolicy (old_mpol, old_mask, sizeof (old_mask) * 8 + 1) == -1)
- {
- err = clib_error_return_unix (0, "set_mempolicy");
- goto error;
- }
-
- a->log2_page_size = log2_page_size;
- a->n_pages = n_pages;
- a->addr = addr;
- a->fd = fd;
- CLIB_MEM_UNPOISON (addr, a->size);
- goto done;
-
-error:
- if (fd != -1)
- close (fd);
-
-done:
- vec_free (filename);
- return err;
-}
-
-void
-clib_mem_vm_ext_free (clib_mem_vm_alloc_t * a)
-{
- if (a != 0)
- {
- clib_mem_vm_free (a->addr, 1ull << a->log2_page_size);
- if (a->fd != -1)
- close (a->fd);
- }
-}
-
static int
legacy_memfd_create (u8 * name)
{
clib_mem_main_t *mm = &clib_mem_main;
int fd = -1;
char *mount_dir;
+ u8 *temp;
u8 *filename;
+ /*
+ * mkdtemp () rewrites the trailing "XXXXXX" of its template in place, so
+ * the template "/tmp/hugepage_mount.XXXXXX" must live in writable memory
+ * rather than in a string constant. Build it as a NUL-terminated vector.
+ */
+ temp = format (0, "/tmp/hugepage_mount.XXXXXX%c", 0);
+
/* create mount directory */
- if ((mount_dir = mkdtemp ("/tmp/hugepage_mount.XXXXXX")) == 0)
+ if ((mount_dir = mkdtemp ((char *) temp)) == 0)
{
+ vec_free (temp);
vec_reset_length (mm->error);
mm->error = clib_error_return_unix (mm->error, "mkdtemp");
return CLIB_MEM_ERROR;
if (mount ("none", mount_dir, "hugetlbfs", 0, NULL))
{
+ vec_free (temp);
rmdir ((char *) mount_dir);
vec_reset_length (mm->error);
mm->error = clib_error_return_unix (mm->error, "mount");
umount2 ((char *) mount_dir, MNT_DETACH);
rmdir ((char *) mount_dir);
vec_free (filename);
+ vec_free (temp);
return fd;
}
-int
+__clib_export int
clib_mem_vm_create_fd (clib_mem_page_sz_t log2_page_size, char *fmt, ...)
{
clib_mem_main_t *mm = &clib_mem_main;
if (log2_page_size == mm->log2_page_sz)
log2_page_size = CLIB_MEM_PAGE_SZ_DEFAULT;
+ else if (log2_page_size == mm->log2_sys_default_hugepage_sz)
+ log2_page_size = CLIB_MEM_PAGE_SZ_DEFAULT_HUGE;
switch (log2_page_size)
{
vec_add1 (s, 0);
/* memfd_create introduced in kernel 3.17, we don't support older kernels */
- fd = memfd_create ((char *) s, memfd_flags);
+ fd = syscall (__NR_memfd_create, (char *) s, memfd_flags);
/* kernel versions < 4.14 do not support memfd_create for huge pages */
if (fd == -1 && errno == EINVAL &&
return (uword) base + sys_page_sz;
}
-clib_mem_vm_map_hdr_t *
+__clib_export clib_mem_vm_map_hdr_t *
clib_mem_vm_get_next_map_hdr (clib_mem_vm_map_hdr_t * hdr)
{
clib_mem_main_t *mm = &clib_mem_main;
- uword sys_page_sz = 1 << mm->log2_page_sz;
+ uword sys_page_sz = 1ULL << mm->log2_page_sz;
clib_mem_vm_map_hdr_t *next;
if (hdr == 0)
{
{
clib_mem_main_t *mm = &clib_mem_main;
clib_mem_vm_map_hdr_t *hdr;
- uword sys_page_sz = 1 << mm->log2_page_sz;
+ uword sys_page_sz = 1ULL << mm->log2_page_sz;
int mmap_flags = MAP_FIXED, is_huge = 0;
if (fd != -1)
if (log2_page_sz == CLIB_MEM_PAGE_SZ_UNKNOWN)
return CLIB_MEM_VM_MAP_FAILED;
- size = round_pow2 (size, 1 << log2_page_sz);
+ size = round_pow2 (size, 1ULL << log2_page_sz);
base = (void *) clib_mem_vm_reserve ((uword) base, size, log2_page_sz);
return CLIB_MEM_VM_MAP_FAILED;
}
+ map_lock ();
+
if (mm->last_map)
{
mprotect (mm->last_map, sys_page_sz, PROT_READ | PROT_WRITE);
else
mm->first_map = hdr;
+ CLIB_MEM_UNPOISON (hdr, sys_page_sz);
hdr->next = 0;
hdr->prev = mm->last_map;
+ snprintf (hdr->name, CLIB_VM_MAP_HDR_NAME_MAX_LEN - 1, "%s", (char *) name);
mm->last_map = hdr;
+ map_unlock ();
+
hdr->base_addr = (uword) base;
hdr->log2_page_sz = log2_page_sz;
hdr->num_pages = size >> log2_page_sz;
hdr->fd = fd;
- snprintf (hdr->name, CLIB_VM_MAP_HDR_NAME_MAX_LEN - 1, "%s", (char *) name);
hdr->name[CLIB_VM_MAP_HDR_NAME_MAX_LEN - 1] = 0;
mprotect (hdr, sys_page_sz, PROT_NONE);
return base;
}
-int
+__clib_export int
clib_mem_vm_unmap (void *base)
{
clib_mem_main_t *mm = &clib_mem_main;
- uword size, sys_page_sz = 1 << mm->log2_page_sz;
+ uword size, sys_page_sz = 1ULL << mm->log2_page_sz;
clib_mem_vm_map_hdr_t *hdr = base - sys_page_sz;;
+ map_lock ();
if (mprotect (hdr, sys_page_sz, PROT_READ | PROT_WRITE) != 0)
- return CLIB_MEM_ERROR;
+ goto out;
size = hdr->num_pages << hdr->log2_page_sz;
if (munmap ((void *) hdr->base_addr, size) != 0)
- return CLIB_MEM_ERROR;
+ goto out;
if (hdr->next)
{
else
mm->first_map = hdr->next;
+ map_unlock ();
+
if (munmap (hdr, sys_page_sz) != 0)
return CLIB_MEM_ERROR;
return 0;
+out:
+ map_unlock ();
+ return CLIB_MEM_ERROR;
}
-void
+__clib_export void
clib_mem_get_page_stats (void *start, clib_mem_page_sz_t log2_page_size,
uword n_pages, clib_mem_page_stats_t * stats)
{
ptr[i] = start + (i << log2_page_size);
clib_memset (stats, 0, sizeof (clib_mem_page_stats_t));
+ stats->total = n_pages;
+ stats->log2_page_sz = log2_page_size;
- if (move_pages (0, n_pages, ptr, 0, status, 0) != 0)
+ if (syscall (__NR_move_pages, 0, n_pages, ptr, 0, status, 0) != 0)
{
stats->unknown = n_pages;
- return;
+ goto done;
}
for (i = 0; i < n_pages; i++)
else
stats->unknown++;
}
+
+done:
+ vec_free (status);
+ vec_free (ptr);
}
-u64 *
+__clib_export u64 *
clib_mem_vm_get_paddr (void *mem, clib_mem_page_sz_t log2_page_size,
int n_pages)
{
return r;
}
-int
+__clib_export int
clib_mem_set_numa_affinity (u8 numa_node, int force)
{
clib_mem_main_t *mm = &clib_mem_main;
mask[0] = 1 << numa_node;
- if (set_mempolicy (force ? MPOL_BIND : MPOL_PREFERRED, mask, mask_len))
+ if (syscall (__NR_set_mempolicy, force ? MPOL_BIND : MPOL_PREFERRED, mask,
+ mask_len))
goto error;
vec_reset_length (mm->error);
return CLIB_MEM_ERROR;
}
-int
+__clib_export int
clib_mem_set_default_numa_affinity ()
{
clib_mem_main_t *mm = &clib_mem_main;
- if (set_mempolicy (MPOL_DEFAULT, 0, 0))
+ if (syscall (__NR_set_mempolicy, MPOL_DEFAULT, 0, 0))
{
vec_reset_length (mm->error);
mm->error = clib_error_return_unix (mm->error, (char *) __func__);