#include <vppinfra/clib.h>
#include <vppinfra/mem.h>
+#include <vppinfra/lock.h>
#include <vppinfra/time.h>
#include <vppinfra/format.h>
#include <vppinfra/clib_error.h>
-#include <vppinfra/linux/syscall.h>
#include <vppinfra/linux/sysfs.h>
#ifndef F_LINUX_SPECIFIC_BASE
#define MAP_FIXED_NOREPLACE 0x100000
#endif
-uword
+/*
+ * Acquire the spinlock stored in clib_mem_main.map_lock.
+ *
+ * Busy-waits: the atomic test-and-set is retried, with CLIB_PAUSE () between
+ * attempts, until it reports that the flag was previously clear. Callers in
+ * this file hold the lock while updating the first_map / last_map header
+ * list and must pair every call with map_unlock ().
+ */
+static void
+map_lock (void)
+{
+  while (clib_atomic_test_and_set (&clib_mem_main.map_lock))
+    CLIB_PAUSE ();
+}
+
+/*
+ * Release the spinlock stored in clib_mem_main.map_lock, previously taken
+ * with map_lock ().
+ */
+static void
+map_unlock (void)
+{
+  clib_atomic_release (&clib_mem_main.map_lock);
+}
+
+__clib_export uword
clib_mem_get_default_hugepage_size (void)
{
unformat_input_t input;
mm->log2_page_sz = min_log2 (page_size);
/* default system hugeppage size */
- if ((fd = memfd_create ("test", MFD_HUGETLB)) != -1)
+ if ((fd = syscall (__NR_memfd_create, "test", MFD_HUGETLB)) != -1)
{
mm->log2_default_hugepage_sz = clib_mem_get_fd_log2_page_size (fd);
close (fd);
for (int i = 0; i < CLIB_MAX_NUMAS; i++)
{
int status;
- if (move_pages (0, 1, &va, &i, &status, 0) == 0)
+ if (syscall (__NR_move_pages, 0, 1, &va, &i, &status, 0) == 0)
mm->numa_node_bitmap |= 1ULL << i;
}
munmap (va, page_size);
}
-u64
+__clib_export u64
clib_mem_get_fd_page_size (int fd)
{
struct stat st = { 0 };
return st.st_blksize;
}
-clib_mem_page_sz_t
+__clib_export clib_mem_page_sz_t
clib_mem_get_fd_log2_page_size (int fd)
{
uword page_size = clib_mem_get_fd_page_size (fd);
return page_size ? min_log2 (page_size) : CLIB_MEM_PAGE_SZ_UNKNOWN;
}
-void
+__clib_export void
clib_mem_vm_randomize_va (uword * requested_va,
clib_mem_page_sz_t log2_page_size)
{
(clib_cpu_time_now () & bit_mask) * (1ull << log2_page_size);
}
-clib_error_t *
-clib_mem_vm_ext_alloc (clib_mem_vm_alloc_t * a)
-{
- clib_mem_main_t *mm = &clib_mem_main;
- int fd = -1;
- clib_error_t *err = 0;
- void *addr = 0;
- u8 *filename = 0;
- int mmap_flags = 0;
- int log2_page_size;
- int n_pages;
- int old_mpol = -1;
- long unsigned int old_mask[16] = { 0 };
-
- /* save old numa mem policy if needed */
- if (a->flags & (CLIB_MEM_VM_F_NUMA_PREFER | CLIB_MEM_VM_F_NUMA_FORCE))
- {
- int rv;
- rv = get_mempolicy (&old_mpol, old_mask, sizeof (old_mask) * 8 + 1,
- 0, 0);
-
- if (rv == -1)
- {
- if (a->numa_node != 0 && (a->flags & CLIB_MEM_VM_F_NUMA_FORCE) != 0)
- {
- err = clib_error_return_unix (0, "get_mempolicy");
- goto error;
- }
- else
- old_mpol = -1;
- }
- }
-
- if (a->flags & CLIB_MEM_VM_F_LOCKED)
- mmap_flags |= MAP_LOCKED;
-
- /* if we are creating shared segment, we need file descriptor */
- if (a->flags & CLIB_MEM_VM_F_SHARED)
- {
- mmap_flags |= MAP_SHARED;
- /* if hugepages are needed we need to create mount point */
- if (a->flags & CLIB_MEM_VM_F_HUGETLB)
- {
- log2_page_size = CLIB_MEM_PAGE_SZ_DEFAULT_HUGE;
- mmap_flags |= MAP_LOCKED;
- }
- else
- log2_page_size = CLIB_MEM_PAGE_SZ_DEFAULT;
-
- if ((fd = clib_mem_vm_create_fd (log2_page_size, "%s", a->name)) == -1)
- {
- err = clib_error_return (0, "%U", format_clib_error, mm->error);
- goto error;
- }
-
- log2_page_size = clib_mem_get_fd_log2_page_size (fd);
- if (log2_page_size == 0)
- {
- err = clib_error_return_unix (0, "cannot determine page size");
- goto error;
- }
-
- if (a->requested_va)
- {
- clib_mem_vm_randomize_va (&a->requested_va, log2_page_size);
- mmap_flags |= MAP_FIXED;
- }
- }
- else /* not CLIB_MEM_VM_F_SHARED */
- {
- mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;
- if (a->flags & CLIB_MEM_VM_F_HUGETLB)
- {
- mmap_flags |= MAP_HUGETLB;
- log2_page_size = 21;
- }
- else
- {
- log2_page_size = min_log2 (sysconf (_SC_PAGESIZE));
- }
- }
-
- n_pages = ((a->size - 1) >> log2_page_size) + 1;
-
- if (a->flags & CLIB_MEM_VM_F_HUGETLB_PREALLOC)
- {
- err = clib_sysfs_prealloc_hugepages (a->numa_node, log2_page_size,
- n_pages);
- if (err)
- goto error;
-
- }
-
- if (fd != -1)
- if ((ftruncate (fd, (u64) n_pages * (1 << log2_page_size))) == -1)
- {
- err = clib_error_return_unix (0, "ftruncate");
- goto error;
- }
-
- if (old_mpol != -1)
- {
- int rv;
- long unsigned int mask[16] = { 0 };
- mask[0] = 1 << a->numa_node;
- rv = set_mempolicy (MPOL_BIND, mask, sizeof (mask) * 8 + 1);
- if (rv == -1 && a->numa_node != 0 &&
- (a->flags & CLIB_MEM_VM_F_NUMA_FORCE) != 0)
- {
- err = clib_error_return_unix (0, "set_mempolicy");
- goto error;
- }
- }
-
- addr = mmap (uword_to_pointer (a->requested_va, void *), a->size,
- (PROT_READ | PROT_WRITE), mmap_flags, fd, 0);
- if (addr == MAP_FAILED)
- {
- err = clib_error_return_unix (0, "mmap");
- goto error;
- }
-
- /* re-apply old numa memory policy */
- if (old_mpol != -1 &&
- set_mempolicy (old_mpol, old_mask, sizeof (old_mask) * 8 + 1) == -1)
- {
- err = clib_error_return_unix (0, "set_mempolicy");
- goto error;
- }
-
- a->log2_page_size = log2_page_size;
- a->n_pages = n_pages;
- a->addr = addr;
- a->fd = fd;
- CLIB_MEM_UNPOISON (addr, a->size);
- goto done;
-
-error:
- if (fd != -1)
- close (fd);
-
-done:
- vec_free (filename);
- return err;
-}
-
-void
-clib_mem_vm_ext_free (clib_mem_vm_alloc_t * a)
-{
- if (a != 0)
- {
- clib_mem_vm_free (a->addr, 1ull << a->log2_page_size);
- if (a->fd != -1)
- close (a->fd);
- }
-}
-
static int
legacy_memfd_create (u8 * name)
{
clib_mem_main_t *mm = &clib_mem_main;
int fd = -1;
char *mount_dir;
+ u8 *temp;
u8 *filename;
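+ /*
+ * mkdtemp() rewrites the trailing "XXXXXX" of its template in place, so the
+ * template "/tmp/hugepage_mount.XXXXXX" must not be a string constant.
+ * Build a writable, NUL-terminated copy with format() instead; it is
+ * released with vec_free() on every return path.
+ */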
+ temp = format (0, "/tmp/hugepage_mount.XXXXXX%c", 0);
+
/* create mount directory */
- if ((mount_dir = mkdtemp ("/tmp/hugepage_mount.XXXXXX")) == 0)
+ if ((mount_dir = mkdtemp ((char *) temp)) == 0)
{
+ vec_free (temp);
vec_reset_length (mm->error);
mm->error = clib_error_return_unix (mm->error, "mkdtemp");
- return -1;
+ return CLIB_MEM_ERROR;
}
if (mount ("none", mount_dir, "hugetlbfs", 0, NULL))
{
+ vec_free (temp);
rmdir ((char *) mount_dir);
vec_reset_length (mm->error);
mm->error = clib_error_return_unix (mm->error, "mount");
- return -1;
+ return CLIB_MEM_ERROR;
}
filename = format (0, "%s/%s%c", mount_dir, name, 0);
umount2 ((char *) mount_dir, MNT_DETACH);
rmdir ((char *) mount_dir);
vec_free (filename);
+ vec_free (temp);
return fd;
}
-int
+__clib_export int
clib_mem_vm_create_fd (clib_mem_page_sz_t log2_page_size, char *fmt, ...)
{
clib_mem_main_t *mm = &clib_mem_main;
if (log2_page_size == mm->log2_page_sz)
log2_page_size = CLIB_MEM_PAGE_SZ_DEFAULT;
+ else if (log2_page_size == mm->log2_default_hugepage_sz)
+ log2_page_size = CLIB_MEM_PAGE_SZ_DEFAULT_HUGE;
switch (log2_page_size)
{
case CLIB_MEM_PAGE_SZ_UNKNOWN:
- return -1;
+ return CLIB_MEM_ERROR;
case CLIB_MEM_PAGE_SZ_DEFAULT:
memfd_flags = MFD_ALLOW_SEALING;
break;
vec_add1 (s, 0);
/* memfd_create introduced in kernel 3.17, we don't support older kernels */
- fd = memfd_create ((char *) s, memfd_flags);
+ fd = syscall (__NR_memfd_create, (char *) s, memfd_flags);
/* kernel versions < 4.14 does not support memfd_create for huge pages */
if (fd == -1 && errno == EINVAL &&
vec_reset_length (mm->error);
mm->error = clib_error_return_unix (mm->error, "memfd_create");
vec_free (s);
- return -1;
+ return CLIB_MEM_ERROR;
}
vec_free (s);
vec_reset_length (mm->error);
mm->error = clib_error_return_unix (mm->error, "fcntl (F_ADD_SEALS)");
close (fd);
- return -1;
+ return CLIB_MEM_ERROR;
}
return fd;
return (uword) base + sys_page_sz;
}
-clib_mem_vm_map_hdr_t *
+__clib_export clib_mem_vm_map_hdr_t *
clib_mem_vm_get_next_map_hdr (clib_mem_vm_map_hdr_t * hdr)
{
clib_mem_main_t *mm = &clib_mem_main;
- uword sys_page_sz = 1 << mm->log2_page_sz;
+ uword sys_page_sz = 1ULL << mm->log2_page_sz;
clib_mem_vm_map_hdr_t *next;
if (hdr == 0)
{
{
clib_mem_main_t *mm = &clib_mem_main;
clib_mem_vm_map_hdr_t *hdr;
- uword sys_page_sz = 1 << mm->log2_page_sz;
+ uword sys_page_sz = 1ULL << mm->log2_page_sz;
int mmap_flags = MAP_FIXED, is_huge = 0;
if (fd != -1)
if (log2_page_sz == CLIB_MEM_PAGE_SZ_UNKNOWN)
return CLIB_MEM_VM_MAP_FAILED;
- size = round_pow2 (size, 1 << log2_page_sz);
+ size = round_pow2 (size, 1ULL << log2_page_sz);
base = (void *) clib_mem_vm_reserve ((uword) base, size, log2_page_sz);
return CLIB_MEM_VM_MAP_FAILED;
}
+ map_lock ();
+
if (mm->last_map)
{
mprotect (mm->last_map, sys_page_sz, PROT_READ | PROT_WRITE);
else
mm->first_map = hdr;
+ CLIB_MEM_UNPOISON (hdr, sys_page_sz);
hdr->next = 0;
hdr->prev = mm->last_map;
+ snprintf (hdr->name, CLIB_VM_MAP_HDR_NAME_MAX_LEN - 1, "%s", (char *) name);
mm->last_map = hdr;
+ map_unlock ();
+
hdr->base_addr = (uword) base;
hdr->log2_page_sz = log2_page_sz;
hdr->num_pages = size >> log2_page_sz;
hdr->fd = fd;
- snprintf (hdr->name, CLIB_VM_MAP_HDR_NAME_MAX_LEN - 1, "%s", (char *) name);
hdr->name[CLIB_VM_MAP_HDR_NAME_MAX_LEN - 1] = 0;
mprotect (hdr, sys_page_sz, PROT_NONE);
return base;
}
-int
+__clib_export int
clib_mem_vm_unmap (void *base)
{
clib_mem_main_t *mm = &clib_mem_main;
- uword size, sys_page_sz = 1 << mm->log2_page_sz;
+ uword size, sys_page_sz = 1ULL << mm->log2_page_sz;
clib_mem_vm_map_hdr_t *hdr = base - sys_page_sz;;
if (mprotect (hdr, sys_page_sz, PROT_READ | PROT_WRITE) != 0)
- return -1;
+ return CLIB_MEM_ERROR;
size = hdr->num_pages << hdr->log2_page_sz;
if (munmap ((void *) hdr->base_addr, size) != 0)
- return -1;
+ return CLIB_MEM_ERROR;
+
+ map_lock ();
if (hdr->next)
{
else
mm->first_map = hdr->next;
+ map_unlock ();
+
if (munmap (hdr, sys_page_sz) != 0)
- return -1;
+ return CLIB_MEM_ERROR;
return 0;
}
-void
+__clib_export void
clib_mem_get_page_stats (void *start, clib_mem_page_sz_t log2_page_size,
uword n_pages, clib_mem_page_stats_t * stats)
{
ptr[i] = start + (i << log2_page_size);
clib_memset (stats, 0, sizeof (clib_mem_page_stats_t));
+ stats->total = n_pages;
+ stats->log2_page_sz = log2_page_size;
- if (move_pages (0, n_pages, ptr, 0, status, 0) != 0)
+ if (syscall (__NR_move_pages, 0, n_pages, ptr, 0, status, 0) != 0)
{
stats->unknown = n_pages;
- return;
+ goto done;
}
for (i = 0; i < n_pages; i++)
else
stats->unknown++;
}
+
+done:
+ vec_free (status);
+ vec_free (ptr);
}
-u64 *
+__clib_export u64 *
clib_mem_vm_get_paddr (void *mem, clib_mem_page_sz_t log2_page_size,
int n_pages)
{
return r;
}
-clib_error_t *
-clib_mem_vm_ext_map (clib_mem_vm_map_t * a)
+/**
+ * Set the calling thread's NUMA memory policy to a single node.
+ *
+ * @param numa_node  node to allocate from.
+ * @param force      non-zero selects MPOL_BIND, zero selects MPOL_PREFERRED.
+ * @return 0 on success, CLIB_MEM_ERROR on failure with the reason stored in
+ *         clib_mem_main.error.
+ *
+ * When clib_mem_main.numa_node_bitmap is 0 (no NUMA support detected) the
+ * call is a no-op that succeeds for node 0 and fails for any other node.
+ */
+__clib_export int
+clib_mem_set_numa_affinity (u8 numa_node, int force)
{
- long unsigned int old_mask[16] = { 0 };
- int mmap_flags = MAP_SHARED;
- clib_error_t *err = 0;
- int old_mpol = -1;
- void *addr;
- int rv;
-
- if (a->numa_node)
- {
- rv = get_mempolicy (&old_mpol, old_mask, sizeof (old_mask) * 8 + 1, 0,
- 0);
+  clib_mem_main_t *mm = &clib_mem_main;
+  long unsigned int mask[16] = { 0 };
+  int mask_len = sizeof (mask) * 8 + 1;
+  const int bits_per_word = sizeof (mask[0]) * 8;
- if (rv == -1)
+  /* no numa support */
+  if (mm->numa_node_bitmap == 0)
+    {
+      if (numa_node)
{
- err = clib_error_return_unix (0, "get_mempolicy");
- goto done;
+	  vec_reset_length (mm->error);
+	  mm->error = clib_error_return (mm->error, "%s: numa not supported",
+					 (char *) __func__);
+	  return CLIB_MEM_ERROR;
}
+      else
+	return 0;
}
- if (a->requested_va)
- mmap_flags |= MAP_FIXED;
+  /*
+   * Pick the mask word and the bit inside it. A plain `1 << numa_node` is an
+   * int shift: it is undefined for node ids >= 31 and could only ever
+   * address the first word of the mask. A u8 node id always fits in the
+   * 16-word mask.
+   */
+  mask[numa_node / bits_per_word] = 1UL << (numa_node % bits_per_word);
- if (old_mpol != -1)
- {
- long unsigned int mask[16] = { 0 };
- mask[0] = 1 << a->numa_node;
- rv = set_mempolicy (MPOL_BIND, mask, sizeof (mask) * 8 + 1);
- if (rv == -1)
- {
- err = clib_error_return_unix (0, "set_mempolicy");
- goto done;
- }
- }
+  if (syscall (__NR_set_mempolicy, force ? MPOL_BIND : MPOL_PREFERRED, mask,
+	       mask_len))
+    goto error;
- addr = (void *) mmap (uword_to_pointer (a->requested_va, void *), a->size,
- PROT_READ | PROT_WRITE, mmap_flags, a->fd, 0);
+  /* success: drop any stale error text */
+  vec_reset_length (mm->error);
+  return 0;
- if (addr == MAP_FAILED)
- return clib_error_return_unix (0, "mmap");
+error:
+  vec_reset_length (mm->error);
+  mm->error = clib_error_return_unix (mm->error, (char *) __func__);
+  return CLIB_MEM_ERROR;
+}
+
+/**
+ * Reset the calling thread's NUMA memory policy to MPOL_DEFAULT.
+ *
+ * @return 0 on success, CLIB_MEM_ERROR on failure with the reason stored in
+ *         clib_mem_main.error.
+ */
+__clib_export int
+clib_mem_set_default_numa_affinity (void)
+{
+  clib_mem_main_t *mm = &clib_mem_main;
- /* re-apply old numa memory policy */
- if (old_mpol != -1 &&
- set_mempolicy (old_mpol, old_mask, sizeof (old_mask) * 8 + 1) == -1)
+  if (syscall (__NR_set_mempolicy, MPOL_DEFAULT, 0, 0))
{
- err = clib_error_return_unix (0, "set_mempolicy");
- goto done;
+      vec_reset_length (mm->error);
+      mm->error = clib_error_return_unix (mm->error, (char *) __func__);
+      return CLIB_MEM_ERROR;
}
-
- a->addr = addr;
- CLIB_MEM_UNPOISON (addr, a->size);
-
-done:
- return err;
+  return 0;
}
/*