/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*   BSD LICENSE
 *
 *   Copyright(c) 2013 6WIND.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#define _FILE_OFFSET_BITS 64
#include <errno.h>
#include <stdarg.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <string.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/queue.h>
#include <sys/file.h>
#include <unistd.h>
#include <limits.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <signal.h>
#include <setjmp.h>

#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memzone.h>
#include <rte_launch.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_common.h>
#include <rte_string_fns.h>

#include "eal_private.h"
#include "eal_internal_cfg.h"
#include "eal_filesystem.h"
#include "eal_hugepages.h"

#ifdef RTE_LIBRTE_XEN_DOM0
int rte_xen_dom0_supported(void)
{
	return internal_config.xen_dom0_support;
}
#endif

/**
 * @file
 * Huge page mapping under Linux
 *
 * To reserve a big contiguous amount of memory, we use the hugepage
 * feature of Linux. For that, we need to have hugetlbfs mounted. This
 * code will create many files in this directory (one per page) and
 * map them in virtual memory. For each page, we will retrieve its
 * physical address and remap it in order to have a virtually
 * contiguous zone as well as a physically contiguous zone.
 */
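
/*
 * Illustrative sketch (not part of the EAL): the core mechanism described
 * above is simply "create a file on a hugetlbfs mount, then mmap() one
 * hugepage-sized chunk of it".  The mount point "/mnt/huge" and the 2 MB
 * page size below are assumptions made for the example only; the real code
 * derives both from the hugepage discovery done elsewhere in the EAL.
 */
static __rte_unused void *
example_map_one_hugepage(void)
{
	const size_t sz = RTE_PGSIZE_2M;	/* assumed page size */
	void *va;
	int fd = open("/mnt/huge/example_page", O_CREAT | O_RDWR, 0600);

	if (fd < 0)
		return NULL;
	/* MAP_POPULATE faults the page in right away, as map_all_hugepages() does */
	va = mmap(NULL, sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, fd, 0);
	close(fd);
	return va == MAP_FAILED ? NULL : va;
}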

static uint64_t baseaddr_offset;

static unsigned proc_pagemap_readable;

#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"

static void
test_proc_pagemap_readable(void)
{
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0) {
		RTE_LOG(ERR, EAL,
			"Cannot open /proc/self/pagemap: %s. "
			"virt2phys address translation will not work\n",
			strerror(errno));
		return;
	}

	/* Is readable */
	close(fd);
	proc_pagemap_readable = 1;
}

/* Lock page in physical memory and prevent from swapping. */
int
rte_mem_lock_page(const void *virt)
{
	unsigned long virtual = (unsigned long)virt;
	int page_size = getpagesize();
	unsigned long aligned = (virtual & ~(page_size - 1));
	return mlock((void *)aligned, page_size);
}

/*
 * Get physical address of any mapped virtual address in the current process.
 */
phys_addr_t
rte_mem_virt2phy(const void *virtaddr)
{
	int fd;
	uint64_t page, physaddr;
	unsigned long virt_pfn;
	int page_size;
	off_t offset;

	/* when using dom0, /proc/self/pagemap always returns 0, check in
	 * dpdk memory by browsing the memsegs */
	if (rte_xen_dom0_supported()) {
		struct rte_mem_config *mcfg;
		struct rte_memseg *memseg;
		unsigned i;

		mcfg = rte_eal_get_configuration()->mem_config;
		for (i = 0; i < RTE_MAX_MEMSEG; i++) {
			memseg = &mcfg->memseg[i];
			if (memseg->addr == NULL)
				break;
			if (virtaddr >= memseg->addr &&
					virtaddr < RTE_PTR_ADD(memseg->addr,
						memseg->len)) {
				return memseg->phys_addr +
					RTE_PTR_DIFF(virtaddr, memseg->addr);
			}
		}

		return RTE_BAD_PHYS_ADDR;
	}

	/* Cannot parse /proc/self/pagemap, no need to log errors everywhere */
	if (!proc_pagemap_readable)
		return RTE_BAD_PHYS_ADDR;

	/* standard page size */
	page_size = getpagesize();

	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0) {
		RTE_LOG(ERR, EAL, "%s(): cannot open /proc/self/pagemap: %s\n",
			__func__, strerror(errno));
		return RTE_BAD_PHYS_ADDR;
	}

	virt_pfn = (unsigned long)virtaddr / page_size;
	offset = sizeof(uint64_t) * virt_pfn;
	if (lseek(fd, offset, SEEK_SET) == (off_t) -1) {
		RTE_LOG(ERR, EAL, "%s(): seek error in /proc/self/pagemap: %s\n",
				__func__, strerror(errno));
		close(fd);
		return RTE_BAD_PHYS_ADDR;
	}
	if (read(fd, &page, sizeof(uint64_t)) < (ssize_t)sizeof(uint64_t)) {
		RTE_LOG(ERR, EAL, "%s(): cannot read /proc/self/pagemap: %s\n",
				__func__, strerror(errno));
		close(fd);
		return RTE_BAD_PHYS_ADDR;
	}

	/*
	 * the pfn (page frame number) are bits 0-54 (see
	 * pagemap.txt in linux Documentation)
	 */
	physaddr = ((page & 0x7fffffffffffffULL) * page_size)
		+ ((unsigned long)virtaddr % page_size);
	close(fd);
	return physaddr;
}
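
/*
 * Illustrative sketch (not part of the EAL): how a caller is expected to
 * combine rte_mem_lock_page() and rte_mem_virt2phy().  The translation only
 * works for pages that are actually resident, which is why the page is
 * locked first; the buffer and its size are assumptions made for the
 * example only.
 */
static __rte_unused phys_addr_t
example_virt2phy_of_heap_buffer(void)
{
	char *buf = malloc(64);
	phys_addr_t pa;

	if (buf == NULL)
		return RTE_BAD_PHYS_ADDR;
	/* fault the page in and pin it so the pagemap entry stays valid */
	if (rte_mem_lock_page(buf) != 0) {
		free(buf);
		return RTE_BAD_PHYS_ADDR;
	}
	/* reads /proc/self/pagemap: entry = pagemap[va / page_size],
	 * phys = (entry & PFN bits) * page_size + va % page_size */
	pa = rte_mem_virt2phy(buf);

	free(buf);
	return pa;
}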

/*
 * For each hugepage in hugepg_tbl, fill the physaddr value. We find
 * it by browsing the /proc/self/pagemap special file.
 */
static int
find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
{
	unsigned i;
	phys_addr_t addr;

	for (i = 0; i < hpi->num_pages[0]; i++) {
		addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va);
		if (addr == RTE_BAD_PHYS_ADDR)
			return -1;
		hugepg_tbl[i].physaddr = addr;
	}
	return 0;
}

/*
 * Check whether address-space layout randomization is enabled in
 * the kernel. This is important for multi-process as it can prevent
 * two processes mapping data to the same virtual address.
 * Returns:
 *    0 - address space randomization disabled
 *    1/2 - address space randomization enabled
 *    negative error code on error
 */
static int
aslr_enabled(void)
{
	char c;
	int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY);
	if (fd < 0)
		return -errno;
	retval = read(fd, &c, 1);
	close(fd);
	if (retval < 0)
		return -errno;
	if (retval == 0)
		return -EIO;
	switch (c) {
	case '0': return 0;
	case '1': return 1;
	case '2': return 2;
	default: return -EINVAL;
	}
}
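
/*
 * Illustrative sketch (not part of the EAL): a multi-process aware caller
 * can use aslr_enabled() to warn that secondary processes may fail to map
 * the hugepage memory at the same virtual addresses as the primary.  This
 * mirrors the kind of check done when attaching a secondary process; the
 * wording of the message is an assumption made for the example.
 */
static __rte_unused void
example_warn_if_aslr(void)
{
	if (aslr_enabled() > 0)
		RTE_LOG(WARNING, EAL,
			"ASLR is enabled in the kernel; multi-process "
			"address mapping may be unreliable\n");
}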

/*
 * Try to mmap *size bytes in /dev/zero. If it is successful, return the
 * pointer to the mmap'd area and keep *size unmodified. Else, retry
 * with a smaller zone: decrease *size by hugepage_sz until it reaches
 * 0. In this case, return NULL. Note: this function returns an address
 * which is a multiple of hugepage size.
 */
static void *
get_virtual_area(size_t *size, size_t hugepage_sz)
{
	void *addr;
	int fd;
	long aligned_addr;

	if (internal_config.base_virtaddr != 0) {
		addr = (void *)(uintptr_t)(internal_config.base_virtaddr +
				baseaddr_offset);
	} else
		addr = NULL;

	RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);

	fd = open("/dev/zero", O_RDONLY);
	if (fd < 0) {
		RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n");
		return NULL;
	}
	do {
		addr = mmap(addr,
				(*size) + hugepage_sz, PROT_READ, MAP_PRIVATE, fd, 0);
		if (addr == MAP_FAILED)
			*size -= hugepage_sz;
	} while (addr == MAP_FAILED && *size > 0);

	if (addr == MAP_FAILED) {
		close(fd);
		RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
			strerror(errno));
		return NULL;
	}

	munmap(addr, (*size) + hugepage_sz);
	close(fd);

	/* align addr to a huge page size boundary */
	aligned_addr = (long)addr;
	aligned_addr += (hugepage_sz - 1);
	aligned_addr &= (~(hugepage_sz - 1));
	addr = (void *)(aligned_addr);

	RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
		addr, *size);

	/* increment offset */
	baseaddr_offset += *size;

	return addr;
}
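
/*
 * Illustrative sketch (not part of the EAL): the alignment arithmetic used
 * by get_virtual_area() above, shown on its own.  It rounds an address up to
 * the next boundary and only works for power-of-two alignments; the sample
 * values in the comment are assumptions made for the example.
 */
static __rte_unused uintptr_t
example_align_up(uintptr_t addr, uintptr_t align)
{
	/* e.g. example_align_up(0x7f4320001000, 0x200000) == 0x7f4320200000 */
	return (addr + align - 1) & ~(align - 1);
}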

static sigjmp_buf huge_jmpenv;

static void huge_sigbus_handler(int signo __rte_unused)
{
	siglongjmp(huge_jmpenv, 1);
}

/* Put sigsetjmp into its own wrapper function to avoid a compile error:
 * any non-volatile, non-static local variable in the stack frame calling
 * sigsetjmp might be clobbered by a call to longjmp.
 */
static int huge_wrap_sigsetjmp(void)
{
	return sigsetjmp(huge_jmpenv, 1);
}
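
/*
 * Illustrative sketch (not part of the EAL): how the pieces above are meant
 * to be combined.  Hugetlb accounting limits (e.g. cgroups) are enforced at
 * fault time, so the first write to a freshly mapped page may raise SIGBUS;
 * the pattern is to install a handler, save the stack environment with the
 * wrapper, touch the page, and treat a longjmp back as "this page is not
 * really available".  map_all_hugepages() below uses the same pattern; the
 * local handler installation here exists only to keep the example
 * self-contained.
 */
static __rte_unused int
example_touch_page_safely(void *page)
{
	struct sigaction action, old;

	memset(&action, 0, sizeof(action));
	action.sa_handler = huge_sigbus_handler;
	if (sigaction(SIGBUS, &action, &old) != 0)
		return -1;

	if (huge_wrap_sigsetjmp()) {
		/* we got here through siglongjmp(): the page is not usable */
		sigaction(SIGBUS, &old, NULL);
		return -1;
	}
	*(volatile int *)page = 0;	/* may fault and raise SIGBUS */

	sigaction(SIGBUS, &old, NULL);
	return 0;
}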

/*
 * Mmap all hugepages of hugepage table: it first opens a file in
 * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
 * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored
 * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
 * map contiguous physical blocks in contiguous virtual blocks.
 */
static unsigned
map_all_hugepages(struct hugepage_file *hugepg_tbl,
		struct hugepage_info *hpi, int orig)
{
	int fd;
	unsigned i;
	void *virtaddr;
	void *vma_addr = NULL;
	size_t vma_len = 0;

#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
	RTE_SET_USED(vma_len);
#endif

	for (i = 0; i < hpi->num_pages[0]; i++) {
		uint64_t hugepage_sz = hpi->hugepage_sz;

		if (orig) {
			hugepg_tbl[i].file_id = i;
			hugepg_tbl[i].size = hugepage_sz;
#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
			eal_get_hugefile_temp_path(hugepg_tbl[i].filepath,
					sizeof(hugepg_tbl[i].filepath), hpi->hugedir,
					hugepg_tbl[i].file_id);
#else
			eal_get_hugefile_path(hugepg_tbl[i].filepath,
					sizeof(hugepg_tbl[i].filepath), hpi->hugedir,
					hugepg_tbl[i].file_id);
#endif
			hugepg_tbl[i].filepath[sizeof(hugepg_tbl[i].filepath) - 1] = '\0';
		}
#ifndef RTE_ARCH_64
		/* for 32-bit systems, don't remap 1G and 16G pages, just reuse
		 * original map address as final map address.
		 */
		else if ((hugepage_sz == RTE_PGSIZE_1G)
			|| (hugepage_sz == RTE_PGSIZE_16G)) {
			hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va;
			hugepg_tbl[i].orig_va = NULL;
			continue;
		}
#endif

#ifndef RTE_EAL_SINGLE_FILE_SEGMENTS
		else if (vma_len == 0) {
			unsigned j, num_pages;

			/* reserve a virtual area for next contiguous
			 * physical block: count the number of
			 * contiguous physical pages. */
			for (j = i+1; j < hpi->num_pages[0]; j++) {
#ifdef RTE_ARCH_PPC_64
				/* The physical addresses are sorted in
				 * descending order on PPC64 */
				if (hugepg_tbl[j].physaddr !=
				    hugepg_tbl[j-1].physaddr - hugepage_sz)
					break;
#else
				if (hugepg_tbl[j].physaddr !=
				    hugepg_tbl[j-1].physaddr + hugepage_sz)
					break;
#endif
			}
			num_pages = j - i;
			vma_len = num_pages * hugepage_sz;

			/* get the biggest virtual memory area up to
			 * vma_len. If it fails, vma_addr is NULL, so
			 * let the kernel provide the address. */
			vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
			if (vma_addr == NULL)
				vma_len = hugepage_sz;
		}
#endif

		/* try to create hugepage file */
		fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755);
		if (fd < 0) {
			RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
					strerror(errno));
			return i;
		}

		/* map the segment, and populate page tables,
		 * the kernel fills this segment with zeros */
		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_POPULATE, fd, 0);
		if (virtaddr == MAP_FAILED) {
			RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
					strerror(errno));
			close(fd);
			return i;
		}

		if (orig) {
			hugepg_tbl[i].orig_va = virtaddr;
		} else {
			hugepg_tbl[i].final_va = virtaddr;
		}

		if (orig) {
			/* In Linux, hugetlb limitations, like cgroup, are
			 * enforced at fault time instead of mmap(), even
			 * with the option of MAP_POPULATE. The kernel will
			 * send a SIGBUS signal. To avoid being killed, save
			 * the stack environment here; if SIGBUS happens, we
			 * can jump back to it.
			 */
			if (huge_wrap_sigsetjmp()) {
				RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
					"hugepages of size %u MB\n",
					(unsigned)(hugepage_sz / 0x100000));
				munmap(virtaddr, hugepage_sz);
				close(fd);
				unlink(hugepg_tbl[i].filepath);
				return i;
			}
			*(int *)virtaddr = 0;
		}

		/* set shared flock on the file. */
		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
			RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n",
				__func__, strerror(errno));
			close(fd);
			return i;
		}

		close(fd);

		vma_addr = (char *)vma_addr + hugepage_sz;
		vma_len -= hugepage_sz;
	}

	return i;
}

#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS

/*
 * Remaps all hugepages into single file segments
 */
static int
remap_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
{
	int fd;
	unsigned i = 0, j, num_pages, page_idx = 0;
	void *vma_addr = NULL, *old_addr = NULL, *page_addr = NULL;
	size_t vma_len = 0;
	size_t hugepage_sz = hpi->hugepage_sz;
	size_t total_size, offset;
	char filepath[MAX_HUGEPAGE_PATH];
	phys_addr_t physaddr;
	int socket;

	while (i < hpi->num_pages[0]) {

#ifndef RTE_ARCH_64
		/* for 32-bit systems, don't remap 1G pages and 16G pages,
		 * just reuse original map address as final map address.
		 */
		if ((hugepage_sz == RTE_PGSIZE_1G)
			|| (hugepage_sz == RTE_PGSIZE_16G)) {
			hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va;
			hugepg_tbl[i].orig_va = NULL;
			i++;
			continue;
		}
#endif

		/* reserve a virtual area for next contiguous
		 * physical block: count the number of
		 * contiguous physical pages. */
		for (j = i+1; j < hpi->num_pages[0]; j++) {
#ifdef RTE_ARCH_PPC_64
			/* The physical addresses are sorted in descending
			 * order on PPC64 */
			if (hugepg_tbl[j].physaddr !=
				hugepg_tbl[j-1].physaddr - hugepage_sz)
				break;
#else
			if (hugepg_tbl[j].physaddr !=
				hugepg_tbl[j-1].physaddr + hugepage_sz)
				break;
#endif
		}
		num_pages = j - i;
		vma_len = num_pages * hugepage_sz;

		socket = hugepg_tbl[i].socket_id;

		/* get the biggest virtual memory area up to
		 * vma_len. If it fails, vma_addr is NULL, so
		 * let the kernel provide the address. */
		vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);

		/* If we can't find a big enough virtual area, work out how many pages
		 * we are going to get */
		if (vma_addr == NULL)
			j = i + 1;
		else if (vma_len != num_pages * hugepage_sz) {
			num_pages = vma_len / hugepage_sz;
			j = i + num_pages;

		}

		hugepg_tbl[page_idx].file_id = page_idx;
		eal_get_hugefile_path(filepath,
				sizeof(filepath),
				hpi->hugedir,
				hugepg_tbl[page_idx].file_id);

		/* try to create hugepage file */
		fd = open(filepath, O_CREAT | O_RDWR, 0755);
		if (fd < 0) {
			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__, strerror(errno));
			return -1;
		}

		total_size = 0;
		for (; i < j; i++) {

			/* unmap current segment */
			if (total_size > 0)
				munmap(vma_addr, total_size);

			/* unmap original page */
			munmap(hugepg_tbl[i].orig_va, hugepage_sz);
			unlink(hugepg_tbl[i].filepath);

			total_size += hugepage_sz;

			old_addr = vma_addr;

			/* map new, bigger segment, and populate page tables,
			 * the kernel fills this segment with zeros */
			vma_addr = mmap(vma_addr, total_size,
					PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 0);

			if (vma_addr == MAP_FAILED || vma_addr != old_addr) {
				RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__, strerror(errno));
				close(fd);
				return -1;
			}
		}

		/* set shared flock on the file. */
		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
			RTE_LOG(ERR, EAL, "%s(): Locking file failed: %s\n",
				__func__, strerror(errno));
			close(fd);
			return -1;
		}

		snprintf(hugepg_tbl[page_idx].filepath, MAX_HUGEPAGE_PATH, "%s",
				filepath);

		physaddr = rte_mem_virt2phy(vma_addr);

		if (physaddr == RTE_BAD_PHYS_ADDR)
			return -1;

		hugepg_tbl[page_idx].final_va = vma_addr;

		hugepg_tbl[page_idx].physaddr = physaddr;

		hugepg_tbl[page_idx].repeated = num_pages;

		hugepg_tbl[page_idx].socket_id = socket;

		close(fd);

		/* verify the memory segment - that is, check that every VA corresponds
		 * to the physical address we expect to see
		 */
		for (offset = 0; offset < vma_len; offset += hugepage_sz) {
			uint64_t expected_physaddr;

			expected_physaddr = hugepg_tbl[page_idx].physaddr + offset;
			page_addr = RTE_PTR_ADD(vma_addr, offset);
			physaddr = rte_mem_virt2phy(page_addr);

			if (physaddr != expected_physaddr) {
				RTE_LOG(ERR, EAL, "Segment sanity check failed: wrong physaddr "
						"at %p (offset 0x%" PRIx64 ": 0x%" PRIx64
						" (expected 0x%" PRIx64 ")\n",
						page_addr, offset, physaddr, expected_physaddr);
				return -1;
			}
		}

		page_idx++;
	}

	/* zero out the rest */
	memset(&hugepg_tbl[page_idx], 0, (hpi->num_pages[0] - page_idx) * sizeof(struct hugepage_file));
	return page_idx;
}
#else /* RTE_EAL_SINGLE_FILE_SEGMENTS=n */

/* Unmap all hugepages from original mapping */
static int
unmap_all_hugepages_orig(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
{
	unsigned i;
	for (i = 0; i < hpi->num_pages[0]; i++) {
		if (hugepg_tbl[i].orig_va) {
			munmap(hugepg_tbl[i].orig_va, hpi->hugepage_sz);
			hugepg_tbl[i].orig_va = NULL;
		}
	}
	return 0;
}
#endif /* RTE_EAL_SINGLE_FILE_SEGMENTS */

/*
 * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
 * page.
 */
static int
find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
{
	int socket_id;
	char *end, *nodestr;
	unsigned i, hp_count = 0;
	uint64_t virt_addr;
	char buf[BUFSIZ];
	char hugedir_str[PATH_MAX];
	FILE *f;

	f = fopen("/proc/self/numa_maps", "r");
	if (f == NULL) {
		RTE_LOG(NOTICE, EAL, "cannot open /proc/self/numa_maps,"
				" consider that all memory is in socket_id 0\n");
		return 0;
	}

	snprintf(hugedir_str, sizeof(hugedir_str),
			"%s/%s", hpi->hugedir, internal_config.hugefile_prefix);

	/* parse numa map */
	while (fgets(buf, sizeof(buf), f) != NULL) {

		/* ignore non huge page */
		if (strstr(buf, " huge ") == NULL &&
				strstr(buf, hugedir_str) == NULL)
			continue;

		/* get zone addr */
		virt_addr = strtoull(buf, &end, 16);
		if (virt_addr == 0 || end == buf) {
			RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
			goto error;
		}

		/* get node id (socket id) */
		nodestr = strstr(buf, " N");
		if (nodestr == NULL) {
			RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
			goto error;
		}
		nodestr += 2;
		end = strstr(nodestr, "=");
		if (end == NULL) {
			RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
			goto error;
		}
		end[0] = '\0';
		end = NULL;

		socket_id = strtoul(nodestr, &end, 0);
		if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) {
			RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
			goto error;
		}

		/* if we find this page in our mappings, set socket_id */
		for (i = 0; i < hpi->num_pages[0]; i++) {
			void *va = (void *)(unsigned long)virt_addr;
			if (hugepg_tbl[i].orig_va == va) {
				hugepg_tbl[i].socket_id = socket_id;
				hp_count++;
			}
		}
	}

	if (hp_count < hpi->num_pages[0])
		goto error;

	fclose(f);
	return 0;

error:
	fclose(f);
	return -1;
}
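
/*
 * Illustrative sketch (not part of the EAL): the numa_maps lines that
 * find_numasocket() above expects look roughly like the string below
 * (start address, policy, backing file, "huge" flag, then per-node page
 * counts such as "N0=1").  The sample line and the helper are assumptions
 * for the example only; they just replay the same strtoull()/strstr()
 * steps used above.
 */
static __rte_unused int
example_parse_numa_maps_line(void)
{
	static const char line[] =
		"7f2a40000000 default file=/mnt/huge/rtemap_0 huge dirty=1 N0=1";
	char *end;
	uint64_t va = strtoull(line, &end, 16);	/* start of the mapping */
	const char *node = strstr(line, " N");	/* " N0=1" -> node id 0 */

	if (va == 0 || node == NULL)
		return -1;
	return (int)strtoul(node + 2, NULL, 0);	/* socket (node) id */
}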

static int
cmp_physaddr(const void *a, const void *b)
{
#ifndef RTE_ARCH_PPC_64
	const struct hugepage_file *p1 = (const struct hugepage_file *)a;
	const struct hugepage_file *p2 = (const struct hugepage_file *)b;
#else
	/* PowerPC needs memory sorted in reverse order from x86 */
	const struct hugepage_file *p1 = (const struct hugepage_file *)b;
	const struct hugepage_file *p2 = (const struct hugepage_file *)a;
#endif
	if (p1->physaddr < p2->physaddr)
		return -1;
	else if (p1->physaddr > p2->physaddr)
		return 1;
	else
		return 0;
}
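
/*
 * Illustrative sketch (not part of the EAL): the overall two-pass sequence
 * that rte_eal_hugepage_init() drives with the helpers above - map every
 * page once to learn its physical address and NUMA node, sort the table by
 * physical address, then map the pages a second time so that physically
 * contiguous pages end up virtually contiguous.  Error handling is trimmed
 * and a single page size is assumed, matching how the real code iterates
 * over one hugepage size at a time.
 */
#ifndef RTE_EAL_SINGLE_FILE_SEGMENTS
static __rte_unused int
example_two_pass_mapping(struct hugepage_file *tbl, struct hugepage_info *hpi)
{
	/* pass 1: map all pages, record orig_va */
	if (map_all_hugepages(tbl, hpi, 1) != hpi->num_pages[0])
		return -1;
	/* learn physical addresses and NUMA nodes of the original mappings */
	if (find_physaddrs(tbl, hpi) < 0 || find_numasocket(tbl, hpi) < 0)
		return -1;
	/* order the table by physical address ... */
	qsort(tbl, hpi->num_pages[0], sizeof(*tbl), cmp_physaddr);
	/* ... then pass 2: remap in that order into contiguous virtual areas */
	if (map_all_hugepages(tbl, hpi, 0) != hpi->num_pages[0])
		return -1;
	/* the first mappings are no longer needed */
	return unmap_all_hugepages_orig(tbl, hpi);
}
#endif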

/*
 * Uses mmap to create a shared memory area for storage of data
 * Used in this file to store the hugepage file map on disk
 */
static void *
create_shared_memory(const char *filename, const size_t mem_size)
{
	void *retval;
	int fd = open(filename, O_CREAT | O_RDWR, 0666);
	if (fd < 0)
		return NULL;
	if (ftruncate(fd, mem_size) < 0) {
		close(fd);
		return NULL;
	}
	retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);
	return retval;
}

/*
 * this copies *active* hugepages from one hugepage table to another.
 * destination is typically the shared memory.
 */
static int
copy_hugepages_to_shared_mem(struct hugepage_file *dst, int dest_size,
		const struct hugepage_file *src, int src_size)
{
	int src_pos, dst_pos = 0;

	for (src_pos = 0; src_pos < src_size; src_pos++) {
		if (src[src_pos].final_va != NULL) {
			/* error on overflow attempt */
			if (dst_pos == dest_size)
				return -1;
			memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file));
			dst_pos++;
		}
	}
	return 0;
}

static int
unlink_hugepage_files(struct hugepage_file *hugepg_tbl,
		unsigned num_hp_info)
{
	unsigned socket, size;
	int page, nrpages = 0;

	/* get total number of hugepages */
	for (size = 0; size < num_hp_info; size++)
		for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
			nrpages +=
			internal_config.hugepage_info[size].num_pages[socket];

	for (page = 0; page < nrpages; page++) {
		struct hugepage_file *hp = &hugepg_tbl[page];

		if (hp->final_va != NULL && unlink(hp->filepath)) {
			RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n",
				__func__, hp->filepath, strerror(errno));
		}
	}
	return 0;
}

/*
 * unmaps hugepages that are not going to be used. since we originally allocate
 * ALL hugepages (not just those we need), additional unmapping needs to be done.
 */
static int
unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
		struct hugepage_info *hpi,
		unsigned num_hp_info)
{
	unsigned socket, size;
	int page, nrpages = 0;

	/* get total number of hugepages */
	for (size = 0; size < num_hp_info; size++)
		for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
			nrpages += internal_config.hugepage_info[size].num_pages[socket];

	for (size = 0; size < num_hp_info; size++) {
		for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
			unsigned pages_found = 0;

			/* traverse until we have unmapped all the unused pages */
			for (page = 0; page < nrpages; page++) {
				struct hugepage_file *hp = &hugepg_tbl[page];

#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
				/* if this page was already cleared */
				if (hp->final_va == NULL)
					continue;
#endif

				/* find a page that matches the criteria */
				if ((hp->size == hpi[size].hugepage_sz) &&
						(hp->socket_id == (int) socket)) {

					/* if we skipped enough pages, unmap the rest */
					if (pages_found == hpi[size].num_pages[socket]) {
						uint64_t unmap_len;

#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
						unmap_len = hp->size * hp->repeated;
#else
						unmap_len = hp->size;
#endif

						/* get start addr and len of the remaining segment */
						munmap(hp->final_va, (size_t) unmap_len);

						hp->final_va = NULL;
						if (unlink(hp->filepath) == -1) {
							RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n",
									__func__, hp->filepath, strerror(errno));
							return -1;
						}
					}
#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
					/* else, check how much do we need to map */
					else {
						int nr_pg_left =
								hpi[size].num_pages[socket] - pages_found;

						/* if we need enough memory to fit into the segment */
						if (hp->repeated <= nr_pg_left) {
							pages_found += hp->repeated;
						}
						/* truncate the segment */
						else {
							uint64_t final_size = nr_pg_left * hp->size;
							uint64_t seg_size = hp->repeated * hp->size;

							void *unmap_va = RTE_PTR_ADD(hp->final_va,
									final_size);
							int fd;

							munmap(unmap_va, seg_size - final_size);

							fd = open(hp->filepath, O_RDWR);
							if (fd < 0) {
								RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
										hp->filepath, strerror(errno));
								return -1;
							}
							if (ftruncate(fd, final_size) < 0) {
								RTE_LOG(ERR, EAL, "Cannot truncate %s: %s\n",
										hp->filepath, strerror(errno));
								return -1;
							}
							close(fd);

							pages_found += nr_pg_left;
							hp->repeated = nr_pg_left;
						}
					}
#else
					/* else, lock the page and skip */
					else
						pages_found++;
#endif

				} /* match page */
			} /* foreach page */
		} /* foreach socket */
	} /* foreach pagesize */

	return 0;
}

static inline uint64_t
get_socket_mem_size(int socket)
{
	uint64_t size = 0;
	unsigned i;

	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
		if (hpi->hugedir != NULL)
			size += hpi->hugepage_sz * hpi->num_pages[socket];
	}

	return size;
}

/*
 * This function is a NUMA-aware equivalent of calc_num_pages.
 * It takes in the list of hugepage sizes and the
 * number of pages thereof, and calculates the best number of
 * pages of each size to fulfill the request for <memory> RAM
 */
static int
calc_num_pages_per_socket(uint64_t *memory,
		struct hugepage_info *hp_info,
		struct hugepage_info *hp_used,
		unsigned num_hp_info)
{
	unsigned socket, j, i = 0;
	unsigned requested, available;
	int total_num_pages = 0;
	uint64_t remaining_mem, cur_mem;
	uint64_t total_mem = internal_config.memory;

	if (num_hp_info == 0)
		return -1;

	/* if specific memory amounts per socket weren't requested */
	if (internal_config.force_sockets == 0) {
		int cpu_per_socket[RTE_MAX_NUMA_NODES];
		size_t default_size, total_size;
		unsigned lcore_id;

		/* Compute number of cores per socket */
		memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
		RTE_LCORE_FOREACH(lcore_id) {
			cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
		}

		/*
		 * Automatically spread requested memory amongst detected sockets according
		 * to number of cores from cpu mask present on each socket
		 */
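		/*
		 * Worked example (an assumption for illustration, not computed
		 * by the code at runtime): with -m 1024 and four lcores, three
		 * of them on socket 0 and one on socket 1, the first loop below
		 * proposes 1024 * 3 / 4 = 768 MB for socket 0 and
		 * 1024 * 1 / 4 = 256 MB for socket 1, each capped by the
		 * hugepage memory actually available on that socket; whatever
		 * could not be placed is then taken from any socket that still
		 * has free hugepages in the second loop.
		 */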
		total_size = internal_config.memory;
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {

			/* Set memory amount per socket */
			default_size = (internal_config.memory * cpu_per_socket[socket])
					/ rte_lcore_count();

			/* Limit to maximum available memory on socket */
			default_size = RTE_MIN(default_size, get_socket_mem_size(socket));

			/* Update sizes */
			memory[socket] = default_size;
			total_size -= default_size;
		}

		/*
		 * If some memory is remaining, try to allocate it by getting all
		 * available memory from sockets, one after the other
		 */
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {
			/* take whatever is available */
			default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket],
					       total_size);

			/* Update sizes */
			memory[socket] += default_size;
			total_size -= default_size;
		}
	}

	for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
		/* skips if the memory on specific socket wasn't requested */
		for (i = 0; i < num_hp_info && memory[socket] != 0; i++) {
			hp_used[i].hugedir = hp_info[i].hugedir;
			hp_used[i].num_pages[socket] = RTE_MIN(
					memory[socket] / hp_info[i].hugepage_sz,
					hp_info[i].num_pages[socket]);

			cur_mem = hp_used[i].num_pages[socket] *
					hp_used[i].hugepage_sz;

			memory[socket] -= cur_mem;
			total_mem -= cur_mem;

			total_num_pages += hp_used[i].num_pages[socket];

			/* check if we have met all memory requests */
			if (memory[socket] == 0)
				break;

			/* if we have used up all the pages of this size,
			 * move on to the next size */
			if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
				continue;
			/* At this point we know that there are more pages available that are
			 * bigger than the memory we want, so let's see if we can get enough
			 * from other page sizes.
			 */
			remaining_mem = 0;
			for (j = i+1; j < num_hp_info; j++)
				remaining_mem += hp_info[j].hugepage_sz *
				hp_info[j].num_pages[socket];

			/* is there enough other memory, if not allocate another page and quit */
			if (remaining_mem < memory[socket]) {
				cur_mem = RTE_MIN(memory[socket],
						hp_info[i].hugepage_sz);
				memory[socket] -= cur_mem;
				total_mem -= cur_mem;
				hp_used[i].num_pages[socket]++;
				total_num_pages++;
				break; /* we are done with this socket */
			}
		}
		/* if we didn't satisfy all memory requirements per socket */
		if (memory[socket] > 0) {
			/* to prevent icc errors */
			requested = (unsigned) (internal_config.socket_mem[socket] /
					0x100000);
			available = requested -
					((unsigned) (memory[socket] / 0x100000));
			RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! "
					"Requested: %uMB, available: %uMB\n", socket,
					requested, available);
			return -1;
		}
	}

	/* if we didn't satisfy total memory requirements */
	if (total_mem > 0) {
		requested = (unsigned) (internal_config.memory / 0x100000);
		available = requested - (unsigned) (total_mem / 0x100000);
		RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB,"
				" available: %uMB\n", requested, available);
		return -1;
	}
	return total_num_pages;
}

static inline size_t
eal_get_hugepage_mem_size(void)
{
	uint64_t size = 0;
	unsigned i, j;

	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
		if (hpi->hugedir != NULL) {
			for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
				size += hpi->hugepage_sz * hpi->num_pages[j];
			}
		}
	}

	return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
}

static struct sigaction huge_action_old;
static int huge_need_recover;

static void
huge_register_sigbus(void)
{
	sigset_t mask;
	struct sigaction action;

	sigemptyset(&mask);
	sigaddset(&mask, SIGBUS);
	action.sa_flags = 0;
	action.sa_mask = mask;
	action.sa_handler = huge_sigbus_handler;

	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
}

static void
huge_recover_sigbus(void)
{
	if (huge_need_recover) {
		sigaction(SIGBUS, &huge_action_old, NULL);
		huge_need_recover = 0;
	}
}

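/*
 * Illustrative sketch (not part of the EAL): applications never call the
 * hugepage setup below directly - it runs as part of rte_eal_init().  The
 * option strings here are assumptions made for the example; they only show
 * how the memory reserved by this file is usually requested.
 */
static __rte_unused int
example_eal_init_with_hugepages(void)
{
	char *argv[] = {
		"app",
		"--socket-mem", "1024,1024",	/* per-socket request in MB */
		"--huge-dir", "/mnt/huge",	/* hugetlbfs mount point */
		NULL
	};

	return rte_eal_init(5, argv);
}
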
/*
 * Prepare physical memory mapping: fill the configuration structure with
 * this information, and return 0 on success.
 *  1. map N huge pages in separate files in hugetlbfs
 *  2. find associated physical addr
 *  3. find associated NUMA socket ID
 *  4. sort all huge pages by physical address
 *  5. remap these N huge pages in the correct order
 *  6. unmap the first mapping
 *  7. fill memsegs in configuration with contiguous zones
 */
int
rte_eal_hugepage_init(void)
{
	struct rte_mem_config *mcfg;
	struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
	struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];

	uint64_t memory[RTE_MAX_NUMA_NODES];

	unsigned hp_offset;
	int i, j, new_memseg;
	int nr_hugefiles, nr_hugepages = 0;
	void *addr;
#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
	int new_pages_count[MAX_HUGEPAGE_SIZES];
#endif

	test_proc_pagemap_readable();

	memset(used_hp, 0, sizeof(used_hp));

	/* get pointer to global configuration */
	mcfg = rte_eal_get_configuration()->mem_config;

	/* hugetlbfs can be disabled */
	if (internal_config.no_hugetlbfs) {
		addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (addr == MAP_FAILED) {
			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
					strerror(errno));
			return -1;
		}
		mcfg->memseg[0].phys_addr = (phys_addr_t)(uintptr_t)addr;
		mcfg->memseg[0].addr = addr;
		mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
		mcfg->memseg[0].len = internal_config.memory;
		mcfg->memseg[0].socket_id = 0;
		return 0;
	}

	/* check if app runs on Xen Dom0 */
	if (internal_config.xen_dom0_support) {
#ifdef RTE_LIBRTE_XEN_DOM0
		/* use dom0_mm kernel driver to init memory */
		if (rte_xen_dom0_memory_init() < 0)
			return -1;
		else
			return 0;
#endif
	}

	/* calculate total number of hugepages available. at this point we haven't
	 * yet started sorting them so they all are on socket 0 */
	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
		/* meanwhile, also initialize used_hp hugepage sizes in used_hp */
		used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz;

		nr_hugepages += internal_config.hugepage_info[i].num_pages[0];
	}

	/*
	 * allocate a memory area for hugepage table.
	 * this isn't shared memory yet. due to the fact that we need some
	 * processing done on these pages, shared memory will be created
	 * at a later stage.
	 */
	tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file));
	if (tmp_hp == NULL)
		goto fail;

	memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file));

	hp_offset = 0; /* where we start the current page size entries */

	huge_register_sigbus();

	/* map all hugepages and sort them */
	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i++) {
		unsigned pages_old, pages_new;
		struct hugepage_info *hpi;

		/*
		 * we don't yet mark hugepages as used at this stage, so
		 * we just map all hugepages available to the system
		 * all hugepages are still located on socket 0
		 */
		hpi = &internal_config.hugepage_info[i];

		if (hpi->num_pages[0] == 0)
			continue;

		/* map all hugepages available */
		pages_old = hpi->num_pages[0];
		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
		if (pages_new < pages_old) {
#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
			RTE_LOG(ERR, EAL,
				"%d not %d hugepages of size %u MB allocated\n",
				pages_new, pages_old,
				(unsigned)(hpi->hugepage_sz / 0x100000));
			goto fail;
#else
			RTE_LOG(DEBUG, EAL,
				"%d not %d hugepages of size %u MB allocated\n",
				pages_new, pages_old,
				(unsigned)(hpi->hugepage_sz / 0x100000));

			int pages = pages_old - pages_new;

			nr_hugepages -= pages;
			hpi->num_pages[0] = pages_new;
			if (pages_new == 0)
				continue;
#endif
		}

		/* find physical addresses and sockets for each hugepage */
		if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
			RTE_LOG(DEBUG, EAL, "Failed to find phys addr for %u MB pages\n",
					(unsigned)(hpi->hugepage_sz / 0x100000));
			goto fail;
		}

		if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0) {
			RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
					(unsigned)(hpi->hugepage_sz / 0x100000));
			goto fail;
		}

		qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
		      sizeof(struct hugepage_file), cmp_physaddr);

#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
		/* remap all hugepages into single file segments */
		new_pages_count[i] = remap_all_hugepages(&tmp_hp[hp_offset], hpi);
		if (new_pages_count[i] < 0) {
			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
					(unsigned)(hpi->hugepage_sz / 0x100000));
			goto fail;
		}

		/* we have processed a num of hugepages of this size, so inc offset */
		hp_offset += new_pages_count[i];
#else
		/* remap all hugepages */
		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
		    hpi->num_pages[0]) {
			RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
					(unsigned)(hpi->hugepage_sz / 0x100000));
			goto fail;
		}

		/* unmap original mappings */
		if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0)
			goto fail;

		/* we have processed a num of hugepages of this size, so inc offset */
		hp_offset += hpi->num_pages[0];
#endif
	}

	huge_recover_sigbus();

	if (internal_config.memory == 0 && internal_config.force_sockets == 0)
		internal_config.memory = eal_get_hugepage_mem_size();

#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
	nr_hugefiles = 0;
	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
		nr_hugefiles += new_pages_count[i];
	}
#else
	nr_hugefiles = nr_hugepages;
#endif
1333
1334
1335         /* clean out the numbers of pages */
1336         for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++)
1337                 for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
1338                         internal_config.hugepage_info[i].num_pages[j] = 0;
1339
1340         /* get hugepages for each socket */
1341         for (i = 0; i < nr_hugefiles; i++) {
1342                 int socket = tmp_hp[i].socket_id;
1343
1344                 /* find a hugepage info with right size and increment num_pages */
1345                 const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES,
1346                                 (int)internal_config.num_hugepage_sizes);
1347                 for (j = 0; j < nb_hpsizes; j++) {
1348                         if (tmp_hp[i].size ==
1349                                         internal_config.hugepage_info[j].hugepage_sz) {
1350 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
1351                                         internal_config.hugepage_info[j].num_pages[socket] +=
1352                                                 tmp_hp[i].repeated;
1353 #else
1354                                 internal_config.hugepage_info[j].num_pages[socket]++;
1355 #endif
1356                         }
1357                 }
1358         }
1359
1360         /* make a copy of socket_mem, needed for number of pages calculation */
1361         for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
1362                 memory[i] = internal_config.socket_mem[i];
1363
1364         /* calculate final number of pages */
1365         nr_hugepages = calc_num_pages_per_socket(memory,
1366                         internal_config.hugepage_info, used_hp,
1367                         internal_config.num_hugepage_sizes);
1368
1369         /* error if not enough memory available */
1370         if (nr_hugepages < 0)
1371                 goto fail;
1372
1373         /* report how many pages of each size will be requested from each socket */
1374         for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
1375                 for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
1376                         if (used_hp[i].num_pages[j] > 0) {
1377                                 RTE_LOG(DEBUG, EAL,
1378                                         "Requesting %u pages of size %uMB"
1379                                         " from socket %i\n",
1380                                         used_hp[i].num_pages[j],
1381                                         (unsigned)
1382                                         (used_hp[i].hugepage_sz / 0x100000),
1383                                         j);
1384                         }
1385                 }
1386         }
1387
1388         /* create shared memory to hold the hugepage table for secondary processes */
1389         hugepage = create_shared_memory(eal_hugepage_info_path(),
1390                         nr_hugefiles * sizeof(struct hugepage_file));
1391
1392         if (hugepage == NULL) {
1393                 RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
1394                 goto fail;
1395         }
1396         memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file));
1397
1398         /*
1399          * unmap the pages that we won't need (based on used_hp) and
1400          * set final_va to NULL for every page that was unmapped.
1401          */
1402         if (unmap_unneeded_hugepages(tmp_hp, used_hp,
1403                         internal_config.num_hugepage_sizes) < 0) {
1404                 RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n");
1405                 goto fail;
1406         }
1407
1408         /*
1409          * copy the relevant entries from the malloc'd hugepage table to the
1410          * actual shared memory. only hugepages with a non-NULL final_va are
1411          * copied, and the destination size is checked to avoid overflow.
1412          */
1413         if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
1414                         tmp_hp, nr_hugefiles) < 0) {
1415                 RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n");
1416                 goto fail;
1417         }
1418
1419         /* free the hugepage backing files */
1420         if (internal_config.hugepage_unlink &&
1421                 unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) {
1422                 RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n");
1423                 goto fail;
1424         }
1425
1426         /* free the temporary hugepage table */
1427         free(tmp_hp);
1428         tmp_hp = NULL;
1429
1430         /* find the earliest free memseg - this is needed because, with IVSHMEM,
1431          * some segments might already have been initialized */
1432         for (j = 0; j < RTE_MAX_MEMSEG; j++)
1433                 if (mcfg->memseg[j].addr == NULL) {
1434                         /* move to previous segment and exit loop */
1435                         j--;
1436                         break;
1437                 }
1438
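        /* walk the sorted hugepage table and coalesce pages into memsegs;
         * j points just before the first free memseg slot and is advanced
         * each time a new memseg is started */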
1439         for (i = 0; i < nr_hugefiles; i++) {
1440                 new_memseg = 0;
1441
1442                 /* check whether this page starts a new memseg */
1443                 if (i == 0)
1444                         new_memseg = 1;
1445                 else if (hugepage[i].socket_id != hugepage[i-1].socket_id)
1446                         new_memseg = 1;
1447                 else if (hugepage[i].size != hugepage[i-1].size)
1448                         new_memseg = 1;
1449
1450 #ifdef RTE_ARCH_PPC_64
1451                 /* On the PPC64 architecture, mmap always allocates from higher
1452                  * to lower virtual addresses, so here both the physical and
1453                  * virtual addresses are in descending order */
1454                 else if ((hugepage[i-1].physaddr - hugepage[i].physaddr) !=
1455                     hugepage[i].size)
1456                         new_memseg = 1;
1457                 else if (((unsigned long)hugepage[i-1].final_va -
1458                     (unsigned long)hugepage[i].final_va) != hugepage[i].size)
1459                         new_memseg = 1;
1460 #else
1461                 else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
1462                     hugepage[i].size)
1463                         new_memseg = 1;
1464                 else if (((unsigned long)hugepage[i].final_va -
1465                     (unsigned long)hugepage[i-1].final_va) != hugepage[i].size)
1466                         new_memseg = 1;
1467 #endif
1468
1469                 if (new_memseg) {
1470                         j += 1;
1471                         if (j == RTE_MAX_MEMSEG)
1472                                 break;
1473
1474                         mcfg->memseg[j].phys_addr = hugepage[i].physaddr;
1475                         mcfg->memseg[j].addr = hugepage[i].final_va;
1476 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
1477                         mcfg->memseg[j].len = hugepage[i].size * hugepage[i].repeated;
1478 #else
1479                         mcfg->memseg[j].len = hugepage[i].size;
1480 #endif
1481                         mcfg->memseg[j].socket_id = hugepage[i].socket_id;
1482                         mcfg->memseg[j].hugepage_sz = hugepage[i].size;
1483                 }
1484                 /* continuation of previous memseg */
1485                 else {
1486 #ifdef RTE_ARCH_PPC_64
1487                 /* On IBM Power, use the physical and virtual address of the
1488                  * last (lowest-addressed) page as the segment address */
1489                         mcfg->memseg[j].phys_addr = hugepage[i].physaddr;
1490                         mcfg->memseg[j].addr = hugepage[i].final_va;
1491 #endif
1492                         mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
1493                 }
1494                 hugepage[i].memseg_id = j;
1495         }
1496
1497         if (i < nr_hugefiles) {
1498                 RTE_LOG(ERR, EAL, "Can only reserve %d pages "
1499                         "from %d requested\n"
1500                         "Current %s=%d is not enough\n"
1501                         "Please either increase it or request less "
1502                         "memory.\n",
1503                         i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG),
1504                         RTE_MAX_MEMSEG);
1505                 goto fail;
1506         }
1507
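        /* the shared hugepage table is fully populated; unmap our local view
         * (the backing file itself remains for secondary processes) */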
1508         munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
1509
1510         return 0;
1511
1512 fail:
1513         huge_recover_sigbus();
1514         free(tmp_hp);
1515         if (hugepage != NULL)
1516                 munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
1517
1518         return -1;
1519 }
1520
1521 /*
1522  * uses fstat to report the size of a file on disk; returns 0 on failure
1523  */
1524 static off_t
1525 getFileSize(int fd)
1526 {
1527         struct stat st;
1528         if (fstat(fd, &st) < 0)
1529                 return 0;
1530         return st.st_size;
1531 }
1532
1533 /*
1534  * This creates the memory mappings in the secondary process to match those of
1535  * the primary process. It goes through each memory segment in the DPDK runtime
1536  * configuration and finds the hugepages which form that segment, mapping them
1537  * in order to form a contiguous block in the virtual memory space.
1538  */
1539 int
1540 rte_eal_hugepage_attach(void)
1541 {
1542         const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1543         struct hugepage_file *hp = NULL;
1544         unsigned num_hp = 0;
1545         unsigned i, s = 0; /* s used to track the segment number */
1546         off_t size;
1547         int fd, fd_zero = -1, fd_hugepage = -1;
1548
1549         if (aslr_enabled() > 0) {
1550                 RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization "
1551                                 "(ASLR) is enabled in the kernel.\n");
1552                 RTE_LOG(WARNING, EAL, "   This may cause issues with mapping memory "
1553                                 "into secondary processes\n");
1554         }
1555
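        /* warn early if /proc/self/pagemap cannot be read, as it is needed
         * to resolve physical addresses */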
1556         test_proc_pagemap_readable();
1557
1558         if (internal_config.xen_dom0_support) {
1559 #ifdef RTE_LIBRTE_XEN_DOM0
1560                 if (rte_xen_dom0_memory_attach() < 0) {
1561                         RTE_LOG(ERR, EAL, "Failed to attach memory segments of primary "
1562                                         "process\n");
1563                         return -1;
1564                 }
1565                 return 0;
1566 #endif
1567         }
1568
1569         fd_zero = open("/dev/zero", O_RDONLY);
1570         if (fd_zero < 0) {
1571                 RTE_LOG(ERR, EAL, "Could not open /dev/zero\n");
1572                 goto error;
1573         }
1574         fd_hugepage = open(eal_hugepage_info_path(), O_RDONLY);
1575         if (fd_hugepage < 0) {
1576                 RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path());
1577                 goto error;
1578         }
1579
1580         /* map all segments into memory to make sure we get the addrs */
1581         for (s = 0; s < RTE_MAX_MEMSEG; ++s) {
1582                 void *base_addr;
1583
1584                 /*
1585                  * the first memory segment with len==0 is the one that
1586                  * follows the last valid segment.
1587                  */
1588                 if (mcfg->memseg[s].len == 0)
1589                         break;
1590
1591 #ifdef RTE_LIBRTE_IVSHMEM
1592                 /*
1593                  * if segment has ioremap address set, it's an IVSHMEM segment and
1594                  * doesn't need mapping as it was already mapped earlier
1595                  */
1596                 if (mcfg->memseg[s].ioremap_addr != 0)
1597                         continue;
1598 #endif
1599
1600                 /*
1601                  * fd_zero (/dev/zero) is mmapped to reserve a contiguous block
1602                  * of virtual addresses of the appropriate memseg size, at the
1603                  * same addresses that the primary process used.
1604                  */
1605                 base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
1606                                  PROT_READ, MAP_PRIVATE, fd_zero, 0);
1607                 if (base_addr == MAP_FAILED ||
1608                     base_addr != mcfg->memseg[s].addr) {
1609                         RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
1610                                 "in /dev/zero to requested address [%p]: '%s'\n",
1611                                 (unsigned long long)mcfg->memseg[s].len,
1612                                 mcfg->memseg[s].addr, strerror(errno));
1613                         if (aslr_enabled() > 0) {
1614                                 RTE_LOG(ERR, EAL, "It is recommended to "
1615                                         "disable ASLR in the kernel "
1616                                         "and retry running both primary "
1617                                         "and secondary processes\n");
1618                         }
1619                         goto error;
1620                 }
1621         }
1622
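        /* the virtual address range of every memseg is now reserved; read the
         * hugepage table written by the primary process so those reservations
         * can be replaced with the real hugepage mappings below */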
1623         size = getFileSize(fd_hugepage);
1624         hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
1625         if (hp == MAP_FAILED) {
1626                 RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path());
1627                 goto error;
1628         }
1629
1630         num_hp = size / sizeof(struct hugepage_file);
1631         RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);
1632
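        /* second pass over the memsegs: replace each /dev/zero reservation with
         * the actual hugepage files, mapped at the same addresses as in the
         * primary process */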
1633         s = 0;
1634         while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0) {
1635                 void *addr, *base_addr;
1636                 uintptr_t offset = 0;
1637                 size_t mapping_size;
1638 #ifdef RTE_LIBRTE_IVSHMEM
1639                 /*
1640                  * if segment has ioremap address set, it's an IVSHMEM segment and
1641                  * doesn't need mapping as it was already mapped earlier
1642                  */
1643                 if (mcfg->memseg[s].ioremap_addr != 0) {
1644                         s++;
1645                         continue;
1646                 }
1647 #endif
1648                 /*
1649                  * free previously mapped memory so we can map the
1650                  * hugepages into the space
1651                  */
1652                 base_addr = mcfg->memseg[s].addr;
1653                 munmap(base_addr, mcfg->memseg[s].len);
1654
1655                 /* find the hugepages for this segment and map them.
1656                  * we don't need to worry about order, as the primary process
1657                  * sorted the entries before it did the second mmap of them */
1658                 for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++) {
1659                         if (hp[i].memseg_id == (int)s) {
1660                                 fd = open(hp[i].filepath, O_RDWR);
1661                                 if (fd < 0) {
1662                                         RTE_LOG(ERR, EAL, "Could not open %s\n",
1663                                                 hp[i].filepath);
1664                                         goto error;
1665                                 }
1666 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
1667                                 mapping_size = hp[i].size * hp[i].repeated;
1668 #else
1669                                 mapping_size = hp[i].size;
1670 #endif
1671                                 addr = mmap(RTE_PTR_ADD(base_addr, offset),
1672                                                 mapping_size, PROT_READ | PROT_WRITE,
1673                                                 MAP_SHARED, fd, 0);
1674                                 close(fd); /* close file both on success and on failure */
1675                                 if (addr == MAP_FAILED ||
1676                                                 addr != RTE_PTR_ADD(base_addr, offset)) {
1677                                         RTE_LOG(ERR, EAL, "Could not mmap %s\n",
1678                                                 hp[i].filepath);
1679                                         goto error;
1680                                 }
1681                                 offset += mapping_size;
1682                         }
1683                 }
1684                 RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
1685                                 (unsigned long long)mcfg->memseg[s].len);
1686                 s++;
1687         }
1688         /* unmap the hugepage config file, since we are done using it */
1689         munmap(hp, size);
1690         close(fd_zero);
1691         close(fd_hugepage);
1692         return 0;
1693
1694 error:
1695         s = 0;
1696         while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0) {
1697                 munmap(mcfg->memseg[s].addr, mcfg->memseg[s].len);
1698                 s++;
1699         }
1700         if (hp != NULL && hp != MAP_FAILED)
1701                 munmap(hp, size);
1702         if (fd_zero >= 0)
1703                 close(fd_zero);
1704         if (fd_hugepage >= 0)
1705                 close(fd_hugepage);
1706         return -1;
1707 }