/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef RTE_LIBRTE_IVSHMEM /* hide it from coverage */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <inttypes.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <sys/mman.h>
#include <sys/file.h>
#include <string.h>
#include <sys/queue.h>

#include <rte_log.h>
#include <rte_pci.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_string_fns.h>
#include <rte_errno.h>
#include <rte_ring.h>
#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_common.h>
#include <rte_ivshmem.h>

#include "eal_internal_cfg.h"
#include "eal_private.h"

#define PCI_VENDOR_ID_IVSHMEM 0x1Af4
#define PCI_DEVICE_ID_IVSHMEM 0x1110

#define IVSHMEM_MAGIC 0x0BADC0DE

#define IVSHMEM_RESOURCE_PATH "/sys/bus/pci/devices/%04x:%02x:%02x.%x/resource2"
#define IVSHMEM_CONFIG_PATH "/var/run/.%s_ivshmem_config"

/* flag bits describing the address spaces (physical, virtual, ioremap) in
 * which two memory zones overlap or are adjacent */
#define PHYS 0x1
#define VIRT 0x2
#define IOREMAP 0x4
#define FULL (PHYS|VIRT|IOREMAP)

#define METADATA_SIZE_ALIGNED \
        (RTE_ALIGN_CEIL(sizeof(struct rte_ivshmem_metadata), pagesz))

#define CONTAINS(x,y)\
        (((y).addr_64 >= (x).addr_64) && ((y).addr_64 < (x).addr_64 + (x).len))

#define DIM(x) (sizeof(x)/sizeof(x[0]))
struct ivshmem_pci_device {
        char path[PATH_MAX];
        phys_addr_t ioremap_addr;
};

/* data type to store in config */
struct ivshmem_segment {
        struct rte_ivshmem_metadata_entry entry;
        uint64_t align;
        char path[PATH_MAX];
};

struct ivshmem_shared_config {
        struct ivshmem_segment segment[RTE_MAX_MEMSEG];
        uint32_t segment_idx;
        struct ivshmem_pci_device pci_devs[RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS];
        uint32_t pci_devs_idx;
};

static struct ivshmem_shared_config * ivshmem_config;
static int memseg_idx;
static int pagesz;

/* Tailq heads to add rings to */
TAILQ_HEAD(rte_ring_list, rte_tailq_entry);

/*
 * Utility functions
 */

static int
is_ivshmem_device(struct rte_pci_device * dev)
{
        return dev->id.vendor_id == PCI_VENDOR_ID_IVSHMEM
                        && dev->id.device_id == PCI_DEVICE_ID_IVSHMEM;
}

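/* the DPDK metadata structure lives at the very end of the device's BAR2,
 * padded up to a page boundary so that it can be mmap()'d on its own.
 * 'len' is the total length of the BAR, so the metadata starts at offset
 * len - METADATA_SIZE_ALIGNED. */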
static void *
map_metadata(int fd, uint64_t len)
{
        size_t metadata_len = sizeof(struct rte_ivshmem_metadata);
        size_t aligned_len = METADATA_SIZE_ALIGNED;

        return mmap(NULL, metadata_len, PROT_READ | PROT_WRITE,
                        MAP_SHARED, fd, len - aligned_len);
}

static void
unmap_metadata(void * ptr)
{
        munmap(ptr, sizeof(struct rte_ivshmem_metadata));
}

static int
has_ivshmem_metadata(int fd, uint64_t len)
{
        struct rte_ivshmem_metadata metadata;
        void * ptr;

        ptr = map_metadata(fd, len);

        if (ptr == MAP_FAILED)
                return -1;

        metadata = *(struct rte_ivshmem_metadata*) (ptr);

        unmap_metadata(ptr);

        return metadata.magic_number == IVSHMEM_MAGIC;
}

static void
remove_segment(struct ivshmem_segment * ms, int len, int idx)
{
        int i;

        for (i = idx; i < len - 1; i++)
                memcpy(&ms[i], &ms[i+1], sizeof(struct ivshmem_segment));
        memset(&ms[len-1], 0, sizeof(struct ivshmem_segment));
}

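/* overlap() and adjacent() below return a PHYS/VIRT/IOREMAP bitmask telling
 * in which of the three address spaces the two zones touch; FULL means all
 * three, which is the only case in which zones can safely be merged. */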
static int
overlap(const struct rte_memzone * mz1, const struct rte_memzone * mz2)
{
        uint64_t start1, end1, start2, end2;
        uint64_t p_start1, p_end1, p_start2, p_end2;
        uint64_t i_start1, i_end1, i_start2, i_end2;
        int result = 0;

        /* gather virtual addresses */
        start1 = mz1->addr_64;
        end1 = mz1->addr_64 + mz1->len;
        start2 = mz2->addr_64;
        end2 = mz2->addr_64 + mz2->len;

        /* gather physical addresses */
        p_start1 = mz1->phys_addr;
        p_end1 = mz1->phys_addr + mz1->len;
        p_start2 = mz2->phys_addr;
        p_end2 = mz2->phys_addr + mz2->len;

        /* gather ioremap addresses */
        i_start1 = mz1->ioremap_addr;
        i_end1 = mz1->ioremap_addr + mz1->len;
        i_start2 = mz2->ioremap_addr;
        i_end2 = mz2->ioremap_addr + mz2->len;

        /* check for overlap in virtual addresses (note that zones starting
         * at the same address overlap, hence the >= in the second check) */
        if (start1 > start2 && start1 < end2)
                result |= VIRT;
        if (start2 >= start1 && start2 < end1)
                result |= VIRT;

        /* check for overlap in physical addresses */
        if (p_start1 > p_start2 && p_start1 < p_end2)
                result |= PHYS;
        if (p_start2 >= p_start1 && p_start2 < p_end1)
                result |= PHYS;

        /* check for overlap in ioremap addresses */
        if (i_start1 > i_start2 && i_start1 < i_end2)
                result |= IOREMAP;
        if (i_start2 >= i_start1 && i_start2 < i_end1)
                result |= IOREMAP;

        return result;
}

static int
adjacent(const struct rte_memzone * mz1, const struct rte_memzone * mz2)
{
        uint64_t start1, end1, start2, end2;
        uint64_t p_start1, p_end1, p_start2, p_end2;
        uint64_t i_start1, i_end1, i_start2, i_end2;
        int result = 0;

        /* gather virtual addresses */
        start1 = mz1->addr_64;
        end1 = mz1->addr_64 + mz1->len;
        start2 = mz2->addr_64;
        end2 = mz2->addr_64 + mz2->len;

        /* gather physical addresses */
        p_start1 = mz1->phys_addr;
        p_end1 = mz1->phys_addr + mz1->len;
        p_start2 = mz2->phys_addr;
        p_end2 = mz2->phys_addr + mz2->len;

        /* gather ioremap addresses */
        i_start1 = mz1->ioremap_addr;
        i_end1 = mz1->ioremap_addr + mz1->len;
        i_start2 = mz2->ioremap_addr;
        i_end2 = mz2->ioremap_addr + mz2->len;

        /* check if segments are virtually adjacent */
        if (start1 == end2)
                result |= VIRT;
        if (start2 == end1)
                result |= VIRT;

        /* check if segments are physically adjacent */
        if (p_start1 == p_end2)
                result |= PHYS;
        if (p_start2 == p_end1)
                result |= PHYS;

        /* check if segments are ioremap-adjacent */
        if (i_start1 == i_end2)
                result |= IOREMAP;
        if (i_start2 == i_end1)
                result |= IOREMAP;

        return result;
}

static int
has_adjacent_segments(struct ivshmem_segment * ms, int len)
{
        int i, j;

        for (i = 0; i < len; i++)
                for (j = i + 1; j < len; j++) {
                        /* we're only interested in fully adjacent segments; partially
                         * adjacent segments can coexist.
                         */
                        if (adjacent(&ms[i].entry.mz, &ms[j].entry.mz) == FULL)
                                return 1;
                }
        return 0;
}

static int
has_overlapping_segments(struct ivshmem_segment * ms, int len)
{
        int i, j;

        for (i = 0; i < len; i++)
                for (j = i + 1; j < len; j++)
                        if (overlap(&ms[i].entry.mz, &ms[j].entry.mz))
                                return 1;
        return 0;
}

static int
seg_compare(const void * a, const void * b)
{
        const struct ivshmem_segment * s1 = (const struct ivshmem_segment*) a;
        const struct ivshmem_segment * s2 = (const struct ivshmem_segment*) b;

        /* move unallocated zones to the end */
        if (s1->entry.mz.addr == NULL && s2->entry.mz.addr == NULL)
                return 0;
        if (s1->entry.mz.addr == NULL)
                return 1;
        if (s2->entry.mz.addr == NULL)
                return -1;

        /* sort by physical address; qsort() requires a comparator that can
         * also return a negative value */
        if (s1->entry.mz.phys_addr < s2->entry.mz.phys_addr)
                return -1;
        return s1->entry.mz.phys_addr > s2->entry.mz.phys_addr;
}

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
static void
entry_dump(struct rte_ivshmem_metadata_entry *e)
{
        RTE_LOG(DEBUG, EAL, "\tvirt: %p-%p\n", e->mz.addr,
                        RTE_PTR_ADD(e->mz.addr, e->mz.len));
        RTE_LOG(DEBUG, EAL, "\tphys: 0x%" PRIx64 "-0x%" PRIx64 "\n",
                        e->mz.phys_addr,
                        e->mz.phys_addr + e->mz.len);
        RTE_LOG(DEBUG, EAL, "\tio: 0x%" PRIx64 "-0x%" PRIx64 "\n",
                        e->mz.ioremap_addr,
                        e->mz.ioremap_addr + e->mz.len);
        RTE_LOG(DEBUG, EAL, "\tlen: 0x%" PRIx64 "\n", e->mz.len);
        RTE_LOG(DEBUG, EAL, "\toff: 0x%" PRIx64 "\n", e->offset);
}
#endif


/*
 * Actual useful code
 */

/* read through metadata mapped from the IVSHMEM device */
static int
read_metadata(char * path, int path_len, int fd, uint64_t flen)
{
        struct rte_ivshmem_metadata metadata;
        struct rte_ivshmem_metadata_entry * entry;
        int idx, i;
        void * ptr;

        ptr = map_metadata(fd, flen);

        if (ptr == MAP_FAILED)
                return -1;

        metadata = *(struct rte_ivshmem_metadata*) (ptr);

        unmap_metadata(ptr);

        RTE_LOG(DEBUG, EAL, "Parsing metadata for \"%s\"\n", metadata.name);

        idx = ivshmem_config->segment_idx;

        for (i = 0; i < RTE_LIBRTE_IVSHMEM_MAX_ENTRIES &&
                idx <= RTE_MAX_MEMSEG; i++) {

                if (idx == RTE_MAX_MEMSEG) {
                        RTE_LOG(ERR, EAL, "Not enough memory segments!\n");
                        return -1;
                }

                entry = &metadata.entry[i];

                /* stop on uninitialized memzone */
                if (entry->mz.len == 0)
                        break;

                /* copy metadata entry */
                memcpy(&ivshmem_config->segment[idx].entry, entry,
                                sizeof(struct rte_ivshmem_metadata_entry));

                /* copy path */
                snprintf(ivshmem_config->segment[idx].path, path_len, "%s", path);

                idx++;
        }
        ivshmem_config->segment_idx = idx;

        return 0;
}

/* check through each segment and look for adjacent or overlapping ones. */
static int
cleanup_segments(struct ivshmem_segment * ms, int tbl_len)
{
        struct ivshmem_segment * s, * tmp;
        int i, j, concat, seg_adjacent, seg_overlapping;
        uint64_t start1, start2, end1, end2, p_start1, p_start2, i_start1, i_start2;

        qsort(ms, tbl_len, sizeof(struct ivshmem_segment),
                                seg_compare);

        while (has_overlapping_segments(ms, tbl_len) ||
                        has_adjacent_segments(ms, tbl_len)) {

                for (i = 0; i < tbl_len; i++) {
                        s = &ms[i];

                        concat = 0;

                        for (j = i + 1; j < tbl_len; j++) {
                                tmp = &ms[j];

                                /* check if this segment is overlapping with existing segment,
                                 * or is adjacent to existing segment */
                                seg_overlapping = overlap(&s->entry.mz, &tmp->entry.mz);
                                seg_adjacent = adjacent(&s->entry.mz, &tmp->entry.mz);

                                /* check if segments fully overlap or are fully adjacent */
                                if ((seg_adjacent == FULL) || (seg_overlapping == FULL)) {

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
                                        RTE_LOG(DEBUG, EAL, "Concatenating segments\n");
                                        RTE_LOG(DEBUG, EAL, "Segment %i:\n", i);
                                        entry_dump(&s->entry);
                                        RTE_LOG(DEBUG, EAL, "Segment %i:\n", j);
                                        entry_dump(&tmp->entry);
#endif

                                        start1 = s->entry.mz.addr_64;
                                        start2 = tmp->entry.mz.addr_64;
                                        p_start1 = s->entry.mz.phys_addr;
                                        p_start2 = tmp->entry.mz.phys_addr;
                                        i_start1 = s->entry.mz.ioremap_addr;
                                        i_start2 = tmp->entry.mz.ioremap_addr;
                                        end1 = s->entry.mz.addr_64 + s->entry.mz.len;
                                        end2 = tmp->entry.mz.addr_64 + tmp->entry.mz.len;

                                        /* settle for minimum start address and maximum length */
                                        s->entry.mz.addr_64 = RTE_MIN(start1, start2);
                                        s->entry.mz.phys_addr = RTE_MIN(p_start1, p_start2);
                                        s->entry.mz.ioremap_addr = RTE_MIN(i_start1, i_start2);
                                        s->entry.offset = RTE_MIN(s->entry.offset, tmp->entry.offset);
                                        s->entry.mz.len = RTE_MAX(end1, end2) - s->entry.mz.addr_64;
                                        concat = 1;

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
                                        RTE_LOG(DEBUG, EAL, "Resulting segment:\n");
                                        entry_dump(&s->entry);
#endif
                                }
                                /* segments that overlap only partially cannot be
                                 * concatenated, so that is an error condition.
                                 * partially adjacent segments, however, can coexist.
                                 */
                                else if (seg_overlapping > 0) {
                                        RTE_LOG(ERR, EAL, "Segments %i and %i overlap!\n", i, j);
#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
                                        RTE_LOG(DEBUG, EAL, "Segment %i:\n", i);
                                        entry_dump(&s->entry);
                                        RTE_LOG(DEBUG, EAL, "Segment %i:\n", j);
                                        entry_dump(&tmp->entry);
#endif
                                        return -1;
                                }
                                if (concat)
                                        break;
                        }
                        /* if we concatenated, remove segment at j */
                        if (concat) {
                                remove_segment(ms, tbl_len, j);
                                tbl_len--;
                                break;
                        }
                }
        }

        return tbl_len;
}
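
/* the shared config uses flock() as a liveness protocol: the primary process
 * takes an exclusive lock while creating the file, then downgrades it to a
 * shared lock. a config file that can still be exclusively locked therefore
 * belongs to no running process and is treated as stray. */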
static int
create_shared_config(void)
{
        char path[PATH_MAX];
        int fd;

        /* build ivshmem config file path */
        snprintf(path, sizeof(path), IVSHMEM_CONFIG_PATH,
                        internal_config.hugefile_prefix);

        fd = open(path, O_CREAT | O_RDWR, 0600);

        if (fd < 0) {
                RTE_LOG(ERR, EAL, "Could not open %s: %s\n", path, strerror(errno));
                return -1;
        }

        /* try ex-locking first - if the file is locked, we have a problem */
        if (flock(fd, LOCK_EX | LOCK_NB) == -1) {
                RTE_LOG(ERR, EAL, "Locking %s failed: %s\n", path, strerror(errno));
                close(fd);
                return -1;
        }

        if (ftruncate(fd, sizeof(struct ivshmem_shared_config)) < 0) {
                RTE_LOG(ERR, EAL, "ftruncate failed: %s\n", strerror(errno));
                close(fd);
                return -1;
        }

        ivshmem_config = mmap(NULL, sizeof(struct ivshmem_shared_config),
                        PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

        if (ivshmem_config == MAP_FAILED) {
                close(fd);
                return -1;
        }

        memset(ivshmem_config, 0, sizeof(struct ivshmem_shared_config));

        /* change the exclusive lock we got earlier to a shared lock */
        if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
                RTE_LOG(ERR, EAL, "Locking %s failed: %s\n", path, strerror(errno));
                close(fd);
                return -1;
        }

        /* deliberately keep fd open - closing it would release the flock and
         * make the config file look stray to other processes */
        return 0;
}

/* open shared config file and, if present, map the config.
 * having no config file is not an error condition, as we later check if
 * ivshmem_config is NULL (if it is, that means nothing was mapped). */
static int
open_shared_config(void)
{
        char path[PATH_MAX];
        int fd;

        /* build ivshmem config file path */
        snprintf(path, sizeof(path), IVSHMEM_CONFIG_PATH,
                        internal_config.hugefile_prefix);

        fd = open(path, O_RDONLY);

        /* if the file doesn't exist, just return success */
        if (fd < 0 && errno == ENOENT)
                return 0;
        /* else we have an error condition */
        else if (fd < 0) {
                RTE_LOG(ERR, EAL, "Could not open %s: %s\n",
                                path, strerror(errno));
                return -1;
        }

        /* try ex-locking first - if the lock *does* succeed, this means it's a
         * stray config file, so it should be deleted.
         */
        if (flock(fd, LOCK_EX | LOCK_NB) != -1) {

                /* if we can't remove the file, something is wrong */
                if (unlink(path) < 0) {
                        RTE_LOG(ERR, EAL, "Could not remove %s: %s\n", path,
                                        strerror(errno));
                        close(fd);
                        return -1;
                }

                /* release the lock */
                flock(fd, LOCK_UN);
                close(fd);

                /* return success as having a stray config file is equivalent to not
                 * having config file at all.
                 */
                return 0;
        }

        ivshmem_config = mmap(NULL, sizeof(struct ivshmem_shared_config),
                        PROT_READ, MAP_SHARED, fd, 0);

        if (ivshmem_config == MAP_FAILED) {
                close(fd);
                return -1;
        }

        /* place a shared lock on config file */
        if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
                RTE_LOG(ERR, EAL, "Locking %s failed: %s\n", path, strerror(errno));
                close(fd);
                return -1;
        }

        /* keep fd open so the shared lock is held while we are running */
        return 0;
}

/*
 * This function does the following:
 *
 * 1) Builds a table of ivshmem_segments with proper offset alignment
 * 2) Cleans up that table so that we don't have any overlapping or adjacent
 *    memory segments
 * 3) Creates memsegs from this table and maps them into memory.
 */
static inline int
map_all_segments(void)
{
        struct ivshmem_segment ms_tbl[RTE_MAX_MEMSEG];
        struct ivshmem_pci_device * pci_dev;
        struct rte_mem_config * mcfg;
        struct ivshmem_segment * seg;
        int fd, fd_zero;
        unsigned i, j;
        struct rte_memzone mz;
        struct rte_memseg ms;
        void * base_addr;
        uint64_t align, len;
        phys_addr_t ioremap_addr;

        memset(ms_tbl, 0, sizeof(ms_tbl));
        memset(&mz, 0, sizeof(struct rte_memzone));
        memset(&ms, 0, sizeof(struct rte_memseg));

        /* first, build a table of memsegs to map, to avoid failed mmaps due to
         * overlaps
         */
        for (i = 0; i < ivshmem_config->segment_idx && i <= RTE_MAX_MEMSEG; i++) {
                if (i == RTE_MAX_MEMSEG) {
                        RTE_LOG(ERR, EAL, "Too many segments requested!\n");
                        return -1;
                }

                seg = &ivshmem_config->segment[i];

                /* copy segment to table */
                memcpy(&ms_tbl[i], seg, sizeof(struct ivshmem_segment));

                /* find ioremap addr of the PCI device this segment came from.
                 * reset it on every iteration so that a failed lookup is not
                 * masked by a value left over from the previous segment. */
                ioremap_addr = 0;
                for (j = 0; j < DIM(ivshmem_config->pci_devs); j++) {
                        pci_dev = &ivshmem_config->pci_devs[j];
                        if (!strncmp(pci_dev->path, seg->path, sizeof(pci_dev->path))) {
                                ioremap_addr = pci_dev->ioremap_addr;
                                break;
                        }
                }
                if (ioremap_addr == 0) {
                        RTE_LOG(ERR, EAL, "Cannot find ioremap addr!\n");
                        return -1;
                }

                /* work out alignments */
                align = seg->entry.mz.addr_64 -
                                RTE_ALIGN_FLOOR(seg->entry.mz.addr_64, 0x1000);
                len = RTE_ALIGN_CEIL(seg->entry.mz.len + align, 0x1000);

                /* save original alignments */
                ms_tbl[i].align = align;

                /* create a memory zone */
                mz.addr_64 = seg->entry.mz.addr_64 - align;
                mz.len = len;
                mz.hugepage_sz = seg->entry.mz.hugepage_sz;
                mz.phys_addr = seg->entry.mz.phys_addr - align;

                /* work out the host physical (ioremap) address of the zone */
                mz.ioremap_addr = ioremap_addr + seg->entry.offset - align;

                ms_tbl[i].entry.offset = seg->entry.offset - align;

                memcpy(&ms_tbl[i].entry.mz, &mz, sizeof(struct rte_memzone));
        }

        /* clean up the segments */
        memseg_idx = cleanup_segments(ms_tbl, ivshmem_config->segment_idx);

        if (memseg_idx < 0)
                return -1;

        mcfg = rte_eal_get_configuration()->mem_config;

        fd_zero = open("/dev/zero", O_RDWR);

        if (fd_zero < 0) {
                RTE_LOG(ERR, EAL, "Cannot open /dev/zero: %s\n", strerror(errno));
                return -1;
        }

        /* create memsegs and put them into DPDK memory */
        for (i = 0; i < (unsigned) memseg_idx; i++) {

                seg = &ms_tbl[i];

                ms.addr_64 = seg->entry.mz.addr_64;
                ms.hugepage_sz = seg->entry.mz.hugepage_sz;
                ms.len = seg->entry.mz.len;
                ms.nchannel = rte_memory_get_nchannel();
                ms.nrank = rte_memory_get_nrank();
                ms.phys_addr = seg->entry.mz.phys_addr;
                ms.ioremap_addr = seg->entry.mz.ioremap_addr;
                ms.socket_id = seg->entry.mz.socket_id;

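                /* first claim the virtual address range by mapping /dev/zero
                 * at the exact address we need; only if that succeeds is the
                 * anonymous mapping replaced with the real BAR file mapping */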
                base_addr = mmap(ms.addr, ms.len,
                                PROT_READ | PROT_WRITE, MAP_PRIVATE, fd_zero, 0);

                if (base_addr == MAP_FAILED || base_addr != ms.addr) {
                        RTE_LOG(ERR, EAL, "Cannot map /dev/zero!\n");
                        close(fd_zero);
                        return -1;
                }

                fd = open(seg->path, O_RDWR);

                if (fd < 0) {
                        RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", seg->path,
                                        strerror(errno));
                        close(fd_zero);
                        return -1;
                }

                munmap(ms.addr, ms.len);

                base_addr = mmap(ms.addr, ms.len,
                                PROT_READ | PROT_WRITE, MAP_SHARED, fd,
                                seg->entry.offset);

                if (base_addr == MAP_FAILED || base_addr != ms.addr) {
                        RTE_LOG(ERR, EAL, "Cannot map segment into memory: "
                                        "expected %p got %p (%s)\n", ms.addr, base_addr,
                                        strerror(errno));
                        close(fd);
                        close(fd_zero);
                        return -1;
                }

                RTE_LOG(DEBUG, EAL, "Memory segment mapped: %p (len %" PRIx64 ") at "
                                "offset 0x%" PRIx64 "\n",
                                ms.addr, ms.len, seg->entry.offset);

                /* put the pointers back into their real positions using original
                 * alignment */
                ms.addr_64 += seg->align;
                ms.phys_addr += seg->align;
                ms.ioremap_addr += seg->align;
                ms.len -= seg->align;

                /* at this point, the rest of DPDK memory is not initialized, so we
                 * expect memsegs to be empty */
                memcpy(&mcfg->memseg[i], &ms,
                                sizeof(struct rte_memseg));

                close(fd);

                RTE_LOG(DEBUG, EAL, "IVSHMEM segment found, size: 0x%" PRIx64 "\n",
                                ms.len);
        }

        close(fd_zero);

        return 0;
}

/* this happens at a later stage, after general EAL memory initialization */
int
rte_eal_ivshmem_obj_init(void)
{
        struct rte_ring_list* ring_list = NULL;
        struct rte_mem_config * mcfg;
        struct ivshmem_segment * seg;
        struct rte_memzone * mz;
        struct rte_ring * r;
        struct rte_tailq_entry *te;
        unsigned i, ms, idx;
        uint64_t offset;

        /* secondary process would not need any object discovery - it'll all
         * already be in shared config */
        if (rte_eal_process_type() != RTE_PROC_PRIMARY || ivshmem_config == NULL)
                return 0;

        /* check that we have an initialised ring tail queue */
        ring_list = RTE_TAILQ_LOOKUP(RTE_TAILQ_RING_NAME, rte_ring_list);
        if (ring_list == NULL) {
                RTE_LOG(ERR, EAL, "No rte_ring tailq found!\n");
                return -1;
        }

        mcfg = rte_eal_get_configuration()->mem_config;

        /* create memzones */
        for (i = 0; i < ivshmem_config->segment_idx && i <= RTE_MAX_MEMZONE; i++) {

                seg = &ivshmem_config->segment[i];

                /* add memzone */
                if (mcfg->memzone_cnt == RTE_MAX_MEMZONE) {
                        RTE_LOG(ERR, EAL, "No more memory zones available!\n");
                        return -1;
                }

                idx = mcfg->memzone_cnt;

                RTE_LOG(DEBUG, EAL, "Found memzone: '%s' at %p (len 0x%" PRIx64 ")\n",
                                seg->entry.mz.name, seg->entry.mz.addr, seg->entry.mz.len);

                memcpy(&mcfg->memzone[idx], &seg->entry.mz,
                                sizeof(struct rte_memzone));

                /* find ioremap address */
                for (ms = 0; ms <= RTE_MAX_MEMSEG; ms++) {
                        if (ms == RTE_MAX_MEMSEG) {
                                RTE_LOG(ERR, EAL, "Physical address of segment not found!\n");
                                return -1;
                        }
                        if (CONTAINS(mcfg->memseg[ms], mcfg->memzone[idx])) {
                                offset = mcfg->memzone[idx].addr_64 -
                                                                mcfg->memseg[ms].addr_64;
                                mcfg->memzone[idx].ioremap_addr = mcfg->memseg[ms].ioremap_addr +
                                                offset;
                                break;
                        }
                }

                mcfg->memzone_cnt++;
        }

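        /* any shared memzone whose name carries the rte_ring prefix is a ring
         * created by the application that shared this memory; register it in
         * the local ring tailq so that rte_ring_lookup() can find it by name */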
        rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);

        /* find rings */
        for (i = 0; i < mcfg->memzone_cnt; i++) {
                mz = &mcfg->memzone[i];

                /* check if memzone has a ring prefix */
                if (strncmp(mz->name, RTE_RING_MZ_PREFIX,
                                sizeof(RTE_RING_MZ_PREFIX) - 1) != 0)
                        continue;

                r = (struct rte_ring*) (mz->addr_64);

                te = rte_zmalloc("RING_TAILQ_ENTRY", sizeof(*te), 0);
                if (te == NULL) {
                        RTE_LOG(ERR, EAL, "Cannot allocate ring tailq entry!\n");
                        rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
                        return -1;
                }

                te->data = (void *) r;

                TAILQ_INSERT_TAIL(ring_list, te, next);

                RTE_LOG(DEBUG, EAL, "Found ring: '%s' at %p\n", r->name, mz->addr);
        }
        rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
        rte_memzone_dump(stdout);
        rte_ring_list_dump(stdout);
#endif

        return 0;
}

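/* entry point: a primary process scans the PCI bus for ivshmem devices and
 * builds the shared config; a secondary process simply attaches to the
 * config created by the primary. */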
/* initialize ivshmem structures */
int
rte_eal_ivshmem_init(void)
{
        struct rte_pci_device * dev;
        struct rte_pci_resource * res;
        int fd, ret;
        char path[PATH_MAX];

        /* initialize everything to 0 */
        memset(path, 0, sizeof(path));
        ivshmem_config = NULL;

        pagesz = getpagesize();

        RTE_LOG(DEBUG, EAL, "Searching for IVSHMEM devices...\n");

        if (rte_eal_process_type() == RTE_PROC_SECONDARY) {

                if (open_shared_config() < 0) {
                        RTE_LOG(ERR, EAL, "Could not open IVSHMEM config!\n");
                        return -1;
                }
        }
        else {

                TAILQ_FOREACH(dev, &pci_device_list, next) {

                        if (is_ivshmem_device(dev)) {

                                /* IVSHMEM memory is always on BAR2 */
                                res = &dev->mem_resource[2];

                                /* if we don't have a BAR2 */
                                if (res->len == 0)
                                        continue;

                                /* construct pci device path */
                                snprintf(path, sizeof(path), IVSHMEM_RESOURCE_PATH,
                                                dev->addr.domain, dev->addr.bus, dev->addr.devid,
                                                dev->addr.function);

                                /* open the BAR2 resource file */
                                fd = open(path, O_RDWR);
                                if (fd < 0) {
                                        RTE_LOG(ERR, EAL, "Could not open %s\n", path);
                                        return -1;
                                }

                                /* check if it's a DPDK IVSHMEM device */
                                ret = has_ivshmem_metadata(fd, res->len);

                                /* is DPDK device */
                                if (ret == 1) {

                                        /* config file creation is deferred until the first
                                         * DPDK device is found. then, it has to be created
                                         * only once. */
                                        if (ivshmem_config == NULL &&
                                                        create_shared_config() < 0) {
                                                RTE_LOG(ERR, EAL, "Could not create IVSHMEM config!\n");
                                                close(fd);
                                                return -1;
                                        }

                                        if (read_metadata(path, sizeof(path), fd, res->len) < 0) {
                                                RTE_LOG(ERR, EAL, "Could not read metadata from"
                                                                " device %02x:%02x.%x!\n", dev->addr.bus,
                                                                dev->addr.devid, dev->addr.function);
                                                close(fd);
                                                return -1;
                                        }

                                        if (ivshmem_config->pci_devs_idx == RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS) {
                                                RTE_LOG(WARNING, EAL,
                                                                "IVSHMEM PCI device limit exceeded. Increase "
                                                                "CONFIG_RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS in "
                                                                "your config file.\n");
                                                close(fd);
                                                break;
                                        }

                                        RTE_LOG(INFO, EAL, "Found IVSHMEM device %02x:%02x.%x\n",
                                                        dev->addr.bus, dev->addr.devid, dev->addr.function);

                                        ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].ioremap_addr = res->phys_addr;
                                        snprintf(ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].path,
                                                        sizeof(ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].path),
                                                        "%s", path);

                                        ivshmem_config->pci_devs_idx++;
                                }
                                /* failed to read */
                                else if (ret < 0) {
                                        RTE_LOG(ERR, EAL, "Could not read IVSHMEM device: %s\n",
                                                        strerror(errno));
                                        close(fd);
                                        return -1;
                                }
                                /* not a DPDK device */
                                else
                                        RTE_LOG(DEBUG, EAL, "Skipping non-DPDK IVSHMEM device\n");

                                /* close the BAR fd */
                                close(fd);
                        }
                }
        }

        /* ivshmem_config is not NULL only if config was created and/or mapped */
        if (ivshmem_config) {
                if (map_all_segments() < 0) {
                        RTE_LOG(ERR, EAL, "Mapping IVSHMEM segments failed!\n");
                        return -1;
                }
        }
        else {
                RTE_LOG(DEBUG, EAL, "No IVSHMEM configuration found!\n");
        }

        return 0;
}

#endif /* RTE_LIBRTE_IVSHMEM */