/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef RTE_LIBRTE_IVSHMEM /* hide it from coverage */

#include <stdint.h>
#include <unistd.h>
#include <inttypes.h>
#include <fcntl.h>	/* open(), O_* flags */
#include <limits.h>	/* PATH_MAX */
#include <errno.h>
#include <sys/mman.h>
#include <sys/file.h>
#include <string.h>
#include <sys/queue.h>

#include <rte_log.h>
#include <rte_pci.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_string_fns.h>
#include <rte_errno.h>
#include <rte_ring.h>
#include <rte_malloc.h>
#include <rte_common.h>
#include <rte_ivshmem.h>

#include "eal_internal_cfg.h"
#include "eal_private.h"

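/*
 * This file discovers ivshmem PCI devices whose BAR2 carries DPDK metadata
 * and maps the memory they describe into the local DPDK memory layout.
 *
 * Illustrative host-side counterpart (a sketch only, not part of this file):
 * the host application describes its shared objects with the rte_ivshmem API
 * and generates the QEMU command line that exposes them to the guest,
 * roughly:
 *
 *	rte_ivshmem_metadata_create("md1");
 *	rte_ivshmem_metadata_add_ring(ring, "md1");
 *	rte_ivshmem_metadata_cmdline_generate(cmdline, sizeof(cmdline), "md1");
 *
 * The guest EAL then finds the resulting ivshmem device below and parses
 * its metadata.
 */
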
#define PCI_VENDOR_ID_IVSHMEM 0x1AF4
#define PCI_DEVICE_ID_IVSHMEM 0x1110

#define IVSHMEM_MAGIC 0x0BADC0DE

#define IVSHMEM_RESOURCE_PATH "/sys/bus/pci/devices/%04x:%02x:%02x.%x/resource2"
#define IVSHMEM_CONFIG_PATH "/var/run/.%s_ivshmem_config"

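/* bit flags marking in which of the three address spaces (virtual, physical,
 * ioremap'd PCI) two zones overlap or are adjacent */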
#define PHYS 0x1
#define VIRT 0x2
#define IOREMAP 0x4
#define FULL (PHYS|VIRT|IOREMAP)

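/* size of the metadata area, rounded up to a multiple of the page size;
 * the metadata always occupies the tail end of the device's BAR2 */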
#define METADATA_SIZE_ALIGNED \
		(RTE_ALIGN_CEIL(sizeof(struct rte_ivshmem_metadata), pagesz))

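/* true if zone y's virtual address range lies entirely inside zone x's */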
#define CONTAINS(x, y) \
		(((y).addr_64 >= (x).addr_64) && ((y).addr_64 < (x).addr_64 + (x).len))

#define DIM(x) (sizeof(x)/sizeof(x[0]))

struct ivshmem_pci_device {
	char path[PATH_MAX];
	phys_addr_t ioremap_addr;
};

/* data type to store in config */
struct ivshmem_segment {
	struct rte_ivshmem_metadata_entry entry;
	uint64_t align;
	char path[PATH_MAX];
};
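
/* layout of the config file shared between DPDK processes: the primary
 * process creates and fills it, secondary processes map it read-only */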
struct ivshmem_shared_config {
	struct ivshmem_segment segment[RTE_MAX_MEMSEG];
	uint32_t segment_idx;
	struct ivshmem_pci_device pci_devs[RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS];
	uint32_t pci_devs_idx;
};
static struct ivshmem_shared_config * ivshmem_config;
static int memseg_idx;
static int pagesz;

/* Tailq heads to add rings to */
TAILQ_HEAD(rte_ring_list, rte_tailq_entry);

/*
 * Utility functions
 */

static int
is_ivshmem_device(struct rte_pci_device * dev)
{
	return dev->id.vendor_id == PCI_VENDOR_ID_IVSHMEM
			&& dev->id.device_id == PCI_DEVICE_ID_IVSHMEM;
}

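/* map only the metadata area, which sits at the very end of the device BAR;
 * "len" is the total length of the BAR */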
static void *
map_metadata(int fd, uint64_t len)
{
	size_t metadata_len = sizeof(struct rte_ivshmem_metadata);
	size_t aligned_len = METADATA_SIZE_ALIGNED;

	return mmap(NULL, metadata_len, PROT_READ | PROT_WRITE,
			MAP_SHARED, fd, len - aligned_len);
}

static void
unmap_metadata(void * ptr)
{
	munmap(ptr, sizeof(struct rte_ivshmem_metadata));
}

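/* check if the device's metadata area carries the DPDK magic number.
 * returns 1 if it does, 0 if it does not, -1 on mapping failure. */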
static int
has_ivshmem_metadata(int fd, uint64_t len)
{
	struct rte_ivshmem_metadata metadata;
	void * ptr;

	ptr = map_metadata(fd, len);

	if (ptr == MAP_FAILED)
		return -1;

	metadata = *(struct rte_ivshmem_metadata*) (ptr);

	unmap_metadata(ptr);

	return metadata.magic_number == IVSHMEM_MAGIC;
}

static void
remove_segment(struct ivshmem_segment * ms, int len, int idx)
{
	int i;

	for (i = idx; i < len - 1; i++)
		memcpy(&ms[i], &ms[i+1], sizeof(struct ivshmem_segment));
	memset(&ms[len-1], 0, sizeof(struct ivshmem_segment));
}

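/* check two memzones for overlap in each of the three address spaces;
 * returns a bitmask of VIRT/PHYS/IOREMAP flags */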
static int
overlap(const struct rte_memzone * mz1, const struct rte_memzone * mz2)
{
	uint64_t start1, end1, start2, end2;
	uint64_t p_start1, p_end1, p_start2, p_end2;
	uint64_t i_start1, i_end1, i_start2, i_end2;
	int result = 0;

	/* gather virtual addresses */
	start1 = mz1->addr_64;
	end1 = mz1->addr_64 + mz1->len;
	start2 = mz2->addr_64;
	end2 = mz2->addr_64 + mz2->len;

	/* gather physical addresses */
	p_start1 = mz1->phys_addr;
	p_end1 = mz1->phys_addr + mz1->len;
	p_start2 = mz2->phys_addr;
	p_end2 = mz2->phys_addr + mz2->len;

	/* gather ioremap addresses */
	i_start1 = mz1->ioremap_addr;
	i_end1 = mz1->ioremap_addr + mz1->len;
	i_start2 = mz2->ioremap_addr;
	i_end2 = mz2->ioremap_addr + mz2->len;

	/* check for overlap in virtual addresses */
	if (start1 >= start2 && start1 < end2)
		result |= VIRT;
	if (start2 >= start1 && start2 < end1)
		result |= VIRT;

	/* check for overlap in physical addresses */
	if (p_start1 >= p_start2 && p_start1 < p_end2)
		result |= PHYS;
	if (p_start2 >= p_start1 && p_start2 < p_end1)
		result |= PHYS;

	/* check for overlap in ioremap addresses */
	if (i_start1 >= i_start2 && i_start1 < i_end2)
		result |= IOREMAP;
	if (i_start2 >= i_start1 && i_start2 < i_end1)
		result |= IOREMAP;

	return result;
}

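/* check whether two memzones touch end-to-start in each of the three address
 * spaces; returns a bitmask of VIRT/PHYS/IOREMAP flags */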
static int
adjacent(const struct rte_memzone * mz1, const struct rte_memzone * mz2)
{
	uint64_t start1, end1, start2, end2;
	uint64_t p_start1, p_end1, p_start2, p_end2;
	uint64_t i_start1, i_end1, i_start2, i_end2;
	int result = 0;

	/* gather virtual addresses */
	start1 = mz1->addr_64;
	end1 = mz1->addr_64 + mz1->len;
	start2 = mz2->addr_64;
	end2 = mz2->addr_64 + mz2->len;

	/* gather physical addresses */
	p_start1 = mz1->phys_addr;
	p_end1 = mz1->phys_addr + mz1->len;
	p_start2 = mz2->phys_addr;
	p_end2 = mz2->phys_addr + mz2->len;

	/* gather ioremap addresses */
	i_start1 = mz1->ioremap_addr;
	i_end1 = mz1->ioremap_addr + mz1->len;
	i_start2 = mz2->ioremap_addr;
	i_end2 = mz2->ioremap_addr + mz2->len;

	/* check if segments are virtually adjacent */
	if (start1 == end2)
		result |= VIRT;
	if (start2 == end1)
		result |= VIRT;

	/* check if segments are physically adjacent */
	if (p_start1 == p_end2)
		result |= PHYS;
	if (p_start2 == p_end1)
		result |= PHYS;

	/* check if segments are ioremap-adjacent */
	if (i_start1 == i_end2)
		result |= IOREMAP;
	if (i_start2 == i_end1)
		result |= IOREMAP;

	return result;
}

static int
has_adjacent_segments(struct ivshmem_segment * ms, int len)
{
	int i, j;

	for (i = 0; i < len; i++)
		for (j = i + 1; j < len; j++) {
			/* we're only interested in fully adjacent segments; partially
			 * adjacent segments can coexist.
			 */
			if (adjacent(&ms[i].entry.mz, &ms[j].entry.mz) == FULL)
				return 1;
		}
	return 0;
}

static int
has_overlapping_segments(struct ivshmem_segment * ms, int len)
{
	int i, j;

	for (i = 0; i < len; i++)
		for (j = i + 1; j < len; j++)
			if (overlap(&ms[i].entry.mz, &ms[j].entry.mz))
				return 1;
	return 0;
}

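/* qsort() comparator: order segments by physical address, pushing
 * unallocated zones to the end of the table */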
static int
seg_compare(const void * a, const void * b)
{
	const struct ivshmem_segment * s1 = (const struct ivshmem_segment*) a;
	const struct ivshmem_segment * s2 = (const struct ivshmem_segment*) b;

	/* move unallocated zones to the end */
	if (s1->entry.mz.addr == NULL && s2->entry.mz.addr == NULL)
		return 0;
	if (s1->entry.mz.addr == NULL)
		return 1;
	if (s2->entry.mz.addr == NULL)
		return -1;

	/* qsort() requires a comparator that is consistently negative, zero or
	 * positive, so the less-than case must be reported as well */
	if (s1->entry.mz.phys_addr < s2->entry.mz.phys_addr)
		return -1;
	return s1->entry.mz.phys_addr > s2->entry.mz.phys_addr;
}

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
static void
entry_dump(struct rte_ivshmem_metadata_entry *e)
{
	RTE_LOG(DEBUG, EAL, "\tvirt: %p-%p\n", e->mz.addr,
			RTE_PTR_ADD(e->mz.addr, e->mz.len));
	RTE_LOG(DEBUG, EAL, "\tphys: 0x%" PRIx64 "-0x%" PRIx64 "\n",
			e->mz.phys_addr,
			e->mz.phys_addr + e->mz.len);
	RTE_LOG(DEBUG, EAL, "\tio: 0x%" PRIx64 "-0x%" PRIx64 "\n",
			e->mz.ioremap_addr,
			e->mz.ioremap_addr + e->mz.len);
	RTE_LOG(DEBUG, EAL, "\tlen: 0x%" PRIx64 "\n", e->mz.len);
	RTE_LOG(DEBUG, EAL, "\toff: 0x%" PRIx64 "\n", e->offset);
}
#endif

/*
 * Actual useful code
 */

/* read through metadata mapped from the IVSHMEM device */
static int
read_metadata(char * path, int path_len, int fd, uint64_t flen)
{
	struct rte_ivshmem_metadata metadata;
	struct rte_ivshmem_metadata_entry * entry;
	int idx, i;
	void * ptr;

	ptr = map_metadata(fd, flen);

	if (ptr == MAP_FAILED)
		return -1;

	metadata = *(struct rte_ivshmem_metadata*) (ptr);

	unmap_metadata(ptr);

	RTE_LOG(DEBUG, EAL, "Parsing metadata for \"%s\"\n", metadata.name);

	idx = ivshmem_config->segment_idx;

	for (i = 0; i < RTE_LIBRTE_IVSHMEM_MAX_ENTRIES &&
		idx <= RTE_MAX_MEMSEG; i++) {

		if (idx == RTE_MAX_MEMSEG) {
			RTE_LOG(ERR, EAL, "Not enough memory segments!\n");
			return -1;
		}

		entry = &metadata.entry[i];

		/* stop on uninitialized memzone */
		if (entry->mz.len == 0)
			break;

		/* copy metadata entry */
		memcpy(&ivshmem_config->segment[idx].entry, entry,
				sizeof(struct rte_ivshmem_metadata_entry));

		/* copy path */
		snprintf(ivshmem_config->segment[idx].path, path_len, "%s", path);

		idx++;
	}
	ivshmem_config->segment_idx = idx;

	return 0;
}

/* go through each segment, merging those that are fully adjacent or fully
 * overlapping; partially overlapping segments are an error condition.
 * returns the resulting table length, or -1 on error. */
static int
cleanup_segments(struct ivshmem_segment * ms, int tbl_len)
{
	struct ivshmem_segment * s, * tmp;
	int i, j, concat, seg_adjacent, seg_overlapping;
	uint64_t start1, start2, end1, end2, p_start1, p_start2, i_start1, i_start2;

	qsort(ms, tbl_len, sizeof(struct ivshmem_segment),
				seg_compare);

	while (has_overlapping_segments(ms, tbl_len) ||
			has_adjacent_segments(ms, tbl_len)) {

		for (i = 0; i < tbl_len; i++) {
			s = &ms[i];

			concat = 0;

			for (j = i + 1; j < tbl_len; j++) {
				tmp = &ms[j];

				/* check if this segment is overlapping with existing segment,
				 * or is adjacent to existing segment */
				seg_overlapping = overlap(&s->entry.mz, &tmp->entry.mz);
				seg_adjacent = adjacent(&s->entry.mz, &tmp->entry.mz);

				/* check if segments fully overlap or are fully adjacent */
				if ((seg_adjacent == FULL) || (seg_overlapping == FULL)) {

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
					RTE_LOG(DEBUG, EAL, "Concatenating segments\n");
					RTE_LOG(DEBUG, EAL, "Segment %i:\n", i);
					entry_dump(&s->entry);
					RTE_LOG(DEBUG, EAL, "Segment %i:\n", j);
					entry_dump(&tmp->entry);
#endif

					start1 = s->entry.mz.addr_64;
					start2 = tmp->entry.mz.addr_64;
					p_start1 = s->entry.mz.phys_addr;
					p_start2 = tmp->entry.mz.phys_addr;
					i_start1 = s->entry.mz.ioremap_addr;
					i_start2 = tmp->entry.mz.ioremap_addr;
					end1 = s->entry.mz.addr_64 + s->entry.mz.len;
					end2 = tmp->entry.mz.addr_64 + tmp->entry.mz.len;

					/* settle for minimum start address and maximum length */
					s->entry.mz.addr_64 = RTE_MIN(start1, start2);
					s->entry.mz.phys_addr = RTE_MIN(p_start1, p_start2);
					s->entry.mz.ioremap_addr = RTE_MIN(i_start1, i_start2);
					s->entry.offset = RTE_MIN(s->entry.offset, tmp->entry.offset);
					s->entry.mz.len = RTE_MAX(end1, end2) - s->entry.mz.addr_64;
					concat = 1;

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
					RTE_LOG(DEBUG, EAL, "Resulting segment:\n");
					entry_dump(&s->entry);
#endif
				}
				/* if segments do not fully overlap, we have an error condition.
				 * adjacent segments can coexist.
				 */
				else if (seg_overlapping > 0) {
					RTE_LOG(ERR, EAL, "Segments %i and %i overlap!\n", i, j);
#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
					RTE_LOG(DEBUG, EAL, "Segment %i:\n", i);
					entry_dump(&s->entry);
					RTE_LOG(DEBUG, EAL, "Segment %i:\n", j);
					entry_dump(&tmp->entry);
#endif
					return -1;
				}
				if (concat)
					break;
			}
			/* if we concatenated, remove segment at j */
			if (concat) {
				remove_segment(ms, tbl_len, j);
				tbl_len--;
				break;
			}
		}
	}

	return tbl_len;
}

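/* the config file also acts as a liveness indicator: every process that maps
 * it holds a shared flock() on it, and on Linux that lock stays alive after
 * close() for as long as the mmap() keeps the file description referenced.
 * a file that can still be exclusively locked is therefore a stray left
 * behind by a dead process (see open_shared_config() below). */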
static int
create_shared_config(void)
{
	char path[PATH_MAX];
	int fd;

	/* build ivshmem config file path */
	snprintf(path, sizeof(path), IVSHMEM_CONFIG_PATH,
			internal_config.hugefile_prefix);

	fd = open(path, O_CREAT | O_RDWR, 0600);

	if (fd < 0) {
		RTE_LOG(ERR, EAL, "Could not open %s: %s\n", path, strerror(errno));
		return -1;
	}

	/* try ex-locking first - if the file is locked, we have a problem */
	if (flock(fd, LOCK_EX | LOCK_NB) == -1) {
		RTE_LOG(ERR, EAL, "Locking %s failed: %s\n", path, strerror(errno));
		close(fd);
		return -1;
	}

	if (ftruncate(fd, sizeof(struct ivshmem_shared_config)) < 0) {
		RTE_LOG(ERR, EAL, "ftruncate failed: %s\n", strerror(errno));
		close(fd);
		return -1;
	}

	ivshmem_config = mmap(NULL, sizeof(struct ivshmem_shared_config),
			PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	if (ivshmem_config == MAP_FAILED) {
		close(fd);
		return -1;
	}

	memset(ivshmem_config, 0, sizeof(struct ivshmem_shared_config));

	/* change the exclusive lock we got earlier to a shared lock */
	if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
		RTE_LOG(ERR, EAL, "Locking %s failed: %s\n", path, strerror(errno));
		close(fd);
		return -1;
	}

	close(fd);

	return 0;
}

/* open shared config file and, if present, map the config.
 * having no config file is not an error condition, as we later check if
 * ivshmem_config is NULL (if it is, that means nothing was mapped). */
static int
open_shared_config(void)
{
	char path[PATH_MAX];
	int fd;

	/* build ivshmem config file path */
	snprintf(path, sizeof(path), IVSHMEM_CONFIG_PATH,
			internal_config.hugefile_prefix);

	fd = open(path, O_RDONLY);

	/* if the file doesn't exist, just return success */
	if (fd < 0 && errno == ENOENT)
		return 0;
	/* else we have an error condition */
	else if (fd < 0) {
		RTE_LOG(ERR, EAL, "Could not open %s: %s\n",
				path, strerror(errno));
		return -1;
	}

	/* try ex-locking first - if the lock *does* succeed, this means it's a
	 * stray config file, so it should be deleted.
	 */
	if (flock(fd, LOCK_EX | LOCK_NB) != -1) {

		/* if we can't remove the file, something is wrong */
		if (unlink(path) < 0) {
			RTE_LOG(ERR, EAL, "Could not remove %s: %s\n", path,
					strerror(errno));
			close(fd);
			return -1;
		}

		/* release the lock */
		flock(fd, LOCK_UN);
		close(fd);

		/* return success, as having a stray config file is equivalent to
		 * not having a config file at all.
		 */
		return 0;
	}

	ivshmem_config = mmap(NULL, sizeof(struct ivshmem_shared_config),
			PROT_READ, MAP_SHARED, fd, 0);

	if (ivshmem_config == MAP_FAILED) {
		close(fd);
		return -1;
	}

	/* place a shared lock on config file */
	if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
		RTE_LOG(ERR, EAL, "Locking %s failed: %s\n", path, strerror(errno));
		close(fd);
		return -1;
	}

	close(fd);

	return 0;
}

/*
 * This function does the following:
 *
 * 1) Builds a table of ivshmem_segments with proper offset alignment
 * 2) Cleans up that table so that we don't have any overlapping or adjacent
 *    memory segments
 * 3) Creates memsegs from this table and maps them into memory.
 */
static inline int
map_all_segments(void)
{
	struct ivshmem_segment ms_tbl[RTE_MAX_MEMSEG];
	struct ivshmem_pci_device * pci_dev;
	struct rte_mem_config * mcfg;
	struct ivshmem_segment * seg;
	int fd, fd_zero;
	unsigned i, j;
	struct rte_memzone mz;
	struct rte_memseg ms;
	void * base_addr;
	uint64_t align, len;
	phys_addr_t ioremap_addr;

	ioremap_addr = 0;

	memset(ms_tbl, 0, sizeof(ms_tbl));
	memset(&mz, 0, sizeof(struct rte_memzone));
	memset(&ms, 0, sizeof(struct rte_memseg));

	/* first, build a table of memsegs to map, to avoid failed mmaps due to
	 * overlaps
	 */
	for (i = 0; i < ivshmem_config->segment_idx && i <= RTE_MAX_MEMSEG; i++) {
		if (i == RTE_MAX_MEMSEG) {
			RTE_LOG(ERR, EAL, "Too many segments requested!\n");
			return -1;
		}

		seg = &ivshmem_config->segment[i];

		/* copy segment to table */
		memcpy(&ms_tbl[i], seg, sizeof(struct ivshmem_segment));

		/* find ioremap addr; reset it first so that a stale value from a
		 * previous iteration cannot mask a lookup failure */
		ioremap_addr = 0;
		for (j = 0; j < DIM(ivshmem_config->pci_devs); j++) {
			pci_dev = &ivshmem_config->pci_devs[j];
			if (!strncmp(pci_dev->path, seg->path, sizeof(pci_dev->path))) {
				ioremap_addr = pci_dev->ioremap_addr;
				break;
			}
		}
		if (ioremap_addr == 0) {
			RTE_LOG(ERR, EAL, "Cannot find ioremap addr!\n");
			return -1;
		}

		/* work out alignments */
		align = seg->entry.mz.addr_64 -
				RTE_ALIGN_FLOOR(seg->entry.mz.addr_64, 0x1000);
		len = RTE_ALIGN_CEIL(seg->entry.mz.len + align, 0x1000);

		/* save original alignments */
		ms_tbl[i].align = align;

		/* create a memory zone */
		mz.addr_64 = seg->entry.mz.addr_64 - align;
		mz.len = len;
		mz.hugepage_sz = seg->entry.mz.hugepage_sz;
		mz.phys_addr = seg->entry.mz.phys_addr - align;

		/* find true physical address */
		mz.ioremap_addr = ioremap_addr + seg->entry.offset - align;

		ms_tbl[i].entry.offset = seg->entry.offset - align;

		memcpy(&ms_tbl[i].entry.mz, &mz, sizeof(struct rte_memzone));
	}

	/* clean up the segments */
	memseg_idx = cleanup_segments(ms_tbl, ivshmem_config->segment_idx);

	if (memseg_idx < 0)
		return -1;

	mcfg = rte_eal_get_configuration()->mem_config;

	fd_zero = open("/dev/zero", O_RDWR);

	if (fd_zero < 0) {
		RTE_LOG(ERR, EAL, "Cannot open /dev/zero: %s\n", strerror(errno));
		return -1;
	}

	/* create memsegs and put them into DPDK memory */
	for (i = 0; i < (unsigned) memseg_idx; i++) {

		seg = &ms_tbl[i];

		ms.addr_64 = seg->entry.mz.addr_64;
		ms.hugepage_sz = seg->entry.mz.hugepage_sz;
		ms.len = seg->entry.mz.len;
		ms.nchannel = rte_memory_get_nchannel();
		ms.nrank = rte_memory_get_nrank();
		ms.phys_addr = seg->entry.mz.phys_addr;
		ms.ioremap_addr = seg->entry.mz.ioremap_addr;
		ms.socket_id = seg->entry.mz.socket_id;

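		/* first reserve the address range by mapping /dev/zero at the
		 * exact address the segment needs; if that succeeds, the range
		 * is free and is then replaced with the real device mapping */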
		base_addr = mmap(ms.addr, ms.len,
				PROT_READ | PROT_WRITE, MAP_PRIVATE, fd_zero, 0);

		if (base_addr == MAP_FAILED || base_addr != ms.addr) {
			RTE_LOG(ERR, EAL, "Cannot map /dev/zero!\n");
			close(fd_zero);
			return -1;
		}

		fd = open(seg->path, O_RDWR);

		if (fd < 0) {
			RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", seg->path,
					strerror(errno));
			close(fd_zero);
			return -1;
		}

		munmap(ms.addr, ms.len);

		base_addr = mmap(ms.addr, ms.len,
				PROT_READ | PROT_WRITE, MAP_SHARED, fd,
				seg->entry.offset);

		if (base_addr == MAP_FAILED || base_addr != ms.addr) {
			RTE_LOG(ERR, EAL, "Cannot map segment into memory: "
					"expected %p got %p (%s)\n", ms.addr, base_addr,
					strerror(errno));
			close(fd);
			close(fd_zero);
			return -1;
		}

		RTE_LOG(DEBUG, EAL, "Memory segment mapped: %p (len %" PRIx64 ") at "
				"offset 0x%" PRIx64 "\n",
				ms.addr, ms.len, seg->entry.offset);

		/* put the pointers back into their real positions using original
		 * alignment */
		ms.addr_64 += seg->align;
		ms.phys_addr += seg->align;
		ms.ioremap_addr += seg->align;
		ms.len -= seg->align;

		/* at this point, the rest of DPDK memory is not initialized, so we
		 * expect memsegs to be empty */
		memcpy(&mcfg->memseg[i], &ms,
				sizeof(struct rte_memseg));

		close(fd);

		RTE_LOG(DEBUG, EAL, "IVSHMEM segment found, size: 0x%" PRIx64 "\n",
				ms.len);
	}

	close(fd_zero);

	return 0;
}

/* this happens at a later stage, after general EAL memory initialization */
int
rte_eal_ivshmem_obj_init(void)
{
	struct rte_ring_list* ring_list = NULL;
	struct rte_mem_config * mcfg;
	struct ivshmem_segment * seg;
	struct rte_memzone * mz;
	struct rte_ring * r;
	struct rte_tailq_entry *te;
	unsigned i, ms, idx;
	uint64_t offset;

	/* secondary process would not need any object discovery - it'll all
	 * already be in shared config */
	if (rte_eal_process_type() != RTE_PROC_PRIMARY || ivshmem_config == NULL)
		return 0;

	/* check that we have an initialised ring tail queue */
	ring_list = RTE_TAILQ_LOOKUP(RTE_TAILQ_RING_NAME, rte_ring_list);
	if (ring_list == NULL) {
		RTE_LOG(ERR, EAL, "No rte_ring tailq found!\n");
		return -1;
	}

	mcfg = rte_eal_get_configuration()->mem_config;

	/* create memzones */
	for (i = 0; i < ivshmem_config->segment_idx && i <= RTE_MAX_MEMZONE; i++) {

		seg = &ivshmem_config->segment[i];

		/* add memzone */
		if (mcfg->memzone_cnt == RTE_MAX_MEMZONE) {
			RTE_LOG(ERR, EAL, "No more memory zones available!\n");
			return -1;
		}

		idx = mcfg->memzone_cnt;

		RTE_LOG(DEBUG, EAL, "Found memzone: '%s' at %p (len 0x%" PRIx64 ")\n",
				seg->entry.mz.name, seg->entry.mz.addr, seg->entry.mz.len);

		memcpy(&mcfg->memzone[idx], &seg->entry.mz,
				sizeof(struct rte_memzone));

		/* find ioremap address */
		for (ms = 0; ms <= RTE_MAX_MEMSEG; ms++) {
			if (ms == RTE_MAX_MEMSEG) {
				RTE_LOG(ERR, EAL, "Physical address of segment not found!\n");
				return -1;
			}
			if (CONTAINS(mcfg->memseg[ms], mcfg->memzone[idx])) {
				offset = mcfg->memzone[idx].addr_64 -
								mcfg->memseg[ms].addr_64;
				mcfg->memzone[idx].ioremap_addr = mcfg->memseg[ms].ioremap_addr +
						offset;
				break;
			}
		}

		mcfg->memzone_cnt++;
	}

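	/* register discovered rings with the local rte_ring tailq so that
	 * rte_ring_lookup() can find them in this process */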
	rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);

	/* find rings */
	for (i = 0; i < mcfg->memzone_cnt; i++) {
		mz = &mcfg->memzone[i];

		/* check if memzone has a ring prefix */
		if (strncmp(mz->name, RTE_RING_MZ_PREFIX,
				sizeof(RTE_RING_MZ_PREFIX) - 1) != 0)
			continue;

		r = (struct rte_ring*) (mz->addr_64);

		te = rte_zmalloc("RING_TAILQ_ENTRY", sizeof(*te), 0);
		if (te == NULL) {
			RTE_LOG(ERR, EAL, "Cannot allocate ring tailq entry!\n");
			rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
			return -1;
		}

		te->data = (void *) r;

		TAILQ_INSERT_TAIL(ring_list, te, next);

		RTE_LOG(DEBUG, EAL, "Found ring: '%s' at %p\n", r->name, mz->addr);
	}
	rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
	rte_memzone_dump(stdout);
	rte_ring_list_dump(stdout);
#endif

	return 0;
}

/* initialize ivshmem structures */
int rte_eal_ivshmem_init(void)
{
	struct rte_pci_device * dev;
	struct rte_pci_resource * res;
	int fd, ret;
	char path[PATH_MAX];

	/* initialize everything to 0 */
	memset(path, 0, sizeof(path));
	ivshmem_config = NULL;

	pagesz = getpagesize();

	RTE_LOG(DEBUG, EAL, "Searching for IVSHMEM devices...\n");

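	/* secondary processes simply map the config already discovered by the
	 * primary; the primary scans the PCI bus for ivshmem devices and reads
	 * their metadata */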
	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {

		if (open_shared_config() < 0) {
			RTE_LOG(ERR, EAL, "Could not open IVSHMEM config!\n");
			return -1;
		}
	}
	else {

		TAILQ_FOREACH(dev, &pci_device_list, next) {

			if (is_ivshmem_device(dev)) {

				/* IVSHMEM memory is always on BAR2 */
				res = &dev->mem_resource[2];

				/* if we don't have a BAR2 */
				if (res->len == 0)
					continue;

				/* construct pci device path */
				snprintf(path, sizeof(path), IVSHMEM_RESOURCE_PATH,
						dev->addr.domain, dev->addr.bus, dev->addr.devid,
						dev->addr.function);

				/* try to find memseg */
				fd = open(path, O_RDWR);
				if (fd < 0) {
					RTE_LOG(ERR, EAL, "Could not open %s\n", path);
					return -1;
				}

				/* check if it's a DPDK IVSHMEM device */
				ret = has_ivshmem_metadata(fd, res->len);

				/* is DPDK device */
				if (ret == 1) {

					/* config file creation is deferred until the first
					 * DPDK device is found, and it only has to be
					 * created once. */
					if (ivshmem_config == NULL &&
							create_shared_config() < 0) {
						RTE_LOG(ERR, EAL, "Could not create IVSHMEM config!\n");
						close(fd);
						return -1;
					}

					if (read_metadata(path, sizeof(path), fd, res->len) < 0) {
						RTE_LOG(ERR, EAL, "Could not read metadata from"
								" device %02x:%02x.%x!\n", dev->addr.bus,
								dev->addr.devid, dev->addr.function);
						close(fd);
						return -1;
					}

					if (ivshmem_config->pci_devs_idx == RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS) {
						RTE_LOG(WARNING, EAL,
								"IVSHMEM PCI device limit exceeded. Increase "
								"CONFIG_RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS in "
								"your config file.\n");
						break;
					}

					RTE_LOG(INFO, EAL, "Found IVSHMEM device %02x:%02x.%x\n",
							dev->addr.bus, dev->addr.devid, dev->addr.function);

					ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].ioremap_addr = res->phys_addr;
					snprintf(ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].path,
							sizeof(ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].path),
							"%s", path);

					ivshmem_config->pci_devs_idx++;
				}
				/* failed to read */
				else if (ret < 0) {
					RTE_LOG(ERR, EAL, "Could not read IVSHMEM device: %s\n",
							strerror(errno));
					close(fd);
					return -1;
				}
				/* not a DPDK device */
				else
					RTE_LOG(DEBUG, EAL, "Skipping non-DPDK IVSHMEM device\n");

				/* close the BAR fd */
				close(fd);
			}
		}
	}

	/* ivshmem_config is not NULL only if config was created and/or mapped */
	if (ivshmem_config) {
		if (map_all_segments() < 0) {
			RTE_LOG(ERR, EAL, "Mapping IVSHMEM segments failed!\n");
			return -1;
		}
	}
	else {
		RTE_LOG(DEBUG, EAL, "No IVSHMEM configuration found!\n");
	}

	return 0;
}

#endif /* RTE_LIBRTE_IVSHMEM */