From 1ba0fa4bfa3dcc9bd754b8b0b6a7c5045f9ba0dd Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Sun, 4 Mar 2018 17:19:08 +0100 Subject: [PATCH] vlib: vfio code rework Change-Id: I99cf3e7cc991aa7d32385a155c707a6516516117 Signed-off-by: Damjan Marion --- src/plugins/dpdk/buffer.c | 4 +- src/vlib.am | 2 + src/vlib/buffer.c | 18 +-- src/vlib/buffer.h | 1 + src/vlib/linux/pci.c | 134 +--------------------- src/vlib/linux/physmem.c | 121 +++++++------------- src/vlib/linux/vfio.c | 282 ++++++++++++++++++++++++++++++++++++++++++++++ src/vlib/linux/vfio.h | 60 ++++++++++ src/vlib/physmem.h | 8 +- src/vlib/physmem_funcs.h | 3 +- 10 files changed, 411 insertions(+), 222 deletions(-) create mode 100644 src/vlib/linux/vfio.c create mode 100644 src/vlib/linux/vfio.h diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index 7b17578f68d..f6379a1da4a 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -474,7 +474,9 @@ dpdk_pool_create (vlib_main_t * vm, u8 * pool_name, u32 elt_size, size = rte_mempool_xmem_size (num_elts, obj_size, 21, 0); error = - vlib_physmem_region_alloc (vm, (i8 *) pool_name, size, numa, 0, pri); + vlib_physmem_region_alloc (vm, (i8 *) pool_name, size, numa, + VLIB_PHYSMEM_F_HUGETLB | VLIB_PHYSMEM_F_SHARED, + pri); if (error) return error; diff --git a/src/vlib.am b/src/vlib.am index 067e4afcc3c..405bed95fba 100644 --- a/src/vlib.am +++ b/src/vlib.am @@ -34,6 +34,7 @@ libvlib_la_SOURCES = \ vlib/init.c \ vlib/linux/pci.c \ vlib/linux/physmem.c \ + vlib/linux/vfio.c \ vlib/main.c \ vlib/mc.c \ vlib/node.c \ @@ -59,6 +60,7 @@ nobase_include_HEADERS += \ vlib/global_funcs.h \ vlib/i2c.h \ vlib/init.h \ + vlib/linux/vfio.h \ vlib/main.h \ vlib/mc.h \ vlib/node_funcs.h \ diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 6b20a542ca1..1b975163e62 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -564,15 +564,14 @@ vlib_buffer_fill_free_list_internal (vlib_main_t * vm, n_alloc = 0; while (n_remaining > 0) { - 
n_this_chunk = clib_min (n_remaining, 16); + vlib_buffer_pool_t *bp = &vm->buffer_main->buffer_pools[0]; + n_this_chunk = clib_min (n_remaining, bp->alloc_chunk_size); n_bytes = n_this_chunk * (sizeof (b[0]) + fl->n_data_bytes); /* drb: removed power-of-2 ASSERT */ buffers = - vm->os_physmem_alloc_aligned (vm, - vm->buffer_main-> - buffer_pools[0].physmem_region, n_bytes, + vm->os_physmem_alloc_aligned (vm, bp->physmem_region, n_bytes, sizeof (vlib_buffer_t)); if (!buffers) return n_alloc; @@ -960,6 +959,7 @@ vlib_buffer_add_physmem_region (vlib_main_t * vm, p->start = start; p->size = size; p->physmem_region = pri; + p->alloc_chunk_size = (pr->log2_page_size > 18) ? 16 : 1; return p - bm->buffer_pools; } @@ -1056,6 +1056,8 @@ vlib_buffer_main_init (struct vlib_main_t * vm) /* allocate default region */ error = vlib_physmem_region_alloc (vm, "buffers", vlib_buffer_physmem_sz, 0, + VLIB_PHYSMEM_F_SHARED | + VLIB_PHYSMEM_F_HUGETLB | VLIB_PHYSMEM_F_INIT_MHEAP, &pri); if (error == 0) @@ -1063,13 +1065,13 @@ vlib_buffer_main_init (struct vlib_main_t * vm) clib_error_free (error); - /* we my be running unpriviledged, so try to allocate fake physmem */ - error = vlib_physmem_region_alloc (vm, "buffers (fake)", + error = vlib_physmem_region_alloc (vm, "buffers", vlib_buffer_physmem_sz, 0, - VLIB_PHYSMEM_F_FAKE | + VLIB_PHYSMEM_F_SHARED | VLIB_PHYSMEM_F_INIT_MHEAP, &pri); done: - vlib_buffer_add_physmem_region (vm, pri); + if (error == 0) + vlib_buffer_add_physmem_region (vm, pri); return error; } diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index a1c2db377ef..bf09eb9863c 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -432,6 +432,7 @@ typedef struct uword start; uword size; vlib_physmem_region_index_t physmem_region; + int alloc_chunk_size; } vlib_buffer_pool_t; typedef struct diff --git a/src/vlib/linux/pci.c b/src/vlib/linux/pci.c index 8aa0e29248c..bc3e15ea00d 100644 --- a/src/vlib/linux/pci.c +++ b/src/vlib/linux/pci.c @@ -42,6 +42,7 @@ #include 
#include #include +#include #include #include @@ -115,29 +116,12 @@ typedef struct } linux_pci_device_t; -typedef struct -{ - int group; - int fd; - int refcnt; -} linux_pci_vfio_iommu_group_t; - /* Pool of PCI devices. */ typedef struct { vlib_main_t *vlib_main; linux_pci_device_t *linux_pci_devices; - /* VFIO */ - int vfio_container_fd; - int vfio_iommu_mode; - - /* pool of IOMMU groups */ - linux_pci_vfio_iommu_group_t *iommu_groups; - - /* iommu group pool index by group id hash */ - uword *iommu_pool_index_by_group; - } linux_pci_main_t; extern linux_pci_main_t linux_pci_main; @@ -180,7 +164,7 @@ linux_pci_main_t linux_pci_main; vlib_pci_device_info_t * vlib_pci_get_device_info (vlib_pci_addr_t * addr, clib_error_t ** error) { - linux_pci_main_t *lpm = &linux_pci_main; + linux_vfio_main_t *lvm = &vfio_main; clib_error_t *err; vlib_pci_device_info_t *di; u8 *f = 0; @@ -271,7 +255,7 @@ vlib_pci_get_device_info (vlib_pci_addr_t * addr, clib_error_t ** error) di->driver_name = clib_sysfs_link_to_name ((char *) f); di->iommu_group = -1; - if (lpm->vfio_container_fd != -1) + if (lvm->container_fd != -1) { u8 *tmpstr; vec_reset_length (f); @@ -862,91 +846,11 @@ vlib_pci_disable_msix_irq (vlib_pci_dev_handle_t h, u16 start, u16 count) VFIO_IRQ_SET_ACTION_TRIGGER, fds); } -static linux_pci_vfio_iommu_group_t * -get_vfio_iommu_group (int group) -{ - linux_pci_main_t *lpm = &linux_pci_main; - uword *p; - - p = hash_get (lpm->iommu_pool_index_by_group, group); - - return p ? 
pool_elt_at_index (lpm->iommu_groups, p[0]) : 0; -} - -static clib_error_t * -open_vfio_iommu_group (int group) -{ - linux_pci_main_t *lpm = &linux_pci_main; - linux_pci_vfio_iommu_group_t *g; - clib_error_t *err = 0; - struct vfio_group_status group_status; - u8 *s = 0; - int fd; - - g = get_vfio_iommu_group (group); - if (g) - { - g->refcnt++; - return 0; - } - s = format (s, "/dev/vfio/%u%c", group, 0); - fd = open ((char *) s, O_RDWR); - if (fd < 0) - return clib_error_return_unix (0, "open '%s'", s); - - group_status.argsz = sizeof (group_status); - if (ioctl (fd, VFIO_GROUP_GET_STATUS, &group_status) < 0) - { - err = clib_error_return_unix (0, "ioctl(VFIO_GROUP_GET_STATUS) '%s'", - s); - goto error; - } - - if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) - { - err = clib_error_return (0, "iommu group %d is not viable (not all " - "devices in this group bound to vfio-pci)", - group); - goto error; - } - - if (ioctl (fd, VFIO_GROUP_SET_CONTAINER, &lpm->vfio_container_fd) < 0) - { - err = clib_error_return_unix (0, "ioctl(VFIO_GROUP_SET_CONTAINER) '%s'", - s); - goto error; - } - - if (lpm->vfio_iommu_mode == 0) - { - if (ioctl (lpm->vfio_container_fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU) < - 0) - { - err = clib_error_return_unix (0, "ioctl(VFIO_SET_IOMMU) " - "'/dev/vfio/vfio'"); - goto error; - } - lpm->vfio_iommu_mode = VFIO_TYPE1_IOMMU; - } - - - pool_get (lpm->iommu_groups, g); - g->fd = fd; - g->refcnt = 1; - hash_set (lpm->iommu_pool_index_by_group, group, g - lpm->iommu_groups); - vec_free (s); - return 0; -error: - close (fd); - return err; -} - static clib_error_t * add_device_vfio (linux_pci_device_t * p, vlib_pci_device_info_t * di, pci_device_registration_t * r) { linux_pci_main_t *lpm = &linux_pci_main; - linux_pci_vfio_iommu_group_t *g; struct vfio_device_info device_info = { 0 }; clib_error_t *err = 0; u8 *s = 0; @@ -960,20 +864,9 @@ add_device_vfio (linux_pci_device_t * p, vlib_pci_device_info_t * di, "vfio-pci", format_vlib_pci_addr, 
&di->addr, di->iommu_group); - if ((err = open_vfio_iommu_group (di->iommu_group))) + if ((err = linux_vfio_group_get_device_fd (&p->addr, &p->fd))) return err; - g = get_vfio_iommu_group (di->iommu_group); - - s = format (s, "%U%c", format_vlib_pci_addr, &di->addr, 0); - if ((p->fd = ioctl (g->fd, VFIO_GROUP_GET_DEVICE_FD, (char *) s)) < 0) - { - err = clib_error_return_unix (0, "ioctl(VFIO_GROUP_GET_DEVICE_FD) '%U'", - format_vlib_pci_addr, &di->addr); - goto error; - } - vec_reset_length (s); - device_info.argsz = sizeof (device_info); if (ioctl (p->fd, VFIO_DEVICE_GET_INFO, &device_info) < 0) { @@ -1235,10 +1128,8 @@ clib_error_t * linux_pci_init (vlib_main_t * vm) { vlib_pci_main_t *pm = &pci_main; - linux_pci_main_t *lpm = &linux_pci_main; vlib_pci_addr_t *addr = 0, *addrs; clib_error_t *error; - int fd; pm->vlib_main = vm; @@ -1247,23 +1138,6 @@ linux_pci_init (vlib_main_t * vm) ASSERT (sizeof (vlib_pci_addr_t) == sizeof (u32)); - fd = open ("/dev/vfio/vfio", O_RDWR); - - if ((fd != -1) && (ioctl (fd, VFIO_GET_API_VERSION) != VFIO_API_VERSION)) - { - close (fd); - fd = -1; - } - - if ((fd != -1) && (ioctl (fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) == 0)) - { - close (fd); - fd = -1; - } - - lpm->vfio_container_fd = fd; - lpm->iommu_pool_index_by_group = hash_create (0, sizeof (uword)); - addrs = vlib_pci_get_all_dev_addrs (); /* *INDENT-OFF* */ vec_foreach (addr, addrs) diff --git a/src/vlib/linux/physmem.c b/src/vlib/linux/physmem.c index 72b00e24d09..f60a6f75adf 100644 --- a/src/vlib/linux/physmem.c +++ b/src/vlib/linux/physmem.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include @@ -51,8 +50,8 @@ #include #include #include - -static int vfio_container_fd = -1; +#include +#include static void * unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, @@ -79,9 +78,6 @@ unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, if (lo_offset == ~0) break; - if (pr->flags & VLIB_PHYSMEM_F_FAKE) 
- break; - /* Make sure allocation does not span DMA physical chunk boundary. */ hi_offset = lo_offset + n_bytes - 1; @@ -113,56 +109,6 @@ unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x) mheap_put (pr->heap, x - pr->heap); } -static clib_error_t * -scan_vfio_fd (void *arg, u8 * path_name, u8 * file_name) -{ - const char fn[] = "/dev/vfio/vfio"; - char buff[sizeof (fn)] = { 0 }; - - if (readlink ((char *) path_name, buff, sizeof (fn)) + 1 != sizeof (fn)) - return 0; - - if (strncmp (fn, buff, sizeof (fn))) - return 0; - - vfio_container_fd = atoi ((char *) file_name); - return 0; -} - -static clib_error_t * -unix_physmem_region_iommu_register (vlib_physmem_region_t * pr) -{ - struct vfio_iommu_type1_dma_map dma_map = { 0 }; - int i, fd; - - if (vfio_container_fd == -1) - foreach_directory_file ("/proc/self/fd", scan_vfio_fd, 0, 0); - - fd = vfio_container_fd; - - if (fd < 0) - return 0; - - if (ioctl (fd, VFIO_GET_API_VERSION) != VFIO_API_VERSION) - return 0; - - if (ioctl (fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) == 0) - return 0; - - dma_map.argsz = sizeof (struct vfio_iommu_type1_dma_map); - dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; - - vec_foreach_index (i, pr->page_table) - { - dma_map.vaddr = pointer_to_uword (pr->mem) + (i << pr->log2_page_size); - dma_map.size = 1 << pr->log2_page_size; - dma_map.iova = pr->page_table[i]; - if (ioctl (fd, VFIO_IOMMU_MAP_DMA, &dma_map) != 0) - return clib_error_return_unix (0, "ioctl (VFIO_IOMMU_MAP_DMA)"); - } - return 0; -} - static clib_error_t * unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, u8 numa_node, u32 flags, @@ -172,10 +118,7 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, vlib_physmem_region_t *pr; clib_error_t *error = 0; clib_mem_vm_alloc_t alloc = { 0 }; - - - if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0) - return clib_error_return (0, "not allowed"); + int i; pool_get (vpm->regions, pr); @@ -188,9 
+131,11 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, alloc.name = name; alloc.size = size; alloc.numa_node = numa_node; - alloc.flags = CLIB_MEM_VM_F_SHARED; - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) + alloc.flags = (flags & VLIB_PHYSMEM_F_SHARED) ? + CLIB_MEM_VM_F_SHARED : CLIB_MEM_VM_F_LOCKED; + + if ((flags & VLIB_PHYSMEM_F_HUGETLB)) { alloc.flags |= CLIB_MEM_VM_F_HUGETLB; alloc.flags |= CLIB_MEM_VM_F_HUGETLB_PREALLOC; @@ -216,29 +161,30 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, pr->numa_node = numa_node; pr->name = format (0, "%s", name); - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) + for (i = 0; i < pr->n_pages; i++) { - int i; - for (i = 0; i < pr->n_pages; i++) + void *ptr = pr->mem + (i << pr->log2_page_size); + int node; + if ((move_pages (0, 1, &ptr, 0, &node, 0) == 0) && (numa_node != node)) { - void *ptr = pr->mem + (i << pr->log2_page_size); - int node; - if ((move_pages (0, 1, &ptr, 0, &node, 0) == 0) && - (numa_node != node)) - { - clib_warning ("physmem page for region \'%s\' allocated on the" - " wrong numa node (requested %u actual %u)", - pr->name, pr->numa_node, node, i); - break; - } + clib_warning ("physmem page for region \'%s\' allocated on the" + " wrong numa node (requested %u actual %u)", + pr->name, pr->numa_node, node, i); + break; } - pr->page_table = clib_mem_vm_get_paddr (pr->mem, pr->log2_page_size, - pr->n_pages); - error = unix_physmem_region_iommu_register (pr); - if (error) - clib_error_report (error); } + if ((vpm->flags & VLIB_PHYSMEM_MAIN_F_HAVE_IOMMU) || + (vpm->flags & VLIB_PHYSMEM_MAIN_F_HAVE_PAGEMAP) == 0) + for (i = 0; i < pr->n_pages; i++) + vec_add1 (pr->page_table, pointer_to_uword (pr->mem) + + i * (1 << pr->log2_page_size)); + else + pr->page_table = clib_mem_vm_get_paddr (pr->mem, pr->log2_page_size, + pr->n_pages); + + linux_vfio_dma_map_regions (vm); + if (flags & VLIB_PHYSMEM_F_INIT_MHEAP) { pr->heap = mheap_alloc_with_flags (pr->mem, pr->size, @@ -275,12 +221,27 
@@ unix_physmem_region_free (vlib_main_t * vm, vlib_physmem_region_index_t idx) clib_error_t * unix_physmem_init (vlib_main_t * vm) { + vlib_physmem_main_t *vpm = &vm->physmem_main; + linux_vfio_main_t *lvm = &vfio_main; clib_error_t *error = 0; + u64 *pt = 0; /* Avoid multiple calls. */ if (vm->os_physmem_alloc_aligned) return error; + /* check if pagemap is accessible */ + pt = clib_mem_vm_get_paddr (&pt, min_log2 (sysconf (_SC_PAGESIZE)), 1); + if (pt[0]) + vpm->flags |= VLIB_PHYSMEM_MAIN_F_HAVE_PAGEMAP; + vec_free (pt); + + if ((error = linux_vfio_init (vm))) + return error; + + if (lvm->flags & LINUX_VFIO_F_HAVE_IOMMU) + vpm->flags |= VLIB_PHYSMEM_MAIN_F_HAVE_IOMMU; + vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; vm->os_physmem_free = unix_physmem_free; vm->os_physmem_region_alloc = unix_physmem_region_alloc; diff --git a/src/vlib/linux/vfio.c b/src/vlib/linux/vfio.c new file mode 100644 index 00000000000..dffe49ce7b2 --- /dev/null +++ b/src/vlib/linux/vfio.c @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pci.c: Linux user space PCI bus management. 
+ * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +linux_vfio_main_t vfio_main; + +static int +map_regions (vlib_main_t * vm, int fd) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr; + struct vfio_iommu_type1_dma_map dm = { 0 }; + int i; + + dm.argsz = sizeof (struct vfio_iommu_type1_dma_map); + dm.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + + /* *INDENT-OFF* */ + pool_foreach (pr, vpm->regions, + { + vec_foreach_index (i, pr->page_table) + { + int rv; + dm.vaddr = pointer_to_uword (pr->mem) + (i << pr->log2_page_size); + dm.size = 1 << pr->log2_page_size; + dm.iova = pr->page_table[i]; + if ((rv = ioctl (fd, VFIO_IOMMU_MAP_DMA, &dm))) + return rv; + } + }); + /* *INDENT-ON* */ + return 0; +} + +static clib_error_t * +scan_vfio_fd (void *arg, u8 * path_name, u8 * file_name) +{ + linux_vfio_main_t *lvm = &vfio_main; + const char fn[] = "/dev/vfio/vfio"; + char buff[sizeof (fn)] = { 0 }; + int fd; + u8 *path = format (0, "%v%c", path_name, 0); + + if (readlink ((char *) path, buff, sizeof (fn)) + 1 != sizeof (fn)) + goto done; + + if (strncmp (fn, buff, sizeof (fn))) + goto done; + + fd = atoi ((char *) file_name); + if (fd != lvm->container_fd) + lvm->ext_container_fd = atoi ((char *) file_name); + +done: + vec_free (path); + return 0; +} + +void +linux_vfio_dma_map_regions (vlib_main_t * vm) +{ + linux_vfio_main_t *lvm = &vfio_main; + + if (lvm->container_fd != -1) + map_regions (vm, lvm->container_fd); + + if (lvm->ext_container_fd == -1) + foreach_directory_file ("/proc/self/fd", scan_vfio_fd, 0, 0); + + if (lvm->ext_container_fd != -1) + map_regions (vm, lvm->ext_container_fd); +} + +static linux_pci_vfio_iommu_group_t * +get_vfio_iommu_group (int group) +{ + linux_vfio_main_t *lvm = &vfio_main; + uword *p; + + p = hash_get (lvm->iommu_pool_index_by_group, group); + + return p ? 
pool_elt_at_index (lvm->iommu_groups, p[0]) : 0; +} + +static clib_error_t * +open_vfio_iommu_group (int group) +{ + linux_vfio_main_t *lvm = &vfio_main; + linux_pci_vfio_iommu_group_t *g; + clib_error_t *err = 0; + struct vfio_group_status group_status; + u8 *s = 0; + int fd; + + g = get_vfio_iommu_group (group); + if (g) + { + g->refcnt++; + return 0; + } + s = format (s, "/dev/vfio/%u%c", group, 0); + fd = open ((char *) s, O_RDWR); + if (fd < 0) + return clib_error_return_unix (0, "open '%s'", s); + + group_status.argsz = sizeof (group_status); + if (ioctl (fd, VFIO_GROUP_GET_STATUS, &group_status) < 0) + { + err = clib_error_return_unix (0, "ioctl(VFIO_GROUP_GET_STATUS) '%s'", + s); + goto error; + } + + if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) + { + err = clib_error_return (0, "iommu group %d is not viable (not all " + "devices in this group bound to vfio-pci)", + group); + goto error; + } + + if (ioctl (fd, VFIO_GROUP_SET_CONTAINER, &lvm->container_fd) < 0) + { + err = clib_error_return_unix (0, "ioctl(VFIO_GROUP_SET_CONTAINER) '%s'", + s); + goto error; + } + + if (lvm->iommu_mode == 0) + { + if (ioctl (lvm->container_fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU) < 0) + { + err = clib_error_return_unix (0, "ioctl(VFIO_SET_IOMMU) " + "'/dev/vfio/vfio'"); + goto error; + } + lvm->iommu_mode = VFIO_TYPE1_IOMMU; + } + + + pool_get (lvm->iommu_groups, g); + g->fd = fd; + g->refcnt = 1; + hash_set (lvm->iommu_pool_index_by_group, group, g - lvm->iommu_groups); + vec_free (s); + return 0; +error: + close (fd); + return err; +} + +clib_error_t * +linux_vfio_group_get_device_fd (vlib_pci_addr_t * addr, int *fdp) +{ + clib_error_t *err = 0; + linux_pci_vfio_iommu_group_t *g; + u8 *s = 0; + int iommu_group; + u8 *tmpstr; + int fd; + + s = format (s, "/sys/bus/pci/devices/%U/iommu_group%c", format_vlib_pci_addr, + addr, 0); + tmpstr = clib_sysfs_link_to_name ((char *) s); + if (tmpstr) + { + iommu_group = atoi ((char *) tmpstr); + vec_free (tmpstr); + } + else + { + err
= clib_error_return (0, "Cannot find IOMMU group for PCI device " "'%U'", format_vlib_pci_addr, addr); + goto error; + } + vec_reset_length (s); + + if ((err = open_vfio_iommu_group (iommu_group))) + goto error; + + g = get_vfio_iommu_group (iommu_group); + + s = format (s, "%U%c", format_vlib_pci_addr, addr, 0); + if ((fd = ioctl (g->fd, VFIO_GROUP_GET_DEVICE_FD, (char *) s)) < 0) + { + err = clib_error_return_unix (0, "ioctl(VFIO_GROUP_GET_DEVICE_FD) '%U'", + format_vlib_pci_addr, addr); + goto error; + } + vec_reset_length (s); + + *fdp = fd; + +error: + vec_free (s); + return err; +} + +clib_error_t * +linux_vfio_init (vlib_main_t * vm) +{ + linux_vfio_main_t *lvm = &vfio_main; + int fd; + + lvm->ext_container_fd = -1; + + fd = open ("/dev/vfio/vfio", O_RDWR); + + /* check if iommu is available */ + if (fd != -1) + { + if (ioctl (fd, VFIO_GET_API_VERSION) != VFIO_API_VERSION) + { + close (fd); + fd = -1; + } + else if (ioctl (fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) == 1) + lvm->flags |= LINUX_VFIO_F_HAVE_IOMMU; + } + + lvm->iommu_pool_index_by_group = hash_create (0, sizeof (uword)); + lvm->container_fd = fd; + return 0; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/linux/vfio.h b/src/vlib/linux/vfio.h new file mode 100644 index 00000000000..8e0758cf2e4 --- /dev/null +++ b/src/vlib/linux/vfio.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_vlib_linux_vfio_h +#define included_vlib_linux_vfio_h + +typedef struct +{ + int group; + int fd; + int refcnt; +} linux_pci_vfio_iommu_group_t; + +typedef struct +{ + u32 flags; +#define LINUX_VFIO_F_HAVE_IOMMU (1 << 0) + int container_fd; + int ext_container_fd; /* container fd used by external library, i.e DPDK */ + + /* VFIO */ + int iommu_mode; + + /* pool of IOMMU groups */ + linux_pci_vfio_iommu_group_t *iommu_groups; + + /* iommu group pool index by group id hash */ + uword *iommu_pool_index_by_group; + +} linux_vfio_main_t; + +extern linux_vfio_main_t vfio_main; + +clib_error_t *linux_vfio_init (vlib_main_t * vm); +void linux_vfio_dma_map_regions (vlib_main_t * vm); +clib_error_t *linux_vfio_group_get_device_fd (vlib_pci_addr_t * addr, + int *fd); + + +#endif /* included_vlib_linux_vfio_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/physmem.h b/src/vlib/physmem.h index 1e053d65c0f..e99db7de72c 100644 --- a/src/vlib/physmem.h +++ b/src/vlib/physmem.h @@ -54,8 +54,9 @@ typedef struct void *heap; u32 flags; -#define VLIB_PHYSMEM_F_INIT_MHEAP (1<<0) -#define VLIB_PHYSMEM_F_FAKE (1<<2) +#define VLIB_PHYSMEM_F_INIT_MHEAP (1 << 0) +#define VLIB_PHYSMEM_F_HUGETLB (1 << 1) +#define VLIB_PHYSMEM_F_SHARED (1 << 2) u8 numa_node; u64 *page_table; @@ -66,6 +67,9 @@ typedef struct typedef struct { + u32 flags; +#define VLIB_PHYSMEM_MAIN_F_HAVE_PAGEMAP (1 << 0) +#define VLIB_PHYSMEM_MAIN_F_HAVE_IOMMU (1 << 1) vlib_physmem_region_t *regions; } vlib_physmem_main_t; diff --git a/src/vlib/physmem_funcs.h b/src/vlib/physmem_funcs.h index dbb8d9de5b7..0013c2f70cd 100644 --- a/src/vlib/physmem_funcs.h +++ b/src/vlib/physmem_funcs.h @@ -121,7 +121,8 @@ always_inline void vlib_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *mem) 
{ - return vm->os_physmem_free (vm, idx, mem); + if (mem) + vm->os_physmem_free (vm, idx, mem); } always_inline u64 -- 2.16.6