Message ID | 1342609768-19455-1-git-send-email-aik@ozlabs.ru |
---|---|
State | New |
Headers | show |
On Wed, 2012-07-18 at 21:09 +1000, Alexey Kardashevskiy wrote: > It literally does the following: > > 1. POWERPC IOMMU support (the kernel counterpart is required) > > 2. The patch assumes that IOAPIC calls are going to be replaced > with something generic. > > 3. Added sPAPRVFIOData (hw/spapr_iommu_vfio.h) which describes > the interface between VFIO and sPAPR IOMMU. > > 4. Change sPAPR PHB to scan the PCI bus which is used for > the IOMMU-VFIO group. Now it is enough to add the following to > the QEMU command line to get VFIO up with all the devices from > IOMMU group with id=3: > -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\ > mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000 > > WIth the pathes posted today a bit earlier, this patch fully supports > VFIO what includes MSIX as well. > > ps. yes, I know that linux_vfio.h has moved, will fix it later :) > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > --- > hw/linux-vfio.h | 26 +++++++++++ > hw/ppc/Makefile.objs | 3 ++ > hw/spapr.h | 4 ++ > hw/spapr_iommu.c | 62 ++++++++++++++++++++++++- > hw/spapr_iommu_vfio.h | 34 ++++++++++++++ > hw/spapr_pci.c | 124 +++++++++++++++++++++++++++++++++++++++++++++++-- > hw/spapr_pci.h | 6 +++ > hw/vfio_pci.c | 64 +++++++++++++++++++++++++ > hw/vfio_pci.h | 2 + > trace-events | 1 + > 10 files changed, 320 insertions(+), 6 deletions(-) > create mode 100644 hw/spapr_iommu_vfio.h > > diff --git a/hw/linux-vfio.h b/hw/linux-vfio.h > index 300d49b..27a0501 100644 > --- a/hw/linux-vfio.h > +++ b/hw/linux-vfio.h > @@ -442,4 +442,30 @@ struct vfio_iommu_type1_dma_unmap { > > #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) > > +/* > + * Interface to SPAPR TCE (POWERPC Book3S) > + */ > +#define SPAPR_TCE_IOMMU 2 > + > +struct tce_iommu_info { > + __u32 argsz; > + __u32 flags; > + __u32 dma32_window_start; > + __u32 dma32_window_size; > + __u64 dma64_window_start; > + __u64 dma64_window_size; > +}; > + > +#define SPAPR_TCE_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) > + > +struct tce_iommu_dma_map { > + __u32 argsz; > + __u32 flags; > + __u64 va; > + __u64 dmaaddr; > +}; > + > +#define SPAPR_TCE_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) > +#define SPAPR_TCE_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) > + > #endif /* VFIO_H */ > diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs > index f573a95..c46a049 100644 > --- a/hw/ppc/Makefile.objs > +++ b/hw/ppc/Makefile.objs > @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o > # Xilinx PPC peripherals > obj-y += xilinx_ethlite.o > > +# VFIO PCI device assignment > +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o > + > obj-y := $(addprefix ../,$(obj-y)) > diff --git a/hw/spapr.h b/hw/spapr.h > index b37f337..0c15c88 100644 > --- a/hw/spapr.h > +++ b/hw/spapr.h > @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname, > int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname, > DMAContext *dma); > > +struct sPAPRVFIOData; > +void spapr_vfio_init_dma(int group_id, uint32_t liobn, > + struct sPAPRVFIOData *data); > + > #endif /* !defined (__HW_SPAPR_H__) */ > diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c > index 50c288d..0a82842 100644 > --- a/hw/spapr_iommu.c > +++ b/hw/spapr_iommu.c > @@ -23,6 +23,8 @@ > #include "dma.h" > > #include "hw/spapr.h" > +#include "hw/spapr_iommu_vfio.h" > +#include "hw/vfio_pci.h" > > #include <libfdt.h> > > @@ -183,6 +185,60 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce) > return 0; > } > > +typedef struct sPAPRVFIOTable { > + struct sPAPRVFIOData *data; > + uint32_t liobn; > + QLIST_ENTRY(sPAPRVFIOTable) list; > +} sPAPRVFIOTable; > + > +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables; > + > +void spapr_vfio_init_dma(int group_id, uint32_t liobn, > + struct sPAPRVFIOData *data) > +{ > + sPAPRVFIOTable *t; > + > + t = g_malloc0(sizeof(*t)); > + t->data = data; > + t->liobn = liobn; > + > + QLIST_INSERT_HEAD(&vfio_tce_tables, t, list); > +} > + > +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce) > +{ > + sPAPRVFIOTable *t; > + struct tce_iommu_dma_map map = { > + .argsz = sizeof(map), > + .va = 0, > + .dmaaddr = ioba, > + }; > + > + QLIST_FOREACH(t, &vfio_tce_tables, list) { > + if (t->liobn != liobn) { > + continue; > + } > + if (!t->data) { > + return H_NO_MEM; > + } Why would this ever happen? > + if (tce) { > + map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK); > + > + if (t->data->map(t->data->groupid, &map)) { Just pass t->data, this is why the VFIOContainer has a union. > + perror("TCE_MAP_DMA"); > + return H_PARAMETER; > + } > + } else { > + if (t->data->unmap(t->data->groupid, &map)) { > + perror("TCE_UNMAP_DMA"); > + return H_PARAMETER; > + } > + } > + return H_SUCCESS; > + } > + return H_CONTINUE; /* positive non-zero value */ > +} > + > static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr, > target_ulong opcode, target_ulong *args) > { > @@ -200,7 +256,11 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr, > ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1); > > ret = put_tce_emu(liobn, ioba, tce); > - if (0 >= ret) { > + if (ret <= 0) { > + return ret ? H_PARAMETER : H_SUCCESS; > + } > + ret = put_tce_vfio(liobn, ioba, tce); > + if (ret <= 0) { > return ret ? H_PARAMETER : H_SUCCESS; > } > #ifdef DEBUG_TCE > diff --git a/hw/spapr_iommu_vfio.h b/hw/spapr_iommu_vfio.h > new file mode 100644 > index 0000000..b3d6115 > --- /dev/null > +++ b/hw/spapr_iommu_vfio.h > @@ -0,0 +1,34 @@ > +/* > + * Definitions for VFIO IOMMU implementation for SPAPR TCE. > + * > + * Copyright (c) 2012 Alexey Kardashevskiy <aik@olabs.ru> > + * > + * This library is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2 of the License, or (at your option) any later version. > + * > + * This library is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with this library; if not, see <http://www.gnu.org/licenses/>. > + */ > + > +#if !defined(__HW_SPAPR_IOMMU_VFIO_H__) > +#define __HW_SPAPR_IOMMU_VFIO_H__ > + > +#include "hw/linux-vfio.h" > + > +struct sPAPRVFIOData { > + int groupid; > + struct tce_iommu_info info; Seems a little lazy to embed this whole thing here, what does anyone need argsz and flags for later? > + int (*map)(int groupid, struct tce_iommu_dma_map *param); > + int (*unmap)(int groupid, struct tce_iommu_dma_map *param); s/int groupid/struct sPAPRVFIOData */ in map/unmap > +}; > + > +void spapr_register_vfio_container(struct sPAPRVFIOData *data); > + > +#endif > diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c > index 88c8f2c..75743e7 100644 > --- a/hw/spapr_pci.c > +++ b/hw/spapr_pci.c > @@ -22,6 +22,9 @@ > * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN > * THE SOFTWARE. > */ > +#include <sys/types.h> > +#include <dirent.h> > + > #include "hw.h" > #include "pci.h" > #include "msi.h" > @@ -32,7 +35,6 @@ > #include "exec-memory.h" > #include <libfdt.h> > #include "trace.h" > - > #include "hw/pci_internals.h" > > /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */ > @@ -440,6 +442,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level) > level); > } > > +static int pci_spapr_get_irq(void *opaque, int irq_num) > +{ > + sPAPRPHBState *phb = opaque; > + return phb->lsi_table[irq_num].dt_irq; > +} > + > static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr, > unsigned size) > { > @@ -515,6 +523,93 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque, > return phb->dma; > } > > +void spapr_register_vfio_container(struct sPAPRVFIOData *data) > +{ > + sPAPRPHBState *phb; > + > + QLIST_FOREACH(phb, &spapr->phbs, list) { > + if (phb->iommugroupid == data->groupid) { > + phb->vfio_data = *data; > + phb->dma_window_start = phb->vfio_data.info.dma32_window_start; > + phb->dma_window_size = phb->vfio_data.info.dma32_window_size; > + return; > + } > + } > +} > + > +static int spapr_pci_scan_vfio(sPAPRPHBState *phb) > +{ > + char iommupath[256]; > + DIR *dirp; > + struct dirent *entry; > + > + if (!phb->scan) { > + trace_spapr_pci("Autoscan disabled for ", phb->dtbusname); > + return 0; > + } > + > + snprintf(iommupath, sizeof(iommupath), > + "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid); > + dirp = opendir(iommupath); > + > + while ((entry = readdir(dirp)) != NULL) { > + char *tmp; > + FILE *deviceclassfile; > + unsigned deviceclass = 0, domainid, busid, devid, fnid; > + char addr[32]; > + DeviceState *dev; > + > + if (sscanf(entry->d_name, "%X:%X:%X.%x", > + &domainid, &busid, &devid, &fnid) != 4) { > + continue; > + } > + > + tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name); > + trace_spapr_pci("Reading device class from ", tmp); > + > + deviceclassfile = fopen(tmp, "r"); > + if (deviceclassfile) { > + fscanf(deviceclassfile, "%x", &deviceclass); > + fclose(deviceclassfile); > + } > + g_free(tmp); > + > + if (!deviceclass) { > + continue; > + } > + if ((phb->scan < 2) && > + ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8))) { > + /* Skip _any_ bridge */ > + continue; > + } > + if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) { > + /* Tweak USB */ > + phb->force_addr = 1; > + phb->enable_multifunction = 1; > + } > + > + trace_spapr_pci("Creating devicei from ", entry->d_name); > + > + dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci"); > + if (!dev) { > + fprintf(stderr, "failed to create vfio-pci\n"); > + continue; > + } > + qdev_prop_parse(dev, "host", entry->d_name); > + if (phb->force_addr) { > + snprintf(addr, sizeof(addr), "%x.%x", devid, fnid); > + qdev_prop_parse(dev, "addr", addr); > + } > + if (phb->enable_multifunction) { > + qdev_prop_set_bit(dev, "multifunction", 1); > + } > + qdev_init_nofail(dev); > + } > + closedir(dirp); > + > + return 0; > +} > + > static int spapr_phb_init(SysBusDevice *s) > { > sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s); > @@ -567,15 +662,13 @@ static int spapr_phb_init(SysBusDevice *s) > > bus = pci_register_bus(&phb->host_state.busdev.qdev, > phb->busname ? phb->busname : phb->dtbusname, > - pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb, > + pci_spapr_set_irq, pci_spapr_get_irq, > + pci_spapr_map_irq, phb, > &phb->memspace, &phb->iospace, > PCI_DEVFN(0, 0), PCI_NUM_PINS); > phb->host_state.bus = bus; > > phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16); > - phb->dma_window_start = 0; > - phb->dma_window_size = 0x40000000; > - phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size); > pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb); > > QLIST_INSERT_HEAD(&spapr->phbs, phb, list); > @@ -588,6 +681,19 @@ static int spapr_phb_init(SysBusDevice *s) > } > } > > + if (phb->iommugroupid >= 0) { > + if (spapr_pci_scan_vfio(phb) < 0) { > + return -1; > + } > + spapr_vfio_init_dma(phb->iommugroupid, phb->dma_liobn, &phb->vfio_data); > + return 0; > + } > + > + phb->dma_window_start = 0; > + phb->dma_window_size = 0x40000000; > + phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, > + phb->dma_window_size); > + > return 0; > } > > @@ -599,6 +705,10 @@ static Property spapr_phb_properties[] = { > DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0), > DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000), > DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0), > + DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1), > + DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1), > + DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0), > + DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0), > DEFINE_PROP_END_OF_LIST(), > }; > > @@ -729,6 +839,10 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map, > sizeof(interrupt_map))); > > + if (!phb->dma_window_size) { > + fprintf(stderr, "Unexpected error: DMA window is zero, exiting\n"); > + exit(1); > + } > spapr_dma_dt(fdt, bus_off, "ibm,dma-window", > phb->dma_liobn, phb->dma_window_start, > phb->dma_window_size); > diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h > index 145071c..ed17053 100644 > --- a/hw/spapr_pci.h > +++ b/hw/spapr_pci.h > @@ -26,6 +26,7 @@ > #include "hw/pci.h" > #include "hw/pci_host.h" > #include "hw/xics.h" > +#include "hw/spapr_iommu_vfio.h" > > #define SPAPR_MSIX_MAX_DEVS 32 > > @@ -57,6 +58,11 @@ typedef struct sPAPRPHBState { > int nvec; > } msi_table[SPAPR_MSIX_MAX_DEVS]; > > + struct sPAPRVFIOData vfio_data; > + int32_t iommugroupid; > + uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */ > + uint8_t enable_multifunction, force_addr; > + > QLIST_ENTRY(sPAPRPHBState) list; > } sPAPRPHBState; > > diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c > index 0ad4761..0023d41 100644 > --- a/hw/vfio_pci.c > +++ b/hw/vfio_pci.c > @@ -43,6 +43,7 @@ > #include "range.h" > #include "vfio_pci.h" > #include "linux-vfio.h" > +#include "hw/spapr_iommu_vfio.h" > > //#define DEBUG_VFIO > #ifdef DEBUG_VFIO > @@ -55,6 +56,8 @@ > > #define MSIX_CAP_LENGTH 12 > > +static VFIOGroup *vfio_get_group(int groupid); > + Unnecessary, because... > static QLIST_HEAD(, VFIOContainer) > container_list = QLIST_HEAD_INITIALIZER(container_list); > > @@ -1088,6 +1091,31 @@ static void vfio_listener_release(VFIOContainer *container) > } > > /* > + * sPAPR TCE DMA interface > + */ > +static int spapr_tce_map(int groupid, struct tce_iommu_dma_map *param) > +{ > + VFIOGroup *group = vfio_get_group(groupid); > + > + if (!group || !group->container) { > + return -1; > + } This should be: static int spapr_tce_map(sPAPRVFIOData *spapr_data, struct tce_iommu_dma_map *param) { VFIOContainer *container; container = container_of(spapr_data, VFIOContainer, iommu_data.spapr); return ioctl(container->fd, .... } > + > + return ioctl(group->container->fd, SPAPR_TCE_IOMMU_MAP_DMA, param); > +} > + > +static int spapr_tce_unmap(int groupid, struct tce_iommu_dma_map *param) > +{ > + VFIOGroup *group = vfio_get_group(groupid); > + > + if (!group || !group->container) { > + return -1; > + } > + > + return ioctl(group->container->fd, SPAPR_TCE_IOMMU_UNMAP_DMA, param); > +} > + > +/* > * Interrupt setup > */ > static void vfio_disable_interrupts(VFIODevice *vdev) > @@ -1590,6 +1618,42 @@ static int vfio_connect_container(VFIOGroup *group) > memory_listener_register(&container->iommu_data.listener, > get_system_memory()); > > + } else if (ioctl(fd, VFIO_CHECK_EXTENSION, SPAPR_TCE_IOMMU)) { > + ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); > + if (ret) { > + error_report("vfio: failed to set group container: %s\n", > + strerror(errno)); > + g_free(container); > + close(fd); > + return -1; > + } > + > + ret = ioctl(fd, VFIO_SET_IOMMU, SPAPR_TCE_IOMMU); > + if (ret) { > + error_report("vfio: failed to set iommu for container: %s\n", > + strerror(errno)); > + g_free(container); > + close(fd); > + return -1; > + } > + > + container->iommu_data.spapr.info.argsz = > + sizeof(container->iommu_data.spapr.info); > + ret = ioctl(fd, SPAPR_TCE_IOMMU_GET_INFO, > + &container->iommu_data.spapr.info); > + if (ret) { > + error_report("vfio: failed to get iommu info for container: %s\n", > + strerror(errno)); > + g_free(container); > + close(fd); > + return -1; > + } > + > + container->iommu_data.spapr.map = spapr_tce_map; > + container->iommu_data.spapr.unmap = spapr_tce_unmap; > + container->iommu_data.spapr.groupid = group->groupid; This at least deserves a comment because x86 doesn't have a 1:1 mapping of container to groupid. I really think it might make more sense to pass a PCIBus here instead of a groupid. Whatever you choose, this shouldn't be part of sPAPRVFIOData, it should be another parameter to spapr_register_vfio_container. > + spapr_register_vfio_container(&container->iommu_data.spapr); > + > } else { > error_report("vfio: No available IOMMU models\n"); > g_free(container); > diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h > index 00bb3dd..ddb5332 100644 > --- a/hw/vfio_pci.h > +++ b/hw/vfio_pci.h > @@ -6,6 +6,7 @@ > #include "pci.h" > #include "ioapic.h" > #include "event_notifier.h" > +#include "hw/spapr_iommu_vfio.h" > > typedef struct VFIOPCIHostDevice { > uint16_t seg; > @@ -59,6 +60,7 @@ typedef struct VFIOContainer { > struct { > union { > MemoryListener listener; > + struct sPAPRVFIOData spapr; > }; > void (*release)(struct VFIOContainer *); > } iommu_data; > diff --git a/trace-events b/trace-events > index e548f86..9100591 100644 > --- a/trace-events > +++ b/trace-events > @@ -848,6 +848,7 @@ qxl_render_guest_primary_resized(int32_t width, int32_t height, int32_t stride, > qxl_render_update_area_done(void *cookie) "%p" > > # hw/spapr_pci.c > +spapr_pci(const char *msg1, const char *msg2) "%s%s" > spapr_pci_msi(const char *msg, uint32_t n, uint32_t ca) "%s (device#%d, cfg=%x)" > spapr_pci_msi_setup(const char *name, unsigned vector, uint64_t addr) "dev\"%s\" vector %u, addr=%"PRIx64 > spapr_pci_rtas_ibm_change_msi(unsigned func, unsigned req) "func %u, requested %u"
On 19/07/12 00:14, Alex Williamson wrote: > On Wed, 2012-07-18 at 21:09 +1000, Alexey Kardashevskiy wrote: >> It literally does the following: >> >> 1. POWERPC IOMMU support (the kernel counterpart is required) >> >> 2. The patch assumes that IOAPIC calls are going to be replaced >> with something generic. >> >> 3. Added sPAPRVFIOData (hw/spapr_iommu_vfio.h) which describes >> the interface between VFIO and sPAPR IOMMU. >> >> 4. Change sPAPR PHB to scan the PCI bus which is used for >> the IOMMU-VFIO group. Now it is enough to add the following to >> the QEMU command line to get VFIO up with all the devices from >> IOMMU group with id=3: >> -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\ >> mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000 >> >> WIth the pathes posted today a bit earlier, this patch fully supports >> VFIO what includes MSIX as well. >> >> ps. yes, I know that linux_vfio.h has moved, will fix it later :) >> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> >> --- >> hw/linux-vfio.h | 26 +++++++++++ >> hw/ppc/Makefile.objs | 3 ++ >> hw/spapr.h | 4 ++ >> hw/spapr_iommu.c | 62 ++++++++++++++++++++++++- >> hw/spapr_iommu_vfio.h | 34 ++++++++++++++ >> hw/spapr_pci.c | 124 +++++++++++++++++++++++++++++++++++++++++++++++-- >> hw/spapr_pci.h | 6 +++ >> hw/vfio_pci.c | 64 +++++++++++++++++++++++++ >> hw/vfio_pci.h | 2 + >> trace-events | 1 + >> 10 files changed, 320 insertions(+), 6 deletions(-) >> create mode 100644 hw/spapr_iommu_vfio.h >> >> diff --git a/hw/linux-vfio.h b/hw/linux-vfio.h >> index 300d49b..27a0501 100644 >> --- a/hw/linux-vfio.h >> +++ b/hw/linux-vfio.h >> @@ -442,4 +442,30 @@ struct vfio_iommu_type1_dma_unmap { >> >> #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) >> >> +/* >> + * Interface to SPAPR TCE (POWERPC Book3S) >> + */ >> +#define SPAPR_TCE_IOMMU 2 >> + >> +struct tce_iommu_info { >> + __u32 argsz; >> + __u32 flags; >> + __u32 dma32_window_start; >> + __u32 dma32_window_size; >> + __u64 dma64_window_start; >> + __u64 dma64_window_size; >> +}; >> + >> +#define SPAPR_TCE_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) >> + >> +struct tce_iommu_dma_map { >> + __u32 argsz; >> + __u32 flags; >> + __u64 va; >> + __u64 dmaaddr; >> +}; >> + >> +#define SPAPR_TCE_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) >> +#define SPAPR_TCE_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) >> + >> #endif /* VFIO_H */ >> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs >> index f573a95..c46a049 100644 >> --- a/hw/ppc/Makefile.objs >> +++ b/hw/ppc/Makefile.objs >> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o >> # Xilinx PPC peripherals >> obj-y += xilinx_ethlite.o >> >> +# VFIO PCI device assignment >> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o >> + >> obj-y := $(addprefix ../,$(obj-y)) >> diff --git a/hw/spapr.h b/hw/spapr.h >> index b37f337..0c15c88 100644 >> --- a/hw/spapr.h >> +++ b/hw/spapr.h >> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname, >> int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname, >> DMAContext *dma); >> >> +struct sPAPRVFIOData; >> +void spapr_vfio_init_dma(int group_id, uint32_t liobn, >> + struct sPAPRVFIOData *data); >> + >> #endif /* !defined (__HW_SPAPR_H__) */ >> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c >> index 50c288d..0a82842 100644 >> --- a/hw/spapr_iommu.c >> +++ b/hw/spapr_iommu.c >> @@ -23,6 +23,8 @@ >> #include "dma.h" >> >> #include "hw/spapr.h" >> +#include "hw/spapr_iommu_vfio.h" >> +#include "hw/vfio_pci.h" >> >> #include <libfdt.h> >> >> @@ -183,6 +185,60 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce) >> return 0; >> } >> >> +typedef struct sPAPRVFIOTable { >> + struct sPAPRVFIOData *data; >> + uint32_t liobn; >> + QLIST_ENTRY(sPAPRVFIOTable) list; >> +} sPAPRVFIOTable; >> + >> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables; >> + >> +void spapr_vfio_init_dma(int group_id, uint32_t liobn, >> + struct sPAPRVFIOData *data) >> +{ >> + sPAPRVFIOTable *t; >> + >> + t = g_malloc0(sizeof(*t)); >> + t->data = data; >> + t->liobn = liobn; >> + >> + QLIST_INSERT_HEAD(&vfio_tce_tables, t, list); >> +} >> + >> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce) >> +{ >> + sPAPRVFIOTable *t; >> + struct tce_iommu_dma_map map = { >> + .argsz = sizeof(map), >> + .va = 0, >> + .dmaaddr = ioba, >> + }; >> + >> + QLIST_FOREACH(t, &vfio_tce_tables, list) { >> + if (t->liobn != liobn) { >> + continue; >> + } >> + if (!t->data) { >> + return H_NO_MEM; >> + } > > Why would this ever happen? > >> + if (tce) { >> + map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK); >> + >> + if (t->data->map(t->data->groupid, &map)) { > > Just pass t->data, this is why the VFIOContainer has a union. > >> + perror("TCE_MAP_DMA"); >> + return H_PARAMETER; >> + } >> + } else { >> + if (t->data->unmap(t->data->groupid, &map)) { >> + perror("TCE_UNMAP_DMA"); >> + return H_PARAMETER; >> + } >> + } >> + return H_SUCCESS; >> + } >> + return H_CONTINUE; /* positive non-zero value */ >> +} >> + >> static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr, >> target_ulong opcode, target_ulong *args) >> { >> @@ -200,7 +256,11 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr, >> ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1); >> >> ret = put_tce_emu(liobn, ioba, tce); >> - if (0 >= ret) { >> + if (ret <= 0) { >> + return ret ? H_PARAMETER : H_SUCCESS; >> + } >> + ret = put_tce_vfio(liobn, ioba, tce); >> + if (ret <= 0) { >> return ret ? H_PARAMETER : H_SUCCESS; >> } >> #ifdef DEBUG_TCE >> diff --git a/hw/spapr_iommu_vfio.h b/hw/spapr_iommu_vfio.h >> new file mode 100644 >> index 0000000..b3d6115 >> --- /dev/null >> +++ b/hw/spapr_iommu_vfio.h >> @@ -0,0 +1,34 @@ >> +/* >> + * Definitions for VFIO IOMMU implementation for SPAPR TCE. >> + * >> + * Copyright (c) 2012 Alexey Kardashevskiy <aik@olabs.ru> >> + * >> + * This library is free software; you can redistribute it and/or >> + * modify it under the terms of the GNU Lesser General Public >> + * License as published by the Free Software Foundation; either >> + * version 2 of the License, or (at your option) any later version. >> + * >> + * This library is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + * Lesser General Public License for more details. >> + * >> + * You should have received a copy of the GNU Lesser General Public >> + * License along with this library; if not, see <http://www.gnu.org/licenses/>. >> + */ >> + >> +#if !defined(__HW_SPAPR_IOMMU_VFIO_H__) >> +#define __HW_SPAPR_IOMMU_VFIO_H__ >> + >> +#include "hw/linux-vfio.h" >> + >> +struct sPAPRVFIOData { >> + int groupid; >> + struct tce_iommu_info info; > > Seems a little lazy to embed this whole thing here, what does anyone > need argsz and flags for later? Trying to keep the same structures for both QEMU and the kernel as it is a SPAPR_TCE protocol. Otherwise I would have to copy the whole structure but argsz/flags. >> + int (*map)(int groupid, struct tce_iommu_dma_map *param); >> + int (*unmap)(int groupid, struct tce_iommu_dma_map *param); > > s/int groupid/struct sPAPRVFIOData */ in map/unmap > >> +}; >> + >> +void spapr_register_vfio_container(struct sPAPRVFIOData *data); >> + >> +#endif >> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c >> index 88c8f2c..75743e7 100644 >> --- a/hw/spapr_pci.c >> +++ b/hw/spapr_pci.c >> @@ -22,6 +22,9 @@ >> * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN >> * THE SOFTWARE. >> */ >> +#include <sys/types.h> >> +#include <dirent.h> >> + >> #include "hw.h" >> #include "pci.h" >> #include "msi.h" >> @@ -32,7 +35,6 @@ >> #include "exec-memory.h" >> #include <libfdt.h> >> #include "trace.h" >> - >> #include "hw/pci_internals.h" >> >> /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */ >> @@ -440,6 +442,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level) >> level); >> } >> >> +static int pci_spapr_get_irq(void *opaque, int irq_num) >> +{ >> + sPAPRPHBState *phb = opaque; >> + return phb->lsi_table[irq_num].dt_irq; >> +} >> + >> static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr, >> unsigned size) >> { >> @@ -515,6 +523,93 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque, >> return phb->dma; >> } >> >> +void spapr_register_vfio_container(struct sPAPRVFIOData *data) >> +{ >> + sPAPRPHBState *phb; >> + >> + QLIST_FOREACH(phb, &spapr->phbs, list) { >> + if (phb->iommugroupid == data->groupid) { >> + phb->vfio_data = *data; >> + phb->dma_window_start = phb->vfio_data.info.dma32_window_start; >> + phb->dma_window_size = phb->vfio_data.info.dma32_window_size; >> + return; >> + } >> + } >> +} >> + >> +static int spapr_pci_scan_vfio(sPAPRPHBState *phb) >> +{ >> + char iommupath[256]; >> + DIR *dirp; >> + struct dirent *entry; >> + >> + if (!phb->scan) { >> + trace_spapr_pci("Autoscan disabled for ", phb->dtbusname); >> + return 0; >> + } >> + >> + snprintf(iommupath, sizeof(iommupath), >> + "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid); >> + dirp = opendir(iommupath); >> + >> + while ((entry = readdir(dirp)) != NULL) { >> + char *tmp; >> + FILE *deviceclassfile; >> + unsigned deviceclass = 0, domainid, busid, devid, fnid; >> + char addr[32]; >> + DeviceState *dev; >> + >> + if (sscanf(entry->d_name, "%X:%X:%X.%x", >> + &domainid, &busid, &devid, &fnid) != 4) { >> + continue; >> + } >> + >> + tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name); >> + trace_spapr_pci("Reading device class from ", tmp); >> + >> + deviceclassfile = fopen(tmp, "r"); >> + if (deviceclassfile) { >> + fscanf(deviceclassfile, "%x", &deviceclass); >> + fclose(deviceclassfile); >> + } >> + g_free(tmp); >> + >> + if (!deviceclass) { >> + continue; >> + } >> + if ((phb->scan < 2) && >> + ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8))) { >> + /* Skip _any_ bridge */ >> + continue; >> + } >> + if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) { >> + /* Tweak USB */ >> + phb->force_addr = 1; >> + phb->enable_multifunction = 1; >> + } >> + >> + trace_spapr_pci("Creating devicei from ", entry->d_name); >> + >> + dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci"); >> + if (!dev) { >> + fprintf(stderr, "failed to create vfio-pci\n"); >> + continue; >> + } >> + qdev_prop_parse(dev, "host", entry->d_name); >> + if (phb->force_addr) { >> + snprintf(addr, sizeof(addr), "%x.%x", devid, fnid); >> + qdev_prop_parse(dev, "addr", addr); >> + } >> + if (phb->enable_multifunction) { >> + qdev_prop_set_bit(dev, "multifunction", 1); >> + } >> + qdev_init_nofail(dev); >> + } >> + closedir(dirp); >> + >> + return 0; >> +} >> + >> static int spapr_phb_init(SysBusDevice *s) >> { >> sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s); >> @@ -567,15 +662,13 @@ static int spapr_phb_init(SysBusDevice *s) >> >> bus = pci_register_bus(&phb->host_state.busdev.qdev, >> phb->busname ? phb->busname : phb->dtbusname, >> - pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb, >> + pci_spapr_set_irq, pci_spapr_get_irq, >> + pci_spapr_map_irq, phb, >> &phb->memspace, &phb->iospace, >> PCI_DEVFN(0, 0), PCI_NUM_PINS); >> phb->host_state.bus = bus; >> >> phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16); >> - phb->dma_window_start = 0; >> - phb->dma_window_size = 0x40000000; >> - phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size); >> pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb); >> >> QLIST_INSERT_HEAD(&spapr->phbs, phb, list); >> @@ -588,6 +681,19 @@ static int spapr_phb_init(SysBusDevice *s) >> } >> } >> >> + if (phb->iommugroupid >= 0) { >> + if (spapr_pci_scan_vfio(phb) < 0) { >> + return -1; >> + } >> + spapr_vfio_init_dma(phb->iommugroupid, phb->dma_liobn, &phb->vfio_data); >> + return 0; >> + } >> + >> + phb->dma_window_start = 0; >> + phb->dma_window_size = 0x40000000; >> + phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, >> + phb->dma_window_size); >> + >> return 0; >> } >> >> @@ -599,6 +705,10 @@ static Property spapr_phb_properties[] = { >> DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0), >> DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000), >> DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0), >> + DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1), >> + DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1), >> + DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0), >> + DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0), >> DEFINE_PROP_END_OF_LIST(), >> }; >> >> @@ -729,6 +839,10 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, >> _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map, >> sizeof(interrupt_map))); >> >> + if (!phb->dma_window_size) { >> + fprintf(stderr, "Unexpected error: DMA window is zero, exiting\n"); >> + exit(1); >> + } >> spapr_dma_dt(fdt, bus_off, "ibm,dma-window", >> phb->dma_liobn, phb->dma_window_start, >> phb->dma_window_size); >> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h >> index 145071c..ed17053 100644 >> --- a/hw/spapr_pci.h >> +++ b/hw/spapr_pci.h >> @@ -26,6 +26,7 @@ >> #include "hw/pci.h" >> #include "hw/pci_host.h" >> #include "hw/xics.h" >> +#include "hw/spapr_iommu_vfio.h" >> >> #define SPAPR_MSIX_MAX_DEVS 32 >> >> @@ -57,6 +58,11 @@ typedef struct sPAPRPHBState { >> int nvec; >> } msi_table[SPAPR_MSIX_MAX_DEVS]; >> >> + struct sPAPRVFIOData vfio_data; >> + int32_t iommugroupid; >> + uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */ >> + uint8_t enable_multifunction, force_addr; >> + >> QLIST_ENTRY(sPAPRPHBState) list; >> } sPAPRPHBState; >> >> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c >> index 0ad4761..0023d41 100644 >> --- a/hw/vfio_pci.c >> +++ b/hw/vfio_pci.c >> @@ -43,6 +43,7 @@ >> #include "range.h" >> #include "vfio_pci.h" >> #include "linux-vfio.h" >> +#include "hw/spapr_iommu_vfio.h" >> >> //#define DEBUG_VFIO >> #ifdef DEBUG_VFIO >> @@ -55,6 +56,8 @@ >> >> #define MSIX_CAP_LENGTH 12 >> >> +static VFIOGroup *vfio_get_group(int groupid); >> + > > Unnecessary, because... > >> static QLIST_HEAD(, VFIOContainer) >> container_list = QLIST_HEAD_INITIALIZER(container_list); >> >> @@ -1088,6 +1091,31 @@ static void vfio_listener_release(VFIOContainer *container) >> } >> >> /* >> + * sPAPR TCE DMA interface >> + */ >> +static int spapr_tce_map(int groupid, struct tce_iommu_dma_map *param) >> +{ >> + VFIOGroup *group = vfio_get_group(groupid); >> + >> + if (!group || !group->container) { >> + return -1; >> + } > > This should be: > > static int spapr_tce_map(sPAPRVFIOData *spapr_data, struct tce_iommu_dma_map *param) > { > VFIOContainer *container; > > container = container_of(spapr_data, VFIOContainer, iommu_data.spapr); > > return ioctl(container->fd, .... > } > > >> + >> + return ioctl(group->container->fd, SPAPR_TCE_IOMMU_MAP_DMA, param); >> +} >> + >> +static int spapr_tce_unmap(int groupid, struct tce_iommu_dma_map *param) >> +{ >> + VFIOGroup *group = vfio_get_group(groupid); >> + >> + if (!group || !group->container) { >> + return -1; >> + } >> + >> + return ioctl(group->container->fd, SPAPR_TCE_IOMMU_UNMAP_DMA, param); >> +} >> + >> +/* >> * Interrupt setup >> */ >> static void vfio_disable_interrupts(VFIODevice *vdev) >> @@ -1590,6 +1618,42 @@ static int vfio_connect_container(VFIOGroup *group) >> memory_listener_register(&container->iommu_data.listener, >> get_system_memory()); >> >> + } else if (ioctl(fd, VFIO_CHECK_EXTENSION, SPAPR_TCE_IOMMU)) { >> + ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); >> + if (ret) { >> + error_report("vfio: failed to set group container: %s\n", >> + strerror(errno)); >> + g_free(container); >> + close(fd); >> + return -1; >> + } >> + >> + ret = ioctl(fd, VFIO_SET_IOMMU, SPAPR_TCE_IOMMU); >> + if (ret) { >> + error_report("vfio: failed to set iommu for container: %s\n", >> + strerror(errno)); >> + g_free(container); >> + close(fd); >> + return -1; >> + } >> + >> + container->iommu_data.spapr.info.argsz = >> + sizeof(container->iommu_data.spapr.info); >> + ret = ioctl(fd, SPAPR_TCE_IOMMU_GET_INFO, >> + &container->iommu_data.spapr.info); >> + if (ret) { >> + error_report("vfio: failed to get iommu info for container: %s\n", >> + strerror(errno)); >> + g_free(container); >> + close(fd); >> + return -1; >> + } >> + >> + container->iommu_data.spapr.map = spapr_tce_map; >> + container->iommu_data.spapr.unmap = spapr_tce_unmap; >> + container->iommu_data.spapr.groupid = group->groupid; > > This at least deserves a comment because x86 doesn't have a 1:1 mapping > of container to groupid. Give me a good example and add a comment about why you need MemoryListener as powerpc does not need the whole RAM to be mapped as a DMA Window :) Honestly, I do not know what to write there what would make sense, explain at least to a powerpc-familiar person what is going on and not to be a 100 lines essay. I will put some though. > I really think it might make more sense to pass a PCIBus here instead of a groupid. Or a PCIDevice if I understand SRIOV right. Many choices. Let's stay with groupid for now. > Whatever you choose, this > shouldn't be part of sPAPRVFIOData, it should be another parameter to > spapr_register_vfio_container. Done. I will resend the patch. Thanks for your comments. >> + spapr_register_vfio_container(&container->iommu_data.spapr); >> + >> } else { >> error_report("vfio: No available IOMMU models\n"); >> g_free(container); >> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h >> index 00bb3dd..ddb5332 100644 >> --- a/hw/vfio_pci.h >> +++ b/hw/vfio_pci.h >> @@ -6,6 +6,7 @@ >> #include "pci.h" >> #include "ioapic.h" >> #include "event_notifier.h" >> +#include "hw/spapr_iommu_vfio.h" >> >> typedef struct VFIOPCIHostDevice { >> uint16_t seg; >> @@ -59,6 +60,7 @@ typedef struct VFIOContainer { >> struct { >> union { >> MemoryListener listener; >> + struct sPAPRVFIOData spapr; >> }; >> void (*release)(struct VFIOContainer *); >> } iommu_data; >> diff --git a/trace-events b/trace-events >> index e548f86..9100591 100644 >> --- a/trace-events >> +++ b/trace-events >> @@ -848,6 +848,7 @@ qxl_render_guest_primary_resized(int32_t width, int32_t height, int32_t stride, >> qxl_render_update_area_done(void *cookie) "%p" >> >> # hw/spapr_pci.c >> +spapr_pci(const char *msg1, const char *msg2) "%s%s" >> spapr_pci_msi(const char *msg, uint32_t n, uint32_t ca) "%s (device#%d, cfg=%x)" >> spapr_pci_msi_setup(const char *name, unsigned vector, uint64_t addr) "dev\"%s\" vector %u, addr=%"PRIx64 >> spapr_pci_rtas_ibm_change_msi(unsigned func, unsigned req) "func %u, requested %u" > > >
diff --git a/hw/linux-vfio.h b/hw/linux-vfio.h index 300d49b..27a0501 100644 --- a/hw/linux-vfio.h +++ b/hw/linux-vfio.h @@ -442,4 +442,30 @@ struct vfio_iommu_type1_dma_unmap { #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) +/* + * Interface to SPAPR TCE (POWERPC Book3S) + */ +#define SPAPR_TCE_IOMMU 2 + +struct tce_iommu_info { + __u32 argsz; + __u32 flags; + __u32 dma32_window_start; + __u32 dma32_window_size; + __u64 dma64_window_start; + __u64 dma64_window_size; +}; + +#define SPAPR_TCE_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) + +struct tce_iommu_dma_map { + __u32 argsz; + __u32 flags; + __u64 va; + __u64 dmaaddr; +}; + +#define SPAPR_TCE_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) +#define SPAPR_TCE_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) + #endif /* VFIO_H */ diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs index f573a95..c46a049 100644 --- a/hw/ppc/Makefile.objs +++ b/hw/ppc/Makefile.objs @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o # Xilinx PPC peripherals obj-y += xilinx_ethlite.o +# VFIO PCI device assignment +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o + obj-y := $(addprefix ../,$(obj-y)) diff --git a/hw/spapr.h b/hw/spapr.h index b37f337..0c15c88 100644 --- a/hw/spapr.h +++ b/hw/spapr.h @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname, int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname, DMAContext *dma); +struct sPAPRVFIOData; +void spapr_vfio_init_dma(int group_id, uint32_t liobn, + struct sPAPRVFIOData *data); + #endif /* !defined (__HW_SPAPR_H__) */ diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c index 50c288d..0a82842 100644 --- a/hw/spapr_iommu.c +++ b/hw/spapr_iommu.c @@ -23,6 +23,8 @@ #include "dma.h" #include "hw/spapr.h" +#include "hw/spapr_iommu_vfio.h" +#include "hw/vfio_pci.h" #include <libfdt.h> @@ -183,6 +185,60 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce) return 0; } +typedef struct sPAPRVFIOTable { + struct sPAPRVFIOData *data; + uint32_t liobn; + QLIST_ENTRY(sPAPRVFIOTable) list; +} sPAPRVFIOTable; + +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables; + +void spapr_vfio_init_dma(int group_id, uint32_t liobn, + struct sPAPRVFIOData *data) +{ + sPAPRVFIOTable *t; + + t = g_malloc0(sizeof(*t)); + t->data = data; + t->liobn = liobn; + + QLIST_INSERT_HEAD(&vfio_tce_tables, t, list); +} + +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce) +{ + sPAPRVFIOTable *t; + struct tce_iommu_dma_map map = { + .argsz = sizeof(map), + .va = 0, + .dmaaddr = ioba, + }; + + QLIST_FOREACH(t, &vfio_tce_tables, list) { + if (t->liobn != liobn) { + continue; + } + if (!t->data) { + return H_NO_MEM; + } + if (tce) { + map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK); + + if (t->data->map(t->data->groupid, &map)) { + perror("TCE_MAP_DMA"); + return H_PARAMETER; + } + } else { + if (t->data->unmap(t->data->groupid, &map)) { + perror("TCE_UNMAP_DMA"); + return H_PARAMETER; + } + } + return H_SUCCESS; + } + return H_CONTINUE; /* positive non-zero value */ +} + static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr, target_ulong opcode, target_ulong *args) { @@ -200,7 +256,11 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr, ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1); ret = put_tce_emu(liobn, ioba, tce); - if (0 >= ret) { + if (ret <= 0) { + return ret ? H_PARAMETER : H_SUCCESS; + } + ret = put_tce_vfio(liobn, ioba, tce); + if (ret <= 0) { return ret ? H_PARAMETER : H_SUCCESS; } #ifdef DEBUG_TCE diff --git a/hw/spapr_iommu_vfio.h b/hw/spapr_iommu_vfio.h new file mode 100644 index 0000000..b3d6115 --- /dev/null +++ b/hw/spapr_iommu_vfio.h @@ -0,0 +1,34 @@ +/* + * Definitions for VFIO IOMMU implementation for SPAPR TCE. + * + * Copyright (c) 2012 Alexey Kardashevskiy <aik@olabs.ru> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#if !defined(__HW_SPAPR_IOMMU_VFIO_H__) +#define __HW_SPAPR_IOMMU_VFIO_H__ + +#include "hw/linux-vfio.h" + +struct sPAPRVFIOData { + int groupid; + struct tce_iommu_info info; + int (*map)(int groupid, struct tce_iommu_dma_map *param); + int (*unmap)(int groupid, struct tce_iommu_dma_map *param); +}; + +void spapr_register_vfio_container(struct sPAPRVFIOData *data); + +#endif diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c index 88c8f2c..75743e7 100644 --- a/hw/spapr_pci.c +++ b/hw/spapr_pci.c @@ -22,6 +22,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ +#include <sys/types.h> +#include <dirent.h> + #include "hw.h" #include "pci.h" #include "msi.h" @@ -32,7 +35,6 @@ #include "exec-memory.h" #include <libfdt.h> #include "trace.h" - #include "hw/pci_internals.h" /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */ @@ -440,6 +442,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level) level); } +static int pci_spapr_get_irq(void *opaque, int irq_num) +{ + sPAPRPHBState *phb = opaque; + return phb->lsi_table[irq_num].dt_irq; +} + static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr, unsigned size) { @@ -515,6 +523,93 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque, return phb->dma; } +void spapr_register_vfio_container(struct sPAPRVFIOData *data) +{ + sPAPRPHBState *phb; + + QLIST_FOREACH(phb, &spapr->phbs, list) { + if (phb->iommugroupid == data->groupid) { + phb->vfio_data = *data; + phb->dma_window_start = phb->vfio_data.info.dma32_window_start; + phb->dma_window_size = phb->vfio_data.info.dma32_window_size; + return; + } + } +} + +static int spapr_pci_scan_vfio(sPAPRPHBState *phb) +{ + char iommupath[256]; + DIR *dirp; + struct dirent *entry; + + if (!phb->scan) { + trace_spapr_pci("Autoscan disabled for ", phb->dtbusname); + return 0; + } + + snprintf(iommupath, sizeof(iommupath), + "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid); + dirp = opendir(iommupath); + + while ((entry = readdir(dirp)) != NULL) { + char *tmp; + FILE *deviceclassfile; + unsigned deviceclass = 0, domainid, busid, devid, fnid; + char addr[32]; + DeviceState *dev; + + if (sscanf(entry->d_name, "%X:%X:%X.%x", + &domainid, &busid, &devid, &fnid) != 4) { + continue; + } + + tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name); + trace_spapr_pci("Reading device class from ", tmp); + + deviceclassfile = fopen(tmp, "r"); + if (deviceclassfile) { + fscanf(deviceclassfile, "%x", &deviceclass); + fclose(deviceclassfile); + } + g_free(tmp); + + if (!deviceclass) { + continue; + } + if ((phb->scan < 2) && + ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8))) { + /* Skip _any_ bridge */ + continue; + } + if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) { + /* Tweak USB */ + phb->force_addr = 1; + phb->enable_multifunction = 1; + } + + trace_spapr_pci("Creating devicei from ", entry->d_name); + + dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci"); + if (!dev) { + fprintf(stderr, "failed to create vfio-pci\n"); + continue; + } + qdev_prop_parse(dev, "host", entry->d_name); + if (phb->force_addr) { + snprintf(addr, sizeof(addr), "%x.%x", devid, fnid); + qdev_prop_parse(dev, "addr", addr); + } + if (phb->enable_multifunction) { + qdev_prop_set_bit(dev, "multifunction", 1); + } + qdev_init_nofail(dev); + } + closedir(dirp); + + return 0; +} + static int spapr_phb_init(SysBusDevice *s) { sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s); @@ -567,15 +662,13 @@ static int spapr_phb_init(SysBusDevice *s) bus = pci_register_bus(&phb->host_state.busdev.qdev, phb->busname ? phb->busname : phb->dtbusname, - pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb, + pci_spapr_set_irq, pci_spapr_get_irq, + pci_spapr_map_irq, phb, &phb->memspace, &phb->iospace, PCI_DEVFN(0, 0), PCI_NUM_PINS); phb->host_state.bus = bus; phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16); - phb->dma_window_start = 0; - phb->dma_window_size = 0x40000000; - phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size); pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb); QLIST_INSERT_HEAD(&spapr->phbs, phb, list); @@ -588,6 +681,19 @@ static int spapr_phb_init(SysBusDevice *s) } } + if (phb->iommugroupid >= 0) { + if (spapr_pci_scan_vfio(phb) < 0) { + return -1; + } + spapr_vfio_init_dma(phb->iommugroupid, phb->dma_liobn, &phb->vfio_data); + return 0; + } + + phb->dma_window_start = 0; + phb->dma_window_size = 0x40000000; + phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, + phb->dma_window_size); + return 0; } @@ -599,6 +705,10 @@ static Property spapr_phb_properties[] = { DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0), DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000), DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0), + DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1), + DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1), + DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0), + DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0), DEFINE_PROP_END_OF_LIST(), }; @@ -729,6 +839,10 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map, sizeof(interrupt_map))); + if (!phb->dma_window_size) { + fprintf(stderr, "Unexpected error: DMA window is zero, exiting\n"); + exit(1); + } spapr_dma_dt(fdt, bus_off, "ibm,dma-window", phb->dma_liobn, phb->dma_window_start, phb->dma_window_size); diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h index 145071c..ed17053 100644 --- a/hw/spapr_pci.h +++ b/hw/spapr_pci.h @@ -26,6 +26,7 @@ #include "hw/pci.h" #include "hw/pci_host.h" #include "hw/xics.h" +#include "hw/spapr_iommu_vfio.h" #define SPAPR_MSIX_MAX_DEVS 32 @@ -57,6 +58,11 @@ typedef struct sPAPRPHBState { int nvec; } msi_table[SPAPR_MSIX_MAX_DEVS]; + struct sPAPRVFIOData vfio_data; + int32_t iommugroupid; + uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */ + uint8_t enable_multifunction, force_addr; + QLIST_ENTRY(sPAPRPHBState) list; } sPAPRPHBState; diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index 0ad4761..0023d41 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -43,6 +43,7 @@ #include "range.h" #include "vfio_pci.h" #include "linux-vfio.h" +#include "hw/spapr_iommu_vfio.h" //#define DEBUG_VFIO #ifdef DEBUG_VFIO @@ -55,6 +56,8 @@ #define MSIX_CAP_LENGTH 12 +static VFIOGroup *vfio_get_group(int groupid); + static QLIST_HEAD(, VFIOContainer) container_list = QLIST_HEAD_INITIALIZER(container_list); @@ -1088,6 +1091,31 @@ static void vfio_listener_release(VFIOContainer *container) } /* + * sPAPR TCE DMA interface + */ +static int spapr_tce_map(int groupid, struct tce_iommu_dma_map *param) +{ + VFIOGroup *group = vfio_get_group(groupid); + + if (!group || !group->container) { + return -1; + } + + return ioctl(group->container->fd, SPAPR_TCE_IOMMU_MAP_DMA, param); +} + +static int spapr_tce_unmap(int groupid, struct tce_iommu_dma_map *param) +{ + VFIOGroup *group = vfio_get_group(groupid); + + if (!group || !group->container) { + return -1; + } + + return ioctl(group->container->fd, SPAPR_TCE_IOMMU_UNMAP_DMA, param); +} + +/* * Interrupt setup */ static void vfio_disable_interrupts(VFIODevice *vdev) @@ -1590,6 +1618,42 @@ static int vfio_connect_container(VFIOGroup *group) memory_listener_register(&container->iommu_data.listener, get_system_memory()); + } else if (ioctl(fd, VFIO_CHECK_EXTENSION, SPAPR_TCE_IOMMU)) { + ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); + if (ret) { + error_report("vfio: failed to set group container: %s\n", + strerror(errno)); + g_free(container); + close(fd); + return -1; + } + + ret = ioctl(fd, VFIO_SET_IOMMU, SPAPR_TCE_IOMMU); + if (ret) { + error_report("vfio: failed to set iommu for container: %s\n", + strerror(errno)); + g_free(container); + close(fd); + return -1; + } + + container->iommu_data.spapr.info.argsz = + sizeof(container->iommu_data.spapr.info); + ret = ioctl(fd, SPAPR_TCE_IOMMU_GET_INFO, + &container->iommu_data.spapr.info); + if (ret) { + error_report("vfio: failed to get iommu info for container: %s\n", + strerror(errno)); + g_free(container); + close(fd); + return -1; + } + + container->iommu_data.spapr.map = spapr_tce_map; + container->iommu_data.spapr.unmap = spapr_tce_unmap; + container->iommu_data.spapr.groupid = group->groupid; + spapr_register_vfio_container(&container->iommu_data.spapr); + } else { error_report("vfio: No available IOMMU models\n"); g_free(container); diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h index 00bb3dd..ddb5332 100644 --- a/hw/vfio_pci.h +++ b/hw/vfio_pci.h @@ -6,6 +6,7 @@ #include "pci.h" #include "ioapic.h" #include "event_notifier.h" +#include "hw/spapr_iommu_vfio.h" typedef struct VFIOPCIHostDevice { uint16_t seg; @@ -59,6 +60,7 @@ typedef struct VFIOContainer { struct { union { MemoryListener listener; + struct sPAPRVFIOData spapr; }; void (*release)(struct VFIOContainer *); } iommu_data; diff --git a/trace-events b/trace-events index e548f86..9100591 100644 --- a/trace-events +++ b/trace-events @@ -848,6 +848,7 @@ qxl_render_guest_primary_resized(int32_t width, int32_t height, int32_t stride, qxl_render_update_area_done(void *cookie) "%p" # hw/spapr_pci.c +spapr_pci(const char *msg1, const char *msg2) "%s%s" spapr_pci_msi(const char *msg, uint32_t n, uint32_t ca) "%s (device#%d, cfg=%x)" spapr_pci_msi_setup(const char *name, unsigned vector, uint64_t addr) "dev\"%s\" vector %u, addr=%"PRIx64 spapr_pci_rtas_ibm_change_msi(unsigned func, unsigned req) "func %u, requested %u"
It literally does the following: 1. POWERPC IOMMU support (the kernel counterpart is required) 2. The patch assumes that IOAPIC calls are going to be replaced with something generic. 3. Added sPAPRVFIOData (hw/spapr_iommu_vfio.h) which describes the interface between VFIO and sPAPR IOMMU. 4. Change sPAPR PHB to scan the PCI bus which is used for the IOMMU-VFIO group. Now it is enough to add the following to the QEMU command line to get VFIO up with all the devices from IOMMU group with id=3: -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\ mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000 WIth the pathes posted today a bit earlier, this patch fully supports VFIO what includes MSIX as well. ps. yes, I know that linux_vfio.h has moved, will fix it later :) Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> --- hw/linux-vfio.h | 26 +++++++++++ hw/ppc/Makefile.objs | 3 ++ hw/spapr.h | 4 ++ hw/spapr_iommu.c | 62 ++++++++++++++++++++++++- hw/spapr_iommu_vfio.h | 34 ++++++++++++++ hw/spapr_pci.c | 124 +++++++++++++++++++++++++++++++++++++++++++++++-- hw/spapr_pci.h | 6 +++ hw/vfio_pci.c | 64 +++++++++++++++++++++++++ hw/vfio_pci.h | 2 + trace-events | 1 + 10 files changed, 320 insertions(+), 6 deletions(-) create mode 100644 hw/spapr_iommu_vfio.h