diff mbox

[v5,10/11] spapr-vfio: add spapr-pci-vfio-host-bridge to support vfio

Message ID 1394603550-11556-11-git-send-email-aik@ozlabs.ru
State New
Headers show

Commit Message

Alexey Kardashevskiy March 12, 2014, 5:52 a.m. UTC
The patch adds a spapr-pci-vfio-host-bridge device type
which is a PCI Host Bridge with VFIO support. The new device
inherits from the spapr-pci-host-bridge device and adds
the following properties:
	iommu - IOMMU group ID which represents a Partitionable
	 	Endpoint, QEMU/ppc64 uses a separate PHB for
		an IOMMU group so the guest kernel has to have
		PCI Domain support enabled.
	forceaddr (optional, 0 by default) - forces QEMU to copy
		device:function from the host address as
		certain guest drivers expect devices to appear in
		particular locations;
	mf (optional, 0 by default) - forces multifunction bit for
		the function #0 of a found device, only makes sense
		for multifunction devices and only with the forceaddr
		property set. It would not be required if there
		was a way to know in advance whether a device is
		multifunctional or not.
	scan (optional, 1 by default) - if non-zero, the new PHB walks
		through all non-bridge devices in the group and tries
		adding them to the PHB; if zero, all devices in the group
		have to be configured manually via the QEMU command line.

Examples of use:
1) Scan and add all devices from IOMMU group with ID=1 to QEMU's PHB #6:
	-device spapr-pci-vfio-host-bridge,id=DEVICENAME,iommu=1,index=6

2) Configure and Add 3 functions of a multifunctional device to QEMU:
(the NEC PCI USB card is used as an example here):
	-device spapr-pci-vfio-host-bridge,id=USB,iommu=4,scan=0,index=7 \
	-device vfio-pci,host=4:0:1.0,addr=1.0,bus=USB,multifunction=true
	-device vfio-pci,host=4:0:1.1,addr=1.1,bus=USB
	-device vfio-pci,host=4:0:1.2,addr=1.2,bus=USB

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v5:
* added handling of possible failure of spapr_vfio_new_table()

v4:
* moved IOMMU changes to separate patches
* moved spapr-pci-vfio-host-bridge to new file
---
 hw/ppc/Makefile.objs        |   2 +-
 hw/ppc/spapr_pci_vfio.c     | 206 ++++++++++++++++++++++++++++++++++++++++++++
 include/hw/pci-host/spapr.h |  13 +++
 3 files changed, 220 insertions(+), 1 deletion(-)
 create mode 100644 hw/ppc/spapr_pci_vfio.c

Comments

Alex Williamson March 19, 2014, 7:57 p.m. UTC | #1
On Wed, 2014-03-12 at 16:52 +1100, Alexey Kardashevskiy wrote:
> The patch adds a spapr-pci-vfio-host-bridge device type
> which is a PCI Host Bridge with VFIO support. The new device
> inherits from the spapr-pci-host-bridge device and adds
> the following properties:
> 	iommu - IOMMU group ID which represents a Partitionable
> 	 	Endpoint, QEMU/ppc64 uses a separate PHB for
> 		an IOMMU group so the guest kernel has to have
> 		PCI Domain support enabled.
> 	forceaddr (optional, 0 by default) - forces QEMU to copy
> 		device:function from the host address as
> 		certain guest drivers expect devices to appear in
> 		particular locations;
> 	mf (optional, 0 by default) - forces multifunction bit for
> 		the function #0 of a found device, only makes sense
> 		for multifunction devices and only with the forceaddr
> 		property set. It would not be required if there
> 		was a way to know in advance whether a device is
> 		multifunctional or not.
> 	scan (optional, 1 by default) - if non-zero, the new PHB walks
> 		through all non-bridge devices in the group and tries
> 		adding them to the PHB; if zero, all devices in the group
> 		have to be configured manually via the QEMU command line.
> 
> Examples of use:
> 1) Scan and add all devices from IOMMU group with ID=1 to QEMU's PHB #6:
> 	-device spapr-pci-vfio-host-bridge,id=DEVICENAME,iommu=1,index=6
> 
> 2) Configure and Add 3 functions of a multifunctional device to QEMU:
> (the NEC PCI USB card is used as an example here):
> 	-device spapr-pci-vfio-host-bridge,id=USB,iommu=4,scan=0,index=7 \
> 	-device vfio-pci,host=4:0:1.0,addr=1.0,bus=USB,multifunction=true
> 	-device vfio-pci,host=4:0:1.1,addr=1.1,bus=USB
> 	-device vfio-pci,host=4:0:1.2,addr=1.2,bus=USB

I won't pretend to predict the reaction of QEMU device architects to
this, but it seems like the assembly we expect from config files or
outside utilities, ex. libvirt.  I don't doubt this makes qemu
commandline usage more palatable, but it seems contrary to some of the
other things we do in QEMU with devices.  If this is something we need,
why is it specific to spapr?  IOMMU group can contain multiple devices
on any platform.  On x86 we could do something similar with a p2p
bridge, switch, or root port.

BTW, the code skips bridges, but doesn't that mean you'll have a hard
time with forceaddr as you potentially try to overlay devfn from
multiple buses onto a single bus?  It also makes the value of this a bit
more questionable since it seems to fall apart so easily.  Thanks,

Alex

> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> Changes:
> v5:
> * added handling of possible failure of spapr_vfio_new_table()
> 
> v4:
> * moved IOMMU changes to separate patches
> * moved spapr-pci-vfio-host-bridge to new file
> ---
>  hw/ppc/Makefile.objs        |   2 +-
>  hw/ppc/spapr_pci_vfio.c     | 206 ++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/pci-host/spapr.h |  13 +++
>  3 files changed, 220 insertions(+), 1 deletion(-)
>  create mode 100644 hw/ppc/spapr_pci_vfio.c
> 
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index ea747f0..2239192 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -3,7 +3,7 @@ obj-y += ppc.o ppc_booke.o
>  # IBM pSeries (sPAPR)
>  obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o
>  obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
> -obj-$(CONFIG_PSERIES) += spapr_pci.o
> +obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_pci_vfio.o
>  # PowerPC 4xx boards
>  obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
>  obj-y += ppc4xx_pci.o
> diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
> new file mode 100644
> index 0000000..40f6673
> --- /dev/null
> +++ b/hw/ppc/spapr_pci_vfio.c
> @@ -0,0 +1,206 @@
> +/*
> + * QEMU sPAPR PCI host for VFIO
> + *
> + * Copyright (c) 2011 Alexey Kardashevskiy, IBM Corporation.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +#include <sys/types.h>
> +#include <dirent.h>
> +
> +#include "hw/hw.h"
> +#include "hw/ppc/spapr.h"
> +#include "hw/pci-host/spapr.h"
> +#include "hw/misc/vfio.h"
> +#include "hw/pci/pci_bus.h"
> +#include "trace.h"
> +#include "qemu/error-report.h"
> +
> +/* sPAPR VFIO */
> +static Property spapr_phb_vfio_properties[] = {
> +    DEFINE_PROP_INT32("iommu", sPAPRPHBVFIOState, iommugroupid, -1),
> +    DEFINE_PROP_UINT8("scan", sPAPRPHBVFIOState, scan, 1),
> +    DEFINE_PROP_UINT8("mf", sPAPRPHBVFIOState, enable_multifunction, 0),
> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBVFIOState, force_addr, 0),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void spapr_pci_vfio_scan(sPAPRPHBVFIOState *svphb, Error **errp)
> +{
> +    PCIHostState *phb = PCI_HOST_BRIDGE(svphb);
> +    char *iommupath;
> +    DIR *dirp;
> +    struct dirent *entry;
> +    Error *error = NULL;
> +
> +    if (!svphb->scan) {
> +        trace_spapr_pci("autoscan disabled for ", svphb->phb.dtbusname);
> +        return;
> +    }
> +
> +    iommupath = g_strdup_printf("/sys/kernel/iommu_groups/%d/devices/",
> +                                svphb->iommugroupid);
> +    if (!iommupath) {
> +        return;
> +    }
> +
> +    dirp = opendir(iommupath);
> +    if (!dirp) {
> +        error_report("spapr-vfio: vfio scan failed on opendir: %m");
> +        g_free(iommupath);
> +        return;
> +    }
> +
> +    while ((entry = readdir(dirp)) != NULL) {
> +        Error *err = NULL;
> +        char *tmp;
> +        FILE *deviceclassfile;
> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
> +        char addr[32];
> +        DeviceState *dev;
> +
> +        if (sscanf(entry->d_name, "%X:%X:%X.%x",
> +                   &domainid, &busid, &devid, &fnid) != 4) {
> +            continue;
> +        }
> +
> +        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
> +        trace_spapr_pci("Reading device class from ", tmp);
> +
> +        deviceclassfile = fopen(tmp, "r");
> +        if (deviceclassfile) {
> +            int ret = fscanf(deviceclassfile, "%x", &deviceclass);
> +            fclose(deviceclassfile);
> +            if (ret != 1) {
> +                continue;
> +            }
> +        }
> +        g_free(tmp);
> +
> +        if (!deviceclass) {
> +            continue;
> +        }
> +        if ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8)) {
> +            /* Skip bridges */
> +            continue;
> +        }
> +        trace_spapr_pci("Creating device from ", entry->d_name);
> +
> +        dev = qdev_create(&phb->bus->qbus, "vfio-pci");
> +        if (!dev) {
> +            trace_spapr_pci("failed to create vfio-pci", entry->d_name);
> +            continue;
> +        }
> +        object_property_parse(OBJECT(dev), entry->d_name, "host", &err);
> +        if (err != NULL) {
> +            object_unref(OBJECT(dev));
> +            continue;
> +        }
> +        if (svphb->force_addr) {
> +            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
> +            err = NULL;
> +            object_property_parse(OBJECT(dev), addr, "addr", &err);
> +            if (err != NULL) {
> +                object_unref(OBJECT(dev));
> +                continue;
> +            }
> +        }
> +        if (svphb->enable_multifunction) {
> +            qdev_prop_set_bit(dev, "multifunction", 1);
> +        }
> +        object_property_set_bool(OBJECT(dev), true, "realized", &error);
> +        if (error) {
> +            object_unref(OBJECT(dev));
> +            error_propagate(errp, error);
> +            break;
> +        }
> +    }
> +    closedir(dirp);
> +    g_free(iommupath);
> +}
> +
> +static void spapr_phb_vfio_finish_realize(sPAPRPHBState *sphb, Error **errp)
> +{
> +    sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);
> +    struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
> +    int ret;
> +    Error *error = NULL;
> +
> +    if (svphb->iommugroupid == -1) {
> +        error_setg(errp, "Wrong IOMMU group ID %d", svphb->iommugroupid);
> +        return;
> +    }
> +
> +    svphb->phb.tcet = spapr_vfio_new_table(DEVICE(sphb), svphb->phb.dma_liobn);
> +
> +    if (!svphb->phb.tcet) {
> +        error_setg(errp, "spapr-vfio: failed to create VFIO TCE table");
> +        return;
> +    }
> +
> +    address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet),
> +                       sphb->dtbusname);
> +
> +    ret = vfio_container_spapr_get_info(&svphb->phb.iommu_as,
> +                                        sphb->dma_liobn, svphb->iommugroupid,
> +                                        &info);
> +    if (ret) {
> +        error_setg_errno(errp, -ret,
> +                         "spapr-vfio: get info from container failed");
> +        return;
> +    }
> +
> +    svphb->phb.dma_window_start = info.dma32_window_start;
> +    svphb->phb.dma_window_size = info.dma32_window_size;
> +
> +    spapr_pci_vfio_scan(svphb, &error);
> +    if (error) {
> +        error_propagate(errp, error);
> +    }
> +}
> +
> +static void spapr_phb_vfio_reset(DeviceState *qdev)
> +{
> +    /* Do nothing */
> +}
> +
> +static void spapr_phb_vfio_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_CLASS(klass);
> +
> +    dc->props = spapr_phb_vfio_properties;
> +    dc->reset = spapr_phb_vfio_reset;
> +    spc->finish_realize = spapr_phb_vfio_finish_realize;
> +}
> +
> +static const TypeInfo spapr_phb_vfio_info = {
> +    .name          = TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE,
> +    .parent        = TYPE_SPAPR_PCI_HOST_BRIDGE,
> +    .instance_size = sizeof(sPAPRPHBVFIOState),
> +    .class_init    = spapr_phb_vfio_class_init,
> +    .class_size    = sizeof(sPAPRPHBClass),
> +};
> +
> +static void spapr_pci_vfio_register_types(void)
> +{
> +    type_register_static(&spapr_phb_vfio_info);
> +}
> +
> +type_init(spapr_pci_vfio_register_types)
> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> index 0f428a1..18acb67 100644
> --- a/include/hw/pci-host/spapr.h
> +++ b/include/hw/pci-host/spapr.h
> @@ -30,10 +30,14 @@
>  #define SPAPR_MSIX_MAX_DEVS 32
>  
>  #define TYPE_SPAPR_PCI_HOST_BRIDGE "spapr-pci-host-bridge"
> +#define TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE "spapr-pci-vfio-host-bridge"
>  
>  #define SPAPR_PCI_HOST_BRIDGE(obj) \
>      OBJECT_CHECK(sPAPRPHBState, (obj), TYPE_SPAPR_PCI_HOST_BRIDGE)
>  
> +#define SPAPR_PCI_VFIO_HOST_BRIDGE(obj) \
> +    OBJECT_CHECK(sPAPRPHBVFIOState, (obj), TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE)
> +
>  #define SPAPR_PCI_HOST_BRIDGE_CLASS(klass) \
>       OBJECT_CLASS_CHECK(sPAPRPHBClass, (klass), TYPE_SPAPR_PCI_HOST_BRIDGE)
>  #define SPAPR_PCI_HOST_BRIDGE_GET_CLASS(obj) \
> @@ -41,6 +45,7 @@
>  
>  typedef struct sPAPRPHBClass sPAPRPHBClass;
>  typedef struct sPAPRPHBState sPAPRPHBState;
> +typedef struct sPAPRPHBVFIOState sPAPRPHBVFIOState;
>  
>  struct sPAPRPHBClass {
>      PCIHostBridgeClass parent_class;
> @@ -78,6 +83,14 @@ struct sPAPRPHBState {
>      QLIST_ENTRY(sPAPRPHBState) list;
>  };
>  
> +struct sPAPRPHBVFIOState {
> +    sPAPRPHBState phb;
> +
> +    struct VFIOContainer *container;
> +    int32_t iommugroupid;
> +    uint8_t scan, enable_multifunction, force_addr;
> +};
> +
>  #define SPAPR_PCI_BASE_BUID          0x800000020000000ULL
>  
>  #define SPAPR_PCI_WINDOW_BASE        0x10000000000ULL
Alexey Kardashevskiy March 28, 2014, 6:01 a.m. UTC | #2
On 03/20/2014 06:57 AM, Alex Williamson wrote:
> On Wed, 2014-03-12 at 16:52 +1100, Alexey Kardashevskiy wrote:
>> The patch adds a spapr-pci-vfio-host-bridge device type
>> which is a PCI Host Bridge with VFIO support. The new device
>> inherits from the spapr-pci-host-bridge device and adds
>> the following properties:
>> 	iommu - IOMMU group ID which represents a Partitionable
>> 	 	Endpoint, QEMU/ppc64 uses a separate PHB for
>> 		an IOMMU group so the guest kernel has to have
>> 		PCI Domain support enabled.
>> 	forceaddr (optional, 0 by default) - forces QEMU to copy
>> 		device:function from the host address as
>> 		certain guest drivers expect devices to appear in
>> 		particular locations;
>> 	mf (optional, 0 by default) - forces multifunction bit for
>> 		the function #0 of a found device, only makes sense
>> 		for multifunction devices and only with the forceaddr
>> 		property set. It would not be required if there
>> 		was a way to know in advance whether a device is
>> 		multifunctional or not.
>> 	scan (optional, 1 by default) - if non-zero, the new PHB walks
>> 		through all non-bridge devices in the group and tries
>> 		adding them to the PHB; if zero, all devices in the group
>> 		have to be configured manually via the QEMU command line.
>>
>> Examples of use:
>> 1) Scan and add all devices from IOMMU group with ID=1 to QEMU's PHB #6:
>> 	-device spapr-pci-vfio-host-bridge,id=DEVICENAME,iommu=1,index=6
>>
>> 2) Configure and Add 3 functions of a multifunctional device to QEMU:
>> (the NEC PCI USB card is used as an example here):
>> 	-device spapr-pci-vfio-host-bridge,id=USB,iommu=4,scan=0,index=7 \
>> 	-device vfio-pci,host=4:0:1.0,addr=1.0,bus=USB,multifunction=true
>> 	-device vfio-pci,host=4:0:1.1,addr=1.1,bus=USB
>> 	-device vfio-pci,host=4:0:1.2,addr=1.2,bus=USB
> 
> I won't pretend to predict the reaction of QEMU device architects to
> this, but it seems like the assembly we expect from config files or
> outside utilities, ex. libvirt.  I don't doubt this makes qemu
> commandline usage more palatable, but it seems contrary to some of the
> other things we do in QEMU with devices.  If this is something we need,
> why is it specific to spapr?  IOMMU group can contain multiple devices
> on any platform.  On x86 we could do something similar with a p2p
> bridge, switch, or root port.


At least at the moment devices are assigned to groups statically on SPAPR,
they cannot be moved between groups at all, so it seems like a reasonable
assumption that the user will not mind putting them all to the same QEMU
instance.

I should probably disable "scan" by default though, that would make more
sense for libvirt.


> BTW, the code skips bridges, but doesn't that mean you'll have a hard
> time with forceaddr as you potentially try to overlay devfn from
> multiple buses onto a single bus?
> It also makes the value of this a bit
> more questionable since it seems to fall apart so easily.  Thanks,

These "forceaddr" and "mf" are rather debug options - at the very beginning
I used to have strong impression that USB NEC PCI device (2xOHCI and
1xEHCI) does not work properly if it is not multifunctional but I was
wrong, just checked. So I'll just remove them as they do not help even me
anymore and that's it. Thanks!


> 
> Alex
> 
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>> Changes:
>> v5:
>> * added handling of possible failure of spapr_vfio_new_table()
>>
>> v4:
>> * moved IOMMU changes to separate patches
>> * moved spapr-pci-vfio-host-bridge to new file
>> ---
>>  hw/ppc/Makefile.objs        |   2 +-
>>  hw/ppc/spapr_pci_vfio.c     | 206 ++++++++++++++++++++++++++++++++++++++++++++
>>  include/hw/pci-host/spapr.h |  13 +++
>>  3 files changed, 220 insertions(+), 1 deletion(-)
>>  create mode 100644 hw/ppc/spapr_pci_vfio.c
>>
>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
>> index ea747f0..2239192 100644
>> --- a/hw/ppc/Makefile.objs
>> +++ b/hw/ppc/Makefile.objs
>> @@ -3,7 +3,7 @@ obj-y += ppc.o ppc_booke.o
>>  # IBM pSeries (sPAPR)
>>  obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o
>>  obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
>> -obj-$(CONFIG_PSERIES) += spapr_pci.o
>> +obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_pci_vfio.o
>>  # PowerPC 4xx boards
>>  obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
>>  obj-y += ppc4xx_pci.o
>> diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
>> new file mode 100644
>> index 0000000..40f6673
>> --- /dev/null
>> +++ b/hw/ppc/spapr_pci_vfio.c
>> @@ -0,0 +1,206 @@
>> +/*
>> + * QEMU sPAPR PCI host for VFIO
>> + *
>> + * Copyright (c) 2011 Alexey Kardashevskiy, IBM Corporation.
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a copy
>> + * of this software and associated documentation files (the "Software"), to deal
>> + * in the Software without restriction, including without limitation the rights
>> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
>> + * copies of the Software, and to permit persons to whom the Software is
>> + * furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
>> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>> + * THE SOFTWARE.
>> + */
>> +#include <sys/types.h>
>> +#include <dirent.h>
>> +
>> +#include "hw/hw.h"
>> +#include "hw/ppc/spapr.h"
>> +#include "hw/pci-host/spapr.h"
>> +#include "hw/misc/vfio.h"
>> +#include "hw/pci/pci_bus.h"
>> +#include "trace.h"
>> +#include "qemu/error-report.h"
>> +
>> +/* sPAPR VFIO */
>> +static Property spapr_phb_vfio_properties[] = {
>> +    DEFINE_PROP_INT32("iommu", sPAPRPHBVFIOState, iommugroupid, -1),
>> +    DEFINE_PROP_UINT8("scan", sPAPRPHBVFIOState, scan, 1),
>> +    DEFINE_PROP_UINT8("mf", sPAPRPHBVFIOState, enable_multifunction, 0),
>> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBVFIOState, force_addr, 0),
>> +    DEFINE_PROP_END_OF_LIST(),
>> +};
>> +
>> +static void spapr_pci_vfio_scan(sPAPRPHBVFIOState *svphb, Error **errp)
>> +{
>> +    PCIHostState *phb = PCI_HOST_BRIDGE(svphb);
>> +    char *iommupath;
>> +    DIR *dirp;
>> +    struct dirent *entry;
>> +    Error *error = NULL;
>> +
>> +    if (!svphb->scan) {
>> +        trace_spapr_pci("autoscan disabled for ", svphb->phb.dtbusname);
>> +        return;
>> +    }
>> +
>> +    iommupath = g_strdup_printf("/sys/kernel/iommu_groups/%d/devices/",
>> +                                svphb->iommugroupid);
>> +    if (!iommupath) {
>> +        return;
>> +    }
>> +
>> +    dirp = opendir(iommupath);
>> +    if (!dirp) {
>> +        error_report("spapr-vfio: vfio scan failed on opendir: %m");
>> +        g_free(iommupath);
>> +        return;
>> +    }
>> +
>> +    while ((entry = readdir(dirp)) != NULL) {
>> +        Error *err = NULL;
>> +        char *tmp;
>> +        FILE *deviceclassfile;
>> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
>> +        char addr[32];
>> +        DeviceState *dev;
>> +
>> +        if (sscanf(entry->d_name, "%X:%X:%X.%x",
>> +                   &domainid, &busid, &devid, &fnid) != 4) {
>> +            continue;
>> +        }
>> +
>> +        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
>> +        trace_spapr_pci("Reading device class from ", tmp);
>> +
>> +        deviceclassfile = fopen(tmp, "r");
>> +        if (deviceclassfile) {
>> +            int ret = fscanf(deviceclassfile, "%x", &deviceclass);
>> +            fclose(deviceclassfile);
>> +            if (ret != 1) {
>> +                continue;
>> +            }
>> +        }
>> +        g_free(tmp);
>> +
>> +        if (!deviceclass) {
>> +            continue;
>> +        }
>> +        if ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8)) {
>> +            /* Skip bridges */
>> +            continue;
>> +        }
>> +        trace_spapr_pci("Creating device from ", entry->d_name);
>> +
>> +        dev = qdev_create(&phb->bus->qbus, "vfio-pci");
>> +        if (!dev) {
>> +            trace_spapr_pci("failed to create vfio-pci", entry->d_name);
>> +            continue;
>> +        }
>> +        object_property_parse(OBJECT(dev), entry->d_name, "host", &err);
>> +        if (err != NULL) {
>> +            object_unref(OBJECT(dev));
>> +            continue;
>> +        }
>> +        if (svphb->force_addr) {
>> +            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
>> +            err = NULL;
>> +            object_property_parse(OBJECT(dev), addr, "addr", &err);
>> +            if (err != NULL) {
>> +                object_unref(OBJECT(dev));
>> +                continue;
>> +            }
>> +        }
>> +        if (svphb->enable_multifunction) {
>> +            qdev_prop_set_bit(dev, "multifunction", 1);
>> +        }
>> +        object_property_set_bool(OBJECT(dev), true, "realized", &error);
>> +        if (error) {
>> +            object_unref(OBJECT(dev));
>> +            error_propagate(errp, error);
>> +            break;
>> +        }
>> +    }
>> +    closedir(dirp);
>> +    g_free(iommupath);
>> +}
>> +
>> +static void spapr_phb_vfio_finish_realize(sPAPRPHBState *sphb, Error **errp)
>> +{
>> +    sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);
>> +    struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
>> +    int ret;
>> +    Error *error = NULL;
>> +
>> +    if (svphb->iommugroupid == -1) {
>> +        error_setg(errp, "Wrong IOMMU group ID %d", svphb->iommugroupid);
>> +        return;
>> +    }
>> +
>> +    svphb->phb.tcet = spapr_vfio_new_table(DEVICE(sphb), svphb->phb.dma_liobn);
>> +
>> +    if (!svphb->phb.tcet) {
>> +        error_setg(errp, "spapr-vfio: failed to create VFIO TCE table");
>> +        return;
>> +    }
>> +
>> +    address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet),
>> +                       sphb->dtbusname);
>> +
>> +    ret = vfio_container_spapr_get_info(&svphb->phb.iommu_as,
>> +                                        sphb->dma_liobn, svphb->iommugroupid,
>> +                                        &info);
>> +    if (ret) {
>> +        error_setg_errno(errp, -ret,
>> +                         "spapr-vfio: get info from container failed");
>> +        return;
>> +    }
>> +
>> +    svphb->phb.dma_window_start = info.dma32_window_start;
>> +    svphb->phb.dma_window_size = info.dma32_window_size;
>> +
>> +    spapr_pci_vfio_scan(svphb, &error);
>> +    if (error) {
>> +        error_propagate(errp, error);
>> +    }
>> +}
>> +
>> +static void spapr_phb_vfio_reset(DeviceState *qdev)
>> +{
>> +    /* Do nothing */
>> +}
>> +
>> +static void spapr_phb_vfio_class_init(ObjectClass *klass, void *data)
>> +{
>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>> +    sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_CLASS(klass);
>> +
>> +    dc->props = spapr_phb_vfio_properties;
>> +    dc->reset = spapr_phb_vfio_reset;
>> +    spc->finish_realize = spapr_phb_vfio_finish_realize;
>> +}
>> +
>> +static const TypeInfo spapr_phb_vfio_info = {
>> +    .name          = TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE,
>> +    .parent        = TYPE_SPAPR_PCI_HOST_BRIDGE,
>> +    .instance_size = sizeof(sPAPRPHBVFIOState),
>> +    .class_init    = spapr_phb_vfio_class_init,
>> +    .class_size    = sizeof(sPAPRPHBClass),
>> +};
>> +
>> +static void spapr_pci_vfio_register_types(void)
>> +{
>> +    type_register_static(&spapr_phb_vfio_info);
>> +}
>> +
>> +type_init(spapr_pci_vfio_register_types)
>> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
>> index 0f428a1..18acb67 100644
>> --- a/include/hw/pci-host/spapr.h
>> +++ b/include/hw/pci-host/spapr.h
>> @@ -30,10 +30,14 @@
>>  #define SPAPR_MSIX_MAX_DEVS 32
>>  
>>  #define TYPE_SPAPR_PCI_HOST_BRIDGE "spapr-pci-host-bridge"
>> +#define TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE "spapr-pci-vfio-host-bridge"
>>  
>>  #define SPAPR_PCI_HOST_BRIDGE(obj) \
>>      OBJECT_CHECK(sPAPRPHBState, (obj), TYPE_SPAPR_PCI_HOST_BRIDGE)
>>  
>> +#define SPAPR_PCI_VFIO_HOST_BRIDGE(obj) \
>> +    OBJECT_CHECK(sPAPRPHBVFIOState, (obj), TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE)
>> +
>>  #define SPAPR_PCI_HOST_BRIDGE_CLASS(klass) \
>>       OBJECT_CLASS_CHECK(sPAPRPHBClass, (klass), TYPE_SPAPR_PCI_HOST_BRIDGE)
>>  #define SPAPR_PCI_HOST_BRIDGE_GET_CLASS(obj) \
>> @@ -41,6 +45,7 @@
>>  
>>  typedef struct sPAPRPHBClass sPAPRPHBClass;
>>  typedef struct sPAPRPHBState sPAPRPHBState;
>> +typedef struct sPAPRPHBVFIOState sPAPRPHBVFIOState;
>>  
>>  struct sPAPRPHBClass {
>>      PCIHostBridgeClass parent_class;
>> @@ -78,6 +83,14 @@ struct sPAPRPHBState {
>>      QLIST_ENTRY(sPAPRPHBState) list;
>>  };
>>  
>> +struct sPAPRPHBVFIOState {
>> +    sPAPRPHBState phb;
>> +
>> +    struct VFIOContainer *container;
>> +    int32_t iommugroupid;
>> +    uint8_t scan, enable_multifunction, force_addr;
>> +};
>> +
>>  #define SPAPR_PCI_BASE_BUID          0x800000020000000ULL
>>  
>>  #define SPAPR_PCI_WINDOW_BASE        0x10000000000ULL
> 
> 
>
Alex Williamson March 31, 2014, 8:09 p.m. UTC | #3
On Fri, 2014-03-28 at 17:01 +1100, Alexey Kardashevskiy wrote:
> On 03/20/2014 06:57 AM, Alex Williamson wrote:
> > On Wed, 2014-03-12 at 16:52 +1100, Alexey Kardashevskiy wrote:
> >> The patch adds a spapr-pci-vfio-host-bridge device type
> >> which is a PCI Host Bridge with VFIO support. The new device
> >> inherits from the spapr-pci-host-bridge device and adds
> >> the following properties:
> >> 	iommu - IOMMU group ID which represents a Partitionable
> >> 	 	Endpoint, QEMU/ppc64 uses a separate PHB for
> >> 		an IOMMU group so the guest kernel has to have
> >> 		PCI Domain support enabled.
> >> 	forceaddr (optional, 0 by default) - forces QEMU to copy
> >> 		device:function from the host address as
> >> 		certain guest drivers expect devices to appear in
> >> 		particular locations;
> >> 	mf (optional, 0 by default) - forces multifunction bit for
> >> 		the function #0 of a found device, only makes sense
> >> 		for multifunction devices and only with the forceaddr
> >> 		property set. It would not be required if there
> >> 		was a way to know in advance whether a device is
> >> 		multifunctional or not.
> >> 	scan (optional, 1 by default) - if non-zero, the new PHB walks
> >> 		through all non-bridge devices in the group and tries
> >> 		adding them to the PHB; if zero, all devices in the group
> >> 		have to be configured manually via the QEMU command line.
> >>
> >> Examples of use:
> >> 1) Scan and add all devices from IOMMU group with ID=1 to QEMU's PHB #6:
> >> 	-device spapr-pci-vfio-host-bridge,id=DEVICENAME,iommu=1,index=6
> >>
> >> 2) Configure and Add 3 functions of a multifunctional device to QEMU:
> >> (the NEC PCI USB card is used as an example here):
> >> 	-device spapr-pci-vfio-host-bridge,id=USB,iommu=4,scan=0,index=7 \
> >> 	-device vfio-pci,host=4:0:1.0,addr=1.0,bus=USB,multifunction=true
> >> 	-device vfio-pci,host=4:0:1.1,addr=1.1,bus=USB
> >> 	-device vfio-pci,host=4:0:1.2,addr=1.2,bus=USB
> > 
> > I won't pretend to predict the reaction of QEMU device architects to
> > this, but it seems like the assembly we expect from config files or
> > outside utilities, ex. libvirt.  I don't doubt this makes qemu
> > commandline usage more palatable, but it seems contrary to some of the
> > other things we do in QEMU with devices.  If this is something we need,
> > why is it specific to spapr?  IOMMU group can contain multiple devices
> > on any platform.  On x86 we could do something similar with a p2p
> > bridge, switch, or root port.
> 
> 
> At least at the moment devices are assigned to groups statically on SPAPR,
> they cannot be moved between groups at all, so it seems like a reasonable
> assumption that the user will not mind putting them all to the same QEMU
> instance.
> 
> I should probably disable "scan" by default though, that would make more
> sense for libvirt.

That doesn't really address the concern.  An x86 chipset puts specific
devices at a specific address, yet this is not something that we hard
code into QEMU.  We have config files that define this and tools like
libvirt know where to put things.  Why is SPAPR special enough to have
QEMU auto-add an entire sub-hierarchy?  If this is a necessary feature,
why not make it generic and give x86 the capability as well?

> > BTW, the code skips bridges, but doesn't that mean you'll have a hard
> > time with forceaddr as you potentially try to overlay devfn from
> > multiple buses onto a single bus?
> > It also makes the value of this a bit
> > more questionable since it seems to fall apart so easily.  Thanks,
> 
> These "forceaddr" and "mf" are rather debug options - at the very beginning
> I used to have strong impression that USB NEC PCI device (2xOHCI and
> 1xEHCI) does not work properly if it is not multifunctional but I was
> wrong, just checked. So I'll just remove them as they do not help even me
> anymore and that's it. Thanks!

I'm still questioning the value of this code, it just seems to be a
convenience feature for someone running QEMU from the commandline, which
has never really been a concern for the rest of QEMU.  Suggest taking
this patch out of he series since it's not critical path.  Thanks,

Alex

> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >> Changes:
> >> v5:
> >> * added handling of possible failure of spapr_vfio_new_table()
> >>
> >> v4:
> >> * moved IOMMU changes to separate patches
> >> * moved spapr-pci-vfio-host-bridge to new file
> >> ---
> >>  hw/ppc/Makefile.objs        |   2 +-
> >>  hw/ppc/spapr_pci_vfio.c     | 206 ++++++++++++++++++++++++++++++++++++++++++++
> >>  include/hw/pci-host/spapr.h |  13 +++
> >>  3 files changed, 220 insertions(+), 1 deletion(-)
> >>  create mode 100644 hw/ppc/spapr_pci_vfio.c
> >>
> >> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> >> index ea747f0..2239192 100644
> >> --- a/hw/ppc/Makefile.objs
> >> +++ b/hw/ppc/Makefile.objs
> >> @@ -3,7 +3,7 @@ obj-y += ppc.o ppc_booke.o
> >>  # IBM pSeries (sPAPR)
> >>  obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o
> >>  obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
> >> -obj-$(CONFIG_PSERIES) += spapr_pci.o
> >> +obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_pci_vfio.o
> >>  # PowerPC 4xx boards
> >>  obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
> >>  obj-y += ppc4xx_pci.o
> >> diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
> >> new file mode 100644
> >> index 0000000..40f6673
> >> --- /dev/null
> >> +++ b/hw/ppc/spapr_pci_vfio.c
> >> @@ -0,0 +1,206 @@
> >> +/*
> >> + * QEMU sPAPR PCI host for VFIO
> >> + *
> >> + * Copyright (c) 2011 Alexey Kardashevskiy, IBM Corporation.
> >> + *
> >> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> >> + * of this software and associated documentation files (the "Software"), to deal
> >> + * in the Software without restriction, including without limitation the rights
> >> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> >> + * copies of the Software, and to permit persons to whom the Software is
> >> + * furnished to do so, subject to the following conditions:
> >> + *
> >> + * The above copyright notice and this permission notice shall be included in
> >> + * all copies or substantial portions of the Software.
> >> + *
> >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> >> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> >> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> >> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> >> + * THE SOFTWARE.
> >> + */
> >> +#include <sys/types.h>
> >> +#include <dirent.h>
> >> +
> >> +#include "hw/hw.h"
> >> +#include "hw/ppc/spapr.h"
> >> +#include "hw/pci-host/spapr.h"
> >> +#include "hw/misc/vfio.h"
> >> +#include "hw/pci/pci_bus.h"
> >> +#include "trace.h"
> >> +#include "qemu/error-report.h"
> >> +
> >> +/* sPAPR VFIO */
> >> +static Property spapr_phb_vfio_properties[] = {
> >> +    DEFINE_PROP_INT32("iommu", sPAPRPHBVFIOState, iommugroupid, -1),
> >> +    DEFINE_PROP_UINT8("scan", sPAPRPHBVFIOState, scan, 1),
> >> +    DEFINE_PROP_UINT8("mf", sPAPRPHBVFIOState, enable_multifunction, 0),
> >> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBVFIOState, force_addr, 0),
> >> +    DEFINE_PROP_END_OF_LIST(),
> >> +};
> >> +
> >> +static void spapr_pci_vfio_scan(sPAPRPHBVFIOState *svphb, Error **errp)
> >> +{
> >> +    PCIHostState *phb = PCI_HOST_BRIDGE(svphb);
> >> +    char *iommupath;
> >> +    DIR *dirp;
> >> +    struct dirent *entry;
> >> +    Error *error = NULL;
> >> +
> >> +    if (!svphb->scan) {
> >> +        trace_spapr_pci("autoscan disabled for ", svphb->phb.dtbusname);
> >> +        return;
> >> +    }
> >> +
> >> +    iommupath = g_strdup_printf("/sys/kernel/iommu_groups/%d/devices/",
> >> +                                svphb->iommugroupid);
> >> +    if (!iommupath) {
> >> +        return;
> >> +    }
> >> +
> >> +    dirp = opendir(iommupath);
> >> +    if (!dirp) {
> >> +        error_report("spapr-vfio: vfio scan failed on opendir: %m");
> >> +        g_free(iommupath);
> >> +        return;
> >> +    }
> >> +
> >> +    while ((entry = readdir(dirp)) != NULL) {
> >> +        Error *err = NULL;
> >> +        char *tmp;
> >> +        FILE *deviceclassfile;
> >> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
> >> +        char addr[32];
> >> +        DeviceState *dev;
> >> +
> >> +        if (sscanf(entry->d_name, "%X:%X:%X.%x",
> >> +                   &domainid, &busid, &devid, &fnid) != 4) {
> >> +            continue;
> >> +        }
> >> +
> >> +        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
> >> +        trace_spapr_pci("Reading device class from ", tmp);
> >> +
> >> +        deviceclassfile = fopen(tmp, "r");
> >> +        if (deviceclassfile) {
> >> +            int ret = fscanf(deviceclassfile, "%x", &deviceclass);
> >> +            fclose(deviceclassfile);
> >> +            if (ret != 1) {
> >> +                continue;
> >> +            }
> >> +        }
> >> +        g_free(tmp);
> >> +
> >> +        if (!deviceclass) {
> >> +            continue;
> >> +        }
> >> +        if ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8)) {
> >> +            /* Skip bridges */
> >> +            continue;
> >> +        }
> >> +        trace_spapr_pci("Creating device from ", entry->d_name);
> >> +
> >> +        dev = qdev_create(&phb->bus->qbus, "vfio-pci");
> >> +        if (!dev) {
> >> +            trace_spapr_pci("failed to create vfio-pci", entry->d_name);
> >> +            continue;
> >> +        }
> >> +        object_property_parse(OBJECT(dev), entry->d_name, "host", &err);
> >> +        if (err != NULL) {
> >> +            object_unref(OBJECT(dev));
> >> +            continue;
> >> +        }
> >> +        if (svphb->force_addr) {
> >> +            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
> >> +            err = NULL;
> >> +            object_property_parse(OBJECT(dev), addr, "addr", &err);
> >> +            if (err != NULL) {
> >> +                object_unref(OBJECT(dev));
> >> +                continue;
> >> +            }
> >> +        }
> >> +        if (svphb->enable_multifunction) {
> >> +            qdev_prop_set_bit(dev, "multifunction", 1);
> >> +        }
> >> +        object_property_set_bool(OBJECT(dev), true, "realized", &error);
> >> +        if (error) {
> >> +            object_unref(OBJECT(dev));
> >> +            error_propagate(errp, error);
> >> +            break;
> >> +        }
> >> +    }
> >> +    closedir(dirp);
> >> +    g_free(iommupath);
> >> +}
> >> +
> >> +static void spapr_phb_vfio_finish_realize(sPAPRPHBState *sphb, Error **errp)
> >> +{
> >> +    sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);
> >> +    struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
> >> +    int ret;
> >> +    Error *error = NULL;
> >> +
> >> +    if (svphb->iommugroupid == -1) {
> >> +        error_setg(errp, "Wrong IOMMU group ID %d", svphb->iommugroupid);
> >> +        return;
> >> +    }
> >> +
> >> +    svphb->phb.tcet = spapr_vfio_new_table(DEVICE(sphb), svphb->phb.dma_liobn);
> >> +
> >> +    if (!svphb->phb.tcet) {
> >> +        error_setg(errp, "spapr-vfio: failed to create VFIO TCE table");
> >> +        return;
> >> +    }
> >> +
> >> +    address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet),
> >> +                       sphb->dtbusname);
> >> +
> >> +    ret = vfio_container_spapr_get_info(&svphb->phb.iommu_as,
> >> +                                        sphb->dma_liobn, svphb->iommugroupid,
> >> +                                        &info);
> >> +    if (ret) {
> >> +        error_setg_errno(errp, -ret,
> >> +                         "spapr-vfio: get info from container failed");
> >> +        return;
> >> +    }
> >> +
> >> +    svphb->phb.dma_window_start = info.dma32_window_start;
> >> +    svphb->phb.dma_window_size = info.dma32_window_size;
> >> +
> >> +    spapr_pci_vfio_scan(svphb, &error);
> >> +    if (error) {
> >> +        error_propagate(errp, error);
> >> +    }
> >> +}
> >> +
> >> +static void spapr_phb_vfio_reset(DeviceState *qdev)
> >> +{
> >> +    /* Do nothing */
> >> +}
> >> +
> >> +static void spapr_phb_vfio_class_init(ObjectClass *klass, void *data)
> >> +{
> >> +    DeviceClass *dc = DEVICE_CLASS(klass);
> >> +    sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_CLASS(klass);
> >> +
> >> +    dc->props = spapr_phb_vfio_properties;
> >> +    dc->reset = spapr_phb_vfio_reset;
> >> +    spc->finish_realize = spapr_phb_vfio_finish_realize;
> >> +}
> >> +
> >> +static const TypeInfo spapr_phb_vfio_info = {
> >> +    .name          = TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE,
> >> +    .parent        = TYPE_SPAPR_PCI_HOST_BRIDGE,
> >> +    .instance_size = sizeof(sPAPRPHBVFIOState),
> >> +    .class_init    = spapr_phb_vfio_class_init,
> >> +    .class_size    = sizeof(sPAPRPHBClass),
> >> +};
> >> +
> >> +static void spapr_pci_vfio_register_types(void)
> >> +{
> >> +    type_register_static(&spapr_phb_vfio_info);
> >> +}
> >> +
> >> +type_init(spapr_pci_vfio_register_types)
> >> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> >> index 0f428a1..18acb67 100644
> >> --- a/include/hw/pci-host/spapr.h
> >> +++ b/include/hw/pci-host/spapr.h
> >> @@ -30,10 +30,14 @@
> >>  #define SPAPR_MSIX_MAX_DEVS 32
> >>  
> >>  #define TYPE_SPAPR_PCI_HOST_BRIDGE "spapr-pci-host-bridge"
> >> +#define TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE "spapr-pci-vfio-host-bridge"
> >>  
> >>  #define SPAPR_PCI_HOST_BRIDGE(obj) \
> >>      OBJECT_CHECK(sPAPRPHBState, (obj), TYPE_SPAPR_PCI_HOST_BRIDGE)
> >>  
> >> +#define SPAPR_PCI_VFIO_HOST_BRIDGE(obj) \
> >> +    OBJECT_CHECK(sPAPRPHBVFIOState, (obj), TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE)
> >> +
> >>  #define SPAPR_PCI_HOST_BRIDGE_CLASS(klass) \
> >>       OBJECT_CLASS_CHECK(sPAPRPHBClass, (klass), TYPE_SPAPR_PCI_HOST_BRIDGE)
> >>  #define SPAPR_PCI_HOST_BRIDGE_GET_CLASS(obj) \
> >> @@ -41,6 +45,7 @@
> >>  
> >>  typedef struct sPAPRPHBClass sPAPRPHBClass;
> >>  typedef struct sPAPRPHBState sPAPRPHBState;
> >> +typedef struct sPAPRPHBVFIOState sPAPRPHBVFIOState;
> >>  
> >>  struct sPAPRPHBClass {
> >>      PCIHostBridgeClass parent_class;
> >> @@ -78,6 +83,14 @@ struct sPAPRPHBState {
> >>      QLIST_ENTRY(sPAPRPHBState) list;
> >>  };
> >>  
> >> +struct sPAPRPHBVFIOState {
> >> +    sPAPRPHBState phb;
> >> +
> >> +    struct VFIOContainer *container;
> >> +    int32_t iommugroupid;
> >> +    uint8_t scan, enable_multifunction, force_addr;
> >> +};
> >> +
> >>  #define SPAPR_PCI_BASE_BUID          0x800000020000000ULL
> >>  
> >>  #define SPAPR_PCI_WINDOW_BASE        0x10000000000ULL
> > 
> > 
> > 
> 
>
Alexey Kardashevskiy April 1, 2014, 6:25 a.m. UTC | #4
On 04/01/2014 07:09 AM, Alex Williamson wrote:
> On Fri, 2014-03-28 at 17:01 +1100, Alexey Kardashevskiy wrote:
>> On 03/20/2014 06:57 AM, Alex Williamson wrote:
>>> On Wed, 2014-03-12 at 16:52 +1100, Alexey Kardashevskiy wrote:
>>>> The patch adds a spapr-pci-vfio-host-bridge device type
>>>> which is a PCI Host Bridge with VFIO support. The new device
>>>> inherits from the spapr-pci-host-bridge device and adds
>>>> the following properties:
>>>> 	iommu - IOMMU group ID which represents a Partitionable
>>>> 	 	Endpoint, QEMU/ppc64 uses a separate PHB for
>>>> 		an IOMMU group so the guest kernel has to have
>>>> 		PCI Domain support enabled.
>>>> 	forceaddr (optional, 0 by default) - forces QEMU to copy
>>>> 		device:function from the host address as
>>>> 		certain guest drivers expect devices to appear in
>>>> 		particular locations;
>>>> 	mf (optional, 0 by default) - forces multifunction bit for
>>>> 		the function #0 of a found device, only makes sense
>>>> 		for multifunction devices and only with the forceaddr
>>>> 		property set. It would not be required if there
>>>> 		was a way to know in advance whether a device is
>>>> 		multifunctional or not.
>>>> 	scan (optional, 1 by default) - if non-zero, the new PHB walks
>>>> 		through all non-bridge devices in the group and tries
>>>> 		adding them to the PHB; if zero, all devices in the group
>>>> 		have to be configured manually via the QEMU command line.
>>>>
>>>> Examples of use:
>>>> 1) Scan and add all devices from IOMMU group with ID=1 to QEMU's PHB #6:
>>>> 	-device spapr-pci-vfio-host-bridge,id=DEVICENAME,iommu=1,index=6
>>>>
>>>> 2) Configure and Add 3 functions of a multifunctional device to QEMU:
>>>> (the NEC PCI USB card is used as an example here):
>>>> 	-device spapr-pci-vfio-host-bridge,id=USB,iommu=4,scan=0,index=7 \
>>>> 	-device vfio-pci,host=4:0:1.0,addr=1.0,bus=USB,multifunction=true
>>>> 	-device vfio-pci,host=4:0:1.1,addr=1.1,bus=USB
>>>> 	-device vfio-pci,host=4:0:1.2,addr=1.2,bus=USB
>>>
>>> I won't pretend to predict the reaction of QEMU device architects to
>>> this, but it seems like the assembly we expect from config files or
>>> outside utilities, ex. libvirt.  I don't doubt this makes qemu
>>> commandline usage more palatable, but it seems contrary to some of the
>>> other things we do in QEMU with devices.  If this is something we need,
>>> why is it specific to spapr?  IOMMU group can contain multiple devices
>>> on any platform.  On x86 we could do something similar with a p2p
>>> bridge, switch, or root port.
>>
>>
>> At least at the moment devices are assigned to groups statically on SPAPR,
>> they cannot be moved between groups at all, so it seems like a reasonable
>> assumption that the user will not mind putting them all to the same QEMU
>> instance.
>>
>> I should probably disable "scan" by default though, that would make more
>> sense for libvirt.
> 
> That doesn't really address the concern.  An x86 chipset puts specific
> devices at a specific address, yet this is not something that we hard
> code into QEMU.  We have config files that define this and tools like
> libvirt know where to put things.  Why is SPAPR special enough to have
> QEMU auto-add an entire sub-hierarchy?  If this is a necessary feature,
> why not make it generic and give x86 the capability as well?

Ok. You are right. spapr_pci_vfio_scan() has to go :)



>>> BTW, the code skips bridges, but doesn't that mean you'll have a hard
>>> time with forceaddr as you potentially try to overlay devfn from
>>> multiple buses onto a single bus?
>>> It also makes the value of this a bit
>>> more questionable since it seems to fall apart so easily.  Thanks,
>>
>> These "forceaddr" and "mf" are rather debug options - at the very beginning
>> I used to have strong impression that USB NEC PCI device (2xOHCI and
>> 1xEHCI) does not work properly if it is not multifunctional but I was
>> wrong, just checked. So I'll just remove them as they do not help even me
>> anymore and that's it. Thanks!
> 
> I'm still questioning the value of this code, it just seems to be a
> convenience feature for someone running QEMU from the commandline, which
> has never really been a concern for the rest of QEMU.  Suggest taking
> this patch out of he series since it's not critical path.  Thanks,


I can move  spapr_pci_vfio_scan() to a separate patch or get rid of it at
all  but the rest is still needed to setup the correct IOMMU device. Is
that what you really meant?


> 
> Alex
> 
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>> Changes:
>>>> v5:
>>>> * added handling of possible failure of spapr_vfio_new_table()
>>>>
>>>> v4:
>>>> * moved IOMMU changes to separate patches
>>>> * moved spapr-pci-vfio-host-bridge to new file
>>>> ---
>>>>  hw/ppc/Makefile.objs        |   2 +-
>>>>  hw/ppc/spapr_pci_vfio.c     | 206 ++++++++++++++++++++++++++++++++++++++++++++
>>>>  include/hw/pci-host/spapr.h |  13 +++
>>>>  3 files changed, 220 insertions(+), 1 deletion(-)
>>>>  create mode 100644 hw/ppc/spapr_pci_vfio.c
>>>>
>>>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
>>>> index ea747f0..2239192 100644
>>>> --- a/hw/ppc/Makefile.objs
>>>> +++ b/hw/ppc/Makefile.objs
>>>> @@ -3,7 +3,7 @@ obj-y += ppc.o ppc_booke.o
>>>>  # IBM pSeries (sPAPR)
>>>>  obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o
>>>>  obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
>>>> -obj-$(CONFIG_PSERIES) += spapr_pci.o
>>>> +obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_pci_vfio.o
>>>>  # PowerPC 4xx boards
>>>>  obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
>>>>  obj-y += ppc4xx_pci.o
>>>> diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
>>>> new file mode 100644
>>>> index 0000000..40f6673
>>>> --- /dev/null
>>>> +++ b/hw/ppc/spapr_pci_vfio.c
>>>> @@ -0,0 +1,206 @@
>>>> +/*
>>>> + * QEMU sPAPR PCI host for VFIO
>>>> + *
>>>> + * Copyright (c) 2011 Alexey Kardashevskiy, IBM Corporation.
>>>> + *
>>>> + * Permission is hereby granted, free of charge, to any person obtaining a copy
>>>> + * of this software and associated documentation files (the "Software"), to deal
>>>> + * in the Software without restriction, including without limitation the rights
>>>> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
>>>> + * copies of the Software, and to permit persons to whom the Software is
>>>> + * furnished to do so, subject to the following conditions:
>>>> + *
>>>> + * The above copyright notice and this permission notice shall be included in
>>>> + * all copies or substantial portions of the Software.
>>>> + *
>>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
>>>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>>>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
>>>> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>>>> + * THE SOFTWARE.
>>>> + */
>>>> +#include <sys/types.h>
>>>> +#include <dirent.h>
>>>> +
>>>> +#include "hw/hw.h"
>>>> +#include "hw/ppc/spapr.h"
>>>> +#include "hw/pci-host/spapr.h"
>>>> +#include "hw/misc/vfio.h"
>>>> +#include "hw/pci/pci_bus.h"
>>>> +#include "trace.h"
>>>> +#include "qemu/error-report.h"
>>>> +
>>>> +/* sPAPR VFIO */
>>>> +static Property spapr_phb_vfio_properties[] = {
>>>> +    DEFINE_PROP_INT32("iommu", sPAPRPHBVFIOState, iommugroupid, -1),
>>>> +    DEFINE_PROP_UINT8("scan", sPAPRPHBVFIOState, scan, 1),
>>>> +    DEFINE_PROP_UINT8("mf", sPAPRPHBVFIOState, enable_multifunction, 0),
>>>> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBVFIOState, force_addr, 0),
>>>> +    DEFINE_PROP_END_OF_LIST(),
>>>> +};
>>>> +
>>>> +static void spapr_pci_vfio_scan(sPAPRPHBVFIOState *svphb, Error **errp)
>>>> +{
>>>> +    PCIHostState *phb = PCI_HOST_BRIDGE(svphb);
>>>> +    char *iommupath;
>>>> +    DIR *dirp;
>>>> +    struct dirent *entry;
>>>> +    Error *error = NULL;
>>>> +
>>>> +    if (!svphb->scan) {
>>>> +        trace_spapr_pci("autoscan disabled for ", svphb->phb.dtbusname);
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    iommupath = g_strdup_printf("/sys/kernel/iommu_groups/%d/devices/",
>>>> +                                svphb->iommugroupid);
>>>> +    if (!iommupath) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    dirp = opendir(iommupath);
>>>> +    if (!dirp) {
>>>> +        error_report("spapr-vfio: vfio scan failed on opendir: %m");
>>>> +        g_free(iommupath);
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    while ((entry = readdir(dirp)) != NULL) {
>>>> +        Error *err = NULL;
>>>> +        char *tmp;
>>>> +        FILE *deviceclassfile;
>>>> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
>>>> +        char addr[32];
>>>> +        DeviceState *dev;
>>>> +
>>>> +        if (sscanf(entry->d_name, "%X:%X:%X.%x",
>>>> +                   &domainid, &busid, &devid, &fnid) != 4) {
>>>> +            continue;
>>>> +        }
>>>> +
>>>> +        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
>>>> +        trace_spapr_pci("Reading device class from ", tmp);
>>>> +
>>>> +        deviceclassfile = fopen(tmp, "r");
>>>> +        if (deviceclassfile) {
>>>> +            int ret = fscanf(deviceclassfile, "%x", &deviceclass);
>>>> +            fclose(deviceclassfile);
>>>> +            if (ret != 1) {
>>>> +                continue;
>>>> +            }
>>>> +        }
>>>> +        g_free(tmp);
>>>> +
>>>> +        if (!deviceclass) {
>>>> +            continue;
>>>> +        }
>>>> +        if ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8)) {
>>>> +            /* Skip bridges */
>>>> +            continue;
>>>> +        }
>>>> +        trace_spapr_pci("Creating device from ", entry->d_name);
>>>> +
>>>> +        dev = qdev_create(&phb->bus->qbus, "vfio-pci");
>>>> +        if (!dev) {
>>>> +            trace_spapr_pci("failed to create vfio-pci", entry->d_name);
>>>> +            continue;
>>>> +        }
>>>> +        object_property_parse(OBJECT(dev), entry->d_name, "host", &err);
>>>> +        if (err != NULL) {
>>>> +            object_unref(OBJECT(dev));
>>>> +            continue;
>>>> +        }
>>>> +        if (svphb->force_addr) {
>>>> +            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
>>>> +            err = NULL;
>>>> +            object_property_parse(OBJECT(dev), addr, "addr", &err);
>>>> +            if (err != NULL) {
>>>> +                object_unref(OBJECT(dev));
>>>> +                continue;
>>>> +            }
>>>> +        }
>>>> +        if (svphb->enable_multifunction) {
>>>> +            qdev_prop_set_bit(dev, "multifunction", 1);
>>>> +        }
>>>> +        object_property_set_bool(OBJECT(dev), true, "realized", &error);
>>>> +        if (error) {
>>>> +            object_unref(OBJECT(dev));
>>>> +            error_propagate(errp, error);
>>>> +            break;
>>>> +        }
>>>> +    }
>>>> +    closedir(dirp);
>>>> +    g_free(iommupath);
>>>> +}
>>>> +
>>>> +static void spapr_phb_vfio_finish_realize(sPAPRPHBState *sphb, Error **errp)
>>>> +{
>>>> +    sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);
>>>> +    struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
>>>> +    int ret;
>>>> +    Error *error = NULL;
>>>> +
>>>> +    if (svphb->iommugroupid == -1) {
>>>> +        error_setg(errp, "Wrong IOMMU group ID %d", svphb->iommugroupid);
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    svphb->phb.tcet = spapr_vfio_new_table(DEVICE(sphb), svphb->phb.dma_liobn);
>>>> +
>>>> +    if (!svphb->phb.tcet) {
>>>> +        error_setg(errp, "spapr-vfio: failed to create VFIO TCE table");
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet),
>>>> +                       sphb->dtbusname);
>>>> +
>>>> +    ret = vfio_container_spapr_get_info(&svphb->phb.iommu_as,
>>>> +                                        sphb->dma_liobn, svphb->iommugroupid,
>>>> +                                        &info);
>>>> +    if (ret) {
>>>> +        error_setg_errno(errp, -ret,
>>>> +                         "spapr-vfio: get info from container failed");
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    svphb->phb.dma_window_start = info.dma32_window_start;
>>>> +    svphb->phb.dma_window_size = info.dma32_window_size;
>>>> +
>>>> +    spapr_pci_vfio_scan(svphb, &error);
>>>> +    if (error) {
>>>> +        error_propagate(errp, error);
>>>> +    }
>>>> +}
>>>> +
>>>> +static void spapr_phb_vfio_reset(DeviceState *qdev)
>>>> +{
>>>> +    /* Do nothing */
>>>> +}
>>>> +
>>>> +static void spapr_phb_vfio_class_init(ObjectClass *klass, void *data)
>>>> +{
>>>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>>>> +    sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_CLASS(klass);
>>>> +
>>>> +    dc->props = spapr_phb_vfio_properties;
>>>> +    dc->reset = spapr_phb_vfio_reset;
>>>> +    spc->finish_realize = spapr_phb_vfio_finish_realize;
>>>> +}
>>>> +
>>>> +static const TypeInfo spapr_phb_vfio_info = {
>>>> +    .name          = TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE,
>>>> +    .parent        = TYPE_SPAPR_PCI_HOST_BRIDGE,
>>>> +    .instance_size = sizeof(sPAPRPHBVFIOState),
>>>> +    .class_init    = spapr_phb_vfio_class_init,
>>>> +    .class_size    = sizeof(sPAPRPHBClass),
>>>> +};
>>>> +
>>>> +static void spapr_pci_vfio_register_types(void)
>>>> +{
>>>> +    type_register_static(&spapr_phb_vfio_info);
>>>> +}
>>>> +
>>>> +type_init(spapr_pci_vfio_register_types)
>>>> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
>>>> index 0f428a1..18acb67 100644
>>>> --- a/include/hw/pci-host/spapr.h
>>>> +++ b/include/hw/pci-host/spapr.h
>>>> @@ -30,10 +30,14 @@
>>>>  #define SPAPR_MSIX_MAX_DEVS 32
>>>>  
>>>>  #define TYPE_SPAPR_PCI_HOST_BRIDGE "spapr-pci-host-bridge"
>>>> +#define TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE "spapr-pci-vfio-host-bridge"
>>>>  
>>>>  #define SPAPR_PCI_HOST_BRIDGE(obj) \
>>>>      OBJECT_CHECK(sPAPRPHBState, (obj), TYPE_SPAPR_PCI_HOST_BRIDGE)
>>>>  
>>>> +#define SPAPR_PCI_VFIO_HOST_BRIDGE(obj) \
>>>> +    OBJECT_CHECK(sPAPRPHBVFIOState, (obj), TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE)
>>>> +
>>>>  #define SPAPR_PCI_HOST_BRIDGE_CLASS(klass) \
>>>>       OBJECT_CLASS_CHECK(sPAPRPHBClass, (klass), TYPE_SPAPR_PCI_HOST_BRIDGE)
>>>>  #define SPAPR_PCI_HOST_BRIDGE_GET_CLASS(obj) \
>>>> @@ -41,6 +45,7 @@
>>>>  
>>>>  typedef struct sPAPRPHBClass sPAPRPHBClass;
>>>>  typedef struct sPAPRPHBState sPAPRPHBState;
>>>> +typedef struct sPAPRPHBVFIOState sPAPRPHBVFIOState;
>>>>  
>>>>  struct sPAPRPHBClass {
>>>>      PCIHostBridgeClass parent_class;
>>>> @@ -78,6 +83,14 @@ struct sPAPRPHBState {
>>>>      QLIST_ENTRY(sPAPRPHBState) list;
>>>>  };
>>>>  
>>>> +struct sPAPRPHBVFIOState {
>>>> +    sPAPRPHBState phb;
>>>> +
>>>> +    struct VFIOContainer *container;
>>>> +    int32_t iommugroupid;
>>>> +    uint8_t scan, enable_multifunction, force_addr;
>>>> +};
>>>> +
>>>>  #define SPAPR_PCI_BASE_BUID          0x800000020000000ULL
>>>>  
>>>>  #define SPAPR_PCI_WINDOW_BASE        0x10000000000ULL
Alex Williamson April 1, 2014, 6:21 p.m. UTC | #5
On Tue, 2014-04-01 at 17:25 +1100, Alexey Kardashevskiy wrote:
> On 04/01/2014 07:09 AM, Alex Williamson wrote:
> > On Fri, 2014-03-28 at 17:01 +1100, Alexey Kardashevskiy wrote:
> >> On 03/20/2014 06:57 AM, Alex Williamson wrote:
> >>> On Wed, 2014-03-12 at 16:52 +1100, Alexey Kardashevskiy wrote:
> >>>> The patch adds a spapr-pci-vfio-host-bridge device type
> >>>> which is a PCI Host Bridge with VFIO support. The new device
> >>>> inherits from the spapr-pci-host-bridge device and adds
> >>>> the following properties:
> >>>> 	iommu - IOMMU group ID which represents a Partitionable
> >>>> 	 	Endpoint, QEMU/ppc64 uses a separate PHB for
> >>>> 		an IOMMU group so the guest kernel has to have
> >>>> 		PCI Domain support enabled.
> >>>> 	forceaddr (optional, 0 by default) - forces QEMU to copy
> >>>> 		device:function from the host address as
> >>>> 		certain guest drivers expect devices to appear in
> >>>> 		particular locations;
> >>>> 	mf (optional, 0 by default) - forces multifunction bit for
> >>>> 		the function #0 of a found device, only makes sense
> >>>> 		for multifunction devices and only with the forceaddr
> >>>> 		property set. It would not be required if there
> >>>> 		was a way to know in advance whether a device is
> >>>> 		multifunctional or not.
> >>>> 	scan (optional, 1 by default) - if non-zero, the new PHB walks
> >>>> 		through all non-bridge devices in the group and tries
> >>>> 		adding them to the PHB; if zero, all devices in the group
> >>>> 		have to be configured manually via the QEMU command line.
> >>>>
> >>>> Examples of use:
> >>>> 1) Scan and add all devices from IOMMU group with ID=1 to QEMU's PHB #6:
> >>>> 	-device spapr-pci-vfio-host-bridge,id=DEVICENAME,iommu=1,index=6
> >>>>
> >>>> 2) Configure and Add 3 functions of a multifunctional device to QEMU:
> >>>> (the NEC PCI USB card is used as an example here):
> >>>> 	-device spapr-pci-vfio-host-bridge,id=USB,iommu=4,scan=0,index=7 \
> >>>> 	-device vfio-pci,host=4:0:1.0,addr=1.0,bus=USB,multifunction=true
> >>>> 	-device vfio-pci,host=4:0:1.1,addr=1.1,bus=USB
> >>>> 	-device vfio-pci,host=4:0:1.2,addr=1.2,bus=USB
> >>>
> >>> I won't pretend to predict the reaction of QEMU device architects to
> >>> this, but it seems like the assembly we expect from config files or
> >>> outside utilities, ex. libvirt.  I don't doubt this makes qemu
> >>> commandline usage more palatable, but it seems contrary to some of the
> >>> other things we do in QEMU with devices.  If this is something we need,
> >>> why is it specific to spapr?  IOMMU group can contain multiple devices
> >>> on any platform.  On x86 we could do something similar with a p2p
> >>> bridge, switch, or root port.
> >>
> >>
> >> At least at the moment devices are assigned to groups statically on SPAPR,
> >> they cannot be moved between groups at all, so it seems like a reasonable
> >> assumption that the user will not mind putting them all to the same QEMU
> >> instance.
> >>
> >> I should probably disable "scan" by default though, that would make more
> >> sense for libvirt.
> > 
> > That doesn't really address the concern.  An x86 chipset puts specific
> > devices at a specific address, yet this is not something that we hard
> > code into QEMU.  We have config files that define this and tools like
> > libvirt know where to put things.  Why is SPAPR special enough to have
> > QEMU auto-add an entire sub-hierarchy?  If this is a necessary feature,
> > why not make it generic and give x86 the capability as well?
> 
> Ok. You are right. spapr_pci_vfio_scan() has to go :)
> 
> 
> 
> >>> BTW, the code skips bridges, but doesn't that mean you'll have a hard
> >>> time with forceaddr as you potentially try to overlay devfn from
> >>> multiple buses onto a single bus?
> >>> It also makes the value of this a bit
> >>> more questionable since it seems to fall apart so easily.  Thanks,
> >>
> >> These "forceaddr" and "mf" are rather debug options - at the very beginning
> >> I used to have strong impression that USB NEC PCI device (2xOHCI and
> >> 1xEHCI) does not work properly if it is not multifunctional but I was
> >> wrong, just checked. So I'll just remove them as they do not help even me
> >> anymore and that's it. Thanks!
> > 
> > I'm still questioning the value of this code, it just seems to be a
> > convenience feature for someone running QEMU from the commandline, which
> > has never really been a concern for the rest of QEMU.  Suggest taking
> > this patch out of he series since it's not critical path.  Thanks,
> 
> 
> I can move  spapr_pci_vfio_scan() to a separate patch or get rid of it at
> all  but the rest is still needed to setup the correct IOMMU device. Is
> that what you really meant?

Sure, you still need the PHB which hosts an IOMMU to create a bus for
the vfio-pci devices.  Thanks,

Alex

> >>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>> ---
> >>>> Changes:
> >>>> v5:
> >>>> * added handling of possible failure of spapr_vfio_new_table()
> >>>>
> >>>> v4:
> >>>> * moved IOMMU changes to separate patches
> >>>> * moved spapr-pci-vfio-host-bridge to new file
> >>>> ---
> >>>>  hw/ppc/Makefile.objs        |   2 +-
> >>>>  hw/ppc/spapr_pci_vfio.c     | 206 ++++++++++++++++++++++++++++++++++++++++++++
> >>>>  include/hw/pci-host/spapr.h |  13 +++
> >>>>  3 files changed, 220 insertions(+), 1 deletion(-)
> >>>>  create mode 100644 hw/ppc/spapr_pci_vfio.c
> >>>>
> >>>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> >>>> index ea747f0..2239192 100644
> >>>> --- a/hw/ppc/Makefile.objs
> >>>> +++ b/hw/ppc/Makefile.objs
> >>>> @@ -3,7 +3,7 @@ obj-y += ppc.o ppc_booke.o
> >>>>  # IBM pSeries (sPAPR)
> >>>>  obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o
> >>>>  obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
> >>>> -obj-$(CONFIG_PSERIES) += spapr_pci.o
> >>>> +obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_pci_vfio.o
> >>>>  # PowerPC 4xx boards
> >>>>  obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
> >>>>  obj-y += ppc4xx_pci.o
> >>>> diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
> >>>> new file mode 100644
> >>>> index 0000000..40f6673
> >>>> --- /dev/null
> >>>> +++ b/hw/ppc/spapr_pci_vfio.c
> >>>> @@ -0,0 +1,206 @@
> >>>> +/*
> >>>> + * QEMU sPAPR PCI host for VFIO
> >>>> + *
> >>>> + * Copyright (c) 2011 Alexey Kardashevskiy, IBM Corporation.
> >>>> + *
> >>>> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> >>>> + * of this software and associated documentation files (the "Software"), to deal
> >>>> + * in the Software without restriction, including without limitation the rights
> >>>> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> >>>> + * copies of the Software, and to permit persons to whom the Software is
> >>>> + * furnished to do so, subject to the following conditions:
> >>>> + *
> >>>> + * The above copyright notice and this permission notice shall be included in
> >>>> + * all copies or substantial portions of the Software.
> >>>> + *
> >>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> >>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> >>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> >>>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> >>>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> >>>> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> >>>> + * THE SOFTWARE.
> >>>> + */
> >>>> +#include <sys/types.h>
> >>>> +#include <dirent.h>
> >>>> +
> >>>> +#include "hw/hw.h"
> >>>> +#include "hw/ppc/spapr.h"
> >>>> +#include "hw/pci-host/spapr.h"
> >>>> +#include "hw/misc/vfio.h"
> >>>> +#include "hw/pci/pci_bus.h"
> >>>> +#include "trace.h"
> >>>> +#include "qemu/error-report.h"
> >>>> +
> >>>> +/* sPAPR VFIO */
> >>>> +static Property spapr_phb_vfio_properties[] = {
> >>>> +    DEFINE_PROP_INT32("iommu", sPAPRPHBVFIOState, iommugroupid, -1),
> >>>> +    DEFINE_PROP_UINT8("scan", sPAPRPHBVFIOState, scan, 1),
> >>>> +    DEFINE_PROP_UINT8("mf", sPAPRPHBVFIOState, enable_multifunction, 0),
> >>>> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBVFIOState, force_addr, 0),
> >>>> +    DEFINE_PROP_END_OF_LIST(),
> >>>> +};
> >>>> +
> >>>> +static void spapr_pci_vfio_scan(sPAPRPHBVFIOState *svphb, Error **errp)
> >>>> +{
> >>>> +    PCIHostState *phb = PCI_HOST_BRIDGE(svphb);
> >>>> +    char *iommupath;
> >>>> +    DIR *dirp;
> >>>> +    struct dirent *entry;
> >>>> +    Error *error = NULL;
> >>>> +
> >>>> +    if (!svphb->scan) {
> >>>> +        trace_spapr_pci("autoscan disabled for ", svphb->phb.dtbusname);
> >>>> +        return;
> >>>> +    }
> >>>> +
> >>>> +    iommupath = g_strdup_printf("/sys/kernel/iommu_groups/%d/devices/",
> >>>> +                                svphb->iommugroupid);
> >>>> +    if (!iommupath) {
> >>>> +        return;
> >>>> +    }
> >>>> +
> >>>> +    dirp = opendir(iommupath);
> >>>> +    if (!dirp) {
> >>>> +        error_report("spapr-vfio: vfio scan failed on opendir: %m");
> >>>> +        g_free(iommupath);
> >>>> +        return;
> >>>> +    }
> >>>> +
> >>>> +    while ((entry = readdir(dirp)) != NULL) {
> >>>> +        Error *err = NULL;
> >>>> +        char *tmp;
> >>>> +        FILE *deviceclassfile;
> >>>> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
> >>>> +        char addr[32];
> >>>> +        DeviceState *dev;
> >>>> +
> >>>> +        if (sscanf(entry->d_name, "%X:%X:%X.%x",
> >>>> +                   &domainid, &busid, &devid, &fnid) != 4) {
> >>>> +            continue;
> >>>> +        }
> >>>> +
> >>>> +        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
> >>>> +        trace_spapr_pci("Reading device class from ", tmp);
> >>>> +
> >>>> +        deviceclassfile = fopen(tmp, "r");
> >>>> +        if (deviceclassfile) {
> >>>> +            int ret = fscanf(deviceclassfile, "%x", &deviceclass);
> >>>> +            fclose(deviceclassfile);
> >>>> +            if (ret != 1) {
> >>>> +                continue;
> >>>> +            }
> >>>> +        }
> >>>> +        g_free(tmp);
> >>>> +
> >>>> +        if (!deviceclass) {
> >>>> +            continue;
> >>>> +        }
> >>>> +        if ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8)) {
> >>>> +            /* Skip bridges */
> >>>> +            continue;
> >>>> +        }
> >>>> +        trace_spapr_pci("Creating device from ", entry->d_name);
> >>>> +
> >>>> +        dev = qdev_create(&phb->bus->qbus, "vfio-pci");
> >>>> +        if (!dev) {
> >>>> +            trace_spapr_pci("failed to create vfio-pci", entry->d_name);
> >>>> +            continue;
> >>>> +        }
> >>>> +        object_property_parse(OBJECT(dev), entry->d_name, "host", &err);
> >>>> +        if (err != NULL) {
> >>>> +            object_unref(OBJECT(dev));
> >>>> +            continue;
> >>>> +        }
> >>>> +        if (svphb->force_addr) {
> >>>> +            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
> >>>> +            err = NULL;
> >>>> +            object_property_parse(OBJECT(dev), addr, "addr", &err);
> >>>> +            if (err != NULL) {
> >>>> +                object_unref(OBJECT(dev));
> >>>> +                continue;
> >>>> +            }
> >>>> +        }
> >>>> +        if (svphb->enable_multifunction) {
> >>>> +            qdev_prop_set_bit(dev, "multifunction", 1);
> >>>> +        }
> >>>> +        object_property_set_bool(OBJECT(dev), true, "realized", &error);
> >>>> +        if (error) {
> >>>> +            object_unref(OBJECT(dev));
> >>>> +            error_propagate(errp, error);
> >>>> +            break;
> >>>> +        }
> >>>> +    }
> >>>> +    closedir(dirp);
> >>>> +    g_free(iommupath);
> >>>> +}
> >>>> +
> >>>> +static void spapr_phb_vfio_finish_realize(sPAPRPHBState *sphb, Error **errp)
> >>>> +{
> >>>> +    sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);
> >>>> +    struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
> >>>> +    int ret;
> >>>> +    Error *error = NULL;
> >>>> +
> >>>> +    if (svphb->iommugroupid == -1) {
> >>>> +        error_setg(errp, "Wrong IOMMU group ID %d", svphb->iommugroupid);
> >>>> +        return;
> >>>> +    }
> >>>> +
> >>>> +    svphb->phb.tcet = spapr_vfio_new_table(DEVICE(sphb), svphb->phb.dma_liobn);
> >>>> +
> >>>> +    if (!svphb->phb.tcet) {
> >>>> +        error_setg(errp, "spapr-vfio: failed to create VFIO TCE table");
> >>>> +        return;
> >>>> +    }
> >>>> +
> >>>> +    address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet),
> >>>> +                       sphb->dtbusname);
> >>>> +
> >>>> +    ret = vfio_container_spapr_get_info(&svphb->phb.iommu_as,
> >>>> +                                        sphb->dma_liobn, svphb->iommugroupid,
> >>>> +                                        &info);
> >>>> +    if (ret) {
> >>>> +        error_setg_errno(errp, -ret,
> >>>> +                         "spapr-vfio: get info from container failed");
> >>>> +        return;
> >>>> +    }
> >>>> +
> >>>> +    svphb->phb.dma_window_start = info.dma32_window_start;
> >>>> +    svphb->phb.dma_window_size = info.dma32_window_size;
> >>>> +
> >>>> +    spapr_pci_vfio_scan(svphb, &error);
> >>>> +    if (error) {
> >>>> +        error_propagate(errp, error);
> >>>> +    }
> >>>> +}
> >>>> +
> >>>> +static void spapr_phb_vfio_reset(DeviceState *qdev)
> >>>> +{
> >>>> +    /* Do nothing */
> >>>> +}
> >>>> +
> >>>> +static void spapr_phb_vfio_class_init(ObjectClass *klass, void *data)
> >>>> +{
> >>>> +    DeviceClass *dc = DEVICE_CLASS(klass);
> >>>> +    sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_CLASS(klass);
> >>>> +
> >>>> +    dc->props = spapr_phb_vfio_properties;
> >>>> +    dc->reset = spapr_phb_vfio_reset;
> >>>> +    spc->finish_realize = spapr_phb_vfio_finish_realize;
> >>>> +}
> >>>> +
> >>>> +static const TypeInfo spapr_phb_vfio_info = {
> >>>> +    .name          = TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE,
> >>>> +    .parent        = TYPE_SPAPR_PCI_HOST_BRIDGE,
> >>>> +    .instance_size = sizeof(sPAPRPHBVFIOState),
> >>>> +    .class_init    = spapr_phb_vfio_class_init,
> >>>> +    .class_size    = sizeof(sPAPRPHBClass),
> >>>> +};
> >>>> +
> >>>> +static void spapr_pci_vfio_register_types(void)
> >>>> +{
> >>>> +    type_register_static(&spapr_phb_vfio_info);
> >>>> +}
> >>>> +
> >>>> +type_init(spapr_pci_vfio_register_types)
> >>>> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> >>>> index 0f428a1..18acb67 100644
> >>>> --- a/include/hw/pci-host/spapr.h
> >>>> +++ b/include/hw/pci-host/spapr.h
> >>>> @@ -30,10 +30,14 @@
> >>>>  #define SPAPR_MSIX_MAX_DEVS 32
> >>>>  
> >>>>  #define TYPE_SPAPR_PCI_HOST_BRIDGE "spapr-pci-host-bridge"
> >>>> +#define TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE "spapr-pci-vfio-host-bridge"
> >>>>  
> >>>>  #define SPAPR_PCI_HOST_BRIDGE(obj) \
> >>>>      OBJECT_CHECK(sPAPRPHBState, (obj), TYPE_SPAPR_PCI_HOST_BRIDGE)
> >>>>  
> >>>> +#define SPAPR_PCI_VFIO_HOST_BRIDGE(obj) \
> >>>> +    OBJECT_CHECK(sPAPRPHBVFIOState, (obj), TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE)
> >>>> +
> >>>>  #define SPAPR_PCI_HOST_BRIDGE_CLASS(klass) \
> >>>>       OBJECT_CLASS_CHECK(sPAPRPHBClass, (klass), TYPE_SPAPR_PCI_HOST_BRIDGE)
> >>>>  #define SPAPR_PCI_HOST_BRIDGE_GET_CLASS(obj) \
> >>>> @@ -41,6 +45,7 @@
> >>>>  
> >>>>  typedef struct sPAPRPHBClass sPAPRPHBClass;
> >>>>  typedef struct sPAPRPHBState sPAPRPHBState;
> >>>> +typedef struct sPAPRPHBVFIOState sPAPRPHBVFIOState;
> >>>>  
> >>>>  struct sPAPRPHBClass {
> >>>>      PCIHostBridgeClass parent_class;
> >>>> @@ -78,6 +83,14 @@ struct sPAPRPHBState {
> >>>>      QLIST_ENTRY(sPAPRPHBState) list;
> >>>>  };
> >>>>  
> >>>> +struct sPAPRPHBVFIOState {
> >>>> +    sPAPRPHBState phb;
> >>>> +
> >>>> +    struct VFIOContainer *container;
> >>>> +    int32_t iommugroupid;
> >>>> +    uint8_t scan, enable_multifunction, force_addr;
> >>>> +};
> >>>> +
> >>>>  #define SPAPR_PCI_BASE_BUID          0x800000020000000ULL
> >>>>  
> >>>>  #define SPAPR_PCI_WINDOW_BASE        0x10000000000ULL
> 
>
diff mbox

Patch

diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index ea747f0..2239192 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -3,7 +3,7 @@  obj-y += ppc.o ppc_booke.o
 # IBM pSeries (sPAPR)
 obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o
 obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
-obj-$(CONFIG_PSERIES) += spapr_pci.o
+obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_pci_vfio.o
 # PowerPC 4xx boards
 obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
 obj-y += ppc4xx_pci.o
diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
new file mode 100644
index 0000000..40f6673
--- /dev/null
+++ b/hw/ppc/spapr_pci_vfio.c
@@ -0,0 +1,206 @@ 
+/*
+ * QEMU sPAPR PCI host for VFIO
+ *
+ * Copyright (c) 2011 Alexey Kardashevskiy, IBM Corporation.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <sys/types.h>
+#include <dirent.h>
+
+#include "hw/hw.h"
+#include "hw/ppc/spapr.h"
+#include "hw/pci-host/spapr.h"
+#include "hw/misc/vfio.h"
+#include "hw/pci/pci_bus.h"
+#include "trace.h"
+#include "qemu/error-report.h"
+
+/* sPAPR VFIO */
+static Property spapr_phb_vfio_properties[] = {
+    DEFINE_PROP_INT32("iommu", sPAPRPHBVFIOState, iommugroupid, -1),
+    DEFINE_PROP_UINT8("scan", sPAPRPHBVFIOState, scan, 1),
+    DEFINE_PROP_UINT8("mf", sPAPRPHBVFIOState, enable_multifunction, 0),
+    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBVFIOState, force_addr, 0),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void spapr_pci_vfio_scan(sPAPRPHBVFIOState *svphb, Error **errp)
+{
+    PCIHostState *phb = PCI_HOST_BRIDGE(svphb);
+    char *iommupath;
+    DIR *dirp;
+    struct dirent *entry;
+    Error *error = NULL;
+
+    if (!svphb->scan) {
+        trace_spapr_pci("autoscan disabled for ", svphb->phb.dtbusname);
+        return;
+    }
+
+    iommupath = g_strdup_printf("/sys/kernel/iommu_groups/%d/devices/",
+                                svphb->iommugroupid);
+    if (!iommupath) {
+        return;
+    }
+
+    dirp = opendir(iommupath);
+    if (!dirp) {
+        error_report("spapr-vfio: vfio scan failed on opendir: %m");
+        g_free(iommupath);
+        return;
+    }
+
+    while ((entry = readdir(dirp)) != NULL) {
+        Error *err = NULL;
+        char *tmp;
+        FILE *deviceclassfile;
+        unsigned deviceclass = 0, domainid, busid, devid, fnid;
+        char addr[32];
+        DeviceState *dev;
+
+        if (sscanf(entry->d_name, "%X:%X:%X.%x",
+                   &domainid, &busid, &devid, &fnid) != 4) {
+            continue;
+        }
+
+        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
+        trace_spapr_pci("Reading device class from ", tmp);
+
+        deviceclassfile = fopen(tmp, "r");
+        if (deviceclassfile) {
+            int ret = fscanf(deviceclassfile, "%x", &deviceclass);
+            fclose(deviceclassfile);
+            if (ret != 1) {
+                continue;
+            }
+        }
+        g_free(tmp);
+
+        if (!deviceclass) {
+            continue;
+        }
+        if ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8)) {
+            /* Skip bridges */
+            continue;
+        }
+        trace_spapr_pci("Creating device from ", entry->d_name);
+
+        dev = qdev_create(&phb->bus->qbus, "vfio-pci");
+        if (!dev) {
+            trace_spapr_pci("failed to create vfio-pci", entry->d_name);
+            continue;
+        }
+        object_property_parse(OBJECT(dev), entry->d_name, "host", &err);
+        if (err != NULL) {
+            object_unref(OBJECT(dev));
+            continue;
+        }
+        if (svphb->force_addr) {
+            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
+            err = NULL;
+            object_property_parse(OBJECT(dev), addr, "addr", &err);
+            if (err != NULL) {
+                object_unref(OBJECT(dev));
+                continue;
+            }
+        }
+        if (svphb->enable_multifunction) {
+            qdev_prop_set_bit(dev, "multifunction", 1);
+        }
+        object_property_set_bool(OBJECT(dev), true, "realized", &error);
+        if (error) {
+            object_unref(OBJECT(dev));
+            error_propagate(errp, error);
+            break;
+        }
+    }
+    closedir(dirp);
+    g_free(iommupath);
+}
+
+static void spapr_phb_vfio_finish_realize(sPAPRPHBState *sphb, Error **errp)
+{
+    sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);
+    struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
+    int ret;
+    Error *error = NULL;
+
+    if (svphb->iommugroupid == -1) {
+        error_setg(errp, "Wrong IOMMU group ID %d", svphb->iommugroupid);
+        return;
+    }
+
+    svphb->phb.tcet = spapr_vfio_new_table(DEVICE(sphb), svphb->phb.dma_liobn);
+
+    if (!svphb->phb.tcet) {
+        error_setg(errp, "spapr-vfio: failed to create VFIO TCE table");
+        return;
+    }
+
+    address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet),
+                       sphb->dtbusname);
+
+    ret = vfio_container_spapr_get_info(&svphb->phb.iommu_as,
+                                        sphb->dma_liobn, svphb->iommugroupid,
+                                        &info);
+    if (ret) {
+        error_setg_errno(errp, -ret,
+                         "spapr-vfio: get info from container failed");
+        return;
+    }
+
+    svphb->phb.dma_window_start = info.dma32_window_start;
+    svphb->phb.dma_window_size = info.dma32_window_size;
+
+    spapr_pci_vfio_scan(svphb, &error);
+    if (error) {
+        error_propagate(errp, error);
+    }
+}
+
+static void spapr_phb_vfio_reset(DeviceState *qdev)
+{
+    /* Do nothing */
+}
+
+static void spapr_phb_vfio_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_CLASS(klass);
+
+    dc->props = spapr_phb_vfio_properties;
+    dc->reset = spapr_phb_vfio_reset;
+    spc->finish_realize = spapr_phb_vfio_finish_realize;
+}
+
+static const TypeInfo spapr_phb_vfio_info = {
+    .name          = TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE,
+    .parent        = TYPE_SPAPR_PCI_HOST_BRIDGE,
+    .instance_size = sizeof(sPAPRPHBVFIOState),
+    .class_init    = spapr_phb_vfio_class_init,
+    .class_size    = sizeof(sPAPRPHBClass),
+};
+
+static void spapr_pci_vfio_register_types(void)
+{
+    type_register_static(&spapr_phb_vfio_info);
+}
+
+type_init(spapr_pci_vfio_register_types)
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index 0f428a1..18acb67 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -30,10 +30,14 @@ 
 #define SPAPR_MSIX_MAX_DEVS 32
 
 #define TYPE_SPAPR_PCI_HOST_BRIDGE "spapr-pci-host-bridge"
+#define TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE "spapr-pci-vfio-host-bridge"
 
 #define SPAPR_PCI_HOST_BRIDGE(obj) \
     OBJECT_CHECK(sPAPRPHBState, (obj), TYPE_SPAPR_PCI_HOST_BRIDGE)
 
+#define SPAPR_PCI_VFIO_HOST_BRIDGE(obj) \
+    OBJECT_CHECK(sPAPRPHBVFIOState, (obj), TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE)
+
 #define SPAPR_PCI_HOST_BRIDGE_CLASS(klass) \
      OBJECT_CLASS_CHECK(sPAPRPHBClass, (klass), TYPE_SPAPR_PCI_HOST_BRIDGE)
 #define SPAPR_PCI_HOST_BRIDGE_GET_CLASS(obj) \
@@ -41,6 +45,7 @@ 
 
 typedef struct sPAPRPHBClass sPAPRPHBClass;
 typedef struct sPAPRPHBState sPAPRPHBState;
+typedef struct sPAPRPHBVFIOState sPAPRPHBVFIOState;
 
 struct sPAPRPHBClass {
     PCIHostBridgeClass parent_class;
@@ -78,6 +83,14 @@  struct sPAPRPHBState {
     QLIST_ENTRY(sPAPRPHBState) list;
 };
 
+struct sPAPRPHBVFIOState {
+    sPAPRPHBState phb;
+
+    struct VFIOContainer *container;
+    int32_t iommugroupid;
+    uint8_t scan, enable_multifunction, force_addr;
+};
+
 #define SPAPR_PCI_BASE_BUID          0x800000020000000ULL
 
 #define SPAPR_PCI_WINDOW_BASE        0x10000000000ULL