Patchwork [7/8] spapr vfio: add spapr-pci-vfio-host-bridge to support vfio

login
register
mail settings
Submitter Alexey Kardashevskiy
Date Aug. 7, 2013, 8:21 a.m.
Message ID <1375863692-12207-8-git-send-email-aik@ozlabs.ru>
Download mbox | patch
Permalink /patch/265396/
State New
Headers show

Comments

Alexey Kardashevskiy - Aug. 7, 2013, 8:21 a.m.
The patch adds a spapr-pci-vfio-host-bridge device type
which is a PCI Host Bridge with VFIO support. The new device
inherits from the spapr-pci-host-bridge device and adds
the following properties:
	iommu - IOMMU group ID which represents a Partitionable
		Endpoint; QEMU/ppc64 uses a separate PHB for each
		IOMMU group, so the guest kernel has to have
		PCI Domain support enabled.
	forceaddr (optional, 0 by default) - forces QEMU to copy
		device:function from the host address as
		certain guest drivers expect devices to appear in
		particular locations;
	mf (optional, 0 by default) - forces multifunction bit for
		the function #0 of a found device, only makes sense
		for multifunction devices and only with the forceaddr
		property set. It would not be required if there
		was a way to know in advance whether a device is
		multifunctional or not.
	scan (optional, 1 by default) - if non-zero, the new PHB walks
		through all non-bridge devices in the group and tries
		adding them to the PHB; if zero, all devices in the group
		have to be configured manually via the QEMU command line.

The patch also adds VFIO IOMMU type support to the existing
sPAPR TCE list in spapr_iommu.c.

The patch also uses the host kernel's support for the new KVM_CAP_SPAPR_TCE_IOMMU
capability and KVM_CREATE_SPAPR_TCE_IOMMU ioctl, which let QEMU tell
the host what LIOBN is used for an IOMMU group. This ioctl turns on real mode
handling of TCE requests, which improves actual throughput by a factor of 2.5-5.

Examples:
1) Scan and add all devices from IOMMU group with ID=1 to QEMU's PHB #6:
	-device spapr-pci-vfio-host-bridge,id=DEVICENAME,iommu=1,index=6

2) Configure and add 3 functions of a multifunctional device to QEMU
(the NEC PCI USB card is used as an example here):
	-device spapr-pci-vfio-host-bridge,id=USB,iommu=4,scan=0,index=7 \
	-device vfio-pci,host=4:0:1.0,addr=1.0,bus=USB,multifunction=true \
	-device vfio-pci,host=4:0:1.1,addr=1.1,bus=USB \
	-device vfio-pci,host=4:0:1.2,addr=1.2,bus=USB

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 hw/ppc/spapr_iommu.c        | 176 ++++++++++++++++++++++++++++++++-----
 hw/ppc/spapr_pci.c          | 209 +++++++++++++++++++++++++++++++++++++++++---
 include/hw/pci-host/spapr.h |  12 +++
 include/hw/ppc/spapr.h      |  19 ++++
 target-ppc/kvm.c            |  33 +++++++
 target-ppc/kvm_ppc.h        |  12 +++
 trace-events                |   4 +
 7 files changed, 429 insertions(+), 36 deletions(-)
Alexander Graf - Aug. 27, 2013, 11:08 a.m.
On 07.08.2013, at 10:21, Alexey Kardashevskiy wrote:

> The patch adds a spapr-pci-vfio-host-bridge device type
> which is a PCI Host Bridge with VFIO support. The new device
> inherits from the spapr-pci-host-bridge device and adds
> the following properties:
> 	iommu - IOMMU group ID which represents a Partitionable
> 	 	Endpoint, QEMU/ppc64 uses a separate PHB for
> 		an IOMMU group so the guest kernel has to have
> 		PCI Domain support enabled.
> 	forceaddr (optional, 0 by default) - forces QEMU to copy
> 		device:function from the host address as
> 		certain guest drivers expect devices to appear in
> 		particular locations;
> 	mf (optional, 0 by default) - forces multifunction bit for
> 		the function #0 of a found device, only makes sense
> 		for multifunction devices and only with the forceaddr
> 		property set. It would not be required if there
> 		was a way to know in advance whether a device is
> 		multifunctional or not.
> 	scan (optional, 1 by default) - if non-zero, the new PHB walks
> 		through all non-bridge devices in the group and tries
> 		adding them to the PHB; if zero, all devices in the group
> 		have to be configured manually via the QEMU command line.
> 
> The patch also adds a VFIO IOMMU type support to the existing
> sPAPR TCE list in spapr_iommu.c.
> 
> The patch also uses the host kernel support of a new KVM_CAP_SPAPR_TCE_IOMMU
> capability and KVM_CREATE_SPAPR_TCE_IOMMU ioctl which let QEMU tell
> the host what LIOBN is used for an IOMMU group. This ioctl turns real mode TCE
> requests handling on which accelerates actual throughput in 2.5-5 times.
> 
> Examples:
> 1) Scan and add all devices from IOMMU group with ID=1 to QEMU's PHB #6:
> 	-device spapr-pci-vfio-host-bridge,id=DEVICENAME,iommu=1,index=6
> 
> 2) Configure and Add 3 functions of a multifunctional device to QEMU:
> (the NEC PCI USB card is used as an example here):
> 	-device spapr-pci-vfio-host-bridge,id=USB,iommu=4,scan=0,index=7 \
> 	-device vfio-pci,host=4:0:1.0,addr=1.0,bus=USB,multifunction=true
> 	-device vfio-pci,host=4:0:1.1,addr=1.1,bus=USB
> 	-device vfio-pci,host=4:0:1.2,addr=1.2,bus=USB
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> hw/ppc/spapr_iommu.c        | 176 ++++++++++++++++++++++++++++++++-----
> hw/ppc/spapr_pci.c          | 209 +++++++++++++++++++++++++++++++++++++++++---
> include/hw/pci-host/spapr.h |  12 +++
> include/hw/ppc/spapr.h      |  19 ++++
> target-ppc/kvm.c            |  33 +++++++
> target-ppc/kvm_ppc.h        |  12 +++
> trace-events                |   4 +
> 7 files changed, 429 insertions(+), 36 deletions(-)
> 
> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> index 22b09be..096b6a9 100644
> --- a/hw/ppc/spapr_iommu.c
> +++ b/hw/ppc/spapr_iommu.c
> @@ -16,12 +16,14 @@
>  * You should have received a copy of the GNU Lesser General Public
>  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
>  */
> +
> #include "hw/hw.h"
> #include "sysemu/kvm.h"
> #include "hw/qdev.h"
> #include "kvm_ppc.h"
> #include "sysemu/dma.h"
> #include "exec/address-spaces.h"
> +#include "trace.h"
> 
> #include "hw/ppc/spapr.h"
> 
> @@ -244,6 +246,74 @@ static target_ulong put_tce_emu(sPAPRTCETable *tcet, target_ulong ioba,
>     return H_SUCCESS;
> }
> 
> +static IOMMUTLBEntry spapr_vfio_translate_iommu(MemoryRegion *iommu, hwaddr addr)
> +{
> +    IOMMUTLBEntry entry;
> +    /* Must never be called */
> +    assert(0);
> +    return entry;
> +}
> +
> +static MemoryRegionIOMMUOps spapr_vfio_iommu_ops = {
> +    .translate = spapr_vfio_translate_iommu,
> +};
> +
> +static int spapr_tce_table_vfio_realize(DeviceState *dev)
> +{
> +    sPAPRTCETable *tcet = SPAPR_TCE_TABLE(dev);
> +
> +    memory_region_init_iommu(&tcet->iommu, NULL, &spapr_vfio_iommu_ops,
> +                             "iommu-vfio-spapr", (uint64_t)INT64_MAX+1);
> +
> +    QLIST_INSERT_HEAD(&spapr_tce_tables, tcet, list);
> +
> +    return 0;
> +}
> +
> +sPAPRTCETable *spapr_vfio_new_table(DeviceState *owner, uint32_t liobn,
> +                                    int group_fd)
> +{
> +    sPAPRTCETable *tcet;
> +    int fd;
> +
> +    if (spapr_tce_find_by_liobn(liobn)) {
> +        fprintf(stderr, "Attempted to create TCE table with duplicate"
> +                " LIOBN 0x%x\n", liobn);
> +        return NULL;
> +    }
> +
> +    fd = kvmppc_create_spapr_tce_iommu(liobn, group_fd);
> +
> +    tcet = SPAPR_TCE_TABLE(object_new(TYPE_SPAPR_TCE_TABLE_VFIO));
> +    tcet->liobn = liobn;
> +    tcet->fd = fd;
> +    object_property_add_child(OBJECT(owner), "tce-table", OBJECT(tcet), NULL);
> +
> +    qdev_init_nofail(DEVICE(tcet));
> +
> +    return tcet;
> +}
> +
> +static target_ulong put_tce_vfio(sPAPRTCETable *tcet, target_ulong ioba,
> +                                 target_ulong tce)
> +{
> +    IOMMUTLBEntry entry;
> +
> +    entry.iova = ioba & ~SPAPR_TCE_PAGE_MASK;
> +    entry.translated_addr = tce & ~SPAPR_TCE_PAGE_MASK;
> +    entry.addr_mask = SPAPR_TCE_PAGE_MASK;
> +    entry.perm = 0;
> +    if ((tce & SPAPR_TCE_RO) == SPAPR_TCE_RO) {
> +        entry.perm |= IOMMU_RO;
> +    }
> +    if ((tce & SPAPR_TCE_WO) == SPAPR_TCE_WO) {
> +        entry.perm |= IOMMU_WO;
> +    }
> +    memory_region_notify_iommu(&tcet->iommu, entry);
> +
> +    return H_SUCCESS;
> +}
> +
> static target_ulong h_put_tce_indirect(PowerPCCPU *cpu,
>                                        sPAPREnvironment *spapr,
>                                        target_ulong opcode, target_ulong *args)
> @@ -255,18 +325,36 @@ static target_ulong h_put_tce_indirect(PowerPCCPU *cpu,
>     target_ulong npages = args[3];
>     target_ulong ret = 0;
>     sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> +    sPAPRTCETableClass *info;
> 
> -    if (tcet) {
> -        for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
> -            target_ulong tce = ldq_phys((tce_list & ~SPAPR_TCE_PAGE_MASK) +
> -                                        i * sizeof(target_ulong));
> -            ret = put_tce_emu(tcet, ioba, tce);
> -            if (ret) {
> -                break;
> -            }
> +    if (!tcet) {
> +        return H_PARAMETER;
> +    }
> +
> +    info = SPAPR_TCE_TABLE_GET_CLASS(tcet);
> +    if (!info || !info->put_tce) {
> +        return H_PARAMETER;
> +    }
> +
> +    if ((tce_list & SPAPR_TCE_PAGE_MASK) || (npages > 512)) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (liobn & 0xFFFFFFFF00000000ULL) {
> +        hcall_dprintf("spapr_vio_put_tce on out-of-boundsw LIOBN "
> +                      TARGET_FMT_lx "\n", liobn);
> +        return H_PARAMETER;
> +    }
> +
> +    for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
> +        target_ulong tce = ldq_phys((tce_list & ~SPAPR_TCE_PAGE_MASK) +
> +                                    i * sizeof(target_ulong));
> +        ret = info->put_tce(tcet, ioba, tce);
> +        if (ret) {
> +            break;
>         }
> -        return ret;
>     }
> +
> #ifdef DEBUG_TCE
>     fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
>             "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
> @@ -274,7 +362,7 @@ static target_ulong h_put_tce_indirect(PowerPCCPU *cpu,
>             __func__, liobn, ioba, tce_list, ret);
> #endif
> 
> -    return H_PARAMETER;
> +    return ret;
> }
> 
> static target_ulong h_stuff_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
> @@ -287,17 +375,30 @@ static target_ulong h_stuff_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>     target_ulong npages = args[3];
>     target_ulong ret = 0;
>     sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> +    sPAPRTCETableClass *info;
> +
> +    if (!tcet) {
> +        return H_PARAMETER;
> +    }
> +
> +    info = SPAPR_TCE_TABLE_GET_CLASS(tcet);
> +    if (!info || !info->put_tce) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (liobn & 0xFFFFFFFF00000000ULL) {
> +        hcall_dprintf("spapr_vio_put_tce on out-of-boundsw LIOBN "
> +                      TARGET_FMT_lx "\n", liobn);
> +        return H_PARAMETER;
> +    }
> 
>     ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
> 
> -    if (tcet) {
> -        for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
> -            ret = put_tce_emu(tcet, ioba, tce_value);
> -            if (ret) {
> -                break;
> -            }
> +    for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
> +        ret = info->put_tce(tcet, ioba, tce_value);
> +        if (ret) {
> +            break;
>         }
> -        return ret;
>     }
> #ifdef DEBUG_TCE
>     fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
> @@ -306,7 +407,7 @@ static target_ulong h_stuff_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>             __func__, liobn, ioba, tce_value, ret);
> #endif
> 
> -    return H_PARAMETER;
> +    return ret;
> }
> 
> static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
> @@ -316,12 +417,21 @@ static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>     target_ulong ioba = args[1];
>     target_ulong tce = args[2];
>     sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> +    target_ulong ret;
> +    sPAPRTCETableClass *info;
> +
> +    if (!tcet) {
> +        return H_PARAMETER;
> +    }
> +
> +    info = SPAPR_TCE_TABLE_GET_CLASS(tcet);
> +    if (!info || !info->put_tce) {
> +        return H_PARAMETER;
> +    }
> 
>     ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
> 
> -    if (tcet) {
> -        return put_tce_emu(tcet, ioba, tce);
> -    }
> +    ret = info->put_tce(tcet, ioba, tce);
> #ifdef DEBUG_TCE
>     fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
>             "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
> @@ -329,7 +439,7 @@ static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>             __func__, liobn, ioba, tce, ret);
> #endif
> 
> -    return H_PARAMETER;
> +    return ret;
> }
> 
> int spapr_dma_dt(void *fdt, int node_off, const char *propname,
> @@ -376,9 +486,12 @@ int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
> static void spapr_tce_table_class_init(ObjectClass *klass, void *data)
> {
>     DeviceClass *dc = DEVICE_CLASS(klass);
> +    sPAPRTCETableClass *k = SPAPR_TCE_TABLE_CLASS(klass);
> +
>     dc->vmsd = &vmstate_spapr_tce_table;
>     dc->init = spapr_tce_table_realize;
>     dc->reset = spapr_tce_reset;
> +    k->put_tce = put_tce_emu;
> 
>     QLIST_INIT(&spapr_tce_tables);
> 
> @@ -393,12 +506,31 @@ static TypeInfo spapr_tce_table_info = {
>     .parent = TYPE_DEVICE,
>     .instance_size = sizeof(sPAPRTCETable),
>     .class_init = spapr_tce_table_class_init,
> +    .class_size = sizeof(sPAPRTCETableClass),
>     .instance_finalize = spapr_tce_table_finalize,
> };
> 
> +static void spapr_tce_table_vfio_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    sPAPRTCETableClass *k = SPAPR_TCE_TABLE_CLASS(klass);
> +
> +    dc->init = spapr_tce_table_vfio_realize;
> +    k->put_tce = put_tce_vfio;
> +}
> +
> +static TypeInfo spapr_tce_table_vfio_info = {
> +    .name = TYPE_SPAPR_TCE_TABLE_VFIO,
> +    .parent = TYPE_SPAPR_TCE_TABLE,
> +    .instance_size = sizeof(sPAPRTCETable),
> +    .class_init = spapr_tce_table_vfio_class_init,
> +    .class_size = sizeof(sPAPRTCETableClass),
> +};
> +
> static void register_types(void)
> {
>     type_register_static(&spapr_tce_table_info);
> +    type_register_static(&spapr_tce_table_vfio_info);
> }
> 
> type_init(register_types);
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 869ca43..3f37cac 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c

I think we should move the vfio phb into a separate file and make it be a proper subclass without even the chance to randomly call normal spapr pci functions ;).

Andreas, could you please check through this and see if you can spot a way to isolate it out?


Alex

> @@ -22,6 +22,9 @@
>  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>  * THE SOFTWARE.
>  */
> +#include <sys/types.h>
> +#include <dirent.h>
> +
> #include "hw/hw.h"
> #include "hw/pci/pci.h"
> #include "hw/pci/msi.h"
> @@ -32,6 +35,7 @@
> #include "exec/address-spaces.h"
> #include <libfdt.h>
> #include "trace.h"
> +#include "hw/misc/vfio.h"
> 
> #include "hw/pci/pci_bus.h"
> 
> @@ -496,7 +500,11 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
>     return &phb->iommu_as;
> }
> 
> -static int spapr_phb_init(SysBusDevice *s)
> +/*
> + * This is the common initialization part for both emulated and VFIO PHBs
> + * which includes everything but DMA and device scan (optional, VFIO only).
> + */
> +static int _spapr_phb_init(SysBusDevice *s)
> {
>     DeviceState *dev = DEVICE(s);
>     sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(s);
> @@ -610,19 +618,6 @@ static int spapr_phb_init(SysBusDevice *s)
>                            PCI_DEVFN(0, 0), PCI_NUM_PINS, TYPE_PCI_BUS);
>     phb->bus = bus;
> 
> -    sphb->dma_window_start = 0;
> -    sphb->dma_window_size = 0x40000000;
> -    sphb->tcet = spapr_tce_new_table(dev, sphb->dma_liobn,
> -                                     sphb->dma_window_size);
> -    if (!sphb->tcet) {
> -        fprintf(stderr, "Unable to create TCE table for %s\n", sphb->dtbusname);
> -        return -1;
> -    }
> -    address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet),
> -                       sphb->dtbusname);
> -
> -    pci_setup_iommu(bus, spapr_pci_dma_iommu, sphb);
> -
>     QLIST_INSERT_HEAD(&spapr->phbs, sphb, list);
> 
>     /* Initialize the LSI table */
> @@ -641,6 +636,30 @@ static int spapr_phb_init(SysBusDevice *s)
>     return 0;
> }
> 
> +static int spapr_phb_init(SysBusDevice *s)
> +{
> +    sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(s);
> +    int ret;
> +
> +    ret = _spapr_phb_init(s);
> +    if (ret)
> +        return ret;
> +
> +    sphb->dma_window_start = 0;
> +    sphb->dma_window_size = 0x40000000;
> +    sphb->tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn,
> +                                     sphb->dma_window_size);
> +    if (!sphb->tcet) {
> +        fprintf(stderr, "Unable to create TCE table for %s\n", sphb->dtbusname);
> +        return -1;
> +    }
> +    address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet),
> +                       sphb->dtbusname);
> +    pci_setup_iommu(sphb->parent_obj.bus, spapr_pci_dma_iommu, sphb);
> +
> +    return 0;
> +}
> +
> static void spapr_phb_reset(DeviceState *qdev)
> {
>     SysBusDevice *s = SYS_BUS_DEVICE(qdev);
> @@ -749,6 +768,163 @@ PCIHostState *spapr_create_phb(sPAPREnvironment *spapr, int index)
>     return PCI_HOST_BRIDGE(dev);
> }
> 
> +/* sPAPR VFIO */
> +static Property spapr_phb_vfio_properties[] = {
> +    DEFINE_PROP_INT32("iommu", sPAPRPHBVFIOState, iommugroupid, -1),
> +    DEFINE_PROP_UINT8("scan", sPAPRPHBVFIOState, scan, 1),
> +    DEFINE_PROP_UINT8("mf", sPAPRPHBVFIOState, enable_multifunction, 0),
> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBVFIOState, force_addr, 0),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static int spapr_pci_vfio_scan(sPAPRPHBVFIOState *svphb)
> +{
> +    PCIHostState *phb = PCI_HOST_BRIDGE(svphb);
> +    char *iommupath;
> +    DIR *dirp;
> +    struct dirent *entry;
> +
> +    if (!svphb->scan) {
> +        trace_spapr_pci("autoscan disabled for ", svphb->phb.dtbusname);
> +        return 0;
> +    }
> +
> +    iommupath = g_strdup_printf("/sys/kernel/iommu_groups/%d/devices/",
> +                                svphb->iommugroupid);
> +    if (!iommupath) {
> +        return -ENOMEM;
> +    }
> +
> +    dirp = opendir(iommupath);
> +    if (!dirp) {
> +        fprintf(stderr, "failed to scan group=%d\n", svphb->iommugroupid);
> +        g_free(iommupath);
> +        return -1;
> +    }
> +
> +    while ((entry = readdir(dirp)) != NULL) {
> +        Error *err = NULL;
> +        char *tmp;
> +        FILE *deviceclassfile;
> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
> +        char addr[32];
> +        DeviceState *dev;
> +
> +        if (sscanf(entry->d_name, "%X:%X:%X.%x",
> +                   &domainid, &busid, &devid, &fnid) != 4) {
> +            continue;
> +        }
> +
> +        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
> +        trace_spapr_pci("Reading device class from ", tmp);
> +
> +        deviceclassfile = fopen(tmp, "r");
> +        if (deviceclassfile) {
> +            int ret = fscanf(deviceclassfile, "%x", &deviceclass);
> +            fclose(deviceclassfile);
> +            if (ret != 1) {
> +                continue;
> +            }
> +        }
> +        g_free(tmp);
> +
> +        if (!deviceclass) {
> +            continue;
> +        }
> +        if ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8)) {
> +            /* Skip bridges */
> +            continue;
> +        }
> +        trace_spapr_pci("Creating device from ", entry->d_name);
> +
> +        dev = qdev_create(&phb->bus->qbus, "vfio-pci");
> +        if (!dev) {
> +            fprintf(stderr, "failed to create vfio-pci\n");
> +            continue;
> +        }
> +        qdev_prop_parse(dev, "host", entry->d_name, &err);
> +        if (err != NULL) {
> +            continue;
> +        }
> +        if (svphb->force_addr) {
> +            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
> +            err = NULL;
> +            qdev_prop_parse(dev, "addr", addr, &err);
> +            if (err != NULL) {
> +                continue;
> +            }
> +        }
> +        if (svphb->enable_multifunction) {
> +            qdev_prop_set_bit(dev, "multifunction", 1);
> +        }
> +        qdev_init_nofail(dev);
> +    }
> +    closedir(dirp);
> +    g_free(iommupath);
> +
> +    return 0;
> +}
> +
> +static int spapr_phb_vfio_init(SysBusDevice *s)
> +{
> +    sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(s);
> +    sPAPRPHBState *sphb = &svphb->phb;
> +    struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
> +    int ret, group_fd;
> +
> +    if (svphb->iommugroupid == -1) {
> +        fprintf(stderr, "Wrong IOMMU group ID %d\n", svphb->iommugroupid);
> +        return -1;
> +    }
> +
> +    ret = _spapr_phb_init(s);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    ret = vfio_container_spapr_get_info(&svphb->phb.iommu_as,
> +                                        svphb->iommugroupid,
> +                                        &info, &group_fd);
> +    if (ret)
> +        return ret;
> +
> +    svphb->phb.dma_window_start = info.dma32_window_start;
> +    svphb->phb.dma_window_size = info.dma32_window_size;
> +    svphb->phb.tcet = spapr_vfio_new_table(DEVICE(sphb), svphb->phb.dma_liobn,
> +                                           group_fd);
> +
> +    address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet),
> +                       sphb->dtbusname);
> +    pci_setup_iommu(sphb->parent_obj.bus, spapr_pci_dma_iommu, sphb);
> +
> +    ret = spapr_pci_vfio_scan(svphb);
> +
> +    return ret;
> +}
> +
> +static void spapr_phb_vfio_reset(DeviceState *qdev)
> +{
> +    /* Do nothing */
> +}
> +
> +static void spapr_phb_vfio_class_init(ObjectClass *klass, void *data)
> +{
> +    SysBusDeviceClass *sdc = SYS_BUS_DEVICE_CLASS(klass);
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    sdc->init = spapr_phb_vfio_init;
> +    dc->props = spapr_phb_vfio_properties;
> +    dc->reset = spapr_phb_vfio_reset;
> +    dc->vmsd = &vmstate_spapr_pci;
> +}
> +
> +static const TypeInfo spapr_phb_vfio_info = {
> +    .name          = TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE,
> +    .parent        = TYPE_SPAPR_PCI_HOST_BRIDGE,
> +    .instance_size = sizeof(sPAPRPHBVFIOState),
> +    .class_init    = spapr_phb_vfio_class_init,
> +};
> +
> /* Macros to operate with address in OF binding to PCI */
> #define b_x(x, p, l)    (((x) & ((1<<(l))-1)) << (p))
> #define b_n(x)          b_x((x), 31, 1) /* 0 if relocatable */
> @@ -839,6 +1015,10 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>     _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map,
>                      sizeof(interrupt_map)));
> 
> +    if (!phb->dma_window_size) {
> +        fprintf(stderr, "Unexpected error: DMA window is zero, exiting\n");
> +        exit(1);
> +    }
>     spapr_dma_dt(fdt, bus_off, "ibm,dma-window",
>                  phb->dma_liobn, phb->dma_window_start,
>                  phb->dma_window_size);
> @@ -862,6 +1042,7 @@ void spapr_pci_rtas_init(void)
> static void spapr_pci_register_types(void)
> {
>     type_register_static(&spapr_phb_info);
> +    type_register_static(&spapr_phb_vfio_info);
> }
> 
> type_init(spapr_pci_register_types)
> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> index 970b4a9..fab18e5 100644
> --- a/include/hw/pci-host/spapr.h
> +++ b/include/hw/pci-host/spapr.h
> @@ -30,10 +30,14 @@
> #define SPAPR_MSIX_MAX_DEVS 32
> 
> #define TYPE_SPAPR_PCI_HOST_BRIDGE "spapr-pci-host-bridge"
> +#define TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE "spapr-pci-vfio-host-bridge"
> 
> #define SPAPR_PCI_HOST_BRIDGE(obj) \
>     OBJECT_CHECK(sPAPRPHBState, (obj), TYPE_SPAPR_PCI_HOST_BRIDGE)
> 
> +#define SPAPR_PCI_VFIO_HOST_BRIDGE(obj) \
> +    OBJECT_CHECK(sPAPRPHBVFIOState, (obj), TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE)
> +
> typedef struct sPAPRPHBState {
>     PCIHostState parent_obj;
> 
> @@ -64,6 +68,14 @@ typedef struct sPAPRPHBState {
>     QLIST_ENTRY(sPAPRPHBState) list;
> } sPAPRPHBState;
> 
> +typedef struct sPAPRPHBVFIOState {
> +    sPAPRPHBState phb;
> +
> +    struct VFIOContainer *container;
> +    int32_t iommugroupid;
> +    uint8_t scan, enable_multifunction, force_addr;
> +} sPAPRPHBVFIOState;
> +
> #define SPAPR_PCI_BASE_BUID          0x800000020000000ULL
> 
> #define SPAPR_PCI_WINDOW_BASE        0x10000000000ULL
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 2dc3d06..a64e58a 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -353,12 +353,29 @@ int spapr_rtas_device_tree_setup(void *fdt, hwaddr rtas_addr,
> 
> #define RTAS_ERROR_LOG_MAX      2048
> 
> +typedef struct sPAPRTCETableClass sPAPRTCETableClass;
> typedef struct sPAPRTCETable sPAPRTCETable;
> 
> #define TYPE_SPAPR_TCE_TABLE "spapr-tce-table"
> #define SPAPR_TCE_TABLE(obj) \
>     OBJECT_CHECK(sPAPRTCETable, (obj), TYPE_SPAPR_TCE_TABLE)
> 
> +#define TYPE_SPAPR_TCE_TABLE_VFIO "spapr-tce-table-vfio"
> +#define SPAPR_TCE_TABLE_VFIO(obj) \
> +    OBJECT_CHECK(sPAPRTCETable, (obj), TYPE_SPAPR_TCE_TABLE_VFIO)
> +
> +#define SPAPR_TCE_TABLE_CLASS(klass) \
> +     OBJECT_CLASS_CHECK(sPAPRTCETableClass, (klass), TYPE_SPAPR_TCE_TABLE)
> +#define SPAPR_TCE_TABLE_GET_CLASS(obj) \
> +     OBJECT_GET_CLASS(sPAPRTCETableClass, (obj), TYPE_SPAPR_TCE_TABLE)
> +
> +struct sPAPRTCETableClass {
> +    DeviceClass parent_class;
> +
> +    target_ulong (*put_tce)(sPAPRTCETable *tcet, target_ulong ioba,
> +                            target_ulong tce);
> +};
> +
> struct sPAPRTCETable {
>     DeviceState parent;
>     uint32_t liobn;
> @@ -375,6 +392,8 @@ void spapr_events_init(sPAPREnvironment *spapr);
> void spapr_events_fdt_skel(void *fdt, uint32_t epow_irq);
> sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn,
>                                    size_t window_size);
> +sPAPRTCETable *spapr_vfio_new_table(DeviceState *owner, uint32_t liobn,
> +                                    int group_fd);
> MemoryRegion *spapr_tce_get_iommu(sPAPRTCETable *tcet);
> void spapr_tce_set_bypass(sPAPRTCETable *tcet, bool bypass);
> int spapr_dma_dt(void *fdt, int node_off, const char *propname,
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index 3d0e398..eb59d7d 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -61,6 +61,7 @@ static int cap_ppc_smt;
> static int cap_ppc_rma;
> static int cap_spapr_tce;
> static int cap_spapr_multitce;
> +static int cap_spapr_tce_iommu;
> static int cap_hior;
> static int cap_one_reg;
> static int cap_epr;
> @@ -98,6 +99,7 @@ int kvm_arch_init(KVMState *s)
>     cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA);
>     cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE);
>     cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE);
> +    cap_spapr_tce_iommu = kvm_check_extension(s, KVM_CAP_SPAPR_TCE_IOMMU);
>     cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG);
>     cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR);
>     cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR);
> @@ -1669,6 +1671,37 @@ int kvmppc_remove_spapr_tce(void *table, int fd, uint32_t window_size)
>     return 0;
> }
> 
> +int kvmppc_create_spapr_tce_iommu(uint32_t liobn, int group_fd)
> +{
> +    int fd = 0;
> +    struct kvm_create_spapr_tce_iommu args = {
> +        .liobn = liobn,
> +        .fd = group_fd
> +    };
> +
> +    if (!kvm_enabled() || !cap_spapr_tce_iommu) {
> +        fprintf(stderr, "KVM VFIO: TCE IOMMU capability is not present, DMA may be slow\n");
> +        return -1;
> +    }
> +
> +    fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE_IOMMU, &args);
> +    if (fd < 0) {
> +        fprintf(stderr, "KVM VFIO: Failed to create TCE table for liobn 0x%x, ret = %d, DMA may be slow\n",
> +                liobn, fd);
> +    }
> +
> +    return fd;
> +}
> +
> +int kvmppc_remove_spapr_tce_iommu(int fd)
> +{
> +    if (fd < 0) {
> +        return -1;
> +    }
> +
> +    return close(fd);
> +}
> +
> int kvmppc_reset_htab(int shift_hint)
> {
>     uint32_t shift = shift_hint;
> diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
> index a2a903f..a223e63 100644
> --- a/target-ppc/kvm_ppc.h
> +++ b/target-ppc/kvm_ppc.h
> @@ -34,6 +34,8 @@ off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem);
> bool kvmppc_spapr_use_multitce(void);
> void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd);
> int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size);
> +int kvmppc_create_spapr_tce_iommu(uint32_t liobn, int group_fd);
> +int kvmppc_remove_spapr_tce_iommu(int fd);
> int kvmppc_reset_htab(int shift_hint);
> uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift);
> #endif /* !CONFIG_USER_ONLY */
> @@ -144,6 +146,16 @@ static inline int kvmppc_remove_spapr_tce(void *table, int pfd,
>     return -1;
> }
> 
> +static inline int kvmppc_create_spapr_tce_iommu(uint32_t liobn, uint32_t iommu_id)
> +{
> +    return -1;
> +}
> +
> +static inline int kvmppc_remove_spapr_tce_iommu(int fd)
> +{
> +    return -1;
> +}
> +
> static inline int kvmppc_reset_htab(int shift_hint)
> {
>     return -1;
> diff --git a/trace-events b/trace-events
> index 3856b5c..d1e54ad 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -1113,6 +1113,7 @@ qxl_render_guest_primary_resized(int32_t width, int32_t height, int32_t stride,
> qxl_render_update_area_done(void *cookie) "%p"
> 
> # hw/ppc/spapr_pci.c
> +spapr_pci(const char *msg1, const char *msg2) "%s%s"
> spapr_pci_msi(const char *msg, uint32_t n, uint32_t ca) "%s (device#%d, cfg=%x)"
> spapr_pci_msi_setup(const char *name, unsigned vector, uint64_t addr) "dev\"%s\" vector %u, addr=%"PRIx64
> spapr_pci_rtas_ibm_change_msi(unsigned func, unsigned req) "func %u, requested %u"
> @@ -1133,6 +1134,9 @@ xics_ics_write_xive(int nr, int srcno, int server, uint8_t priority) "ics_write_
> xics_ics_reject(int nr, int srcno) "reject irq %#x [src %d]"
> xics_ics_eoi(int nr) "ics_eoi: irq %#x"
> 
> +# hw/ppc/spapr_iommu.c
> +spapr_iommu(const char *op, uint32_t liobn, uint64_t ioba, uint64_t tce, int ret) "%s %x ioba=%"PRIx64" tce=%"PRIx64" ret=%d"
> +
> # util/hbitmap.c
> hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx"
> hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64
> -- 
> 1.8.3.2
>
Alexey Kardashevskiy - Aug. 30, 2013, 7:43 a.m.
On 08/27/2013 09:08 PM, Alexander Graf wrote:
>> type_init(register_types);
>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
>> index 869ca43..3f37cac 100644
>> --- a/hw/ppc/spapr_pci.c
>> +++ b/hw/ppc/spapr_pci.c
> 
> I think we should move the vfio phb into a separate file and make it be a proper subclass without even the chance to randomly call normal spapr pci functions ;).
> 
> Andreas, could you please check through this and see if you can spot a way to isolate it out?

After the lesson you both gave me with xics/xics-kvm, I am (more or less)
aware of what I need to change here so wait a bit till I post another version.
Andreas Färber - Aug. 30, 2013, 1:01 p.m.
Am 30.08.2013 09:43, schrieb Alexey Kardashevskiy:
> On 08/27/2013 09:08 PM, Alexander Graf wrote:
>>> type_init(register_types);
>>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
>>> index 869ca43..3f37cac 100644
>>> --- a/hw/ppc/spapr_pci.c
>>> +++ b/hw/ppc/spapr_pci.c
>>
>> I think we should move the vfio phb into a separate file and make it be a proper subclass without even the chance to randomly call normal spapr pci functions ;).
>>
>> Andreas, could you please check through this and see if you can spot a way to isolate it out?

Just noticing this question now... (me not so into reviewing VFIO)

> After the lesson you both gave me with xics/xics-kvm, I am (more or less)
> aware of what I need to change here so wait a bit till I post another version.

Thanks.

Andreas

Patch

diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 22b09be..096b6a9 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -16,12 +16,14 @@ 
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  */
+
 #include "hw/hw.h"
 #include "sysemu/kvm.h"
 #include "hw/qdev.h"
 #include "kvm_ppc.h"
 #include "sysemu/dma.h"
 #include "exec/address-spaces.h"
+#include "trace.h"
 
 #include "hw/ppc/spapr.h"
 
@@ -244,6 +246,74 @@  static target_ulong put_tce_emu(sPAPRTCETable *tcet, target_ulong ioba,
     return H_SUCCESS;
 }
 
+static IOMMUTLBEntry spapr_vfio_translate_iommu(MemoryRegion *iommu, hwaddr addr)
+{
+    /* Zeroed so a build without assertions still returns a defined value */
+    IOMMUTLBEntry entry = { 0 };
+    assert(0); /* Must never be called */
+    return entry;
+}
+
+static MemoryRegionIOMMUOps spapr_vfio_iommu_ops = {
+    .translate = spapr_vfio_translate_iommu, /* asserts: must never be called */
+};
+
+static int spapr_tce_table_vfio_realize(DeviceState *dev)
+{
+    /* Region size is INT64_MAX + 1, evaluated in uint64_t (i.e. 2^63) */
+    const uint64_t region_size = (uint64_t)INT64_MAX + 1;
+    sPAPRTCETable *table = SPAPR_TCE_TABLE(dev);
+
+    memory_region_init_iommu(&table->iommu, NULL, &spapr_vfio_iommu_ops,
+                             "iommu-vfio-spapr", region_size);
+    QLIST_INSERT_HEAD(&spapr_tce_tables, table, list);
+    return 0;
+}
+
+sPAPRTCETable *spapr_vfio_new_table(DeviceState *owner, uint32_t liobn,
+                                    int group_fd)
+{
+    sPAPRTCETable *table;
+    int kvm_fd;
+
+    /* A LIOBN that already has a TCE table cannot get a second one */
+    if (spapr_tce_find_by_liobn(liobn) != NULL) {
+        fprintf(stderr, "Attempted to create TCE table with duplicate"
+                " LIOBN 0x%x\n", liobn);
+        return NULL;
+    }
+
+    /* The result is stored as is, even when negative (creation failed) */
+    kvm_fd = kvmppc_create_spapr_tce_iommu(liobn, group_fd);
+    table = SPAPR_TCE_TABLE(object_new(TYPE_SPAPR_TCE_TABLE_VFIO));
+    table->fd = kvm_fd;
+    table->liobn = liobn;
+    object_property_add_child(OBJECT(owner), "tce-table", OBJECT(table), NULL);
+    qdev_init_nofail(DEVICE(table));
+
+    return table;
+}
+
+static target_ulong put_tce_vfio(sPAPRTCETable *tcet, target_ulong ioba,
+                                 target_ulong tce)
+{
+    /* Members not named here are zeroed rather than left indeterminate */
+    IOMMUTLBEntry entry = {
+        .iova = ioba & ~SPAPR_TCE_PAGE_MASK,
+        .translated_addr = tce & ~SPAPR_TCE_PAGE_MASK,
+        .addr_mask = SPAPR_TCE_PAGE_MASK,
+    };
+    /* Map the TCE permission bits onto IOMMU access flags, then notify */
+    if ((tce & SPAPR_TCE_RO) == SPAPR_TCE_RO) {
+        entry.perm |= IOMMU_RO;
+    }
+    if ((tce & SPAPR_TCE_WO) == SPAPR_TCE_WO) {
+        entry.perm |= IOMMU_WO;
+    }
+    memory_region_notify_iommu(&tcet->iommu, entry);
+    return H_SUCCESS;
+}
+
 static target_ulong h_put_tce_indirect(PowerPCCPU *cpu,
                                        sPAPREnvironment *spapr,
                                        target_ulong opcode, target_ulong *args)
@@ -255,18 +325,36 @@  static target_ulong h_put_tce_indirect(PowerPCCPU *cpu,
     target_ulong npages = args[3];
     target_ulong ret = 0;
     sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
+    sPAPRTCETableClass *info;
 
-    if (tcet) {
-        for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
-            target_ulong tce = ldq_phys((tce_list & ~SPAPR_TCE_PAGE_MASK) +
-                                        i * sizeof(target_ulong));
-            ret = put_tce_emu(tcet, ioba, tce);
-            if (ret) {
-                break;
-            }
+    if (!tcet) {
+        return H_PARAMETER;
+    }
+
+    info = SPAPR_TCE_TABLE_GET_CLASS(tcet);
+    if (!info || !info->put_tce) {
+        return H_PARAMETER;
+    }
+
+    if ((tce_list & SPAPR_TCE_PAGE_MASK) || (npages > 512)) {
+        return H_PARAMETER;
+    }
+
+    if (liobn & 0xFFFFFFFF00000000ULL) {
+        hcall_dprintf("spapr_vio_put_tce on out-of-boundsw LIOBN "
+                      TARGET_FMT_lx "\n", liobn);
+        return H_PARAMETER;
+    }
+
+    for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
+        target_ulong tce = ldq_phys((tce_list & ~SPAPR_TCE_PAGE_MASK) +
+                                    i * sizeof(target_ulong));
+        ret = info->put_tce(tcet, ioba, tce);
+        if (ret) {
+            break;
         }
-        return ret;
     }
+
 #ifdef DEBUG_TCE
     fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
             "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
@@ -274,7 +362,7 @@  static target_ulong h_put_tce_indirect(PowerPCCPU *cpu,
             __func__, liobn, ioba, tce_list, ret);
 #endif
 
-    return H_PARAMETER;
+    return ret;
 }
 
 static target_ulong h_stuff_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
@@ -287,17 +375,30 @@  static target_ulong h_stuff_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
     target_ulong npages = args[3];
     target_ulong ret = 0;
     sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
+    sPAPRTCETableClass *info;
+
+    if (!tcet) {
+        return H_PARAMETER;
+    }
+
+    info = SPAPR_TCE_TABLE_GET_CLASS(tcet);
+    if (!info || !info->put_tce) {
+        return H_PARAMETER;
+    }
+
+    if (liobn & 0xFFFFFFFF00000000ULL) {
+        hcall_dprintf("spapr_vio_put_tce on out-of-boundsw LIOBN "
+                      TARGET_FMT_lx "\n", liobn);
+        return H_PARAMETER;
+    }
 
     ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
 
-    if (tcet) {
-        for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
-            ret = put_tce_emu(tcet, ioba, tce_value);
-            if (ret) {
-                break;
-            }
+    for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
+        ret = info->put_tce(tcet, ioba, tce_value);
+        if (ret) {
+            break;
         }
-        return ret;
     }
 #ifdef DEBUG_TCE
     fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
@@ -306,7 +407,7 @@  static target_ulong h_stuff_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
             __func__, liobn, ioba, tce_value, ret);
 #endif
 
-    return H_PARAMETER;
+    return ret;
 }
 
 static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
@@ -316,12 +417,21 @@  static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
     target_ulong ioba = args[1];
     target_ulong tce = args[2];
     sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
+    target_ulong ret;
+    sPAPRTCETableClass *info;
+
+    if (!tcet) {
+        return H_PARAMETER;
+    }
+
+    info = SPAPR_TCE_TABLE_GET_CLASS(tcet);
+    if (!info || !info->put_tce) {
+        return H_PARAMETER;
+    }
 
     ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
 
-    if (tcet) {
-        return put_tce_emu(tcet, ioba, tce);
-    }
+    ret = info->put_tce(tcet, ioba, tce);
 #ifdef DEBUG_TCE
     fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
             "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
@@ -329,7 +439,7 @@  static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
             __func__, liobn, ioba, tce, ret);
 #endif
 
-    return H_PARAMETER;
+    return ret;
 }
 
 int spapr_dma_dt(void *fdt, int node_off, const char *propname,
@@ -376,9 +486,12 @@  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
 static void spapr_tce_table_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
+    sPAPRTCETableClass *k = SPAPR_TCE_TABLE_CLASS(klass);
+
     dc->vmsd = &vmstate_spapr_tce_table;
     dc->init = spapr_tce_table_realize;
     dc->reset = spapr_tce_reset;
+    k->put_tce = put_tce_emu;
 
     QLIST_INIT(&spapr_tce_tables);
 
@@ -393,12 +506,31 @@  static TypeInfo spapr_tce_table_info = {
     .parent = TYPE_DEVICE,
     .instance_size = sizeof(sPAPRTCETable),
     .class_init = spapr_tce_table_class_init,
+    .class_size = sizeof(sPAPRTCETableClass),
     .instance_finalize = spapr_tce_table_finalize,
 };
 
+static void spapr_tce_table_vfio_class_init(ObjectClass *klass, void *data)
+{
+    /* VFIO flavour of the TCE table class: its own init and put_tce hooks */
+    sPAPRTCETableClass *tce_class = SPAPR_TCE_TABLE_CLASS(klass);
+    DeviceClass *dev_class = DEVICE_CLASS(klass);
+    tce_class->put_tce = put_tce_vfio;
+    dev_class->init = spapr_tce_table_vfio_realize;
+}
+
+static TypeInfo spapr_tce_table_vfio_info = {
+    .name = TYPE_SPAPR_TCE_TABLE_VFIO,
+    .parent = TYPE_SPAPR_TCE_TABLE, /* subtype of the base TCE table type */
+    .instance_size = sizeof(sPAPRTCETable), /* same instance struct as parent */
+    .class_init = spapr_tce_table_vfio_class_init, /* sets init, put_tce */
+    .class_size = sizeof(sPAPRTCETableClass),
+};
+
 static void register_types(void)
 {
     type_register_static(&spapr_tce_table_info);
+    type_register_static(&spapr_tce_table_vfio_info);
 }
 
 type_init(register_types);
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 869ca43..3f37cac 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -22,6 +22,9 @@ 
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
+#include <sys/types.h>
+#include <dirent.h>
+
 #include "hw/hw.h"
 #include "hw/pci/pci.h"
 #include "hw/pci/msi.h"
@@ -32,6 +35,7 @@ 
 #include "exec/address-spaces.h"
 #include <libfdt.h>
 #include "trace.h"
+#include "hw/misc/vfio.h"
 
 #include "hw/pci/pci_bus.h"
 
@@ -496,7 +500,11 @@  static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
     return &phb->iommu_as;
 }
 
-static int spapr_phb_init(SysBusDevice *s)
+/*
+ * This is the common initialization part for both emulated and VFIO PHBs
+ * which includes everything but DMA and device scan (optional, VFIO only).
+ */
+static int _spapr_phb_init(SysBusDevice *s)
 {
     DeviceState *dev = DEVICE(s);
     sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(s);
@@ -610,19 +618,6 @@  static int spapr_phb_init(SysBusDevice *s)
                            PCI_DEVFN(0, 0), PCI_NUM_PINS, TYPE_PCI_BUS);
     phb->bus = bus;
 
-    sphb->dma_window_start = 0;
-    sphb->dma_window_size = 0x40000000;
-    sphb->tcet = spapr_tce_new_table(dev, sphb->dma_liobn,
-                                     sphb->dma_window_size);
-    if (!sphb->tcet) {
-        fprintf(stderr, "Unable to create TCE table for %s\n", sphb->dtbusname);
-        return -1;
-    }
-    address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet),
-                       sphb->dtbusname);
-
-    pci_setup_iommu(bus, spapr_pci_dma_iommu, sphb);
-
     QLIST_INSERT_HEAD(&spapr->phbs, sphb, list);
 
     /* Initialize the LSI table */
@@ -641,6 +636,30 @@  static int spapr_phb_init(SysBusDevice *s)
     return 0;
 }
 
+static int spapr_phb_init(SysBusDevice *s)
+{
+    sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(s);
+    int ret;
+
+    /* Common PHB setup first; the emulated DMA window is created after it */
+    ret = _spapr_phb_init(s);
+    if (ret) {
+        return ret;
+    }
+    sphb->dma_window_start = 0;
+    sphb->dma_window_size = 0x40000000;
+    sphb->tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn,
+                                     sphb->dma_window_size);
+    if (!sphb->tcet) {
+        fprintf(stderr, "Unable to create TCE table for %s\n", sphb->dtbusname);
+        return -1;
+    }
+    address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet),
+                       sphb->dtbusname);
+    pci_setup_iommu(sphb->parent_obj.bus, spapr_pci_dma_iommu, sphb);
+    return 0;
+}
+
 static void spapr_phb_reset(DeviceState *qdev)
 {
     SysBusDevice *s = SYS_BUS_DEVICE(qdev);
@@ -749,6 +768,163 @@  PCIHostState *spapr_create_phb(sPAPREnvironment *spapr, int index)
     return PCI_HOST_BRIDGE(dev);
 }
 
+/* sPAPR VFIO */
+static Property spapr_phb_vfio_properties[] = {
+    DEFINE_PROP_INT32("iommu", sPAPRPHBVFIOState, iommugroupid, -1), /* IOMMU group ID */
+    DEFINE_PROP_UINT8("scan", sPAPRPHBVFIOState, scan, 1), /* auto-add group devices */
+    DEFINE_PROP_UINT8("mf", sPAPRPHBVFIOState, enable_multifunction, 0), /* set multifunction */
+    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBVFIOState, force_addr, 0), /* keep host dev.fn */
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+/* Add every non-bridge PCI function found in the sysfs directory of the PHB's
+ * IOMMU group to the PHB bus as "vfio-pci"; returns 0 on success, <0 on error. */
+static int spapr_pci_vfio_scan(sPAPRPHBVFIOState *svphb)
+{
+    PCIHostState *phb = PCI_HOST_BRIDGE(svphb);
+    char *iommupath;
+    DIR *dirp;
+    struct dirent *entry;
+
+    if (!svphb->scan) {
+        trace_spapr_pci("autoscan disabled for ", svphb->phb.dtbusname);
+        return 0;
+    }
+
+    iommupath = g_strdup_printf("/sys/kernel/iommu_groups/%d/devices/",
+                                svphb->iommugroupid);
+    if (!iommupath) {
+        return -ENOMEM;
+    }
+
+    dirp = opendir(iommupath);
+    if (!dirp) {
+        fprintf(stderr, "failed to scan group=%d\n", svphb->iommugroupid);
+        g_free(iommupath);
+        return -1;
+    }
+
+    while ((entry = readdir(dirp)) != NULL) {
+        Error *err = NULL;
+        char *tmp;
+        FILE *deviceclassfile;
+        unsigned deviceclass = 0, domainid, busid, devid, fnid;
+        char addr[32];
+        DeviceState *dev;
+
+        /* Only entries named domain:bus:device.function are considered */
+        if (sscanf(entry->d_name, "%X:%X:%X.%x",
+                   &domainid, &busid, &devid, &fnid) != 4) {
+            continue;
+        }
+
+        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
+        trace_spapr_pci("Reading device class from ", tmp);
+
+        deviceclassfile = fopen(tmp, "r");
+        g_free(tmp); /* freed here so the early 'continue' below cannot leak it */
+        if (deviceclassfile) {
+            int ret = fscanf(deviceclassfile, "%x", &deviceclass);
+            fclose(deviceclassfile);
+            if (ret != 1) {
+                continue;
+            }
+        }
+        if (!deviceclass) {
+            continue;
+        }
+        if ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8)) {
+            /* Skip bridges */
+            continue;
+        }
+        trace_spapr_pci("Creating device from ", entry->d_name);
+
+        dev = qdev_create(&phb->bus->qbus, "vfio-pci");
+        if (!dev) {
+            fprintf(stderr, "failed to create vfio-pci\n");
+            continue;
+        }
+        qdev_prop_parse(dev, "host", entry->d_name, &err);
+        if (!err && svphb->force_addr) {
+            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
+            qdev_prop_parse(dev, "addr", addr, &err);
+        }
+        if (err != NULL) {
+            /* NOTE(review): dev is abandoned uninitialized here, as before */
+            error_free(err);
+            continue;
+        }
+        if (svphb->enable_multifunction) {
+            qdev_prop_set_bit(dev, "multifunction", 1);
+        }
+        qdev_init_nofail(dev);
+    }
+    closedir(dirp);
+    g_free(iommupath);
+
+    return 0;
+}
+
+static int spapr_phb_vfio_init(SysBusDevice *s)
+{
+    sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(s);
+    sPAPRPHBState *sphb = &svphb->phb;
+    struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
+    int ret, group_fd;
+
+    if (svphb->iommugroupid == -1) {
+        fprintf(stderr, "Wrong IOMMU group ID %d\n", svphb->iommugroupid);
+        return -1;
+    }
+
+    ret = _spapr_phb_init(s);
+    if (ret) {
+        return ret;
+    }
+
+    ret = vfio_container_spapr_get_info(&sphb->iommu_as, svphb->iommugroupid,
+                                        &info, &group_fd);
+    if (ret) {
+        return ret;
+    }
+    /* The DMA window is whatever was reported for this IOMMU group */
+    sphb->dma_window_start = info.dma32_window_start;
+    sphb->dma_window_size = info.dma32_window_size;
+    sphb->tcet = spapr_vfio_new_table(DEVICE(sphb), sphb->dma_liobn, group_fd);
+    if (!sphb->tcet) {
+        /* NULL means a duplicate LIOBN; fail the way spapr_phb_init() does */
+        fprintf(stderr, "Unable to create TCE table for %s\n", sphb->dtbusname);
+        return -1;
+    }
+    address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet),
+                       sphb->dtbusname);
+    pci_setup_iommu(sphb->parent_obj.bus, spapr_pci_dma_iommu, sphb);
+    return spapr_pci_vfio_scan(svphb);
+}
+
+static void spapr_phb_vfio_reset(DeviceState *qdev)
+{
+    /* Do nothing: installed as dc->reset by spapr_phb_vfio_class_init() */
+}
+
+static void spapr_phb_vfio_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dev_class = DEVICE_CLASS(klass);
+    SysBusDeviceClass *sysbus_class = SYS_BUS_DEVICE_CLASS(klass);
+    /* Device-level hooks and data first, then the sysbus init entry point */
+    dev_class->vmsd = &vmstate_spapr_pci;
+    dev_class->reset = spapr_phb_vfio_reset;
+    dev_class->props = spapr_phb_vfio_properties;
+    sysbus_class->init = spapr_phb_vfio_init;
+}
+
+static const TypeInfo spapr_phb_vfio_info = {
+    .name          = TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE,
+    .parent        = TYPE_SPAPR_PCI_HOST_BRIDGE, /* inherits from the plain sPAPR PHB */
+    .instance_size = sizeof(sPAPRPHBVFIOState),
+    .class_init    = spapr_phb_vfio_class_init, /* sets init, props, reset, vmsd */
+};
+
 /* Macros to operate with address in OF binding to PCI */
 #define b_x(x, p, l)    (((x) & ((1<<(l))-1)) << (p))
 #define b_n(x)          b_x((x), 31, 1) /* 0 if relocatable */
@@ -839,6 +1015,10 @@  int spapr_populate_pci_dt(sPAPRPHBState *phb,
     _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map,
                      sizeof(interrupt_map)));
 
+    if (!phb->dma_window_size) {
+        fprintf(stderr, "Unexpected error: DMA window is zero, exiting\n");
+        exit(1);
+    }
     spapr_dma_dt(fdt, bus_off, "ibm,dma-window",
                  phb->dma_liobn, phb->dma_window_start,
                  phb->dma_window_size);
@@ -862,6 +1042,7 @@  void spapr_pci_rtas_init(void)
 static void spapr_pci_register_types(void)
 {
     type_register_static(&spapr_phb_info);
+    type_register_static(&spapr_phb_vfio_info);
 }
 
 type_init(spapr_pci_register_types)
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index 970b4a9..fab18e5 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -30,10 +30,14 @@ 
 #define SPAPR_MSIX_MAX_DEVS 32
 
 #define TYPE_SPAPR_PCI_HOST_BRIDGE "spapr-pci-host-bridge"
+#define TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE "spapr-pci-vfio-host-bridge"
 
 #define SPAPR_PCI_HOST_BRIDGE(obj) \
     OBJECT_CHECK(sPAPRPHBState, (obj), TYPE_SPAPR_PCI_HOST_BRIDGE)
 
+#define SPAPR_PCI_VFIO_HOST_BRIDGE(obj) \
+    OBJECT_CHECK(sPAPRPHBVFIOState, (obj), TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE)
+
 typedef struct sPAPRPHBState {
     PCIHostState parent_obj;
 
@@ -64,6 +68,14 @@  typedef struct sPAPRPHBState {
     QLIST_ENTRY(sPAPRPHBState) list;
 } sPAPRPHBState;
 
+typedef struct sPAPRPHBVFIOState {
+    sPAPRPHBState phb; /* base PHB state, embedded as the first member */
+
+    struct VFIOContainer *container;
+    int32_t iommugroupid; /* "iommu" property; left at -1 makes init fail */
+    uint8_t scan, enable_multifunction, force_addr; /* "scan", "mf", "forceaddr" */
+} sPAPRPHBVFIOState;
+
 #define SPAPR_PCI_BASE_BUID          0x800000020000000ULL
 
 #define SPAPR_PCI_WINDOW_BASE        0x10000000000ULL
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 2dc3d06..a64e58a 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -353,12 +353,29 @@  int spapr_rtas_device_tree_setup(void *fdt, hwaddr rtas_addr,
 
 #define RTAS_ERROR_LOG_MAX      2048
 
+typedef struct sPAPRTCETableClass sPAPRTCETableClass;
 typedef struct sPAPRTCETable sPAPRTCETable;
 
 #define TYPE_SPAPR_TCE_TABLE "spapr-tce-table"
 #define SPAPR_TCE_TABLE(obj) \
     OBJECT_CHECK(sPAPRTCETable, (obj), TYPE_SPAPR_TCE_TABLE)
 
+#define TYPE_SPAPR_TCE_TABLE_VFIO "spapr-tce-table-vfio"
+#define SPAPR_TCE_TABLE_VFIO(obj) \
+    OBJECT_CHECK(sPAPRTCETable, (obj), TYPE_SPAPR_TCE_TABLE_VFIO)
+
+#define SPAPR_TCE_TABLE_CLASS(klass) \
+     OBJECT_CLASS_CHECK(sPAPRTCETableClass, (klass), TYPE_SPAPR_TCE_TABLE)
+#define SPAPR_TCE_TABLE_GET_CLASS(obj) \
+     OBJECT_GET_CLASS(sPAPRTCETableClass, (obj), TYPE_SPAPR_TCE_TABLE)
+
+struct sPAPRTCETableClass {
+    DeviceClass parent_class;
+    /* TCE update hook: called by the h_put_tce* and h_stuff_tce handlers */
+    target_ulong (*put_tce)(sPAPRTCETable *tcet, target_ulong ioba,
+                            target_ulong tce);
+};
+
 struct sPAPRTCETable {
     DeviceState parent;
     uint32_t liobn;
@@ -375,6 +392,8 @@  void spapr_events_init(sPAPREnvironment *spapr);
 void spapr_events_fdt_skel(void *fdt, uint32_t epow_irq);
 sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn,
                                    size_t window_size);
+sPAPRTCETable *spapr_vfio_new_table(DeviceState *owner, uint32_t liobn,
+                                    int group_fd);
 MemoryRegion *spapr_tce_get_iommu(sPAPRTCETable *tcet);
 void spapr_tce_set_bypass(sPAPRTCETable *tcet, bool bypass);
 int spapr_dma_dt(void *fdt, int node_off, const char *propname,
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 3d0e398..eb59d7d 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -61,6 +61,7 @@  static int cap_ppc_smt;
 static int cap_ppc_rma;
 static int cap_spapr_tce;
 static int cap_spapr_multitce;
+static int cap_spapr_tce_iommu;
 static int cap_hior;
 static int cap_one_reg;
 static int cap_epr;
@@ -98,6 +99,7 @@  int kvm_arch_init(KVMState *s)
     cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA);
     cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE);
     cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE);
+    cap_spapr_tce_iommu = kvm_check_extension(s, KVM_CAP_SPAPR_TCE_IOMMU);
     cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG);
     cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR);
     cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR);
@@ -1669,6 +1671,37 @@  int kvmppc_remove_spapr_tce(void *table, int fd, uint32_t window_size)
     return 0;
 }
 
+int kvmppc_create_spapr_tce_iommu(uint32_t liobn, int group_fd)
+{
+    struct kvm_create_spapr_tce_iommu request = {
+        .liobn = liobn,
+        .fd = group_fd
+    };
+    int tce_fd;
+
+    /* Without KVM or the capability there is nothing to create */
+    if (!(kvm_enabled() && cap_spapr_tce_iommu)) {
+        fprintf(stderr, "KVM VFIO: TCE IOMMU capability is not present, DMA may be slow\n");
+        return -1;
+    }
+    tce_fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE_IOMMU, &request);
+    if (tce_fd >= 0) {
+        return tce_fd;
+    }
+    fprintf(stderr, "KVM VFIO: Failed to create TCE table for liobn 0x%x, ret = %d, DMA may be slow\n",
+            liobn, tce_fd);
+    return tce_fd;
+}
+
+/*
+ * Close a descriptor such as the one kvmppc_create_spapr_tce_iommu() returns.
+ * Yields -1 for a negative fd, otherwise whatever close() reports.
+ */
+int kvmppc_remove_spapr_tce_iommu(int fd)
+{
+    return (fd < 0) ? -1 : close(fd);
+}
+
 int kvmppc_reset_htab(int shift_hint)
 {
     uint32_t shift = shift_hint;
diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
index a2a903f..a223e63 100644
--- a/target-ppc/kvm_ppc.h
+++ b/target-ppc/kvm_ppc.h
@@ -34,6 +34,8 @@  off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem);
 bool kvmppc_spapr_use_multitce(void);
 void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd);
 int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size);
+int kvmppc_create_spapr_tce_iommu(uint32_t liobn, int group_fd);
+int kvmppc_remove_spapr_tce_iommu(int fd);
 int kvmppc_reset_htab(int shift_hint);
 uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift);
 #endif /* !CONFIG_USER_ONLY */
@@ -144,6 +146,16 @@  static inline int kvmppc_remove_spapr_tce(void *table, int pfd,
     return -1;
 }
 
+static inline int kvmppc_create_spapr_tce_iommu(uint32_t liobn, int group_fd)
+{
+    return -1; /* stub variant: same signature as the prototype, always fails */
+}
+
+static inline int kvmppc_remove_spapr_tce_iommu(int fd)
+{
+    return -1; /* stub variant: closes nothing, always reports failure */
+}
+
 static inline int kvmppc_reset_htab(int shift_hint)
 {
     return -1;
diff --git a/trace-events b/trace-events
index 3856b5c..d1e54ad 100644
--- a/trace-events
+++ b/trace-events
@@ -1113,6 +1113,7 @@  qxl_render_guest_primary_resized(int32_t width, int32_t height, int32_t stride,
 qxl_render_update_area_done(void *cookie) "%p"
 
 # hw/ppc/spapr_pci.c
+spapr_pci(const char *msg1, const char *msg2) "%s%s"
 spapr_pci_msi(const char *msg, uint32_t n, uint32_t ca) "%s (device#%d, cfg=%x)"
 spapr_pci_msi_setup(const char *name, unsigned vector, uint64_t addr) "dev\"%s\" vector %u, addr=%"PRIx64
 spapr_pci_rtas_ibm_change_msi(unsigned func, unsigned req) "func %u, requested %u"
@@ -1133,6 +1134,9 @@  xics_ics_write_xive(int nr, int srcno, int server, uint8_t priority) "ics_write_
 xics_ics_reject(int nr, int srcno) "reject irq %#x [src %d]"
 xics_ics_eoi(int nr) "ics_eoi: irq %#x"
 
+# hw/ppc/spapr_iommu.c
+spapr_iommu(const char *op, uint32_t liobn, uint64_t ioba, uint64_t tce, int ret) "%s %x ioba=%"PRIx64" tce=%"PRIx64" ret=%d"
+
 # util/hbitmap.c
 hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx"
 hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64